diff --git a/.cirrus.star b/.cirrus.star index 36233872d1e50..e9bb672b95936 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -7,7 +7,7 @@ https://github.com/bazelbuild/starlark/blob/master/spec.md See also .cirrus.yml and src/tools/ci/README """ -load("cirrus", "env", "fs") +load("cirrus", "env", "fs", "re", "yaml") def main(): @@ -18,19 +18,36 @@ def main(): 1) the contents of .cirrus.yml - 2) if defined, the contents of the file referenced by the, repository + 2) computed environment variables + + 3) if defined, the contents of the file referenced by the, repository level, REPO_CI_CONFIG_GIT_URL variable (see https://cirrus-ci.org/guide/programming-tasks/#fs for the accepted format) - 3) .cirrus.tasks.yml + 4) .cirrus.tasks.yml """ output = "" # 1) is evaluated implicitly + # Add 2) + additional_env = compute_environment_vars() + env_fmt = """ +### +# Computed environment variables start here +### +{0} +### +# Computed environment variables end here +### +""" + output += env_fmt.format(yaml.dumps({'env': additional_env})) + + + # Add 3) repo_config_url = env.get("REPO_CI_CONFIG_GIT_URL") if repo_config_url != None: print("loading additional configuration from \"{}\"".format(repo_config_url)) @@ -38,12 +55,75 @@ def main(): else: output += "\n# REPO_CI_CONFIG_URL was not set\n" - # Add 3) + + # Add 4) output += config_from(".cirrus.tasks.yml") + return output +def compute_environment_vars(): + cenv = {} + + ### + # Some tasks are manually triggered by default because they might use too + # many resources for users of free Cirrus credits, but they can be + # triggered automatically by naming them in an environment variable e.g. + # REPO_CI_AUTOMATIC_TRIGGER_TASKS="task_name other_task" under "Repository + # Settings" on Cirrus CI's website. + + default_manual_trigger_tasks = ['mingw', 'netbsd', 'openbsd'] + + repo_ci_automatic_trigger_tasks = env.get('REPO_CI_AUTOMATIC_TRIGGER_TASKS', '') + for task in default_manual_trigger_tasks: + name = 'CI_TRIGGER_TYPE_' + task.upper() + if repo_ci_automatic_trigger_tasks.find(task) != -1: + value = 'automatic' + else: + value = 'manual' + cenv[name] = value + ### + + ### + # Parse "ci-os-only:" tag in commit message and set + # CI_{$OS}_ENABLED variable for each OS + + # We want to disable SanityCheck if testing just a specific OS. This + # shortens push-wait-for-ci cycle time a bit when debugging operating + # system specific failures. Just treating it as an OS in that case + # suffices. + + operating_systems = [ + 'compilerwarnings', + 'freebsd', + 'linux', + 'macos', + 'mingw', + 'netbsd', + 'openbsd', + 'sanitycheck', + 'windows', + ] + commit_message = env.get('CIRRUS_CHANGE_MESSAGE') + match_re = r"(^|.*\n)ci-os-only: ([^\n]+)($|\n.*)" + + # re.match() returns an array with a tuple of (matched-string, match_1, ...) 
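+    # (Annotation, not part of the patch.)  A worked example of the regex
+    # above: for a commit message ending in "ci-os-only: freebsd, macos",
+    # m[0] is the tuple (whole-match, text-before, "freebsd, macos",
+    # text-after), so m[0][2] below is the OS list that re.split() then
+    # breaks apart into ['freebsd', 'macos'].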
+ m = re.match(match_re, commit_message) + if m and len(m) > 0: + os_only = m[0][2] + os_only_list = re.split(r'[, ]+', os_only) + else: + os_only_list = operating_systems + + for os in operating_systems: + os_enabled = os in os_only_list + cenv['CI_{0}_ENABLED'.format(os.upper())] = os_enabled + ### + + return cenv + + def config_from(config_src): """return contents of config file `config_src`, surrounded by markers indicating start / end of the included file diff --git a/.cirrus.tasks.yml b/.cirrus.tasks.yml index 92057006c9309..038d043d00e78 100644 --- a/.cirrus.tasks.yml +++ b/.cirrus.tasks.yml @@ -31,6 +31,31 @@ env: TEMP_CONFIG: ${CIRRUS_WORKING_DIR}/src/tools/ci/pg_ci_base.conf PG_TEST_EXTRA: kerberos ldap ssl libpq_encryption load_balance oauth + # Postgres config args for the meson builds, shared between all meson tasks + # except the 'SanityCheck' task + MESON_COMMON_PG_CONFIG_ARGS: -Dcassert=true -Dinjection_points=true + + # Meson feature flags shared by all meson tasks, except: + # SanityCheck: uses almost no dependencies. + # Windows - VS: has fewer dependencies than listed here, so defines its own. + # Linux: uses the 'auto' feature option to test meson feature autodetection. + MESON_COMMON_FEATURES: >- + -Dauto_features=disabled + -Dldap=enabled + -Dssl=openssl + -Dtap_tests=enabled + -Dplperl=enabled + -Dplpython=enabled + -Ddocs=enabled + -Dicu=enabled + -Dlibxml=enabled + -Dlibxslt=enabled + -Dlz4=enabled + -Dpltcl=enabled + -Dreadline=enabled + -Dzlib=enabled + -Dzstd=enabled + # What files to preserve in case tests fail on_failure_ac: &on_failure_ac @@ -72,13 +97,13 @@ task: # push-wait-for-ci cycle time a bit when debugging operating system specific # failures. Uses skip instead of only_if, as cirrus otherwise warns about # only_if conditions not matching. - skip: $CIRRUS_CHANGE_MESSAGE =~ '.*\nci-os-only:.*' + skip: $CI_SANITYCHECK_ENABLED == false env: CPUS: 4 BUILD_JOBS: 8 TEST_JOBS: 8 - IMAGE_FAMILY: pg-ci-bookworm + IMAGE_FAMILY: pg-ci-trixie CCACHE_DIR: ${CIRRUS_WORKING_DIR}/ccache_dir # no options enabled, should be small CCACHE_MAXSIZE: "150M" @@ -104,6 +129,7 @@ task: configure_script: | su postgres <<-EOF + set -e meson setup \ --buildtype=debug \ --auto-features=disabled \ @@ -112,6 +138,7 @@ task: EOF build_script: | su postgres <<-EOF + set -e ninja -C build -j${BUILD_JOBS} ${MBUILD_TARGET} EOF upload_caches: ccache @@ -121,6 +148,7 @@ task: # tap test that exercises both a frontend binary and the backend. test_minimal_script: | su postgres <<-EOF + set -e ulimit -c unlimited meson test $MTEST_ARGS --suite setup meson test $MTEST_ARGS --num-processes ${TEST_JOBS} \ @@ -164,10 +192,19 @@ task: -c debug_parallel_query=regress PG_TEST_PG_UPGRADE_MODE: --link + MESON_FEATURES: >- + -Ddtrace=enabled + -Dgssapi=enabled + -Dlibcurl=enabled + -Dnls=enabled + -Dpam=enabled + -Dtcl_version=tcl86 + -Duuid=bsd + <<: *freebsd_task_template depends_on: SanityCheck - only_if: $CIRRUS_CHANGE_MESSAGE !=~ '.*\nci-os-only:.*' || $CIRRUS_CHANGE_MESSAGE =~ '.*\nci-os-only:[^\n]*freebsd.*' + only_if: $CI_FREEBSD_ENABLED sysinfo_script: | id @@ -195,11 +232,12 @@ task: # already takes longer than other platforms except for windows. 
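   # (Annotation, not part of the patch.)  The invocation below now pulls
   # -Dcassert/-Dinjection_points from MESON_COMMON_PG_CONFIG_ARGS, while
   # -Duuid=bsd and -Dtcl_version=tcl86 moved into this task's
   # MESON_FEATURES above (dtrace goes from 'auto' to 'enabled', and the
   # shared MESON_COMMON_FEATURES list replaces feature autodetection).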
configure_script: | su postgres <<-EOF + set -e meson setup \ + ${MESON_COMMON_PG_CONFIG_ARGS} \ --buildtype=debug \ - -Dcassert=true -Dinjection_points=true \ - -Duuid=bsd -Dtcl_version=tcl86 -Ddtrace=auto \ -Dextra_lib_dirs=/usr/local/lib -Dextra_include_dirs=/usr/local/include/ \ + ${MESON_COMMON_FEATURES} ${MESON_FEATURES} \ build EOF build_script: su postgres -c 'ninja -C build -j${BUILD_JOBS} ${MBUILD_TARGET}' @@ -207,6 +245,7 @@ task: test_world_script: | su postgres <<-EOF + set -e ulimit -c unlimited meson test $MTEST_ARGS --num-processes ${TEST_JOBS} EOF @@ -231,6 +270,7 @@ task: # during upload, as it doesn't expect artifacts to change size stop_running_script: | su postgres <<-EOF + set -e build/tmp_install/usr/local/pgsql/bin/pg_ctl -D build/runningcheck stop || true EOF <<: *on_failure_meson @@ -239,7 +279,6 @@ task: task: depends_on: SanityCheck - trigger_type: manual env: # Below are experimentally derived to be a decent choice. @@ -257,7 +296,9 @@ task: matrix: - name: NetBSD - Meson - only_if: $CIRRUS_CHANGE_MESSAGE !=~ '.*\nci-os-only:.*' || $CIRRUS_CHANGE_MESSAGE =~ '.*\nci-os-only:[^\n]*netbsd.*' + # See REPO_CI_AUTOMATIC_TRIGGER_TASKS in .cirrus.star + trigger_type: $CI_TRIGGER_TYPE_NETBSD + only_if: $CI_NETBSD_ENABLED env: OS_NAME: netbsd IMAGE_FAMILY: pg-ci-netbsd-postgres @@ -269,18 +310,32 @@ task: LC_ALL: "C" # -Duuid is not set for the NetBSD, see the comment below, above # configure_script, for more information. + MESON_FEATURES: >- + -Dgssapi=enabled + -Dlibcurl=enabled + -Dnls=enabled + -Dpam=enabled + setup_additional_packages_script: | #pkgin -y install ... <<: *netbsd_task_template - name: OpenBSD - Meson - only_if: $CIRRUS_CHANGE_MESSAGE !=~ '.*\nci-os-only:.*' || $CIRRUS_CHANGE_MESSAGE =~ '.*\nci-os-only:[^\n]*openbsd.*' + # See REPO_CI_AUTOMATIC_TRIGGER_TASKS in .cirrus.star + trigger_type: $CI_TRIGGER_TYPE_OPENBSD + only_if: $CI_OPENBSD_ENABLED env: OS_NAME: openbsd IMAGE_FAMILY: pg-ci-openbsd-postgres PKGCONFIG_PATH: '/usr/lib/pkgconfig:/usr/local/lib/pkgconfig' - UUID: -Duuid=e2fs - TCL: -Dtcl_version=tcl86 + CORE_DUMP_EXECUTABLE_DIR: $CIRRUS_WORKING_DIR/build/tmp_install/usr/local/pgsql/bin + + MESON_FEATURES: >- + -Dbsd_auth=enabled + -Dlibcurl=enabled + -Dtcl_version=tcl86 + -Duuid=e2fs + setup_additional_packages_script: | #pkg_add -I ... # Always core dump to ${CORE_DUMP_DIR} @@ -313,12 +368,12 @@ task: # And other uuid options are not available on NetBSD. configure_script: | su postgres <<-EOF + set -e meson setup \ + ${MESON_COMMON_PG_CONFIG_ARGS} \ --buildtype=debugoptimized \ --pkg-config-path ${PKGCONFIG_PATH} \ - -Dcassert=true -Dinjection_points=true \ - -Dssl=openssl ${UUID} ${TCL} \ - -DPG_TEST_EXTRA="$PG_TEST_EXTRA" \ + ${MESON_COMMON_FEATURES} ${MESON_FEATURES} \ build EOF @@ -327,6 +382,7 @@ task: test_world_script: | su postgres <<-EOF + set -e ulimit -c unlimited # Otherwise tests will fail on OpenBSD, due to inability to start enough # processes. @@ -341,7 +397,7 @@ task: # ${CORE_DUMP_DIR}, they may not obey this. So, move core files to the # ${CORE_DUMP_DIR} directory. 
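      # (Annotation, not part of the patch.)  The extra argument added to
      # cores_backtrace.sh below is the new CORE_DUMP_EXECUTABLE_DIR set in
      # the OpenBSD task's env above, presumably so the script can find the
      # installed binaries that produced the cores.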
find build/ -type f -name '*.core' -exec mv '{}' ${CORE_DUMP_DIR} \; - src/tools/ci/cores_backtrace.sh ${OS_NAME} ${CORE_DUMP_DIR} + src/tools/ci/cores_backtrace.sh ${OS_NAME} ${CORE_DUMP_DIR} ${CORE_DUMP_EXECUTABLE_DIR} # configure feature flags, shared between the task running the linux tests and @@ -365,10 +421,6 @@ LINUX_CONFIGURE_FEATURES: &LINUX_CONFIGURE_FEATURES >- --with-uuid=ossp --with-zstd -LINUX_MESON_FEATURES: &LINUX_MESON_FEATURES >- - -Dllvm=enabled - -Duuid=e2fs - # Check SPECIAL in the matrix: below task: @@ -376,7 +428,7 @@ task: CPUS: 4 BUILD_JOBS: 4 TEST_JOBS: 8 # experimentally derived to be a decent choice - IMAGE_FAMILY: pg-ci-bookworm + IMAGE_FAMILY: pg-ci-trixie CCACHE_DIR: /tmp/ccache_dir DEBUGINFOD_URLS: "https://debuginfod.debian.net" @@ -397,7 +449,7 @@ task: # print_stacktraces=1,verbosity=2, duh # detect_leaks=0: too many uninteresting leak errors in short-lived binaries UBSAN_OPTIONS: print_stacktrace=1:disable_coredump=0:abort_on_error=1:verbosity=2 - ASAN_OPTIONS: print_stacktrace=1:disable_coredump=0:abort_on_error=1:detect_leaks=0 + ASAN_OPTIONS: print_stacktrace=1:disable_coredump=0:abort_on_error=1:detect_leaks=0:detect_stack_use_after_return=0 # SANITIZER_FLAGS is set in the tasks below CFLAGS: -Og -ggdb -fno-sanitize-recover=all $SANITIZER_FLAGS @@ -405,16 +457,15 @@ task: LDFLAGS: $SANITIZER_FLAGS CC: ccache gcc CXX: ccache g++ - # GCC emits a warning for llvm-14, so switch to a newer one. - LLVM_CONFIG: llvm-config-16 LINUX_CONFIGURE_FEATURES: *LINUX_CONFIGURE_FEATURES - LINUX_MESON_FEATURES: *LINUX_MESON_FEATURES + LINUX_MESON_FEATURES: >- + -Duuid=e2fs <<: *linux_task_template depends_on: SanityCheck - only_if: $CIRRUS_CHANGE_MESSAGE !=~ '.*\nci-os-only:.*' || $CIRRUS_CHANGE_MESSAGE =~ '.*\nci-os-only:[^\n]*linux.*' + only_if: $CI_LINUX_ENABLED ccache_cache: folder: ${CCACHE_DIR} @@ -453,7 +504,7 @@ task: # - Uses address sanitizer, sanitizer failures are typically printed in # the server log # - Configures postgres with a small segment size - - name: Linux - Debian Bookworm - Autoconf + - name: Linux - Debian Trixie - Autoconf env: SANITIZER_FLAGS: -fsanitize=address @@ -467,6 +518,7 @@ task: # that. configure_script: | su postgres <<-EOF + set -e ./configure \ --enable-cassert --enable-injection-points --enable-debug \ --enable-tap-tests --enable-nls \ @@ -476,13 +528,14 @@ task: \ ${LINUX_CONFIGURE_FEATURES} \ \ - CLANG="ccache clang-16" + CLANG="ccache clang" EOF build_script: su postgres -c "make -s -j${BUILD_JOBS} world-bin" upload_caches: ccache test_world_script: | su postgres <<-EOF + set -e ulimit -c unlimited # default is 0 make -s ${CHECK} ${CHECKFLAGS} -j${TEST_JOBS} EOF @@ -495,7 +548,8 @@ task: # are typically printed in the server log # - Test both 64bit and 32 bit builds # - uses io_method=io_uring - - name: Linux - Debian Bookworm - Meson + # - Uses meson feature autodetection + - name: Linux - Debian Trixie - Meson env: CCACHE_MAXSIZE: "400M" # tests two different builds @@ -505,10 +559,11 @@ task: configure_script: | su postgres <<-EOF + set -e meson setup \ + ${MESON_COMMON_PG_CONFIG_ARGS} \ --buildtype=debug \ - -Dcassert=true -Dinjection_points=true \ - ${LINUX_MESON_FEATURES} \ + ${LINUX_MESON_FEATURES} -Dllvm=enabled \ build EOF @@ -516,26 +571,27 @@ task: # locally. 
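   # (Annotation, not part of the patch.)  Two related changes below: the
   # explicit -Dllvm=disabled is gone because llvm moved out of
   # LINUX_MESON_FEATURES and is now enabled only in the 64-bit setup above,
   # and the -DPERL version bump follows the Bookworm-to-Trixie image change.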
configure_32_script: | su postgres <<-EOF + set -e export CC='ccache gcc -m32' meson setup \ + ${MESON_COMMON_PG_CONFIG_ARGS} \ --buildtype=debug \ - -Dcassert=true -Dinjection_points=true \ - ${LINUX_MESON_FEATURES} \ - -Dllvm=disabled \ --pkg-config-path /usr/lib/i386-linux-gnu/pkgconfig/ \ - -DPERL=perl5.36-i386-linux-gnu \ - -Dlibnuma=disabled \ + -DPERL=perl5.40-i386-linux-gnu \ + ${LINUX_MESON_FEATURES} -Dlibnuma=disabled \ build-32 EOF build_script: | su postgres <<-EOF + set -e ninja -C build -j${BUILD_JOBS} ${MBUILD_TARGET} ninja -C build -t missingdeps EOF build_32_script: | su postgres <<-EOF + set -e ninja -C build-32 -j${BUILD_JOBS} ${MBUILD_TARGET} ninja -C build -t missingdeps EOF @@ -544,6 +600,7 @@ task: test_world_script: | su postgres <<-EOF + set -e ulimit -c unlimited meson test $MTEST_ARGS --num-processes ${TEST_JOBS} EOF @@ -556,6 +613,7 @@ task: # from C, prevent that with PYTHONCOERCECLOCALE. test_world_32_script: | su postgres <<-EOF + set -e ulimit -c unlimited PYTHONCOERCECLOCALE=0 LANG=C meson test $MTEST_ARGS -C build-32 --num-processes ${TEST_JOBS} EOF @@ -573,7 +631,7 @@ task: # SPECIAL: # - Enables --clone for pg_upgrade and pg_combinebackup task: - name: macOS - Sonoma - Meson + name: macOS - Sequoia - Meson env: CPUS: 4 # always get that much for cirrusci macOS instances @@ -582,12 +640,20 @@ task: # work OK. See # https://postgr.es/m/20220927040208.l3shfcidovpzqxfh%40awork3.anarazel.de TEST_JOBS: 8 - IMAGE: ghcr.io/cirruslabs/macos-runner:sonoma + IMAGE: ghcr.io/cirruslabs/macos-runner:sequoia CIRRUS_WORKING_DIR: ${HOME}/pgsql/ CCACHE_DIR: ${HOME}/ccache MACPORTS_CACHE: ${HOME}/macports-cache + MESON_FEATURES: >- + -Dbonjour=enabled + -Ddtrace=enabled + -Dgssapi=enabled + -Dlibcurl=enabled + -Dnls=enabled + -Duuid=e2fs + MACOS_PACKAGE_LIST: >- ccache icu @@ -613,7 +679,7 @@ task: <<: *macos_task_template depends_on: SanityCheck - only_if: $CIRRUS_CHANGE_MESSAGE !=~ '.*\nci-os-only:.*' || $CIRRUS_CHANGE_MESSAGE =~ '.*\nci-os-only:[^\n]*(macos|darwin|osx).*' + only_if: $CI_MACOS_ENABLED sysinfo_script: | id @@ -657,11 +723,11 @@ task: configure_script: | export PKG_CONFIG_PATH="/opt/local/lib/pkgconfig/" meson setup \ + ${MESON_COMMON_PG_CONFIG_ARGS} \ --buildtype=debug \ -Dextra_include_dirs=/opt/local/include \ -Dextra_lib_dirs=/opt/local/lib \ - -Dcassert=true -Dinjection_points=true \ - -Duuid=e2fs -Ddtrace=auto \ + ${MESON_COMMON_FEATURES} ${MESON_FEATURES} \ build build_script: ninja -C build -j${BUILD_JOBS} ${MBUILD_TARGET} @@ -701,7 +767,7 @@ WINDOWS_ENVIRONMENT_BASE: &WINDOWS_ENVIRONMENT_BASE task: - name: Windows - Server 2019, VS 2019 - Meson & ninja + name: Windows - Server 2022, VS 2019 - Meson & ninja << : *WINDOWS_ENVIRONMENT_BASE env: @@ -716,10 +782,18 @@ task: # 0x8001 is SEM_FAILCRITICALERRORS | SEM_NOOPENFILEERRORBOX CIRRUS_WINDOWS_ERROR_MODE: 0x8001 + MESON_FEATURES: + -Dauto_features=disabled + -Dldap=enabled + -Dssl=openssl + -Dtap_tests=enabled + -Dplperl=enabled + -Dplpython=enabled + <<: *windows_task_template depends_on: SanityCheck - only_if: $CIRRUS_CHANGE_MESSAGE !=~ '.*\nci-os-only:.*' || $CIRRUS_CHANGE_MESSAGE =~ '.*\nci-os-only:[^\n]*windows.*' + only_if: $CI_WINDOWS_ENABLED setup_additional_packages_script: | REM choco install -y --no-progress ... 
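
# Reviewer note (not part of the patch): the `set -e` lines added to the
# `su postgres <<-EOF` blocks in the Unix tasks above fix failure masking.
# Each heredoc runs as one shell script, so without `set -e` only the exit
# status of the last command is reported to CI.  A minimal sketch, using
# plain `sh` in place of the tasks' `su postgres` wrapper:

sh <<EOF
false                     # without set -e, this failure is ignored
echo "last command wins"
EOF
echo "exit status: $?"    # prints 0; CI would treat the step as passed

sh <<EOF
set -e
false                     # with set -e, the script aborts right here
echo "never reached"
EOF
echo "exit status: $?"    # prints 1; the failure now propagates
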
@@ -730,10 +804,9 @@ task: echo 127.0.0.3 pg-loadbalancetest >> c:\Windows\System32\Drivers\etc\hosts type c:\Windows\System32\Drivers\etc\hosts - # Use /DEBUG:FASTLINK to avoid high memory usage during linking configure_script: | vcvarsall x64 - meson setup --backend ninja --buildtype debug -Dc_link_args=/DEBUG:FASTLINK -Dcassert=true -Dinjection_points=true -Db_pch=true -Dextra_lib_dirs=c:\openssl\1.1\lib -Dextra_include_dirs=c:\openssl\1.1\include -DTAR=%TAR% build + meson setup --backend ninja %MESON_COMMON_PG_CONFIG_ARGS% --buildtype debug -Db_pch=true -Dextra_lib_dirs=c:\openssl\1.1\lib -Dextra_include_dirs=c:\openssl\1.1\include -DTAR=%TAR% %MESON_FEATURES% build build_script: | vcvarsall x64 @@ -753,15 +826,13 @@ task: task: << : *WINDOWS_ENVIRONMENT_BASE - name: Windows - Server 2019, MinGW64 - Meson - - # due to resource constraints we don't run this task by default for now - trigger_type: manual - # worth using only_if despite being manual, otherwise this task will show up - # when e.g. ci-os-only: linux is used. - only_if: $CIRRUS_CHANGE_MESSAGE !=~ '.*\nci-os-only:.*' || $CIRRUS_CHANGE_MESSAGE =~ '.*\nci-os-only:[^\n]*mingw.*' - # otherwise it'll be sorted before other tasks + name: Windows - Server 2022, MinGW64 - Meson + + # See REPO_CI_AUTOMATIC_TRIGGER_TASKS in .cirrus.star. + trigger_type: $CI_TRIGGER_TYPE_MINGW + depends_on: SanityCheck + only_if: $CI_MINGW_ENABLED env: TEST_JOBS: 4 # higher concurrency causes occasional failures @@ -777,6 +848,11 @@ task: CHERE_INVOKING: 1 BASH: C:\msys64\usr\bin\bash.exe -l + # Keep -Dnls explicitly disabled, as the number of files it creates causes a + # noticeable slowdown. + MESON_FEATURES: >- + -Dnls=disabled + <<: *windows_task_template ccache_cache: @@ -791,9 +867,8 @@ task: %BASH% -c "where perl" %BASH% -c "perl --version" - # disable -Dnls as the number of files it creates cause a noticable slowdown configure_script: | - %BASH% -c "meson setup -Ddebug=true -Doptimization=g -Dcassert=true -Dinjection_points=true -Db_pch=true -Dnls=disabled -DTAR=%TAR% build" + %BASH% -c "meson setup %MESON_COMMON_PG_CONFIG_ARGS% -Ddebug=true -Doptimization=g -Db_pch=true %MESON_COMMON_FEATURES% %MESON_FEATURES% -DTAR=%TAR% build" build_script: | %BASH% -c "ninja -C build ${MBUILD_TARGET}" @@ -815,15 +890,14 @@ task: # To limit unnecessary work only run this once the SanityCheck # succeeds. This is particularly important for this task as we intentionally - # use always: to continue after failures. Task that did not run count as a - # success, so we need to recheck SanityChecks's condition here ... + # use always: to continue after failures. depends_on: SanityCheck - only_if: $CIRRUS_CHANGE_MESSAGE !=~ '.*\nci-os-only:.*' + only_if: $CI_COMPILERWARNINGS_ENABLED env: CPUS: 4 BUILD_JOBS: 4 - IMAGE_FAMILY: pg-ci-bookworm + IMAGE_FAMILY: pg-ci-trixie # Use larger ccache cache, as this task compiles with multiple compilers / # flag combinations @@ -831,10 +905,6 @@ task: CCACHE_DIR: "/tmp/ccache_dir" LINUX_CONFIGURE_FEATURES: *LINUX_CONFIGURE_FEATURES - LINUX_MESON_FEATURES: *LINUX_MESON_FEATURES - - # GCC emits a warning for llvm-14, so switch to a newer one. 
- LLVM_CONFIG: llvm-config-16 <<: *linux_task_template @@ -871,7 +941,7 @@ task: --cache gcc.cache \ --enable-dtrace \ ${LINUX_CONFIGURE_FEATURES} \ - CC="ccache gcc" CXX="ccache g++" CLANG="ccache clang-16" + CC="ccache gcc" CXX="ccache g++" CLANG="ccache clang" make -s -j${BUILD_JOBS} clean time make -s -j${BUILD_JOBS} world-bin @@ -882,7 +952,7 @@ task: --cache gcc.cache \ --enable-cassert \ ${LINUX_CONFIGURE_FEATURES} \ - CC="ccache gcc" CXX="ccache g++" CLANG="ccache clang-16" + CC="ccache gcc" CXX="ccache g++" CLANG="ccache clang" make -s -j${BUILD_JOBS} clean time make -s -j${BUILD_JOBS} world-bin @@ -892,7 +962,7 @@ task: time ./configure \ --cache clang.cache \ ${LINUX_CONFIGURE_FEATURES} \ - CC="ccache clang" CXX="ccache clang++-16" CLANG="ccache clang-16" + CC="ccache clang" CXX="ccache clang++" CLANG="ccache clang" make -s -j${BUILD_JOBS} clean time make -s -j${BUILD_JOBS} world-bin @@ -904,7 +974,7 @@ task: --enable-cassert \ --enable-dtrace \ ${LINUX_CONFIGURE_FEATURES} \ - CC="ccache clang" CXX="ccache clang++-16" CLANG="ccache clang-16" + CC="ccache clang" CXX="ccache clang++" CLANG="ccache clang" make -s -j${BUILD_JOBS} clean time make -s -j${BUILD_JOBS} world-bin @@ -912,11 +982,11 @@ task: always: mingw_cross_warning_script: | time ./configure \ - --host=x86_64-w64-mingw32 \ + --host=x86_64-w64-mingw32ucrt \ --enable-cassert \ --without-icu \ - CC="ccache x86_64-w64-mingw32-gcc" \ - CXX="ccache x86_64-w64-mingw32-g++" + CC="ccache x86_64-w64-mingw32ucrt-gcc" \ + CXX="ccache x86_64-w64-mingw32ucrt-g++" make -s -j${BUILD_JOBS} clean time make -s -j${BUILD_JOBS} world-bin @@ -928,26 +998,22 @@ task: docs_build_script: | time ./configure \ --cache gcc.cache \ - CC="ccache gcc" CXX="ccache g++" CLANG="ccache clang-16" + CC="ccache gcc" CXX="ccache g++" CLANG="ccache clang" make -s -j${BUILD_JOBS} clean time make -s -j${BUILD_JOBS} -C doc ### # Verify headerscheck / cpluspluscheck succeed # - # - Don't use ccache, the files are uncacheable, polluting ccache's - # cache # - Use -fmax-errors, as particularly cpluspluscheck can be very verbose - # - XXX have to disable ICU to avoid errors: - # https://postgr.es/m/20220323002024.f2g6tivduzrktgfa%40alap3.anarazel.de ### always: headers_headerscheck_script: | time ./configure \ ${LINUX_CONFIGURE_FEATURES} \ - --without-icu \ + --cache gcc.cache \ --quiet \ - CC="gcc" CXX"=g++" CLANG="clang-16" + CC="ccache gcc" CXX="ccache g++" CLANG="ccache clang" make -s -j${BUILD_JOBS} clean time make -s headerscheck EXTRAFLAGS='-fmax-errors=10' headers_cpluspluscheck_script: | diff --git a/.cirrus.yml b/.cirrus.yml index 33c6e481d746a..3f75852e84ecb 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -10,12 +10,20 @@ # # 1) the contents of this file # -# 2) if defined, the contents of the file referenced by the, repository +# 2) computed environment variables +# +# Used to enable/disable tasks based on the execution environment. See +# .cirrus.star: compute_environment_vars() +# +# 3) if defined, the contents of the file referenced by the, repository # level, REPO_CI_CONFIG_GIT_URL variable (see # https://cirrus-ci.org/guide/programming-tasks/#fs for the accepted # format) # -# 3) .cirrus.tasks.yml +# This allows running tasks in a different execution environment than the +# default, e.g. to have sufficient resources for cfbot. 
+# +# 4) .cirrus.tasks.yml # # This composition is done by .cirrus.star diff --git a/.editorconfig b/.editorconfig index e20d15d4533a4..0ee9bd28ac47d 100644 --- a/.editorconfig +++ b/.editorconfig @@ -85,6 +85,12 @@ insert_final_newline = true indent_style = unset tab_width = unset +[src/backend/utils/misc/postgresql.conf.sample] +trim_trailing_whitespace = true +insert_final_newline = true +indent_style = space +tab_width = unset + [*.out] indent_style = unset indent_size = unset diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index d132a32b975ed..377c3a7379792 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -14,6 +14,36 @@ # # $ git log --pretty=format:"%H # %cd%n# %s" $PGINDENTGITHASH -1 --date=iso +86b276a4a9b4b2c63ef00765f0e2867e1bcac4ca # 2025-11-19 10:41:28 +0100 +# Fix indentation + +f63ae72bbcea057534144eaf27ffe3f6e9267511 # 2025-11-18 10:28:36 -0600 +# Switch from tabs to spaces in postgresql.conf.sample. + +c2b0e3a0351e021dea9b61fe2f759570d3fedb70 # 2025-11-13 14:25:21 +0900 +# Fix indentation issue + +e94a7afe44bfa1bd8dc929204a2d4ac8b3fa9854 # 2025-10-21 09:56:26 -0500 +# Re-pgindent brin.c. + +7e9c216b5236cc61f677787b35e8c8f28f5f6959 # 2025-09-13 14:50:02 -0500 +# Re-pgindent nbtpreprocesskeys.c after commit 796962922e. + +1d1612aec7688139e1a5506df1366b4b6a69605d # 2025-07-29 09:10:41 -0400 +# Run pgindent. + +73873805fb3627cb23937c750fa83ffd8f16fc6c # 2025-07-25 16:36:44 -0400 +# Run pgindent on the changes of the previous patch. + +9e345415bcd3c4358350b89edfd710469b8bfaf9 # 2025-07-01 15:23:07 +0200 +# Fix indentation in pg_numa code + +b27644bade0348d0dafd3036c47880a349fe9332 # 2025-06-15 13:04:24 -0400 +# Sync typedefs.list with the buildfarm. + +4672b6223910687b2aab075bcd2dd54ce90d5171 # 2025-06-01 14:55:24 -0400 +# Run pgindent on the previous commit. + 918e7287ed20eb1fe280ab6c4056ccf94dcd53a8 # 2025-04-30 19:18:30 +1200 # Fix broken indentation diff --git a/.gitattributes b/.gitattributes index 8df6b75e653b8..00092168393ae 100644 --- a/.gitattributes +++ b/.gitattributes @@ -12,13 +12,14 @@ *.xsl whitespace=space-before-tab,trailing-space,tab-in-indent # Avoid confusing ASCII underlines with leftover merge conflict markers -README conflict-marker-size=32 -README.* conflict-marker-size=32 +README conflict-marker-size=48 +README.* conflict-marker-size=48 # Certain data files that contain special whitespace, and other special cases *.data -whitespace contrib/pgcrypto/sql/pgp-armor.sql whitespace=-blank-at-eol src/backend/catalog/sql_features.txt whitespace=space-before-tab,blank-at-eof,-blank-at-eol +src/backend/utils/misc/postgresql.conf.sample whitespace=space-before-tab,trailing-space,tab-in-indent # Test output files that contain extra whitespace *.out -whitespace diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5f3e1d1faf930..1509dbfa2abe7 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -7,10 +7,10 @@ # Select the format archetype to be used by gcc to check printf-type functions. # We prefer "gnu_printf", as that most closely matches the features supported # by src/port/snprintf.c (particularly the %m conversion spec). However, -# on some NetBSD versions, that doesn't work while "__syslog__" does. -# If all else fails, use "printf". +# on clang and on some NetBSD versions, that doesn't work while "__syslog__" +# does. If all else fails, use "printf". 
AC_DEFUN([PGAC_PRINTF_ARCHETYPE], -[AC_CACHE_CHECK([for printf format archetype], pgac_cv_printf_archetype, +[AC_CACHE_CHECK([for C printf format archetype], pgac_cv_printf_archetype, [pgac_cv_printf_archetype=gnu_printf PGAC_TEST_PRINTF_ARCHETYPE if [[ "$ac_archetype_ok" = no ]]; then @@ -20,8 +20,8 @@ if [[ "$ac_archetype_ok" = no ]]; then pgac_cv_printf_archetype=printf fi fi]) -AC_DEFINE_UNQUOTED([PG_PRINTF_ATTRIBUTE], [$pgac_cv_printf_archetype], -[Define to best printf format archetype, usually gnu_printf if available.]) +AC_DEFINE_UNQUOTED([PG_C_PRINTF_ATTRIBUTE], [$pgac_cv_printf_archetype], +[Define to best C printf format archetype, usually gnu_printf if available.]) ])# PGAC_PRINTF_ARCHETYPE # Subroutine: test $pgac_cv_printf_archetype, set $ac_archetype_ok to yes or no @@ -38,6 +38,42 @@ ac_c_werror_flag=$ac_save_c_werror_flag ])# PGAC_TEST_PRINTF_ARCHETYPE +# PGAC_CXX_PRINTF_ARCHETYPE +# ------------------------- +# Because we support using gcc as C compiler with clang as C++ compiler, +# we have to be prepared to use different printf archetypes in C++ code. +# So, do the above test all over in C++. +AC_DEFUN([PGAC_CXX_PRINTF_ARCHETYPE], +[AC_CACHE_CHECK([for C++ printf format archetype], pgac_cv_cxx_printf_archetype, +[pgac_cv_cxx_printf_archetype=gnu_printf +PGAC_TEST_CXX_PRINTF_ARCHETYPE +if [[ "$ac_archetype_ok" = no ]]; then + pgac_cv_cxx_printf_archetype=__syslog__ + PGAC_TEST_CXX_PRINTF_ARCHETYPE + if [[ "$ac_archetype_ok" = no ]]; then + pgac_cv_cxx_printf_archetype=printf + fi +fi]) +AC_DEFINE_UNQUOTED([PG_CXX_PRINTF_ATTRIBUTE], [$pgac_cv_cxx_printf_archetype], +[Define to best C++ printf format archetype, usually gnu_printf if available.]) +])# PGAC_CXX_PRINTF_ARCHETYPE + +# Subroutine: test $pgac_cv_cxx_printf_archetype, set $ac_archetype_ok to yes or no +AC_DEFUN([PGAC_TEST_CXX_PRINTF_ARCHETYPE], +[ac_save_cxx_werror_flag=$ac_cxx_werror_flag +ac_cxx_werror_flag=yes +AC_LANG_PUSH(C++) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM( +[extern void pgac_write(int ignore, const char *fmt,...) +__attribute__((format($pgac_cv_cxx_printf_archetype, 2, 3)));], +[pgac_write(0, "error %s: %m", "foo");])], + [ac_archetype_ok=yes], + [ac_archetype_ok=no]) +AC_LANG_POP([]) +ac_cxx_werror_flag=$ac_save_cxx_werror_flag +])# PGAC_TEST_CXX_PRINTF_ARCHETYPE + + # PGAC_TYPE_128BIT_INT # -------------------- # Check if __int128 is a working 128 bit integer type, and if so @@ -83,7 +119,7 @@ if test x"$pgac_cv__128bit_int" = xyes ; then AC_CACHE_CHECK([for __int128 alignment bug], [pgac_cv__128bit_int_bug], [AC_RUN_IFELSE([AC_LANG_PROGRAM([ /* This must match the corresponding code in c.h: */ -#if defined(__GNUC__) || defined(__SUNPRO_C) +#if defined(__GNUC__) #define pg_attribute_aligned(a) __attribute__((aligned(a))) #elif defined(_MSC_VER) #define pg_attribute_aligned(a) __declspec(align(a)) @@ -114,23 +150,19 @@ fi])# PGAC_TYPE_128BIT_INT -# PGAC_C_STATIC_ASSERT -# -------------------- -# Check if the C compiler understands _Static_assert(), -# and define HAVE__STATIC_ASSERT if so. -# -# We actually check the syntax ({ _Static_assert(...) }), because we need -# gcc-style compound expressions to be able to wrap the thing into macros. -AC_DEFUN([PGAC_C_STATIC_ASSERT], -[AC_CACHE_CHECK(for _Static_assert, pgac_cv__static_assert, +# PGAC_C_STATEMENT_EXPRESSIONS +# ---------------------------- +# Check if the C compiler understands GCC statement expressions. 
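+# (Annotation, not part of the patch.)  The probe body is unchanged from
+# the old PGAC_C_STATIC_ASSERT test: now that C11 is required,
+# _Static_assert itself is a given, so the ({ ... }) wrapper is what the
+# test actually exercises.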
+AC_DEFUN([PGAC_C_STATEMENT_EXPRESSIONS], +[AC_CACHE_CHECK(for statement expressions, pgac_cv_statement_expressions, [AC_LINK_IFELSE([AC_LANG_PROGRAM([], [({ _Static_assert(1, "foo"); })])], -[pgac_cv__static_assert=yes], -[pgac_cv__static_assert=no])]) -if test x"$pgac_cv__static_assert" = xyes ; then -AC_DEFINE(HAVE__STATIC_ASSERT, 1, - [Define to 1 if your compiler understands _Static_assert.]) -fi])# PGAC_C_STATIC_ASSERT +[pgac_cv_statement_expressions=yes], +[pgac_cv_statement_expressions=no])]) +if test x"$pgac_cv_statement_expressions" = xyes ; then +AC_DEFINE(HAVE_STATEMENT_EXPRESSIONS, 1, + [Define to 1 if your compiler supports statement expressions.]) +fi])# PGAC_C_STATEMENT_EXPRESSIONS @@ -602,6 +634,7 @@ AC_CACHE_CHECK([for _mm512_clmulepi64_epi128], [Ac_cachevar], { __m128i z; + x = _mm512_xor_si512(_mm512_zextsi128_si512(_mm_cvtsi32_si128(0)), x); y = _mm512_clmulepi64_epi128(x, y, 0); z = _mm_ternarylogic_epi64( _mm512_castsi512_si128(y), diff --git a/config/llvm.m4 b/config/llvm.m4 index fa4bedd9370fc..9d6fe8199e364 100644 --- a/config/llvm.m4 +++ b/config/llvm.m4 @@ -4,7 +4,7 @@ # ----------------- # # Look for the LLVM installation, check that it's new enough, set the -# corresponding LLVM_{CFLAGS,CXXFLAGS,BINPATH} and LDFLAGS +# corresponding LLVM_{CFLAGS,CXXFLAGS,BINPATH,LIBS} # variables. Also verify that CLANG is available, to transform C # into bitcode. # @@ -55,7 +55,7 @@ AC_DEFUN([PGAC_LLVM_SUPPORT], for pgac_option in `$LLVM_CONFIG --ldflags`; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LLVM_LIBS="$LLVM_LIBS $pgac_option";; esac done diff --git a/config/prep_buildtree b/config/prep_buildtree index a0eabd3dee288..e148535ac112e 100644 --- a/config/prep_buildtree +++ b/config/prep_buildtree @@ -22,18 +22,14 @@ sourcetree=`cd $1 && pwd` buildtree=`cd ${2:-'.'} && pwd` -# We must not auto-create the subdirectories holding built documentation. -# If we did, it would interfere with installation of prebuilt docs from -# the source tree, if a VPATH build is done from a distribution tarball. -# See bug #5595. -for item in `find "$sourcetree" -type d \( \( -name CVS -prune \) -o \( -name .git -prune \) -o -print \) | grep -v "$sourcetree/doc/src/sgml/\+"`; do +for item in `find "$sourcetree"/config "$sourcetree"/contrib "$sourcetree"/doc "$sourcetree"/src -type d -print`; do subdir=`expr "$item" : "$sourcetree\(.*\)"` if test ! -d "$buildtree/$subdir"; then mkdir -p "$buildtree/$subdir" || exit 1 fi done -for item in `find "$sourcetree" -name Makefile -print -o -name GNUmakefile -print | grep -v "$sourcetree/doc/src/sgml/images/"`; do +for item in "$sourcetree"/Makefile `find "$sourcetree"/config "$sourcetree"/contrib "$sourcetree"/doc "$sourcetree"/src -name Makefile -print -o -name GNUmakefile -print`; do filename=`expr "$item" : "$sourcetree\(.*\)"` if test ! -f "${item}.in"; then if cmp "$item" "$buildtree/$filename" >/dev/null 2>&1; then : ; else diff --git a/config/programs.m4 b/config/programs.m4 index 0ad1e58b48d6b..e57fe4907b844 100644 --- a/config/programs.m4 +++ b/config/programs.m4 @@ -284,20 +284,26 @@ AC_DEFUN([PGAC_CHECK_STRIP], AC_DEFUN([PGAC_CHECK_LIBCURL], [ + # libcurl compiler/linker flags are kept separate from the global flags, so + # they have to be added back temporarily for the following tests. 
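+  # (Annotation, not part of the patch.)  This is the standard autoconf
+  # save/adjust/restore idiom; moving it ahead of AC_CHECK_HEADER means
+  # the curl/curl.h probe now also sees LIBCURL_CPPFLAGS, which it did
+  # not before.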
+ pgac_save_CPPFLAGS=$CPPFLAGS + pgac_save_LDFLAGS=$LDFLAGS + pgac_save_LIBS=$LIBS + + CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS" + LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS" + AC_CHECK_HEADER(curl/curl.h, [], [AC_MSG_ERROR([header file is required for --with-libcurl])]) + + # LIBCURL_LDLIBS is determined here. Like the compiler flags, it should not + # pollute the global LIBS setting. AC_CHECK_LIB(curl, curl_multi_init, [ AC_DEFINE([HAVE_LIBCURL], [1], [Define to 1 if you have the `curl' library (-lcurl).]) AC_SUBST(LIBCURL_LDLIBS, -lcurl) ], [AC_MSG_ERROR([library 'curl' does not provide curl_multi_init])]) - pgac_save_CPPFLAGS=$CPPFLAGS - pgac_save_LDFLAGS=$LDFLAGS - pgac_save_LIBS=$LIBS - - CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS" - LDFLAGS="$LIBCURL_LDFLAGS $LDFLAGS" LIBS="$LIBCURL_LDLIBS $LIBS" # Check to see whether the current platform supports threadsafe Curl diff --git a/configure b/configure index 4f15347cc9503..14ad0a5006fa4 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for PostgreSQL 18beta1. +# Generated by GNU Autoconf 2.69 for PostgreSQL 19devel. # # Report bugs to . # @@ -582,8 +582,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='PostgreSQL' PACKAGE_TARNAME='postgresql' -PACKAGE_VERSION='18beta1' -PACKAGE_STRING='PostgreSQL 18beta1' +PACKAGE_VERSION='19devel' +PACKAGE_STRING='PostgreSQL 19devel' PACKAGE_BUGREPORT='pgsql-bugs@lists.postgresql.org' PACKAGE_URL='https://www.postgresql.org/' @@ -682,6 +682,7 @@ FLEXFLAGS FLEX BISONFLAGS BISON +NM MKDIR_P LN_S TAR @@ -739,7 +740,6 @@ PKG_CONFIG_LIBDIR PKG_CONFIG_PATH PKG_CONFIG DLSUFFIX -TAS GCC CPP CFLAGS_SL @@ -760,7 +760,6 @@ CLANG LLVM_CONFIG AWK with_llvm -SUN_STUDIO_CC ac_ct_CXX CXXFLAGS CXX @@ -1468,7 +1467,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures PostgreSQL 18beta1 to adapt to many kinds of systems. +\`configure' configures PostgreSQL 19devel to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1533,7 +1532,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of PostgreSQL 18beta1:";; + short | recursive ) echo "Configuration of PostgreSQL 19devel:";; esac cat <<\_ACEOF @@ -1724,7 +1723,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -PostgreSQL configure 18beta1 +PostgreSQL configure 19devel generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2477,7 +2476,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by PostgreSQL $as_me 18beta1, which was +It was created by PostgreSQL $as_me 19devel, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -3059,12 +3058,6 @@ $as_echo "$template" >&6; } PORTNAME=$template -# Initialize default assumption that we do not need separate assembly code -# for TAS (test-and-set). This can be overridden by the template file -# when it's executed. 
-need_tas=no -tas_file=dummy.s - # Default, works for most platforms, override in template file if needed DLSUFFIX=".so" @@ -4475,190 +4468,49 @@ ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C99" >&5 -$as_echo_n "checking for $CC option to accept ISO C99... " >&6; } -if ${ac_cv_prog_cc_c99+:} false; then : + +# Detect option needed for C11 +# loosely modeled after code in later Autoconf versions +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C11" >&5 +$as_echo_n "checking for $CC option to accept ISO C11... " >&6; } + +if ${pgac_cv_prog_cc_c11+:} false; then : $as_echo_n "(cached) " >&6 else - ac_cv_prog_cc_c99=no -ac_save_CC=$CC -cat confdefs.h - <<_ACEOF >conftest.$ac_ext + pgac_cv_prog_cc_c11=no +pgac_save_CC=$CC +for pgac_arg in '' '-std=gnu11' '-std=c11'; do + CC="$pgac_save_CC $pgac_arg" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#include -#include -#include -#include -#include - -// Check varargs macros. These examples are taken from C99 6.10.3.5. -#define debug(...) fprintf (stderr, __VA_ARGS__) -#define showlist(...) puts (#__VA_ARGS__) -#define report(test,...) ((test) ? puts (#test) : printf (__VA_ARGS__)) -static void -test_varargs_macros (void) -{ - int x = 1234; - int y = 5678; - debug ("Flag"); - debug ("X = %d\n", x); - showlist (The first, second, and third items.); - report (x>y, "x is %d but y is %d", x, y); -} - -// Check long long types. -#define BIG64 18446744073709551615ull -#define BIG32 4294967295ul -#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0) -#if !BIG_OK - your preprocessor is broken; -#endif -#if BIG_OK -#else - your preprocessor is broken; +#if !defined __STDC_VERSION__ || __STDC_VERSION__ < 201112L +# error "Compiler does not advertise C11 conformance" #endif -static long long int bignum = -9223372036854775807LL; -static unsigned long long int ubignum = BIG64; - -struct incomplete_array -{ - int datasize; - double data[]; -}; - -struct named_init { - int number; - const wchar_t *name; - double average; -}; - -typedef const char *ccp; - -static inline int -test_restrict (ccp restrict text) -{ - // See if C++-style comments work. - // Iterate through items via the restricted pointer. - // Also check for declarations in for loops. - for (unsigned int i = 0; *(text+i) != '\0'; ++i) - continue; - return 0; -} - -// Check varargs and va_copy. -static void -test_varargs (const char *format, ...) -{ - va_list args; - va_start (args, format); - va_list args_copy; - va_copy (args_copy, args); - - const char *str; - int number; - float fnumber; - - while (*format) - { - switch (*format++) - { - case 's': // string - str = va_arg (args_copy, const char *); - break; - case 'd': // int - number = va_arg (args_copy, int); - break; - case 'f': // float - fnumber = va_arg (args_copy, double); - break; - default: - break; - } - } - va_end (args_copy); - va_end (args); -} - -int -main () -{ - - // Check bool. - _Bool success = false; - - // Check restrict. - if (test_restrict ("String literal") == 0) - success = true; - char *restrict newvar = "Another string"; - - // Check varargs. - test_varargs ("s, d' f .", "string", 65, 34.234); - test_varargs_macros (); - - // Check flexible array members. 
- struct incomplete_array *ia = - malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10)); - ia->datasize = 10; - for (int i = 0; i < ia->datasize; ++i) - ia->data[i] = i * 1.234; - - // Check named initializers. - struct named_init ni = { - .number = 34, - .name = L"Test wide string", - .average = 543.34343, - }; - - ni.number = 58; - - int dynamic_array[ni.number]; - dynamic_array[ni.number - 1] = 543; - - // work around unused variable warnings - return (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x' - || dynamic_array[ni.number - 1] != 543); - - ; - return 0; -} _ACEOF -for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -D_STDC_C99= -qlanglvl=extc99 -do - CC="$ac_save_CC $ac_arg" - if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_prog_cc_c99=$ac_arg +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_cc_c11=$pgac_arg fi -rm -f core conftest.err conftest.$ac_objext - test "x$ac_cv_prog_cc_c99" != "xno" && break +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + test x"$pgac_cv_prog_cc_c11" != x"no" && break done -rm -f conftest.$ac_ext -CC=$ac_save_CC - +CC=$pgac_save_CC fi -# AC_CACHE_VAL -case "x$ac_cv_prog_cc_c99" in - x) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 -$as_echo "none needed" >&6; } ;; - xno) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 -$as_echo "unsupported" >&6; } ;; - *) - CC="$CC $ac_cv_prog_cc_c99" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5 -$as_echo "$ac_cv_prog_cc_c99" >&6; } ;; -esac -if test "x$ac_cv_prog_cc_c99" != xno; then : -fi - - -# Error out if the compiler does not support C99, as the codebase -# relies on that. -if test "$ac_cv_prog_cc_c99" = no; then - as_fn_error $? "C compiler \"$CC\" does not support C99" "$LINENO" 5 +if test x"$pgac_cv_prog_cc_c11" = x"no"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } + as_fn_error $? "C compiler \"$CC\" does not support C11" "$LINENO" 5 +elif test x"$pgac_cv_prog_cc_c11" = x""; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_cc_c11" >&5 +$as_echo "$pgac_cv_prog_cc_c11" >&6; } + CC="$CC $pgac_cv_prog_cc_c11" fi + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' @@ -4920,7 +4772,6 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu # Check if it's Intel's compiler, which (usually) pretends to be gcc, # but has idiosyncrasies of its own. We assume icc will define # __INTEL_COMPILER regardless of CFLAGS. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -4941,30 +4792,6 @@ else fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -# Check if it's Sun Studio compiler. We assume that -# __SUNPRO_C will be defined for Sun Studio compilers -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ - -int -main () -{ -#ifndef __SUNPRO_C -choke me -#endif - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - SUN_STUDIO_CC=yes -else - SUN_STUDIO_CC=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - - - # # LLVM @@ -5194,7 +5021,7 @@ fi for pgac_option in `$LLVM_CONFIG --ldflags`; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LLVM_LIBS="$LLVM_LIBS $pgac_option";; esac done @@ -6890,7 +6717,7 @@ fi # __attribute__((visibility("hidden"))) is supported, if we encounter a # compiler that supports one of the supported variants of -fvisibility=hidden # but uses a different syntax to mark a symbol as exported. -if test "$GCC" = yes -o "$SUN_STUDIO_CC" = yes ; then +if test "$GCC" = yes; then { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -fvisibility=hidden, for CFLAGS_SL_MODULE" >&5 $as_echo_n "checking whether ${CC} supports -fvisibility=hidden, for CFLAGS_SL_MODULE... " >&6; } if ${pgac_cv_prog_CC_cflags__fvisibility_hidden+:} false; then : @@ -7873,20 +7700,6 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu -# -# Set up TAS assembly code if needed; the template file has now had its -# chance to request this. -# -ac_config_links="$ac_config_links src/backend/port/tas.s:src/backend/port/tas/${tas_file}" - - -if test "$need_tas" = yes ; then - TAS=tas.o -else - TAS="" -fi - - cat >>confdefs.h <<_ACEOF #define DLSUFFIX "$DLSUFFIX" @@ -9436,12 +9249,12 @@ fi # Note the user could also set XML2_CFLAGS/XML2_LIBS directly for pgac_option in $XML2_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $XML2_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -9666,12 +9479,12 @@ fi # note that -llz4 will be added by AC_CHECK_LIB below. for pgac_option in $LZ4_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $LZ4_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -9807,12 +9620,12 @@ fi # note that -lzstd will be added by AC_CHECK_LIB below. for pgac_option in $ZSTD_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $ZSTD_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -10350,6 +10163,47 @@ case $MKDIR_P in *install-sh*) MKDIR_P='\${SHELL} \${top_srcdir}/config/install-sh -c -d';; esac +# Extract the first word of "nm", so it can be a program name with args. +set dummy nm; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_NM+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $NM in + [\\/]* | ?:[\\/]*) + ac_cv_path_NM="$NM" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_NM="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +NM=$ac_cv_path_NM +if test -n "$NM"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $NM" >&5 +$as_echo "$NM" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + if test -z "$BISON"; then for ac_prog in bison do @@ -12717,6 +12571,15 @@ fi if test "$with_libcurl" = yes ; then + # libcurl compiler/linker flags are kept separate from the global flags, so + # they have to be added back temporarily for the following tests. + pgac_save_CPPFLAGS=$CPPFLAGS + pgac_save_LDFLAGS=$LDFLAGS + pgac_save_LIBS=$LIBS + + CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS" + LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS" + ac_fn_c_check_header_mongrel "$LINENO" "curl/curl.h" "ac_cv_header_curl_curl_h" "$ac_includes_default" if test "x$ac_cv_header_curl_curl_h" = xyes; then : @@ -12725,6 +12588,9 @@ else fi + + # LIBCURL_LDLIBS is determined here. Like the compiler flags, it should not + # pollute the global LIBS setting. { $as_echo "$as_me:${as_lineno-$LINENO}: checking for curl_multi_init in -lcurl" >&5 $as_echo_n "checking for curl_multi_init in -lcurl... " >&6; } if ${ac_cv_lib_curl_curl_multi_init+:} false; then : @@ -12774,12 +12640,6 @@ else fi - pgac_save_CPPFLAGS=$CPPFLAGS - pgac_save_LDFLAGS=$LDFLAGS - pgac_save_LIBS=$LIBS - - CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS" - LDFLAGS="$LIBCURL_LDFLAGS $LDFLAGS" LIBS="$LIBCURL_LDLIBS $LIBS" # Check to see whether the current platform supports threadsafe Curl @@ -13309,6 +13169,23 @@ fi fi +if test "$with_liburing" = yes; then + _LIBS="$LIBS" + LIBS="$LIBURING_LIBS $LIBS" + for ac_func in io_uring_queue_init_mem +do : + ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem" +if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_IO_URING_QUEUE_INIT_MEM 1 +_ACEOF + +fi +done + + LIBS="$_LIBS" +fi + if test "$with_lz4" = yes ; then { $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5 $as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; } @@ -13792,7 +13669,7 @@ fi ## Header files ## -for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h ucred.h xlocale.h +for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h uchar.h ucred.h xlocale.h do : as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" @@ -14814,8 +14691,8 @@ _ACEOF ;; esac -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for printf format archetype" >&5 -$as_echo_n "checking for printf format archetype... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C printf format archetype" >&5 +$as_echo_n "checking for C printf format archetype... 
" >&6; } if ${pgac_cv_printf_archetype+:} false; then : $as_echo_n "(cached) " >&6 else @@ -14875,13 +14752,103 @@ fi $as_echo "$pgac_cv_printf_archetype" >&6; } cat >>confdefs.h <<_ACEOF -#define PG_PRINTF_ATTRIBUTE $pgac_cv_printf_archetype +#define PG_C_PRINTF_ATTRIBUTE $pgac_cv_printf_archetype _ACEOF -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _Static_assert" >&5 -$as_echo_n "checking for _Static_assert... " >&6; } -if ${pgac_cv__static_assert+:} false; then : + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ printf format archetype" >&5 +$as_echo_n "checking for C++ printf format archetype... " >&6; } +if ${pgac_cv_cxx_printf_archetype+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_cv_cxx_printf_archetype=gnu_printf +ac_save_cxx_werror_flag=$ac_cxx_werror_flag +ac_cxx_werror_flag=yes +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +extern void pgac_write(int ignore, const char *fmt,...) +__attribute__((format($pgac_cv_cxx_printf_archetype, 2, 3))); +int +main () +{ +pgac_write(0, "error %s: %m", "foo"); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_archetype_ok=yes +else + ac_archetype_ok=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_cxx_werror_flag=$ac_save_cxx_werror_flag + +if [ "$ac_archetype_ok" = no ]; then + pgac_cv_cxx_printf_archetype=__syslog__ + ac_save_cxx_werror_flag=$ac_cxx_werror_flag +ac_cxx_werror_flag=yes +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +extern void pgac_write(int ignore, const char *fmt,...) +__attribute__((format($pgac_cv_cxx_printf_archetype, 2, 3))); +int +main () +{ +pgac_write(0, "error %s: %m", "foo"); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_archetype_ok=yes +else + ac_archetype_ok=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_cxx_werror_flag=$ac_save_cxx_werror_flag + + if [ "$ac_archetype_ok" = no ]; then + pgac_cv_cxx_printf_archetype=printf + fi +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_cxx_printf_archetype" >&5 +$as_echo "$pgac_cv_cxx_printf_archetype" >&6; } + +cat >>confdefs.h <<_ACEOF +#define PG_CXX_PRINTF_ATTRIBUTE $pgac_cv_cxx_printf_archetype +_ACEOF + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for statement expressions" >&5 +$as_echo_n "checking for statement expressions... 
" >&6; } +if ${pgac_cv_statement_expressions+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -14896,18 +14863,18 @@ main () } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv__static_assert=yes + pgac_cv_statement_expressions=yes else - pgac_cv__static_assert=no + pgac_cv_statement_expressions=no fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__static_assert" >&5 -$as_echo "$pgac_cv__static_assert" >&6; } -if test x"$pgac_cv__static_assert" = xyes ; then +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_statement_expressions" >&5 +$as_echo "$pgac_cv_statement_expressions" >&6; } +if test x"$pgac_cv_statement_expressions" = xyes ; then -$as_echo "#define HAVE__STATIC_ASSERT 1" >>confdefs.h +$as_echo "#define HAVE_STATEMENT_EXPRESSIONS 1" >>confdefs.h fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for typeof" >&5 @@ -15164,10 +15131,10 @@ _ACEOF fi -# MSVC doesn't cope well with defining restrict to __restrict, the -# spelling it understands, because it conflicts with -# __declspec(restrict). Therefore we define pg_restrict to the -# appropriate definition, which presumably won't conflict. +# Even though restrict is in C99 and should be supported by all +# supported compilers, this test is useful because it will prefer a +# spelling that also works in C++ (often __restrict). (restrict is +# not part of the C++ standard.) { $as_echo "$as_me:${as_lineno-$LINENO}: checking for C/C++ restrict keyword" >&5 $as_echo_n "checking for C/C++ restrict keyword... " >&6; } if ${ac_cv_c_restrict+:} false; then : @@ -15214,16 +15181,6 @@ _ACEOF ;; esac -if test "$ac_cv_c_restrict" = "no"; then - pg_restrict="" -else - pg_restrict="$ac_cv_c_restrict" -fi - -cat >>confdefs.h <<_ACEOF -#define pg_restrict $pg_restrict -_ACEOF - ac_fn_c_check_type "$LINENO" "struct option" "ac_cv_type_struct_option" "#ifdef HAVE_GETOPT_H #include @@ -15562,7 +15519,7 @@ _ACEOF # If we don't have largefile support, can't handle segment size >= 2GB. if test "$ac_cv_sizeof_off_t" -lt 8; then - if expr $RELSEG_SIZE '*' $blocksize '>=' 2 '*' 1024 '*' 1024; then + if expr $RELSEG_SIZE '*' $blocksize '>=' 2 '*' 1024 '*' 1024 >/dev/null; then as_fn_error $? "Large file support is not enabled. Segment size cannot be larger than 1GB." "$LINENO" 5 fi fi @@ -16635,7 +16592,7 @@ fi if test "$with_icu" = yes; then ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$ICU_CFLAGS $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $ICU_CFLAGS" # Verify we have ICU's header files ac_fn_c_check_header_mongrel "$LINENO" "unicode/ucol.h" "ac_cv_header_unicode_ucol_h" "$ac_includes_default" @@ -16986,6 +16943,39 @@ cat >>confdefs.h <<_ACEOF _ACEOF +# The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of intmax_t" >&5 +$as_echo_n "checking size of intmax_t... 
" >&6; } +if ${ac_cv_sizeof_intmax_t+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (intmax_t))" "ac_cv_sizeof_intmax_t" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_intmax_t" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (intmax_t) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_intmax_t=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_intmax_t" >&5 +$as_echo "$ac_cv_sizeof_intmax_t" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_INTMAX_T $ac_cv_sizeof_intmax_t +_ACEOF + + # Determine memory alignment requirements for the basic C data types. @@ -17260,7 +17250,7 @@ else /* end confdefs.h. */ /* This must match the corresponding code in c.h: */ -#if defined(__GNUC__) || defined(__SUNPRO_C) +#if defined(__GNUC__) #define pg_attribute_aligned(a) __attribute__((aligned(a))) #elif defined(_MSC_VER) #define pg_attribute_aligned(a) __declspec(align(a)) @@ -17542,7 +17532,7 @@ $as_echo "#define HAVE_GCC__ATOMIC_INT64_CAS 1" >>confdefs.h fi -# Check for x86 cpuid instruction +# Check for __get_cpuid() and __cpuid() { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid" >&5 $as_echo_n "checking for __get_cpuid... " >&6; } if ${pgac_cv__get_cpuid+:} false; then : @@ -17575,77 +17565,79 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h -fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 -$as_echo_n "checking for __get_cpuid_count... " >&6; } -if ${pgac_cv__get_cpuid_count+:} false; then : +else + # __cpuid() + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5 +$as_echo_n "checking for __cpuid... " >&6; } +if ${pgac_cv__cpuid+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#include +#include int main () { unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + __cpuid(exx, 1); ; return 0; } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv__get_cpuid_count="yes" + pgac_cv__cpuid="yes" else - pgac_cv__get_cpuid_count="no" + pgac_cv__cpuid="no" fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5 -$as_echo "$pgac_cv__get_cpuid_count" >&6; } -if test x"$pgac_cv__get_cpuid_count" = x"yes"; then +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuid" >&5 +$as_echo "$pgac_cv__cpuid" >&6; } + if test x"$pgac_cv__cpuid" = x"yes"; then -$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h +$as_echo "#define HAVE__CPUID 1" >>confdefs.h + fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5 -$as_echo_n "checking for __cpuid... " >&6; } -if ${pgac_cv__cpuid+:} false; then : +# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 +$as_echo_n "checking for __get_cpuid_count... " >&6; } +if ${pgac_cv__get_cpuid_count+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ -#include +#include int main () { unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuid(exx[0], 1); + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); ; return 0; } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv__cpuid="yes" + pgac_cv__get_cpuid_count="yes" else - pgac_cv__cpuid="no" + pgac_cv__get_cpuid_count="no" fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuid" >&5 -$as_echo "$pgac_cv__cpuid" >&6; } -if test x"$pgac_cv__cpuid" = x"yes"; then - -$as_echo "#define HAVE__CPUID 1" >>confdefs.h +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5 +$as_echo "$pgac_cv__get_cpuid_count" >&6; } +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then -fi +$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +else + # __cpuidex() + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 $as_echo_n "checking for __cpuidex... " >&6; } if ${pgac_cv__cpuidex+:} false; then : $as_echo_n "(cached) " >&6 @@ -17657,7 +17649,7 @@ int main () { unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuidex(exx[0], 7, 0); + __cpuidex(exx, 7, 0); ; return 0; @@ -17673,10 +17665,11 @@ rm -f core conftest.err conftest.$ac_objext \ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 $as_echo "$pgac_cv__cpuidex" >&6; } -if test x"$pgac_cv__cpuidex" = x"yes"; then + if test x"$pgac_cv__cpuidex" = x"yes"; then $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h + fi fi # Check for XSAVE intrinsics @@ -18227,6 +18220,7 @@ else { __m128i z; + x = _mm512_xor_si512(_mm512_zextsi128_si512(_mm_cvtsi32_si128(0)), x); y = _mm512_clmulepi64_epi128(x, y, 0); z = _mm_ternarylogic_epi64( _mm512_castsi512_si128(y), @@ -18852,7 +18846,7 @@ Use --without-tcl to disable building PL/Tcl." "$LINENO" 5 fi # now that we have TCL_INCLUDE_SPEC, we can check for ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$TCL_INCLUDE_SPEC $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $TCL_INCLUDE_SPEC" ac_fn_c_check_header_mongrel "$LINENO" "tcl.h" "ac_cv_header_tcl_h" "$ac_includes_default" if test "x$ac_cv_header_tcl_h" = xyes; then : @@ -18921,7 +18915,7 @@ fi # check for if test "$with_python" = yes; then ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$python_includespec $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $python_includespec" ac_fn_c_check_header_mongrel "$LINENO" "Python.h" "ac_cv_header_Python_h" "$ac_includes_default" if test "x$ac_cv_header_Python_h" = xyes; then : @@ -19459,8 +19453,6 @@ fi if test x"$GCC" = x"yes" ; then cc_string=`${CC} --version | sed q` case $cc_string in [A-Za-z]*) ;; *) cc_string="GCC $cc_string";; esac -elif test x"$SUN_STUDIO_CC" = x"yes" ; then - cc_string=`${CC} -V 2>&1 | sed q` else cc_string=$CC fi @@ -20062,7 +20054,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by PostgreSQL $as_me 18beta1, which was +This file was extended by PostgreSQL $as_me 19devel, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -20133,7 +20125,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -PostgreSQL config.status 18beta1 +PostgreSQL config.status 19devel configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" @@ -20257,7 +20249,6 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 for ac_config_target in $ac_config_targets do case $ac_config_target in - "src/backend/port/tas.s") CONFIG_LINKS="$CONFIG_LINKS src/backend/port/tas.s:src/backend/port/tas/${tas_file}" ;; "GNUmakefile") CONFIG_FILES="$CONFIG_FILES GNUmakefile" ;; "src/Makefile.global") CONFIG_FILES="$CONFIG_FILES src/Makefile.global" ;; "src/backend/port/pg_sema.c") CONFIG_LINKS="$CONFIG_LINKS src/backend/port/pg_sema.c:${SEMA_IMPLEMENTATION}" ;; diff --git a/configure.ac b/configure.ac index 4b8335dc6138e..01b3bbc1be82a 100644 --- a/configure.ac +++ b/configure.ac @@ -17,7 +17,7 @@ dnl Read the Autoconf manual for details. dnl m4_pattern_forbid(^PGAC_)dnl to catch undefined macros -AC_INIT([PostgreSQL], [18beta1], [pgsql-bugs@lists.postgresql.org], [], [https://www.postgresql.org/]) +AC_INIT([PostgreSQL], [19devel], [pgsql-bugs@lists.postgresql.org], [], [https://www.postgresql.org/]) m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required. Untested combinations of 'autoconf' and PostgreSQL versions are not @@ -95,12 +95,6 @@ AC_MSG_RESULT([$template]) PORTNAME=$template AC_SUBST(PORTNAME) -# Initialize default assumption that we do not need separate assembly code -# for TAS (test-and-set). This can be overridden by the template file -# when it's executed. -need_tas=no -tas_file=dummy.s - # Default, works for most platforms, override in template file if needed DLSUFFIX=".so" @@ -364,14 +358,33 @@ pgac_cc_list="gcc cc" pgac_cxx_list="g++ c++" AC_PROG_CC([$pgac_cc_list]) -AC_PROG_CC_C99() -# Error out if the compiler does not support C99, as the codebase -# relies on that. -if test "$ac_cv_prog_cc_c99" = no; then - AC_MSG_ERROR([C compiler "$CC" does not support C99]) +# Detect option needed for C11 +# loosely modeled after code in later Autoconf versions +AC_MSG_CHECKING([for $CC option to accept ISO C11]) +AC_CACHE_VAL([pgac_cv_prog_cc_c11], +[pgac_cv_prog_cc_c11=no +pgac_save_CC=$CC +for pgac_arg in '' '-std=gnu11' '-std=c11'; do + CC="$pgac_save_CC $pgac_arg" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#if !defined __STDC_VERSION__ || __STDC_VERSION__ < 201112L +# error "Compiler does not advertise C11 conformance" +#endif]])], [[pgac_cv_prog_cc_c11=$pgac_arg]]) + test x"$pgac_cv_prog_cc_c11" != x"no" && break +done +CC=$pgac_save_CC]) + +if test x"$pgac_cv_prog_cc_c11" = x"no"; then + AC_MSG_RESULT([unsupported]) + AC_MSG_ERROR([C compiler "$CC" does not support C11]) +elif test x"$pgac_cv_prog_cc_c11" = x""; then + AC_MSG_RESULT([none needed]) +else + AC_MSG_RESULT([$pgac_cv_prog_cc_c11]) + CC="$CC $pgac_cv_prog_cc_c11" fi + AC_PROG_CXX([$pgac_cxx_list]) # Check if it's Intel's compiler, which (usually) pretends to be gcc, @@ -381,14 +394,6 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [@%:@ifndef __INTEL_COMPILER choke me @%:@endif])], [ICC=yes], [ICC=no]) -# Check if it's Sun Studio compiler. 
We assume that -# __SUNPRO_C will be defined for Sun Studio compilers -AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [@%:@ifndef __SUNPRO_C -choke me -@%:@endif])], [SUN_STUDIO_CC=yes], [SUN_STUDIO_CC=no]) - -AC_SUBST(SUN_STUDIO_CC) - # # LLVM @@ -599,7 +604,7 @@ fi # __attribute__((visibility("hidden"))) is supported, if we encounter a # compiler that supports one of the supported variants of -fvisibility=hidden # but uses a different syntax to mark a symbol as exported. -if test "$GCC" = yes -o "$SUN_STUDIO_CC" = yes ; then +if test "$GCC" = yes; then PGAC_PROG_CC_VAR_OPT(CFLAGS_SL_MODULE, [-fvisibility=hidden]) # For C++ we additionally want -fvisibility-inlines-hidden PGAC_PROG_VARCXX_VARFLAGS_OPT(CXX, CXXFLAGS_SL_MODULE, [-fvisibility=hidden]) @@ -755,19 +760,6 @@ AC_PROG_CPP AC_SUBST(GCC) -# -# Set up TAS assembly code if needed; the template file has now had its -# chance to request this. -# -AC_CONFIG_LINKS([src/backend/port/tas.s:src/backend/port/tas/${tas_file}]) - -if test "$need_tas" = yes ; then - TAS=tas.o -else - TAS="" -fi -AC_SUBST(TAS) - AC_SUBST(DLSUFFIX)dnl AC_DEFINE_UNQUOTED([DLSUFFIX], ["$DLSUFFIX"], [Define to the file name extension of dynamically-loadable modules.]) @@ -1103,12 +1095,12 @@ if test "$with_libxml" = yes ; then # Note the user could also set XML2_CFLAGS/XML2_LIBS directly for pgac_option in $XML2_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $XML2_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -1152,12 +1144,12 @@ if test "$with_lz4" = yes; then # note that -llz4 will be added by AC_CHECK_LIB below. for pgac_option in $LZ4_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $LZ4_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -1177,12 +1169,12 @@ if test "$with_zstd" = yes; then # note that -lzstd will be added by AC_CHECK_LIB below. for pgac_option in $ZSTD_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $ZSTD_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -1222,6 +1214,8 @@ case $MKDIR_P in *install-sh*) MKDIR_P='\${SHELL} \${top_srcdir}/config/install-sh -c -d';; esac +AC_PATH_PROG(NM, nm) +AC_SUBST(NM) PGAC_PATH_BISON PGAC_PATH_FLEX @@ -1420,6 +1414,13 @@ if test "$with_libxslt" = yes ; then AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])]) fi +if test "$with_liburing" = yes; then + _LIBS="$LIBS" + LIBS="$LIBURING_LIBS $LIBS" + AC_CHECK_FUNCS([io_uring_queue_init_mem]) + LIBS="$_LIBS" +fi + if test "$with_lz4" = yes ; then AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])]) fi @@ -1514,6 +1515,7 @@ AC_CHECK_HEADERS(m4_normalize([ sys/signalfd.h sys/ucred.h termios.h + uchar.h ucred.h xlocale.h ])) @@ -1674,7 +1676,8 @@ m4_defun([AC_PROG_CC_STDC], []) dnl We don't want that. 
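[Editorial aside, not part of the patch: the hand-rolled C11 detection added to configure.ac above compiles, once per candidate flag ('', -std=gnu11, -std=c11), a probe roughly equivalent to the following minimal translation unit, and the first flag that makes it compile is appended to CC. This is a sketch only; the real probe body is generated by AC_LANG_SOURCE.]

/* Fails to compile unless the compiler advertises C11 conformance. */
#if !defined __STDC_VERSION__ || __STDC_VERSION__ < 201112L
#error "Compiler does not advertise C11 conformance"
#endif

int
main(void)
{
	return 0;
}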
AC_C_BIGENDIAN AC_C_INLINE PGAC_PRINTF_ARCHETYPE -PGAC_C_STATIC_ASSERT +PGAC_CXX_PRINTF_ARCHETYPE +PGAC_C_STATEMENT_EXPRESSIONS PGAC_C_TYPEOF PGAC_C_TYPES_COMPATIBLE PGAC_C_BUILTIN_CONSTANT_P @@ -1686,19 +1689,11 @@ PGAC_UNION_SEMUN AC_CHECK_TYPES(socklen_t, [], [], [#include ]) PGAC_STRUCT_SOCKADDR_SA_LEN -# MSVC doesn't cope well with defining restrict to __restrict, the -# spelling it understands, because it conflicts with -# __declspec(restrict). Therefore we define pg_restrict to the -# appropriate definition, which presumably won't conflict. +# Even though restrict is in C99 and should be supported by all +# supported compilers, this test is useful because it will prefer a +# spelling that also works in C++ (often __restrict). (restrict is +# not part of the C++ standard.) AC_C_RESTRICT -if test "$ac_cv_c_restrict" = "no"; then - pg_restrict="" -else - pg_restrict="$ac_cv_c_restrict" -fi -AC_DEFINE_UNQUOTED([pg_restrict], [$pg_restrict], -[Define to keyword to use for C99 restrict support, or to nothing if not -supported]) AC_CHECK_TYPES([struct option], [], [], [#ifdef HAVE_GETOPT_H @@ -1761,7 +1756,7 @@ AC_CHECK_SIZEOF([off_t]) # If we don't have largefile support, can't handle segment size >= 2GB. if test "$ac_cv_sizeof_off_t" -lt 8; then - if expr $RELSEG_SIZE '*' $blocksize '>=' 2 '*' 1024 '*' 1024; then + if expr $RELSEG_SIZE '*' $blocksize '>=' 2 '*' 1024 '*' 1024 >/dev/null; then AC_MSG_ERROR([Large file support is not enabled. Segment size cannot be larger than 1GB.]) fi fi @@ -1937,7 +1932,7 @@ fi if test "$with_icu" = yes; then ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$ICU_CFLAGS $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $ICU_CFLAGS" # Verify we have ICU's header files AC_CHECK_HEADER(unicode/ucol.h, [], @@ -1989,6 +1984,7 @@ AC_CHECK_SIZEOF([void *]) AC_CHECK_SIZEOF([size_t]) AC_CHECK_SIZEOF([long]) AC_CHECK_SIZEOF([long long]) +AC_CHECK_SIZEOF([intmax_t]) # Determine memory alignment requirements for the basic C data types. @@ -2037,7 +2033,7 @@ PGAC_HAVE_GCC__ATOMIC_INT32_CAS PGAC_HAVE_GCC__ATOMIC_INT64_CAS -# Check for x86 cpuid instruction +# Check for __get_cpuid() and __cpuid() AC_CACHE_CHECK([for __get_cpuid], [pgac_cv__get_cpuid], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2047,8 +2043,21 @@ AC_CACHE_CHECK([for __get_cpuid], [pgac_cv__get_cpuid], [pgac_cv__get_cpuid="no"])]) if test x"$pgac_cv__get_cpuid" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.]) +else + # __cpuid() + AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __cpuid(exx, 1); + ]])], + [pgac_cv__cpuid="yes"], + [pgac_cv__cpuid="no"])]) + if test x"$pgac_cv__cpuid" = x"yes"; then + AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) + fi fi +# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. 
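[Editorial aside, not part of the patch, before the second pair of checks: because the __cpuid probe now runs only when __get_cpuid is absent, at most one of HAVE__GET_CPUID and HAVE__CPUID ends up defined. A sketch of the consuming side, modeled on the runtime dispatch in src/port/pg_crc32c_sse42_choose.c; the function name here is illustrative only:]

#if defined(HAVE__GET_CPUID)
#include <cpuid.h>
#elif defined(HAVE__CPUID)
#include <intrin.h>
#endif

/* Illustrative: query CPUID leaf 1 and test the OSXSAVE bit (ECX bit 27). */
static int
osxsave_available_sketch(void)
{
	unsigned int exx[4] = {0, 0, 0, 0};

#if defined(HAVE__GET_CPUID)
	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);	/* GCC, Clang */
#elif defined(HAVE__CPUID)
	__cpuid(exx, 1);	/* MSVC */
#endif
	return (exx[2] & (1 << 27)) != 0;
}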
AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2058,28 +2067,18 @@ AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [pgac_cv__get_cpuid_count="no"])]) if test x"$pgac_cv__get_cpuid_count" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.]) -fi - -AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid], -[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [[unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuid(exx[0], 1); - ]])], - [pgac_cv__cpuid="yes"], - [pgac_cv__cpuid="no"])]) -if test x"$pgac_cv__cpuid" = x"yes"; then - AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) -fi - -AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], -[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [[unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuidex(exx[0], 7, 0); - ]])], - [pgac_cv__cpuidex="yes"], - [pgac_cv__cpuidex="no"])]) -if test x"$pgac_cv__cpuidex" = x"yes"; then - AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) +else + # __cpuidex() + AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) + if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) + fi fi # Check for XSAVE intrinsics @@ -2337,7 +2336,7 @@ Use --without-tcl to disable building PL/Tcl.]) fi # now that we have TCL_INCLUDE_SPEC, we can check for ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$TCL_INCLUDE_SPEC $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $TCL_INCLUDE_SPEC" AC_CHECK_HEADER(tcl.h, [], [AC_MSG_ERROR([header file is required for Tcl])]) CPPFLAGS=$ac_save_CPPFLAGS fi @@ -2374,7 +2373,7 @@ fi # check for if test "$with_python" = yes; then ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$python_includespec $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $python_includespec" AC_CHECK_HEADER(Python.h, [], [AC_MSG_ERROR([header file is required for Python])]) CPPFLAGS=$ac_save_CPPFLAGS fi @@ -2449,8 +2448,6 @@ AC_SUBST(LDFLAGS_EX_BE) if test x"$GCC" = x"yes" ; then cc_string=`${CC} --version | sed q` case $cc_string in [[A-Za-z]]*) ;; *) cc_string="GCC $cc_string";; esac -elif test x"$SUN_STUDIO_CC" = x"yes" ; then - cc_string=`${CC} -V 2>&1 | sed q` else cc_string=$CC fi diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out index c6f4b16c55615..6558f2c5a4ff4 100644 --- a/contrib/amcheck/expected/check_btree.out +++ b/contrib/amcheck/expected/check_btree.out @@ -60,6 +60,14 @@ SELECT bt_index_parent_check('bttest_a_brin_idx'); ERROR: expected "btree" index as targets for verification DETAIL: Relation "bttest_a_brin_idx" is a brin index. ROLLBACK; +-- verify partitioned indexes are rejected (error) +BEGIN; +CREATE TABLE bttest_partitioned (a int, b int) PARTITION BY list (a); +CREATE INDEX bttest_btree_partitioned_idx ON bttest_partitioned USING btree (b); +SELECT bt_index_parent_check('bttest_btree_partitioned_idx'); +ERROR: expected index as targets for verification +DETAIL: This operation is not supported for partitioned indexes. 
+ROLLBACK; -- normal check outside of xact SELECT bt_index_check('bttest_a_idx'); bt_index_check diff --git a/contrib/amcheck/expected/check_gin.out b/contrib/amcheck/expected/check_gin.out index b4f0b110747c3..8dd01ced8d15f 100644 --- a/contrib/amcheck/expected/check_gin.out +++ b/contrib/amcheck/expected/check_gin.out @@ -76,3 +76,15 @@ SELECT gin_index_check('gin_check_jsonb_idx'); -- cleanup DROP TABLE gin_check_jsonb; +-- Test GIN multicolumn index +CREATE TABLE "gin_check_multicolumn"(a text[], b text[]); +INSERT INTO gin_check_multicolumn (a,b) values ('{a,c,e}','{b,d,f}'); +CREATE INDEX "gin_check_multicolumn_idx" on gin_check_multicolumn USING GIN(a,b); +SELECT gin_index_check('gin_check_multicolumn_idx'); + gin_index_check +----------------- + +(1 row) + +-- cleanup +DROP TABLE gin_check_multicolumn; diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build index b33e8c9b062fe..1f0c347ed5413 100644 --- a/contrib/amcheck/meson.build +++ b/contrib/amcheck/meson.build @@ -49,6 +49,7 @@ tests += { 't/003_cic_2pc.pl', 't/004_verify_nbtree_unique.pl', 't/005_pitr.pl', + 't/006_verify_gin.pl', ], }, } diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql index 0793dbfeebd82..171f7f691ec60 100644 --- a/contrib/amcheck/sql/check_btree.sql +++ b/contrib/amcheck/sql/check_btree.sql @@ -52,6 +52,13 @@ CREATE INDEX bttest_a_brin_idx ON bttest_a USING brin(id); SELECT bt_index_parent_check('bttest_a_brin_idx'); ROLLBACK; +-- verify partitioned indexes are rejected (error) +BEGIN; +CREATE TABLE bttest_partitioned (a int, b int) PARTITION BY list (a); +CREATE INDEX bttest_btree_partitioned_idx ON bttest_partitioned USING btree (b); +SELECT bt_index_parent_check('bttest_btree_partitioned_idx'); +ROLLBACK; + -- normal check outside of xact SELECT bt_index_check('bttest_a_idx'); -- more expansive tests diff --git a/contrib/amcheck/sql/check_gin.sql b/contrib/amcheck/sql/check_gin.sql index 66f42c34311db..11caed3d6a81b 100644 --- a/contrib/amcheck/sql/check_gin.sql +++ b/contrib/amcheck/sql/check_gin.sql @@ -50,3 +50,13 @@ SELECT gin_index_check('gin_check_jsonb_idx'); -- cleanup DROP TABLE gin_check_jsonb; + +-- Test GIN multicolumn index +CREATE TABLE "gin_check_multicolumn"(a text[], b text[]); +INSERT INTO gin_check_multicolumn (a,b) values ('{a,c,e}','{b,d,f}'); +CREATE INDEX "gin_check_multicolumn_idx" on gin_check_multicolumn USING GIN(a,b); + +SELECT gin_index_check('gin_check_multicolumn_idx'); + +-- cleanup +DROP TABLE gin_check_multicolumn; diff --git a/contrib/amcheck/t/002_cic.pl b/contrib/amcheck/t/002_cic.pl index 6a0c4f611258f..f4a24936b2c01 100644 --- a/contrib/amcheck/t/002_cic.pl +++ b/contrib/amcheck/t/002_cic.pl @@ -64,5 +64,28 @@ ) }); +# Test bt_index_parent_check() with indexes created with +# CREATE INDEX CONCURRENTLY. 
+$node->safe_psql('postgres', q(CREATE TABLE quebec(i int primary key))); +# Insert two rows into index +$node->safe_psql('postgres', + q(INSERT INTO quebec SELECT i FROM generate_series(1, 2) s(i);)); + +# start background transaction +my $in_progress_h = $node->background_psql('postgres'); +$in_progress_h->query_safe(q(BEGIN; SELECT pg_current_xact_id();)); + +# delete one row from table, while background transaction is in progress +$node->safe_psql('postgres', q(DELETE FROM quebec WHERE i = 1;)); +# create index concurrently, which will skip the deleted row +$node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY oscar ON quebec(i);)); + +# check index using bt_index_parent_check +my $result = $node->psql('postgres', + q(SELECT bt_index_parent_check('oscar', heapallindexed => true))); +is($result, '0', 'bt_index_parent_check for CIC after removed row'); + +$in_progress_h->quit; + $node->stop; done_testing(); diff --git a/contrib/amcheck/t/004_verify_nbtree_unique.pl b/contrib/amcheck/t/004_verify_nbtree_unique.pl index 6be08e3f38f79..2cd23fc20ed23 100644 --- a/contrib/amcheck/t/004_verify_nbtree_unique.pl +++ b/contrib/amcheck/t/004_verify_nbtree_unique.pl @@ -159,7 +159,9 @@ 'postgres', q( SELECT bt_index_check('bttest_unique_idx1', true, true); )); -ok( $stderr =~ /index uniqueness is violated for index "bttest_unique_idx1"/, +like( + $stderr, + qr/index uniqueness is violated for index "bttest_unique_idx1"/, 'detected uniqueness violation for index "bttest_unique_idx1"'); # @@ -177,7 +179,9 @@ 'postgres', q( SELECT bt_index_check('bttest_unique_idx2', true, true); )); -ok( $stderr =~ /item order invariant violated for index "bttest_unique_idx2"/, +like( + $stderr, + qr/item order invariant violated for index "bttest_unique_idx2"/, 'detected item order invariant violation for index "bttest_unique_idx2"'); $node->safe_psql( @@ -191,7 +195,9 @@ 'postgres', q( SELECT bt_index_check('bttest_unique_idx2', true, true); )); -ok( $stderr =~ /index uniqueness is violated for index "bttest_unique_idx2"/, +like( + $stderr, + qr/index uniqueness is violated for index "bttest_unique_idx2"/, 'detected uniqueness violation for index "bttest_unique_idx2"'); # @@ -208,7 +214,9 @@ 'postgres', q( SELECT bt_index_check('bttest_unique_idx3', true, true); )); -ok( $stderr =~ /item order invariant violated for index "bttest_unique_idx3"/, +like( + $stderr, + qr/item order invariant violated for index "bttest_unique_idx3"/, 'detected item order invariant violation for index "bttest_unique_idx3"'); # For unique index deduplication is possible only for same values, but @@ -237,7 +245,9 @@ 'postgres', q( SELECT bt_index_check('bttest_unique_idx3', true, true); )); -ok( $stderr =~ /index uniqueness is violated for index "bttest_unique_idx3"/, +like( + $stderr, + qr/index uniqueness is violated for index "bttest_unique_idx3"/, 'detected uniqueness violation for index "bttest_unique_idx3"'); $node->stop; diff --git a/contrib/amcheck/t/006_verify_gin.pl b/contrib/amcheck/t/006_verify_gin.pl new file mode 100644 index 0000000000000..5be0bee32183f --- /dev/null +++ b/contrib/amcheck/t/006_verify_gin.pl @@ -0,0 +1,293 @@ + +# Copyright (c) 2021-2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; + +use Test::More; + +my $node; +my $blksize; + +# to get the split fast, we want tuples to be as large as possible, but at the same time we don't want them to be toasted.
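# [Editorial aside, not part of the patch: 1900 is presumably picked to sit
# just under the TOAST threshold. On the default 8 kB pages,
# TOAST_TUPLE_THRESHOLD is roughly BLCKSZ/4 (about 2 kB), so a
# random_string($filler_size) value stays inline, yet only about
# int(8192 / 1900) = 4 such entries fit on a page, which forces the
# entry-tree splits these tests rely on after just a few inserts.]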
+my $filler_size = 1900; + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('test'); +$node->init(no_data_checksums => 1); +$node->append_conf('postgresql.conf', 'autovacuum=off'); +$node->start; +$blksize = int($node->safe_psql('postgres', 'SHOW block_size;')); +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql( + 'postgres', q( + CREATE OR REPLACE FUNCTION random_string( INT ) RETURNS text AS $$ + SELECT string_agg(substring('0123456789abcdefghijklmnopqrstuvwxyz', ceil(random() * 36)::integer, 1), '') from generate_series(1, $1); + $$ LANGUAGE SQL;)); + +# Tests +invalid_entry_order_leaf_page_test(); +invalid_entry_order_inner_page_test(); +invalid_entry_columns_order_test(); +inconsistent_with_parent_key__parent_key_corrupted_test(); +inconsistent_with_parent_key__child_key_corrupted_test(); +inconsistent_with_parent_key__parent_key_corrupted_posting_tree_test(); + +sub invalid_entry_order_leaf_page_test +{ + my $relname = "test"; + my $indexname = "test_gin_idx"; + + $node->safe_psql( + 'postgres', qq( + DROP TABLE IF EXISTS $relname; + CREATE TABLE $relname (a text[]); + INSERT INTO $relname (a) VALUES ('{aaaaa,bbbbb}'); + CREATE INDEX $indexname ON $relname USING gin (a); + )); + my $relpath = relation_filepath($indexname); + + $node->stop; + + my $blkno = 1; # root + + # produce wrong order by replacing aaaaa with ccccc + string_replace_block($relpath, 'aaaaa', 'ccccc', $blkno); + + $node->start; + + my ($result, $stdout, $stderr) = + $node->psql('postgres', qq(SELECT gin_index_check('$indexname'))); + my $expected = + "index \"$indexname\" has wrong tuple order on entry tree page, block 1, offset 2, rightlink 4294967295"; + like($stderr, qr/$expected/); +} + +sub invalid_entry_order_inner_page_test +{ + my $relname = "test"; + my $indexname = "test_gin_idx"; + + # to break the order in the inner page we need at least 3 items (rightmost key in the inner level is not checked for the order) + # so fill table until we have 2 splits + $node->safe_psql( + 'postgres', qq( + DROP TABLE IF EXISTS $relname; + CREATE TABLE $relname (a text[]); + INSERT INTO $relname (a) VALUES (('{' || 'pppppppppp' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'qqqqqqqqqq' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'rrrrrrrrrr' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'ssssssssss' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'tttttttttt' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'uuuuuuuuuu' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'vvvvvvvvvv' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'wwwwwwwwww' || random_string($filler_size) ||'}')::text[]); + CREATE INDEX $indexname ON $relname USING gin (a); + )); + my $relpath = relation_filepath($indexname); + + $node->stop; + + my $blkno = 1; # root + + # we have rrrrrrrrr... and tttttttttt... as keys in the root, so produce wrong order by replacing rrrrrrrrrr.... 
+ string_replace_block($relpath, 'rrrrrrrrrr', 'zzzzzzzzzz', $blkno); + + $node->start; + + my ($result, $stdout, $stderr) = + $node->psql('postgres', qq(SELECT gin_index_check('$indexname'))); + my $expected = + "index \"$indexname\" has wrong tuple order on entry tree page, block 1, offset 2, rightlink 4294967295"; + like($stderr, qr/$expected/); +} + +sub invalid_entry_columns_order_test +{ + my $relname = "test"; + my $indexname = "test_gin_idx"; + + $node->safe_psql( + 'postgres', qq( + DROP TABLE IF EXISTS $relname; + CREATE TABLE $relname (a text[],b text[]); + INSERT INTO $relname (a,b) VALUES ('{aaa}','{bbb}'); + CREATE INDEX $indexname ON $relname USING gin (a,b); + )); + my $relpath = relation_filepath($indexname); + + $node->stop; + + my $blkno = 1; # root + + # mess column numbers + # root items order before: (1,aaa), (2,bbb) + # root items order after: (2,aaa), (1,bbb) + my $attrno_1 = pack('s', 1); + my $attrno_2 = pack('s', 2); + + my $find = qr/($attrno_1)(.)(aaa)/s; + my $replace = $attrno_2 . '$2$3'; + string_replace_block($relpath, $find, $replace, $blkno); + + $find = qr/($attrno_2)(.)(bbb)/s; + $replace = $attrno_1 . '$2$3'; + string_replace_block($relpath, $find, $replace, $blkno); + + $node->start; + + my ($result, $stdout, $stderr) = + $node->psql('postgres', qq(SELECT gin_index_check('$indexname'))); + my $expected = + "index \"$indexname\" has wrong tuple order on entry tree page, block 1, offset 2, rightlink 4294967295"; + like($stderr, qr/$expected/); +} + +sub inconsistent_with_parent_key__parent_key_corrupted_test +{ + my $relname = "test"; + my $indexname = "test_gin_idx"; + + # fill the table until we have a split + $node->safe_psql( + 'postgres', qq( + DROP TABLE IF EXISTS $relname; + CREATE TABLE $relname (a text[]); + INSERT INTO $relname (a) VALUES (('{' || 'llllllllll' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'mmmmmmmmmm' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'nnnnnnnnnn' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'xxxxxxxxxx' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'yyyyyyyyyy' || random_string($filler_size) ||'}')::text[]); + CREATE INDEX $indexname ON $relname USING gin (a); + )); + my $relpath = relation_filepath($indexname); + + $node->stop; + + my $blkno = 1; # root + + # we have nnnnnnnnnn... 
as parent key in the root, so replace it with something smaller than the child's keys + string_replace_block($relpath, 'nnnnnnnnnn', 'aaaaaaaaaa', $blkno); + + $node->start; + + my ($result, $stdout, $stderr) = + $node->psql('postgres', qq(SELECT gin_index_check('$indexname'))); + my $expected = + "index \"$indexname\" has inconsistent records on page 3 offset 3"; + like($stderr, qr/$expected/); +} + +sub inconsistent_with_parent_key__child_key_corrupted_test +{ + my $relname = "test"; + my $indexname = "test_gin_idx"; + + # fill the table until we have a split + $node->safe_psql( + 'postgres', qq( + DROP TABLE IF EXISTS $relname; + CREATE TABLE $relname (a text[]); + INSERT INTO $relname (a) VALUES (('{' || 'llllllllll' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'mmmmmmmmmm' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'nnnnnnnnnn' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'xxxxxxxxxx' || random_string($filler_size) ||'}')::text[]); + INSERT INTO $relname (a) VALUES (('{' || 'yyyyyyyyyy' || random_string($filler_size) ||'}')::text[]); + CREATE INDEX $indexname ON $relname USING gin (a); + )); + my $relpath = relation_filepath($indexname); + + $node->stop; + + my $blkno = 3; # leaf + + # we have nnnnnnnnnn... as parent key in the root, so replace child key with something bigger + string_replace_block($relpath, 'nnnnnnnnnn', 'pppppppppp', $blkno); + + $node->start; + + my ($result, $stdout, $stderr) = + $node->psql('postgres', qq(SELECT gin_index_check('$indexname'))); + my $expected = + "index \"$indexname\" has inconsistent records on page 3 offset 3"; + like($stderr, qr/$expected/); +} + +sub inconsistent_with_parent_key__parent_key_corrupted_posting_tree_test +{ + my $relname = "test"; + my $indexname = "test_gin_idx"; + + $node->safe_psql( + 'postgres', qq( + DROP TABLE IF EXISTS $relname; + CREATE TABLE $relname (a text[]); + INSERT INTO $relname (a) select ('{aaaaa}') from generate_series(1,10000); + CREATE INDEX $indexname ON $relname USING gin (a); + )); + my $relpath = relation_filepath($indexname); + + $node->stop; + + my $blkno = 2; # posting tree root + + # we have a posting tree for 'aaaaa' key with the root at 2nd block + # and two leaf pages 3 and 4. Replace the 4th page's high key with (1,1) + # so that there are TIDs in the leaf page that are larger than the new high key. + my $find = pack('S*', 0, 4, 0) . '....'; + my $replace = pack('S*', 0, 4, 0, 1, 1); + string_replace_block($relpath, $find, $replace, $blkno); + + $node->start; + + my ($result, $stdout, $stderr) = + $node->psql('postgres', qq(SELECT gin_index_check('$indexname'))); + my $expected = + "index \"$indexname\": tid exceeds parent's high key in postingTree leaf on block 4"; + like($stderr, qr/$expected/); +} + + +# Returns the filesystem path for the named relation.
+sub relation_filepath +{ + my ($relname) = @_; + + my $pgdata = $node->data_dir; + my $rel = $node->safe_psql('postgres', + qq(SELECT pg_relation_filepath('$relname'))); + die "path not found for relation $relname" unless defined $rel; + return "$pgdata/$rel"; +} + +# substitute pattern 'find' with 'replace' within the block with number 'blkno' in the file 'filename' +sub string_replace_block +{ + my ($filename, $find, $replace, $blkno) = @_; + + my $fh; + open($fh, '+<', $filename) or BAIL_OUT("open failed: $!"); + binmode $fh; + + my $offset = $blkno * $blksize; + my $buffer; + + sysseek($fh, $offset, 0) or BAIL_OUT("seek failed: $!"); + sysread($fh, $buffer, $blksize) or BAIL_OUT("read failed: $!"); + + $buffer =~ s/$find/'"' . $replace . '"'/gee; + + sysseek($fh, $offset, 0) or BAIL_OUT("seek failed: $!"); + syswrite($fh, $buffer) or BAIL_OUT("write failed: $!"); + + close($fh) or BAIL_OUT("close failed: $!"); + + return; +} + +done_testing(); diff --git a/contrib/amcheck/verify_common.c b/contrib/amcheck/verify_common.c index d095e62ce551f..a31ce06ed99a3 100644 --- a/contrib/amcheck/verify_common.c +++ b/contrib/amcheck/verify_common.c @@ -18,11 +18,13 @@ #include "verify_common.h" #include "catalog/index.h" #include "catalog/pg_am.h" +#include "commands/defrem.h" #include "commands/tablecmds.h" #include "utils/guc.h" #include "utils/syscache.h" static bool amcheck_index_mainfork_expected(Relation rel); +static bool index_checkable(Relation rel, Oid am_id); /* @@ -155,23 +157,21 @@ amcheck_lock_relation_and_check(Oid indrelid, * callable by non-superusers. If granted, it's useful to be able to check a * whole cluster. */ -bool +static bool index_checkable(Relation rel, Oid am_id) { - if (rel->rd_rel->relkind != RELKIND_INDEX || - rel->rd_rel->relam != am_id) - { - HeapTuple amtup; - HeapTuple amtuprel; + if (rel->rd_rel->relkind != RELKIND_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("expected index as targets for verification"), + errdetail_relkind_not_supported(rel->rd_rel->relkind))); - amtup = SearchSysCache1(AMOID, ObjectIdGetDatum(am_id)); - amtuprel = SearchSysCache1(AMOID, ObjectIdGetDatum(rel->rd_rel->relam)); + if (rel->rd_rel->relam != am_id) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("expected \"%s\" index as targets for verification", NameStr(((Form_pg_am) GETSTRUCT(amtup))->amname)), + errmsg("expected \"%s\" index as targets for verification", get_am_name(am_id)), errdetail("Relation \"%s\" is a %s index.", - RelationGetRelationName(rel), NameStr(((Form_pg_am) GETSTRUCT(amtuprel))->amname)))); - } + RelationGetRelationName(rel), get_am_name(rel->rd_rel->relam)))); if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, @@ -182,7 +182,7 @@ index_checkable(Relation rel, Oid am_id) if (!rel->rd_index->indisvalid) ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot check index \"%s\"", RelationGetRelationName(rel)), errdetail("Index is not valid."))); diff --git a/contrib/amcheck/verify_common.h b/contrib/amcheck/verify_common.h index e78adb68808f0..3fa63d2121ab8 100644 --- a/contrib/amcheck/verify_common.h +++ b/contrib/amcheck/verify_common.h @@ -1,12 +1,12 @@ /*------------------------------------------------------------------------- * - * amcheck.h + * verify_common.h * Shared routines for amcheck verifications. 
* * Copyright (c) 2016-2025, PostgreSQL Global Development Group * * IDENTIFICATION - * contrib/amcheck/amcheck.h + * contrib/amcheck/verify_common.h * *------------------------------------------------------------------------- */ @@ -16,8 +16,7 @@ #include "utils/relcache.h" #include "miscadmin.h" -/* Typedefs for callback functions for amcheck_lock_relation_and_check */ -typedef void (*IndexCheckableCallback) (Relation index); +/* Typedef for callback function for amcheck_lock_relation_and_check */ typedef void (*IndexDoCheckCallback) (Relation rel, Relation heaprel, void *state, @@ -27,5 +26,3 @@ extern void amcheck_lock_relation_and_check(Oid indrelid, Oid am_id, IndexDoCheckCallback check, LOCKMODE lockmode, void *state); - -extern bool index_checkable(Relation rel, Oid am_id); diff --git a/contrib/amcheck/verify_gin.c b/contrib/amcheck/verify_gin.c index b5f363562e32a..253da4b1f0bdf 100644 --- a/contrib/amcheck/verify_gin.c +++ b/contrib/amcheck/verify_gin.c @@ -38,7 +38,6 @@ typedef struct GinScanItem int depth; IndexTuple parenttup; BlockNumber parentblk; - XLogRecPtr parentlsn; BlockNumber blkno; struct GinScanItem *next; } GinScanItem; @@ -108,7 +107,7 @@ ginReadTupleWithoutState(IndexTuple itup, int *nitems) { if (nipd > 0) { - ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); + ipd = ginPostingListDecode(ptr, &ndecoded); if (nipd != ndecoded) elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", nipd, ndecoded); @@ -118,7 +117,7 @@ ginReadTupleWithoutState(IndexTuple itup, int *nitems) } else { - ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); + ipd = palloc_array(ItemPointerData, nipd); memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); } *nitems = nipd; @@ -153,7 +152,7 @@ gin_check_posting_tree_parent_keys_consistency(Relation rel, BlockNumber posting leafdepth = -1; /* Start the scan at the root page */ - stack = (GinPostingTreeScanItem *) palloc0(sizeof(GinPostingTreeScanItem)); + stack = palloc0_object(GinPostingTreeScanItem); stack->depth = 0; ItemPointerSetInvalid(&stack->parentkey); stack->parentblk = InvalidBlockNumber; @@ -175,7 +174,7 @@ gin_check_posting_tree_parent_keys_consistency(Relation rel, BlockNumber posting buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, RBM_NORMAL, strategy); LockBuffer(buffer, GIN_SHARE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); Assert(GinPageIsData(page)); @@ -346,7 +345,7 @@ gin_check_posting_tree_parent_keys_consistency(Relation rel, BlockNumber posting * Check if this tuple is consistent with the downlink in the * parent. */ - if (stack->parentblk != InvalidBlockNumber && i == maxoff && + if (i == maxoff && ItemPointerIsValid(&stack->parentkey) && ItemPointerCompare(&stack->parentkey, &posting_item->key) < 0) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -355,18 +354,14 @@ gin_check_posting_tree_parent_keys_consistency(Relation rel, BlockNumber posting stack->blkno, i))); /* This is an internal page, recurse into the child. */ - ptr = (GinPostingTreeScanItem *) palloc(sizeof(GinPostingTreeScanItem)); + ptr = palloc_object(GinPostingTreeScanItem); ptr->depth = stack->depth + 1; /* - * Set rightmost parent key to invalid item pointer. Its value - * is 'Infinity' and not explicitly stored. + * The rightmost parent key is always invalid item pointer. + * Its value is 'Infinity' and not explicitly stored. 
*/ - if (rightlink == InvalidBlockNumber) - ItemPointerSetInvalid(&ptr->parentkey); - else - ptr->parentkey = posting_item->key; - + ptr->parentkey = posting_item->key; ptr->parentblk = stack->blkno; ptr->blkno = BlockIdGetBlockNumber(&posting_item->child_blkno); ptr->next = stack->next; @@ -417,11 +412,10 @@ gin_check_parent_keys_consistency(Relation rel, leafdepth = -1; /* Start the scan at the root page */ - stack = (GinScanItem *) palloc0(sizeof(GinScanItem)); + stack = palloc0_object(GinScanItem); stack->depth = 0; stack->parenttup = NULL; stack->parentblk = InvalidBlockNumber; - stack->parentlsn = InvalidXLogRecPtr; stack->blkno = GIN_ROOT_BLKNO; while (stack) @@ -432,7 +426,6 @@ gin_check_parent_keys_consistency(Relation rel, OffsetNumber i, maxoff, prev_attnum; - XLogRecPtr lsn; IndexTuple prev_tuple; BlockNumber rightlink; @@ -441,8 +434,7 @@ gin_check_parent_keys_consistency(Relation rel, buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, RBM_NORMAL, strategy); LockBuffer(buffer, GIN_SHARE); - page = (Page) BufferGetPage(buffer); - lsn = BufferGetLSNAtomic(buffer); + page = BufferGetPage(buffer); maxoff = PageGetMaxOffsetNumber(page); rightlink = GinPageGetOpaque(page)->rightlink; @@ -463,28 +455,28 @@ gin_check_parent_keys_consistency(Relation rel, Datum parent_key = gintuple_get_key(&state, stack->parenttup, &parent_key_category); + OffsetNumber parent_key_attnum = gintuple_get_attrnum(&state, stack->parenttup); ItemId iid = PageGetItemIdCareful(rel, stack->blkno, page, maxoff); IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); - OffsetNumber attnum = gintuple_get_attrnum(&state, idxtuple); + OffsetNumber page_max_key_attnum = gintuple_get_attrnum(&state, idxtuple); GinNullCategory page_max_key_category; Datum page_max_key = gintuple_get_key(&state, idxtuple, &page_max_key_category); if (rightlink != InvalidBlockNumber && - ginCompareEntries(&state, attnum, page_max_key, - page_max_key_category, parent_key, - parent_key_category) > 0) + ginCompareAttEntries(&state, page_max_key_attnum, page_max_key, + page_max_key_category, parent_key_attnum, + parent_key, parent_key_category) < 0) { /* split page detected, install right link to the stack */ GinScanItem *ptr; elog(DEBUG3, "split detected for blk: %u, parent blk: %u", stack->blkno, stack->parentblk); - ptr = (GinScanItem *) palloc(sizeof(GinScanItem)); + ptr = palloc_object(GinScanItem); ptr->depth = stack->depth; ptr->parenttup = CopyIndexTuple(stack->parenttup); ptr->parentblk = stack->parentblk; - ptr->parentlsn = stack->parentlsn; ptr->blkno = rightlink; ptr->next = stack->next; stack->next = ptr; @@ -513,9 +505,7 @@ gin_check_parent_keys_consistency(Relation rel, { ItemId iid = PageGetItemIdCareful(rel, stack->blkno, page, i); IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); - OffsetNumber attnum = gintuple_get_attrnum(&state, idxtuple); - GinNullCategory prev_key_category; - Datum prev_key; + OffsetNumber current_attnum = gintuple_get_attrnum(&state, idxtuple); GinNullCategory current_key_category; Datum current_key; @@ -528,20 +518,24 @@ gin_check_parent_keys_consistency(Relation rel, current_key = gintuple_get_key(&state, idxtuple, ¤t_key_category); /* - * First block is metadata, skip order check. Also, never check - * for high key on rightmost page, as this key is not really - * stored explicitly. + * Compare the entry to the preceding one. + * + * Don't check for high key on the rightmost inner page, as this + * key is not really stored explicitly. 
* - * Also make sure to not compare entries for different attnums, - * which may be stored on the same page. + * The entries may be for different attributes, so make sure to + * use ginCompareAttEntries for comparison. */ - if (i != FirstOffsetNumber && attnum == prev_attnum && stack->blkno != GIN_ROOT_BLKNO && - !(i == maxoff && rightlink == InvalidBlockNumber)) + if ((i != FirstOffsetNumber) && + !(i == maxoff && rightlink == InvalidBlockNumber && !GinPageIsLeaf(page))) { + Datum prev_key; + GinNullCategory prev_key_category; + prev_key = gintuple_get_key(&state, prev_tuple, &prev_key_category); - if (ginCompareEntries(&state, attnum, prev_key, - prev_key_category, current_key, - current_key_category) >= 0) + if (ginCompareAttEntries(&state, prev_attnum, prev_key, + prev_key_category, current_attnum, + current_key, current_key_category) >= 0) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" has wrong tuple order on entry tree page, block %u, offset %u, rightlink %u", @@ -556,13 +550,14 @@ gin_check_parent_keys_consistency(Relation rel, i == maxoff) { GinNullCategory parent_key_category; + OffsetNumber parent_key_attnum = gintuple_get_attrnum(&state, stack->parenttup); Datum parent_key = gintuple_get_key(&state, stack->parenttup, &parent_key_category); - if (ginCompareEntries(&state, attnum, current_key, - current_key_category, parent_key, - parent_key_category) > 0) + if (ginCompareAttEntries(&state, current_attnum, current_key, + current_key_category, parent_key_attnum, + parent_key, parent_key_category) > 0) { /* * There was a discrepancy between parent and child @@ -581,6 +576,7 @@ gin_check_parent_keys_consistency(Relation rel, stack->blkno, stack->parentblk); else { + parent_key_attnum = gintuple_get_attrnum(&state, stack->parenttup); parent_key = gintuple_get_key(&state, stack->parenttup, &parent_key_category); @@ -589,9 +585,9 @@ gin_check_parent_keys_consistency(Relation rel, * Check if it is properly adjusted. If succeed, * proceed to the next key. 
*/ - if (ginCompareEntries(&state, attnum, current_key, - current_key_category, parent_key, - parent_key_category) > 0) + if (ginCompareAttEntries(&state, current_attnum, current_key, + current_key_category, parent_key_attnum, + parent_key, parent_key_category) > 0) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" has inconsistent records on page %u offset %u", @@ -605,16 +601,15 @@ gin_check_parent_keys_consistency(Relation rel, { GinScanItem *ptr; - ptr = (GinScanItem *) palloc(sizeof(GinScanItem)); + ptr = palloc_object(GinScanItem); ptr->depth = stack->depth + 1; /* last tuple in layer has no high key */ - if (i != maxoff && !GinPageGetOpaque(page)->rightlink) - ptr->parenttup = CopyIndexTuple(idxtuple); - else + if (i == maxoff && rightlink == InvalidBlockNumber) ptr->parenttup = NULL; + else + ptr->parenttup = CopyIndexTuple(idxtuple); ptr->parentblk = stack->blkno; ptr->blkno = GinGetDownlink(idxtuple); - ptr->parentlsn = lsn; ptr->next = stack->next; stack->next = ptr; } @@ -644,7 +639,7 @@ gin_check_parent_keys_consistency(Relation rel, } prev_tuple = CopyIndexTuple(idxtuple); - prev_attnum = attnum; + prev_attnum = current_attnum; } LockBuffer(buffer, GIN_UNLOCK); @@ -749,7 +744,7 @@ gin_refind_parent(Relation rel, BlockNumber parentblkno, ItemId p_iid = PageGetItemIdCareful(rel, parentblkno, parentpage, o); IndexTuple itup = (IndexTuple) PageGetItem(parentpage, p_iid); - if (ItemPointerGetBlockNumber(&(itup->t_tid)) == childblkno) + if (GinGetDownlink(itup) == childblkno) { /* Found it! Make copy and return it */ result = CopyIndexTuple(itup); diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c index aa9cccd1da4fe..130b35334639f 100644 --- a/contrib/amcheck/verify_heapam.c +++ b/contrib/amcheck/verify_heapam.c @@ -526,17 +526,17 @@ verify_heapam(PG_FUNCTION_ARGS) if (rdoffnum < FirstOffsetNumber) { report_corruption(&ctx, - psprintf("line pointer redirection to item at offset %u precedes minimum offset %u", - (unsigned) rdoffnum, - (unsigned) FirstOffsetNumber)); + psprintf("line pointer redirection to item at offset %d precedes minimum offset %d", + rdoffnum, + FirstOffsetNumber)); continue; } if (rdoffnum > maxoff) { report_corruption(&ctx, - psprintf("line pointer redirection to item at offset %u exceeds maximum offset %u", - (unsigned) rdoffnum, - (unsigned) maxoff)); + psprintf("line pointer redirection to item at offset %d exceeds maximum offset %d", + rdoffnum, + maxoff)); continue; } @@ -550,22 +550,22 @@ verify_heapam(PG_FUNCTION_ARGS) if (!ItemIdIsUsed(rditem)) { report_corruption(&ctx, - psprintf("redirected line pointer points to an unused item at offset %u", - (unsigned) rdoffnum)); + psprintf("redirected line pointer points to an unused item at offset %d", + rdoffnum)); continue; } else if (ItemIdIsDead(rditem)) { report_corruption(&ctx, - psprintf("redirected line pointer points to a dead item at offset %u", - (unsigned) rdoffnum)); + psprintf("redirected line pointer points to a dead item at offset %d", + rdoffnum)); continue; } else if (ItemIdIsRedirected(rditem)) { report_corruption(&ctx, - psprintf("redirected line pointer points to another redirected line pointer at offset %u", - (unsigned) rdoffnum)); + psprintf("redirected line pointer points to another redirected line pointer at offset %d", + rdoffnum)); continue; } @@ -601,10 +601,10 @@ verify_heapam(PG_FUNCTION_ARGS) if (ctx.lp_off + ctx.lp_len > BLCKSZ) { report_corruption(&ctx, - psprintf("line pointer to page offset %u with length %u ends beyond maximum 
page offset %u", + psprintf("line pointer to page offset %u with length %u ends beyond maximum page offset %d", ctx.lp_off, ctx.lp_len, - (unsigned) BLCKSZ)); + BLCKSZ)); continue; } @@ -678,16 +678,16 @@ verify_heapam(PG_FUNCTION_ARGS) if (!HeapTupleHeaderIsHeapOnly(next_htup)) { report_corruption(&ctx, - psprintf("redirected line pointer points to a non-heap-only tuple at offset %u", - (unsigned) nextoffnum)); + psprintf("redirected line pointer points to a non-heap-only tuple at offset %d", + nextoffnum)); } /* HOT chains should not intersect. */ if (predecessor[nextoffnum] != InvalidOffsetNumber) { report_corruption(&ctx, - psprintf("redirect line pointer points to offset %u, but offset %u also points there", - (unsigned) nextoffnum, (unsigned) predecessor[nextoffnum])); + psprintf("redirect line pointer points to offset %d, but offset %d also points there", + nextoffnum, predecessor[nextoffnum])); continue; } @@ -719,8 +719,8 @@ verify_heapam(PG_FUNCTION_ARGS) if (predecessor[nextoffnum] != InvalidOffsetNumber) { report_corruption(&ctx, - psprintf("tuple points to new version at offset %u, but offset %u also points there", - (unsigned) nextoffnum, (unsigned) predecessor[nextoffnum])); + psprintf("tuple points to new version at offset %d, but offset %d also points there", + nextoffnum, predecessor[nextoffnum])); continue; } @@ -743,15 +743,15 @@ verify_heapam(PG_FUNCTION_ARGS) HeapTupleHeaderIsHeapOnly(next_htup)) { report_corruption(&ctx, - psprintf("non-heap-only update produced a heap-only tuple at offset %u", - (unsigned) nextoffnum)); + psprintf("non-heap-only update produced a heap-only tuple at offset %d", + nextoffnum)); } if ((curr_htup->t_infomask2 & HEAP_HOT_UPDATED) && !HeapTupleHeaderIsHeapOnly(next_htup)) { report_corruption(&ctx, - psprintf("heap-only update produced a non-heap only tuple at offset %u", - (unsigned) nextoffnum)); + psprintf("heap-only update produced a non-heap only tuple at offset %d", + nextoffnum)); } /* @@ -772,10 +772,10 @@ verify_heapam(PG_FUNCTION_ARGS) TransactionIdIsInProgress(curr_xmin)) { report_corruption(&ctx, - psprintf("tuple with in-progress xmin %u was updated to produce a tuple at offset %u with committed xmin %u", - (unsigned) curr_xmin, - (unsigned) ctx.offnum, - (unsigned) next_xmin)); + psprintf("tuple with in-progress xmin %u was updated to produce a tuple at offset %d with committed xmin %u", + curr_xmin, + ctx.offnum, + next_xmin)); } /* @@ -788,16 +788,16 @@ verify_heapam(PG_FUNCTION_ARGS) { if (xmin_commit_status[nextoffnum] == XID_IN_PROGRESS) report_corruption(&ctx, - psprintf("tuple with aborted xmin %u was updated to produce a tuple at offset %u with in-progress xmin %u", - (unsigned) curr_xmin, - (unsigned) ctx.offnum, - (unsigned) next_xmin)); + psprintf("tuple with aborted xmin %u was updated to produce a tuple at offset %d with in-progress xmin %u", + curr_xmin, + ctx.offnum, + next_xmin)); else if (xmin_commit_status[nextoffnum] == XID_COMMITTED) report_corruption(&ctx, - psprintf("tuple with aborted xmin %u was updated to produce a tuple at offset %u with committed xmin %u", - (unsigned) curr_xmin, - (unsigned) ctx.offnum, - (unsigned) next_xmin)); + psprintf("tuple with aborted xmin %u was updated to produce a tuple at offset %d with committed xmin %u", + curr_xmin, + ctx.offnum, + next_xmin)); } } @@ -1838,7 +1838,7 @@ check_tuple_attribute(HeapCheckContext *ctx) { ToastedAttribute *ta; - ta = (ToastedAttribute *) palloc0(sizeof(ToastedAttribute)); + ta = palloc0_object(ToastedAttribute); 
VARATT_EXTERNAL_GET_POINTER(ta->toast_pointer, attr); ta->blkno = ctx->blkno; @@ -1942,7 +1942,7 @@ check_tuple(HeapCheckContext *ctx, bool *xmin_commit_status_ok, if (RelationGetDescr(ctx->rel)->natts < ctx->natts) { report_corruption(ctx, - psprintf("number of attributes %u exceeds maximum expected for table %u", + psprintf("number of attributes %u exceeds maximum %u expected for table", ctx->natts, RelationGetDescr(ctx->rel)->natts)); return; diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index f11c43a0ed797..f91392a3a4977 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -382,7 +382,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, BTMetaPageData *metad; uint32 previouslevel; BtreeLevel current; - Snapshot snapshot = SnapshotAny; if (!readonly) elog(DEBUG1, "verifying consistency of tree structure for index \"%s\"", @@ -400,7 +399,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Initialize state for entire verification operation */ - state = palloc0(sizeof(BtreeCheckState)); + state = palloc0_object(BtreeCheckState); state->rel = rel; state->heaprel = heaprel; state->heapkeyspace = heapkeyspace; @@ -433,54 +432,46 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, state->heaptuplespresent = 0; /* - * Register our own snapshot in !readonly case, rather than asking + * Register our own snapshot for heapallindexed, rather than asking * table_index_build_scan() to do this for us later. This needs to * happen before index fingerprinting begins, so we can later be * certain that index fingerprinting should have reached all tuples * returned by table_index_build_scan(). */ - if (!state->readonly) - { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + state->snapshot = RegisterSnapshot(GetTransactionSnapshot()); - /* - * GetTransactionSnapshot() always acquires a new MVCC snapshot in - * READ COMMITTED mode. A new snapshot is guaranteed to have all - * the entries it requires in the index. - * - * We must defend against the possibility that an old xact - * snapshot was returned at higher isolation levels when that - * snapshot is not safe for index scans of the target index. This - * is possible when the snapshot sees tuples that are before the - * index's indcheckxmin horizon. Throwing an error here should be - * very rare. It doesn't seem worth using a secondary snapshot to - * avoid this. - */ - if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin && - !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data), - snapshot->xmin)) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("index \"%s\" cannot be verified using transaction snapshot", - RelationGetRelationName(rel)))); - } + /* + * GetTransactionSnapshot() always acquires a new MVCC snapshot in + * READ COMMITTED mode. A new snapshot is guaranteed to have all the + * entries it requires in the index. + * + * We must defend against the possibility that an old xact snapshot + * was returned at higher isolation levels when that snapshot is not + * safe for index scans of the target index. This is possible when + * the snapshot sees tuples that are before the index's indcheckxmin + * horizon. Throwing an error here should be very rare. It doesn't + * seem worth using a secondary snapshot to avoid this. 
+ */ + if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data), + state->snapshot->xmin)) + ereport(ERROR, + errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("index \"%s\" cannot be verified using transaction snapshot", + RelationGetRelationName(rel))); } /* - * We need a snapshot to check the uniqueness of the index. For better - * performance take it once per index check. If snapshot already taken - * reuse it. + * We need a snapshot to check the uniqueness of the index. For better + * performance, take it once per index check. If one was already taken + * above, use that. */ if (state->checkunique) { state->indexinfo = BuildIndexInfo(state->rel); - if (state->indexinfo->ii_Unique) - { - if (snapshot != SnapshotAny) - state->snapshot = snapshot; - else - state->snapshot = RegisterSnapshot(GetTransactionSnapshot()); - } + + if (state->indexinfo->ii_Unique && state->snapshot == InvalidSnapshot) + state->snapshot = RegisterSnapshot(GetTransactionSnapshot()); } Assert(!state->rootdescend || state->readonly); @@ -555,13 +546,12 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Create our own scan for table_index_build_scan(), rather than * getting it to do so for us. This is required so that we can - * actually use the MVCC snapshot registered earlier in !readonly - * case. + * actually use the MVCC snapshot registered earlier. * * Note that table_index_build_scan() calls heap_endscan() for us. */ scan = table_beginscan_strat(state->heaprel, /* relation */ - snapshot, /* snapshot */ + state->snapshot, /* snapshot */ 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ @@ -569,16 +559,15 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY - * behaves in !readonly case. + * behaves. * * It's okay that we don't actually use the same lock strength for the - * heap relation as any other ii_Concurrent caller would in !readonly - * case. We have no reason to care about a concurrent VACUUM - * operation, since there isn't going to be a second scan of the heap - * that needs to be sure that there was no concurrent recycling of - * TIDs. + * heap relation as any other ii_Concurrent caller would. We have no + * reason to care about a concurrent VACUUM operation, since there + * isn't going to be a second scan of the heap that needs to be sure + * that there was no concurrent recycling of TIDs. 
*/ - indexinfo->ii_Concurrent = !state->readonly; + indexinfo->ii_Concurrent = true; /* * Don't wait for uncommitted tuple xact commit/abort when index is a @@ -602,14 +591,11 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, state->heaptuplespresent, RelationGetRelationName(heaprel), 100.0 * bloom_prop_bits_set(state->filter)))); - if (snapshot != SnapshotAny) - UnregisterSnapshot(snapshot); - bloom_free(state->filter); } /* Be tidy: */ - if (snapshot == SnapshotAny && state->snapshot != InvalidSnapshot) + if (state->snapshot != InvalidSnapshot) UnregisterSnapshot(state->snapshot); MemoryContextDelete(state->targetcontext); } @@ -721,7 +707,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) errmsg("block %u is not leftmost in index \"%s\"", current, RelationGetRelationName(state->rel)))); - if (level.istruerootlevel && !P_ISROOT(opaque)) + if (level.istruerootlevel && (!P_ISROOT(opaque) && !P_INCOMPLETE_SPLIT(opaque))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block %u is not true root in index \"%s\"", @@ -913,7 +899,7 @@ bt_report_duplicate(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index uniqueness is violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail("Index %s%s and%s%s (point to heap %s and %s) page lsn=%X/%X.", + errdetail("Index %s%s and%s%s (point to heap %s and %s) page lsn=%X/%08X.", itid, pposting, nitid, pnposting, htid, nhtid, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -1058,7 +1044,7 @@ bt_leftmost_ignoring_half_dead(BtreeCheckState *state, (errcode(ERRCODE_NO_DATA), errmsg_internal("harmless interrupted page deletion detected in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u right block=%u page lsn=%X/%X.", + errdetail_internal("Block=%u right block=%u page lsn=%X/%08X.", reached, reached_from, LSN_FORMAT_ARGS(pagelsn)))); @@ -1283,7 +1269,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("wrong number of high key index tuple attributes in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.", + errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%08X.", state->targetblock, BTreeTupleGetNAtts(itup, state->rel), P_ISLEAF(topaque) ? "heap" : "index", @@ -1332,7 +1318,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index tuple size does not equal lp_len in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.", + errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%08X.", state->targetblock, offset, tupsize, ItemIdGetLength(itemid), LSN_FORMAT_ARGS(state->targetlsn)), @@ -1356,7 +1342,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("wrong number of index tuple attributes in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%08X.", itid, BTreeTupleGetNAtts(itup, state->rel), P_ISLEAF(topaque) ? 
"heap" : "index", @@ -1406,7 +1392,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("could not find tuple using search from root page in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%08X.", itid, htid, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -1435,7 +1421,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("posting list contains misplaced TID in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.", + errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%08X.", itid, i, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -1488,7 +1474,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index row size %zu exceeds maximum for index \"%s\"", tupsize, RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%08X.", itid, P_ISLEAF(topaque) ? "heap" : "index", htid, @@ -1595,7 +1581,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("high key invariant violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%08X.", itid, P_ISLEAF(topaque) ? "heap" : "index", htid, @@ -1641,9 +1627,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("item order invariant violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Lower index tid=%s (points to %s tid=%s) " - "higher index tid=%s (points to %s tid=%s) " - "page lsn=%X/%X.", + errdetail_internal("Lower index tid=%s (points to %s tid=%s) higher index tid=%s (points to %s tid=%s) page lsn=%X/%08X.", itid, P_ISLEAF(topaque) ? 
"heap" : "index", htid, @@ -1760,7 +1744,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("cross page item order invariant violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.", + errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%08X.", state->targetblock, offset, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -1813,7 +1797,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("right block of leaf block is non-leaf for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u page lsn=%X/%X.", + errdetail_internal("Block=%u page lsn=%X/%08X.", state->targetblock, LSN_FORMAT_ARGS(state->targetlsn)))); @@ -2237,7 +2221,7 @@ bt_child_highkey_check(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.", state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); @@ -2270,7 +2254,7 @@ bt_child_highkey_check(BtreeCheckState *state, * If we visit page with high key, check that it is equal to the * target key next to corresponding downlink. */ - if (!rightsplit && !P_RIGHTMOST(opaque)) + if (!rightsplit && !P_RIGHTMOST(opaque) && !P_ISHALFDEAD(opaque)) { BTPageOpaque topaque; IndexTuple highkey; @@ -2323,7 +2307,7 @@ bt_child_highkey_check(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.", state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); pivotkey_offset = P_HIKEY; @@ -2353,7 +2337,7 @@ bt_child_highkey_check(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("can't find left sibling high key in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.", state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); itup = state->lowkey; @@ -2365,7 +2349,7 @@ bt_child_highkey_check(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("mismatch between parent key and child high key in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.", state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -2505,7 +2489,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("downlink to deleted page found in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.", + errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%08X.", state->targetblock, childblock, LSN_FORMAT_ARGS(state->targetlsn)))); @@ -2546,7 +2530,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("down-link 
lower bound invariant violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.", + errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%08X.", state->targetblock, childblock, offset, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -2616,7 +2600,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, (errcode(ERRCODE_NO_DATA), errmsg_internal("harmless interrupted page split detected in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", + errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%08X.", blkno, opaque->btpo_level, opaque->btpo_prev, LSN_FORMAT_ARGS(pagelsn)))); @@ -2638,7 +2622,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("leaf index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u page lsn=%X/%X.", + errdetail_internal("Block=%u page lsn=%X/%08X.", blkno, LSN_FORMAT_ARGS(pagelsn)))); @@ -2704,7 +2688,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("downlink to deleted leaf page found in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.", + errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%08X.", blkno, childblk, LSN_FORMAT_ARGS(pagelsn)))); @@ -2730,7 +2714,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("internal index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u level=%u page lsn=%X/%X.", + errdetail_internal("Block=%u level=%u page lsn=%X/%08X.", blkno, opaque->btpo_level, LSN_FORMAT_ARGS(pagelsn)))); } diff --git a/contrib/auto_explain/Makefile b/contrib/auto_explain/Makefile index efd127d3cae64..94ab28e7c06b9 100644 --- a/contrib/auto_explain/Makefile +++ b/contrib/auto_explain/Makefile @@ -6,6 +6,8 @@ OBJS = \ auto_explain.o PGFILEDESC = "auto_explain - logging facility for execution plans" +REGRESS = alter_reset + TAP_TESTS = 1 ifdef USE_PGXS diff --git a/contrib/auto_explain/expected/alter_reset.out b/contrib/auto_explain/expected/alter_reset.out new file mode 100644 index 0000000000000..ec355189806ae --- /dev/null +++ b/contrib/auto_explain/expected/alter_reset.out @@ -0,0 +1,19 @@ +-- +-- This tests resetting unknown custom GUCs with reserved prefixes. There's +-- nothing specific to auto_explain; this is just a convenient place to put +-- this test. +-- +SELECT current_database() AS datname \gset +CREATE ROLE regress_ae_role; +ALTER DATABASE :"datname" SET auto_explain.bogus = 1; +ALTER ROLE regress_ae_role SET auto_explain.bogus = 1; +ALTER ROLE regress_ae_role IN DATABASE :"datname" SET auto_explain.bogus = 1; +ALTER SYSTEM SET auto_explain.bogus = 1; +LOAD 'auto_explain'; +WARNING: invalid configuration parameter name "auto_explain.bogus", removing it +DETAIL: "auto_explain" is now a reserved prefix. 
+ALTER DATABASE :"datname" RESET auto_explain.bogus; +ALTER ROLE regress_ae_role RESET auto_explain.bogus; +ALTER ROLE regress_ae_role IN DATABASE :"datname" RESET auto_explain.bogus; +ALTER SYSTEM RESET auto_explain.bogus; +DROP ROLE regress_ae_role; diff --git a/contrib/auto_explain/meson.build b/contrib/auto_explain/meson.build index 92dc9df6f7cac..a9b45cc235f12 100644 --- a/contrib/auto_explain/meson.build +++ b/contrib/auto_explain/meson.build @@ -20,6 +20,11 @@ tests += { 'name': 'auto_explain', 'sd': meson.current_source_dir(), 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'alter_reset', + ], + }, 'tap': { 'tests': [ 't/001_auto_explain.pl', diff --git a/contrib/auto_explain/sql/alter_reset.sql b/contrib/auto_explain/sql/alter_reset.sql new file mode 100644 index 0000000000000..bf621454ec24a --- /dev/null +++ b/contrib/auto_explain/sql/alter_reset.sql @@ -0,0 +1,22 @@ +-- +-- This tests resetting unknown custom GUCs with reserved prefixes. There's +-- nothing specific to auto_explain; this is just a convenient place to put +-- this test. +-- + +SELECT current_database() AS datname \gset +CREATE ROLE regress_ae_role; + +ALTER DATABASE :"datname" SET auto_explain.bogus = 1; +ALTER ROLE regress_ae_role SET auto_explain.bogus = 1; +ALTER ROLE regress_ae_role IN DATABASE :"datname" SET auto_explain.bogus = 1; +ALTER SYSTEM SET auto_explain.bogus = 1; + +LOAD 'auto_explain'; + +ALTER DATABASE :"datname" RESET auto_explain.bogus; +ALTER ROLE regress_ae_role RESET auto_explain.bogus; +ALTER ROLE regress_ae_role IN DATABASE :"datname" RESET auto_explain.bogus; +ALTER SYSTEM RESET auto_explain.bogus; + +DROP ROLE regress_ae_role; diff --git a/contrib/basebackup_to_shell/basebackup_to_shell.c b/contrib/basebackup_to_shell/basebackup_to_shell.c index 8720f5a43727d..345d3ed895d46 100644 --- a/contrib/basebackup_to_shell/basebackup_to_shell.c +++ b/contrib/basebackup_to_shell/basebackup_to_shell.c @@ -136,7 +136,7 @@ shell_get_sink(bbsink *next_sink, void *detail_arg) * We remember the current value of basebackup_to_shell.shell_command to * be certain that it can't change under us during the backup. */ - sink = palloc0(sizeof(bbsink_shell)); + sink = palloc0_object(bbsink_shell); *((const bbsink_ops **) &sink->base.bbs_ops) = &bbsink_shell_ops; sink->base.bbs_next = next_sink; sink->target_detail = detail_arg; diff --git a/contrib/basebackup_to_shell/meson.build b/contrib/basebackup_to_shell/meson.build index 8c88242456e80..8a4f170c5f829 100644 --- a/contrib/basebackup_to_shell/meson.build +++ b/contrib/basebackup_to_shell/meson.build @@ -24,7 +24,7 @@ tests += { 'tests': [ 't/001_basic.pl', ], - 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.path() : '', - 'TAR': tar.found() ? tar.path() : '' }, + 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.full_path() : '', + 'TAR': tar.found() ? 
tar.full_path() : '' }, }, } diff --git a/contrib/basic_archive/basic_archive.c b/contrib/basic_archive/basic_archive.c index 4a8b8c7ac29c1..8fc633d2cbf62 100644 --- a/contrib/basic_archive/basic_archive.c +++ b/contrib/basic_archive/basic_archive.c @@ -65,7 +65,7 @@ void _PG_init(void) { DefineCustomStringVariable("basic_archive.archive_directory", - gettext_noop("Archive file destination directory."), + "Archive file destination directory.", NULL, &archive_directory, "", diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index 7866438122f58..c11e06c34ee5f 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -151,7 +151,7 @@ blbuild(Relation heap, Relation index, IndexInfo *indexInfo) MemoryContextDelete(buildstate.tmpCtx); - result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + result = palloc_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; diff --git a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h index 648167045f4e8..b2966d37077fa 100644 --- a/contrib/bloom/bloom.h +++ b/contrib/bloom/bloom.h @@ -72,7 +72,7 @@ typedef BloomPageOpaqueData *BloomPageOpaque; ((BloomTuple *)(PageGetContents(page) \ + (state)->sizeOfBloomTuple * ((offset) - 1))) #define BloomPageGetNextTuple(state, tuple) \ - ((BloomTuple *)((Pointer)(tuple) + (state)->sizeOfBloomTuple)) + ((BloomTuple *)((char *)(tuple) + (state)->sizeOfBloomTuple)) /* Preserved page numbers */ #define BLOOM_METAPAGE_BLKNO (0) diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c index d072f47fe28b5..0d71edbe91c36 100644 --- a/contrib/bloom/blscan.c +++ b/contrib/bloom/blscan.c @@ -29,7 +29,7 @@ blbeginscan(Relation r, int nkeys, int norderbys) scan = RelationGetIndexScan(r, nkeys, norderbys); - so = (BloomScanOpaque) palloc(sizeof(BloomScanOpaqueData)); + so = (BloomScanOpaque) palloc_object(BloomScanOpaqueData); initBloomState(&so->state, scan->indexRelation); so->sign = NULL; @@ -86,7 +86,7 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) /* New search: have to calculate search signature */ ScanKey skey = scan->keyData; - so->sign = palloc0(sizeof(BloomSignatureWord) * so->state.opts.bloomLength); + so->sign = palloc0_array(BloomSignatureWord, so->state.opts.bloomLength); for (i = 0; i < scan->numberOfKeys; i++) { diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 2c0e71eedc654..7a468b4a173ba 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -86,7 +86,7 @@ makeDefaultBloomOptions(void) BloomOptions *opts; int i; - opts = (BloomOptions *) palloc0(sizeof(BloomOptions)); + opts = palloc0_object(BloomOptions); /* Convert DEFAULT_BLOOM_LENGTH from # of bits to # of words */ opts->bloomLength = (DEFAULT_BLOOM_LENGTH + SIGNWORDBITS - 1) / SIGNWORDBITS; for (i = 0; i < INDEX_MAX_KEYS; i++) @@ -324,7 +324,7 @@ BloomPageAddItem(BloomState *state, Page page, BloomTuple *tuple) { BloomTuple *itup; BloomPageOpaque opaque; - Pointer ptr; + char *ptr; /* We shouldn't be pointed to an invalid page */ Assert(!PageIsNew(page) && !BloomPageIsDeleted(page)); @@ -336,11 +336,11 @@ BloomPageAddItem(BloomState *state, Page page, BloomTuple *tuple) /* Copy new tuple to the end of page */ opaque = BloomPageGetOpaque(page); itup = BloomPageGetTuple(state, page, opaque->maxoff + 1); - memcpy((Pointer) itup, (Pointer) tuple, state->sizeOfBloomTuple); + memcpy(itup, tuple, state->sizeOfBloomTuple); /* Adjust maxoff and pd_lower */ opaque->maxoff++; - ptr = (Pointer) BloomPageGetTuple(state, page, opaque->maxoff + 1); + 
ptr = (char *) BloomPageGetTuple(state, page, opaque->maxoff + 1); ((PageHeader) page)->pd_lower = ptr - page; /* Assert we didn't overrun available space */ diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c index 86b15a75f6fb9..e68a9008f56c8 100644 --- a/contrib/bloom/blvacuum.c +++ b/contrib/bloom/blvacuum.c @@ -42,7 +42,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, GenericXLogState *gxlogState; if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); initBloomState(&state, index); @@ -94,8 +94,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, { /* No; copy it to itupPtr++, but skip copy if not needed */ if (itupPtr != itup) - memmove((Pointer) itupPtr, (Pointer) itup, - state.sizeOfBloomTuple); + memmove(itupPtr, itup, state.sizeOfBloomTuple); itupPtr = BloomPageGetNextTuple(&state, itupPtr); } @@ -122,7 +121,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, if (BloomPageGetMaxOffset(page) == 0) BloomPageSetDeleted(page); /* Adjust pd_lower */ - ((PageHeader) page)->pd_lower = (Pointer) itupPtr - page; + ((PageHeader) page)->pd_lower = (char *) itupPtr - page; /* Finish WAL-logging */ GenericXLogFinish(gxlogState); } @@ -172,7 +171,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) return stats; if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); /* * Iterate over the pages: insert deleted pages into FSM and collect @@ -192,7 +191,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (PageIsNew(page) || BloomPageIsDeleted(page)) { diff --git a/contrib/btree_gin/Makefile b/contrib/btree_gin/Makefile index 0a15811516819..ad054598db6c9 100644 --- a/contrib/btree_gin/Makefile +++ b/contrib/btree_gin/Makefile @@ -7,7 +7,7 @@ OBJS = \ EXTENSION = btree_gin DATA = btree_gin--1.0.sql btree_gin--1.0--1.1.sql btree_gin--1.1--1.2.sql \ - btree_gin--1.2--1.3.sql + btree_gin--1.2--1.3.sql btree_gin--1.3--1.4.sql PGFILEDESC = "btree_gin - B-tree equivalent GIN operator classes" REGRESS = install_btree_gin int2 int4 int8 float4 float8 money oid \ diff --git a/contrib/btree_gin/btree_gin--1.3--1.4.sql b/contrib/btree_gin/btree_gin--1.3--1.4.sql new file mode 100644 index 0000000000000..61b5dcbede6c5 --- /dev/null +++ b/contrib/btree_gin/btree_gin--1.3--1.4.sql @@ -0,0 +1,151 @@ +/* contrib/btree_gin/btree_gin--1.3--1.4.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION btree_gin UPDATE TO '1.4'" to load this file. \quit + +-- +-- Cross-type operator support is new in 1.4. We only need to worry +-- about this for cross-type operators that exist in core. +-- +-- Because the opclass extractQuery and consistent methods don't directly +-- get any information about the datatype of the RHS value, we have to +-- encode that in the operator strategy numbers. The strategy numbers +-- are the operator's normal btree strategy (1-5) plus 16 times a code +-- for the RHS datatype. 
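+-- For example, with RHS type code 1 the cross-type "<" operator gets
+-- strategy number 16 * 1 + 1 = 17 = 0x11 and "=" gets 16 * 1 + 3 = 19 = 0x13,
+-- while same-type operators keep the plain btree strategies 1-5; the
+-- hexadecimal notation below makes the two components easy to read off.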
+-- + +ALTER OPERATOR FAMILY int2_ops USING gin +ADD + -- Code 1: RHS is int4 + OPERATOR 0x11 < (int2, int4), + OPERATOR 0x12 <= (int2, int4), + OPERATOR 0x13 = (int2, int4), + OPERATOR 0x14 >= (int2, int4), + OPERATOR 0x15 > (int2, int4), + -- Code 2: RHS is int8 + OPERATOR 0x21 < (int2, int8), + OPERATOR 0x22 <= (int2, int8), + OPERATOR 0x23 = (int2, int8), + OPERATOR 0x24 >= (int2, int8), + OPERATOR 0x25 > (int2, int8) +; + +ALTER OPERATOR FAMILY int4_ops USING gin +ADD + -- Code 1: RHS is int2 + OPERATOR 0x11 < (int4, int2), + OPERATOR 0x12 <= (int4, int2), + OPERATOR 0x13 = (int4, int2), + OPERATOR 0x14 >= (int4, int2), + OPERATOR 0x15 > (int4, int2), + -- Code 2: RHS is int8 + OPERATOR 0x21 < (int4, int8), + OPERATOR 0x22 <= (int4, int8), + OPERATOR 0x23 = (int4, int8), + OPERATOR 0x24 >= (int4, int8), + OPERATOR 0x25 > (int4, int8) +; + +ALTER OPERATOR FAMILY int8_ops USING gin +ADD + -- Code 1: RHS is int2 + OPERATOR 0x11 < (int8, int2), + OPERATOR 0x12 <= (int8, int2), + OPERATOR 0x13 = (int8, int2), + OPERATOR 0x14 >= (int8, int2), + OPERATOR 0x15 > (int8, int2), + -- Code 2: RHS is int4 + OPERATOR 0x21 < (int8, int4), + OPERATOR 0x22 <= (int8, int4), + OPERATOR 0x23 = (int8, int4), + OPERATOR 0x24 >= (int8, int4), + OPERATOR 0x25 > (int8, int4) +; + +ALTER OPERATOR FAMILY float4_ops USING gin +ADD + -- Code 1: RHS is float8 + OPERATOR 0x11 < (float4, float8), + OPERATOR 0x12 <= (float4, float8), + OPERATOR 0x13 = (float4, float8), + OPERATOR 0x14 >= (float4, float8), + OPERATOR 0x15 > (float4, float8) +; + +ALTER OPERATOR FAMILY float8_ops USING gin +ADD + -- Code 1: RHS is float4 + OPERATOR 0x11 < (float8, float4), + OPERATOR 0x12 <= (float8, float4), + OPERATOR 0x13 = (float8, float4), + OPERATOR 0x14 >= (float8, float4), + OPERATOR 0x15 > (float8, float4) +; + +ALTER OPERATOR FAMILY text_ops USING gin +ADD + -- Code 1: RHS is name + OPERATOR 0x11 < (text, name), + OPERATOR 0x12 <= (text, name), + OPERATOR 0x13 = (text, name), + OPERATOR 0x14 >= (text, name), + OPERATOR 0x15 > (text, name) +; + +ALTER OPERATOR FAMILY name_ops USING gin +ADD + -- Code 1: RHS is text + OPERATOR 0x11 < (name, text), + OPERATOR 0x12 <= (name, text), + OPERATOR 0x13 = (name, text), + OPERATOR 0x14 >= (name, text), + OPERATOR 0x15 > (name, text) +; + +ALTER OPERATOR FAMILY date_ops USING gin +ADD + -- Code 1: RHS is timestamp + OPERATOR 0x11 < (date, timestamp), + OPERATOR 0x12 <= (date, timestamp), + OPERATOR 0x13 = (date, timestamp), + OPERATOR 0x14 >= (date, timestamp), + OPERATOR 0x15 > (date, timestamp), + -- Code 2: RHS is timestamptz + OPERATOR 0x21 < (date, timestamptz), + OPERATOR 0x22 <= (date, timestamptz), + OPERATOR 0x23 = (date, timestamptz), + OPERATOR 0x24 >= (date, timestamptz), + OPERATOR 0x25 > (date, timestamptz) +; + +ALTER OPERATOR FAMILY timestamp_ops USING gin +ADD + -- Code 1: RHS is date + OPERATOR 0x11 < (timestamp, date), + OPERATOR 0x12 <= (timestamp, date), + OPERATOR 0x13 = (timestamp, date), + OPERATOR 0x14 >= (timestamp, date), + OPERATOR 0x15 > (timestamp, date), + -- Code 2: RHS is timestamptz + OPERATOR 0x21 < (timestamp, timestamptz), + OPERATOR 0x22 <= (timestamp, timestamptz), + OPERATOR 0x23 = (timestamp, timestamptz), + OPERATOR 0x24 >= (timestamp, timestamptz), + OPERATOR 0x25 > (timestamp, timestamptz) +; + +ALTER OPERATOR FAMILY timestamptz_ops USING gin +ADD + -- Code 1: RHS is date + OPERATOR 0x11 < (timestamptz, date), + OPERATOR 0x12 <= (timestamptz, date), + OPERATOR 0x13 = (timestamptz, date), + OPERATOR 0x14 >= (timestamptz, date), + OPERATOR 0x15 > 
(timestamptz, date), + -- Code 2: RHS is timestamp + OPERATOR 0x21 < (timestamptz, timestamp), + OPERATOR 0x22 <= (timestamptz, timestamp), + OPERATOR 0x23 = (timestamptz, timestamp), + OPERATOR 0x24 >= (timestamptz, timestamp), + OPERATOR 0x25 > (timestamptz, timestamp) +; diff --git a/contrib/btree_gin/btree_gin.c b/contrib/btree_gin/btree_gin.c index 98663cb86117e..afb8b3820af26 100644 --- a/contrib/btree_gin/btree_gin.c +++ b/contrib/btree_gin/btree_gin.c @@ -6,6 +6,8 @@ #include <limits.h> #include "access/stratnum.h" +#include "mb/pg_wchar.h" +#include "nodes/miscnodes.h" #include "utils/builtins.h" #include "utils/date.h" #include "utils/float.h" @@ -13,20 +15,36 @@ #include "utils/numeric.h" #include "utils/timestamp.h" #include "utils/uuid.h" +#include "varatt.h" PG_MODULE_MAGIC_EXT( .name = "btree_gin", .version = PG_VERSION ); +/* + * Our opclasses use the same strategy numbers as btree (1-5) for same-type + * comparison operators. For cross-type comparison operators, the + * low 4 bits of our strategy numbers are the btree strategy number, + * and the upper bits are a code for the right-hand-side data type. + */ +#define BTGIN_GET_BTREE_STRATEGY(strat) ((strat) & 0x0F) +#define BTGIN_GET_RHS_TYPE_CODE(strat) ((strat) >> 4) + +/* extra data passed from gin_btree_extract_query to gin_btree_compare_prefix */ typedef struct QueryInfo { - StrategyNumber strategy; - Datum datum; - bool is_varlena; - Datum (*typecmp) (FunctionCallInfo); + StrategyNumber strategy; /* operator strategy number */ + Datum orig_datum; /* original query (comparison) datum */ + Datum entry_datum; /* datum we reported as the entry value */ + PGFunction typecmp; /* appropriate btree comparison function */ } QueryInfo; +typedef Datum (*btree_gin_convert_function) (Datum input); + +typedef Datum (*btree_gin_leftmost_function) (void); + + /*** GIN support functions shared by all datatypes ***/ static Datum @@ -34,8 +52,9 @@ gin_btree_extract_value(FunctionCallInfo fcinfo, bool is_varlena) { Datum datum = PG_GETARG_DATUM(0); int32 *nentries = (int32 *) PG_GETARG_POINTER(1); - Datum *entries = (Datum *) palloc(sizeof(Datum)); + Datum *entries = palloc_object(Datum); + /* Ensure that values stored in the index are not toasted */ if (is_varlena) datum = PointerGetDatum(PG_DETOAST_DATUM(datum)); entries[0] = datum; @@ -44,42 +63,54 @@ gin_btree_extract_value(FunctionCallInfo fcinfo, bool is_varlena) PG_RETURN_POINTER(entries); } -/* - * For BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, and - * BTEqualStrategyNumber we want to start the index scan at the - * supplied query datum, and work forward. For BTLessStrategyNumber - * and BTLessEqualStrategyNumber, we need to start at the leftmost - * key, and work forward until the supplied query datum (which must be - * sent along inside the QueryInfo structure).
- */ static Datum gin_btree_extract_query(FunctionCallInfo fcinfo, - bool is_varlena, - Datum (*leftmostvalue) (void), - Datum (*typecmp) (FunctionCallInfo)) + btree_gin_leftmost_function leftmostvalue, + const bool *rhs_is_varlena, + const btree_gin_convert_function *cvt_fns, + const PGFunction *cmp_fns) { Datum datum = PG_GETARG_DATUM(0); int32 *nentries = (int32 *) PG_GETARG_POINTER(1); StrategyNumber strategy = PG_GETARG_UINT16(2); bool **partialmatch = (bool **) PG_GETARG_POINTER(3); Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4); - Datum *entries = (Datum *) palloc(sizeof(Datum)); - QueryInfo *data = (QueryInfo *) palloc(sizeof(QueryInfo)); - bool *ptr_partialmatch; + Datum *entries = palloc_object(Datum); + QueryInfo *data = palloc_object(QueryInfo); + bool *ptr_partialmatch = palloc_object(bool); + int btree_strat, + rhs_code; + + /* + * Extract the btree strategy code and the RHS data type code from the + * given strategy number. + */ + btree_strat = BTGIN_GET_BTREE_STRATEGY(strategy); + rhs_code = BTGIN_GET_RHS_TYPE_CODE(strategy); + /* + * Detoast the comparison datum. This isn't necessary for correctness, + * but it can save repeat detoastings within the comparison function. + */ + if (rhs_is_varlena[rhs_code]) + datum = PointerGetDatum(PG_DETOAST_DATUM(datum)); + + /* Prep single comparison key with possible partial-match flag */ *nentries = 1; - ptr_partialmatch = *partialmatch = (bool *) palloc(sizeof(bool)); + *partialmatch = ptr_partialmatch; *ptr_partialmatch = false; - if (is_varlena) - datum = PointerGetDatum(PG_DETOAST_DATUM(datum)); - data->strategy = strategy; - data->datum = datum; - data->is_varlena = is_varlena; - data->typecmp = typecmp; - *extra_data = (Pointer *) palloc(sizeof(Pointer)); - **extra_data = (Pointer) data; - switch (strategy) + /* + * For BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, and + * BTEqualStrategyNumber we want to start the index scan at the supplied + * query datum, and work forward. For BTLessStrategyNumber and + * BTLessEqualStrategyNumber, we need to start at the leftmost key, and + * work forward until the supplied query datum (which we'll send along + * inside the QueryInfo structure). Use partial match rules except for + * BTEqualStrategyNumber without a conversion function. (If there is a + * conversion function, comparison to the entry value is not trustworthy.) + */ + switch (btree_strat) { case BTLessStrategyNumber: case BTLessEqualStrategyNumber: @@ -91,75 +122,106 @@ gin_btree_extract_query(FunctionCallInfo fcinfo, *ptr_partialmatch = true; /* FALLTHROUGH */ case BTEqualStrategyNumber: - entries[0] = datum; + /* If we have a conversion function, apply it */ + if (cvt_fns && cvt_fns[rhs_code]) + { + entries[0] = (*cvt_fns[rhs_code]) (datum); + *ptr_partialmatch = true; + } + else + entries[0] = datum; break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); } + /* Fill "extra" data */ + data->strategy = strategy; + data->orig_datum = datum; + data->entry_datum = entries[0]; + data->typecmp = cmp_fns[rhs_code]; + *extra_data = palloc_object(Pointer); + **extra_data = (Pointer) data; + PG_RETURN_POINTER(entries); } -/* - * Datum a is a value from extract_query method and for BTLess* - * strategy it is a left-most value. So, use original datum from QueryInfo - * to decide to stop scanning or not. Datum b is always from index. 
- */ static Datum gin_btree_compare_prefix(FunctionCallInfo fcinfo) { - Datum a = PG_GETARG_DATUM(0); - Datum b = PG_GETARG_DATUM(1); + Datum partial_key PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(0); + Datum key = PG_GETARG_DATUM(1); QueryInfo *data = (QueryInfo *) PG_GETARG_POINTER(3); int32 res, cmp; + /* + * partial_key is only an approximation to the real comparison value, + * especially if it's a leftmost value. We can get an accurate answer by + * doing a possibly-cross-type comparison to the real comparison value. + * (Note that partial_key and key are of the indexed datatype while + * orig_datum is of the query operator's RHS datatype.) + * + * But just to be sure that things are what we expect, let's assert that + * partial_key is indeed what gin_btree_extract_query reported, so that + * we'll notice if anyone ever changes the core code in a way that breaks + * our assumptions. + */ + Assert(partial_key == data->entry_datum); + cmp = DatumGetInt32(CallerFInfoFunctionCall2(data->typecmp, fcinfo->flinfo, PG_GET_COLLATION(), - (data->strategy == BTLessStrategyNumber || - data->strategy == BTLessEqualStrategyNumber) - ? data->datum : a, - b)); + data->orig_datum, + key)); - switch (data->strategy) + /* + * Convert the comparison result to the correct thing for the search + * operator strategy. When dealing with cross-type comparisons, an + * imprecise entry datum could lead GIN to start the scan just before the + * first possible match, so we must continue the scan if the current index + * entry doesn't satisfy the search condition for >= and > cases. But if + * that happens in an = search we can stop, because an imprecise entry + * datum means that the search value is unrepresentable in the indexed + * data type, so that there will be no exact matches. 
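+ *
+ * As an illustration: probing an int2 column for i > 32768::int4 clamps
+ * the entry datum to 32767, so the scan can begin at entries that still
+ * compare below the real query value; answering "keep scanning" rather
+ * than "end scan" for the > and >= cases steps past them without
+ * reporting false matches.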
+ */ + switch (BTGIN_GET_BTREE_STRATEGY(data->strategy)) { case BTLessStrategyNumber: /* If original datum > indexed one then return match */ if (cmp > 0) res = 0; else - res = 1; + res = 1; /* end scan */ break; case BTLessEqualStrategyNumber: - /* The same except equality */ + /* If original datum >= indexed one then return match */ if (cmp >= 0) res = 0; else - res = 1; + res = 1; /* end scan */ break; case BTEqualStrategyNumber: - if (cmp != 0) - res = 1; - else + /* If original datum = indexed one then return match */ + /* See above about why we can end scan when cmp < 0 */ + if (cmp == 0) res = 0; + else + res = 1; /* end scan */ break; case BTGreaterEqualStrategyNumber: /* If original datum <= indexed one then return match */ if (cmp <= 0) res = 0; else - res = 1; + res = -1; /* keep scanning */ break; case BTGreaterStrategyNumber: - /* If original datum <= indexed one then return match */ - /* If original datum == indexed one then continue scan */ + /* If original datum < indexed one then return match */ if (cmp < 0) res = 0; - else if (cmp == 0) - res = -1; else - res = 1; + res = -1; /* keep scanning */ break; default: elog(ERROR, "unrecognized strategy number: %d", @@ -182,19 +244,20 @@ gin_btree_consistent(PG_FUNCTION_ARGS) /*** GIN_SUPPORT macro defines the datatype specific functions ***/ -#define GIN_SUPPORT(type, is_varlena, leftmostvalue, typecmp) \ +#define GIN_SUPPORT(type, leftmostvalue, is_varlena, cvtfns, cmpfns) \ PG_FUNCTION_INFO_V1(gin_extract_value_##type); \ Datum \ gin_extract_value_##type(PG_FUNCTION_ARGS) \ { \ - return gin_btree_extract_value(fcinfo, is_varlena); \ + return gin_btree_extract_value(fcinfo, is_varlena[0]); \ } \ PG_FUNCTION_INFO_V1(gin_extract_query_##type); \ Datum \ gin_extract_query_##type(PG_FUNCTION_ARGS) \ { \ return gin_btree_extract_query(fcinfo, \ - is_varlena, leftmostvalue, typecmp); \ + leftmostvalue, is_varlena, \ + cvtfns, cmpfns); \ } \ PG_FUNCTION_INFO_V1(gin_compare_prefix_##type); \ Datum \ @@ -206,13 +269,66 @@ gin_compare_prefix_##type(PG_FUNCTION_ARGS) \ /*** Datatype specifications ***/ +/* Function to produce the least possible value of the indexed datatype */ static Datum leftmostvalue_int2(void) { return Int16GetDatum(SHRT_MIN); } -GIN_SUPPORT(int2, false, leftmostvalue_int2, btint2cmp) +/* + * For cross-type support, we must provide conversion functions that produce + * a Datum of the indexed datatype, since GIN requires the "entry" datums to + * be of that type. If an exact conversion is not possible, produce a value + * that will lead GIN to find the first index entry that is greater than + * or equal to the actual comparison value. (But rounding down is OK, so + * sometimes we might find an index entry that's just less than the + * comparison value.) + * + * For integer values, it's sufficient to clamp the input to be in-range. + * + * Note: for out-of-range input values, we could in theory detect that the + * search condition matches all or none of the index, and avoid a useless + * index descent in the latter case. Such searches are probably rare though, + * so we don't contort this code enough to do that. 
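+ *
+ * For example, 32768::int4 clamps to 32767::int2, so an = search starts
+ * at the clamped value, compares it against the original 32768, and
+ * correctly finds no match, 32768 being unrepresentable in int2.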
+ */ +static Datum +cvt_int4_int2(Datum input) +{ + int32 val = DatumGetInt32(input); + + val = Max(val, SHRT_MIN); + val = Min(val, SHRT_MAX); + return Int16GetDatum((int16) val); +} + +static Datum +cvt_int8_int2(Datum input) +{ + int64 val = DatumGetInt64(input); + + val = Max(val, SHRT_MIN); + val = Min(val, SHRT_MAX); + return Int16GetDatum((int16) val); +} + +/* + * RHS-type-is-varlena flags, conversion and comparison function arrays, + * indexed by high bits of the operator strategy number. A NULL in the + * conversion function array indicates that no conversion is needed, which + * will always be the case for the zero'th entry. Note that the cross-type + * comparison functions should be the ones with the indexed datatype second. + */ +static const bool int2_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function int2_cvt_fns[] = +{NULL, cvt_int4_int2, cvt_int8_int2}; + +static const PGFunction int2_cmp_fns[] = +{btint2cmp, btint42cmp, btint82cmp}; + +GIN_SUPPORT(int2, leftmostvalue_int2, int2_rhs_is_varlena, int2_cvt_fns, int2_cmp_fns) static Datum leftmostvalue_int4(void) @@ -220,7 +336,34 @@ leftmostvalue_int4(void) return Int32GetDatum(INT_MIN); } -GIN_SUPPORT(int4, false, leftmostvalue_int4, btint4cmp) +static Datum +cvt_int2_int4(Datum input) +{ + int16 val = DatumGetInt16(input); + + return Int32GetDatum((int32) val); +} + +static Datum +cvt_int8_int4(Datum input) +{ + int64 val = DatumGetInt64(input); + + val = Max(val, INT_MIN); + val = Min(val, INT_MAX); + return Int32GetDatum((int32) val); +} + +static const bool int4_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function int4_cvt_fns[] = +{NULL, cvt_int2_int4, cvt_int8_int4}; + +static const PGFunction int4_cmp_fns[] = +{btint4cmp, btint24cmp, btint84cmp}; + +GIN_SUPPORT(int4, leftmostvalue_int4, int4_rhs_is_varlena, int4_cvt_fns, int4_cmp_fns) static Datum leftmostvalue_int8(void) @@ -228,7 +371,32 @@ leftmostvalue_int8(void) return Int64GetDatum(PG_INT64_MIN); } -GIN_SUPPORT(int8, false, leftmostvalue_int8, btint8cmp) +static Datum +cvt_int2_int8(Datum input) +{ + int16 val = DatumGetInt16(input); + + return Int64GetDatum((int64) val); +} + +static Datum +cvt_int4_int8(Datum input) +{ + int32 val = DatumGetInt32(input); + + return Int64GetDatum((int64) val); +} + +static const bool int8_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function int8_cvt_fns[] = +{NULL, cvt_int2_int8, cvt_int4_int8}; + +static const PGFunction int8_cmp_fns[] = +{btint8cmp, btint28cmp, btint48cmp}; + +GIN_SUPPORT(int8, leftmostvalue_int8, int8_rhs_is_varlena, int8_cvt_fns, int8_cmp_fns) static Datum leftmostvalue_float4(void) @@ -236,7 +404,34 @@ leftmostvalue_float4(void) return Float4GetDatum(-get_float4_infinity()); } -GIN_SUPPORT(float4, false, leftmostvalue_float4, btfloat4cmp) +static Datum +cvt_float8_float4(Datum input) +{ + float8 val = DatumGetFloat8(input); + float4 result; + + /* + * Assume that ordinary C conversion will produce a usable result. + * (Compare dtof(), which raises error conditions that we don't need.) + * Note that for inputs that aren't exactly representable as float4, it + * doesn't matter whether the conversion rounds up or down. That might + * cause us to scan a few index entries that we'll reject as not matching, + * but we won't miss any that should match. 
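+ *
+ * For instance, 1e-300::float8 underflows to 0::float4 under this
+ * conversion, so a scan for i > 1e-300::float8 begins at the zero
+ * entries and merely rejects them during the prefix comparison.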
+ */ + result = (float4) val; + return Float4GetDatum(result); +} + +static const bool float4_rhs_is_varlena[] = +{false, false}; + +static const btree_gin_convert_function float4_cvt_fns[] = +{NULL, cvt_float8_float4}; + +static const PGFunction float4_cmp_fns[] = +{btfloat4cmp, btfloat84cmp}; + +GIN_SUPPORT(float4, leftmostvalue_float4, float4_rhs_is_varlena, float4_cvt_fns, float4_cmp_fns) static Datum leftmostvalue_float8(void) @@ -244,7 +439,24 @@ leftmostvalue_float8(void) return Float8GetDatum(-get_float8_infinity()); } -GIN_SUPPORT(float8, false, leftmostvalue_float8, btfloat8cmp) +static Datum +cvt_float4_float8(Datum input) +{ + float4 val = DatumGetFloat4(input); + + return Float8GetDatum((float8) val); +} + +static const bool float8_rhs_is_varlena[] = +{false, false}; + +static const btree_gin_convert_function float8_cvt_fns[] = +{NULL, cvt_float4_float8}; + +static const PGFunction float8_cmp_fns[] = +{btfloat8cmp, btfloat48cmp}; + +GIN_SUPPORT(float8, leftmostvalue_float8, float8_rhs_is_varlena, float8_cvt_fns, float8_cmp_fns) static Datum leftmostvalue_money(void) @@ -252,7 +464,13 @@ leftmostvalue_money(void) return Int64GetDatum(PG_INT64_MIN); } -GIN_SUPPORT(money, false, leftmostvalue_money, cash_cmp) +static const bool money_rhs_is_varlena[] = +{false}; + +static const PGFunction money_cmp_fns[] = +{cash_cmp}; + +GIN_SUPPORT(money, leftmostvalue_money, money_rhs_is_varlena, NULL, money_cmp_fns) static Datum leftmostvalue_oid(void) @@ -260,7 +478,13 @@ leftmostvalue_oid(void) return ObjectIdGetDatum(0); } -GIN_SUPPORT(oid, false, leftmostvalue_oid, btoidcmp) +static const bool oid_rhs_is_varlena[] = +{false}; + +static const PGFunction oid_cmp_fns[] = +{btoidcmp}; + +GIN_SUPPORT(oid, leftmostvalue_oid, oid_rhs_is_varlena, NULL, oid_cmp_fns) static Datum leftmostvalue_timestamp(void) @@ -268,9 +492,75 @@ leftmostvalue_timestamp(void) return TimestampGetDatum(DT_NOBEGIN); } -GIN_SUPPORT(timestamp, false, leftmostvalue_timestamp, timestamp_cmp) +static Datum +cvt_date_timestamp(Datum input) +{ + DateADT val = DatumGetDateADT(input); + Timestamp result; + ErrorSaveContext escontext = {T_ErrorSaveContext}; -GIN_SUPPORT(timestamptz, false, leftmostvalue_timestamp, timestamp_cmp) + result = date2timestamp_safe(val, (Node *) &escontext); + /* We can ignore errors, since result is useful as-is */ + return TimestampGetDatum(result); +} + +static Datum +cvt_timestamptz_timestamp(Datum input) +{ + TimestampTz val = DatumGetTimestampTz(input); + ErrorSaveContext escontext = {T_ErrorSaveContext}; + Timestamp result; + + result = timestamptz2timestamp_safe(val, (Node *) &escontext); + /* We can ignore errors, since result is useful as-is */ + return TimestampGetDatum(result); +} + +static const bool timestamp_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function timestamp_cvt_fns[] = +{NULL, cvt_date_timestamp, cvt_timestamptz_timestamp}; + +static const PGFunction timestamp_cmp_fns[] = +{timestamp_cmp, date_cmp_timestamp, timestamptz_cmp_timestamp}; + +GIN_SUPPORT(timestamp, leftmostvalue_timestamp, timestamp_rhs_is_varlena, timestamp_cvt_fns, timestamp_cmp_fns) + +static Datum +cvt_date_timestamptz(Datum input) +{ + DateADT val = DatumGetDateADT(input); + ErrorSaveContext escontext = {T_ErrorSaveContext}; + TimestampTz result; + + result = date2timestamptz_safe(val, (Node *) &escontext); + /* We can ignore errors, since result is useful as-is */ + return TimestampTzGetDatum(result); +} + +static Datum +cvt_timestamp_timestamptz(Datum input) +{ + 
Timestamp val = DatumGetTimestamp(input); + ErrorSaveContext escontext = {T_ErrorSaveContext}; + TimestampTz result; + + result = timestamp2timestamptz_safe(val, (Node *) &escontext); + /* We can ignore errors, since result is useful as-is */ + return TimestampTzGetDatum(result); +} + +static const bool timestamptz_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function timestamptz_cvt_fns[] = +{NULL, cvt_date_timestamptz, cvt_timestamp_timestamptz}; + +static const PGFunction timestamptz_cmp_fns[] = +{timestamp_cmp, date_cmp_timestamptz, timestamp_cmp_timestamptz}; + +GIN_SUPPORT(timestamptz, leftmostvalue_timestamp, timestamptz_rhs_is_varlena, timestamptz_cvt_fns, timestamptz_cmp_fns) static Datum leftmostvalue_time(void) @@ -278,12 +568,18 @@ leftmostvalue_time(void) return TimeADTGetDatum(0); } -GIN_SUPPORT(time, false, leftmostvalue_time, time_cmp) +static const bool time_rhs_is_varlena[] = +{false}; + +static const PGFunction time_cmp_fns[] = +{time_cmp}; + +GIN_SUPPORT(time, leftmostvalue_time, time_rhs_is_varlena, NULL, time_cmp_fns) static Datum leftmostvalue_timetz(void) { - TimeTzADT *v = palloc(sizeof(TimeTzADT)); + TimeTzADT *v = palloc_object(TimeTzADT); v->time = 0; v->zone = -24 * 3600; /* XXX is that true? */ @@ -291,7 +587,13 @@ leftmostvalue_timetz(void) return TimeTzADTPGetDatum(v); } -GIN_SUPPORT(timetz, false, leftmostvalue_timetz, timetz_cmp) +static const bool timetz_rhs_is_varlena[] = +{false}; + +static const PGFunction timetz_cmp_fns[] = +{timetz_cmp}; + +GIN_SUPPORT(timetz, leftmostvalue_timetz, timetz_rhs_is_varlena, NULL, timetz_cmp_fns) static Datum leftmostvalue_date(void) @@ -299,39 +601,90 @@ leftmostvalue_date(void) return DateADTGetDatum(DATEVAL_NOBEGIN); } -GIN_SUPPORT(date, false, leftmostvalue_date, date_cmp) +static Datum +cvt_timestamp_date(Datum input) +{ + Timestamp val = DatumGetTimestamp(input); + ErrorSaveContext escontext = {T_ErrorSaveContext}; + DateADT result; + + result = timestamp2date_safe(val, (Node *) &escontext); + /* We can ignore errors, since result is useful as-is */ + return DateADTGetDatum(result); +} + +static Datum +cvt_timestamptz_date(Datum input) +{ + TimestampTz val = DatumGetTimestampTz(input); + ErrorSaveContext escontext = {T_ErrorSaveContext}; + DateADT result; + + result = timestamptz2date_safe(val, (Node *) &escontext); + /* We can ignore errors, since result is useful as-is */ + return DateADTGetDatum(result); +} + +static const bool date_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function date_cvt_fns[] = +{NULL, cvt_timestamp_date, cvt_timestamptz_date}; + +static const PGFunction date_cmp_fns[] = +{date_cmp, timestamp_cmp_date, timestamptz_cmp_date}; + +GIN_SUPPORT(date, leftmostvalue_date, date_rhs_is_varlena, date_cvt_fns, date_cmp_fns) static Datum leftmostvalue_interval(void) { - Interval *v = palloc(sizeof(Interval)); + Interval *v = palloc_object(Interval); INTERVAL_NOBEGIN(v); return IntervalPGetDatum(v); } -GIN_SUPPORT(interval, false, leftmostvalue_interval, interval_cmp) +static const bool interval_rhs_is_varlena[] = +{false}; + +static const PGFunction interval_cmp_fns[] = +{interval_cmp}; + +GIN_SUPPORT(interval, leftmostvalue_interval, interval_rhs_is_varlena, NULL, interval_cmp_fns) static Datum leftmostvalue_macaddr(void) { - macaddr *v = palloc0(sizeof(macaddr)); + macaddr *v = palloc0_object(macaddr); return MacaddrPGetDatum(v); } -GIN_SUPPORT(macaddr, false, leftmostvalue_macaddr, macaddr_cmp) +static const bool 
macaddr_rhs_is_varlena[] = +{false}; + +static const PGFunction macaddr_cmp_fns[] = +{macaddr_cmp}; + +GIN_SUPPORT(macaddr, leftmostvalue_macaddr, macaddr_rhs_is_varlena, NULL, macaddr_cmp_fns) static Datum leftmostvalue_macaddr8(void) { - macaddr8 *v = palloc0(sizeof(macaddr8)); + macaddr8 *v = palloc0_object(macaddr8); return Macaddr8PGetDatum(v); } -GIN_SUPPORT(macaddr8, false, leftmostvalue_macaddr8, macaddr8_cmp) +static const bool macaddr8_rhs_is_varlena[] = +{false}; + +static const PGFunction macaddr8_cmp_fns[] = +{macaddr8_cmp}; + +GIN_SUPPORT(macaddr8, leftmostvalue_macaddr8, macaddr8_rhs_is_varlena, NULL, macaddr8_cmp_fns) static Datum leftmostvalue_inet(void) @@ -339,9 +692,21 @@ leftmostvalue_inet(void) return DirectFunctionCall1(inet_in, CStringGetDatum("0.0.0.0/0")); } -GIN_SUPPORT(inet, true, leftmostvalue_inet, network_cmp) +static const bool inet_rhs_is_varlena[] = +{true}; + +static const PGFunction inet_cmp_fns[] = +{network_cmp}; + +GIN_SUPPORT(inet, leftmostvalue_inet, inet_rhs_is_varlena, NULL, inet_cmp_fns) -GIN_SUPPORT(cidr, true, leftmostvalue_inet, network_cmp) +static const bool cidr_rhs_is_varlena[] = +{true}; + +static const PGFunction cidr_cmp_fns[] = +{network_cmp}; + +GIN_SUPPORT(cidr, leftmostvalue_inet, cidr_rhs_is_varlena, NULL, cidr_cmp_fns) static Datum leftmostvalue_text(void) @@ -349,9 +714,32 @@ leftmostvalue_text(void) return PointerGetDatum(cstring_to_text_with_len("", 0)); } -GIN_SUPPORT(text, true, leftmostvalue_text, bttextcmp) +static Datum +cvt_name_text(Datum input) +{ + Name val = DatumGetName(input); + + return PointerGetDatum(cstring_to_text(NameStr(*val))); +} + +static const bool text_rhs_is_varlena[] = +{true, false}; + +static const btree_gin_convert_function text_cvt_fns[] = +{NULL, cvt_name_text}; + +static const PGFunction text_cmp_fns[] = +{bttextcmp, btnametextcmp}; + +GIN_SUPPORT(text, leftmostvalue_text, text_rhs_is_varlena, text_cvt_fns, text_cmp_fns) + +static const bool bpchar_rhs_is_varlena[] = +{true}; + +static const PGFunction bpchar_cmp_fns[] = +{bpcharcmp}; -GIN_SUPPORT(bpchar, true, leftmostvalue_text, bpcharcmp) +GIN_SUPPORT(bpchar, leftmostvalue_text, bpchar_rhs_is_varlena, NULL, bpchar_cmp_fns) static Datum leftmostvalue_char(void) @@ -359,9 +747,21 @@ leftmostvalue_char(void) return CharGetDatum(0); } -GIN_SUPPORT(char, false, leftmostvalue_char, btcharcmp) +static const bool char_rhs_is_varlena[] = +{false}; -GIN_SUPPORT(bytea, true, leftmostvalue_text, byteacmp) +static const PGFunction char_cmp_fns[] = +{btcharcmp}; + +GIN_SUPPORT(char, leftmostvalue_char, char_rhs_is_varlena, NULL, char_cmp_fns) + +static const bool bytea_rhs_is_varlena[] = +{true}; + +static const PGFunction bytea_cmp_fns[] = +{byteacmp}; + +GIN_SUPPORT(bytea, leftmostvalue_text, bytea_rhs_is_varlena, NULL, bytea_cmp_fns) static Datum leftmostvalue_bit(void) @@ -372,7 +772,13 @@ leftmostvalue_bit(void) Int32GetDatum(-1)); } -GIN_SUPPORT(bit, true, leftmostvalue_bit, bitcmp) +static const bool bit_rhs_is_varlena[] = +{true}; + +static const PGFunction bit_cmp_fns[] = +{bitcmp}; + +GIN_SUPPORT(bit, leftmostvalue_bit, bit_rhs_is_varlena, NULL, bit_cmp_fns) static Datum leftmostvalue_varbit(void) @@ -383,7 +789,13 @@ leftmostvalue_varbit(void) Int32GetDatum(-1)); } -GIN_SUPPORT(varbit, true, leftmostvalue_varbit, bitcmp) +static const bool varbit_rhs_is_varlena[] = +{true}; + +static const PGFunction varbit_cmp_fns[] = +{bitcmp}; + +GIN_SUPPORT(varbit, leftmostvalue_varbit, varbit_rhs_is_varlena, NULL, varbit_cmp_fns) /* * Numeric type hasn't a 
real left-most value, so we use PointerGetDatum(NULL) @@ -428,7 +840,13 @@ leftmostvalue_numeric(void) return PointerGetDatum(NULL); } -GIN_SUPPORT(numeric, true, leftmostvalue_numeric, gin_numeric_cmp) +static const bool numeric_rhs_is_varlena[] = +{true}; + +static const PGFunction numeric_cmp_fns[] = +{gin_numeric_cmp}; + +GIN_SUPPORT(numeric, leftmostvalue_numeric, numeric_rhs_is_varlena, NULL, numeric_cmp_fns) /* * Use a similar trick to that used for numeric for enums, since we don't @@ -477,7 +895,13 @@ leftmostvalue_enum(void) return ObjectIdGetDatum(InvalidOid); } -GIN_SUPPORT(anyenum, false, leftmostvalue_enum, gin_enum_cmp) +static const bool enum_rhs_is_varlena[] = +{false}; + +static const PGFunction enum_cmp_fns[] = +{gin_enum_cmp}; + +GIN_SUPPORT(anyenum, leftmostvalue_enum, enum_rhs_is_varlena, NULL, enum_cmp_fns) static Datum leftmostvalue_uuid(void) @@ -486,12 +910,18 @@ leftmostvalue_uuid(void) * palloc0 will create the UUID with all zeroes: * "00000000-0000-0000-0000-000000000000" */ - pg_uuid_t *retval = (pg_uuid_t *) palloc0(sizeof(pg_uuid_t)); + pg_uuid_t *retval = palloc0_object(pg_uuid_t); return UUIDPGetDatum(retval); } -GIN_SUPPORT(uuid, false, leftmostvalue_uuid, uuid_cmp) +static const bool uuid_rhs_is_varlena[] = +{false}; + +static const PGFunction uuid_cmp_fns[] = +{uuid_cmp}; + +GIN_SUPPORT(uuid, leftmostvalue_uuid, uuid_rhs_is_varlena, NULL, uuid_cmp_fns) static Datum leftmostvalue_name(void) @@ -501,7 +931,37 @@ leftmostvalue_name(void) return NameGetDatum(result); } -GIN_SUPPORT(name, false, leftmostvalue_name, btnamecmp) +static Datum +cvt_text_name(Datum input) +{ + text *val = DatumGetTextPP(input); + NameData *result = (NameData *) palloc0(NAMEDATALEN); + int len = VARSIZE_ANY_EXHDR(val); + + /* + * Truncate oversize input. We're assuming this will produce a result + * considered less than the original. That could be a bad assumption in + * some collations, but fortunately an index on "name" is generally going + * to use C collation. 
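+ *
+ * For example, with the default NAMEDATALEN of 64, a 100-byte text query
+ * is clipped to at most 63 bytes, and pg_mbcliplen() ensures no multibyte
+ * character is cut in half, mirroring how overlength names are truncated
+ * on input.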
+ */ + if (len >= NAMEDATALEN) + len = pg_mbcliplen(VARDATA_ANY(val), len, NAMEDATALEN - 1); + + memcpy(NameStr(*result), VARDATA_ANY(val), len); + + return NameGetDatum(result); +} + +static const bool name_rhs_is_varlena[] = +{false, true}; + +static const btree_gin_convert_function name_cvt_fns[] = +{NULL, cvt_text_name}; + +static const PGFunction name_cmp_fns[] = +{btnamecmp, bttextnamecmp}; + +GIN_SUPPORT(name, leftmostvalue_name, name_rhs_is_varlena, name_cvt_fns, name_cmp_fns) static Datum leftmostvalue_bool(void) @@ -509,4 +969,10 @@ leftmostvalue_bool(void) return BoolGetDatum(false); } -GIN_SUPPORT(bool, false, leftmostvalue_bool, btboolcmp) +static const bool bool_rhs_is_varlena[] = +{false}; + +static const PGFunction bool_cmp_fns[] = +{btboolcmp}; + +GIN_SUPPORT(bool, leftmostvalue_bool, bool_rhs_is_varlena, NULL, bool_cmp_fns) diff --git a/contrib/btree_gin/btree_gin.control b/contrib/btree_gin/btree_gin.control index 67d0c997d8d26..0c77c81727117 100644 --- a/contrib/btree_gin/btree_gin.control +++ b/contrib/btree_gin/btree_gin.control @@ -1,6 +1,6 @@ # btree_gin extension comment = 'support for indexing common datatypes in GIN' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/btree_gin' relocatable = true trusted = true diff --git a/contrib/btree_gin/expected/date.out b/contrib/btree_gin/expected/date.out index 40dfa308cf753..e69c1da2000f2 100644 --- a/contrib/btree_gin/expected/date.out +++ b/contrib/btree_gin/expected/date.out @@ -49,3 +49,365 @@ SELECT * FROM test_date WHERE i>'2004-10-26'::date ORDER BY i; 10-28-2004 (2 rows) +explain (costs off) +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamp ORDER BY i; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_date + Recheck Cond: (i < 'Tue Oct 26 00:00:00 2004'::timestamp without time zone) + -> Bitmap Index Scan on idx_date + Index Cond: (i < 'Tue Oct 26 00:00:00 2004'::timestamp without time zone) +(6 rows) + +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 +(3 rows) + +SELECT * FROM test_date WHERE i<='2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 +(4 rows) + +SELECT * FROM test_date WHERE i='2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-26-2004 +(1 row) + +SELECT * FROM test_date WHERE i>='2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 +(3 rows) + +SELECT * FROM test_date WHERE i>'2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-27-2004 + 10-28-2004 +(2 rows) + +explain (costs off) +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamptz ORDER BY i; + QUERY PLAN +------------------------------------------------------------------------------------------ + Sort + Sort Key: i + -> Bitmap Heap Scan on test_date + Recheck Cond: (i < 'Tue Oct 26 00:00:00 2004 PDT'::timestamp with time zone) + -> Bitmap Index Scan on idx_date + Index Cond: (i < 'Tue Oct 26 00:00:00 2004 PDT'::timestamp with time zone) +(6 rows) + +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamptz ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 +(3 rows) + +SELECT * FROM test_date WHERE i<='2004-10-26'::timestamptz ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 +(4 rows) + +SELECT * FROM test_date WHERE i='2004-10-26'::timestamptz ORDER BY i; + i +------------ 
+ 10-26-2004 +(1 row) + +SELECT * FROM test_date WHERE i>='2004-10-26'::timestamptz ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 +(3 rows) + +SELECT * FROM test_date WHERE i>'2004-10-26'::timestamptz ORDER BY i; + i +------------ + 10-27-2004 + 10-28-2004 +(2 rows) + +-- Check endpoint and out-of-range cases +INSERT INTO test_date VALUES ('-infinity'), ('infinity'); +SELECT gin_clean_pending_list('idx_date'); + gin_clean_pending_list +------------------------ + 1 +(1 row) + +SELECT * FROM test_date WHERE i<'-infinity'::timestamp ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i<='-infinity'::timestamp ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_date WHERE i='-infinity'::timestamp ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_date WHERE i>='-infinity'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(8 rows) + +SELECT * FROM test_date WHERE i>'-infinity'::timestamp ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(7 rows) + +SELECT * FROM test_date WHERE i<'infinity'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 +(7 rows) + +SELECT * FROM test_date WHERE i<='infinity'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(8 rows) + +SELECT * FROM test_date WHERE i='infinity'::timestamp ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_date WHERE i>='infinity'::timestamp ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_date WHERE i>'infinity'::timestamp ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i<'-infinity'::timestamptz ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i<='-infinity'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_date WHERE i='-infinity'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_date WHERE i>='-infinity'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(8 rows) + +SELECT * FROM test_date WHERE i>'-infinity'::timestamptz ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(7 rows) + +SELECT * FROM test_date WHERE i<'infinity'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 +(7 rows) + +SELECT * FROM test_date WHERE i<='infinity'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(8 rows) + +SELECT * FROM test_date WHERE i='infinity'::timestamptz ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_date WHERE i>='infinity'::timestamptz ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_date WHERE i>'infinity'::timestamptz ORDER BY i; + i +--- +(0 rows) + +-- Check rounding cases +-- '2004-10-25 00:00:01' rounds to '2004-10-25' for date. 
+-- '2004-10-25 23:59:59' also rounds to '2004-10-25', +-- so it's the same case as '2004-10-25 00:00:01' +SELECT * FROM test_date WHERE i < '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 +(4 rows) + +SELECT * FROM test_date WHERE i <= '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 +(4 rows) + +SELECT * FROM test_date WHERE i = '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i > '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(4 rows) + +SELECT * FROM test_date WHERE i >= '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(4 rows) + +SELECT * FROM test_date WHERE i < '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 +(4 rows) + +SELECT * FROM test_date WHERE i <= '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 +(4 rows) + +SELECT * FROM test_date WHERE i = '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i > '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(4 rows) + +SELECT * FROM test_date WHERE i >= '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(4 rows) + diff --git a/contrib/btree_gin/expected/float4.out b/contrib/btree_gin/expected/float4.out index 7b9134fcd4bdc..c8bb04e59be9b 100644 --- a/contrib/btree_gin/expected/float4.out +++ b/contrib/btree_gin/expected/float4.out @@ -42,3 +42,324 @@ SELECT * FROM test_float4 WHERE i>1::float4 ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_float4 WHERE i<1::float8 ORDER BY i; + QUERY PLAN +------------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_float4 + Recheck Cond: (i < '1'::double precision) + -> Bitmap Index Scan on idx_float4 + Index Cond: (i < '1'::double precision) +(6 rows) + +SELECT * FROM test_float4 WHERE i<1::float8 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_float4 WHERE i<=1::float8 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_float4 WHERE i=1::float8 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_float4 WHERE i>=1::float8 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_float4 WHERE i>1::float8 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +-- Check endpoint and out-of-range cases +INSERT INTO test_float4 VALUES ('NaN'), ('Inf'), ('-Inf'); +SELECT gin_clean_pending_list('idx_float4'); + gin_clean_pending_list +------------------------ + 1 +(1 row) + +SELECT * FROM test_float4 WHERE i<'-Inf'::float8 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_float4 WHERE i<='-Inf'::float8 ORDER BY i; + i +----------- + -Infinity +(1 row) + +SELECT * FROM test_float4 WHERE i='-Inf'::float8 ORDER BY i; + i +----------- + -Infinity +(1 row) + +SELECT * FROM test_float4 WHERE i>='-Inf'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 + Infinity + NaN +(9 rows) + +SELECT * FROM test_float4 WHERE i>'-Inf'::float8 ORDER BY i; + i +---------- + -2 + -1 + 0 + 1 + 2 + 3 + Infinity + NaN +(8 rows) + +SELECT * FROM test_float4 WHERE i<'Inf'::float8 ORDER BY i; + i 
+----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 +(7 rows) + +SELECT * FROM test_float4 WHERE i<='Inf'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 + Infinity +(8 rows) + +SELECT * FROM test_float4 WHERE i='Inf'::float8 ORDER BY i; + i +---------- + Infinity +(1 row) + +SELECT * FROM test_float4 WHERE i>='Inf'::float8 ORDER BY i; + i +---------- + Infinity + NaN +(2 rows) + +SELECT * FROM test_float4 WHERE i>'Inf'::float8 ORDER BY i; + i +----- + NaN +(1 row) + +SELECT * FROM test_float4 WHERE i<'1e300'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 +(7 rows) + +SELECT * FROM test_float4 WHERE i<='1e300'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 +(7 rows) + +SELECT * FROM test_float4 WHERE i='1e300'::float8 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_float4 WHERE i>='1e300'::float8 ORDER BY i; + i +---------- + Infinity + NaN +(2 rows) + +SELECT * FROM test_float4 WHERE i>'1e300'::float8 ORDER BY i; + i +---------- + Infinity + NaN +(2 rows) + +SELECT * FROM test_float4 WHERE i<'NaN'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 + Infinity +(8 rows) + +SELECT * FROM test_float4 WHERE i<='NaN'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 + Infinity + NaN +(9 rows) + +SELECT * FROM test_float4 WHERE i='NaN'::float8 ORDER BY i; + i +----- + NaN +(1 row) + +SELECT * FROM test_float4 WHERE i>='NaN'::float8 ORDER BY i; + i +----- + NaN +(1 row) + +SELECT * FROM test_float4 WHERE i>'NaN'::float8 ORDER BY i; + i +--- +(0 rows) + +-- Check rounding cases +-- 1e-300 rounds to 0 for float4 but not for float8 +SELECT * FROM test_float4 WHERE i < -1e-300::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 +(3 rows) + +SELECT * FROM test_float4 WHERE i <= -1e-300::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 +(3 rows) + +SELECT * FROM test_float4 WHERE i = -1e-300::float8 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_float4 WHERE i > -1e-300::float8 ORDER BY i; + i +---------- + 0 + 1 + 2 + 3 + Infinity + NaN +(6 rows) + +SELECT * FROM test_float4 WHERE i >= -1e-300::float8 ORDER BY i; + i +---------- + 0 + 1 + 2 + 3 + Infinity + NaN +(6 rows) + +SELECT * FROM test_float4 WHERE i < 1e-300::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 +(4 rows) + +SELECT * FROM test_float4 WHERE i <= 1e-300::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 +(4 rows) + +SELECT * FROM test_float4 WHERE i = 1e-300::float8 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_float4 WHERE i > 1e-300::float8 ORDER BY i; + i +---------- + 1 + 2 + 3 + Infinity + NaN +(5 rows) + +SELECT * FROM test_float4 WHERE i >= 1e-300::float8 ORDER BY i; + i +---------- + 1 + 2 + 3 + Infinity + NaN +(5 rows) + diff --git a/contrib/btree_gin/expected/float8.out b/contrib/btree_gin/expected/float8.out index a41d4f9f6bb05..b2877dfa3c1c2 100644 --- a/contrib/btree_gin/expected/float8.out +++ b/contrib/btree_gin/expected/float8.out @@ -42,3 +42,53 @@ SELECT * FROM test_float8 WHERE i>1::float8 ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_float8 WHERE i<1::float4 ORDER BY i; + QUERY PLAN +--------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_float8 + Recheck Cond: (i < '1'::real) + -> Bitmap Index Scan on idx_float8 + Index Cond: (i < '1'::real) +(6 rows) + +SELECT * FROM test_float8 WHERE i<1::float4 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM 
test_float8 WHERE i<=1::float4 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_float8 WHERE i=1::float4 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_float8 WHERE i>=1::float4 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_float8 WHERE i>1::float4 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + diff --git a/contrib/btree_gin/expected/int2.out b/contrib/btree_gin/expected/int2.out index 20d66a1b05545..bcfa68f671a25 100644 --- a/contrib/btree_gin/expected/int2.out +++ b/contrib/btree_gin/expected/int2.out @@ -42,3 +42,193 @@ SELECT * FROM test_int2 WHERE i>1::int2 ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_int2 WHERE i<1::int4 ORDER BY i; + QUERY PLAN +------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int2 + Recheck Cond: (i < 1) + -> Bitmap Index Scan on idx_int2 + Index Cond: (i < 1) +(6 rows) + +SELECT * FROM test_int2 WHERE i<1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int2 WHERE i<=1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int2 WHERE i=1::int4 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int2 WHERE i>=1::int4 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int2 WHERE i>1::int4 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +explain (costs off) +SELECT * FROM test_int2 WHERE i<1::int8 ORDER BY i; + QUERY PLAN +--------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int2 + Recheck Cond: (i < '1'::bigint) + -> Bitmap Index Scan on idx_int2 + Index Cond: (i < '1'::bigint) +(6 rows) + +SELECT * FROM test_int2 WHERE i<1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int2 WHERE i<=1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int2 WHERE i=1::int8 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int2 WHERE i>=1::int8 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int2 WHERE i>1::int8 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +-- Check endpoint and out-of-range cases +INSERT INTO test_int2 VALUES ((-32768)::int2),(32767); +SELECT gin_clean_pending_list('idx_int2'); + gin_clean_pending_list +------------------------ + 1 +(1 row) + +SELECT * FROM test_int2 WHERE i<(-32769)::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i<=(-32769)::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i=(-32769)::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i>=(-32769)::int4 ORDER BY i; + i +-------- + -32768 + -2 + -1 + 0 + 1 + 2 + 3 + 32767 +(8 rows) + +SELECT * FROM test_int2 WHERE i>(-32769)::int4 ORDER BY i; + i +-------- + -32768 + -2 + -1 + 0 + 1 + 2 + 3 + 32767 +(8 rows) + +SELECT * FROM test_int2 WHERE i<32768::int4 ORDER BY i; + i +-------- + -32768 + -2 + -1 + 0 + 1 + 2 + 3 + 32767 +(8 rows) + +SELECT * FROM test_int2 WHERE i<=32768::int4 ORDER BY i; + i +-------- + -32768 + -2 + -1 + 0 + 1 + 2 + 3 + 32767 +(8 rows) + +SELECT * FROM test_int2 WHERE i=32768::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i>=32768::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i>32768::int4 ORDER BY i; + i +--- +(0 rows) + diff --git a/contrib/btree_gin/expected/int4.out b/contrib/btree_gin/expected/int4.out index 0f0122c6f5e03..e62791e18bdc2 100644 --- a/contrib/btree_gin/expected/int4.out +++ b/contrib/btree_gin/expected/int4.out @@ -42,3 +42,103 @@ SELECT * FROM test_int4 WHERE i>1::int4 
ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_int4 WHERE i<1::int2 ORDER BY i; + QUERY PLAN +----------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int4 + Recheck Cond: (i < '1'::smallint) + -> Bitmap Index Scan on idx_int4 + Index Cond: (i < '1'::smallint) +(6 rows) + +SELECT * FROM test_int4 WHERE i<1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int4 WHERE i<=1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int4 WHERE i=1::int2 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int4 WHERE i>=1::int2 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int4 WHERE i>1::int2 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +explain (costs off) +SELECT * FROM test_int4 WHERE i<1::int8 ORDER BY i; + QUERY PLAN +--------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int4 + Recheck Cond: (i < '1'::bigint) + -> Bitmap Index Scan on idx_int4 + Index Cond: (i < '1'::bigint) +(6 rows) + +SELECT * FROM test_int4 WHERE i<1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int4 WHERE i<=1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int4 WHERE i=1::int8 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int4 WHERE i>=1::int8 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int4 WHERE i>1::int8 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + diff --git a/contrib/btree_gin/expected/int8.out b/contrib/btree_gin/expected/int8.out index 307e19e7a056d..c9aceb9d357c6 100644 --- a/contrib/btree_gin/expected/int8.out +++ b/contrib/btree_gin/expected/int8.out @@ -42,3 +42,103 @@ SELECT * FROM test_int8 WHERE i>1::int8 ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_int8 WHERE i<1::int2 ORDER BY i; + QUERY PLAN +----------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int8 + Recheck Cond: (i < '1'::smallint) + -> Bitmap Index Scan on idx_int8 + Index Cond: (i < '1'::smallint) +(6 rows) + +SELECT * FROM test_int8 WHERE i<1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int8 WHERE i<=1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int8 WHERE i=1::int2 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int8 WHERE i>=1::int2 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int8 WHERE i>1::int2 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +explain (costs off) +SELECT * FROM test_int8 WHERE i<1::int4 ORDER BY i; + QUERY PLAN +------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int8 + Recheck Cond: (i < 1) + -> Bitmap Index Scan on idx_int8 + Index Cond: (i < 1) +(6 rows) + +SELECT * FROM test_int8 WHERE i<1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int8 WHERE i<=1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int8 WHERE i=1::int4 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int8 WHERE i>=1::int4 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int8 WHERE i>1::int4 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + diff --git a/contrib/btree_gin/expected/name.out b/contrib/btree_gin/expected/name.out index 174de6576f0f0..3a30f62519c67 100644 --- a/contrib/btree_gin/expected/name.out +++ b/contrib/btree_gin/expected/name.out @@ -95,3 +95,62 @@ EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i>'abc' ORDER BY i; Index Cond: (i 
> 'abc'::name) (6 rows) +explain (costs off) +SELECT * FROM test_name WHERE i<'abc'::text ORDER BY i; + QUERY PLAN +--------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_name + Recheck Cond: (i < 'abc'::text) + -> Bitmap Index Scan on idx_name + Index Cond: (i < 'abc'::text) +(6 rows) + +SELECT * FROM test_name WHERE i<'abc'::text ORDER BY i; + i +----- + a + ab + abb +(3 rows) + +SELECT * FROM test_name WHERE i<='abc'::text ORDER BY i; + i +----- + a + ab + abb + abc +(4 rows) + +SELECT * FROM test_name WHERE i='abc'::text ORDER BY i; + i +----- + abc +(1 row) + +SELECT * FROM test_name WHERE i>='abc'::text ORDER BY i; + i +----- + abc + axy + xyz +(3 rows) + +SELECT * FROM test_name WHERE i>'abc'::text ORDER BY i; + i +----- + axy + xyz +(2 rows) + +SELECT * FROM test_name WHERE i<=repeat('abc', 100) ORDER BY i; + i +----- + a + ab + abb + abc +(4 rows) + diff --git a/contrib/btree_gin/expected/text.out b/contrib/btree_gin/expected/text.out index 3e31ad744d6aa..7f52f3db7b38e 100644 --- a/contrib/btree_gin/expected/text.out +++ b/contrib/btree_gin/expected/text.out @@ -42,3 +42,53 @@ SELECT * FROM test_text WHERE i>'abc' ORDER BY i; xyz (2 rows) +explain (costs off) +SELECT * FROM test_text WHERE i<'abc'::name COLLATE "default" ORDER BY i; + QUERY PLAN +--------------------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_text + Recheck Cond: (i < 'abc'::name COLLATE "default") + -> Bitmap Index Scan on idx_text + Index Cond: (i < 'abc'::name COLLATE "default") +(6 rows) + +SELECT * FROM test_text WHERE i<'abc'::name COLLATE "default" ORDER BY i; + i +----- + a + ab + abb +(3 rows) + +SELECT * FROM test_text WHERE i<='abc'::name COLLATE "default" ORDER BY i; + i +----- + a + ab + abb + abc +(4 rows) + +SELECT * FROM test_text WHERE i='abc'::name COLLATE "default" ORDER BY i; + i +----- + abc +(1 row) + +SELECT * FROM test_text WHERE i>='abc'::name COLLATE "default" ORDER BY i; + i +----- + abc + axy + xyz +(3 rows) + +SELECT * FROM test_text WHERE i>'abc'::name COLLATE "default" ORDER BY i; + i +----- + axy + xyz +(2 rows) + diff --git a/contrib/btree_gin/expected/timestamp.out b/contrib/btree_gin/expected/timestamp.out index a236cdc94a9d2..b7565285e68ba 100644 --- a/contrib/btree_gin/expected/timestamp.out +++ b/contrib/btree_gin/expected/timestamp.out @@ -7,8 +7,8 @@ INSERT INTO test_timestamp VALUES ( '2004-10-26 04:55:08' ), ( '2004-10-26 05:55:08' ), ( '2004-10-26 08:55:08' ), - ( '2004-10-26 09:55:08' ), - ( '2004-10-26 10:55:08' ) + ( '2004-10-27 09:55:08' ), + ( '2004-10-27 10:55:08' ) ; CREATE INDEX idx_timestamp ON test_timestamp USING gin (i); SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; @@ -38,14 +38,308 @@ SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i -------------------------- Tue Oct 26 08:55:08 2004 - Tue Oct 26 09:55:08 2004 - Tue Oct 26 10:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 (3 rows) SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; i -------------------------- - Tue Oct 26 09:55:08 2004 - Tue Oct 26 10:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 (2 rows) +explain (costs off) +SELECT * FROM test_timestamp WHERE i<'2004-10-27'::date ORDER BY i; + QUERY PLAN +---------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_timestamp + Recheck Cond: (i < '10-27-2004'::date) + -> Bitmap Index Scan 
on idx_timestamp + Index Cond: (i < '10-27-2004'::date) +(6 rows) + +SELECT * FROM test_timestamp WHERE i<'2004-10-27'::date ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 +(4 rows) + +SELECT * FROM test_timestamp WHERE i<='2004-10-27'::date ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 +(4 rows) + +SELECT * FROM test_timestamp WHERE i='2004-10-27'::date ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamp WHERE i>='2004-10-27'::date ORDER BY i; + i +-------------------------- + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(2 rows) + +SELECT * FROM test_timestamp WHERE i>'2004-10-27'::date ORDER BY i; + i +-------------------------- + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(2 rows) + +explain (costs off) +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; + QUERY PLAN +------------------------------------------------------------------------------------------ + Sort + Sort Key: i + -> Bitmap Heap Scan on test_timestamp + Recheck Cond: (i < 'Tue Oct 26 08:55:08 2004 PDT'::timestamp with time zone) + -> Bitmap Index Scan on idx_timestamp + Index Cond: (i < 'Tue Oct 26 08:55:08 2004 PDT'::timestamp with time zone) +(6 rows) + +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 +(3 rows) + +SELECT * FROM test_timestamp WHERE i<='2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 +(4 rows) + +SELECT * FROM test_timestamp WHERE i='2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 08:55:08 2004 +(1 row) + +SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(3 rows) + +SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(2 rows) + +-- Check endpoint and out-of-range cases +INSERT INTO test_timestamp VALUES ('-infinity'), ('infinity'); +SELECT gin_clean_pending_list('idx_timestamp'); + gin_clean_pending_list +------------------------ + 1 +(1 row) + +SELECT * FROM test_timestamp WHERE i<'-infinity'::date ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamp WHERE i<='-infinity'::date ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i='-infinity'::date ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>='-infinity'::date ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(8 rows) + +SELECT * FROM test_timestamp WHERE i>'-infinity'::date ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(7 rows) + +SELECT * FROM test_timestamp 
WHERE i<'infinity'::date ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(7 rows) + +SELECT * FROM test_timestamp WHERE i<='infinity'::date ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(8 rows) + +SELECT * FROM test_timestamp WHERE i='infinity'::date ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>='infinity'::date ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>'infinity'::date ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamp WHERE i<'-infinity'::timestamptz ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamp WHERE i<='-infinity'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i='-infinity'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>='-infinity'::timestamptz ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(8 rows) + +SELECT * FROM test_timestamp WHERE i>'-infinity'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(7 rows) + +SELECT * FROM test_timestamp WHERE i<'infinity'::timestamptz ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(7 rows) + +SELECT * FROM test_timestamp WHERE i<='infinity'::timestamptz ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(8 rows) + +SELECT * FROM test_timestamp WHERE i='infinity'::timestamptz ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>='infinity'::timestamptz ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>'infinity'::timestamptz ORDER BY i; + i +--- +(0 rows) + +-- This PST timestamptz will underflow if converted to timestamp +SELECT * FROM test_timestamp WHERE i<='4714-11-23 17:00 BC'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>'4714-11-23 17:00 BC'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(7 rows) + diff --git a/contrib/btree_gin/expected/timestamptz.out b/contrib/btree_gin/expected/timestamptz.out index d53963d2a04b8..0dada0b662cbb 100644 --- a/contrib/btree_gin/expected/timestamptz.out +++ b/contrib/btree_gin/expected/timestamptz.out @@ -7,8 +7,8 @@ INSERT INTO test_timestamptz VALUES ( '2004-10-26 04:55:08' ), ( '2004-10-26 05:55:08' ), ( '2004-10-26 
08:55:08' ), - ( '2004-10-26 09:55:08' ), - ( '2004-10-26 10:55:08' ) + ( '2004-10-27 09:55:08' ), + ( '2004-10-27 10:55:08' ) ; CREATE INDEX idx_timestamptz ON test_timestamptz USING gin (i); SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; @@ -38,14 +38,113 @@ SELECT * FROM test_timestamptz WHERE i>='2004-10-26 08:55:08'::timestamptz ORDER i ------------------------------ Tue Oct 26 08:55:08 2004 PDT - Tue Oct 26 09:55:08 2004 PDT - Tue Oct 26 10:55:08 2004 PDT + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT (3 rows) SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; i ------------------------------ - Tue Oct 26 09:55:08 2004 PDT - Tue Oct 26 10:55:08 2004 PDT + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT +(2 rows) + +explain (costs off) +SELECT * FROM test_timestamptz WHERE i<'2004-10-27'::date ORDER BY i; + QUERY PLAN +---------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_timestamptz + Recheck Cond: (i < '10-27-2004'::date) + -> Bitmap Index Scan on idx_timestamptz + Index Cond: (i < '10-27-2004'::date) +(6 rows) + +SELECT * FROM test_timestamptz WHERE i<'2004-10-27'::date ORDER BY i; + i +------------------------------ + Tue Oct 26 03:55:08 2004 PDT + Tue Oct 26 04:55:08 2004 PDT + Tue Oct 26 05:55:08 2004 PDT + Tue Oct 26 08:55:08 2004 PDT +(4 rows) + +SELECT * FROM test_timestamptz WHERE i<='2004-10-27'::date ORDER BY i; + i +------------------------------ + Tue Oct 26 03:55:08 2004 PDT + Tue Oct 26 04:55:08 2004 PDT + Tue Oct 26 05:55:08 2004 PDT + Tue Oct 26 08:55:08 2004 PDT +(4 rows) + +SELECT * FROM test_timestamptz WHERE i='2004-10-27'::date ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamptz WHERE i>='2004-10-27'::date ORDER BY i; + i +------------------------------ + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT +(2 rows) + +SELECT * FROM test_timestamptz WHERE i>'2004-10-27'::date ORDER BY i; + i +------------------------------ + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT +(2 rows) + +explain (costs off) +SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_timestamptz + Recheck Cond: (i < 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) + -> Bitmap Index Scan on idx_timestamptz + Index Cond: (i < 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) +(6 rows) + +SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Tue Oct 26 03:55:08 2004 PDT + Tue Oct 26 04:55:08 2004 PDT + Tue Oct 26 05:55:08 2004 PDT +(3 rows) + +SELECT * FROM test_timestamptz WHERE i<='2004-10-26 08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Tue Oct 26 03:55:08 2004 PDT + Tue Oct 26 04:55:08 2004 PDT + Tue Oct 26 05:55:08 2004 PDT + Tue Oct 26 08:55:08 2004 PDT +(4 rows) + +SELECT * FROM test_timestamptz WHERE i='2004-10-26 08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Tue Oct 26 08:55:08 2004 PDT +(1 row) + +SELECT * FROM test_timestamptz WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Tue Oct 26 08:55:08 2004 PDT + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT +(3 rows) + +SELECT * FROM test_timestamptz WHERE i>'2004-10-26 
08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT (2 rows) diff --git a/contrib/btree_gin/meson.build b/contrib/btree_gin/meson.build index b2749f6e66951..ece0a716973ce 100644 --- a/contrib/btree_gin/meson.build +++ b/contrib/btree_gin/meson.build @@ -22,6 +22,7 @@ install_data( 'btree_gin--1.0--1.1.sql', 'btree_gin--1.1--1.2.sql', 'btree_gin--1.2--1.3.sql', + 'btree_gin--1.3--1.4.sql', kwargs: contrib_data_args, ) diff --git a/contrib/btree_gin/sql/date.sql b/contrib/btree_gin/sql/date.sql index 35086f6b81b9b..006f6f528b835 100644 --- a/contrib/btree_gin/sql/date.sql +++ b/contrib/btree_gin/sql/date.sql @@ -20,3 +20,67 @@ SELECT * FROM test_date WHERE i<='2004-10-26'::date ORDER BY i; SELECT * FROM test_date WHERE i='2004-10-26'::date ORDER BY i; SELECT * FROM test_date WHERE i>='2004-10-26'::date ORDER BY i; SELECT * FROM test_date WHERE i>'2004-10-26'::date ORDER BY i; + +explain (costs off) +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamp ORDER BY i; + +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i<='2004-10-26'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i='2004-10-26'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>='2004-10-26'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>'2004-10-26'::timestamp ORDER BY i; + +explain (costs off) +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamptz ORDER BY i; + +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i<='2004-10-26'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i='2004-10-26'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>='2004-10-26'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>'2004-10-26'::timestamptz ORDER BY i; + +-- Check endpoint and out-of-range cases + +INSERT INTO test_date VALUES ('-infinity'), ('infinity'); +SELECT gin_clean_pending_list('idx_date'); + +SELECT * FROM test_date WHERE i<'-infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i<='-infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i='-infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>='-infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>'-infinity'::timestamp ORDER BY i; + +SELECT * FROM test_date WHERE i<'infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i<='infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i='infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>='infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>'infinity'::timestamp ORDER BY i; + +SELECT * FROM test_date WHERE i<'-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i<='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>'-infinity'::timestamptz ORDER BY i; + +SELECT * FROM test_date WHERE i<'infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i<='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>'infinity'::timestamptz ORDER BY i; + +-- Check rounding cases +-- '2004-10-25 00:00:01' rounds to '2004-10-25' for date. 
+-- '2004-10-25 23:59:59' also rounds to '2004-10-25', +-- so it's the same case as '2004-10-25 00:00:01' + +SELECT * FROM test_date WHERE i < '2004-10-25 00:00:01'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i <= '2004-10-25 00:00:01'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i = '2004-10-25 00:00:01'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i > '2004-10-25 00:00:01'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i >= '2004-10-25 00:00:01'::timestamp ORDER BY i; + +SELECT * FROM test_date WHERE i < '2004-10-25 00:00:01'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i <= '2004-10-25 00:00:01'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i = '2004-10-25 00:00:01'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i > '2004-10-25 00:00:01'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i >= '2004-10-25 00:00:01'::timestamptz ORDER BY i; diff --git a/contrib/btree_gin/sql/float4.sql b/contrib/btree_gin/sql/float4.sql index 759778ad3c3b4..0707ed6518fa2 100644 --- a/contrib/btree_gin/sql/float4.sql +++ b/contrib/btree_gin/sql/float4.sql @@ -13,3 +13,56 @@ SELECT * FROM test_float4 WHERE i<=1::float4 ORDER BY i; SELECT * FROM test_float4 WHERE i=1::float4 ORDER BY i; SELECT * FROM test_float4 WHERE i>=1::float4 ORDER BY i; SELECT * FROM test_float4 WHERE i>1::float4 ORDER BY i; + +explain (costs off) +SELECT * FROM test_float4 WHERE i<1::float8 ORDER BY i; + +SELECT * FROM test_float4 WHERE i<1::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<=1::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i=1::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>=1::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>1::float8 ORDER BY i; + +-- Check endpoint and out-of-range cases + +INSERT INTO test_float4 VALUES ('NaN'), ('Inf'), ('-Inf'); +SELECT gin_clean_pending_list('idx_float4'); + +SELECT * FROM test_float4 WHERE i<'-Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<='-Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i='-Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>='-Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>'-Inf'::float8 ORDER BY i; + +SELECT * FROM test_float4 WHERE i<'Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<='Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i='Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>='Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>'Inf'::float8 ORDER BY i; + +SELECT * FROM test_float4 WHERE i<'1e300'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<='1e300'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i='1e300'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>='1e300'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>'1e300'::float8 ORDER BY i; + +SELECT * FROM test_float4 WHERE i<'NaN'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<='NaN'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i='NaN'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>='NaN'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>'NaN'::float8 ORDER BY i; + +-- Check rounding cases +-- 1e-300 rounds to 0 for float4 but not for float8 + +SELECT * FROM test_float4 WHERE i < -1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i <= -1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i = -1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i > -1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i >= -1e-300::float8 ORDER BY i; + 
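[Editor's aside, not part of the patch: the rounding cases above and their positive counterparts just below rely on float4-to-float8 widening being lossless, so the cross-type comparison can be carried out entirely in float8. 1e-300 is far below float4's smallest subnormal (roughly 1.4e-45), so for a float4 column the predicate i > 1e-300 effectively behaves like i > 0 with 0 itself excluded. A minimal sketch of the underlying comparison, assuming a stock PostgreSQL session:

    -- float4 -> float8 widening is exact, so the comparison runs in float8:
    SELECT 0::float4::float8 > 1e-300::float8 AS zero_gt;  -- false: 0 sorts below 1e-300
    SELECT 1::float4::float8 > 1e-300::float8 AS one_gt;   -- true: 1 sorts above it
    -- matching the expected output: i > 1e-300 drops 0 but keeps 1, 2, 3.
]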
+SELECT * FROM test_float4 WHERE i < 1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i <= 1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i = 1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i > 1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i >= 1e-300::float8 ORDER BY i; diff --git a/contrib/btree_gin/sql/float8.sql b/contrib/btree_gin/sql/float8.sql index b046ac4e6c4bb..5f393147082b1 100644 --- a/contrib/btree_gin/sql/float8.sql +++ b/contrib/btree_gin/sql/float8.sql @@ -13,3 +13,12 @@ SELECT * FROM test_float8 WHERE i<=1::float8 ORDER BY i; SELECT * FROM test_float8 WHERE i=1::float8 ORDER BY i; SELECT * FROM test_float8 WHERE i>=1::float8 ORDER BY i; SELECT * FROM test_float8 WHERE i>1::float8 ORDER BY i; + +explain (costs off) +SELECT * FROM test_float8 WHERE i<1::float4 ORDER BY i; + +SELECT * FROM test_float8 WHERE i<1::float4 ORDER BY i; +SELECT * FROM test_float8 WHERE i<=1::float4 ORDER BY i; +SELECT * FROM test_float8 WHERE i=1::float4 ORDER BY i; +SELECT * FROM test_float8 WHERE i>=1::float4 ORDER BY i; +SELECT * FROM test_float8 WHERE i>1::float4 ORDER BY i; diff --git a/contrib/btree_gin/sql/int2.sql b/contrib/btree_gin/sql/int2.sql index f06f11702f54e..959e0f6cfde01 100644 --- a/contrib/btree_gin/sql/int2.sql +++ b/contrib/btree_gin/sql/int2.sql @@ -13,3 +13,38 @@ SELECT * FROM test_int2 WHERE i<=1::int2 ORDER BY i; SELECT * FROM test_int2 WHERE i=1::int2 ORDER BY i; SELECT * FROM test_int2 WHERE i>=1::int2 ORDER BY i; SELECT * FROM test_int2 WHERE i>1::int2 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int2 WHERE i<1::int4 ORDER BY i; + +SELECT * FROM test_int2 WHERE i<1::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i<=1::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i=1::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>=1::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>1::int4 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int2 WHERE i<1::int8 ORDER BY i; + +SELECT * FROM test_int2 WHERE i<1::int8 ORDER BY i; +SELECT * FROM test_int2 WHERE i<=1::int8 ORDER BY i; +SELECT * FROM test_int2 WHERE i=1::int8 ORDER BY i; +SELECT * FROM test_int2 WHERE i>=1::int8 ORDER BY i; +SELECT * FROM test_int2 WHERE i>1::int8 ORDER BY i; + +-- Check endpoint and out-of-range cases + +INSERT INTO test_int2 VALUES ((-32768)::int2),(32767); +SELECT gin_clean_pending_list('idx_int2'); + +SELECT * FROM test_int2 WHERE i<(-32769)::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i<=(-32769)::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i=(-32769)::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>=(-32769)::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>(-32769)::int4 ORDER BY i; + +SELECT * FROM test_int2 WHERE i<32768::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i<=32768::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i=32768::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>=32768::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>32768::int4 ORDER BY i; diff --git a/contrib/btree_gin/sql/int4.sql b/contrib/btree_gin/sql/int4.sql index 6499c29630722..9a45530b63ad7 100644 --- a/contrib/btree_gin/sql/int4.sql +++ b/contrib/btree_gin/sql/int4.sql @@ -13,3 +13,21 @@ SELECT * FROM test_int4 WHERE i<=1::int4 ORDER BY i; SELECT * FROM test_int4 WHERE i=1::int4 ORDER BY i; SELECT * FROM test_int4 WHERE i>=1::int4 ORDER BY i; SELECT * FROM test_int4 WHERE i>1::int4 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int4 WHERE i<1::int2 ORDER BY i; + +SELECT * FROM test_int4 WHERE i<1::int2 ORDER BY i; +SELECT * 
FROM test_int4 WHERE i<=1::int2 ORDER BY i; +SELECT * FROM test_int4 WHERE i=1::int2 ORDER BY i; +SELECT * FROM test_int4 WHERE i>=1::int2 ORDER BY i; +SELECT * FROM test_int4 WHERE i>1::int2 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int4 WHERE i<1::int8 ORDER BY i; + +SELECT * FROM test_int4 WHERE i<1::int8 ORDER BY i; +SELECT * FROM test_int4 WHERE i<=1::int8 ORDER BY i; +SELECT * FROM test_int4 WHERE i=1::int8 ORDER BY i; +SELECT * FROM test_int4 WHERE i>=1::int8 ORDER BY i; +SELECT * FROM test_int4 WHERE i>1::int8 ORDER BY i; diff --git a/contrib/btree_gin/sql/int8.sql b/contrib/btree_gin/sql/int8.sql index 4d9c2871814c4..b31f27c69b90a 100644 --- a/contrib/btree_gin/sql/int8.sql +++ b/contrib/btree_gin/sql/int8.sql @@ -13,3 +13,21 @@ SELECT * FROM test_int8 WHERE i<=1::int8 ORDER BY i; SELECT * FROM test_int8 WHERE i=1::int8 ORDER BY i; SELECT * FROM test_int8 WHERE i>=1::int8 ORDER BY i; SELECT * FROM test_int8 WHERE i>1::int8 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int8 WHERE i<1::int2 ORDER BY i; + +SELECT * FROM test_int8 WHERE i<1::int2 ORDER BY i; +SELECT * FROM test_int8 WHERE i<=1::int2 ORDER BY i; +SELECT * FROM test_int8 WHERE i=1::int2 ORDER BY i; +SELECT * FROM test_int8 WHERE i>=1::int2 ORDER BY i; +SELECT * FROM test_int8 WHERE i>1::int2 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int8 WHERE i<1::int4 ORDER BY i; + +SELECT * FROM test_int8 WHERE i<1::int4 ORDER BY i; +SELECT * FROM test_int8 WHERE i<=1::int4 ORDER BY i; +SELECT * FROM test_int8 WHERE i=1::int4 ORDER BY i; +SELECT * FROM test_int8 WHERE i>=1::int4 ORDER BY i; +SELECT * FROM test_int8 WHERE i>1::int4 ORDER BY i; diff --git a/contrib/btree_gin/sql/name.sql b/contrib/btree_gin/sql/name.sql index c11580cdf9609..551d928940746 100644 --- a/contrib/btree_gin/sql/name.sql +++ b/contrib/btree_gin/sql/name.sql @@ -19,3 +19,14 @@ EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i<='abc' ORDER BY i; EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i='abc' ORDER BY i; EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i>='abc' ORDER BY i; EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i>'abc' ORDER BY i; + +explain (costs off) +SELECT * FROM test_name WHERE i<'abc'::text ORDER BY i; + +SELECT * FROM test_name WHERE i<'abc'::text ORDER BY i; +SELECT * FROM test_name WHERE i<='abc'::text ORDER BY i; +SELECT * FROM test_name WHERE i='abc'::text ORDER BY i; +SELECT * FROM test_name WHERE i>='abc'::text ORDER BY i; +SELECT * FROM test_name WHERE i>'abc'::text ORDER BY i; + +SELECT * FROM test_name WHERE i<=repeat('abc', 100) ORDER BY i; diff --git a/contrib/btree_gin/sql/text.sql b/contrib/btree_gin/sql/text.sql index d5b3b39898988..978b21376fd85 100644 --- a/contrib/btree_gin/sql/text.sql +++ b/contrib/btree_gin/sql/text.sql @@ -13,3 +13,12 @@ SELECT * FROM test_text WHERE i<='abc' ORDER BY i; SELECT * FROM test_text WHERE i='abc' ORDER BY i; SELECT * FROM test_text WHERE i>='abc' ORDER BY i; SELECT * FROM test_text WHERE i>'abc' ORDER BY i; + +explain (costs off) +SELECT * FROM test_text WHERE i<'abc'::name COLLATE "default" ORDER BY i; + +SELECT * FROM test_text WHERE i<'abc'::name COLLATE "default" ORDER BY i; +SELECT * FROM test_text WHERE i<='abc'::name COLLATE "default" ORDER BY i; +SELECT * FROM test_text WHERE i='abc'::name COLLATE "default" ORDER BY i; +SELECT * FROM test_text WHERE i>='abc'::name COLLATE "default" ORDER BY i; +SELECT * FROM test_text WHERE i>'abc'::name COLLATE "default" ORDER BY i; diff --git a/contrib/btree_gin/sql/timestamp.sql 
b/contrib/btree_gin/sql/timestamp.sql index 56727e81c4aff..1ee4edb5ea4d2 100644 --- a/contrib/btree_gin/sql/timestamp.sql +++ b/contrib/btree_gin/sql/timestamp.sql @@ -9,8 +9,8 @@ INSERT INTO test_timestamp VALUES ( '2004-10-26 04:55:08' ), ( '2004-10-26 05:55:08' ), ( '2004-10-26 08:55:08' ), - ( '2004-10-26 09:55:08' ), - ( '2004-10-26 10:55:08' ) + ( '2004-10-27 09:55:08' ), + ( '2004-10-27 10:55:08' ) ; CREATE INDEX idx_timestamp ON test_timestamp USING gin (i); @@ -20,3 +20,54 @@ SELECT * FROM test_timestamp WHERE i<='2004-10-26 08:55:08'::timestamp ORDER BY SELECT * FROM test_timestamp WHERE i='2004-10-26 08:55:08'::timestamp ORDER BY i; SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i; SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; + +explain (costs off) +SELECT * FROM test_timestamp WHERE i<'2004-10-27'::date ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'2004-10-27'::date ORDER BY i; + +explain (costs off) +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='2004-10-26 08:55:08'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i='2004-10-26 08:55:08'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; + +-- Check endpoint and out-of-range cases + +INSERT INTO test_timestamp VALUES ('-infinity'), ('infinity'); +SELECT gin_clean_pending_list('idx_timestamp'); + +SELECT * FROM test_timestamp WHERE i<'-infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='-infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i='-infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='-infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'-infinity'::date ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i='infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'infinity'::date ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'-infinity'::timestamptz ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'infinity'::timestamptz ORDER BY i; + +-- This PST timestamptz will underflow if converted to timestamp +SELECT * FROM test_timestamp WHERE i<='4714-11-23 17:00 BC'::timestamptz ORDER BY i; +SELECT * 
FROM test_timestamp WHERE i>'4714-11-23 17:00 BC'::timestamptz ORDER BY i; diff --git a/contrib/btree_gin/sql/timestamptz.sql b/contrib/btree_gin/sql/timestamptz.sql index e6cfdb1b07447..40d2d7ed329d2 100644 --- a/contrib/btree_gin/sql/timestamptz.sql +++ b/contrib/btree_gin/sql/timestamptz.sql @@ -9,8 +9,8 @@ INSERT INTO test_timestamptz VALUES ( '2004-10-26 04:55:08' ), ( '2004-10-26 05:55:08' ), ( '2004-10-26 08:55:08' ), - ( '2004-10-26 09:55:08' ), - ( '2004-10-26 10:55:08' ) + ( '2004-10-27 09:55:08' ), + ( '2004-10-27 10:55:08' ) ; CREATE INDEX idx_timestamptz ON test_timestamptz USING gin (i); @@ -20,3 +20,21 @@ SELECT * FROM test_timestamptz WHERE i<='2004-10-26 08:55:08'::timestamptz ORDER SELECT * FROM test_timestamptz WHERE i='2004-10-26 08:55:08'::timestamptz ORDER BY i; SELECT * FROM test_timestamptz WHERE i>='2004-10-26 08:55:08'::timestamptz ORDER BY i; SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; + +explain (costs off) +SELECT * FROM test_timestamptz WHERE i<'2004-10-27'::date ORDER BY i; + +SELECT * FROM test_timestamptz WHERE i<'2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamptz WHERE i<='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamptz WHERE i='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamptz WHERE i>='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamptz WHERE i>'2004-10-27'::date ORDER BY i; + +explain (costs off) +SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; + +SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; +SELECT * FROM test_timestamptz WHERE i<='2004-10-26 08:55:08'::timestamp ORDER BY i; +SELECT * FROM test_timestamptz WHERE i='2004-10-26 08:55:08'::timestamp ORDER BY i; +SELECT * FROM test_timestamptz WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i; +SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; diff --git a/contrib/btree_gist/Makefile b/contrib/btree_gist/Makefile index 68190ac5e4687..7ac2df26c1044 100644 --- a/contrib/btree_gist/Makefile +++ b/contrib/btree_gist/Makefile @@ -34,7 +34,7 @@ DATA = btree_gist--1.0--1.1.sql \ btree_gist--1.1--1.2.sql btree_gist--1.2.sql btree_gist--1.2--1.3.sql \ btree_gist--1.3--1.4.sql btree_gist--1.4--1.5.sql \ btree_gist--1.5--1.6.sql btree_gist--1.6--1.7.sql \ - btree_gist--1.7--1.8.sql btree_gist--1.8--1.9.sql + btree_gist--1.7--1.8.sql PGFILEDESC = "btree_gist - B-tree equivalent GiST operator classes" REGRESS = init int2 int4 int8 float4 float8 cash oid timestamp timestamptz \ diff --git a/contrib/btree_gist/btree_bit.c b/contrib/btree_gist/btree_bit.c index 0df2ae20d8b21..9199f8860975d 100644 --- a/contrib/btree_gist/btree_bit.c +++ b/contrib/btree_gist/btree_bit.c @@ -8,6 +8,7 @@ #include "utils/fmgrprotos.h" #include "utils/sortsupport.h" #include "utils/varbit.h" +#include "varatt.h" /* GiST support functions */ PG_FUNCTION_INFO_V1(gbt_bit_compress); diff --git a/contrib/btree_gist/btree_bool.c b/contrib/btree_gist/btree_bool.c index 1127597bb6017..344f059c78fde 100644 --- a/contrib/btree_gist/btree_bool.c +++ b/contrib/btree_gist/btree_bool.c @@ -5,6 +5,7 @@ #include "btree_gist.h" #include "btree_utils_num.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct boolkey diff --git a/contrib/btree_gist/btree_cash.c b/contrib/btree_gist/btree_cash.c index 01c8d5a5f4074..282d5c5731fef 100644 --- a/contrib/btree_gist/btree_cash.c +++ b/contrib/btree_gist/btree_cash.c @@ -7,6 +7,7 @@ #include 
"btree_utils_num.h" #include "common/int.h" #include "utils/cash.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct diff --git a/contrib/btree_gist/btree_date.c b/contrib/btree_gist/btree_date.c index c008dc61ba5f5..1f1a3f32b56a9 100644 --- a/contrib/btree_gist/btree_date.c +++ b/contrib/btree_gist/btree_date.c @@ -7,6 +7,7 @@ #include "btree_utils_num.h" #include "utils/fmgrprotos.h" #include "utils/date.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct diff --git a/contrib/btree_gist/btree_enum.c b/contrib/btree_gist/btree_enum.c index 83c95c7bb0401..8f1ffff46965f 100644 --- a/contrib/btree_gist/btree_enum.c +++ b/contrib/btree_gist/btree_enum.c @@ -8,6 +8,7 @@ #include "fmgr.h" #include "utils/fmgrprotos.h" #include "utils/fmgroids.h" +#include "utils/rel.h" #include "utils/sortsupport.h" /* enums are really Oids, so we just use the same structure */ @@ -193,8 +194,8 @@ gbt_enum_ssup_cmp(Datum x, Datum y, SortSupport ssup) return DatumGetInt32(CallerFInfoFunctionCall2(enum_cmp, ssup->ssup_extra, InvalidOid, - arg1->lower, - arg2->lower)); + ObjectIdGetDatum(arg1->lower), + ObjectIdGetDatum(arg2->lower))); } Datum diff --git a/contrib/btree_gist/btree_float4.c b/contrib/btree_gist/btree_float4.c index bec026a923a18..d9c859835dacc 100644 --- a/contrib/btree_gist/btree_float4.c +++ b/contrib/btree_gist/btree_float4.c @@ -6,6 +6,7 @@ #include "btree_gist.h" #include "btree_utils_num.h" #include "utils/float.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct float4key diff --git a/contrib/btree_gist/btree_float8.c b/contrib/btree_gist/btree_float8.c index 43e7cde2b6958..567beede178ad 100644 --- a/contrib/btree_gist/btree_float8.c +++ b/contrib/btree_gist/btree_float8.c @@ -6,6 +6,7 @@ #include "btree_gist.h" #include "btree_utils_num.h" #include "utils/float.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct float8key diff --git a/contrib/btree_gist/btree_gist--1.7--1.8.sql b/contrib/btree_gist/btree_gist--1.7--1.8.sql index 4ff9c43a8ebe5..22316dc3f566c 100644 --- a/contrib/btree_gist/btree_gist--1.7--1.8.sql +++ b/contrib/btree_gist/btree_gist--1.7--1.8.sql @@ -3,85 +3,282 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION btree_gist UPDATE TO '1.8'" to load this file. 
\quit -CREATE FUNCTION gist_stratnum_btree(int) +-- Add sortsupport functions + +CREATE FUNCTION gbt_bit_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_varbit_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_bool_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_bytea_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_cash_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_date_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_enum_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_float4_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_float8_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_inet_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_int2_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_int4_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_int8_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_intv_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_macaddr_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_macad8_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_numeric_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_oid_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_text_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_bpchar_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_time_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_ts_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_uuid_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +ALTER OPERATOR FAMILY gist_bit_ops USING gist ADD + FUNCTION 11 (bit, bit) gbt_bit_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_vbit_ops USING gist ADD + FUNCTION 11 (varbit, varbit) gbt_varbit_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_bool_ops USING gist ADD + FUNCTION 11 (bool, bool) gbt_bool_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_bytea_ops USING gist ADD + FUNCTION 11 (bytea, bytea) gbt_bytea_sortsupport (internal) ; + 
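[Editor's aside, not part of the patch: support function 11 is the GiST sortsupport slot, so each ALTER OPERATOR FAMILY in this run wires a sortsupport routine into one operator family, enabling sorted GiST index builds for that type; further down, the same upgrade script repoints support function 12 at the renamed gist_translate_cmptype_btree. A hypothetical catalog spot-check for one family after running ALTER EXTENSION btree_gist UPDATE:

    SELECT p.amprocnum, p.amproc::regproc
    FROM pg_amproc p
    JOIN pg_opfamily f ON f.oid = p.amprocfamily
    WHERE f.opfname = 'gist_int4_ops'
    ORDER BY p.amprocnum;
    -- amprocnum 11 should list gbt_int4_sortsupport and
    -- amprocnum 12 gist_translate_cmptype_btree once the update has run.
]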
+ALTER OPERATOR FAMILY gist_cash_ops USING gist ADD + FUNCTION 11 (money, money) gbt_cash_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_date_ops USING gist ADD + FUNCTION 11 (date, date) gbt_date_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_enum_ops USING gist ADD + FUNCTION 11 (anyenum, anyenum) gbt_enum_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_float4_ops USING gist ADD + FUNCTION 11 (float4, float4) gbt_float4_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_float8_ops USING gist ADD + FUNCTION 11 (float8, float8) gbt_float8_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_inet_ops USING gist ADD + FUNCTION 11 (inet, inet) gbt_inet_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_cidr_ops USING gist ADD + FUNCTION 11 (cidr, cidr) gbt_inet_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_int2_ops USING gist ADD + FUNCTION 11 (int2, int2) gbt_int2_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_int4_ops USING gist ADD + FUNCTION 11 (int4, int4) gbt_int4_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_int8_ops USING gist ADD + FUNCTION 11 (int8, int8) gbt_int8_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_interval_ops USING gist ADD + FUNCTION 11 (interval, interval) gbt_intv_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_macaddr_ops USING gist ADD + FUNCTION 11 (macaddr, macaddr) gbt_macaddr_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_macaddr8_ops USING gist ADD + FUNCTION 11 (macaddr8, macaddr8) gbt_macad8_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_numeric_ops USING gist ADD + FUNCTION 11 (numeric, numeric) gbt_numeric_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_oid_ops USING gist ADD + FUNCTION 11 (oid, oid) gbt_oid_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_text_ops USING gist ADD + FUNCTION 11 (text, text) gbt_text_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_bpchar_ops USING gist ADD + FUNCTION 11 (bpchar, bpchar) gbt_bpchar_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_time_ops USING gist ADD + FUNCTION 11 (time, time) gbt_time_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_timetz_ops USING gist ADD + FUNCTION 11 (timetz, timetz) gbt_time_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_timestamp_ops USING gist ADD + FUNCTION 11 (timestamp, timestamp) gbt_ts_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_timestamptz_ops USING gist ADD + FUNCTION 11 (timestamptz, timestamptz) gbt_ts_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_uuid_ops USING gist ADD + FUNCTION 11 (uuid, uuid) gbt_uuid_sortsupport (internal) ; + +-- Add translate_cmptype functions + +CREATE FUNCTION gist_translate_cmptype_btree(int) RETURNS smallint AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; ALTER OPERATOR FAMILY gist_oid_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_int2_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_int4_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_int8_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_float4_ops USING 
gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_float8_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_timestamp_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_timestamptz_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_time_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_date_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_interval_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_cash_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_macaddr_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_text_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_bpchar_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_bytea_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_numeric_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_bit_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_vbit_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_inet_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_cidr_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_timetz_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_uuid_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_macaddr8_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_enum_ops USING gist ADD - FUNCTION 12 ("any", "any") 
gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; ALTER OPERATOR FAMILY gist_bool_ops USING gist ADD - FUNCTION 12 ("any", "any") gist_stratnum_btree (int) ; + FUNCTION 12 ("any", "any") gist_translate_cmptype_btree (int) ; diff --git a/contrib/btree_gist/btree_gist--1.8--1.9.sql b/contrib/btree_gist/btree_gist--1.8--1.9.sql deleted file mode 100644 index 4b38749bf5f34..0000000000000 --- a/contrib/btree_gist/btree_gist--1.8--1.9.sql +++ /dev/null @@ -1,197 +0,0 @@ -/* contrib/btree_gist/btree_gist--1.7--1.8.sql */ - --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "ALTER EXTENSION btree_gist UPDATE TO '1.9'" to load this file. \quit - -CREATE FUNCTION gbt_bit_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_varbit_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_bool_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_bytea_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_cash_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_date_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_enum_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_float4_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_float8_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_inet_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_int2_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_int4_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_int8_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_intv_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_macaddr_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_macad8_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_numeric_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_oid_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_text_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_bpchar_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_time_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_ts_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' 
-LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_uuid_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -ALTER OPERATOR FAMILY gist_bit_ops USING gist ADD - FUNCTION 11 (bit, bit) gbt_bit_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_vbit_ops USING gist ADD - FUNCTION 11 (varbit, varbit) gbt_varbit_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_bool_ops USING gist ADD - FUNCTION 11 (bool, bool) gbt_bool_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_bytea_ops USING gist ADD - FUNCTION 11 (bytea, bytea) gbt_bytea_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_cash_ops USING gist ADD - FUNCTION 11 (money, money) gbt_cash_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_date_ops USING gist ADD - FUNCTION 11 (date, date) gbt_date_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_enum_ops USING gist ADD - FUNCTION 11 (anyenum, anyenum) gbt_enum_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_float4_ops USING gist ADD - FUNCTION 11 (float4, float4) gbt_float4_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_float8_ops USING gist ADD - FUNCTION 11 (float8, float8) gbt_float8_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_inet_ops USING gist ADD - FUNCTION 11 (inet, inet) gbt_inet_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_cidr_ops USING gist ADD - FUNCTION 11 (cidr, cidr) gbt_inet_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_int2_ops USING gist ADD - FUNCTION 11 (int2, int2) gbt_int2_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_int4_ops USING gist ADD - FUNCTION 11 (int4, int4) gbt_int4_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_int8_ops USING gist ADD - FUNCTION 11 (int8, int8) gbt_int8_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_interval_ops USING gist ADD - FUNCTION 11 (interval, interval) gbt_intv_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_macaddr_ops USING gist ADD - FUNCTION 11 (macaddr, macaddr) gbt_macaddr_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_macaddr8_ops USING gist ADD - FUNCTION 11 (macaddr8, macaddr8) gbt_macad8_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_numeric_ops USING gist ADD - FUNCTION 11 (numeric, numeric) gbt_numeric_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_oid_ops USING gist ADD - FUNCTION 11 (oid, oid) gbt_oid_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_text_ops USING gist ADD - FUNCTION 11 (text, text) gbt_text_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_bpchar_ops USING gist ADD - FUNCTION 11 (bpchar, bpchar) gbt_bpchar_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_time_ops USING gist ADD - FUNCTION 11 (time, time) gbt_time_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_timetz_ops USING gist ADD - FUNCTION 11 (timetz, timetz) gbt_time_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_timestamp_ops USING gist ADD - FUNCTION 11 (timestamp, timestamp) gbt_ts_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_timestamptz_ops USING gist ADD - FUNCTION 11 (timestamptz, timestamptz) gbt_ts_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_uuid_ops USING gist ADD - FUNCTION 11 (uuid, uuid) gbt_uuid_sortsupport (internal) ; diff --git a/contrib/btree_gist/btree_gist.c b/contrib/btree_gist/btree_gist.c index 280ce808456b9..39fcbdad334f0 100644 --- a/contrib/btree_gist/btree_gist.c +++ b/contrib/btree_gist/btree_gist.c @@ -15,7 +15,7 @@ PG_MODULE_MAGIC_EXT( 
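/*
 * Editor's note (not part of the patch): the btree_gist.c hunks below
 * rename gist_stratnum_btree() to gist_translate_cmptype_btree(), matching
 * the SQL-level support-function-12 changes above.  The body is untouched
 * by the rename; as a hedged sketch -- reconstructed from the regression
 * output in expected/stratnum.out further down (input 3 -> 3, input 7 -> 0),
 * not copied from upstream -- the function maps a CompareType onto the
 * corresponding btree strategy number, or InvalidStrategy when none exists:
 */
#include "postgres.h"
#include "access/cmptype.h"
#include "access/stratnum.h"
#include "fmgr.h"

Datum
gist_translate_cmptype_btree_sketch(PG_FUNCTION_ARGS)
{
	CompareType cmptype = PG_GETARG_INT32(0);
	StrategyNumber ret = InvalidStrategy;

	switch (cmptype)
	{
		case COMPARE_LT:
			ret = BTLessStrategyNumber;
			break;
		case COMPARE_LE:
			ret = BTLessEqualStrategyNumber;
			break;
		case COMPARE_EQ:
			ret = BTEqualStrategyNumber;
			break;
		case COMPARE_GE:
			ret = BTGreaterEqualStrategyNumber;
			break;
		case COMPARE_GT:
			ret = BTGreaterStrategyNumber;
			break;
		default:
			break;				/* no btree equivalent, e.g. COMPARE_NE */
	}
	PG_RETURN_UINT16(ret);		/* 0 == InvalidStrategy for unsupported input */
}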
PG_FUNCTION_INFO_V1(gbt_decompress); PG_FUNCTION_INFO_V1(gbtreekey_in); PG_FUNCTION_INFO_V1(gbtreekey_out); -PG_FUNCTION_INFO_V1(gist_stratnum_btree); +PG_FUNCTION_INFO_V1(gist_translate_cmptype_btree); /************************************************** * In/Out for keys @@ -62,7 +62,7 @@ gbt_decompress(PG_FUNCTION_ARGS) * Returns the btree number for supported operators, otherwise invalid. */ Datum -gist_stratnum_btree(PG_FUNCTION_ARGS) +gist_translate_cmptype_btree(PG_FUNCTION_ARGS) { CompareType cmptype = PG_GETARG_INT32(0); diff --git a/contrib/btree_gist/btree_gist.control b/contrib/btree_gist/btree_gist.control index 69d9341a0adea..abf66538f3244 100644 --- a/contrib/btree_gist/btree_gist.control +++ b/contrib/btree_gist/btree_gist.control @@ -1,6 +1,6 @@ # btree_gist extension comment = 'support for indexing common datatypes in GiST' -default_version = '1.9' +default_version = '1.8' module_pathname = '$libdir/btree_gist' relocatable = true trusted = true diff --git a/contrib/btree_gist/btree_inet.c b/contrib/btree_gist/btree_inet.c index 8b23853bafbb7..e726375f61d06 100644 --- a/contrib/btree_gist/btree_inet.c +++ b/contrib/btree_gist/btree_inet.c @@ -7,6 +7,7 @@ #include "btree_utils_num.h" #include "catalog/pg_type.h" #include "utils/builtins.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct inetkey @@ -96,10 +97,10 @@ gbt_inet_compress(PG_FUNCTION_ARGS) if (entry->leafkey) { - inetKEY *r = (inetKEY *) palloc(sizeof(inetKEY)); + inetKEY *r = palloc_object(inetKEY); bool failure = false; - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); r->lower = convert_network_to_scalar(entry->key, INETOID, &failure); Assert(!failure); r->upper = r->lower; diff --git a/contrib/btree_gist/btree_int2.c b/contrib/btree_gist/btree_int2.c index 33eccdedd7049..faf456997bbf1 100644 --- a/contrib/btree_gist/btree_int2.c +++ b/contrib/btree_gist/btree_int2.c @@ -6,6 +6,7 @@ #include "btree_gist.h" #include "btree_utils_num.h" #include "common/int.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct int16key diff --git a/contrib/btree_gist/btree_int4.c b/contrib/btree_gist/btree_int4.c index a82cee9a58a8c..0bdb9e58c5601 100644 --- a/contrib/btree_gist/btree_int4.c +++ b/contrib/btree_gist/btree_int4.c @@ -5,6 +5,7 @@ #include "btree_gist.h" #include "btree_utils_num.h" #include "common/int.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct int32key diff --git a/contrib/btree_gist/btree_int8.c b/contrib/btree_gist/btree_int8.c index f0c56e017269a..a9a7b56927847 100644 --- a/contrib/btree_gist/btree_int8.c +++ b/contrib/btree_gist/btree_int8.c @@ -6,6 +6,7 @@ #include "btree_gist.h" #include "btree_utils_num.h" #include "common/int.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct int64key diff --git a/contrib/btree_gist/btree_interval.c b/contrib/btree_gist/btree_interval.c index b5e365c6e09b4..1fc27f60384ea 100644 --- a/contrib/btree_gist/btree_interval.c +++ b/contrib/btree_gist/btree_interval.c @@ -6,6 +6,7 @@ #include "btree_gist.h" #include "btree_utils_num.h" #include "utils/fmgrprotos.h" +#include "utils/rel.h" #include "utils/sortsupport.h" #include "utils/timestamp.h" @@ -149,7 +150,7 @@ gbt_intv_compress(PG_FUNCTION_ARGS) { char *r = (char *) palloc(2 * INTERVALSIZE); - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); if (entry->leafkey) { @@ -189,10 +190,10 @@ gbt_intv_decompress(PG_FUNCTION_ARGS) if (INTERVALSIZE != sizeof(Interval)) { - intvKEY *r = 
palloc(sizeof(intvKEY)); + intvKEY *r = palloc_object(intvKEY); char *key = DatumGetPointer(entry->key); - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); memcpy(&r->lower, key, INTERVALSIZE); memcpy(&r->upper, key + INTERVALSIZE, INTERVALSIZE); diff --git a/contrib/btree_gist/btree_macaddr.c b/contrib/btree_gist/btree_macaddr.c index 3b2f26719d5dc..c444a709853a7 100644 --- a/contrib/btree_gist/btree_macaddr.c +++ b/contrib/btree_gist/btree_macaddr.c @@ -7,6 +7,7 @@ #include "btree_utils_num.h" #include "utils/fmgrprotos.h" #include "utils/inet.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct diff --git a/contrib/btree_gist/btree_macaddr8.c b/contrib/btree_gist/btree_macaddr8.c index f2b104617e680..6d9837d90a340 100644 --- a/contrib/btree_gist/btree_macaddr8.c +++ b/contrib/btree_gist/btree_macaddr8.c @@ -7,6 +7,7 @@ #include "btree_utils_num.h" #include "utils/fmgrprotos.h" #include "utils/inet.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct diff --git a/contrib/btree_gist/btree_numeric.c b/contrib/btree_gist/btree_numeric.c index a39c05d9da1cf..052f27b07949e 100644 --- a/contrib/btree_gist/btree_numeric.c +++ b/contrib/btree_gist/btree_numeric.c @@ -192,7 +192,7 @@ gbt_numeric_penalty(PG_FUNCTION_ARGS) *result = 0.0; - if (DirectFunctionCall2(numeric_gt, NumericGetDatum(ds), NumericGetDatum(nul))) + if (DatumGetBool(DirectFunctionCall2(numeric_gt, NumericGetDatum(ds), NumericGetDatum(nul)))) { *result += FLT_MIN; os = DatumGetNumeric(DirectFunctionCall2(numeric_div, diff --git a/contrib/btree_gist/btree_oid.c b/contrib/btree_gist/btree_oid.c index ffe0d7983e40f..b8f2f661076c6 100644 --- a/contrib/btree_gist/btree_oid.c +++ b/contrib/btree_gist/btree_oid.c @@ -5,6 +5,7 @@ #include "btree_gist.h" #include "btree_utils_num.h" +#include "utils/rel.h" #include "utils/sortsupport.h" typedef struct diff --git a/contrib/btree_gist/btree_time.c b/contrib/btree_gist/btree_time.c index 1dba95057ba9f..e744f1be017fa 100644 --- a/contrib/btree_gist/btree_time.c +++ b/contrib/btree_gist/btree_time.c @@ -7,6 +7,7 @@ #include "btree_utils_num.h" #include "utils/fmgrprotos.h" #include "utils/date.h" +#include "utils/rel.h" #include "utils/sortsupport.h" #include "utils/timestamp.h" @@ -31,13 +32,6 @@ PG_FUNCTION_INFO_V1(gbt_time_sortsupport); PG_FUNCTION_INFO_V1(gbt_timetz_sortsupport); -#ifdef USE_FLOAT8_BYVAL -#define TimeADTGetDatumFast(X) TimeADTGetDatum(X) -#else -#define TimeADTGetDatumFast(X) PointerGetDatum(&(X)) -#endif - - static bool gbt_timegt(const void *a, const void *b, FmgrInfo *flinfo) { @@ -45,8 +39,8 @@ gbt_timegt(const void *a, const void *b, FmgrInfo *flinfo) const TimeADT *bb = (const TimeADT *) b; return DatumGetBool(DirectFunctionCall2(time_gt, - TimeADTGetDatumFast(*aa), - TimeADTGetDatumFast(*bb))); + TimeADTGetDatum(*aa), + TimeADTGetDatum(*bb))); } static bool @@ -56,8 +50,8 @@ gbt_timege(const void *a, const void *b, FmgrInfo *flinfo) const TimeADT *bb = (const TimeADT *) b; return DatumGetBool(DirectFunctionCall2(time_ge, - TimeADTGetDatumFast(*aa), - TimeADTGetDatumFast(*bb))); + TimeADTGetDatum(*aa), + TimeADTGetDatum(*bb))); } static bool @@ -67,8 +61,8 @@ gbt_timeeq(const void *a, const void *b, FmgrInfo *flinfo) const TimeADT *bb = (const TimeADT *) b; return DatumGetBool(DirectFunctionCall2(time_eq, - TimeADTGetDatumFast(*aa), - TimeADTGetDatumFast(*bb))); + TimeADTGetDatum(*aa), + TimeADTGetDatum(*bb))); } static bool @@ -78,8 +72,8 @@ gbt_timele(const void *a, const void *b, FmgrInfo *flinfo) 
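/*
 * Editor's note (not part of the patch): context for the
 * TimeADTGetDatumFast -> TimeADTGetDatum substitutions in this file and
 * the matching TimestampGetDatumFast removals in btree_ts.c below.
 * TimeADT and Timestamp are int64 underneath; once float8/int64 Datums
 * are unconditionally pass-by-value, the "Fast" variants (which passed a
 * pointer on builds lacking USE_FLOAT8_BYVAL, as the deleted #ifdef above
 * shows) collapse into the plain conversion macros.  The assumption can
 * be spelled out as a compile-time check -- an illustrative assertion,
 * not code from the patch:
 */
#include "postgres.h"

StaticAssertDecl(sizeof(Datum) >= sizeof(int64),
				 "int64-based types such as TimeADT and Timestamp are "
				 "assumed to fit in a pass-by-value Datum");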
const TimeADT *bb = (const TimeADT *) b; return DatumGetBool(DirectFunctionCall2(time_le, - TimeADTGetDatumFast(*aa), - TimeADTGetDatumFast(*bb))); + TimeADTGetDatum(*aa), + TimeADTGetDatum(*bb))); } static bool @@ -89,8 +83,8 @@ gbt_timelt(const void *a, const void *b, FmgrInfo *flinfo) const TimeADT *bb = (const TimeADT *) b; return DatumGetBool(DirectFunctionCall2(time_lt, - TimeADTGetDatumFast(*aa), - TimeADTGetDatumFast(*bb))); + TimeADTGetDatum(*aa), + TimeADTGetDatum(*bb))); } static int @@ -100,9 +94,9 @@ gbt_timekey_cmp(const void *a, const void *b, FmgrInfo *flinfo) timeKEY *ib = (timeKEY *) (((const Nsrt *) b)->t); int res; - res = DatumGetInt32(DirectFunctionCall2(time_cmp, TimeADTGetDatumFast(ia->lower), TimeADTGetDatumFast(ib->lower))); + res = DatumGetInt32(DirectFunctionCall2(time_cmp, TimeADTGetDatum(ia->lower), TimeADTGetDatum(ib->lower))); if (res == 0) - return DatumGetInt32(DirectFunctionCall2(time_cmp, TimeADTGetDatumFast(ia->upper), TimeADTGetDatumFast(ib->upper))); + return DatumGetInt32(DirectFunctionCall2(time_cmp, TimeADTGetDatum(ia->upper), TimeADTGetDatum(ib->upper))); return res; } @@ -115,8 +109,8 @@ gbt_time_dist(const void *a, const void *b, FmgrInfo *flinfo) Interval *i; i = DatumGetIntervalP(DirectFunctionCall2(time_mi_time, - TimeADTGetDatumFast(*aa), - TimeADTGetDatumFast(*bb))); + TimeADTGetDatum(*aa), + TimeADTGetDatum(*bb))); return fabs(INTERVAL_TO_SEC(i)); } @@ -168,11 +162,11 @@ gbt_timetz_compress(PG_FUNCTION_ARGS) if (entry->leafkey) { - timeKEY *r = (timeKEY *) palloc(sizeof(timeKEY)); + timeKEY *r = palloc_object(timeKEY); TimeTzADT *tz = DatumGetTimeTzADTP(entry->key); TimeADT tmp; - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); /* We are using the time + zone only to compress */ tmp = tz->time + (tz->zone * INT64CONST(1000000)); @@ -279,14 +273,14 @@ gbt_time_penalty(PG_FUNCTION_ARGS) double res2; intr = DatumGetIntervalP(DirectFunctionCall2(time_mi_time, - TimeADTGetDatumFast(newentry->upper), - TimeADTGetDatumFast(origentry->upper))); + TimeADTGetDatum(newentry->upper), + TimeADTGetDatum(origentry->upper))); res = INTERVAL_TO_SEC(intr); res = Max(res, 0); intr = DatumGetIntervalP(DirectFunctionCall2(time_mi_time, - TimeADTGetDatumFast(origentry->lower), - TimeADTGetDatumFast(newentry->lower))); + TimeADTGetDatum(origentry->lower), + TimeADTGetDatum(newentry->lower))); res2 = INTERVAL_TO_SEC(intr); res2 = Max(res2, 0); @@ -297,8 +291,8 @@ gbt_time_penalty(PG_FUNCTION_ARGS) if (res > 0) { intr = DatumGetIntervalP(DirectFunctionCall2(time_mi_time, - TimeADTGetDatumFast(origentry->upper), - TimeADTGetDatumFast(origentry->lower))); + TimeADTGetDatum(origentry->upper), + TimeADTGetDatum(origentry->lower))); *result += FLT_MIN; *result += (float) (res / (res + INTERVAL_TO_SEC(intr))); *result *= (FLT_MAX / (((GISTENTRY *) PG_GETARG_POINTER(0))->rel->rd_att->natts + 1)); @@ -334,8 +328,8 @@ gbt_timekey_ssup_cmp(Datum x, Datum y, SortSupport ssup) /* for leaf items we expect lower == upper, so only compare lower */ return DatumGetInt32(DirectFunctionCall2(time_cmp, - TimeADTGetDatumFast(arg1->lower), - TimeADTGetDatumFast(arg2->lower))); + TimeADTGetDatum(arg1->lower), + TimeADTGetDatum(arg2->lower))); } Datum diff --git a/contrib/btree_gist/btree_ts.c b/contrib/btree_gist/btree_ts.c index eb899c4d21363..3b163a729cbf4 100644 --- a/contrib/btree_gist/btree_ts.c +++ b/contrib/btree_gist/btree_ts.c @@ -10,6 +10,7 @@ #include "utils/fmgrprotos.h" #include "utils/timestamp.h" #include "utils/float.h" +#include "utils/rel.h" 
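/*
 * Editor's note (not part of the patch): the palloc(sizeof(...)) ->
 * palloc_object()/palloc_array() conversions seen throughout this series
 * use the type-safe allocation macros from utils/palloc.h, which are
 * essentially the following (quoted from memory -- consult the header for
 * the authoritative definitions):
 */
#define palloc_object(type)        ((type *) palloc(sizeof(type)))
#define palloc0_object(type)       ((type *) palloc0(sizeof(type)))
#define palloc_array(type, count)  ((type *) palloc(sizeof(type) * (count)))
#define palloc0_array(type, count) ((type *) palloc0(sizeof(type) * (count)))
#define repalloc_array(pointer, type, count) \
	((type *) repalloc(pointer, sizeof(type) * (count)))
/*
 * Unlike a bare palloc() call, the macro result already has the right
 * pointer type, so the compiler complains if the declared type and the
 * allocated type ever drift apart.
 */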
#include "utils/sortsupport.h" typedef struct @@ -33,13 +34,6 @@ PG_FUNCTION_INFO_V1(gbt_ts_same); PG_FUNCTION_INFO_V1(gbt_ts_sortsupport); -#ifdef USE_FLOAT8_BYVAL -#define TimestampGetDatumFast(X) TimestampGetDatum(X) -#else -#define TimestampGetDatumFast(X) PointerGetDatum(&(X)) -#endif - - /* define for comparison */ static bool @@ -49,8 +43,8 @@ gbt_tsgt(const void *a, const void *b, FmgrInfo *flinfo) const Timestamp *bb = (const Timestamp *) b; return DatumGetBool(DirectFunctionCall2(timestamp_gt, - TimestampGetDatumFast(*aa), - TimestampGetDatumFast(*bb))); + TimestampGetDatum(*aa), + TimestampGetDatum(*bb))); } static bool @@ -60,8 +54,8 @@ gbt_tsge(const void *a, const void *b, FmgrInfo *flinfo) const Timestamp *bb = (const Timestamp *) b; return DatumGetBool(DirectFunctionCall2(timestamp_ge, - TimestampGetDatumFast(*aa), - TimestampGetDatumFast(*bb))); + TimestampGetDatum(*aa), + TimestampGetDatum(*bb))); } static bool @@ -71,8 +65,8 @@ gbt_tseq(const void *a, const void *b, FmgrInfo *flinfo) const Timestamp *bb = (const Timestamp *) b; return DatumGetBool(DirectFunctionCall2(timestamp_eq, - TimestampGetDatumFast(*aa), - TimestampGetDatumFast(*bb))); + TimestampGetDatum(*aa), + TimestampGetDatum(*bb))); } static bool @@ -82,8 +76,8 @@ gbt_tsle(const void *a, const void *b, FmgrInfo *flinfo) const Timestamp *bb = (const Timestamp *) b; return DatumGetBool(DirectFunctionCall2(timestamp_le, - TimestampGetDatumFast(*aa), - TimestampGetDatumFast(*bb))); + TimestampGetDatum(*aa), + TimestampGetDatum(*bb))); } static bool @@ -93,8 +87,8 @@ gbt_tslt(const void *a, const void *b, FmgrInfo *flinfo) const Timestamp *bb = (const Timestamp *) b; return DatumGetBool(DirectFunctionCall2(timestamp_lt, - TimestampGetDatumFast(*aa), - TimestampGetDatumFast(*bb))); + TimestampGetDatum(*aa), + TimestampGetDatum(*bb))); } static int @@ -104,9 +98,9 @@ gbt_tskey_cmp(const void *a, const void *b, FmgrInfo *flinfo) tsKEY *ib = (tsKEY *) (((const Nsrt *) b)->t); int res; - res = DatumGetInt32(DirectFunctionCall2(timestamp_cmp, TimestampGetDatumFast(ia->lower), TimestampGetDatumFast(ib->lower))); + res = DatumGetInt32(DirectFunctionCall2(timestamp_cmp, TimestampGetDatum(ia->lower), TimestampGetDatum(ib->lower))); if (res == 0) - return DatumGetInt32(DirectFunctionCall2(timestamp_cmp, TimestampGetDatumFast(ia->upper), TimestampGetDatumFast(ib->upper))); + return DatumGetInt32(DirectFunctionCall2(timestamp_cmp, TimestampGetDatum(ia->upper), TimestampGetDatum(ib->upper))); return res; } @@ -122,8 +116,8 @@ gbt_ts_dist(const void *a, const void *b, FmgrInfo *flinfo) return get_float8_infinity(); i = DatumGetIntervalP(DirectFunctionCall2(timestamp_mi, - TimestampGetDatumFast(*aa), - TimestampGetDatumFast(*bb))); + TimestampGetDatum(*aa), + TimestampGetDatum(*bb))); return fabs(INTERVAL_TO_SEC(i)); } @@ -152,7 +146,7 @@ ts_dist(PG_FUNCTION_ARGS) if (TIMESTAMP_NOT_FINITE(a) || TIMESTAMP_NOT_FINITE(b)) { - Interval *p = palloc(sizeof(Interval)); + Interval *p = palloc_object(Interval); p->day = INT_MAX; p->month = INT_MAX; @@ -176,7 +170,7 @@ tstz_dist(PG_FUNCTION_ARGS) if (TIMESTAMP_NOT_FINITE(a) || TIMESTAMP_NOT_FINITE(b)) { - Interval *p = palloc(sizeof(Interval)); + Interval *p = palloc_object(Interval); p->day = INT_MAX; p->month = INT_MAX; @@ -218,13 +212,13 @@ gbt_tstz_compress(PG_FUNCTION_ARGS) if (entry->leafkey) { - tsKEY *r = (tsKEY *) palloc(sizeof(tsKEY)); + tsKEY *r = palloc_object(tsKEY); TimestampTz ts = DatumGetTimestampTz(entry->key); Timestamp gmt; gmt = tstz_to_ts_gmt(ts); - retval = 
palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); r->lower = r->upper = gmt; gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, @@ -404,8 +398,8 @@ gbt_ts_ssup_cmp(Datum x, Datum y, SortSupport ssup) /* for leaf items we expect lower == upper, so only compare lower */ return DatumGetInt32(DirectFunctionCall2(timestamp_cmp, - TimestampGetDatumFast(arg1->lower), - TimestampGetDatumFast(arg2->lower))); + TimestampGetDatum(arg1->lower), + TimestampGetDatum(arg2->lower))); } Datum diff --git a/contrib/btree_gist/btree_utils_num.c b/contrib/btree_gist/btree_utils_num.c index 346ee837d75f4..51c8836f27a3d 100644 --- a/contrib/btree_gist/btree_utils_num.c +++ b/contrib/btree_gist/btree_utils_num.c @@ -89,7 +89,7 @@ gbt_num_compress(GISTENTRY *entry, const gbtree_ninfo *tinfo) memcpy(&r[0], leaf, tinfo->size); memcpy(&r[tinfo->size], leaf, tinfo->size); - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, false); } @@ -119,44 +119,44 @@ gbt_num_fetch(GISTENTRY *entry, const gbtree_ninfo *tinfo) switch (tinfo->t) { case gbt_t_bool: - datum = BoolGetDatum(*(bool *) entry->key); + datum = BoolGetDatum(*(bool *) DatumGetPointer(entry->key)); break; case gbt_t_int2: - datum = Int16GetDatum(*(int16 *) entry->key); + datum = Int16GetDatum(*(int16 *) DatumGetPointer(entry->key)); break; case gbt_t_int4: - datum = Int32GetDatum(*(int32 *) entry->key); + datum = Int32GetDatum(*(int32 *) DatumGetPointer(entry->key)); break; case gbt_t_int8: - datum = Int64GetDatum(*(int64 *) entry->key); + datum = Int64GetDatum(*(int64 *) DatumGetPointer(entry->key)); break; case gbt_t_oid: case gbt_t_enum: - datum = ObjectIdGetDatum(*(Oid *) entry->key); + datum = ObjectIdGetDatum(*(Oid *) DatumGetPointer(entry->key)); break; case gbt_t_float4: - datum = Float4GetDatum(*(float4 *) entry->key); + datum = Float4GetDatum(*(float4 *) DatumGetPointer(entry->key)); break; case gbt_t_float8: - datum = Float8GetDatum(*(float8 *) entry->key); + datum = Float8GetDatum(*(float8 *) DatumGetPointer(entry->key)); break; case gbt_t_date: - datum = DateADTGetDatum(*(DateADT *) entry->key); + datum = DateADTGetDatum(*(DateADT *) DatumGetPointer(entry->key)); break; case gbt_t_time: - datum = TimeADTGetDatum(*(TimeADT *) entry->key); + datum = TimeADTGetDatum(*(TimeADT *) DatumGetPointer(entry->key)); break; case gbt_t_ts: - datum = TimestampGetDatum(*(Timestamp *) entry->key); + datum = TimestampGetDatum(*(Timestamp *) DatumGetPointer(entry->key)); break; case gbt_t_cash: - datum = CashGetDatum(*(Cash *) entry->key); + datum = CashGetDatum(*(Cash *) DatumGetPointer(entry->key)); break; default: datum = entry->key; } - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, datum, entry->rel, entry->page, entry->offset, false); return retval; @@ -181,8 +181,8 @@ gbt_num_union(GBT_NUMKEY *out, const GistEntryVector *entryvec, const gbtree_nin cur = (GBT_NUMKEY *) DatumGetPointer((entryvec->vector[0].key)); - o.lower = &((GBT_NUMKEY *) out)[0]; - o.upper = &((GBT_NUMKEY *) out)[tinfo->size]; + o.lower = &out[0]; + o.upper = &out[tinfo->size]; memcpy(out, cur, 2 * tinfo->size); diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c index d9df2356cd1e4..40e06ae490828 100644 --- a/contrib/btree_gist/btree_utils_var.c +++ b/contrib/btree_gist/btree_utils_var.c @@ -11,6 +11,7 @@ #include "btree_utils_var.h" #include "mb/pg_wchar.h" #include 
"utils/rel.h" +#include "varatt.h" /* used for key sorting */ typedef struct @@ -39,7 +40,7 @@ gbt_var_decompress(PG_FUNCTION_ARGS) if (key != (GBT_VARKEY *) DatumGetPointer(entry->key)) { - GISTENTRY *retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + GISTENTRY *retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(key), entry->rel, entry->page, @@ -288,7 +289,7 @@ gbt_var_compress(GISTENTRY *entry, const gbtree_vinfo *tinfo) r = gbt_var_key_from_datum(leaf); - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, true); @@ -308,7 +309,7 @@ gbt_var_fetch(PG_FUNCTION_ARGS) GBT_VARKEY_R r = gbt_var_key_readable(key); GISTENTRY *retval; - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(r.lower), entry->rel, entry->page, entry->offset, true); @@ -466,7 +467,7 @@ gbt_var_picksplit(const GistEntryVector *entryvec, GIST_SPLITVEC *v, GBT_VARKEY **sv = NULL; gbt_vsrt_arg varg; - arr = (Vsrt *) palloc((maxoff + 1) * sizeof(Vsrt)); + arr = palloc_array(Vsrt, maxoff + 1); nbytes = (maxoff + 2) * sizeof(OffsetNumber); v->spl_left = (OffsetNumber *) palloc(nbytes); v->spl_right = (OffsetNumber *) palloc(nbytes); @@ -475,7 +476,7 @@ gbt_var_picksplit(const GistEntryVector *entryvec, GIST_SPLITVEC *v, v->spl_nleft = 0; v->spl_nright = 0; - sv = palloc(sizeof(bytea *) * (maxoff + 1)); + sv = palloc_array(GBT_VARKEY *, maxoff + 1); /* Sort entries */ diff --git a/contrib/btree_gist/btree_utils_var.h b/contrib/btree_gist/btree_utils_var.h index 75ad33d24fcd1..6cb3aadf3c33e 100644 --- a/contrib/btree_gist/btree_utils_var.h +++ b/contrib/btree_gist/btree_utils_var.h @@ -49,7 +49,7 @@ typedef struct */ #define GBT_FREE_IF_COPY(ptr1, ptr2) \ do { \ - if ((Pointer) (ptr1) != DatumGetPointer(ptr2)) \ + if ((ptr1) != DatumGetPointer(ptr2)) \ pfree(ptr1); \ } while (0) diff --git a/contrib/btree_gist/btree_uuid.c b/contrib/btree_gist/btree_uuid.c index 23a307a6a71d5..1091af222d1d1 100644 --- a/contrib/btree_gist/btree_uuid.c +++ b/contrib/btree_gist/btree_uuid.c @@ -6,6 +6,7 @@ #include "btree_gist.h" #include "btree_utils_num.h" #include "port/pg_bswap.h" +#include "utils/rel.h" #include "utils/sortsupport.h" #include "utils/uuid.h" @@ -107,7 +108,7 @@ gbt_uuid_compress(PG_FUNCTION_ARGS) char *r = (char *) palloc(2 * UUID_LEN); pg_uuid_t *key = DatumGetUUIDP(entry->key); - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); memcpy(r, key, UUID_LEN); memcpy(r + UUID_LEN, key, UUID_LEN); diff --git a/contrib/btree_gist/expected/stratnum.out b/contrib/btree_gist/expected/stratnum.out index dd0edaf4a2062..8222b66153833 100644 --- a/contrib/btree_gist/expected/stratnum.out +++ b/contrib/btree_gist/expected/stratnum.out @@ -1,13 +1,13 @@ --- test stratnum support func -SELECT gist_stratnum_btree(7); - gist_stratnum_btree ---------------------- - 0 +-- test stratnum translation support func +SELECT gist_translate_cmptype_btree(7); + gist_translate_cmptype_btree +------------------------------ + 0 (1 row) -SELECT gist_stratnum_btree(3); - gist_stratnum_btree ---------------------- - 3 +SELECT gist_translate_cmptype_btree(3); + gist_translate_cmptype_btree +------------------------------ + 3 (1 row) diff --git a/contrib/btree_gist/meson.build b/contrib/btree_gist/meson.build index 89932dd3844ee..f4fa9574f1fd7 100644 --- a/contrib/btree_gist/meson.build +++ b/contrib/btree_gist/meson.build @@ -51,7 +51,6 @@ 
install_data( 'btree_gist--1.5--1.6.sql', 'btree_gist--1.6--1.7.sql', 'btree_gist--1.7--1.8.sql', - 'btree_gist--1.8--1.9.sql', kwargs: contrib_data_args, ) diff --git a/contrib/btree_gist/sql/stratnum.sql b/contrib/btree_gist/sql/stratnum.sql index 75adddad84925..da8bbf883b0cc 100644 --- a/contrib/btree_gist/sql/stratnum.sql +++ b/contrib/btree_gist/sql/stratnum.sql @@ -1,3 +1,3 @@ --- test stratnum support func -SELECT gist_stratnum_btree(7); -SELECT gist_stratnum_btree(3); +-- test stratnum translation support func +SELECT gist_translate_cmptype_btree(7); +SELECT gist_translate_cmptype_btree(3); diff --git a/contrib/cube/cube.c b/contrib/cube/cube.c index 8d3654ab7aafe..3600457cbc098 100644 --- a/contrib/cube/cube.c +++ b/contrib/cube/cube.c @@ -471,7 +471,7 @@ g_cube_decompress(PG_FUNCTION_ARGS) if (key != DatumGetNDBOXP(entry->key)) { - GISTENTRY *retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + GISTENTRY *retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(key), entry->rel, entry->page, @@ -718,16 +718,16 @@ g_cube_internal_consistent(NDBOX *key, switch (strategy) { case RTOverlapStrategyNumber: - retval = (bool) cube_overlap_v0(key, query); + retval = cube_overlap_v0(key, query); break; case RTSameStrategyNumber: case RTContainsStrategyNumber: case RTOldContainsStrategyNumber: - retval = (bool) cube_contains_v0(key, query); + retval = cube_contains_v0(key, query); break; case RTContainedByStrategyNumber: case RTOldContainedByStrategyNumber: - retval = (bool) cube_overlap_v0(key, query); + retval = cube_overlap_v0(key, query); break; default: retval = false; diff --git a/contrib/cube/cubedata.h b/contrib/cube/cubedata.h index ad1e2bd699810..8bfcc6e99a27d 100644 --- a/contrib/cube/cubedata.h +++ b/contrib/cube/cubedata.h @@ -62,10 +62,7 @@ typedef struct NDBOX /* for cubescan.l and cubeparse.y */ /* All grammar constructs return strings */ #define YYSTYPE char * -#ifndef YY_TYPEDEF_YY_SCANNER_T -#define YY_TYPEDEF_YY_SCANNER_T typedef void *yyscan_t; -#endif /* in cubescan.l */ extern int cube_yylex(YYSTYPE *yylval_param, yyscan_t yyscanner); diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c index 98d4e3d7dac4c..8bf8fc8ea2f3b 100644 --- a/contrib/dblink/dblink.c +++ b/contrib/dblink/dblink.c @@ -101,11 +101,11 @@ static void materializeQueryResult(FunctionCallInfo fcinfo, const char *conname, const char *sql, bool fail); -static PGresult *storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql); -static void storeRow(volatile storeInfo *sinfo, PGresult *res, bool first); +static PGresult *storeQueryResult(storeInfo *sinfo, PGconn *conn, const char *sql); +static void storeRow(storeInfo *sinfo, PGresult *res, bool first); static remoteConn *getConnectionByName(const char *name); static HTAB *createConnHash(void); -static void createNewConnection(const char *name, remoteConn *rconn); +static remoteConn *createNewConnection(const char *name); static void deleteConnection(const char *name); static char **get_pkey_attnames(Relation rel, int16 *indnkeyatts); static char **get_text_array_contents(ArrayType *array, int *numitems); @@ -119,7 +119,8 @@ static Relation get_rel_from_relname(text *relname_text, LOCKMODE lockmode, AclM static char *generate_relation_name(Relation rel); static void dblink_connstr_check(const char *connstr); static bool dblink_connstr_has_pw(const char *connstr); -static void dblink_security_check(PGconn *conn, remoteConn *rconn, const char *connstr); +static void dblink_security_check(PGconn *conn, const char 
*connname, + const char *connstr); static void dblink_res_error(PGconn *conn, const char *conname, PGresult *res, bool fail, const char *fmt,...) pg_attribute_printf(5, 6); static char *get_connect_string(const char *servername); @@ -147,29 +148,27 @@ static uint32 dblink_we_get_conn = 0; static uint32 dblink_we_get_result = 0; /* - * Following is list that holds multiple remote connections. + * Following is hash that holds multiple remote connections. * Calling convention of each dblink function changes to accept - * connection name as the first parameter. The connection list is + * connection name as the first parameter. The connection hash is * much like ecpg e.g. a mapping between a name and a PGconn object. + * + * To avoid potentially leaking a PGconn object in case of out-of-memory + * errors, we first create the hash entry, then open the PGconn. + * Hence, a hash entry whose rconn.conn pointer is NULL must be + * understood as a leftover from a failed create; it should be ignored + * by lookup operations, and silently replaced by create operations. */ typedef struct remoteConnHashEnt { char name[NAMEDATALEN]; - remoteConn *rconn; + remoteConn rconn; } remoteConnHashEnt; /* initial number of connection hashes */ #define NUMCONN 16 -static char * -xpstrdup(const char *in) -{ - if (in == NULL) - return NULL; - return pstrdup(in); -} - pg_noreturn static void dblink_res_internalerror(PGconn *conn, PGresult *res, const char *p2) { @@ -233,7 +232,11 @@ dblink_get_conn(char *conname_or_str, errmsg("could not establish connection"), errdetail_internal("%s", msg))); } - dblink_security_check(conn, rconn, connstr); + + PQsetNoticeReceiver(conn, libpqsrv_notice_receiver, + "received message via remote connection"); + + dblink_security_check(conn, NULL, connstr); if (PQclientEncoding(conn) != GetDatabaseEncoding()) PQsetClientEncoding(conn, GetDatabaseEncodingName()); freeconn = true; @@ -296,15 +299,6 @@ dblink_connect(PG_FUNCTION_ARGS) else if (PG_NARGS() == 1) conname_or_str = text_to_cstring(PG_GETARG_TEXT_PP(0)); - if (connname) - { - rconn = (remoteConn *) MemoryContextAlloc(TopMemoryContext, - sizeof(remoteConn)); - rconn->conn = NULL; - rconn->openCursorCount = 0; - rconn->newXactForCursor = false; - } - /* first check for valid foreign data server */ connstr = get_connect_string(conname_or_str); if (connstr == NULL) @@ -317,6 +311,13 @@ dblink_connect(PG_FUNCTION_ARGS) if (dblink_we_connect == 0) dblink_we_connect = WaitEventExtensionNew("DblinkConnect"); + /* if we need a hashtable entry, make that first, since it might fail */ + if (connname) + { + rconn = createNewConnection(connname); + Assert(rconn->conn == NULL); + } + /* OK to make connection */ conn = libpqsrv_connect(connstr, dblink_we_connect); @@ -324,8 +325,8 @@ dblink_connect(PG_FUNCTION_ARGS) { msg = pchomp(PQerrorMessage(conn)); libpqsrv_disconnect(conn); - if (rconn) - pfree(rconn); + if (connname) + deleteConnection(connname); ereport(ERROR, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), @@ -333,17 +334,20 @@ dblink_connect(PG_FUNCTION_ARGS) errdetail_internal("%s", msg))); } + PQsetNoticeReceiver(conn, libpqsrv_notice_receiver, + "received message via remote connection"); + /* check password actually used if not superuser */ - dblink_security_check(conn, rconn, connstr); + dblink_security_check(conn, connname, connstr); /* attempt to set client encoding to match server encoding, if needed */ if (PQclientEncoding(conn) != GetDatabaseEncoding()) PQsetClientEncoding(conn, GetDatabaseEncodingName()); + /* 
all OK, save away the conn */ if (connname) { rconn->conn = conn; - createNewConnection(connname, rconn); } else { @@ -383,10 +387,7 @@ dblink_disconnect(PG_FUNCTION_ARGS) libpqsrv_disconnect(conn); if (rconn) - { deleteConnection(conname); - pfree(rconn); - } else pconn->conn = NULL; @@ -861,131 +862,123 @@ static void materializeResult(FunctionCallInfo fcinfo, PGconn *conn, PGresult *res) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + bool is_sql_cmd; + int ntuples; + int nfields; /* prepTuplestoreResult must have been called previously */ Assert(rsinfo->returnMode == SFRM_Materialize); - PG_TRY(); + if (PQresultStatus(res) == PGRES_COMMAND_OK) { - TupleDesc tupdesc; - bool is_sql_cmd; - int ntuples; - int nfields; + is_sql_cmd = true; - if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - is_sql_cmd = true; + /* + * need a tuple descriptor representing one TEXT column to return the + * command status string as our result tuple + */ + tupdesc = CreateTemplateTupleDesc(1); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", + TEXTOID, -1, 0); + ntuples = 1; + nfields = 1; + } + else + { + Assert(PQresultStatus(res) == PGRES_TUPLES_OK); - /* - * need a tuple descriptor representing one TEXT column to return - * the command status string as our result tuple - */ - tupdesc = CreateTemplateTupleDesc(1); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", - TEXTOID, -1, 0); - ntuples = 1; - nfields = 1; - } - else - { - Assert(PQresultStatus(res) == PGRES_TUPLES_OK); + is_sql_cmd = false; - is_sql_cmd = false; + /* get a tuple descriptor for our result type */ + switch (get_call_result_type(fcinfo, NULL, &tupdesc)) + { + case TYPEFUNC_COMPOSITE: + /* success */ + break; + case TYPEFUNC_RECORD: + /* failed to determine actual type of RECORD */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning record called in context " + "that cannot accept type record"))); + break; + default: + /* result type isn't composite */ + elog(ERROR, "return type must be a row type"); + break; + } - /* get a tuple descriptor for our result type */ - switch (get_call_result_type(fcinfo, NULL, &tupdesc)) - { - case TYPEFUNC_COMPOSITE: - /* success */ - break; - case TYPEFUNC_RECORD: - /* failed to determine actual type of RECORD */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("function returning record called in context " - "that cannot accept type record"))); - break; - default: - /* result type isn't composite */ - elog(ERROR, "return type must be a row type"); - break; - } + /* make sure we have a persistent copy of the tupdesc */ + tupdesc = CreateTupleDescCopy(tupdesc); + ntuples = PQntuples(res); + nfields = PQnfields(res); + } - /* make sure we have a persistent copy of the tupdesc */ - tupdesc = CreateTupleDescCopy(tupdesc); - ntuples = PQntuples(res); - nfields = PQnfields(res); - } + /* + * check result and tuple descriptor have the same number of columns + */ + if (nfields != tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("remote query result rowtype does not match " + "the specified FROM clause rowtype"))); - /* - * check result and tuple descriptor have the same number of columns - */ - if (nfields != tupdesc->natts) - ereport(ERROR, - (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("remote query result rowtype does not match " - "the specified FROM clause rowtype"))); + if (ntuples > 0) + { + AttInMetadata *attinmeta; + int nestlevel = -1; + Tuplestorestate *tupstore; 
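/*
 * Editor's note (not part of the patch): the PG_TRY/PG_FINALLY scaffolding
 * being deleted from materializeResult() existed only to guarantee
 * PQclear(res) on error exit; likewise, the storeInfo parameters lose
 * their "volatile" qualifiers below because they are no longer modified
 * inside a PG_TRY block whose longjmp could clobber register-cached
 * values.  Per the updated dblink_res_error() comment later in this
 * patch, an error now leaves the PGresult to memory context cleanup, so
 * a straight-line PQclear() suffices.  One way to tie a malloc'd PGresult
 * to a palloc context looks roughly like this (hypothetical helper, not
 * the actual libpq wrapper mechanism):
 */
#include "postgres.h"
#include <libpq-fe.h>

static void
pgresult_reset_cb(void *arg)
{
	PQclear((PGresult *) arg);	/* fires at context reset, including error abort */
}

static void
track_pgresult_in_context(PGresult *res, MemoryContext cxt)
{
	MemoryContextCallback *cb;

	/* callback struct must live in (and not outlive) the target context */
	cb = MemoryContextAllocZero(cxt, sizeof(MemoryContextCallback));
	cb->func = pgresult_reset_cb;
	cb->arg = res;
	MemoryContextRegisterResetCallback(cxt, cb);
	/* NB: real code must then avoid a second, manual PQclear() */
}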
+ MemoryContext oldcontext; + int row; + char **values; - if (ntuples > 0) - { - AttInMetadata *attinmeta; - int nestlevel = -1; - Tuplestorestate *tupstore; - MemoryContext oldcontext; - int row; - char **values; + attinmeta = TupleDescGetAttInMetadata(tupdesc); - attinmeta = TupleDescGetAttInMetadata(tupdesc); + /* Set GUCs to ensure we read GUC-sensitive data types correctly */ + if (!is_sql_cmd) + nestlevel = applyRemoteGucs(conn); - /* Set GUCs to ensure we read GUC-sensitive data types correctly */ - if (!is_sql_cmd) - nestlevel = applyRemoteGucs(conn); + oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); - oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); - tupstore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = tupstore; - rsinfo->setDesc = tupdesc; - MemoryContextSwitchTo(oldcontext); + values = palloc_array(char *, nfields); - values = palloc_array(char *, nfields); + /* put all tuples into the tuplestore */ + for (row = 0; row < ntuples; row++) + { + HeapTuple tuple; - /* put all tuples into the tuplestore */ - for (row = 0; row < ntuples; row++) + if (!is_sql_cmd) { - HeapTuple tuple; + int i; - if (!is_sql_cmd) + for (i = 0; i < nfields; i++) { - int i; - - for (i = 0; i < nfields; i++) - { - if (PQgetisnull(res, row, i)) - values[i] = NULL; - else - values[i] = PQgetvalue(res, row, i); - } + if (PQgetisnull(res, row, i)) + values[i] = NULL; + else + values[i] = PQgetvalue(res, row, i); } - else - { - values[0] = PQcmdStatus(res); - } - - /* build the tuple and put it into the tuplestore. */ - tuple = BuildTupleFromCStrings(attinmeta, values); - tuplestore_puttuple(tupstore, tuple); + } + else + { + values[0] = PQcmdStatus(res); } - /* clean up GUC settings, if we changed any */ - restoreLocalGucs(nestlevel); + /* build the tuple and put it into the tuplestore. 
*/ + tuple = BuildTupleFromCStrings(attinmeta, values); + tuplestore_puttuple(tupstore, tuple); } + + /* clean up GUC settings, if we changed any */ + restoreLocalGucs(nestlevel); } - PG_FINALLY(); - { - /* be sure to release the libpq result */ - PQclear(res); - } - PG_END_TRY(); + + PQclear(res); } /* @@ -1004,16 +997,17 @@ materializeQueryResult(FunctionCallInfo fcinfo, bool fail) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - PGresult *volatile res = NULL; - volatile storeInfo sinfo = {0}; /* prepTuplestoreResult must have been called previously */ Assert(rsinfo->returnMode == SFRM_Materialize); - sinfo.fcinfo = fcinfo; - + /* Use a PG_TRY block to ensure we pump libpq dry of results */ PG_TRY(); { + storeInfo sinfo = {0}; + PGresult *res; + + sinfo.fcinfo = fcinfo; /* Create short-lived memory context for data conversions */ sinfo.tmpcontext = AllocSetContextCreate(CurrentMemoryContext, "dblink temporary context", @@ -1026,14 +1020,7 @@ materializeQueryResult(FunctionCallInfo fcinfo, (PQresultStatus(res) != PGRES_COMMAND_OK && PQresultStatus(res) != PGRES_TUPLES_OK)) { - /* - * dblink_res_error will clear the passed PGresult, so we need - * this ugly dance to avoid doing so twice during error exit - */ - PGresult *res1 = res; - - res = NULL; - dblink_res_error(conn, conname, res1, fail, + dblink_res_error(conn, conname, res, fail, "while executing query"); /* if fail isn't set, we'll return an empty query result */ } @@ -1072,7 +1059,6 @@ materializeQueryResult(FunctionCallInfo fcinfo, tuplestore_puttuple(tupstore, tuple); PQclear(res); - res = NULL; } else { @@ -1081,26 +1067,20 @@ materializeQueryResult(FunctionCallInfo fcinfo, Assert(rsinfo->setResult != NULL); PQclear(res); - res = NULL; } /* clean up data conversion short-lived memory context */ if (sinfo.tmpcontext != NULL) MemoryContextDelete(sinfo.tmpcontext); - sinfo.tmpcontext = NULL; PQclear(sinfo.last_res); - sinfo.last_res = NULL; PQclear(sinfo.cur_res); - sinfo.cur_res = NULL; } PG_CATCH(); { - /* be sure to release any libpq result we collected */ - PQclear(res); - PQclear(sinfo.last_res); - PQclear(sinfo.cur_res); - /* and clear out any pending data in libpq */ + PGresult *res; + + /* be sure to clear out any pending data in libpq */ while ((res = libpqsrv_get_result(conn, dblink_we_get_result)) != NULL) PQclear(res); @@ -1113,7 +1093,7 @@ materializeQueryResult(FunctionCallInfo fcinfo, * Execute query, and send any result rows to sinfo->tuplestore. */ static PGresult * -storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql) +storeQueryResult(storeInfo *sinfo, PGconn *conn, const char *sql) { bool first = true; int nestlevel = -1; @@ -1181,7 +1161,7 @@ storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql) * (in this case the PGresult might contain either zero or one row). 
*/ static void -storeRow(volatile storeInfo *sinfo, PGresult *res, bool first) +storeRow(storeInfo *sinfo, PGresult *res, bool first) { int nfields = PQnfields(res); HeapTuple tuple; @@ -1304,6 +1284,9 @@ dblink_get_connections(PG_FUNCTION_ARGS) hash_seq_init(&status, remoteConnHash); while ((hentry = (remoteConnHashEnt *) hash_seq_search(&status)) != NULL) { + /* ignore it if it's not an open connection */ + if (hentry->rconn.conn == NULL) + continue; /* stash away current value */ astate = accumArrayResult(astate, CStringGetTextDatum(hentry->name), @@ -2477,6 +2460,21 @@ get_tuple_of_interest(Relation rel, int *pkattnums, int pknumatts, char **src_pk return NULL; } +static void +RangeVarCallbackForDblink(const RangeVar *relation, + Oid relId, Oid oldRelId, void *arg) +{ + AclResult aclresult; + + if (!OidIsValid(relId)) + return; + + aclresult = pg_class_aclcheck(relId, GetUserId(), *((AclMode *) arg)); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, get_relkind_objtype(get_rel_relkind(relId)), + relation->relname); +} + /* * Open the relation named by relname_text, acquire specified type of lock, * verify we have specified permissions. @@ -2486,19 +2484,13 @@ static Relation get_rel_from_relname(text *relname_text, LOCKMODE lockmode, AclMode aclmode) { RangeVar *relvar; - Relation rel; - AclResult aclresult; + Oid relid; relvar = makeRangeVarFromNameList(textToQualifiedNameList(relname_text)); - rel = table_openrv(relvar, lockmode); - - aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), - aclmode); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, get_relkind_objtype(rel->rd_rel->relkind), - RelationGetRelationName(rel)); + relid = RangeVarGetRelidExtended(relvar, lockmode, 0, + RangeVarCallbackForDblink, &aclmode); - return rel; + return table_open(relid, NoLock); } /* @@ -2539,8 +2531,8 @@ getConnectionByName(const char *name) hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, key, HASH_FIND, NULL); - if (hentry) - return hentry->rconn; + if (hentry && hentry->rconn.conn != NULL) + return &hentry->rconn; return NULL; } @@ -2557,8 +2549,8 @@ createConnHash(void) HASH_ELEM | HASH_STRINGS); } -static void -createNewConnection(const char *name, remoteConn *rconn) +static remoteConn * +createNewConnection(const char *name) { remoteConnHashEnt *hentry; bool found; @@ -2572,17 +2564,15 @@ createNewConnection(const char *name, remoteConn *rconn) hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, key, HASH_ENTER, &found); - if (found) - { - libpqsrv_disconnect(rconn->conn); - pfree(rconn); - + if (found && hentry->rconn.conn != NULL) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("duplicate connection name"))); - } - hentry->rconn = rconn; + /* New, or reusable, so initialize the rconn struct to zeroes */ + memset(&hentry->rconn, 0, sizeof(remoteConn)); + + return &hentry->rconn; } static void @@ -2662,7 +2652,7 @@ dblink_connstr_has_required_scram_options(const char *connstr) PQconninfoFree(options); } - has_scram_keys = has_scram_client_key && has_scram_server_key && MyProcPort->has_scram_keys; + has_scram_keys = has_scram_client_key && has_scram_server_key && MyProcPort != NULL && MyProcPort->has_scram_keys; return (has_scram_keys && has_require_auth); } @@ -2671,9 +2661,12 @@ dblink_connstr_has_required_scram_options(const char *connstr) * We need to make sure that the connection made used credentials * which were provided by the user, so check what credentials were * used to connect and then make sure that they came from the 
user.
+ *
+ * On failure, we close "conn" and also delete the hashtable entry
+ * identified by "connname" (if that's not NULL).
  */
 static void
-dblink_security_check(PGconn *conn, remoteConn *rconn, const char *connstr)
+dblink_security_check(PGconn *conn, const char *connname, const char *connstr)
 {
 	/* Superuser bypasses security check */
 	if (superuser())
@@ -2692,7 +2685,7 @@ dblink_security_check(PGconn *conn, remoteConn *rconn, const char *connstr)
 	 * only added if UseScramPassthrough is set, and the user is not allowed
 	 * to add the SCRAM keys on fdw and user mapping options.
 	 */
-	if (MyProcPort->has_scram_keys && dblink_connstr_has_required_scram_options(connstr))
+	if (MyProcPort != NULL && MyProcPort->has_scram_keys && dblink_connstr_has_required_scram_options(connstr))
 		return;
 
 #ifdef ENABLE_GSS
@@ -2703,8 +2696,8 @@
 	/* Otherwise, fail out */
 	libpqsrv_disconnect(conn);
-	if (rconn)
-		pfree(rconn);
+	if (connname)
+		deleteConnection(connname);
 
 	ereport(ERROR,
 			(errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED),
@@ -2765,7 +2758,7 @@ dblink_connstr_check(const char *connstr)
 	if (dblink_connstr_has_pw(connstr))
 		return;
 
-	if (MyProcPort->has_scram_keys && dblink_connstr_has_required_scram_options(connstr))
+	if (MyProcPort != NULL && MyProcPort->has_scram_keys && dblink_connstr_has_required_scram_options(connstr))
 		return;
 
 #ifdef ENABLE_GSS
@@ -2782,10 +2775,13 @@
 /*
  * Report an error received from the remote server
  *
- * res: the received error result (will be freed)
+ * res: the received error result
  * fail: true for ERROR ereport, false for NOTICE
  * fmt and following args: sprintf-style format and values for errcontext;
  *		the resulting string should be worded like "while <some action>"
+ *
+ * If "res" is not NULL, it'll be PQclear'ed here (unless we throw error,
+ * in which case memory context cleanup will clear it eventually).
  */
 static void
 dblink_res_error(PGconn *conn, const char *conname, PGresult *res,
@@ -2793,15 +2789,11 @@
 {
 	int			level;
 	char	   *pg_diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
-	char	   *pg_diag_message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY);
-	char	   *pg_diag_message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL);
-	char	   *pg_diag_message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT);
-	char	   *pg_diag_context = PQresultErrorField(res, PG_DIAG_CONTEXT);
+	char	   *message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY);
+	char	   *message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL);
+	char	   *message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT);
+	char	   *message_context = PQresultErrorField(res, PG_DIAG_CONTEXT);
 	int			sqlstate;
-	char	   *message_primary;
-	char	   *message_detail;
-	char	   *message_hint;
-	char	   *message_context;
 	va_list		ap;
 	char		dblink_context_msg[512];
@@ -2819,11 +2811,6 @@ dblink_res_error(PGconn *conn, const char *conname, PGresult *res,
 	else
 		sqlstate = ERRCODE_CONNECTION_FAILURE;
 
-	message_primary = xpstrdup(pg_diag_message_primary);
-	message_detail = xpstrdup(pg_diag_message_detail);
-	message_hint = xpstrdup(pg_diag_message_hint);
-	message_context = xpstrdup(pg_diag_context);
-
 	/*
 	 * If we don't get a message from the PGresult, try the PGconn.
This is * needed because for connection-level failures, PQgetResult may just @@ -2832,14 +2819,6 @@ dblink_res_error(PGconn *conn, const char *conname, PGresult *res, if (message_primary == NULL) message_primary = pchomp(PQerrorMessage(conn)); - /* - * Now that we've copied all the data we need out of the PGresult, it's - * safe to free it. We must do this to avoid PGresult leakage. We're - * leaking all the strings too, but those are in palloc'd memory that will - * get cleaned up eventually. - */ - PQclear(res); - /* * Format the basic errcontext string. Below, we'll add on something * about the connection name. That's a violation of the translatability @@ -2864,6 +2843,7 @@ dblink_res_error(PGconn *conn, const char *conname, PGresult *res, dblink_context_msg, conname)) : (errcontext("%s on unnamed dblink connection", dblink_context_msg)))); + PQclear(res); } /* @@ -2925,7 +2905,7 @@ get_connect_string(const char *servername) * the user overwrites these options we can ereport on * dblink_connstr_check and dblink_security_check. */ - if (MyProcPort->has_scram_keys && UseScramPassthrough(foreign_server, user_mapping)) + if (MyProcPort != NULL && MyProcPort->has_scram_keys && UseScramPassthrough(foreign_server, user_mapping)) appendSCRAMKeysInfo(&buf); foreach(cell, fdw->options) @@ -3040,7 +3020,7 @@ validate_pkattnums(Relation rel, for (j = 0; j < natts; j++) { /* dropped columns don't count */ - if (TupleDescAttr(tupdesc, j)->attisdropped) + if (TupleDescCompactAttr(tupdesc, j)->attisdropped) continue; if (++lnum == pkattnum) diff --git a/contrib/dblink/meson.build b/contrib/dblink/meson.build index dfd8eb6877e90..a19ce6cf4b924 100644 --- a/contrib/dblink/meson.build +++ b/contrib/dblink/meson.build @@ -34,7 +34,7 @@ tests += { 'sql': [ 'dblink', ], - 'regress_args': ['--dlpath', meson.build_root() / 'src/test/regress'], + 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], }, 'tap': { 'tests': [ diff --git a/contrib/dict_int/dict_int.c b/contrib/dict_int/dict_int.c index bdad52d202897..5d9523a1a632b 100644 --- a/contrib/dict_int/dict_int.c +++ b/contrib/dict_int/dict_int.c @@ -38,7 +38,7 @@ dintdict_init(PG_FUNCTION_ARGS) DictInt *d; ListCell *l; - d = (DictInt *) palloc0(sizeof(DictInt)); + d = palloc0_object(DictInt); d->maxlen = 6; d->rejectlong = false; d->absval = false; @@ -83,7 +83,7 @@ dintdict_lexize(PG_FUNCTION_ARGS) char *in = (char *) PG_GETARG_POINTER(1); int len = PG_GETARG_INT32(2); char *txt; - TSLexeme *res = palloc0(sizeof(TSLexeme) * 2); + TSLexeme *res = palloc0_array(TSLexeme, 2); res[1].lexeme = NULL; diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c index 1ec5285d6d1fc..bbb2fd850fd0c 100644 --- a/contrib/dict_xsyn/dict_xsyn.c +++ b/contrib/dict_xsyn/dict_xsyn.c @@ -109,9 +109,9 @@ read_dictionary(DictSyn *d, const char *filename) { d->len = (d->len > 0) ? 
2 * d->len : 16; if (d->syn) - d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); + d->syn = repalloc_array(d->syn, Syn, d->len); else - d->syn = (Syn *) palloc(sizeof(Syn) * d->len); + d->syn = palloc_array(Syn, d->len); } /* Save first word only if we will match it */ @@ -150,7 +150,7 @@ dxsyn_init(PG_FUNCTION_ARGS) ListCell *l; char *filename = NULL; - d = (DictSyn *) palloc0(sizeof(DictSyn)); + d = palloc0_object(DictSyn); d->len = 0; d->syn = NULL; d->matchorig = true; @@ -235,7 +235,7 @@ dxsyn_lexize(PG_FUNCTION_ARGS) char *end; int nsyns = 0; - res = palloc(sizeof(TSLexeme)); + res = palloc_object(TSLexeme); pos = value; while ((syn = find_word(pos, &end)) != NULL) diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out index df8d43b374989..5121e27dce57b 100644 --- a/contrib/file_fdw/expected/file_fdw.out +++ b/contrib/file_fdw/expected/file_fdw.out @@ -48,6 +48,10 @@ SET ROLE regress_file_fdw_superuser; CREATE USER MAPPING FOR regress_file_fdw_superuser SERVER file_server; CREATE USER MAPPING FOR regress_no_priv_user SERVER file_server; -- validator tests +CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (foo 'bar'); -- ERROR +ERROR: invalid option "foo" +CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS ("a=b" 'true'); -- ERROR +ERROR: invalid option name "a=b": must not contain "=" CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'xml'); -- ERROR ERROR: COPY format "xml" not recognized CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'text', quote ':'); -- ERROR @@ -318,6 +322,7 @@ SET constraint_exclusion = 'on'; SELECT explain_filter('EXPLAIN (VERBOSE, COSTS FALSE) SELECT * FROM agg_csv WHERE a < 0'); Result Output: a, b + Replaces: Scan on agg_csv One-Time Filter: false \t off diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index a9a5671d95a6f..e9cda3c47d153 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -531,7 +531,7 @@ fileGetForeignRelSize(PlannerInfo *root, * we might as well get everything and not need to re-fetch it later in * planning. */ - fdw_private = (FileFdwPlanState *) palloc(sizeof(FileFdwPlanState)); + fdw_private = palloc_object(FileFdwPlanState); fileGetOptions(foreigntableid, &fdw_private->filename, &fdw_private->is_program, @@ -712,7 +712,7 @@ fileBeginForeignScan(ForeignScanState *node, int eflags) * Save state in node->fdw_state. We must save enough information to call * BeginCopyFrom() again. 
*/ - festate = (FileFdwExecutionState *) palloc(sizeof(FileFdwExecutionState)); + festate = palloc_object(FileFdwExecutionState); festate->filename = filename; festate->is_program = is_program; festate->options = options; @@ -1026,9 +1026,7 @@ check_selective_binary_conversion(RelOptInfo *baserel, numattrs = 0; for (i = 0; i < tupleDesc->natts; i++) { - Form_pg_attribute attr = TupleDescAttr(tupleDesc, i); - - if (attr->attisdropped) + if (TupleDescCompactAttr(tupleDesc, i)->attisdropped) continue; numattrs++; } diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql index 2cdbe7a8a4c52..1a397ad4bd150 100644 --- a/contrib/file_fdw/sql/file_fdw.sql +++ b/contrib/file_fdw/sql/file_fdw.sql @@ -55,6 +55,8 @@ CREATE USER MAPPING FOR regress_file_fdw_superuser SERVER file_server; CREATE USER MAPPING FOR regress_no_priv_user SERVER file_server; -- validator tests +CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (foo 'bar'); -- ERROR +CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS ("a=b" 'true'); -- ERROR CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'xml'); -- ERROR CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'text', quote ':'); -- ERROR CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'text', escape ':'); -- ERROR diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c index 6627b2b89433a..227d8b11ddca8 100644 --- a/contrib/fuzzystrmatch/dmetaphone.c +++ b/contrib/fuzzystrmatch/dmetaphone.c @@ -327,7 +327,7 @@ GetAt(metastring *s, int pos) if ((pos < 0) || (pos >= s->length)) return '\0'; - return ((char) *(s->str + pos)); + return *(s->str + pos); } diff --git a/contrib/hstore/hstore_compat.c b/contrib/hstore/hstore_compat.c index d75e9cb23f5cd..3a9f7f45cb71c 100644 --- a/contrib/hstore/hstore_compat.c +++ b/contrib/hstore/hstore_compat.c @@ -94,7 +94,7 @@ * etc. are compatible. * * If the above statement isn't true on some bizarre platform, we're - * a bit hosed (see StaticAssertStmt in hstoreValidOldFormat). + * a bit hosed. 
*/ typedef struct { @@ -105,6 +105,9 @@ typedef struct pos:31; } HOldEntry; +StaticAssertDecl(sizeof(HOldEntry) == 2 * sizeof(HEntry), + "old hstore format is not upward-compatible"); + static int hstoreValidNewFormat(HStore *hs); static int hstoreValidOldFormat(HStore *hs); @@ -179,10 +182,6 @@ hstoreValidOldFormat(HStore *hs) if (hs->size_ & HS_FLAG_NEWVERSION) return 0; - /* New format uses an HEntry for key and another for value */ - StaticAssertStmt(sizeof(HOldEntry) == 2 * sizeof(HEntry), - "old hstore format is not upward-compatible"); - if (count == 0) return 2; diff --git a/contrib/hstore/hstore_gin.c b/contrib/hstore/hstore_gin.c index 766c00bb6a735..2e5fa115924ba 100644 --- a/contrib/hstore/hstore_gin.c +++ b/contrib/hstore/hstore_gin.c @@ -127,7 +127,7 @@ gin_extract_hstore_query(PG_FUNCTION_ARGS) /* Nulls in the array are ignored, cf hstoreArrayToPairs */ if (key_nulls[i]) continue; - item = makeitem(VARDATA(key_datums[i]), VARSIZE(key_datums[i]) - VARHDRSZ, KEYFLAG); + item = makeitem(VARDATA(DatumGetPointer(key_datums[i])), VARSIZE(DatumGetPointer(key_datums[i])) - VARHDRSZ, KEYFLAG); entries[j++] = PointerGetDatum(item); } diff --git a/contrib/hstore/hstore_gist.c b/contrib/hstore/hstore_gist.c index a3b08af385016..36825ef867b42 100644 --- a/contrib/hstore/hstore_gist.c +++ b/contrib/hstore/hstore_gist.c @@ -175,7 +175,7 @@ ghstore_compress(PG_FUNCTION_ARGS) } } - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(res), entry->rel, entry->page, entry->offset, @@ -195,7 +195,7 @@ ghstore_compress(PG_FUNCTION_ARGS) res = ghstore_alloc(true, siglen, NULL); - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(res), entry->rel, entry->page, entry->offset, @@ -429,7 +429,7 @@ ghstore_picksplit(PG_FUNCTION_ARGS) maxoff = OffsetNumberNext(maxoff); /* sort before ... 
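The StaticAssertDecl hunk above completes the comment change in hstore_compat.c: the compile-time layout check moves from inside hstoreValidOldFormat() to file scope. A minimal sketch of the two assertion forms involved (both macros come from PostgreSQL's c.h; the conditions here are arbitrary examples):

    /* declaration form: usable at file scope, outside any function */
    StaticAssertDecl(sizeof(uint32) == 4, "uint32 must be 4 bytes");

    static void
    example(void)
    {
        /* statement form: only usable where a statement is allowed */
        StaticAssertStmt(sizeof(uint16) == 2, "uint16 must be 2 bytes");
    }

At file scope the check no longer depends on the validation function that happened to contain it, which is why the old in-function assertion can simply be deleted.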
*/ - costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + costvector = palloc_array(SPLITCOST, maxoff); for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) { costvector[j - 1].pos = j; @@ -576,7 +576,7 @@ ghstore_consistent(PG_FUNCTION_ARGS) if (key_nulls[i]) continue; - crc = crc32_sz(VARDATA(key_datums[i]), VARSIZE(key_datums[i]) - VARHDRSZ); + crc = crc32_sz(VARDATA(DatumGetPointer(key_datums[i])), VARSIZE(DatumGetPointer(key_datums[i])) - VARHDRSZ); if (!(GETBIT(sign, HASHVAL(crc, siglen)))) res = false; } @@ -599,7 +599,7 @@ ghstore_consistent(PG_FUNCTION_ARGS) if (key_nulls[i]) continue; - crc = crc32_sz(VARDATA(key_datums[i]), VARSIZE(key_datums[i]) - VARHDRSZ); + crc = crc32_sz(VARDATA(DatumGetPointer(key_datums[i])), VARSIZE(DatumGetPointer(key_datums[i])) - VARHDRSZ); if (GETBIT(sign, HASHVAL(crc, siglen))) res = true; } diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c index 4f867e4bd1f1c..34e3918811cd4 100644 --- a/contrib/hstore/hstore_io.c +++ b/contrib/hstore/hstore_io.c @@ -221,7 +221,7 @@ parse_hstore(HSParser *state) bool escaped = false; state->plen = 16; - state->pairs = (Pairs *) palloc(sizeof(Pairs) * state->plen); + state->pairs = palloc_array(Pairs, state->plen); state->pcur = 0; state->ptr = state->begin; state->word = NULL; @@ -684,22 +684,22 @@ hstore_from_arrays(PG_FUNCTION_ARGS) if (!value_nulls || value_nulls[i]) { - pairs[i].key = VARDATA(key_datums[i]); + pairs[i].key = VARDATA(DatumGetPointer(key_datums[i])); pairs[i].val = NULL; pairs[i].keylen = - hstoreCheckKeyLen(VARSIZE(key_datums[i]) - VARHDRSZ); + hstoreCheckKeyLen(VARSIZE(DatumGetPointer(key_datums[i])) - VARHDRSZ); pairs[i].vallen = 4; pairs[i].isnull = true; pairs[i].needfree = false; } else { - pairs[i].key = VARDATA(key_datums[i]); - pairs[i].val = VARDATA(value_datums[i]); + pairs[i].key = VARDATA(DatumGetPointer(key_datums[i])); + pairs[i].val = VARDATA(DatumGetPointer(value_datums[i])); pairs[i].keylen = - hstoreCheckKeyLen(VARSIZE(key_datums[i]) - VARHDRSZ); + hstoreCheckKeyLen(VARSIZE(DatumGetPointer(key_datums[i])) - VARHDRSZ); pairs[i].vallen = - hstoreCheckValLen(VARSIZE(value_datums[i]) - VARHDRSZ); + hstoreCheckValLen(VARSIZE(DatumGetPointer(value_datums[i])) - VARHDRSZ); pairs[i].isnull = false; pairs[i].needfree = false; } @@ -778,22 +778,22 @@ hstore_from_array(PG_FUNCTION_ARGS) if (in_nulls[i * 2 + 1]) { - pairs[i].key = VARDATA(in_datums[i * 2]); + pairs[i].key = VARDATA(DatumGetPointer(in_datums[i * 2])); pairs[i].val = NULL; pairs[i].keylen = - hstoreCheckKeyLen(VARSIZE(in_datums[i * 2]) - VARHDRSZ); + hstoreCheckKeyLen(VARSIZE(DatumGetPointer(in_datums[i * 2])) - VARHDRSZ); pairs[i].vallen = 4; pairs[i].isnull = true; pairs[i].needfree = false; } else { - pairs[i].key = VARDATA(in_datums[i * 2]); - pairs[i].val = VARDATA(in_datums[i * 2 + 1]); + pairs[i].key = VARDATA(DatumGetPointer(in_datums[i * 2])); + pairs[i].val = VARDATA(DatumGetPointer(in_datums[i * 2 + 1])); pairs[i].keylen = - hstoreCheckKeyLen(VARSIZE(in_datums[i * 2]) - VARHDRSZ); + hstoreCheckKeyLen(VARSIZE(DatumGetPointer(in_datums[i * 2])) - VARHDRSZ); pairs[i].vallen = - hstoreCheckValLen(VARSIZE(in_datums[i * 2 + 1]) - VARHDRSZ); + hstoreCheckValLen(VARSIZE(DatumGetPointer(in_datums[i * 2 + 1])) - VARHDRSZ); pairs[i].isnull = false; pairs[i].needfree = false; } @@ -1439,10 +1439,9 @@ hstore_to_jsonb(PG_FUNCTION_ARGS) int count = HS_COUNT(in); char *base = STRPTR(in); HEntry *entries = ARRPTR(in); - JsonbParseState *state = NULL; - JsonbValue *res; + JsonbInState 
state = {0}; - (void) pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); for (i = 0; i < count; i++) { @@ -1453,7 +1452,7 @@ hstore_to_jsonb(PG_FUNCTION_ARGS) key.val.string.len = HSTORE_KEYLEN(entries, i); key.val.string.val = HSTORE_KEY(entries, base, i); - (void) pushJsonbValue(&state, WJB_KEY, &key); + pushJsonbValue(&state, WJB_KEY, &key); if (HSTORE_VALISNULL(entries, i)) { @@ -1465,12 +1464,12 @@ hstore_to_jsonb(PG_FUNCTION_ARGS) val.val.string.len = HSTORE_VALLEN(entries, i); val.val.string.val = HSTORE_VAL(entries, base, i); } - (void) pushJsonbValue(&state, WJB_VALUE, &val); + pushJsonbValue(&state, WJB_VALUE, &val); } - res = pushJsonbValue(&state, WJB_END_OBJECT, NULL); + pushJsonbValue(&state, WJB_END_OBJECT, NULL); - PG_RETURN_POINTER(JsonbValueToJsonb(res)); + PG_RETURN_POINTER(JsonbValueToJsonb(state.result)); } PG_FUNCTION_INFO_V1(hstore_to_jsonb_loose); @@ -1482,13 +1481,12 @@ hstore_to_jsonb_loose(PG_FUNCTION_ARGS) int count = HS_COUNT(in); char *base = STRPTR(in); HEntry *entries = ARRPTR(in); - JsonbParseState *state = NULL; - JsonbValue *res; + JsonbInState state = {0}; StringInfoData tmp; initStringInfo(&tmp); - (void) pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); for (i = 0; i < count; i++) { @@ -1499,7 +1497,7 @@ hstore_to_jsonb_loose(PG_FUNCTION_ARGS) key.val.string.len = HSTORE_KEYLEN(entries, i); key.val.string.val = HSTORE_KEY(entries, base, i); - (void) pushJsonbValue(&state, WJB_KEY, &key); + pushJsonbValue(&state, WJB_KEY, &key); if (HSTORE_VALISNULL(entries, i)) { @@ -1541,10 +1539,10 @@ hstore_to_jsonb_loose(PG_FUNCTION_ARGS) val.val.string.val = HSTORE_VAL(entries, base, i); } } - (void) pushJsonbValue(&state, WJB_VALUE, &val); + pushJsonbValue(&state, WJB_VALUE, &val); } - res = pushJsonbValue(&state, WJB_END_OBJECT, NULL); + pushJsonbValue(&state, WJB_END_OBJECT, NULL); - PG_RETURN_POINTER(JsonbValueToJsonb(res)); + PG_RETURN_POINTER(JsonbValueToJsonb(state.result)); } diff --git a/contrib/hstore/hstore_op.c b/contrib/hstore/hstore_op.c index 5e57eceffc817..bcba75f925808 100644 --- a/contrib/hstore/hstore_op.c +++ b/contrib/hstore/hstore_op.c @@ -107,8 +107,8 @@ hstoreArrayToPairs(ArrayType *a, int *npairs) { if (!key_nulls[i]) { - key_pairs[j].key = VARDATA(key_datums[i]); - key_pairs[j].keylen = VARSIZE(key_datums[i]) - VARHDRSZ; + key_pairs[j].key = VARDATA(DatumGetPointer(key_datums[i])); + key_pairs[j].keylen = VARSIZE(DatumGetPointer(key_datums[i])) - VARHDRSZ; key_pairs[j].val = NULL; key_pairs[j].vallen = 0; key_pairs[j].needfree = 0; diff --git a/contrib/hstore/hstore_subs.c b/contrib/hstore/hstore_subs.c index 3d03f66fa0dfb..1bae69e4e2c63 100644 --- a/contrib/hstore/hstore_subs.c +++ b/contrib/hstore/hstore_subs.c @@ -74,7 +74,7 @@ hstore_subscript_transform(SubscriptingRef *sbsref, errmsg("hstore subscript must have type text"), parser_errposition(pstate, exprLocation(ai->uidx)))); - /* ... and store the transformed subscript into the SubscriptRef node */ + /* ... 
and store the transformed subscript into the SubscriptingRef node */ sbsref->refupperindexpr = list_make1(subexpr); sbsref->reflowerindexpr = NIL; diff --git a/contrib/intarray/_int_bool.c b/contrib/intarray/_int_bool.c index 2b2c3f4029ec5..f45df86d60c38 100644 --- a/contrib/intarray/_int_bool.c +++ b/contrib/intarray/_int_bool.c @@ -135,7 +135,7 @@ gettoken(WORKSTATE *state, int32 *val) static void pushquery(WORKSTATE *state, int32 type, int32 val) { - NODE *tmp = (NODE *) palloc(sizeof(NODE)); + NODE *tmp = palloc_object(NODE); tmp->type = type; tmp->val = val; @@ -346,7 +346,7 @@ gin_bool_consistent(QUERYTYPE *query, bool *check) * extraction code in ginint4_queryextract. */ gcv.first = items; - gcv.mapped_check = (bool *) palloc(sizeof(bool) * query->size); + gcv.mapped_check = palloc_array(bool, query->size); for (i = 0; i < query->size; i++) { if (items[i].type == VAL) @@ -613,7 +613,7 @@ infix(INFIX *in, bool first) nrm.curpol = in->curpol; nrm.buflen = 16; - nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + nrm.cur = nrm.buf = palloc_array(char, nrm.buflen); /* get right operand */ infix(&nrm, false); @@ -651,7 +651,7 @@ bqarr_out(PG_FUNCTION_ARGS) nrm.curpol = GETQUERY(query) + query->size - 1; nrm.buflen = 32; - nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + nrm.cur = nrm.buf = palloc_array(char, nrm.buflen); *(nrm.cur) = '\0'; infix(&nrm, true); diff --git a/contrib/intarray/_int_gin.c b/contrib/intarray/_int_gin.c index b7958d8eca5fb..c60616c3f772c 100644 --- a/contrib/intarray/_int_gin.c +++ b/contrib/intarray/_int_gin.c @@ -42,7 +42,7 @@ ginint4_queryextract(PG_FUNCTION_ARGS) /* * Extract all the VAL items as things we want GIN to check for. */ - res = (Datum *) palloc(sizeof(Datum) * query->size); + res = palloc_array(Datum, query->size); *nentries = 0; for (i = 0; i < query->size; i++) @@ -65,7 +65,7 @@ ginint4_queryextract(PG_FUNCTION_ARGS) int32 *arr; int32 i; - res = (Datum *) palloc(sizeof(Datum) * (*nentries)); + res = palloc_array(Datum, *nentries); arr = ARRPTR(query); for (i = 0; i < *nentries; i++) diff --git a/contrib/intarray/_int_gist.c b/contrib/intarray/_int_gist.c index a09b7fa812cb2..90cf11c01a50b 100644 --- a/contrib/intarray/_int_gist.c +++ b/contrib/intarray/_int_gist.c @@ -186,7 +186,7 @@ g_int_compress(PG_FUNCTION_ARGS) errmsg("input array is too big (%d maximum allowed, %d current), use gist__intbig_ops opclass instead", 2 * num_ranges - 1, ARRNELEMS(r)))); - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, false); @@ -276,7 +276,7 @@ g_int_compress(PG_FUNCTION_ARGS) errmsg("data is too sparse, recreate index using gist__intbig_ops opclass instead"))); r = resize_intArrayType(r, len); - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, false); PG_RETURN_POINTER(retval); @@ -306,7 +306,7 @@ g_int_decompress(PG_FUNCTION_ARGS) { if (in != (ArrayType *) DatumGetPointer(entry->key)) { - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(in), entry->rel, entry->page, entry->offset, false); PG_RETURN_POINTER(retval); @@ -321,7 +321,7 @@ g_int_decompress(PG_FUNCTION_ARGS) { /* not compressed value */ if (in != (ArrayType *) DatumGetPointer(entry->key)) { - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, 
PointerGetDatum(in), entry->rel, entry->page, entry->offset, false); @@ -350,7 +350,7 @@ g_int_decompress(PG_FUNCTION_ARGS) if (in != (ArrayType *) DatumGetPointer(entry->key)) pfree(in); - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, false); @@ -535,7 +535,7 @@ g_int_picksplit(PG_FUNCTION_ARGS) /* * sort entries */ - costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + costvector = palloc_array(SPLITCOST, maxoff); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { costvector[i - 1].pos = i; diff --git a/contrib/intarray/_int_op.c b/contrib/intarray/_int_op.c index ba6d0a99995ed..a706e353c6f94 100644 --- a/contrib/intarray/_int_op.c +++ b/contrib/intarray/_int_op.c @@ -108,7 +108,7 @@ _int_overlap(PG_FUNCTION_ARGS) CHECKARRVALID(a); CHECKARRVALID(b); if (ARRISEMPTY(a) || ARRISEMPTY(b)) - return false; + PG_RETURN_BOOL(false); SORT(a); SORT(b); diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c index 6c3b7ace146aa..ddffd69cb6e60 100644 --- a/contrib/intarray/_int_selfuncs.c +++ b/contrib/intarray/_int_selfuncs.c @@ -177,7 +177,7 @@ _int_matchsel(PG_FUNCTION_ARGS) if (query->size == 0) { ReleaseVariableStats(vardata); - return (Selectivity) 0.0; + PG_RETURN_FLOAT8(0.0); } /* @@ -210,8 +210,8 @@ _int_matchsel(PG_FUNCTION_ARGS) */ if (sslot.nnumbers == sslot.nvalues + 3) { - /* Grab the lowest frequency. */ - minfreq = sslot.numbers[sslot.nnumbers - (sslot.nnumbers - sslot.nvalues)]; + /* Grab the minimal MCE frequency. */ + minfreq = sslot.numbers[sslot.nvalues]; mcelems = sslot.values; mcefreqs = sslot.numbers; @@ -269,8 +269,11 @@ int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs, else { /* - * The element is not in MCELEM. Punt, but assume that the - * selectivity cannot be more than minfreq / 2. + * The element is not in MCELEM. Estimate its frequency as half + * that of the least-frequent MCE. (We know it cannot be more + * than minfreq, and it could be a great deal less. Half seems + * like a good compromise.) For probably-historical reasons, + * clamp to not more than DEFAULT_EQ_SEL. */ selec = Min(DEFAULT_EQ_SEL, minfreq / 2); } diff --git a/contrib/intarray/_intbig_gist.c b/contrib/intarray/_intbig_gist.c index 9699fbf3b4fe5..0afa8a73b6827 100644 --- a/contrib/intarray/_intbig_gist.c +++ b/contrib/intarray/_intbig_gist.c @@ -174,7 +174,7 @@ g_intbig_compress(PG_FUNCTION_ARGS) ptr++; } - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(res), entry->rel, entry->page, entry->offset, false); @@ -195,7 +195,7 @@ g_intbig_compress(PG_FUNCTION_ARGS) } res = _intbig_alloc(true, siglen, sign); - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(res), entry->rel, entry->page, entry->offset, false); @@ -385,7 +385,7 @@ g_intbig_picksplit(PG_FUNCTION_ARGS) maxoff = OffsetNumberNext(maxoff); /* sort before ... 
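Context for the _int_selfuncs.c fix above: in an MCELEM statistics slot, numbers[] holds one frequency per entry of values[], followed by three extra members. A sketch of the layout the new code indexes into (this is the array_typanalyze convention the surrounding code already assumes; the comments are explanatory, not from the patch):

    /*
     * numbers[0 .. nvalues-1]  per-element frequencies, parallel to values[]
     * numbers[nvalues]         minimum element frequency
     * numbers[nvalues + 1]     maximum element frequency
     * numbers[nvalues + 2]     frequency of null elements
     *
     * hence the guard "sslot.nnumbers == sslot.nvalues + 3", and the
     * minimum MCE frequency lives at index nvalues:
     */
    minfreq = sslot.numbers[sslot.nvalues];

The old expression sslot.numbers[sslot.nnumbers - (sslot.nnumbers - sslot.nvalues)] reduces to the same index, just written confusingly; the rewrite is a readability fix, not a behavior change.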
*/ - costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + costvector = palloc_array(SPLITCOST, maxoff); for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) { costvector[j - 1].pos = j; diff --git a/contrib/isn/UPC.h b/contrib/isn/UPC.h index 01b9f1559255c..9af19a369c7b8 100644 --- a/contrib/isn/UPC.h +++ b/contrib/isn/UPC.h @@ -1,5 +1,5 @@ /* - * ISSN.h + * UPC.h * PostgreSQL type definitions for ISNs (ISBN, ISMN, ISSN, EAN13, UPC) * * No information available for UPC prefixes diff --git a/contrib/isn/isn.c b/contrib/isn/isn.c index 038c8ed4db7bd..3caa3af8b4c41 100644 --- a/contrib/isn/isn.c +++ b/contrib/isn/isn.c @@ -423,19 +423,10 @@ ean2isn(ean13 ean, bool errorOK, ean13 *result, enum isn_type accept) eantoobig: if (!errorOK) - { - char eanbuf[64]; - - /* - * Format the number separately to keep the machine-dependent format - * code out of the translatable message text - */ - snprintf(eanbuf, sizeof(eanbuf), EAN13_FORMAT, ean); ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value \"%s\" is out of range for %s type", - eanbuf, isn_names[type]))); - } + errmsg("value \"%" PRIu64 "\" is out of range for %s type", + ean, isn_names[type]))); return false; } @@ -660,19 +651,10 @@ ean2string(ean13 ean, bool errorOK, char *result, bool shortType) eantoobig: if (!errorOK) - { - char eanbuf[64]; - - /* - * Format the number separately to keep the machine-dependent format - * code out of the translatable message text - */ - snprintf(eanbuf, sizeof(eanbuf), EAN13_FORMAT, ean); ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value \"%s\" is out of range for %s type", - eanbuf, isn_names[type]))); - } + errmsg("value \"%" PRIu64 "\" is out of range for %s type", + ean, isn_names[type]))); return false; } @@ -726,7 +708,7 @@ string2ean(const char *str, struct Node *escontext, ean13 *result, if (type != INVALID) goto eaninvalid; type = ISSN; - *aux1++ = toupper((unsigned char) *aux2); + *aux1++ = pg_ascii_toupper((unsigned char) *aux2); length++; } else if (length == 9 && (digit || *aux2 == 'X' || *aux2 == 'x') && last) @@ -736,7 +718,7 @@ string2ean(const char *str, struct Node *escontext, ean13 *result, goto eaninvalid; if (type == INVALID) type = ISBN; /* ISMN must start with 'M' */ - *aux1++ = toupper((unsigned char) *aux2); + *aux1++ = pg_ascii_toupper((unsigned char) *aux2); length++; } else if (length == 11 && digit && last) diff --git a/contrib/isn/isn.h b/contrib/isn/isn.h index 399896ad417c0..4551d7828fb92 100644 --- a/contrib/isn/isn.h +++ b/contrib/isn/isn.h @@ -24,8 +24,6 @@ */ typedef uint64 ean13; -#define EAN13_FORMAT UINT64_FORMAT - #define PG_GETARG_EAN13(n) PG_GETARG_INT64(n) #define PG_RETURN_EAN13(x) PG_RETURN_INT64(x) diff --git a/contrib/jsonb_plperl/jsonb_plperl.c b/contrib/jsonb_plperl/jsonb_plperl.c index c02e2d41af108..67e5e1940d089 100644 --- a/contrib/jsonb_plperl/jsonb_plperl.c +++ b/contrib/jsonb_plperl/jsonb_plperl.c @@ -13,7 +13,7 @@ PG_MODULE_MAGIC_EXT( ); static SV *Jsonb_to_SV(JsonbContainer *jsonb); -static JsonbValue *SV_to_JsonbValue(SV *obj, JsonbParseState **ps, bool is_elem); +static void SV_to_JsonbValue(SV *obj, JsonbInState *ps, bool is_elem); static SV * @@ -127,8 +127,8 @@ Jsonb_to_SV(JsonbContainer *jsonb) } } -static JsonbValue * -AV_to_JsonbValue(AV *in, JsonbParseState **jsonb_state) +static void +AV_to_JsonbValue(AV *in, JsonbInState *jsonb_state) { dTHX; SSize_t pcount = av_len(in) + 1; @@ -141,14 +141,14 @@ AV_to_JsonbValue(AV *in, JsonbParseState **jsonb_state) SV **value = 
av_fetch(in, i, FALSE); if (value) - (void) SV_to_JsonbValue(*value, jsonb_state, true); + SV_to_JsonbValue(*value, jsonb_state, true); } - return pushJsonbValue(jsonb_state, WJB_END_ARRAY, NULL); + pushJsonbValue(jsonb_state, WJB_END_ARRAY, NULL); } -static JsonbValue * -HV_to_JsonbValue(HV *obj, JsonbParseState **jsonb_state) +static void +HV_to_JsonbValue(HV *obj, JsonbInState *jsonb_state) { dTHX; JsonbValue key; @@ -167,14 +167,14 @@ HV_to_JsonbValue(HV *obj, JsonbParseState **jsonb_state) key.val.string.val = pnstrdup(kstr, klen); key.val.string.len = klen; pushJsonbValue(jsonb_state, WJB_KEY, &key); - (void) SV_to_JsonbValue(val, jsonb_state, false); + SV_to_JsonbValue(val, jsonb_state, false); } - return pushJsonbValue(jsonb_state, WJB_END_OBJECT, NULL); + pushJsonbValue(jsonb_state, WJB_END_OBJECT, NULL); } -static JsonbValue * -SV_to_JsonbValue(SV *in, JsonbParseState **jsonb_state, bool is_elem) +static void +SV_to_JsonbValue(SV *in, JsonbInState *jsonb_state, bool is_elem) { dTHX; JsonbValue out; /* result */ @@ -186,10 +186,12 @@ SV_to_JsonbValue(SV *in, JsonbParseState **jsonb_state, bool is_elem) switch (SvTYPE(in)) { case SVt_PVAV: - return AV_to_JsonbValue((AV *) in, jsonb_state); + AV_to_JsonbValue((AV *) in, jsonb_state); + return; case SVt_PVHV: - return HV_to_JsonbValue((HV *) in, jsonb_state); + HV_to_JsonbValue((HV *) in, jsonb_state); + return; default: if (!SvOK(in)) @@ -259,14 +261,24 @@ SV_to_JsonbValue(SV *in, JsonbParseState **jsonb_state, bool is_elem) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot transform this Perl type to jsonb"))); - return NULL; } } - /* Push result into 'jsonb_state' unless it is a raw scalar. */ - return *jsonb_state - ? pushJsonbValue(jsonb_state, is_elem ? WJB_ELEM : WJB_VALUE, &out) - : memcpy(palloc(sizeof(JsonbValue)), &out, sizeof(JsonbValue)); + if (jsonb_state->parseState) + { + /* We're in an array or object, so push value as element or field. */ + pushJsonbValue(jsonb_state, is_elem ? WJB_ELEM : WJB_VALUE, &out); + } + else + { + /* + * We are at top level, so it's a raw scalar. If we just shove the + * scalar value into jsonb_state->result, JsonbValueToJsonb will take + * care of wrapping it into a dummy array. 
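The jsonb hunks in this patch (hstore_to_jsonb earlier, this Perl transform, and the Python transform below) all converge on one new calling convention: a zero-initialized JsonbInState replaces the JsonbParseState pointer, pushJsonbValue()'s return value is no longer captured, and the finished tree is read from the state's result field. A minimal sketch of the pattern, assuming the JsonbInState API this patch introduces (a struct carrying parseState and result):

    JsonbInState state = {0};
    JsonbValue  key;

    key.type = jbvString;
    key.val.string.val = "k";
    key.val.string.len = 1;

    pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
    pushJsonbValue(&state, WJB_KEY, &key);
    /* ... one WJB_VALUE push per field ... */
    pushJsonbValue(&state, WJB_END_OBJECT, NULL);

    /* the accumulated tree is left in state.result */
    PG_RETURN_JSONB_P(JsonbValueToJsonb(state.result));

The top-level-scalar branch in the surrounding hunk exists because nothing is ever pushed in that case: parseState stays NULL, so the scalar is stored directly in result and JsonbValueToJsonb() wraps it.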
+ */ + jsonb_state->result = palloc_object(JsonbValue); + memcpy(jsonb_state->result, &out, sizeof(JsonbValue)); + } } @@ -289,10 +301,9 @@ Datum plperl_to_jsonb(PG_FUNCTION_ARGS) { dTHX; - JsonbParseState *jsonb_state = NULL; SV *in = (SV *) PG_GETARG_POINTER(0); - JsonbValue *out = SV_to_JsonbValue(in, &jsonb_state, true); - Jsonb *result = JsonbValueToJsonb(out); + JsonbInState jsonb_state = {0}; - PG_RETURN_JSONB_P(result); + SV_to_JsonbValue(in, &jsonb_state, true); + PG_RETURN_JSONB_P(JsonbValueToJsonb(jsonb_state.result)); } diff --git a/contrib/jsonb_plpython/jsonb_plpython.c b/contrib/jsonb_plpython/jsonb_plpython.c index 9383615abbfa3..7e8e1d6674f6e 100644 --- a/contrib/jsonb_plpython/jsonb_plpython.c +++ b/contrib/jsonb_plpython/jsonb_plpython.c @@ -26,8 +26,8 @@ static PLy_elog_impl_t PLy_elog_impl_p; static PyObject *decimal_constructor; static PyObject *PLyObject_FromJsonbContainer(JsonbContainer *jsonb); -static JsonbValue *PLyObject_ToJsonbValue(PyObject *obj, - JsonbParseState **jsonb_state, bool is_elem); +static void PLyObject_ToJsonbValue(PyObject *obj, + JsonbInState *jsonb_state, bool is_elem); typedef PyObject *(*PLyUnicode_FromStringAndSize_t) (const char *s, Py_ssize_t size); @@ -261,12 +261,11 @@ PLyObject_FromJsonbContainer(JsonbContainer *jsonb) * * Transform Python dict to JsonbValue. */ -static JsonbValue * -PLyMapping_ToJsonbValue(PyObject *obj, JsonbParseState **jsonb_state) +static void +PLyMapping_ToJsonbValue(PyObject *obj, JsonbInState *jsonb_state) { Py_ssize_t pcount; PyObject *volatile items; - JsonbValue *volatile out; pcount = PyMapping_Size(obj); items = PyMapping_Items(obj); @@ -297,19 +296,17 @@ PLyMapping_ToJsonbValue(PyObject *obj, JsonbParseState **jsonb_state) PLyUnicode_ToJsonbValue(key, &jbvKey); } - (void) pushJsonbValue(jsonb_state, WJB_KEY, &jbvKey); - (void) PLyObject_ToJsonbValue(value, jsonb_state, false); + pushJsonbValue(jsonb_state, WJB_KEY, &jbvKey); + PLyObject_ToJsonbValue(value, jsonb_state, false); } - out = pushJsonbValue(jsonb_state, WJB_END_OBJECT, NULL); + pushJsonbValue(jsonb_state, WJB_END_OBJECT, NULL); } PG_FINALLY(); { Py_DECREF(items); } PG_END_TRY(); - - return out; } /* @@ -318,8 +315,8 @@ PLyMapping_ToJsonbValue(PyObject *obj, JsonbParseState **jsonb_state) * Transform python list to JsonbValue. Expects transformed PyObject and * a state required for jsonb construction. */ -static JsonbValue * -PLySequence_ToJsonbValue(PyObject *obj, JsonbParseState **jsonb_state) +static void +PLySequence_ToJsonbValue(PyObject *obj, JsonbInState *jsonb_state) { Py_ssize_t i; Py_ssize_t pcount; @@ -336,7 +333,7 @@ PLySequence_ToJsonbValue(PyObject *obj, JsonbParseState **jsonb_state) value = PySequence_GetItem(obj, i); Assert(value); - (void) PLyObject_ToJsonbValue(value, jsonb_state, true); + PLyObject_ToJsonbValue(value, jsonb_state, true); Py_XDECREF(value); value = NULL; } @@ -348,7 +345,7 @@ PLySequence_ToJsonbValue(PyObject *obj, JsonbParseState **jsonb_state) } PG_END_TRY(); - return pushJsonbValue(jsonb_state, WJB_END_ARRAY, NULL); + pushJsonbValue(jsonb_state, WJB_END_ARRAY, NULL); } /* @@ -406,20 +403,26 @@ PLyNumber_ToJsonbValue(PyObject *obj, JsonbValue *jbvNum) * * Transform python object to JsonbValue. 
*/ -static JsonbValue * -PLyObject_ToJsonbValue(PyObject *obj, JsonbParseState **jsonb_state, bool is_elem) +static void +PLyObject_ToJsonbValue(PyObject *obj, JsonbInState *jsonb_state, bool is_elem) { JsonbValue *out; if (!PyUnicode_Check(obj)) { if (PySequence_Check(obj)) - return PLySequence_ToJsonbValue(obj, jsonb_state); + { + PLySequence_ToJsonbValue(obj, jsonb_state); + return; + } else if (PyMapping_Check(obj)) - return PLyMapping_ToJsonbValue(obj, jsonb_state); + { + PLyMapping_ToJsonbValue(obj, jsonb_state); + return; + } } - out = palloc(sizeof(JsonbValue)); + out = palloc_object(JsonbValue); if (obj == Py_None) out->type = jbvNull; @@ -443,10 +446,20 @@ PLyObject_ToJsonbValue(PyObject *obj, JsonbParseState **jsonb_state, bool is_ele errmsg("Python type \"%s\" cannot be transformed to jsonb", PLyObject_AsString((PyObject *) obj->ob_type)))); - /* Push result into 'jsonb_state' unless it is raw scalar value. */ - return (*jsonb_state ? - pushJsonbValue(jsonb_state, is_elem ? WJB_ELEM : WJB_VALUE, out) : - out); + if (jsonb_state->parseState) + { + /* We're in an array or object, so push value as element or field. */ + pushJsonbValue(jsonb_state, is_elem ? WJB_ELEM : WJB_VALUE, out); + } + else + { + /* + * We are at top level, so it's a raw scalar. If we just shove the + * scalar value into jsonb_state->result, JsonbValueToJsonb will take + * care of wrapping it into a dummy array. + */ + jsonb_state->result = out; + } } /* @@ -458,13 +471,11 @@ PG_FUNCTION_INFO_V1(plpython_to_jsonb); Datum plpython_to_jsonb(PG_FUNCTION_ARGS) { - PyObject *obj; - JsonbValue *out; - JsonbParseState *jsonb_state = NULL; + PyObject *obj = (PyObject *) PG_GETARG_POINTER(0); + JsonbInState jsonb_state = {0}; - obj = (PyObject *) PG_GETARG_POINTER(0); - out = PLyObject_ToJsonbValue(obj, &jsonb_state, true); - PG_RETURN_POINTER(JsonbValueToJsonb(out)); + PLyObject_ToJsonbValue(obj, &jsonb_state, true); + PG_RETURN_POINTER(JsonbValueToJsonb(jsonb_state.result)); } /* diff --git a/contrib/ltree/_ltree_gist.c b/contrib/ltree/_ltree_gist.c index 286ad24fbe847..ceb92a6304da2 100644 --- a/contrib/ltree/_ltree_gist.c +++ b/contrib/ltree/_ltree_gist.c @@ -79,12 +79,12 @@ _ltree_compress(PG_FUNCTION_ARGS) item = NEXTVAL(item); } - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(key), entry->rel, entry->page, entry->offset, false); } - else if (!LTG_ISALLTRUE(entry->key)) + else if (!LTG_ISALLTRUE(DatumGetPointer(entry->key))) { int32 i; ltree_gist *key; @@ -97,7 +97,7 @@ _ltree_compress(PG_FUNCTION_ARGS) } key = ltree_gist_alloc(true, sign, siglen, NULL, NULL); - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(key), entry->rel, entry->page, entry->offset, false); @@ -310,7 +310,7 @@ _ltree_picksplit(PG_FUNCTION_ARGS) maxoff = OffsetNumberNext(maxoff); /* sort before ... 
*/ - costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + costvector = palloc_array(SPLITCOST, maxoff); for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) { costvector[j - 1].pos = j; diff --git a/contrib/ltree/_ltree_op.c b/contrib/ltree/_ltree_op.c index b4a8097328d3a..4d54ad34bb69f 100644 --- a/contrib/ltree/_ltree_op.c +++ b/contrib/ltree/_ltree_op.c @@ -307,7 +307,7 @@ _lca(PG_FUNCTION_ARGS) (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("array must not contain nulls"))); - a = (ltree **) palloc(sizeof(ltree *) * num); + a = palloc_array(ltree *, num); while (num > 0) { num--; diff --git a/contrib/ltree/crc32.c b/contrib/ltree/crc32.c index 134f46a805e56..3918d4a0ec251 100644 --- a/contrib/ltree/crc32.c +++ b/contrib/ltree/crc32.c @@ -10,31 +10,61 @@ #include "postgres.h" #include "ltree.h" +#include "crc32.h" +#include "utils/pg_crc.h" #ifdef LOWER_NODE -#include <ctype.h> -#define TOLOWER(x) tolower((unsigned char) (x)) -#else -#define TOLOWER(x) (x) +#include "utils/pg_locale.h" #endif -#include "crc32.h" -#include "utils/pg_crc.h" +#ifdef LOWER_NODE unsigned int ltree_crc32_sz(const char *buf, int size) { pg_crc32 crc; const char *p = buf; + static pg_locale_t locale = NULL; + + if (!locale) + locale = pg_database_locale(); INIT_TRADITIONAL_CRC32(crc); while (size > 0) { - char c = (char) TOLOWER(*p); + char foldstr[UNICODE_CASEMAP_BUFSZ]; + int srclen = pg_mblen(p); + size_t foldlen; + + /* fold one codepoint at a time */ + foldlen = pg_strfold(foldstr, UNICODE_CASEMAP_BUFSZ, p, srclen, + locale); + + COMP_TRADITIONAL_CRC32(crc, foldstr, foldlen); + + size -= srclen; + p += srclen; + } + FIN_TRADITIONAL_CRC32(crc); + return (unsigned int) crc; +} + +#else - COMP_TRADITIONAL_CRC32(crc, &c, 1); +unsigned int +ltree_crc32_sz(const char *buf, int size) +{ + pg_crc32 crc; + const char *p = buf; + + INIT_TRADITIONAL_CRC32(crc); + while (size > 0) + { + COMP_TRADITIONAL_CRC32(crc, p, 1); size--; p++; } FIN_TRADITIONAL_CRC32(crc); return (unsigned int) crc; } + +#endif /* !LOWER_NODE */ diff --git a/contrib/ltree/expected/ltree.out b/contrib/ltree/expected/ltree.out index c8eac3f6b21bc..d2a566284755b 100644 --- a/contrib/ltree/expected/ltree.out +++ b/contrib/ltree/expected/ltree.out @@ -128,6 +128,8 @@ SELECT subpath('Top.Child1.Child2',1); Child1.Child2 (1 row) +SELECT subpath('Top.Child1.Child2',-4); -- error +ERROR: invalid positions SELECT index('1.2.3.4.5.6','1.2'); index ------- diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c index a6466f575fd7d..a28ddbf40de34 100644 --- a/contrib/ltree/lquery_op.c +++ b/contrib/ltree/lquery_op.c @@ -41,7 +41,8 @@ getlexeme(char *start, char *end, int *len) } bool -compare_subnode(ltree_level *t, char *qn, int len, int (*cmpptr) (const char *, const char *, size_t), bool anyend) +compare_subnode(ltree_level *t, char *qn, int len, + ltree_prefix_eq_func prefix_eq, bool anyend) { char *endt = t->name + t->len; char *endq = qn + len; @@ -57,7 +58,7 @@ compare_subnode(ltree_level *t, char *qn, int len, int (*cmpptr) (const char *, while ((tn = getlexeme(tn, endt, &lent)) != NULL) { if ((lent == lenq || (lent > lenq && anyend)) && - (*cmpptr) (qn, tn, lenq) == 0) + (*prefix_eq) (qn, lenq, tn, lent)) { isok = true; @@ -74,14 +75,62 @@ compare_subnode(ltree_level *t, char *qn, int len, int (*cmpptr) (const char *, return true; } -int -ltree_strncasecmp(const char *a, const char *b, size_t s) +/* + * Check if 'a' is a prefix of 'b'. 
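Both the crc32.c hunk above and the case-insensitive comparator that follows lean on pg_strfold(), which, like snprintf(), returns the byte length the folded string requires even when the destination buffer is too small. The patch open-codes the resulting grow-and-retry idiom twice; condensed into a standalone helper it looks like this (the helper name is hypothetical; pg_strfold()'s signature is as used in these hunks):

    static char *
    casefold_copy(const char *src, size_t srclen, pg_locale_t locale)
    {
        size_t  bufsz = srclen + 1;     /* optimistic first guess */
        char   *buf = palloc(bufsz);
        size_t  len = pg_strfold(buf, bufsz, src, srclen, locale);

        if (len + 1 > bufsz)
        {
            /* folding grew the string; enlarge and fold again */
            bufsz = len + 1;
            buf = repalloc(buf, bufsz);
            len = pg_strfold(buf, bufsz, src, srclen, locale);
            Assert(len + 1 <= bufsz);
        }
        return buf;
    }

Case folding can expand a string (some code points fold to multi-codepoint sequences), which is why the retry is needed at all.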
+ */ +bool +ltree_prefix_eq(const char *a, size_t a_sz, const char *b, size_t b_sz) +{ + if (a_sz > b_sz) + return false; + else + return (strncmp(a, b, a_sz) == 0); +} + +/* + * Case-insensitive check if 'a' is a prefix of 'b'. + */ +bool +ltree_prefix_eq_ci(const char *a, size_t a_sz, const char *b, size_t b_sz) { - char *al = str_tolower(a, s, DEFAULT_COLLATION_OID); - char *bl = str_tolower(b, s, DEFAULT_COLLATION_OID); - int res; + static pg_locale_t locale = NULL; + size_t al_sz = a_sz + 1; + size_t al_len; + char *al = palloc(al_sz); + size_t bl_sz = b_sz + 1; + size_t bl_len; + char *bl = palloc(bl_sz); + bool res; + + if (!locale) + locale = pg_database_locale(); + + /* casefold both a and b */ + + al_len = pg_strfold(al, al_sz, a, a_sz, locale); + if (al_len + 1 > al_sz) + { + /* grow buffer if needed and retry */ + al_sz = al_len + 1; + al = repalloc(al, al_sz); + al_len = pg_strfold(al, al_sz, a, a_sz, locale); + Assert(al_len + 1 <= al_sz); + } + + bl_len = pg_strfold(bl, bl_sz, b, b_sz, locale); + if (bl_len + 1 > bl_sz) + { + /* grow buffer if needed and retry */ + bl_sz = bl_len + 1; + bl = repalloc(bl, bl_sz); + bl_len = pg_strfold(bl, bl_sz, b, b_sz, locale); + Assert(bl_len + 1 <= bl_sz); + } - res = strncmp(al, bl, s); + if (al_len > bl_len) + res = false; + else + res = (strncmp(al, bl, al_len) == 0); pfree(al); pfree(bl); @@ -109,19 +158,19 @@ checkLevel(lquery_level *curq, ltree_level *curt) for (int i = 0; i < curq->numvar; i++) { - int (*cmpptr) (const char *, const char *, size_t); + ltree_prefix_eq_func prefix_eq; - cmpptr = (curvar->flag & LVAR_INCASE) ? ltree_strncasecmp : strncmp; + prefix_eq = (curvar->flag & LVAR_INCASE) ? ltree_prefix_eq_ci : ltree_prefix_eq; if (curvar->flag & LVAR_SUBLEXEME) { - if (compare_subnode(curt, curvar->name, curvar->len, cmpptr, + if (compare_subnode(curt, curvar->name, curvar->len, prefix_eq, (curvar->flag & LVAR_ANYEND))) return success; } else if ((curvar->len == curt->len || (curt->len > curvar->len && (curvar->flag & LVAR_ANYEND))) && - (*cmpptr) (curvar->name, curt->name, curvar->len) == 0) + (*prefix_eq) (curvar->name, curvar->len, curt->name, curt->len)) return success; curvar = LVAR_NEXT(curvar); diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h index 5e0761641d32a..78478dec173d4 100644 --- a/contrib/ltree/ltree.h +++ b/contrib/ltree/ltree.h @@ -157,6 +157,8 @@ typedef struct char data[FLEXIBLE_ARRAY_MEMBER]; } ltxtquery; +typedef bool (*ltree_prefix_eq_func) (const char *, size_t, const char *, size_t); + #define HDRSIZEQT MAXALIGN(VARHDRSZ + sizeof(int32)) #define COMPUTESIZE(size,lenofoperand) ( HDRSIZEQT + (size) * sizeof(ITEM) + (lenofoperand) ) #define LTXTQUERY_TOO_BIG(size,lenofoperand) \ @@ -208,9 +210,10 @@ bool ltree_execute(ITEM *curitem, void *checkval, int ltree_compare(const ltree *a, const ltree *b); bool inner_isparent(const ltree *c, const ltree *p); bool compare_subnode(ltree_level *t, char *qn, int len, - int (*cmpptr) (const char *, const char *, size_t), bool anyend); + ltree_prefix_eq_func prefix_eq, bool anyend); ltree *lca_inner(ltree **a, int len); -int ltree_strncasecmp(const char *a, const char *b, size_t s); +bool ltree_prefix_eq(const char *a, size_t a_sz, const char *b, size_t b_sz); +bool ltree_prefix_eq_ci(const char *a, size_t a_sz, const char *b, size_t b_sz); /* fmgr macros for ltree objects */ #define DatumGetLtreeP(X) ((ltree *) PG_DETOAST_DATUM(X)) diff --git a/contrib/ltree/ltree_gist.c b/contrib/ltree/ltree_gist.c index 932f69bff2d18..bb7f46347221e 100644 --- 
a/contrib/ltree/ltree_gist.c +++ b/contrib/ltree/ltree_gist.c @@ -101,7 +101,7 @@ ltree_compress(PG_FUNCTION_ARGS) ltree *val = DatumGetLtreeP(entry->key); ltree_gist *key = ltree_gist_alloc(false, NULL, 0, val, 0); - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(key), entry->rel, entry->page, entry->offset, false); @@ -117,7 +117,7 @@ ltree_decompress(PG_FUNCTION_ARGS) if (PointerGetDatum(key) != entry->key) { - GISTENTRY *retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + GISTENTRY *retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(key), entry->rel, entry->page, @@ -318,7 +318,7 @@ ltree_picksplit(PG_FUNCTION_ARGS) v->spl_right = (OffsetNumber *) palloc(nbytes); v->spl_nleft = 0; v->spl_nright = 0; - array = (RIX *) palloc(sizeof(RIX) * (maxoff + 1)); + array = palloc_array(RIX, maxoff + 1); /* copy the data into RIXes, and sort the RIXes */ for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) diff --git a/contrib/ltree/ltree_io.c b/contrib/ltree/ltree_io.c index b54a15d6c685e..59c4462df8061 100644 --- a/contrib/ltree/ltree_io.c +++ b/contrib/ltree/ltree_io.c @@ -65,7 +65,7 @@ parse_ltree(const char *buf, struct Node *escontext) (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("number of ltree labels (%d) exceeds the maximum allowed (%d)", num + 1, LTREE_MAX_LEVELS))); - list = lptr = (nodeitem *) palloc(sizeof(nodeitem) * (num + 1)); + list = lptr = palloc_array(nodeitem, num + 1); ptr = buf; while (*ptr) { @@ -318,14 +318,14 @@ parse_lquery(const char *buf, struct Node *escontext) case LQPRS_WAITLEVEL: if (ISLABEL(ptr)) { - GETVAR(curqlevel) = lptr = (nodeitem *) palloc0(sizeof(nodeitem) * (numOR + 1)); + GETVAR(curqlevel) = lptr = palloc0_array(nodeitem, numOR + 1); lptr->start = ptr; state = LQPRS_WAITDELIM; curqlevel->numvar = 1; } else if (t_iseq(ptr, '!')) { - GETVAR(curqlevel) = lptr = (nodeitem *) palloc0(sizeof(nodeitem) * (numOR + 1)); + GETVAR(curqlevel) = lptr = palloc0_array(nodeitem, numOR + 1); lptr->start = ptr + 1; lptr->wlen = -1; /* compensate for counting ! below */ state = LQPRS_WAITDELIM; diff --git a/contrib/ltree/ltree_op.c b/contrib/ltree/ltree_op.c index ce9f4caad4feb..c1fc77fc804c0 100644 --- a/contrib/ltree/ltree_op.c +++ b/contrib/ltree/ltree_op.c @@ -316,23 +316,15 @@ subpath(PG_FUNCTION_ARGS) int32 end; ltree *res; - end = start + len; - - if (start < 0) - { - start = t->numlevel + start; - end = start + len; - } if (start < 0) - { /* start > t->numlevel */ start = t->numlevel + start; - end = start + len; - } if (len < 0) end = t->numlevel + len; else if (len == 0) - end = (fcinfo->nargs == 3) ? start : 0xffff; + end = (fcinfo->nargs == 3) ? 
start : LTREE_MAX_LEVELS; + else + end = start + len; res = inner_subltree(t, start, end); @@ -574,7 +566,7 @@ lca(PG_FUNCTION_ARGS) ltree **a, *res; - a = (ltree **) palloc(sizeof(ltree *) * fcinfo->nargs); + a = palloc_array(ltree *, fcinfo->nargs); for (i = 0; i < fcinfo->nargs; i++) a[i] = PG_GETARG_LTREE_P(i); res = lca_inner(a, (int) fcinfo->nargs); diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c index ec331607793a7..3a2aa223c3ed5 100644 --- a/contrib/ltree/ltxtquery_io.c +++ b/contrib/ltree/ltxtquery_io.c @@ -154,7 +154,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint static bool pushquery(QPRS_STATE *state, int32 type, int32 val, int32 distance, int32 lenval, uint16 flag) { - NODE *tmp = (NODE *) palloc(sizeof(NODE)); + NODE *tmp = palloc_object(NODE); tmp->type = type; tmp->val = val; @@ -543,7 +543,7 @@ infix(INFIX *in, bool first) nrm.curpol = in->curpol; nrm.op = in->op; nrm.buflen = 16; - nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + nrm.cur = nrm.buf = palloc_array(char, nrm.buflen); /* get right operand */ infix(&nrm, false); @@ -582,7 +582,7 @@ ltxtq_out(PG_FUNCTION_ARGS) nrm.curpol = GETQUERY(query); nrm.buflen = 32; - nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + nrm.cur = nrm.buf = palloc_array(char, nrm.buflen); *(nrm.cur) = '\0'; nrm.op = GETOPERAND(query); infix(&nrm, true); @@ -615,7 +615,7 @@ ltxtq_send(PG_FUNCTION_ARGS) nrm.curpol = GETQUERY(query); nrm.buflen = 32; - nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + nrm.cur = nrm.buf = palloc_array(char, nrm.buflen); *(nrm.cur) = '\0'; nrm.op = GETOPERAND(query); infix(&nrm, true); diff --git a/contrib/ltree/ltxtquery_op.c b/contrib/ltree/ltxtquery_op.c index 002102c9c75b6..3dcbab2c48460 100644 --- a/contrib/ltree/ltxtquery_op.c +++ b/contrib/ltree/ltxtquery_op.c @@ -58,19 +58,19 @@ checkcondition_str(void *checkval, ITEM *val) ltree_level *level = LTREE_FIRST(((CHKVAL *) checkval)->node); int tlen = ((CHKVAL *) checkval)->node->numlevel; char *op = ((CHKVAL *) checkval)->operand + val->distance; - int (*cmpptr) (const char *, const char *, size_t); + ltree_prefix_eq_func prefix_eq; - cmpptr = (val->flag & LVAR_INCASE) ? ltree_strncasecmp : strncmp; + prefix_eq = (val->flag & LVAR_INCASE) ? 
ltree_prefix_eq_ci : ltree_prefix_eq; while (tlen > 0) { if (val->flag & LVAR_SUBLEXEME) { - if (compare_subnode(level, op, val->length, cmpptr, (val->flag & LVAR_ANYEND))) + if (compare_subnode(level, op, val->length, prefix_eq, (val->flag & LVAR_ANYEND))) return true; } else if ((val->length == level->len || (level->len > val->length && (val->flag & LVAR_ANYEND))) && - (*cmpptr) (op, level->name, val->length) == 0) + (*prefix_eq) (op, val->length, level->name, level->len)) return true; tlen--; diff --git a/contrib/ltree/sql/ltree.sql b/contrib/ltree/sql/ltree.sql index dd705d9d7ca00..77e6958c62a7b 100644 --- a/contrib/ltree/sql/ltree.sql +++ b/contrib/ltree/sql/ltree.sql @@ -34,6 +34,7 @@ SELECT subpath('Top.Child1.Child2',0,0); SELECT subpath('Top.Child1.Child2',1,0); SELECT subpath('Top.Child1.Child2',0); SELECT subpath('Top.Child1.Child2',1); +SELECT subpath('Top.Child1.Child2',-4); -- error SELECT index('1.2.3.4.5.6','1.2'); diff --git a/contrib/pageinspect/brinfuncs.c b/contrib/pageinspect/brinfuncs.c index 990c965aa9241..b7dcc8ac8a967 100644 --- a/contrib/pageinspect/brinfuncs.c +++ b/contrib/pageinspect/brinfuncs.c @@ -186,7 +186,7 @@ brin_page_items(PG_FUNCTION_ARGS) * Initialize output functions for all indexed datatypes; simplifies * calling them later. */ - columns = palloc(sizeof(brin_column_state *) * RelationGetDescr(indexRel)->natts); + columns = palloc_array(brin_column_state *, RelationGetDescr(indexRel)->natts); for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++) { Oid output; diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 294821231fc3b..62c905c6e7c2c 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -27,6 +27,7 @@ #include "postgres.h" +#include "access/htup_details.h" #include "access/nbtree.h" #include "access/relation.h" #include "catalog/namespace.h" @@ -378,7 +379,7 @@ bt_multi_page_stats(PG_FUNCTION_ARGS) /* Save arguments for reuse */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); - uargs = palloc(sizeof(ua_page_stats)); + uargs = palloc_object(ua_page_stats); uargs->relid = RelationGetRelid(rel); uargs->blkno = blkno; @@ -506,7 +507,7 @@ bt_page_print_tuples(ua_page_items *uargs) j = 0; memset(nulls, 0, sizeof(nulls)); - values[j++] = DatumGetInt16(offset); + values[j++] = Int16GetDatum(offset); values[j++] = ItemPointerGetDatum(&itup->t_tid); values[j++] = Int32GetDatum((int) IndexTupleSize(itup)); values[j++] = BoolGetDatum(IndexTupleHasNulls(itup)); @@ -659,7 +660,7 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version) */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); - uargs = palloc(sizeof(ua_page_items)); + uargs = palloc_object(ua_page_items); uargs->page = palloc(BLCKSZ); memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ); @@ -751,7 +752,7 @@ bt_page_items_bytea(PG_FUNCTION_ARGS) fctx = SRF_FIRSTCALL_INIT(); mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); - uargs = palloc(sizeof(ua_page_items)); + uargs = palloc_object(ua_page_items); uargs->page = get_page_from_raw(raw_page); @@ -900,10 +901,10 @@ bt_metap(PG_FUNCTION_ARGS) j = 0; values[j++] = psprintf("%d", metad->btm_magic); values[j++] = psprintf("%d", metad->btm_version); - values[j++] = psprintf(INT64_FORMAT, (int64) metad->btm_root); - values[j++] = psprintf(INT64_FORMAT, (int64) metad->btm_level); - values[j++] = psprintf(INT64_FORMAT, (int64) metad->btm_fastroot); - values[j++] = psprintf(INT64_FORMAT, (int64) metad->btm_fastlevel); + values[j++] 
= psprintf("%u", metad->btm_root); + values[j++] = psprintf("%u", metad->btm_level); + values[j++] = psprintf("%u", metad->btm_fastroot); + values[j++] = psprintf("%u", metad->btm_fastlevel); /* * Get values of extended metadata if available, use default values @@ -913,8 +914,7 @@ bt_metap(PG_FUNCTION_ARGS) */ if (metad->btm_version >= BTREE_NOVAC_VERSION) { - values[j++] = psprintf(INT64_FORMAT, - (int64) metad->btm_last_cleanup_num_delpages); + values[j++] = psprintf("%u", metad->btm_last_cleanup_num_delpages); values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples); values[j++] = metad->btm_allequalimage ? "t" : "f"; } diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out index 2b1d54a627949..8502f9efb4190 100644 --- a/contrib/pageinspect/expected/gist.out +++ b/contrib/pageinspect/expected/gist.out @@ -5,21 +5,21 @@ CREATE UNLOGGED TABLE test_gist AS SELECT point(i,i) p, i::text t FROM CREATE INDEX test_gist_idx ON test_gist USING gist (p); -- Page 0 is the root, the rest are leaf pages SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); - lsn | nsn | rightlink | flags ------+-----+------------+------- - 0/1 | 0/0 | 4294967295 | {} + lsn | nsn | rightlink | flags +------------+------------+------------+------- + 0/00000001 | 0/00000000 | 4294967295 | {} (1 row) SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); - lsn | nsn | rightlink | flags ------+-----+------------+-------- - 0/1 | 0/0 | 4294967295 | {leaf} + lsn | nsn | rightlink | flags +------------+------------+------------+-------- + 0/00000001 | 0/00000000 | 4294967295 | {leaf} (1 row) SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); - lsn | nsn | rightlink | flags ------+-----+-----------+-------- - 0/1 | 0/0 | 1 | {leaf} + lsn | nsn | rightlink | flags +------------+------------+-----------+-------- + 0/00000001 | 0/00000000 | 1 | {leaf} (1 row) SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index e42fd9747fd1c..fcf19c5ca5a50 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -265,9 +265,9 @@ SELECT fsm_page_contents(decode(repeat('00', :block_size), 'hex')); (1 row) SELECT page_header(decode(repeat('00', :block_size), 'hex')); - page_header ------------------------ - (0/0,0,0,0,0,0,0,0,0) + page_header +------------------------------ + (0/00000000,0,0,0,0,0,0,0,0) (1 row) SELECT page_checksum(decode(repeat('00', :block_size), 'hex'), 1); diff --git a/contrib/pageinspect/ginfuncs.c b/contrib/pageinspect/ginfuncs.c index 09a90957081f1..f6168d8e8953c 100644 --- a/contrib/pageinspect/ginfuncs.c +++ b/contrib/pageinspect/ginfuncs.c @@ -222,7 +222,7 @@ gin_leafpage_items(PG_FUNCTION_ARGS) opaq->flags, (GIN_DATA | GIN_LEAF | GIN_COMPRESSED)))); - inter_call_data = palloc(sizeof(gin_leafpage_items_state)); + inter_call_data = palloc_object(gin_leafpage_items_state); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) diff --git a/contrib/pageinspect/gistfuncs.c b/contrib/pageinspect/gistfuncs.c index 7b16e2a1ef33c..414513c395b6a 100644 --- a/contrib/pageinspect/gistfuncs.c +++ b/contrib/pageinspect/gistfuncs.c @@ -11,6 +11,7 @@ #include "access/gist.h" #include "access/htup.h" +#include "access/htup_details.h" #include "access/relation.h" #include "catalog/pg_am_d.h" #include 
"funcapi.h" @@ -174,7 +175,7 @@ gist_page_items_bytea(PG_FUNCTION_ARGS) memset(nulls, 0, sizeof(nulls)); - values[0] = DatumGetInt16(offset); + values[0] = Int16GetDatum(offset); values[1] = ItemPointerGetDatum(&itup->t_tid); values[2] = Int32GetDatum((int) IndexTupleSize(itup)); @@ -281,7 +282,7 @@ gist_page_items(PG_FUNCTION_ARGS) memset(nulls, 0, sizeof(nulls)); - values[0] = DatumGetInt16(offset); + values[0] = Int16GetDatum(offset); values[1] = ItemPointerGetDatum(&itup->t_tid); values[2] = Int32GetDatum((int) IndexTupleSize(itup)); values[3] = BoolGetDatum(ItemIdIsDead(id)); @@ -360,7 +361,7 @@ gist_page_items(PG_FUNCTION_ARGS) tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } - relation_close(indexRel, AccessShareLock); + index_close(indexRel, AccessShareLock); return (Datum) 0; } diff --git a/contrib/pageinspect/hashfuncs.c b/contrib/pageinspect/hashfuncs.c index ca7f1f6e7410d..0e898889fa513 100644 --- a/contrib/pageinspect/hashfuncs.c +++ b/contrib/pageinspect/hashfuncs.c @@ -325,7 +325,7 @@ hash_page_items(PG_FUNCTION_ARGS) page = verify_hash_page(raw_page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); - uargs = palloc(sizeof(struct user_args)); + uargs = palloc_object(struct user_args); uargs->page = page; diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index 377ae30d1fa71..ca7bffdbea5fd 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -154,7 +154,7 @@ heap_page_items(PG_FUNCTION_ARGS) fctx = SRF_FIRSTCALL_INIT(); mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); - inter_call_data = palloc(sizeof(heap_page_items_state)); + inter_call_data = palloc_object(heap_page_items_state); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) @@ -256,7 +256,7 @@ heap_page_items(PG_FUNCTION_ARGS) nulls[11] = true; if (tuphdr->t_infomask & HEAP_HASOID_OLD) - values[12] = HeapTupleHeaderGetOidOld(tuphdr); + values[12] = ObjectIdGetDatum(HeapTupleHeaderGetOidOld(tuphdr)); else nulls[12] = true; @@ -553,7 +553,7 @@ heap_tuple_infomask_flags(PG_FUNCTION_ARGS) } /* build set of raw flags */ - flags = (Datum *) palloc0(sizeof(Datum) * bitcnt); + flags = palloc0_array(Datum, bitcnt); /* decode t_infomask */ if ((t_infomask & HEAP_HASNULL) != 0) diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c index 0d57123aa2669..aef442b5db30a 100644 --- a/contrib/pageinspect/rawpage.c +++ b/contrib/pageinspect/rawpage.c @@ -282,7 +282,7 @@ page_header(PG_FUNCTION_ARGS) { char lsnchar[64]; - snprintf(lsnchar, sizeof(lsnchar), "%X/%X", LSN_FORMAT_ARGS(lsn)); + snprintf(lsnchar, sizeof(lsnchar), "%X/%08X", LSN_FORMAT_ARGS(lsn)); values[0] = CStringGetTextDatum(lsnchar); } else diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index 5f748543e2ea2..0e618f66aec6e 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -9,7 +9,7 @@ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ - pg_buffercache--1.5--1.6.sql + pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" REGRESS = pg_buffercache pg_buffercache_numa diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index 
9a9216dc7b1bf..886dea770f626 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -8,6 +8,16 @@ from pg_buffercache; t (1 row) +-- For pg_buffercache_os_pages, we expect at least one entry for each buffer +select count(*) >= (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_os_pages; + ?column? +---------- + t +(1 row) + select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, buffers_pinned <= buffers_used @@ -28,6 +38,8 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; SET ROLE pg_database_owner; SELECT * FROM pg_buffercache; ERROR: permission denied for view pg_buffercache +SELECT * FROM pg_buffercache_os_pages; +ERROR: permission denied for view pg_buffercache_os_pages SELECT * FROM pg_buffercache_pages() AS p (wrong int); ERROR: permission denied for function pg_buffercache_pages SELECT * FROM pg_buffercache_summary(); @@ -43,6 +55,12 @@ SELECT count(*) > 0 FROM pg_buffercache; t (1 row) +SELECT count(*) > 0 FROM pg_buffercache_os_pages; + ?column? +---------- + t +(1 row) + SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary(); ?column? ---------- @@ -57,7 +75,7 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); RESET role; ------ ----- Test pg_buffercache_evict* functions +---- Test pg_buffercache_evict* and pg_buffercache_mark_dirty* functions ------ CREATE ROLE regress_buffercache_normal; SET ROLE regress_buffercache_normal; @@ -68,6 +86,12 @@ SELECT * FROM pg_buffercache_evict_relation(1); ERROR: must be superuser to use pg_buffercache_evict_relation() SELECT * FROM pg_buffercache_evict_all(); ERROR: must be superuser to use pg_buffercache_evict_all() +SELECT * FROM pg_buffercache_mark_dirty(1); +ERROR: must be superuser to use pg_buffercache_mark_dirty() +SELECT * FROM pg_buffercache_mark_dirty_relation(1); +ERROR: must be superuser to use pg_buffercache_mark_dirty_relation() +SELECT * FROM pg_buffercache_mark_dirty_all(); +ERROR: must be superuser to use pg_buffercache_mark_dirty_all() RESET ROLE; -- These should return nothing, because these are STRICT functions SELECT * FROM pg_buffercache_evict(NULL); @@ -82,6 +106,18 @@ SELECT * FROM pg_buffercache_evict_relation(NULL); | | (1 row) +SELECT * FROM pg_buffercache_mark_dirty(NULL); + buffer_dirtied | buffer_already_dirty +----------------+---------------------- + | +(1 row) + +SELECT * FROM pg_buffercache_mark_dirty_relation(NULL); + buffers_dirtied | buffers_already_dirty | buffers_skipped +-----------------+-----------------------+----------------- + | | +(1 row) + -- These should fail because they are not called by valid range of buffers -- Number of the shared buffers are limited by max integer SELECT 2147483647 max_buffers \gset @@ -91,11 +127,18 @@ SELECT * FROM pg_buffercache_evict(0); ERROR: bad buffer ID: 0 SELECT * FROM pg_buffercache_evict(:max_buffers); ERROR: bad buffer ID: 2147483647 --- This should fail because pg_buffercache_evict_relation() doesn't accept --- local relations +SELECT * FROM pg_buffercache_mark_dirty(-1); +ERROR: bad buffer ID: -1 +SELECT * FROM pg_buffercache_mark_dirty(0); +ERROR: bad buffer ID: 0 +SELECT * FROM pg_buffercache_mark_dirty(:max_buffers); +ERROR: bad buffer ID: 2147483647 +-- These should fail because they don't accept local relations CREATE TEMP TABLE temp_pg_buffercache(); SELECT * FROM pg_buffercache_evict_relation('temp_pg_buffercache'); ERROR: relation uses local buffers, 
pg_buffercache_evict_relation() is intended to be used for shared buffers only +SELECT * FROM pg_buffercache_mark_dirty_relation('temp_pg_buffercache'); +ERROR: relation uses local buffers, pg_buffercache_mark_dirty_relation() is intended to be used for shared buffers only DROP TABLE temp_pg_buffercache; -- These shouldn't fail SELECT buffer_evicted IS NOT NULL FROM pg_buffercache_evict(1); @@ -117,5 +160,23 @@ SELECT buffers_evicted IS NOT NULL FROM pg_buffercache_evict_relation('shared_pg t (1 row) +SELECT buffers_dirtied IS NOT NULL FROM pg_buffercache_mark_dirty_relation('shared_pg_buffercache'); + ?column? +---------- + t +(1 row) + DROP TABLE shared_pg_buffercache; +SELECT pg_buffercache_mark_dirty(1) IS NOT NULL; + ?column? +---------- + t +(1 row) + +SELECT pg_buffercache_mark_dirty_all() IS NOT NULL; + ?column? +---------- + t +(1 row) + DROP ROLE regress_buffercache_normal; diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build index 7cd039a1df9cb..7c31141881f61 100644 --- a/contrib/pg_buffercache/meson.build +++ b/contrib/pg_buffercache/meson.build @@ -24,6 +24,7 @@ install_data( 'pg_buffercache--1.3--1.4.sql', 'pg_buffercache--1.4--1.5.sql', 'pg_buffercache--1.5--1.6.sql', + 'pg_buffercache--1.6--1.7.sql', 'pg_buffercache.control', kwargs: contrib_data_args, ) diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql new file mode 100644 index 0000000000000..9a7bf66dab54b --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql @@ -0,0 +1,56 @@ +/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit + +-- Function to retrieve information about OS pages, with optional NUMA +-- information. +CREATE FUNCTION pg_buffercache_os_pages(IN include_numa boolean, + OUT bufferid integer, + OUT os_page_num bigint, + OUT numa_node integer) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages' +LANGUAGE C PARALLEL SAFE; + +-- View for OS page information, without NUMA. +CREATE VIEW pg_buffercache_os_pages AS + SELECT bufferid, os_page_num + FROM pg_buffercache_os_pages(false); + +-- Re-create view for OS page information, with NUMA. +DROP VIEW pg_buffercache_numa; +CREATE VIEW pg_buffercache_numa AS + SELECT bufferid, os_page_num, numa_node + FROM pg_buffercache_os_pages(true); + +REVOKE ALL ON FUNCTION pg_buffercache_os_pages(boolean) FROM PUBLIC; +REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC; +REVOKE ALL ON pg_buffercache_numa FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages(boolean) TO pg_monitor; +GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor; +GRANT SELECT ON pg_buffercache_numa TO pg_monitor; + +-- Functions to mark buffers as dirty. 
+CREATE FUNCTION pg_buffercache_mark_dirty( + IN int, + OUT buffer_dirtied boolean, + OUT buffer_already_dirty boolean) +AS 'MODULE_PATHNAME', 'pg_buffercache_mark_dirty' +LANGUAGE C PARALLEL SAFE VOLATILE STRICT; + +CREATE FUNCTION pg_buffercache_mark_dirty_relation( + IN regclass, + OUT buffers_dirtied int4, + OUT buffers_already_dirty int4, + OUT buffers_skipped int4) +AS 'MODULE_PATHNAME', 'pg_buffercache_mark_dirty_relation' +LANGUAGE C PARALLEL SAFE VOLATILE STRICT; + +CREATE FUNCTION pg_buffercache_mark_dirty_all( + OUT buffers_dirtied int4, + OUT buffers_already_dirty int4, + OUT buffers_skipped int4) +AS 'MODULE_PATHNAME', 'pg_buffercache_mark_dirty_all' +LANGUAGE C PARALLEL SAFE VOLATILE; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index b030ba3a6faba..11499550945ee 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.6' +default_version = '1.7' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 4b007f6e1b06a..0c58e4b265c0c 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -25,8 +25,11 @@ #define NUM_BUFFERCACHE_EVICT_ELEM 2 #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3 #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3 +#define NUM_BUFFERCACHE_MARK_DIRTY_ELEM 2 +#define NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM 3 +#define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3 -#define NUM_BUFFERCACHE_NUMA_ELEM 3 +#define NUM_BUFFERCACHE_OS_PAGES_ELEM 3 PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", @@ -67,14 +70,16 @@ typedef struct } BufferCachePagesContext; /* - * Record structure holding the to be exposed cache data. + * Record structure holding the to be exposed cache data for OS pages. This + * structure is used by pg_buffercache_os_pages(), where NUMA information may + * or may not be included. */ typedef struct { uint32 bufferid; int64 page_num; int32 numa_node; -} BufferCacheNumaRec; +} BufferCacheOsPagesRec; /* * Function context for data persisting over repeated calls. @@ -82,11 +87,9 @@ typedef struct typedef struct { TupleDesc tupdesc; - int buffers_per_page; - int pages_per_buffer; - int os_page_size; - BufferCacheNumaRec *record; -} BufferCacheNumaContext; + bool include_numa; + BufferCacheOsPagesRec *record; +} BufferCacheOsPagesContext; /* @@ -94,12 +97,16 @@ typedef struct * relation node/tablespace/database/blocknum and dirty indicator. 
*/ PG_FUNCTION_INFO_V1(pg_buffercache_pages); +PG_FUNCTION_INFO_V1(pg_buffercache_os_pages); PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages); PG_FUNCTION_INFO_V1(pg_buffercache_summary); PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation); PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); +PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty); +PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation); +PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all); /* Only need to touch memory once per backend process lifetime */ @@ -127,7 +134,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Create a user function context for cross-call persistence */ - fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext)); + fctx = palloc_object(BufferCachePagesContext); /* * To smoothly support upgrades from version 1.0 of this extension @@ -194,6 +201,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) BufferDesc *bufHdr; uint32 buf_state; + CHECK_FOR_INTERRUPTS(); + bufHdr = GetBufferDescriptor(i); /* Lock each buffer header before inspecting. */ buf_state = LockBufHdr(bufHdr); @@ -218,7 +227,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) else fctx->record[i].isvalid = false; - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); } } @@ -261,7 +270,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) nulls[2] = false; values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase); nulls[3] = false; - values[4] = ObjectIdGetDatum(fctx->record[i].forknum); + values[4] = Int16GetDatum(fctx->record[i].forknum); nulls[4] = false; values[5] = Int64GetDatum((int64) fctx->record[i].blocknum); nulls[5] = false; @@ -285,26 +294,32 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) } /* - * Inquire about NUMA memory mappings for shared buffers. + * Inquire about OS page mappings for shared buffers, optionally with NUMA + * information. + * + * When "include_numa" is false, this routine ignores everything related + * to NUMA (returned as NULL values), returning mapping information between + * shared buffers and OS pages. + * + * When "include_numa" is true, NUMA is initialized and numa_node values + * are generated. In order to get reliable results we also need to touch + * memory pages, so that the inquiry about NUMA memory node does not return + * -2, indicating unmapped/unallocated pages. * - * Returns NUMA node ID for each memory page used by the buffer. Buffers may - * be smaller or larger than OS memory pages. For each buffer we return one - * entry for each memory page used by the buffer (if the buffer is smaller, - * it only uses a part of one memory page). + * Buffers may be smaller or larger than OS memory pages. For each buffer we + * return one entry for each memory page used by the buffer (if the buffer is + * smaller, it only uses a part of one memory page). + * * We expect both sizes (for buffers and memory pages) to be a power-of-2, so * one is always a multiple of the other. * - * In order to get reliable results we also need to touch memory pages, so - * that the inquiry about NUMA memory node doesn't return -2 (which indicates - * unmapped/unallocated pages). */ -Datum -pg_buffercache_numa_pages(PG_FUNCTION_ARGS) +static Datum +pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa) { FuncCallContext *funcctx; MemoryContext oldcontext; - BufferCacheNumaContext *fctx; /* User function context.
*/ + BufferCacheOsPagesContext *fctx; /* User function context. */ TupleDesc tupledesc; TupleDesc expected_tupledesc; HeapTuple tuple; @@ -315,16 +330,15 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) int i, idx; Size os_page_size; - void **os_page_ptrs; - int *os_page_status; - uint64 os_page_count; int pages_per_buffer; + int *os_page_status = NULL; + uint64 os_page_count = 0; int max_entries; - volatile uint64 touch pg_attribute_unused(); char *startptr, *endptr; - if (pg_numa_init() == -1) + /* If NUMA information is requested, initialize NUMA support. */ + if (include_numa && pg_numa_init() == -1) elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform"); /* @@ -352,46 +366,56 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) */ Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0)); - /* - * How many addresses we are going to query? Simply get the page for - * the first buffer, and first page after the last buffer, and count - * the pages from that. - */ - startptr = (char *) TYPEALIGN_DOWN(os_page_size, - BufferGetBlock(1)); - endptr = (char *) TYPEALIGN(os_page_size, - (char *) BufferGetBlock(NBuffers) + BLCKSZ); - os_page_count = (endptr - startptr) / os_page_size; - - /* Used to determine the NUMA node for all OS pages at once */ - os_page_ptrs = palloc0(sizeof(void *) * os_page_count); - os_page_status = palloc(sizeof(uint64) * os_page_count); - - /* Fill pointers for all the memory pages. */ - idx = 0; - for (char *ptr = startptr; ptr < endptr; ptr += os_page_size) + if (include_numa) { - os_page_ptrs[idx++] = ptr; + void **os_page_ptrs = NULL; + + /* + * How many addresses are we going to query? Simply get the page + * for the first buffer, and first page after the last buffer, and + * count the pages from that. + */ + startptr = (char *) TYPEALIGN_DOWN(os_page_size, + BufferGetBlock(1)); + endptr = (char *) TYPEALIGN(os_page_size, + (char *) BufferGetBlock(NBuffers) + BLCKSZ); + os_page_count = (endptr - startptr) / os_page_size; + + /* Used to determine the NUMA node for all OS pages at once */ + os_page_ptrs = palloc0_array(void *, os_page_count); + os_page_status = palloc_array(int, os_page_count); + + /* + * Fill pointers for all the memory pages. This loop stores and + * touches (if needed) addresses into os_page_ptrs[] as input to + * one big move_pages(2) inquiry system call, as done in + * pg_numa_query_pages(). + */ + idx = 0; + for (char *ptr = startptr; ptr < endptr; ptr += os_page_size) + { + os_page_ptrs[idx++] = ptr; - /* Only need to touch memory once per backend process lifetime */ - if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, ptr); - } + /* Only need to touch memory once per backend process lifetime */ + if (firstNumaTouch) + pg_numa_touch_mem_if_required(ptr); + } - Assert(idx == os_page_count); + Assert(idx == os_page_count); - elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " " - "os_page_size=%zu", NBuffers, os_page_count, os_page_size); + elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " " + "os_page_size=%zu", NBuffers, os_page_count, os_page_size); - /* - * If we ever get 0xff back from kernel inquiry, then we probably have - * bug in our buffers to OS page mapping code here. - */ - memset(os_page_status, 0xff, sizeof(int) * os_page_count); + /* + * If we ever get 0xff back from kernel inquiry, then we probably + * have a bug in our buffers to OS page mapping code here.
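+ * (memset() fills the array byte-wise, so every int entry starts out + * as 0xffffffff, a value the inquiry should always overwrite; any + * entry still carrying it afterwards is easy to spot.)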
+ */ + memset(os_page_status, 0xff, sizeof(int) * os_page_count); - /* Query NUMA status for all the pointers */ - if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1) - elog(ERROR, "failed NUMA pages inquiry: %m"); + /* Query NUMA status for all the pointers */ + if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1) + elog(ERROR, "failed NUMA pages inquiry: %m"); + } /* Initialize the multi-call context, load entries about buffers */ @@ -401,12 +425,12 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Create a user function context for cross-call persistence */ - fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext)); + fctx = palloc_object(BufferCacheOsPagesContext); if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); - if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM) + if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM) elog(ERROR, "incorrect number of output arguments"); /* Construct a tuple descriptor for the result rows. */ @@ -419,6 +443,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) INT4OID, -1, 0); fctx->tupdesc = BlessTupleDesc(tupledesc); + fctx->include_numa = include_numa; /* * Each buffer needs at least one entry, but it might be offset in @@ -430,15 +455,15 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1; max_entries = NBuffers * pages_per_buffer; - /* Allocate entries for BufferCachePagesRec records. */ - fctx->record = (BufferCacheNumaRec *) + /* Allocate entries for BufferCacheOsPagesRec records. */ + fctx->record = (BufferCacheOsPagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(BufferCacheNumaRec) * max_entries); + sizeof(BufferCacheOsPagesRec) * max_entries); /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); - if (firstNumaTouch) + if (include_numa && firstNumaTouch) elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts"); /* @@ -448,10 +473,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) * We don't hold the partition locks, so we don't get a consistent * snapshot across all buffers, but we do grab the buffer header * locks, so the information of each buffer is self-consistent. - * - * This loop touches and stores addresses into os_page_ptrs[] as input - * to one big move_pages(2) inquiry system call. Basically we ask for - * all memory pages for NBuffers. */ startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1)); idx = 0; @@ -459,7 +480,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) { char *buffptr = (char *) BufferGetBlock(i + 1); BufferDesc *bufHdr; - uint32 buf_state; uint32 bufferid; int32 page_num; char *startptr_buff, @@ -470,9 +490,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) bufHdr = GetBufferDescriptor(i); /* Lock each buffer header before inspecting. 
*/ - buf_state = LockBufHdr(bufHdr); + LockBufHdr(bufHdr); bufferid = BufferDescriptorGetBuffer(bufHdr); - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); /* start of the first page of this buffer */ startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr); @@ -490,7 +510,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) { fctx->record[idx].bufferid = bufferid; fctx->record[idx].page_num = page_num; - fctx->record[idx].numa_node = os_page_status[page_num]; + fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1; /* advance to the next entry/page */ ++idx; @@ -498,14 +518,18 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) } } - Assert((idx >= os_page_count) && (idx <= max_entries)); + Assert(idx <= max_entries); + + if (include_numa) + Assert(idx >= os_page_count); /* Set max calls and remember the user function context. */ funcctx->max_calls = idx; funcctx->user_fctx = fctx; - /* Remember this backend touched the pages */ - firstNumaTouch = false; + /* Remember this backend touched the pages (only relevant for NUMA) */ + if (include_numa) + firstNumaTouch = false; } funcctx = SRF_PERCALL_SETUP(); @@ -516,8 +540,8 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) if (funcctx->call_cntr < funcctx->max_calls) { uint32 i = funcctx->call_cntr; - Datum values[NUM_BUFFERCACHE_NUMA_ELEM]; - bool nulls[NUM_BUFFERCACHE_NUMA_ELEM]; + Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM]; + bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM]; values[0] = Int32GetDatum(fctx->record[i].bufferid); nulls[0] = false; @@ -525,8 +549,16 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) values[1] = Int64GetDatum(fctx->record[i].page_num); nulls[1] = false; - values[2] = Int32GetDatum(fctx->record[i].numa_node); - nulls[2] = false; + if (fctx->include_numa) + { + values[2] = Int32GetDatum(fctx->record[i].numa_node); + nulls[2] = false; + } + else + { + values[2] = (Datum) 0; + nulls[2] = true; + } /* Build and return the tuple. */ tuple = heap_form_tuple(fctx->tupdesc, values, nulls); @@ -538,6 +570,30 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +/* + * pg_buffercache_os_pages + * + * Retrieve information about OS pages, with or without NUMA information. + */ +Datum +pg_buffercache_os_pages(PG_FUNCTION_ARGS) +{ + bool include_numa; + + /* Get the boolean parameter that controls the NUMA behavior. */ + include_numa = PG_GETARG_BOOL(0); + + return pg_buffercache_os_pages_internal(fcinfo, include_numa); +} + +/* Backward-compatible wrapper for v1.6. */ +Datum +pg_buffercache_numa_pages(PG_FUNCTION_ARGS) +{ + /* Call internal function with include_numa=true */ + return pg_buffercache_os_pages_internal(fcinfo, true); +} + Datum pg_buffercache_summary(PG_FUNCTION_ARGS) { @@ -561,6 +617,8 @@ pg_buffercache_summary(PG_FUNCTION_ARGS) BufferDesc *bufHdr; uint32 buf_state; + CHECK_FOR_INTERRUPTS(); + /* * This function summarizes the state of all headers. Locking the * buffer headers wouldn't provide an improved result as the state of @@ -621,6 +679,8 @@ pg_buffercache_usage_counts(PG_FUNCTION_ARGS) uint32 buf_state = pg_atomic_read_u32(&bufHdr->state); int usage_count; + CHECK_FOR_INTERRUPTS(); + usage_count = BUF_STATE_GET_USAGECOUNT(buf_state); usage_counts[usage_count]++; @@ -772,3 +832,119 @@ pg_buffercache_evict_all(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } + +/* + * Try to mark a shared buffer as dirty. 
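+ * + * Returns whether the buffer was newly dirtied and whether it was already + * dirty. As a rough sketch of the expected behavior (not guaranteed under + * concurrent activity): two back-to-back calls on the same idle, valid + * buffer would typically report (true, false) and then (false, true).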
+ */ +Datum +pg_buffercache_mark_dirty(PG_FUNCTION_ARGS) +{ + + Datum result; + TupleDesc tupledesc; + HeapTuple tuple; + Datum values[NUM_BUFFERCACHE_MARK_DIRTY_ELEM]; + bool nulls[NUM_BUFFERCACHE_MARK_DIRTY_ELEM] = {0}; + + Buffer buf = PG_GETARG_INT32(0); + bool buffer_already_dirty; + + if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + pg_buffercache_superuser_check("pg_buffercache_mark_dirty"); + + if (buf < 1 || buf > NBuffers) + elog(ERROR, "bad buffer ID: %d", buf); + + values[0] = BoolGetDatum(MarkDirtyUnpinnedBuffer(buf, &buffer_already_dirty)); + values[1] = BoolGetDatum(buffer_already_dirty); + + tuple = heap_form_tuple(tupledesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + PG_RETURN_DATUM(result); +} + +/* + * Try to mark all the shared buffers of a relation as dirty. + */ +Datum +pg_buffercache_mark_dirty_relation(PG_FUNCTION_ARGS) +{ + Datum result; + TupleDesc tupledesc; + HeapTuple tuple; + Datum values[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM]; + bool nulls[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM] = {0}; + + Oid relOid; + Relation rel; + + int32 buffers_already_dirty = 0; + int32 buffers_dirtied = 0; + int32 buffers_skipped = 0; + + if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + pg_buffercache_superuser_check("pg_buffercache_mark_dirty_relation"); + + relOid = PG_GETARG_OID(0); + + rel = relation_open(relOid, AccessShareLock); + + if (RelationUsesLocalBuffers(rel)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only", + "pg_buffercache_mark_dirty_relation"))); + + MarkDirtyRelUnpinnedBuffers(rel, &buffers_dirtied, &buffers_already_dirty, + &buffers_skipped); + + relation_close(rel, AccessShareLock); + + values[0] = Int32GetDatum(buffers_dirtied); + values[1] = Int32GetDatum(buffers_already_dirty); + values[2] = Int32GetDatum(buffers_skipped); + + tuple = heap_form_tuple(tupledesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + PG_RETURN_DATUM(result); +} + +/* + * Try to mark all the shared buffers as dirty. 
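+ * + * Reports how many buffers were newly dirtied, how many were found already + * dirty, and how many were skipped (for instance pinned or invalid buffers, + * as determined by MarkDirtyAllUnpinnedBuffers()).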
+ */ +Datum +pg_buffercache_mark_dirty_all(PG_FUNCTION_ARGS) +{ + Datum result; + TupleDesc tupledesc; + HeapTuple tuple; + Datum values[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM]; + bool nulls[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM] = {0}; + + int32 buffers_already_dirty = 0; + int32 buffers_dirtied = 0; + int32 buffers_skipped = 0; + + if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + pg_buffercache_superuser_check("pg_buffercache_mark_dirty_all"); + + MarkDirtyAllUnpinnedBuffers(&buffers_dirtied, &buffers_already_dirty, + &buffers_skipped); + + values[0] = Int32GetDatum(buffers_dirtied); + values[1] = Int32GetDatum(buffers_already_dirty); + values[2] = Int32GetDatum(buffers_skipped); + + tuple = heap_form_tuple(tupledesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + PG_RETURN_DATUM(result); +} diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 47cca1907c74b..127d604905ca0 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -5,6 +5,12 @@ select count(*) = (select setting::bigint where name = 'shared_buffers') from pg_buffercache; +-- For pg_buffercache_os_pages, we expect at least one entry for each buffer +select count(*) >= (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_os_pages; + select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, buffers_pinned <= buffers_used @@ -16,6 +22,7 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; -- having to create a dedicated user, use the pg_database_owner pseudo-role. SET ROLE pg_database_owner; SELECT * FROM pg_buffercache; +SELECT * FROM pg_buffercache_os_pages; SELECT * FROM pg_buffercache_pages() AS p (wrong int); SELECT * FROM pg_buffercache_summary(); SELECT * FROM pg_buffercache_usage_counts(); @@ -24,13 +31,14 @@ RESET role; -- Check that pg_monitor is allowed to query view / function SET ROLE pg_monitor; SELECT count(*) > 0 FROM pg_buffercache; +SELECT count(*) > 0 FROM pg_buffercache_os_pages; SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary(); SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); RESET role; ------ ----- Test pg_buffercache_evict* functions +---- Test pg_buffercache_evict* and pg_buffercache_mark_dirty* functions ------ CREATE ROLE regress_buffercache_normal; @@ -40,12 +48,17 @@ SET ROLE regress_buffercache_normal; SELECT * FROM pg_buffercache_evict(1); SELECT * FROM pg_buffercache_evict_relation(1); SELECT * FROM pg_buffercache_evict_all(); +SELECT * FROM pg_buffercache_mark_dirty(1); +SELECT * FROM pg_buffercache_mark_dirty_relation(1); +SELECT * FROM pg_buffercache_mark_dirty_all(); RESET ROLE; -- These should return nothing, because these are STRICT functions SELECT * FROM pg_buffercache_evict(NULL); SELECT * FROM pg_buffercache_evict_relation(NULL); +SELECT * FROM pg_buffercache_mark_dirty(NULL); +SELECT * FROM pg_buffercache_mark_dirty_relation(NULL); -- These should fail because they are not called by valid range of buffers -- Number of the shared buffers are limited by max integer @@ -53,11 +66,14 @@ SELECT 2147483647 max_buffers \gset SELECT * FROM pg_buffercache_evict(-1); SELECT * FROM pg_buffercache_evict(0); SELECT * FROM pg_buffercache_evict(:max_buffers); +SELECT * FROM pg_buffercache_mark_dirty(-1); +SELECT * FROM pg_buffercache_mark_dirty(0); +SELECT * FROM 
pg_buffercache_mark_dirty(:max_buffers); --- This should fail because pg_buffercache_evict_relation() doesn't accept --- local relations +-- These should fail because they don't accept local relations CREATE TEMP TABLE temp_pg_buffercache(); SELECT * FROM pg_buffercache_evict_relation('temp_pg_buffercache'); +SELECT * FROM pg_buffercache_mark_dirty_relation('temp_pg_buffercache'); DROP TABLE temp_pg_buffercache; -- These shouldn't fail @@ -65,6 +81,9 @@ SELECT buffer_evicted IS NOT NULL FROM pg_buffercache_evict(1); SELECT buffers_evicted IS NOT NULL FROM pg_buffercache_evict_all(); CREATE TABLE shared_pg_buffercache(); SELECT buffers_evicted IS NOT NULL FROM pg_buffercache_evict_relation('shared_pg_buffercache'); +SELECT buffers_dirtied IS NOT NULL FROM pg_buffercache_mark_dirty_relation('shared_pg_buffercache'); DROP TABLE shared_pg_buffercache; +SELECT pg_buffercache_mark_dirty(1) IS NOT NULL; +SELECT pg_buffercache_mark_dirty_all() IS NOT NULL; DROP ROLE regress_buffercache_normal; diff --git a/contrib/pg_overexplain/expected/pg_overexplain.out b/contrib/pg_overexplain/expected/pg_overexplain.out index cb5c396c51925..55d34666d87f8 100644 --- a/contrib/pg_overexplain/expected/pg_overexplain.out +++ b/contrib/pg_overexplain/expected/pg_overexplain.out @@ -37,16 +37,17 @@ EXPLAIN (DEBUG) SELECT 1; Subplans Needing Rewind: none Relation OIDs: none Executor Parameter Types: none - Parse Location: 16 for 8 bytes + Parse Location: 0 to end (11 rows) EXPLAIN (RANGE_TABLE) SELECT 1; QUERY PLAN ------------------------------------------ Result (cost=0.00..0.01 rows=1 width=4) + RTIs: 1 RTI 1 (result): Eref: "*RESULT*" () -(3 rows) +(4 rows) -- Create a partitioned table. CREATE TABLE vegetables (id serial, name text, genus text) @@ -119,7 +120,7 @@ $$); Subplans Needing Rewind: none Relation OIDs: NNN... Executor Parameter Types: none - Parse Location: 41 to end + Parse Location: 0 to end RTI 1 (relation, inherited, in-from-clause): Eref: vegetables (id, name, genus) Relation: vegetables @@ -240,7 +241,7 @@ $$); none + NNN... + none + - 53 to end + + 0 to end + + + + @@ -344,7 +345,7 @@ $$); Subplans Needing Rewind: none Relation OIDs: NNN... Executor Parameter Types: none - Parse Location: 28 to end + Parse Location: 0 to end (37 rows) SET debug_parallel_query = false; @@ -372,7 +373,7 @@ $$); Subplans Needing Rewind: none Relation OIDs: NNN... Executor Parameter Types: 0 - Parse Location: 28 to end + Parse Location: 0 to end (15 rows) -- Create an index, and then attempt to force a nested loop with inner index @@ -436,7 +437,7 @@ $$); Subplans Needing Rewind: none Relation OIDs: NNN... 
Executor Parameter Types: 23 - Parse Location: 75 for 62 bytes + Parse Location: 0 to end (47 rows) RESET enable_hashjoin; @@ -475,6 +476,7 @@ INSERT INTO vegetables (name, genus) VALUES ('broccoflower', 'brassica'); Nominal RTI: 1 Exclude Relation RTI: 0 -> Result + RTIs: 2 RTI 1 (relation): Eref: vegetables (id, name, genus) Relation: vegetables @@ -485,5 +487,5 @@ INSERT INTO vegetables (name, genus) VALUES ('broccoflower', 'brassica'); Eref: "*RESULT*" () Unprunable RTIs: 1 Result RTIs: 1 -(14 rows) +(15 rows) diff --git a/contrib/pg_overexplain/pg_overexplain.c b/contrib/pg_overexplain/pg_overexplain.c index de824566f8c90..fcdc17012da2e 100644 --- a/contrib/pg_overexplain/pg_overexplain.c +++ b/contrib/pg_overexplain/pg_overexplain.c @@ -95,7 +95,7 @@ overexplain_ensure_options(ExplainState *es) if (options == NULL) { - options = palloc0(sizeof(overexplain_options)); + options = palloc0_object(overexplain_options); SetExplainExtensionState(es, es_extension_id, options); } @@ -236,6 +236,18 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors, ((MergeAppend *) plan)->apprelids, es); break; + case T_Result: + + /* + * 'relids' is only meaningful when plan->lefttree is NULL, + * but if somehow it ends up set when plan->lefttree is not + * NULL, print it anyway. + */ + if (plan->lefttree == NULL || + ((Result *) plan)->relids != NULL) + overexplain_bitmapset("RTIs", + ((Result *) plan)->relids, + es); default: break; } diff --git a/contrib/pg_prewarm/Makefile b/contrib/pg_prewarm/Makefile index 9cfde8c4e4fad..617ac8e09b2d8 100644 --- a/contrib/pg_prewarm/Makefile +++ b/contrib/pg_prewarm/Makefile @@ -10,6 +10,8 @@ EXTENSION = pg_prewarm DATA = pg_prewarm--1.1--1.2.sql pg_prewarm--1.1.sql pg_prewarm--1.0--1.1.sql PGFILEDESC = "pg_prewarm - preload relation data into system buffer cache" +REGRESS = pg_prewarm + TAP_TESTS = 1 ifdef USE_PGXS diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index c52f4d4dc9ea2..a87b046d8be97 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -370,6 +370,15 @@ apw_load_buffers(void) apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0; apw_state->prewarmed_blocks = 0; + /* Don't prewarm more than we can fit. */ + if (num_elements > NBuffers) + { + num_elements = NBuffers; + ereport(LOG, + (errmsg("autoprewarm capping prewarmed blocks to %d (shared_buffers size)", + NBuffers))); + } + /* Get the info position of the first block of the next database. */ while (apw_state->prewarm_start_idx < num_elements) { @@ -410,10 +419,6 @@ apw_load_buffers(void) apw_state->database = current_db; Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx); - /* If we've run out of free buffers, don't launch another worker. */ - if (!have_free_buffer()) - break; - /* * Likewise, don't launch if we've already been told to shut down. * (The launch would fail anyway, but we might as well skip it.) @@ -462,12 +467,6 @@ apw_read_stream_next_block(ReadStream *stream, { BlockInfoRecord blk = p->block_info[p->pos]; - if (!have_free_buffer()) - { - p->pos = apw_state->prewarm_stop_idx; - return InvalidBlockNumber; - } - if (blk.tablespace != p->tablespace) return InvalidBlockNumber; @@ -523,10 +522,10 @@ autoprewarm_database_main(Datum main_arg) blk = block_info[i]; /* - * Loop until we run out of blocks to prewarm or until we run out of free + * Loop until we run out of blocks to prewarm or until we run out of * buffers.
*/ - while (i < apw_state->prewarm_stop_idx && have_free_buffer()) + while (i < apw_state->prewarm_stop_idx) { Oid tablespace = blk.tablespace; RelFileNumber filenumber = blk.filenumber; @@ -568,14 +567,13 @@ autoprewarm_database_main(Datum main_arg) /* * We have a relation; now let's loop until we find a valid fork of - * the relation or we run out of free buffers. Once we've read from - * all valid forks or run out of options, we'll close the relation and + * the relation or we run out of buffers. Once we've read from all + * valid forks or run out of options, we'll close the relation and * move on. */ while (i < apw_state->prewarm_stop_idx && blk.tablespace == tablespace && - blk.filenumber == filenumber && - have_free_buffer()) + blk.filenumber == filenumber) { ForkNumber forknum = blk.forknum; BlockNumber nblocks; @@ -693,8 +691,15 @@ apw_dump_now(bool is_bgworker, bool dump_unlogged) return 0; } - block_info_array = - (BlockInfoRecord *) palloc(sizeof(BlockInfoRecord) * NBuffers); + /* + * With sufficiently large shared_buffers, allocation will exceed 1GB, so + * allow for a huge allocation to prevent outright failure. + * + * (In the future, it might be a good idea to redesign this to use a more + * memory-efficient data structure.) + */ + block_info_array = (BlockInfoRecord *) + palloc_extended((sizeof(BlockInfoRecord) * NBuffers), MCXT_ALLOC_HUGE); for (num_blocks = 0, i = 0; i < NBuffers; i++) { @@ -725,7 +730,7 @@ apw_dump_now(bool is_bgworker, bool dump_unlogged) ++num_blocks; } - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); } snprintf(transient_dump_file_path, MAXPGPATH, "%s.tmp", AUTOPREWARM_FILE); @@ -853,11 +858,11 @@ autoprewarm_dump_now(PG_FUNCTION_ARGS) } static void -apw_init_state(void *ptr) +apw_init_state(void *ptr, void *arg) { AutoPrewarmSharedState *state = (AutoPrewarmSharedState *) ptr; - LWLockInitialize(&state->lock, LWLockNewTrancheId()); + LWLockInitialize(&state->lock, LWLockNewTrancheId("autoprewarm")); state->bgworker_pid = InvalidPid; state->pid_using_dumpfile = InvalidPid; } @@ -875,8 +880,7 @@ apw_init_shmem(void) apw_state = GetNamedDSMSegment("autoprewarm", sizeof(AutoPrewarmSharedState), apw_init_state, - &found); - LWLockRegisterTranche(apw_state->lock.tranche, "autoprewarm"); + &found, NULL); return found; } diff --git a/contrib/pg_prewarm/expected/pg_prewarm.out b/contrib/pg_prewarm/expected/pg_prewarm.out new file mode 100644 index 0000000000000..94e4fa1a9d237 --- /dev/null +++ b/contrib/pg_prewarm/expected/pg_prewarm.out @@ -0,0 +1,10 @@ +-- Test pg_prewarm extension +CREATE EXTENSION pg_prewarm; +-- pg_prewarm() should fail if the target relation has no storage. +CREATE TABLE test (c1 int) PARTITION BY RANGE (c1); +SELECT pg_prewarm('test', 'buffer'); +ERROR: relation "test" does not have storage +DETAIL: This operation is not supported for partitioned tables. 
+-- Cleanup +DROP TABLE test; +DROP EXTENSION pg_prewarm; diff --git a/contrib/pg_prewarm/meson.build b/contrib/pg_prewarm/meson.build index 82b9851303ce3..f24c47ef6a533 100644 --- a/contrib/pg_prewarm/meson.build +++ b/contrib/pg_prewarm/meson.build @@ -29,6 +29,11 @@ tests += { 'name': 'pg_prewarm', 'sd': meson.current_source_dir(), 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'pg_prewarm', + ], + }, 'tap': { 'tests': [ 't/001_basic.pl', diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index 50808569bd741..5b519a2c85422 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -16,9 +16,11 @@ #include #include "access/relation.h" +#include "catalog/index.h" #include "fmgr.h" #include "miscadmin.h" #include "storage/bufmgr.h" +#include "storage/lmgr.h" #include "storage/read_stream.h" #include "storage/smgr.h" #include "utils/acl.h" @@ -71,6 +73,8 @@ pg_prewarm(PG_FUNCTION_ARGS) char *ttype; PrewarmType ptype; AclResult aclresult; + char relkind; + Oid privOid; /* Basic sanity checking. */ if (PG_ARGISNULL(0)) @@ -106,12 +110,54 @@ pg_prewarm(PG_FUNCTION_ARGS) forkString = text_to_cstring(forkName); forkNumber = forkname_to_number(forkString); - /* Open relation and check privileges. */ + /* + * Open relation and check privileges. If the relation is an index, we + * must check the privileges on its parent table instead. + */ + relkind = get_rel_relkind(relOid); + if (relkind == RELKIND_INDEX || + relkind == RELKIND_PARTITIONED_INDEX) + { + privOid = IndexGetRelation(relOid, true); + + /* Lock table before index to avoid deadlock. */ + if (OidIsValid(privOid)) + LockRelationOid(privOid, AccessShareLock); + } + else + privOid = relOid; + rel = relation_open(relOid, AccessShareLock); - aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT); + + /* + * It's possible that the relation with OID "privOid" was dropped and the + * OID was reused before we locked it. If that happens, we could be left + * with the wrong parent table OID, in which case we must ERROR. It's + * possible that such a race would change the outcome of + * get_rel_relkind(), too, but the worst case scenario there is that we'll + * check privileges on the index instead of its parent table, which isn't + * too terrible. + */ + if (!OidIsValid(privOid) || + (privOid != relOid && + privOid != IndexGetRelation(relOid, true))) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("could not find parent table of index \"%s\"", + RelationGetRelationName(rel)))); + + aclresult = pg_class_aclcheck(privOid, GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, get_relkind_objtype(rel->rd_rel->relkind), get_rel_name(relOid)); + /* Check that the relation has storage. */ + if (!RELKIND_HAS_STORAGE(rel->rd_rel->relkind)) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("relation \"%s\" does not have storage", + RelationGetRelationName(rel)), + errdetail_relkind_not_supported(rel->rd_rel->relkind))); + /* Check that the fork exists. */ if (!smgrexists(RelationGetSmgr(rel), forkNumber)) ereport(ERROR, @@ -225,8 +271,11 @@ pg_prewarm(PG_FUNCTION_ARGS) read_stream_end(stream); } - /* Close relation, release lock. */ + /* Close relation, release locks. 
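+	 * (If we took a lock on an index's parent table above, it is released
+	 * separately just below.)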
*/ relation_close(rel, AccessShareLock); + if (privOid != relOid) + UnlockRelationOid(privOid, AccessShareLock); + PG_RETURN_INT64(blocks_done); } diff --git a/contrib/pg_prewarm/sql/pg_prewarm.sql b/contrib/pg_prewarm/sql/pg_prewarm.sql new file mode 100644 index 0000000000000..c76f2c7916436 --- /dev/null +++ b/contrib/pg_prewarm/sql/pg_prewarm.sql @@ -0,0 +1,10 @@ +-- Test pg_prewarm extension +CREATE EXTENSION pg_prewarm; + +-- pg_prewarm() should fail if the target relation has no storage. +CREATE TABLE test (c1 int) PARTITION BY RANGE (c1); +SELECT pg_prewarm('test', 'buffer'); + +-- Cleanup +DROP TABLE test; +DROP EXTENSION pg_prewarm; diff --git a/contrib/pg_prewarm/t/001_basic.pl b/contrib/pg_prewarm/t/001_basic.pl index 0a8259d367854..ed70ceb4fca03 100644 --- a/contrib/pg_prewarm/t/001_basic.pl +++ b/contrib/pg_prewarm/t/001_basic.pl @@ -11,7 +11,7 @@ my $node = PostgreSQL::Test::Cluster->new('main'); -$node->init; +$node->init('auth_extra' => [ '--create-role', 'test_user' ]); $node->append_conf( 'postgresql.conf', qq{shared_preload_libraries = 'pg_prewarm' @@ -23,7 +23,9 @@ $node->safe_psql("postgres", "CREATE EXTENSION pg_prewarm;\n" . "CREATE TABLE test(c1 int);\n" - . "INSERT INTO test SELECT generate_series(1, 100);"); + . "INSERT INTO test SELECT generate_series(1, 100);\n" + . "CREATE INDEX test_idx ON test(c1);\n" + . "CREATE ROLE test_user LOGIN;"); # test read mode my $result = @@ -42,6 +44,31 @@ or $stderr =~ qr/prefetch is not supported by this build/), 'prefetch mode succeeded'); +# test_user should be unable to prewarm table/index without privileges +($cmdret, $stdout, $stderr) = + $node->psql( + "postgres", "SELECT pg_prewarm('test');", + extra_params => [ '--username' => 'test_user' ]); +ok($stderr =~ /permission denied for table test/, 'pg_prewarm failed as expected'); +($cmdret, $stdout, $stderr) = + $node->psql( + "postgres", "SELECT pg_prewarm('test_idx');", + extra_params => [ '--username' => 'test_user' ]); +ok($stderr =~ /permission denied for index test_idx/, 'pg_prewarm failed as expected'); + +# test_user should be able to prewarm table/index with privileges +$node->safe_psql("postgres", "GRANT SELECT ON test TO test_user;"); +$result = + $node->safe_psql( + "postgres", "SELECT pg_prewarm('test');", + extra_params => [ '--username' => 'test_user' ]); +like($result, qr/^[1-9][0-9]*$/, 'pg_prewarm succeeded as expected'); +$result = + $node->safe_psql( + "postgres", "SELECT pg_prewarm('test_idx');", + extra_params => [ '--username' => 'test_user' ]); +like($result, qr/^[1-9][0-9]*$/, 'pg_prewarm succeeded as expected'); + # test autoprewarm_dump_now() $result = $node->safe_psql("postgres", "SELECT autoprewarm_dump_now();"); like($result, qr/^[1-9][0-9]*$/, 'autoprewarm_dump_now succeeded'); diff --git a/contrib/pg_stat_statements/Makefile b/contrib/pg_stat_statements/Makefile index b2bd8794d2a14..fe0478ac55266 100644 --- a/contrib/pg_stat_statements/Makefile +++ b/contrib/pg_stat_statements/Makefile @@ -7,6 +7,7 @@ OBJS = \ EXTENSION = pg_stat_statements DATA = pg_stat_statements--1.4.sql \ + pg_stat_statements--1.12--1.13.sql \ pg_stat_statements--1.11--1.12.sql pg_stat_statements--1.10--1.11.sql \ pg_stat_statements--1.9--1.10.sql pg_stat_statements--1.8--1.9.sql \ pg_stat_statements--1.7--1.8.sql pg_stat_statements--1.6--1.7.sql \ @@ -20,7 +21,7 @@ LDFLAGS_SL += $(filter -lm, $(LIBS)) REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_stat_statements/pg_stat_statements.conf REGRESS = select dml cursors utility level_tracking planning \ user_activity 
wal entry_timestamp privileges extended \ - parallel cleanup oldextversions squashing + parallel plancache cleanup oldextversions squashing # Disabled because these tests require "shared_preload_libraries=pg_stat_statements", # which typical installcheck users do not have (e.g. buildfarm clients). NO_INSTALLCHECK = 1 diff --git a/contrib/pg_stat_statements/expected/cursors.out b/contrib/pg_stat_statements/expected/cursors.out index 0fc4b2c098d0e..6afb48ace9220 100644 --- a/contrib/pg_stat_statements/expected/cursors.out +++ b/contrib/pg_stat_statements/expected/cursors.out @@ -57,8 +57,8 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 0 | COMMIT 1 | 0 | DECLARE cursor_stats_1 CURSOR WITH HOLD FOR SELECT $1 1 | 0 | DECLARE cursor_stats_2 CURSOR WITH HOLD FOR SELECT $1 - 1 | 1 | FETCH 1 IN cursor_stats_1 - 1 | 1 | FETCH 1 IN cursor_stats_2 + 1 | 1 | FETCH $1 IN cursor_stats_1 + 1 | 1 | FETCH $1 IN cursor_stats_2 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (9 rows) @@ -68,3 +68,140 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t; t (1 row) +-- Normalization of FETCH statements +BEGIN; +DECLARE pgss_cursor CURSOR FOR SELECT FROM generate_series(1, 10); +-- implicit directions +FETCH pgss_cursor; +-- +(1 row) + +FETCH 1 pgss_cursor; +-- +(1 row) + +FETCH 2 pgss_cursor; +-- +(2 rows) + +FETCH -1 pgss_cursor; +-- +(1 row) + +-- explicit NEXT +FETCH NEXT pgss_cursor; +-- +(1 row) + +-- explicit PRIOR +FETCH PRIOR pgss_cursor; +-- +(1 row) + +-- explicit FIRST +FETCH FIRST pgss_cursor; +-- +(1 row) + +-- explicit LAST +FETCH LAST pgss_cursor; +-- +(1 row) + +-- explicit ABSOLUTE +FETCH ABSOLUTE 1 pgss_cursor; +-- +(1 row) + +FETCH ABSOLUTE 2 pgss_cursor; +-- +(1 row) + +FETCH ABSOLUTE -1 pgss_cursor; +-- +(1 row) + +-- explicit RELATIVE +FETCH RELATIVE 1 pgss_cursor; +-- +(0 rows) + +FETCH RELATIVE 2 pgss_cursor; +-- +(0 rows) + +FETCH RELATIVE -1 pgss_cursor; +-- +(1 row) + +-- explicit FORWARD +FETCH ALL pgss_cursor; +-- +(0 rows) + +-- explicit FORWARD ALL +FETCH FORWARD ALL pgss_cursor; +-- +(0 rows) + +-- explicit FETCH FORWARD +FETCH FORWARD pgss_cursor; +-- +(0 rows) + +FETCH FORWARD 1 pgss_cursor; +-- +(0 rows) + +FETCH FORWARD 2 pgss_cursor; +-- +(0 rows) + +FETCH FORWARD -1 pgss_cursor; +-- +(1 row) + +-- explicit FETCH BACKWARD +FETCH BACKWARD pgss_cursor; +-- +(1 row) + +FETCH BACKWARD 1 pgss_cursor; +-- +(1 row) + +FETCH BACKWARD 2 pgss_cursor; +-- +(2 rows) + +FETCH BACKWARD -1 pgss_cursor; +-- +(1 row) + +-- explicit BACKWARD ALL +FETCH BACKWARD ALL pgss_cursor; +-- +(6 rows) + +COMMIT; +SELECT calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; + calls | query +-------+-------------------------------------------------------------------- + 1 | BEGIN + 1 | COMMIT + 1 | DECLARE pgss_cursor CURSOR FOR SELECT FROM generate_series($1, $2) + 3 | FETCH ABSOLUTE $1 pgss_cursor + 1 | FETCH ALL pgss_cursor + 1 | FETCH BACKWARD ALL pgss_cursor + 4 | FETCH BACKWARD pgss_cursor + 1 | FETCH FIRST pgss_cursor + 1 | FETCH FORWARD ALL pgss_cursor + 4 | FETCH FORWARD pgss_cursor + 1 | FETCH LAST pgss_cursor + 1 | FETCH NEXT pgss_cursor + 1 | FETCH PRIOR pgss_cursor + 3 | FETCH RELATIVE $1 pgss_cursor + 4 | FETCH pgss_cursor + 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t +(16 rows) + diff --git a/contrib/pg_stat_statements/expected/extended.out b/contrib/pg_stat_statements/expected/extended.out index 04a05943372b9..1bfd0c1ca242f 100644 --- a/contrib/pg_stat_statements/expected/extended.out +++ 
b/contrib/pg_stat_statements/expected/extended.out @@ -68,3 +68,97 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (4 rows) +-- Various parameter numbering patterns +-- Unique query IDs with parameter numbers switched. +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT WHERE ($1::int, 7) IN ((8, $2::int), ($3::int, 9)) \bind '1' '2' '3' \g +-- +(0 rows) + +SELECT WHERE ($2::int, 10) IN ((11, $3::int), ($1::int, 12)) \bind '1' '2' '3' \g +-- +(0 rows) + +SELECT WHERE $1::int IN ($2::int, $3::int) \bind '1' '2' '3' \g +-- +(0 rows) + +SELECT WHERE $2::int IN ($3::int, $1::int) \bind '1' '2' '3' \g +-- +(0 rows) + +SELECT WHERE $3::int IN ($1::int, $2::int) \bind '1' '2' '3' \g +-- +(0 rows) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +--------------------------------------------------------------+------- + SELECT WHERE $1::int IN ($2 /*, ... */) | 1 + SELECT WHERE $1::int IN ($2 /*, ... */) | 1 + SELECT WHERE $1::int IN ($2 /*, ... */) | 1 + SELECT WHERE ($1::int, $4) IN (($5, $2::int), ($3::int, $6)) | 1 + SELECT WHERE ($2::int, $4) IN (($5, $3::int), ($1::int, $6)) | 1 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(6 rows) + +-- Two groups of two queries with the same query ID. +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT WHERE '1'::int IN ($1::int, '2'::int) \bind '1' \g +-- +(1 row) + +SELECT WHERE '4'::int IN ($1::int, '5'::int) \bind '2' \g +-- +(0 rows) + +SELECT WHERE $2::int IN ($1::int, '1'::int) \bind '1' '2' \g +-- +(0 rows) + +SELECT WHERE $2::int IN ($1::int, '2'::int) \bind '3' '4' \g +-- +(0 rows) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +----------------------------------------------------+------- + SELECT WHERE $1::int IN ($2 /*, ... */) | 2 + SELECT WHERE $1::int IN ($2 /*, ... */) | 2 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(3 rows) + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +-- no squashable list, the parameters id's are kept as-is +SELECT WHERE $3 = $1 AND $2 = $4 \bind 1 2 1 2 \g +-- +(1 row) + +-- squashable list, so the parameter IDs will be re-assigned +SELECT WHERE 1 IN (1, 2, 3) AND $3 = $1 AND $2 = $4 \bind 1 2 1 2 \g +-- +(1 row) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +------------------------------------------------------------+------- + SELECT WHERE $1 IN ($2 /*, ... 
*/) AND $3 = $4 AND $5 = $6 | 1 + SELECT WHERE $3 = $1 AND $2 = $4 | 1 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(3 rows) + diff --git a/contrib/pg_stat_statements/expected/level_tracking.out b/contrib/pg_stat_statements/expected/level_tracking.out index 75e785e1719ea..8e8388dd5cb1f 100644 --- a/contrib/pg_stat_statements/expected/level_tracking.out +++ b/contrib/pg_stat_statements/expected/level_tracking.out @@ -206,37 +206,37 @@ EXPLAIN (COSTS OFF) SELECT 1 UNION SELECT 2; SELECT toplevel, calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; - toplevel | calls | query -----------+-------+-------------------------------------------------------------------- - f | 1 | DELETE FROM stats_track_tab + toplevel | calls | query +----------+-------+--------------------------------------------------------------------- t | 1 | EXPLAIN (COSTS OFF) (SELECT $1, $2) + f | 1 | EXPLAIN (COSTS OFF) (SELECT $1, $2); t | 1 | EXPLAIN (COSTS OFF) (TABLE test_table) + f | 1 | EXPLAIN (COSTS OFF) (TABLE test_table); t | 1 | EXPLAIN (COSTS OFF) (VALUES ($1, $2)) + f | 1 | EXPLAIN (COSTS OFF) (VALUES ($1, $2)); t | 1 | EXPLAIN (COSTS OFF) DELETE FROM stats_track_tab + f | 1 | EXPLAIN (COSTS OFF) DELETE FROM stats_track_tab; t | 1 | EXPLAIN (COSTS OFF) INSERT INTO stats_track_tab VALUES (($1)) - t | 1 | EXPLAIN (COSTS OFF) MERGE INTO stats_track_tab + - | | USING (SELECT id FROM generate_series($1, $2) id) ON x = id + - | | WHEN MATCHED THEN UPDATE SET x = id + + f | 1 | EXPLAIN (COSTS OFF) INSERT INTO stats_track_tab VALUES (($1)); + t | 1 | EXPLAIN (COSTS OFF) MERGE INTO stats_track_tab + + | | USING (SELECT id FROM generate_series($1, $2) id) ON x = id + + | | WHEN MATCHED THEN UPDATE SET x = id + | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id) + f | 1 | EXPLAIN (COSTS OFF) MERGE INTO stats_track_tab + + | | USING (SELECT id FROM generate_series($1, $2) id) ON x = id + + | | WHEN MATCHED THEN UPDATE SET x = id + + | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id); t | 1 | EXPLAIN (COSTS OFF) SELECT $1 t | 1 | EXPLAIN (COSTS OFF) SELECT $1 UNION SELECT $2 + f | 1 | EXPLAIN (COSTS OFF) SELECT $1 UNION SELECT $2; + f | 1 | EXPLAIN (COSTS OFF) SELECT $1; t | 1 | EXPLAIN (COSTS OFF) TABLE stats_track_tab + f | 1 | EXPLAIN (COSTS OFF) TABLE stats_track_tab; t | 1 | EXPLAIN (COSTS OFF) UPDATE stats_track_tab SET x = $1 WHERE x = $2 + f | 1 | EXPLAIN (COSTS OFF) UPDATE stats_track_tab SET x = $1 WHERE x = $2; t | 1 | EXPLAIN (COSTS OFF) VALUES ($1) - f | 1 | INSERT INTO stats_track_tab VALUES (($1)) - f | 1 | MERGE INTO stats_track_tab + - | | USING (SELECT id FROM generate_series($1, $2) id) ON x = id + - | | WHEN MATCHED THEN UPDATE SET x = id + - | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id) - f | 1 | SELECT $1 - f | 1 | SELECT $1 UNION SELECT $2 - f | 1 | SELECT $1, $2 + f | 1 | EXPLAIN (COSTS OFF) VALUES ($1); t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t - f | 1 | TABLE stats_track_tab - f | 1 | TABLE test_table - f | 1 | UPDATE stats_track_tab SET x = $1 WHERE x = $2 - f | 1 | VALUES ($1) - f | 1 | VALUES ($1, $2) (23 rows) -- EXPLAIN - top-level tracking. 
@@ -405,20 +405,20 @@ EXPLAIN (COSTS OFF) SELECT 1, 2 UNION SELECT 3, 4\; EXPLAIN (COSTS OFF) (SELECT SELECT toplevel, calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; - toplevel | calls | query -----------+-------+----------------------------------------------------------------- - f | 1 | (SELECT $1, $2, $3) UNION SELECT $4, $5, $6 + toplevel | calls | query +----------+-------+--------------------------------------------------------------------------------------------------------------------- t | 1 | EXPLAIN (COSTS OFF) (SELECT $1, $2, $3) t | 1 | EXPLAIN (COSTS OFF) (SELECT $1, $2, $3) UNION SELECT $4, $5, $6 + f | 1 | EXPLAIN (COSTS OFF) (SELECT $1, $2, $3); EXPLAIN (COSTS OFF) (SELECT 1, 2, 3, 4); t | 1 | EXPLAIN (COSTS OFF) (SELECT $1, $2, $3, $4) + f | 1 | EXPLAIN (COSTS OFF) (SELECT 1, 2, 3); EXPLAIN (COSTS OFF) (SELECT $1, $2, $3, $4); t | 1 | EXPLAIN (COSTS OFF) SELECT $1 t | 1 | EXPLAIN (COSTS OFF) SELECT $1, $2 t | 1 | EXPLAIN (COSTS OFF) SELECT $1, $2 UNION SELECT $3, $4 - f | 1 | SELECT $1 - f | 1 | SELECT $1, $2 - f | 1 | SELECT $1, $2 UNION SELECT $3, $4 - f | 1 | SELECT $1, $2, $3 - f | 1 | SELECT $1, $2, $3, $4 + f | 1 | EXPLAIN (COSTS OFF) SELECT $1, $2 UNION SELECT $3, $4; EXPLAIN (COSTS OFF) (SELECT 1, 2, 3) UNION SELECT 3, 4, 5; + f | 1 | EXPLAIN (COSTS OFF) SELECT $1; EXPLAIN (COSTS OFF) SELECT 1, 2; + f | 1 | EXPLAIN (COSTS OFF) SELECT 1, 2 UNION SELECT 3, 4; EXPLAIN (COSTS OFF) (SELECT $1, $2, $3) UNION SELECT $4, $5, $6; + f | 1 | EXPLAIN (COSTS OFF) SELECT 1; EXPLAIN (COSTS OFF) SELECT $1, $2; t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (13 rows) @@ -494,29 +494,29 @@ EXPLAIN (COSTS OFF) INSERT INTO stats_track_tab VALUES ((1))\; EXPLAIN (COSTS OF SELECT toplevel, calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; - toplevel | calls | query -----------+-------+-------------------------------------------------------------------- - f | 1 | DELETE FROM stats_track_tab - f | 1 | DELETE FROM stats_track_tab WHERE x = $1 + toplevel | calls | query +----------+-------+---------------------------------------------------------------------------------------------------------------------------------- t | 1 | EXPLAIN (COSTS OFF) (TABLE test_table) t | 1 | EXPLAIN (COSTS OFF) (VALUES ($1, $2)) t | 1 | EXPLAIN (COSTS OFF) DELETE FROM stats_track_tab t | 1 | EXPLAIN (COSTS OFF) DELETE FROM stats_track_tab WHERE x = $1 + f | 1 | EXPLAIN (COSTS OFF) DELETE FROM stats_track_tab; EXPLAIN (COSTS OFF) DELETE FROM stats_track_tab WHERE x = $1; + f | 1 | EXPLAIN (COSTS OFF) DELETE FROM stats_track_tab; EXPLAIN (COSTS OFF) DELETE FROM stats_track_tab WHERE x = 1; t | 1 | EXPLAIN (COSTS OFF) INSERT INTO stats_track_tab VALUES ($1), ($2) t | 1 | EXPLAIN (COSTS OFF) INSERT INTO stats_track_tab VALUES (($1)) + f | 1 | EXPLAIN (COSTS OFF) INSERT INTO stats_track_tab VALUES (($1)); EXPLAIN (COSTS OFF) INSERT INTO stats_track_tab VALUES (1), (2); + f | 1 | EXPLAIN (COSTS OFF) INSERT INTO stats_track_tab VALUES ((1)); EXPLAIN (COSTS OFF) INSERT INTO stats_track_tab VALUES ($1), ($2); t | 1 | EXPLAIN (COSTS OFF) TABLE stats_track_tab + f | 1 | EXPLAIN (COSTS OFF) TABLE stats_track_tab; EXPLAIN (COSTS OFF) (TABLE test_table); + f | 1 | EXPLAIN (COSTS OFF) TABLE stats_track_tab; EXPLAIN (COSTS OFF) (TABLE test_table); t | 1 | EXPLAIN (COSTS OFF) UPDATE stats_track_tab SET x = $1 t | 1 | EXPLAIN (COSTS OFF) UPDATE stats_track_tab SET x = $1 WHERE x = $2 + f | 1 | EXPLAIN (COSTS OFF) UPDATE stats_track_tab SET x = $1 WHERE x = $2; EXPLAIN (COSTS OFF) 
UPDATE stats_track_tab SET x = 1; + f | 1 | EXPLAIN (COSTS OFF) UPDATE stats_track_tab SET x = 1 WHERE x = 1; EXPLAIN (COSTS OFF) UPDATE stats_track_tab SET x = $1; t | 1 | EXPLAIN (COSTS OFF) VALUES ($1) - f | 1 | INSERT INTO stats_track_tab VALUES ($1), ($2) - f | 1 | INSERT INTO stats_track_tab VALUES (($1)) + f | 1 | EXPLAIN (COSTS OFF) VALUES ($1); EXPLAIN (COSTS OFF) (VALUES (1, 2)); + f | 1 | EXPLAIN (COSTS OFF) VALUES (1); EXPLAIN (COSTS OFF) (VALUES ($1, $2)); t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t - f | 1 | TABLE stats_track_tab - f | 1 | TABLE test_table - f | 1 | UPDATE stats_track_tab SET x = $1 - f | 1 | UPDATE stats_track_tab SET x = $1 WHERE x = $2 - f | 1 | VALUES ($1) - f | 1 | VALUES ($1, $2) (21 rows) SELECT pg_stat_statements_reset() IS NOT NULL AS t; @@ -547,18 +547,21 @@ EXPLAIN (COSTS OFF) MERGE INTO stats_track_tab SELECT toplevel, calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; - toplevel | calls | query -----------+-------+--------------------------------------------------------------- - t | 1 | EXPLAIN (COSTS OFF) MERGE INTO stats_track_tab + - | | USING (SELECT id FROM generate_series($1, $2) id) ON x = id+ - | | WHEN MATCHED THEN UPDATE SET x = id + + toplevel | calls | query +----------+-------+------------------------------------------------------------------------------------------------ + t | 1 | EXPLAIN (COSTS OFF) MERGE INTO stats_track_tab + + | | USING (SELECT id FROM generate_series($1, $2) id) ON x = id + + | | WHEN MATCHED THEN UPDATE SET x = id + | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id) + f | 1 | EXPLAIN (COSTS OFF) MERGE INTO stats_track_tab + + | | USING (SELECT id FROM generate_series($1, $2) id) ON x = id + + | | WHEN MATCHED THEN UPDATE SET x = id + + | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id); EXPLAIN (COSTS OFF) SELECT 1, 2, 3, 4, 5; + f | 1 | EXPLAIN (COSTS OFF) MERGE INTO stats_track_tab + + | | USING (SELECT id FROM generate_series(1, 10) id) ON x = id + + | | WHEN MATCHED THEN UPDATE SET x = id + + | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id); EXPLAIN (COSTS OFF) SELECT $1, $2, $3, $4, $5; t | 1 | EXPLAIN (COSTS OFF) SELECT $1, $2, $3, $4, $5 - f | 1 | MERGE INTO stats_track_tab + - | | USING (SELECT id FROM generate_series($1, $2) id) ON x = id+ - | | WHEN MATCHED THEN UPDATE SET x = id + - | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id) - f | 1 | SELECT $1, $2, $3, $4, $5 t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (5 rows) @@ -786,29 +789,29 @@ EXPLAIN (COSTS OFF) WITH a AS (select 4) SELECT 1 UNION SELECT 2; SELECT toplevel, calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; - toplevel | calls | query -----------+-------+------------------------------------------------------------------------------------------ + toplevel | calls | query +----------+-------+------------------------------------------------------------------------------------------- t | 1 | EXPLAIN (COSTS OFF) (WITH a AS (SELECT $1) (SELECT $2, $3)) + f | 1 | EXPLAIN (COSTS OFF) (WITH a AS (SELECT $1) (SELECT $2, $3)); t | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) DELETE FROM stats_track_tab + f | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) DELETE FROM stats_track_tab; t | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) INSERT INTO stats_track_tab VALUES (($2)) - t | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) MERGE INTO stats_track_tab + - | | USING (SELECT id FROM generate_series($2, $3) id) ON x = id + - | | WHEN MATCHED THEN UPDATE SET x = id + + f | 1 | EXPLAIN (COSTS OFF) 
WITH a AS (SELECT $1) INSERT INTO stats_track_tab VALUES (($2)); + t | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) MERGE INTO stats_track_tab + + | | USING (SELECT id FROM generate_series($2, $3) id) ON x = id + + | | WHEN MATCHED THEN UPDATE SET x = id + | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id) + f | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) MERGE INTO stats_track_tab + + | | USING (SELECT id FROM generate_series($2, $3) id) ON x = id + + | | WHEN MATCHED THEN UPDATE SET x = id + + | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id); t | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) SELECT $2 + f | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) SELECT $2; t | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) UPDATE stats_track_tab SET x = $2 WHERE x = $3 + f | 1 | EXPLAIN (COSTS OFF) WITH a AS (SELECT $1) UPDATE stats_track_tab SET x = $2 WHERE x = $3; t | 1 | EXPLAIN (COSTS OFF) WITH a AS (select $1) SELECT $2 UNION SELECT $3 + f | 1 | EXPLAIN (COSTS OFF) WITH a AS (select $1) SELECT $2 UNION SELECT $3; t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t - f | 1 | WITH a AS (SELECT $1) (SELECT $2, $3) - f | 1 | WITH a AS (SELECT $1) DELETE FROM stats_track_tab - f | 1 | WITH a AS (SELECT $1) INSERT INTO stats_track_tab VALUES (($2)) - f | 1 | WITH a AS (SELECT $1) MERGE INTO stats_track_tab + - | | USING (SELECT id FROM generate_series($2, $3) id) ON x = id + - | | WHEN MATCHED THEN UPDATE SET x = id + - | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id) - f | 1 | WITH a AS (SELECT $1) SELECT $2 - f | 1 | WITH a AS (SELECT $1) UPDATE stats_track_tab SET x = $2 WHERE x = $3 - f | 1 | WITH a AS (select $1) SELECT $2 UNION SELECT $3 (15 rows) -- EXPLAIN with CTEs - top-level tracking @@ -918,13 +921,14 @@ EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT toplevel, calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; - toplevel | calls | query -----------+-------+------------------------------------------------------------------------------ - t | 1 | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) + + toplevel | calls | query +----------+-------+------------------------------------------------------------------------------- + t | 1 | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) + | | DECLARE foocur CURSOR FOR SELECT * FROM stats_track_tab + f | 1 | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) + + | | DECLARE foocur CURSOR FOR SELECT * FROM stats_track_tab; t | 1 | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT $1 - f | 1 | SELECT $1 - f | 1 | SELECT * FROM stats_track_tab + f | 1 | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT $1; t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (5 rows) @@ -1047,10 +1051,10 @@ SELECT toplevel, calls, query FROM pg_stat_statements toplevel | calls | query ----------+-------+----------------------------------------------------------------- t | 1 | CREATE TEMPORARY TABLE pgss_ctas_1 AS SELECT $1 + f | 1 | CREATE TEMPORARY TABLE pgss_ctas_1 AS SELECT $1; t | 1 | CREATE TEMPORARY TABLE pgss_ctas_2 AS EXECUTE test_prepare_pgss - f | 1 | SELECT $1 + f | 1 | PREPARE test_prepare_pgss AS select generate_series($1, $2) t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t - f | 1 | select generate_series($1, $2) (5 rows) -- CREATE TABLE AS, top-level tracking. 
@@ -1088,10 +1092,10 @@ EXPLAIN (COSTS OFF) CREATE TEMPORARY TABLE pgss_explain_ctas AS SELECT 1; SELECT toplevel, calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; - toplevel | calls | query -----------+-------+--------------------------------------------------------------------------- + toplevel | calls | query +----------+-------+---------------------------------------------------------------------------- t | 1 | EXPLAIN (COSTS OFF) CREATE TEMPORARY TABLE pgss_explain_ctas AS SELECT $1 - f | 1 | SELECT $1 + f | 1 | EXPLAIN (COSTS OFF) CREATE TEMPORARY TABLE pgss_explain_ctas AS SELECT $1; t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (3 rows) @@ -1136,14 +1140,14 @@ CLOSE foocur; COMMIT; SELECT toplevel, calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; - toplevel | calls | query -----------+-------+--------------------------------------------------------- + toplevel | calls | query +----------+-------+---------------------------------------------------------- t | 1 | BEGIN t | 1 | CLOSE foocur t | 1 | COMMIT t | 1 | DECLARE FOOCUR CURSOR FOR SELECT * from stats_track_tab - t | 1 | FETCH FORWARD 1 FROM foocur - f | 1 | SELECT * from stats_track_tab + f | 1 | DECLARE FOOCUR CURSOR FOR SELECT * from stats_track_tab; + t | 1 | FETCH FORWARD $1 FROM foocur t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (7 rows) @@ -1172,7 +1176,7 @@ SELECT toplevel, calls, query FROM pg_stat_statements t | 1 | CLOSE foocur t | 1 | COMMIT t | 1 | DECLARE FOOCUR CURSOR FOR SELECT * FROM stats_track_tab - t | 1 | FETCH FORWARD 1 FROM foocur + t | 1 | FETCH FORWARD $1 FROM foocur t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (6 rows) @@ -1203,25 +1207,25 @@ COPY (DELETE FROM stats_track_tab WHERE x = 2 RETURNING x) TO stdout; 2 SELECT toplevel, calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; - toplevel | calls | query -----------+-------+--------------------------------------------------------------------------- + toplevel | calls | query +----------+-------+----------------------------------------------------------------------------- + f | 1 | COPY (DELETE FROM stats_track_tab WHERE x = $1 RETURNING x) TO stdout t | 1 | COPY (DELETE FROM stats_track_tab WHERE x = 2 RETURNING x) TO stdout + f | 1 | COPY (INSERT INTO stats_track_tab (x) VALUES ($1) RETURNING x) TO stdout t | 1 | COPY (INSERT INTO stats_track_tab (x) VALUES (1) RETURNING x) TO stdout - t | 1 | COPY (MERGE INTO stats_track_tab USING (SELECT 1 id) ON x = id + - | | WHEN MATCHED THEN UPDATE SET x = id + + f | 1 | COPY (MERGE INTO stats_track_tab USING (SELECT $1 id) ON x = id + + | | WHEN MATCHED THEN UPDATE SET x = id + | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id) RETURNING x) TO stdout + t | 1 | COPY (MERGE INTO stats_track_tab USING (SELECT 1 id) ON x = id + + | | WHEN MATCHED THEN UPDATE SET x = id + + | | WHEN NOT MATCHED THEN INSERT (x) VALUES (id) RETURNING x) TO stdout + f | 1 | COPY (SELECT $1 UNION SELECT $2) TO stdout + f | 1 | COPY (SELECT $1) TO stdout t | 1 | COPY (SELECT 1 UNION SELECT 2) TO stdout t | 1 | COPY (SELECT 1) TO stdout + f | 1 | COPY (UPDATE stats_track_tab SET x = $1 WHERE x = $2 RETURNING x) TO stdout t | 1 | COPY (UPDATE stats_track_tab SET x = 2 WHERE x = 1 RETURNING x) TO stdout - f | 1 | DELETE FROM stats_track_tab WHERE x = $1 RETURNING x - f | 1 | INSERT INTO stats_track_tab (x) VALUES ($1) RETURNING x - f | 1 | MERGE INTO stats_track_tab USING (SELECT $1 id) ON x = id + - | | WHEN MATCHED THEN UPDATE SET x = id + - | | WHEN 
NOT MATCHED THEN INSERT (x) VALUES (id) RETURNING x
- f | 1 | SELECT $1
- f | 1 | SELECT $1 UNION SELECT $2
  t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t
- f | 1 | UPDATE stats_track_tab SET x = $1 WHERE x = $2 RETURNING x
 (13 rows)
 
 -- COPY - top-level tracking.
diff --git a/contrib/pg_stat_statements/expected/oldextversions.out b/contrib/pg_stat_statements/expected/oldextversions.out
index de679b19711ab..726383a99d7c1 100644
--- a/contrib/pg_stat_statements/expected/oldextversions.out
+++ b/contrib/pg_stat_statements/expected/oldextversions.out
@@ -407,4 +407,71 @@ SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
  t
 (1 row)
 
+-- New functions and views for pg_stat_statements in 1.13
+ALTER EXTENSION pg_stat_statements UPDATE TO '1.13';
+\d pg_stat_statements
+ View "public.pg_stat_statements"
+ Column | Type | Collation | Nullable | Default
+----------------------------+--------------------------+-----------+----------+---------
+ userid | oid | | |
+ dbid | oid | | |
+ toplevel | boolean | | |
+ queryid | bigint | | |
+ query | text | | |
+ plans | bigint | | |
+ total_plan_time | double precision | | |
+ min_plan_time | double precision | | |
+ max_plan_time | double precision | | |
+ mean_plan_time | double precision | | |
+ stddev_plan_time | double precision | | |
+ calls | bigint | | |
+ total_exec_time | double precision | | |
+ min_exec_time | double precision | | |
+ max_exec_time | double precision | | |
+ mean_exec_time | double precision | | |
+ stddev_exec_time | double precision | | |
+ rows | bigint | | |
+ shared_blks_hit | bigint | | |
+ shared_blks_read | bigint | | |
+ shared_blks_dirtied | bigint | | |
+ shared_blks_written | bigint | | |
+ local_blks_hit | bigint | | |
+ local_blks_read | bigint | | |
+ local_blks_dirtied | bigint | | |
+ local_blks_written | bigint | | |
+ temp_blks_read | bigint | | |
+ temp_blks_written | bigint | | |
+ shared_blk_read_time | double precision | | |
+ shared_blk_write_time | double precision | | |
+ local_blk_read_time | double precision | | |
+ local_blk_write_time | double precision | | |
+ temp_blk_read_time | double precision | | |
+ temp_blk_write_time | double precision | | |
+ wal_records | bigint | | |
+ wal_fpi | bigint | | |
+ wal_bytes | numeric | | |
+ wal_buffers_full | bigint | | |
+ jit_functions | bigint | | |
+ jit_generation_time | double precision | | |
+ jit_inlining_count | bigint | | |
+ jit_inlining_time | double precision | | |
+ jit_optimization_count | bigint | | |
+ jit_optimization_time | double precision | | |
+ jit_emission_count | bigint | | |
+ jit_emission_time | double precision | | |
+ jit_deform_count | bigint | | |
+ jit_deform_time | double precision | | |
+ parallel_workers_to_launch | bigint | | |
+ parallel_workers_launched | bigint | | |
+ generic_plan_calls | bigint | | |
+ custom_plan_calls | bigint | | |
+ stats_since | timestamp with time zone | | |
+ minmax_stats_since | timestamp with time zone | | |
+
+SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
+ has_data
+----------
+ t
+(1 row)
+
 DROP EXTENSION pg_stat_statements;
diff --git a/contrib/pg_stat_statements/expected/plancache.out b/contrib/pg_stat_statements/expected/plancache.out
new file mode 100644
index 0000000000000..e152de9f55130
--- /dev/null
+++ b/contrib/pg_stat_statements/expected/plancache.out
@@ -0,0 +1,224 @@
+--
+-- Tests with plan cache
+--
+-- Setup
+CREATE OR REPLACE FUNCTION select_one_func(int) RETURNS VOID AS $$
+DECLARE
+ ret INT;
+BEGIN
+ SELECT $1 INTO ret;
+END;
+$$ LANGUAGE
plpgsql; +CREATE OR REPLACE PROCEDURE select_one_proc(int) AS $$ +DECLARE + ret INT; +BEGIN + SELECT $1 INTO ret; +END; +$$ LANGUAGE plpgsql; +-- Prepared statements +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +PREPARE p1 AS SELECT $1 AS a; +SET plan_cache_mode TO force_generic_plan; +EXECUTE p1(1); + a +--- + 1 +(1 row) + +SET plan_cache_mode TO force_custom_plan; +EXECUTE p1(1); + a +--- + 1 +(1 row) + +SELECT calls, generic_plan_calls, custom_plan_calls, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + calls | generic_plan_calls | custom_plan_calls | query +-------+--------------------+-------------------+---------------------------------------------------- + 2 | 1 | 1 | PREPARE p1 AS SELECT $1 AS a + 1 | 0 | 0 | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | SET plan_cache_mode TO $1 +(3 rows) + +DEALLOCATE p1; +-- Extended query protocol +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT $1 AS a \parse p1 +SET plan_cache_mode TO force_generic_plan; +\bind_named p1 1 +; + a +--- + 1 +(1 row) + +SET plan_cache_mode TO force_custom_plan; +\bind_named p1 1 +; + a +--- + 1 +(1 row) + +SELECT calls, generic_plan_calls, custom_plan_calls, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + calls | generic_plan_calls | custom_plan_calls | query +-------+--------------------+-------------------+---------------------------------------------------- + 2 | 1 | 1 | SELECT $1 AS a + 1 | 0 | 0 | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | SET plan_cache_mode TO $1 +(3 rows) + +\close_prepared p1 +-- EXPLAIN [ANALYZE] EXECUTE +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +PREPARE p1 AS SELECT $1; +SET plan_cache_mode TO force_generic_plan; +EXPLAIN (COSTS OFF) EXECUTE p1(1); + QUERY PLAN +------------ + Result +(1 row) + +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1); + QUERY PLAN +----------------------------------- + Result (actual rows=1.00 loops=1) +(1 row) + +SET plan_cache_mode TO force_custom_plan; +EXPLAIN (COSTS OFF) EXECUTE p1(1); + QUERY PLAN +------------ + Result +(1 row) + +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1); + QUERY PLAN +----------------------------------- + Result (actual rows=1.00 loops=1) +(1 row) + +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + calls | generic_plan_calls | custom_plan_calls | toplevel | query +-------+--------------------+-------------------+----------+---------------------------------------------------------------------------------- + 2 | 0 | 0 | t | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1) + 2 | 0 | 0 | t | EXPLAIN (COSTS OFF) EXECUTE p1(1) + 4 | 2 | 2 | f | PREPARE p1 AS SELECT $1 + 1 | 0 | 0 | t | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | t | SET plan_cache_mode TO $1 +(5 rows) + +RESET pg_stat_statements.track; +DEALLOCATE p1; +-- Functions/procedures +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SET plan_cache_mode TO force_generic_plan; +SELECT select_one_func(1); + select_one_func +----------------- + +(1 row) + +CALL select_one_proc(1); +SET plan_cache_mode TO force_custom_plan; +SELECT select_one_func(1); + select_one_func +----------------- + +(1 row) + +CALL 
select_one_proc(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + calls | generic_plan_calls | custom_plan_calls | toplevel | query +-------+--------------------+-------------------+----------+---------------------------------------------------- + 2 | 0 | 0 | t | CALL select_one_proc($1) + 4 | 2 | 2 | f | SELECT $1 + 1 | 0 | 0 | t | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | t | SELECT select_one_func($1) + 2 | 0 | 0 | t | SET plan_cache_mode TO $1 +(5 rows) + +-- +-- EXPLAIN [ANALYZE] EXECUTE + functions/procedures +-- +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SET plan_cache_mode TO force_generic_plan; +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func(1); + QUERY PLAN +----------------------------------- + Result (actual rows=1.00 loops=1) +(1 row) + +EXPLAIN (COSTS OFF) SELECT select_one_func(1); + QUERY PLAN +------------ + Result +(1 row) + +CALL select_one_proc(1); +SET plan_cache_mode TO force_custom_plan; +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func(1); + QUERY PLAN +----------------------------------- + Result (actual rows=1.00 loops=1) +(1 row) + +EXPLAIN (COSTS OFF) SELECT select_one_func(1); + QUERY PLAN +------------ + Result +(1 row) + +CALL select_one_proc(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C", toplevel; + calls | generic_plan_calls | custom_plan_calls | toplevel | query +-------+--------------------+-------------------+----------+------------------------------------------------------------------------------------------------ + 2 | 0 | 0 | t | CALL select_one_proc($1) + 2 | 0 | 0 | t | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func($1) + 4 | 0 | 0 | f | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func($1); + 2 | 0 | 0 | t | EXPLAIN (COSTS OFF) SELECT select_one_func($1) + 4 | 2 | 2 | f | SELECT $1 + 1 | 0 | 0 | t | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | t | SET plan_cache_mode TO $1 +(7 rows) + +RESET pg_stat_statements.track; +-- +-- Cleanup +-- +DROP FUNCTION select_one_func(int); +DROP PROCEDURE select_one_proc(int); diff --git a/contrib/pg_stat_statements/expected/planning.out b/contrib/pg_stat_statements/expected/planning.out index 3ee1928cbe94a..9effd11fdc859 100644 --- a/contrib/pg_stat_statements/expected/planning.out +++ b/contrib/pg_stat_statements/expected/planning.out @@ -58,7 +58,7 @@ SELECT 42; (1 row) SELECT plans, calls, rows, query FROM pg_stat_statements - WHERE query NOT LIKE 'SELECT COUNT%' ORDER BY query COLLATE "C"; + WHERE query NOT LIKE 'PREPARE%' ORDER BY query COLLATE "C"; plans | calls | rows | query -------+-------+------+---------------------------------------------------------- 0 | 1 | 0 | ALTER TABLE stats_plan_test ADD COLUMN x int @@ -72,10 +72,10 @@ SELECT plans, calls, rows, query FROM pg_stat_statements -- for the prepared statement we expect at least one replan, but cache -- invalidations could force more SELECT plans >= 2 AND plans <= calls AS plans_ok, calls, rows, query FROM pg_stat_statements - WHERE query LIKE 'SELECT COUNT%' ORDER BY query COLLATE "C"; - plans_ok | calls | rows | query -----------+-------+------+-------------------------------------- - t | 4 | 4 | SELECT COUNT(*) 
FROM stats_plan_test + WHERE query LIKE 'PREPARE%' ORDER BY query COLLATE "C"; + plans_ok | calls | rows | query +----------+-------+------+------------------------------------------------------- + t | 4 | 4 | PREPARE prep1 AS SELECT COUNT(*) FROM stats_plan_test (1 row) -- Cleanup diff --git a/contrib/pg_stat_statements/expected/select.out b/contrib/pg_stat_statements/expected/select.out index 09476a7b699e9..75c896f388512 100644 --- a/contrib/pg_stat_statements/expected/select.out +++ b/contrib/pg_stat_statements/expected/select.out @@ -208,6 +208,7 @@ DEALLOCATE pgss_test; SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; calls | rows | query -------+------+------------------------------------------------------------------------------ + 1 | 1 | PREPARE pgss_test (int) AS SELECT $1, $2 LIMIT $3 4 | 4 | SELECT $1 + | | -- but this one will appear + | | AS "text" @@ -221,7 +222,6 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 2 | 2 | SELECT $1 AS "int" ORDER BY 1 1 | 2 | SELECT $1 AS i UNION SELECT $2 ORDER BY i 1 | 1 | SELECT $1 || $2 - 1 | 1 | SELECT $1, $2 LIMIT $3 2 | 2 | SELECT DISTINCT $1 AS "int" 0 | 0 | SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C" 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t @@ -238,6 +238,65 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t; t (1 row) +-- normalization of constants and parameters, with constant locations +-- recorded one or more times. +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT WHERE '1' IN ('1'::int, '3'::int::text); +-- +(1 row) + +SELECT WHERE (1, 2) IN ((1, 2), (2, 3)); +-- +(1 row) + +SELECT WHERE (3, 4) IN ((5, 6), (8, 7)); +-- +(0 rows) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +------------------------------------------------------------------------+------- + SELECT WHERE $1 IN ($2::int, $3::int::text) | 1 + SELECT WHERE ($1, $2) IN (($3, $4), ($5, $6)) | 2 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" | 0 +(4 rows) + +-- with the last element being an explicit function call with an argument, ensure +-- the normalization of the squashing interval is correct. +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT WHERE 1 IN (1, int4(1), int4(2)); +-- +(1 row) + +SELECT WHERE 1 = ANY (ARRAY[1, int4(1), int4(2)]); +-- +(1 row) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +------------------------------------------------------------------------+------- + SELECT WHERE $1 IN ($2 /*, ... 
*/) | 2 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" | 0 +(3 rows) + -- -- queries with locking clauses -- diff --git a/contrib/pg_stat_statements/expected/squashing.out b/contrib/pg_stat_statements/expected/squashing.out index 7b138af098c9f..d5bb67c7222fa 100644 --- a/contrib/pg_stat_statements/expected/squashing.out +++ b/contrib/pg_stat_statements/expected/squashing.out @@ -2,9 +2,11 @@ -- Const squashing functionality -- CREATE EXTENSION pg_stat_statements; +-- +-- Simple Lists +-- CREATE TABLE test_squash (id int, data int); --- IN queries --- Normal scenario, too many simple constants for an IN query +-- single element will not be squashed SELECT pg_stat_statements_reset() IS NOT NULL AS t; t --- @@ -16,42 +18,150 @@ SELECT * FROM test_squash WHERE id IN (1); ----+------ (0 rows) +SELECT ARRAY[1]; + array +------- + {1} +(1 row) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +----------------------------------------------------+------- + SELECT * FROM test_squash WHERE id IN ($1) | 1 + SELECT ARRAY[$1] | 1 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(3 rows) + +-- more than 1 element in a list will be squashed +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + SELECT * FROM test_squash WHERE id IN (1, 2, 3); id | data ----+------ (0 rows) +SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4); + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5); + id | data +----+------ +(0 rows) + +SELECT ARRAY[1, 2, 3]; + array +--------- + {1,2,3} +(1 row) + +SELECT ARRAY[1, 2, 3, 4]; + array +----------- + {1,2,3,4} +(1 row) + +SELECT ARRAY[1, 2, 3, 4, 5]; + array +------------- + {1,2,3,4,5} +(1 row) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; query | calls -------------------------------------------------------+------- - SELECT * FROM test_squash WHERE id IN ($1 /*, ... */) | 1 - SELECT * FROM test_squash WHERE id IN ($1) | 1 + SELECT * FROM test_squash WHERE id IN ($1 /*, ... */) | 3 + SELECT ARRAY[$1 /*, ... */] | 3 SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (3 rows) -SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9); +-- built-in functions will be squashed +-- the IN and ARRAY forms of this statement will have the same queryId +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT WHERE 1 IN (1, int4(1), int4(2), 2); +-- +(1 row) + +SELECT WHERE 1 = ANY (ARRAY[1, int4(1), int4(2), 2]); +-- +(1 row) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +----------------------------------------------------+------- + SELECT WHERE $1 IN ($2 /*, ... */) | 2 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(2 rows) + +-- external parameters will be squashed +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT * FROM test_squash WHERE id IN ($1, $2, $3, $4, $5) \bind 1 2 3 4 5 +; + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash WHERE id::text = ANY(ARRAY[$1, $2, $3, $4, $5]) \bind 1 2 3 4 5 +; id | data ----+------ (0 rows) -SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10); +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +----------------------------------------------------------------------+------- + SELECT * FROM test_squash WHERE id IN ($1 /*, ... 
*/) | 1 + SELECT * FROM test_squash WHERE id::text = ANY(ARRAY[$1 /*, ... */]) | 1 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(3 rows) + +-- prepared statements will also be squashed +-- the IN and ARRAY forms of this statement will have the same queryId +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +PREPARE p1(int, int, int, int, int) AS +SELECT * FROM test_squash WHERE id IN ($1, $2, $3, $4, $5); +EXECUTE p1(1, 2, 3, 4, 5); id | data ----+------ (0 rows) -SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); +DEALLOCATE p1; +PREPARE p1(int, int, int, int, int) AS +SELECT * FROM test_squash WHERE id = ANY(ARRAY[$1, $2, $3, $4, $5]); +EXECUTE p1(1, 2, 3, 4, 5); id | data ----+------ (0 rows) +DEALLOCATE p1; SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; - query | calls -------------------------------------------------------------------------+------- - SELECT * FROM test_squash WHERE id IN ($1 /*, ... */) | 4 - SELECT * FROM test_squash WHERE id IN ($1) | 1 - SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 - SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" | 1 -(4 rows) + query | calls +-------------------------------------------------------+------- + DEALLOCATE $1 | 2 + PREPARE p1(int, int, int, int, int) AS +| 2 + SELECT * FROM test_squash WHERE id IN ($1 /*, ... */) | + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(3 rows) -- More conditions in the query SELECT pg_stat_statements_reset() IS NOT NULL AS t; @@ -75,10 +185,25 @@ SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) AND da ----+------ (0 rows) +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9]) AND data = 2; + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) AND data = 2; + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) AND data = 2; + id | data +----+------ +(0 rows) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; query | calls ---------------------------------------------------------------------+------- - SELECT * FROM test_squash WHERE id IN ($1 /*, ... */) AND data = $2 | 3 + SELECT * FROM test_squash WHERE id IN ($1 /*, ... */) AND data = $2 | 6 SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (2 rows) @@ -107,24 +232,46 @@ SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) ----+------ (0 rows) +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9]) + AND data = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9]); + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + AND data = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) + AND data = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); + id | data +----+------ +(0 rows) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; query | calls -------------------------------------------------------+------- - SELECT * FROM test_squash WHERE id IN ($1 /*, ... */)+| 3 + SELECT * FROM test_squash WHERE id IN ($1 /*, ... */)+| 6 AND data IN ($2 /*, ... 
*/) | SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (2 rows) --- No constants simplification for OpExpr SELECT pg_stat_statements_reset() IS NOT NULL AS t; t --- t (1 row) --- In the following two queries the operator expressions (+) and (@) have --- different oppno, and will be given different query_id if squashed, even though --- the normalized query will be the same +-- No constants squashing for OpExpr +-- The IN and ARRAY forms of this statement will have the same queryId +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + SELECT * FROM test_squash WHERE id IN (1 + 1, 2 + 2, 3 + 3, 4 + 4, 5 + 5, 6 + 6, 7 + 7, 8 + 8, 9 + 9); id | data @@ -137,19 +284,35 @@ SELECT * FROM test_squash WHERE id IN ----+------ (0 rows) +SELECT * FROM test_squash WHERE id = ANY(ARRAY + [1 + 1, 2 + 2, 3 + 3, 4 + 4, 5 + 5, 6 + 6, 7 + 7, 8 + 8, 9 + 9]); + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash WHERE id = ANY(ARRAY + [@ '-1', @ '-2', @ '-3', @ '-4', @ '-5', @ '-6', @ '-7', @ '-8', @ '-9']); + id | data +----+------ +(0 rows) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; query | calls ----------------------------------------------------------------------------------------------------+------- - SELECT * FROM test_squash WHERE id IN +| 1 + SELECT * FROM test_squash WHERE id IN +| 2 ($1 + $2, $3 + $4, $5 + $6, $7 + $8, $9 + $10, $11 + $12, $13 + $14, $15 + $16, $17 + $18) | - SELECT * FROM test_squash WHERE id IN +| 1 + SELECT * FROM test_squash WHERE id IN +| 2 (@ $1, @ $2, @ $3, @ $4, @ $5, @ $6, @ $7, @ $8, @ $9) | SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (3 rows) +-- -- FuncExpr +-- -- Verify multiple type representation end up with the same query_id CREATE TABLE test_float (data float); +-- The casted ARRAY expressions will have the same queryId as the IN clause +-- form of the query SELECT pg_stat_statements_reset() IS NOT NULL AS t; t --- @@ -181,12 +344,38 @@ SELECT data FROM test_float WHERE data IN (1.0, 1.0); ------ (0 rows) +SELECT data FROM test_float WHERE data = ANY(ARRAY['1'::double precision, '2'::double precision]); + data +------ +(0 rows) + +SELECT data FROM test_float WHERE data = ANY(ARRAY[1.0::double precision, 1.0::double precision]); + data +------ +(0 rows) + +SELECT data FROM test_float WHERE data = ANY(ARRAY[1, 2]); + data +------ +(0 rows) + +SELECT data FROM test_float WHERE data = ANY(ARRAY[1, '2']); + data +------ +(0 rows) + +SELECT data FROM test_float WHERE data = ANY(ARRAY['1', 2]); + data +------ +(0 rows) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; - query | calls ------------------------------------------------------------+------- - SELECT data FROM test_float WHERE data IN ($1 /*, ... */) | 5 - SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 -(2 rows) + query | calls +--------------------------------------------------------------------+------- + SELECT data FROM test_float WHERE data = ANY(ARRAY[$1 /*, ... */]) | 3 + SELECT data FROM test_float WHERE data IN ($1 /*, ... 
*/) | 7 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(3 rows) -- Numeric type, implicit cast is squashed CREATE TABLE test_squash_numeric (id int, data numeric(5, 2)); @@ -201,12 +390,18 @@ SELECT * FROM test_squash_numeric WHERE data IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ----+------ (0 rows) +SELECT * FROM test_squash_numeric WHERE data = ANY(ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); + id | data +----+------ +(0 rows) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; - query | calls ------------------------------------------------------------------+------- - SELECT * FROM test_squash_numeric WHERE data IN ($1 /*, ... */) | 1 - SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 -(2 rows) + query | calls +--------------------------------------------------------------------------+------- + SELECT * FROM test_squash_numeric WHERE data = ANY(ARRAY[$1 /*, ... */]) | 1 + SELECT * FROM test_squash_numeric WHERE data IN ($1 /*, ... */) | 1 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(3 rows) -- Bigint, implicit cast is squashed CREATE TABLE test_squash_bigint (id int, data bigint); @@ -221,14 +416,20 @@ SELECT * FROM test_squash_bigint WHERE data IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1 ----+------ (0 rows) +SELECT * FROM test_squash_bigint WHERE data = ANY(ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); + id | data +----+------ +(0 rows) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; - query | calls -----------------------------------------------------------------+------- - SELECT * FROM test_squash_bigint WHERE data IN ($1 /*, ... */) | 1 - SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 -(2 rows) + query | calls +-------------------------------------------------------------------------+------- + SELECT * FROM test_squash_bigint WHERE data = ANY(ARRAY[$1 /*, ... */]) | 1 + SELECT * FROM test_squash_bigint WHERE data IN ($1 /*, ... */) | 1 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(3 rows) --- Bigint, explicit cast is not squashed +-- Bigint, explicit cast is squashed SELECT pg_stat_statements_reset() IS NOT NULL AS t; t --- @@ -242,15 +443,22 @@ SELECT * FROM test_squash_bigint WHERE data IN ----+------ (0 rows) +SELECT * FROM test_squash_bigint WHERE data = ANY(ARRAY[ + 1::bigint, 2::bigint, 3::bigint, 4::bigint, 5::bigint, 6::bigint, + 7::bigint, 8::bigint, 9::bigint, 10::bigint, 11::bigint]); + id | data +----+------ +(0 rows) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; query | calls ----------------------------------------------------+------- - SELECT * FROM test_squash_bigint WHERE data IN +| 1 - ($1 /*, ... */::bigint) | + SELECT * FROM test_squash_bigint WHERE data IN +| 2 + ($1 /*, ... 
*/) | SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (2 rows) --- Bigint, long tokens with parenthesis +-- Bigint, long tokens with parenthesis, will not squash SELECT pg_stat_statements_reset() IS NOT NULL AS t; t --- @@ -264,44 +472,47 @@ SELECT * FROM test_squash_bigint WHERE id IN ----+------ (0 rows) +SELECT * FROM test_squash_bigint WHERE id = ANY(ARRAY[ + abs(100), abs(200), abs(300), abs(400), abs(500), abs(600), abs(700), + abs(800), abs(900), abs(1000), ((abs(1100)))]); + id | data +----+------ +(0 rows) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; query | calls -------------------------------------------------------------------------+------- - SELECT * FROM test_squash_bigint WHERE id IN +| 1 + SELECT * FROM test_squash_bigint WHERE id IN +| 2 (abs($1), abs($2), abs($3), abs($4), abs($5), abs($6), abs($7),+| abs($8), abs($9), abs($10), ((abs($11)))) | SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (2 rows) --- CoerceViaIO, SubLink instead of a Const -CREATE TABLE test_squash_jsonb (id int, data jsonb); +-- Multiple FuncExpr's. Will not squash SELECT pg_stat_statements_reset() IS NOT NULL AS t; t --- t (1 row) -SELECT * FROM test_squash_jsonb WHERE data IN - ((SELECT '"1"')::jsonb, (SELECT '"2"')::jsonb, (SELECT '"3"')::jsonb, - (SELECT '"4"')::jsonb, (SELECT '"5"')::jsonb, (SELECT '"6"')::jsonb, - (SELECT '"7"')::jsonb, (SELECT '"8"')::jsonb, (SELECT '"9"')::jsonb, - (SELECT '"10"')::jsonb); - id | data -----+------ -(0 rows) +SELECT WHERE 1 IN (1::int::bigint::int, 2::int::bigint::int); +-- +(1 row) + +SELECT WHERE 1 = ANY(ARRAY[1::int::bigint::int, 2::int::bigint::int]); +-- +(1 row) SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; - query | calls -----------------------------------------------------------------------+------- - SELECT * FROM test_squash_jsonb WHERE data IN +| 1 - ((SELECT $1)::jsonb, (SELECT $2)::jsonb, (SELECT $3)::jsonb,+| - (SELECT $4)::jsonb, (SELECT $5)::jsonb, (SELECT $6)::jsonb,+| - (SELECT $7)::jsonb, (SELECT $8)::jsonb, (SELECT $9)::jsonb,+| - (SELECT $10)::jsonb) | - SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 + query | calls +----------------------------------------------------+------- + SELECT WHERE $1 IN ($2 /*, ... */) | 2 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (2 rows) +-- -- CoerceViaIO +-- -- Create some dummy type to force CoerceViaIO CREATE TYPE casttesttype; CREATE FUNCTION casttesttype_in(cstring) @@ -349,15 +560,25 @@ SELECT * FROM test_squash_cast WHERE data IN ----+------ (0 rows) +SELECT * FROM test_squash_cast WHERE data = ANY (ARRAY + [1::int4::casttesttype, 2::int4::casttesttype, 3::int4::casttesttype, + 4::int4::casttesttype, 5::int4::casttesttype, 6::int4::casttesttype, + 7::int4::casttesttype, 8::int4::casttesttype, 9::int4::casttesttype, + 10::int4::casttesttype, 11::int4::casttesttype]); + id | data +----+------ +(0 rows) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; query | calls ----------------------------------------------------+------- - SELECT * FROM test_squash_cast WHERE data IN +| 1 - ($1 /*, ... */::int4::casttesttype) | + SELECT * FROM test_squash_cast WHERE data IN +| 2 + ($1 /*, ... 
*/) | SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (2 rows) -- Some casting expression are simplified to Const +CREATE TABLE test_squash_jsonb (id int, data jsonb); SELECT pg_stat_statements_reset() IS NOT NULL AS t; t --- @@ -366,37 +587,169 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t; SELECT * FROM test_squash_jsonb WHERE data IN (('"1"')::jsonb, ('"2"')::jsonb, ('"3"')::jsonb, ('"4"')::jsonb, - ( '"5"')::jsonb, ( '"6"')::jsonb, ( '"7"')::jsonb, ( '"8"')::jsonb, - ( '"9"')::jsonb, ( '"10"')::jsonb); + ('"5"')::jsonb, ('"6"')::jsonb, ('"7"')::jsonb, ('"8"')::jsonb, + ('"9"')::jsonb, ('"10"')::jsonb); + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash_jsonb WHERE data = ANY (ARRAY + [('"1"')::jsonb, ('"2"')::jsonb, ('"3"')::jsonb, ('"4"')::jsonb, + ('"5"')::jsonb, ('"6"')::jsonb, ('"7"')::jsonb, ('"8"')::jsonb, + ('"9"')::jsonb, ('"10"')::jsonb]); + id | data +----+------ +(0 rows) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +----------------------------------------------------+------- + SELECT * FROM test_squash_jsonb WHERE data IN +| 2 + ($1 /*, ... */) | + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(2 rows) + +-- CoerceViaIO, SubLink instead of a Const. Will not squash +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT * FROM test_squash_jsonb WHERE data IN + ((SELECT '"1"')::jsonb, (SELECT '"2"')::jsonb, (SELECT '"3"')::jsonb, + (SELECT '"4"')::jsonb, (SELECT '"5"')::jsonb, (SELECT '"6"')::jsonb, + (SELECT '"7"')::jsonb, (SELECT '"8"')::jsonb, (SELECT '"9"')::jsonb, + (SELECT '"10"')::jsonb); id | data ----+------ (0 rows) +SELECT * FROM test_squash_jsonb WHERE data = ANY(ARRAY + [(SELECT '"1"')::jsonb, (SELECT '"2"')::jsonb, (SELECT '"3"')::jsonb, + (SELECT '"4"')::jsonb, (SELECT '"5"')::jsonb, (SELECT '"6"')::jsonb, + (SELECT '"7"')::jsonb, (SELECT '"8"')::jsonb, (SELECT '"9"')::jsonb, + (SELECT '"10"')::jsonb]); + id | data +----+------ +(0 rows) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +----------------------------------------------------------------------+------- + SELECT * FROM test_squash_jsonb WHERE data IN +| 2 + ((SELECT $1)::jsonb, (SELECT $2)::jsonb, (SELECT $3)::jsonb,+| + (SELECT $4)::jsonb, (SELECT $5)::jsonb, (SELECT $6)::jsonb,+| + (SELECT $7)::jsonb, (SELECT $8)::jsonb, (SELECT $9)::jsonb,+| + (SELECT $10)::jsonb) | + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(2 rows) + +-- Multiple CoerceViaIO are squashed +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT WHERE 1 IN (1::text::int::text::int, 1::text::int::text::int); +-- +(1 row) + +SELECT WHERE 1 = ANY(ARRAY[1::text::int::text::int, 1::text::int::text::int]); +-- +(1 row) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; query | calls ----------------------------------------------------+------- - SELECT * FROM test_squash_jsonb WHERE data IN +| 1 - (($1 /*, ... */)::jsonb) | + SELECT WHERE $1 IN ($2 /*, ... */) | 2 SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (2 rows) +-- -- RelabelType +-- SELECT pg_stat_statements_reset() IS NOT NULL AS t; t --- t (1 row) -SELECT * FROM test_squash WHERE id IN (1::oid, 2::oid, 3::oid, 4::oid, 5::oid, 6::oid, 7::oid, 8::oid, 9::oid); +-- However many layers of RelabelType there are, the list will be squashable. 
+SELECT * FROM test_squash WHERE id IN + (1::oid, 2::oid, 3::oid, 4::oid, 5::oid, 6::oid, 7::oid, 8::oid, 9::oid); id | data ----+------ (0 rows) +SELECT ARRAY[1::oid, 2::oid, 3::oid, 4::oid, 5::oid, 6::oid, 7::oid, 8::oid, 9::oid]; + array +--------------------- + {1,2,3,4,5,6,7,8,9} +(1 row) + +SELECT * FROM test_squash WHERE id IN (1::oid, 2::oid::int::oid); + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash WHERE id = ANY(ARRAY[1::oid, 2::oid::int::oid]); + id | data +----+------ +(0 rows) + +-- RelabelType together with CoerceViaIO is also squashable +SELECT * FROM test_squash WHERE id = ANY(ARRAY[1::oid::text::int::oid, 2::oid::int::oid]); + id | data +----+------ +(0 rows) + +SELECT * FROM test_squash WHERE id = ANY(ARRAY[1::text::int::oid, 2::oid::int::oid]); + id | data +----+------ +(0 rows) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +----------------------------------------------------+------- + SELECT * FROM test_squash WHERE id IN +| 5 + ($1 /*, ... */) | + SELECT ARRAY[$1 /*, ... */] | 1 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(3 rows) + +-- +-- edge cases +-- +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +-- for nested arrays, only constants are squashed +SELECT ARRAY[ + ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + ]; + array +----------------------------------------------------------------------------------------------- + {{1,2,3,4,5,6,7,8,9,10},{1,2,3,4,5,6,7,8,9,10},{1,2,3,4,5,6,7,8,9,10},{1,2,3,4,5,6,7,8,9,10}} +(1 row) + SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; - query | calls -------------------------------------------------------------+------- - SELECT * FROM test_squash WHERE id IN ($1 /*, ... */::oid) | 1 - SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 + query | calls +----------------------------------------------------+------- + SELECT ARRAY[ +| 1 + ARRAY[$1 /*, ... */], +| + ARRAY[$2 /*, ... */], +| + ARRAY[$3 /*, ... */], +| + ARRAY[$4 /*, ... */] +| + ] | + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 (2 rows) -- Test constants evaluation in a CTE, which was causing issues in the past @@ -409,23 +762,139 @@ FROM cte; -------- (0 rows) --- Simple array would be squashed as well SELECT pg_stat_statements_reset() IS NOT NULL AS t; t --- t (1 row) -SELECT ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; - array ------------------------- - {1,2,3,4,5,6,7,8,9,10} +-- Rewritten as an OpExpr, so it will not be squashed +select where '1' IN ('1'::int, '2'::int::text); +-- +(1 row) + +-- Rewritten as an ArrayExpr, so it will be squashed +select where '1' IN ('1'::int, '2'::int); +-- (1 row) SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; query | calls ----------------------------------------------------+------- - SELECT ARRAY[$1 /*, ... */] | 1 SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 + select where $1 IN ($2 /*, ... 
*/) | 1 + select where $1 IN ($2::int, $3::int::text) | 1 +(3 rows) + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +-- Both of these queries will be rewritten as an ArrayExpr, so they +-- will be squashed, and have a similar queryId +select where '1' IN ('1'::int::text, '2'::int::text); +-- +(1 row) + +select where '1' = ANY (array['1'::int::text, '2'::int::text]); +-- +(1 row) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +----------------------------------------------------+------- + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 + select where $1 IN ($2 /*, ... */) | 2 (2 rows) +-- composite function with row expansion +create table test_composite(x integer); +CREATE FUNCTION composite_f(a integer[], out x integer, out y integer) returns +record as $$ begin + x = a[1]; + y = a[2]; + end; +$$ language plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT ((composite_f(array[1, 2]))).* FROM test_composite; + x | y +---+--- +(0 rows) + +SELECT ((composite_f(array[1, 2, 3]))).* FROM test_composite; + x | y +---+--- +(0 rows) + +SELECT ((composite_f(array[1, 2, 3]))).*, 1, 2, 3, ((composite_f(array[1, 2, 3]))).*, 1, 2 +FROM test_composite +WHERE x IN (1, 2, 3); + x | y | ?column? | ?column? | ?column? | x | y | ?column? | ?column? +---+---+----------+----------+----------+---+---+----------+---------- +(0 rows) + +SELECT ((composite_f(array[1, $1, 3]))).*, 1 FROM test_composite \bind 1 +; + x | y | ?column? +---+---+---------- +(0 rows) + +-- ROW() expression with row expansion +SELECT (ROW(ARRAY[1,2])).*; + f1 +------- + {1,2} +(1 row) + +SELECT (ROW(ARRAY[1, 2], ARRAY[1, 2, 3])).*; + f1 | f2 +-------+--------- + {1,2} | {1,2,3} +(1 row) + +SELECT 1, 2, (ROW(ARRAY[1, 2], ARRAY[1, 2, 3])).*, 3, 4; + ?column? | ?column? | f1 | f2 | ?column? | ?column? +----------+----------+-------+---------+----------+---------- + 1 | 2 | {1,2} | {1,2,3} | 3 | 4 +(1 row) + +SELECT (ROW(ARRAY[1, 2], ARRAY[1, $1, 3])).*, 1 \bind 1 +; + f1 | f2 | ?column? +-------+---------+---------- + {1,2} | {1,1,3} | 1 +(1 row) + +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + query | calls +-------------------------------------------------------------------------------------------------------------+------- + SELECT $1, $2, (ROW(ARRAY[$3 /*, ... */], ARRAY[$4 /*, ... */])).*, $5, $6 | 1 + SELECT ((composite_f(array[$1 /*, ... */]))).* FROM test_composite | 2 + SELECT ((composite_f(array[$1 /*, ... */]))).*, $2 FROM test_composite | 1 + SELECT ((composite_f(array[$1 /*, ... */]))).*, $2, $3, $4, ((composite_f(array[$5 /*, ... */]))).*, $6, $7+| 1 + FROM test_composite +| + WHERE x IN ($8 /*, ... */) | + SELECT (ROW(ARRAY[$1 /*, ... */])).* | 1 + SELECT (ROW(ARRAY[$1 /*, ... */], ARRAY[$2 /*, ... */])).* | 1 + SELECT (ROW(ARRAY[$1 /*, ... */], ARRAY[$2 /*, ... 
*/])).*, $3 | 1 + SELECT pg_stat_statements_reset() IS NOT NULL AS t | 1 +(8 rows) + +-- +-- cleanup +-- +DROP TABLE test_squash; +DROP TABLE test_float; +DROP TABLE test_squash_numeric; +DROP TABLE test_squash_bigint; +DROP TABLE test_squash_cast CASCADE; +DROP TABLE test_squash_jsonb; +DROP TABLE test_composite; +DROP FUNCTION composite_f; diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out index aa4f0f7e62805..e4d6564ea5b5a 100644 --- a/contrib/pg_stat_statements/expected/utility.out +++ b/contrib/pg_stat_statements/expected/utility.out @@ -540,7 +540,7 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; -------+------+---------------------------------------------------- 2 | 0 | DEALLOCATE $1 2 | 0 | DEALLOCATE ALL - 2 | 2 | SELECT $1 AS a + 2 | 2 | PREPARE stat_select AS SELECT $1 AS a 1 | 1 | SELECT $1 as a 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (5 rows) @@ -702,7 +702,7 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 13 | CREATE MATERIALIZED VIEW pgss_matv AS SELECT * FROM pgss_ctas 1 | 10 | CREATE TABLE pgss_ctas AS SELECT a, $1 b FROM generate_series($2, $3) a 1 | 0 | DECLARE pgss_cursor CURSOR FOR SELECT * FROM pgss_matv - 1 | 5 | FETCH FORWARD 5 pgss_cursor + 1 | 5 | FETCH FORWARD $1 pgss_cursor 1 | 7 | FETCH FORWARD ALL pgss_cursor 1 | 1 | FETCH NEXT pgss_cursor 1 | 13 | REFRESH MATERIALIZED VIEW pgss_matv diff --git a/contrib/pg_stat_statements/meson.build b/contrib/pg_stat_statements/meson.build index 01a6cbdcf6139..7b8bfbb1de78c 100644 --- a/contrib/pg_stat_statements/meson.build +++ b/contrib/pg_stat_statements/meson.build @@ -21,6 +21,7 @@ contrib_targets += pg_stat_statements install_data( 'pg_stat_statements.control', 'pg_stat_statements--1.4.sql', + 'pg_stat_statements--1.12--1.13.sql', 'pg_stat_statements--1.11--1.12.sql', 'pg_stat_statements--1.10--1.11.sql', 'pg_stat_statements--1.9--1.10.sql', @@ -54,6 +55,7 @@ tests += { 'privileges', 'extended', 'parallel', + 'plancache', 'cleanup', 'oldextversions', 'squashing', diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql b/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql new file mode 100644 index 0000000000000..2f0eaf14ec34d --- /dev/null +++ b/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql @@ -0,0 +1,78 @@ +/* contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_stat_statements UPDATE TO '1.13'" to load this file. 
\quit
+
+/* First we have to remove them from the extension */
+ALTER EXTENSION pg_stat_statements DROP VIEW pg_stat_statements;
+ALTER EXTENSION pg_stat_statements DROP FUNCTION pg_stat_statements(boolean);
+
+/* Then we can drop them */
+DROP VIEW pg_stat_statements;
+DROP FUNCTION pg_stat_statements(boolean);
+
+/* Now redefine */
+CREATE FUNCTION pg_stat_statements(IN showtext boolean,
+ OUT userid oid,
+ OUT dbid oid,
+ OUT toplevel bool,
+ OUT queryid bigint,
+ OUT query text,
+ OUT plans int8,
+ OUT total_plan_time float8,
+ OUT min_plan_time float8,
+ OUT max_plan_time float8,
+ OUT mean_plan_time float8,
+ OUT stddev_plan_time float8,
+ OUT calls int8,
+ OUT total_exec_time float8,
+ OUT min_exec_time float8,
+ OUT max_exec_time float8,
+ OUT mean_exec_time float8,
+ OUT stddev_exec_time float8,
+ OUT rows int8,
+ OUT shared_blks_hit int8,
+ OUT shared_blks_read int8,
+ OUT shared_blks_dirtied int8,
+ OUT shared_blks_written int8,
+ OUT local_blks_hit int8,
+ OUT local_blks_read int8,
+ OUT local_blks_dirtied int8,
+ OUT local_blks_written int8,
+ OUT temp_blks_read int8,
+ OUT temp_blks_written int8,
+ OUT shared_blk_read_time float8,
+ OUT shared_blk_write_time float8,
+ OUT local_blk_read_time float8,
+ OUT local_blk_write_time float8,
+ OUT temp_blk_read_time float8,
+ OUT temp_blk_write_time float8,
+ OUT wal_records int8,
+ OUT wal_fpi int8,
+ OUT wal_bytes numeric,
+ OUT wal_buffers_full int8,
+ OUT jit_functions int8,
+ OUT jit_generation_time float8,
+ OUT jit_inlining_count int8,
+ OUT jit_inlining_time float8,
+ OUT jit_optimization_count int8,
+ OUT jit_optimization_time float8,
+ OUT jit_emission_count int8,
+ OUT jit_emission_time float8,
+ OUT jit_deform_count int8,
+ OUT jit_deform_time float8,
+ OUT parallel_workers_to_launch int8,
+ OUT parallel_workers_launched int8,
+ OUT generic_plan_calls int8,
+ OUT custom_plan_calls int8,
+ OUT stats_since timestamp with time zone,
+ OUT minmax_stats_since timestamp with time zone
+)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pg_stat_statements_1_13'
+LANGUAGE C STRICT VOLATILE PARALLEL SAFE;
+
+CREATE VIEW pg_stat_statements AS
+ SELECT * FROM pg_stat_statements(true);
+
+GRANT SELECT ON pg_stat_statements TO PUBLIC;
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index d8fdf42df7935..39208f80b5bb7 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -47,6 +47,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/htup_details.h"
 #include "access/parallel.h"
 #include "catalog/pg_authid.h"
 #include "common/int.h"
@@ -85,7 +86,7 @@ PG_MODULE_MAGIC_EXT(
 #define PGSS_TEXT_FILE PG_STAT_TMP_DIR "/pgss_query_texts.stat"
 
 /* Magic number identifying the stats file format */
-static const uint32 PGSS_FILE_HEADER = 0x20220408;
+static const uint32 PGSS_FILE_HEADER = 0x20250731;
 
 /* PostgreSQL major version number, changes in which invalidate all entries */
 static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
@@ -114,6 +115,7 @@ typedef enum pgssVersion
 	PGSS_V1_10,
 	PGSS_V1_11,
 	PGSS_V1_12,
+	PGSS_V1_13,
 } pgssVersion;
 
 typedef enum pgssStoreKind
@@ -138,13 +140,12 @@ typedef enum pgssStoreKind
 * If you add a new key to this struct, make sure to teach pgss_store() to
 * zero the padding bytes. Otherwise, things will break, because pgss_hash is
 * created using HASH_BLOBS, and thus tag_hash is used to hash this.
- */ typedef struct pgssHashKey { Oid userid; /* user OID */ Oid dbid; /* database OID */ - uint64 queryid; /* query identifier */ + int64 queryid; /* query identifier */ bool toplevel; /* query executed at top level */ } pgssHashKey; @@ -210,6 +211,8 @@ typedef struct Counters * to be launched */ int64 parallel_workers_launched; /* # of parallel workers actually * launched */ + int64 generic_plan_calls; /* number of calls using a generic plan */ + int64 custom_plan_calls; /* number of calls using a custom plan */ } Counters; /* @@ -323,6 +326,7 @@ PG_FUNCTION_INFO_V1(pg_stat_statements_1_9); PG_FUNCTION_INFO_V1(pg_stat_statements_1_10); PG_FUNCTION_INFO_V1(pg_stat_statements_1_11); PG_FUNCTION_INFO_V1(pg_stat_statements_1_12); +PG_FUNCTION_INFO_V1(pg_stat_statements_1_13); PG_FUNCTION_INFO_V1(pg_stat_statements); PG_FUNCTION_INFO_V1(pg_stat_statements_info); @@ -334,7 +338,8 @@ static void pgss_post_parse_analyze(ParseState *pstate, Query *query, static PlannedStmt *pgss_planner(Query *parse, const char *query_string, int cursorOptions, - ParamListInfo boundParams); + ParamListInfo boundParams, + ExplainState *es); static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags); static void pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, @@ -346,7 +351,7 @@ static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, ProcessUtilityContext context, ParamListInfo params, QueryEnvironment *queryEnv, DestReceiver *dest, QueryCompletion *qc); -static void pgss_store(const char *query, uint64 queryId, +static void pgss_store(const char *query, int64 queryId, int query_location, int query_len, pgssStoreKind kind, double total_time, uint64 rows, @@ -355,7 +360,8 @@ static void pgss_store(const char *query, uint64 queryId, const struct JitInstrumentation *jitusage, JumbleState *jstate, int parallel_workers_to_launch, - int parallel_workers_launched); + int parallel_workers_launched, + PlannedStmtOrigin planOrigin); static void pg_stat_statements_internal(FunctionCallInfo fcinfo, pgssVersion api_version, bool showtext); @@ -370,7 +376,7 @@ static char *qtext_fetch(Size query_offset, int query_len, char *buffer, Size buffer_size); static bool need_gc_qtexts(void); static void gc_qtexts(void); -static TimestampTz entry_reset(Oid userid, Oid dbid, uint64 queryid, bool minmax_only); +static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only); static char *generate_normalized_query(JumbleState *jstate, const char *query, int query_loc, int *query_len_p); static void fill_in_constant_lengths(JumbleState *jstate, const char *query, @@ -852,7 +858,7 @@ pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) { if (pgss_track_utility && IsA(query->utilityStmt, ExecuteStmt)) { - query->queryId = UINT64CONST(0); + query->queryId = INT64CONST(0); return; } } @@ -877,7 +883,8 @@ pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) NULL, jstate, 0, - 0); + 0, + PLAN_STMT_UNKNOWN); } /* @@ -888,7 +895,8 @@ static PlannedStmt * pgss_planner(Query *parse, const char *query_string, int cursorOptions, - ParamListInfo boundParams) + ParamListInfo boundParams, + ExplainState *es) { PlannedStmt *result; @@ -899,7 +907,7 @@ pgss_planner(Query *parse, */ if (pgss_enabled(nesting_level) && pgss_track_planning && query_string - && parse->queryId != UINT64CONST(0)) + && parse->queryId != INT64CONST(0)) { instr_time start; instr_time duration; @@ -923,10 +931,10 @@ pgss_planner(Query *parse, { if (prev_planner_hook) 
result = prev_planner_hook(parse, query_string, cursorOptions, - boundParams); + boundParams, es); else result = standard_planner(parse, query_string, cursorOptions, - boundParams); + boundParams, es); } PG_FINALLY(); { @@ -957,7 +965,8 @@ pgss_planner(Query *parse, NULL, NULL, 0, - 0); + 0, + result->planOrigin); } else { @@ -971,10 +980,10 @@ pgss_planner(Query *parse, { if (prev_planner_hook) result = prev_planner_hook(parse, query_string, cursorOptions, - boundParams); + boundParams, es); else result = standard_planner(parse, query_string, cursorOptions, - boundParams); + boundParams, es); } PG_FINALLY(); { @@ -1002,7 +1011,7 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) * counting of optimizable statements that are directly contained in * utility statements. */ - if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != UINT64CONST(0)) + if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0)) { /* * Set up to track total elapsed time in ExecutorRun. Make sure the @@ -1068,9 +1077,9 @@ pgss_ExecutorFinish(QueryDesc *queryDesc) static void pgss_ExecutorEnd(QueryDesc *queryDesc) { - uint64 queryId = queryDesc->plannedstmt->queryId; + int64 queryId = queryDesc->plannedstmt->queryId; - if (queryId != UINT64CONST(0) && queryDesc->totaltime && + if (queryId != INT64CONST(0) && queryDesc->totaltime && pgss_enabled(nesting_level)) { /* @@ -1091,7 +1100,8 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, - queryDesc->estate->es_parallel_workers_launched); + queryDesc->estate->es_parallel_workers_launched, + queryDesc->plannedstmt->planOrigin); } if (prev_ExecutorEnd) @@ -1111,7 +1121,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, DestReceiver *dest, QueryCompletion *qc) { Node *parsetree = pstmt->utilityStmt; - uint64 saved_queryId = pstmt->queryId; + int64 saved_queryId = pstmt->queryId; int saved_stmt_location = pstmt->stmt_location; int saved_stmt_len = pstmt->stmt_len; bool enabled = pgss_track_utility && pgss_enabled(nesting_level); @@ -1131,7 +1141,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, * only. */ if (enabled) - pstmt->queryId = UINT64CONST(0); + pstmt->queryId = INT64CONST(0); /* * If it's an EXECUTE statement, we don't track it and don't increment the @@ -1224,7 +1234,8 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, NULL, NULL, 0, - 0); + 0, + pstmt->planOrigin); } else { @@ -1278,7 +1289,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, * for the arrays in the Counters field. */ static void -pgss_store(const char *query, uint64 queryId, +pgss_store(const char *query, int64 queryId, int query_location, int query_len, pgssStoreKind kind, double total_time, uint64 rows, @@ -1287,7 +1298,8 @@ pgss_store(const char *query, uint64 queryId, const struct JitInstrumentation *jitusage, JumbleState *jstate, int parallel_workers_to_launch, - int parallel_workers_launched) + int parallel_workers_launched, + PlannedStmtOrigin planOrigin) { pgssHashKey key; pgssEntry *entry; @@ -1304,7 +1316,7 @@ pgss_store(const char *query, uint64 queryId, * Nothing to do if compute_query_id isn't enabled and no other module * computed a query identifier. 
*/ - if (queryId == UINT64CONST(0)) + if (queryId == INT64CONST(0)) return; /* @@ -1495,6 +1507,12 @@ pgss_store(const char *query, uint64 queryId, entry->counters.parallel_workers_to_launch += parallel_workers_to_launch; entry->counters.parallel_workers_launched += parallel_workers_launched; + /* plan cache counters */ + if (planOrigin == PLAN_STMT_CACHE_GENERIC) + entry->counters.generic_plan_calls++; + else if (planOrigin == PLAN_STMT_CACHE_CUSTOM) + entry->counters.custom_plan_calls++; + SpinLockRelease(&entry->mutex); } @@ -1514,11 +1532,11 @@ pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS) { Oid userid; Oid dbid; - uint64 queryid; + int64 queryid; userid = PG_GETARG_OID(0); dbid = PG_GETARG_OID(1); - queryid = (uint64) PG_GETARG_INT64(2); + queryid = PG_GETARG_INT64(2); entry_reset(userid, dbid, queryid, false); @@ -1530,12 +1548,12 @@ pg_stat_statements_reset_1_11(PG_FUNCTION_ARGS) { Oid userid; Oid dbid; - uint64 queryid; + int64 queryid; bool minmax_only; userid = PG_GETARG_OID(0); dbid = PG_GETARG_OID(1); - queryid = (uint64) PG_GETARG_INT64(2); + queryid = PG_GETARG_INT64(2); minmax_only = PG_GETARG_BOOL(3); PG_RETURN_TIMESTAMPTZ(entry_reset(userid, dbid, queryid, minmax_only)); @@ -1562,7 +1580,8 @@ pg_stat_statements_reset(PG_FUNCTION_ARGS) #define PG_STAT_STATEMENTS_COLS_V1_10 43 #define PG_STAT_STATEMENTS_COLS_V1_11 49 #define PG_STAT_STATEMENTS_COLS_V1_12 52 -#define PG_STAT_STATEMENTS_COLS 52 /* maximum of above */ +#define PG_STAT_STATEMENTS_COLS_V1_13 54 +#define PG_STAT_STATEMENTS_COLS 54 /* maximum of above */ /* * Retrieve statement statistics. @@ -1574,6 +1593,16 @@ pg_stat_statements_reset(PG_FUNCTION_ARGS) * expected API version is identified by embedding it in the C name of the * function. Unfortunately we weren't bright enough to do that for 1.1. */ +Datum +pg_stat_statements_1_13(PG_FUNCTION_ARGS) +{ + bool showtext = PG_GETARG_BOOL(0); + + pg_stat_statements_internal(fcinfo, PGSS_V1_13, showtext); + + return (Datum) 0; +} + Datum pg_stat_statements_1_12(PG_FUNCTION_ARGS) { @@ -1732,6 +1761,10 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo, if (api_version != PGSS_V1_12) elog(ERROR, "incorrect number of output arguments"); break; + case PG_STAT_STATEMENTS_COLS_V1_13: + if (api_version != PGSS_V1_13) + elog(ERROR, "incorrect number of output arguments"); + break; default: elog(ERROR, "incorrect number of output arguments"); } @@ -1984,6 +2017,11 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo, values[i++] = Int64GetDatumFast(tmp.parallel_workers_to_launch); values[i++] = Int64GetDatumFast(tmp.parallel_workers_launched); } + if (api_version >= PGSS_V1_13) + { + values[i++] = Int64GetDatumFast(tmp.generic_plan_calls); + values[i++] = Int64GetDatumFast(tmp.custom_plan_calls); + } if (api_version >= PGSS_V1_11) { values[i++] = TimestampTzGetDatum(stats_since); @@ -1999,6 +2037,7 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo, api_version == PGSS_V1_10 ? PG_STAT_STATEMENTS_COLS_V1_10 : api_version == PGSS_V1_11 ? PG_STAT_STATEMENTS_COLS_V1_11 : api_version == PGSS_V1_12 ? PG_STAT_STATEMENTS_COLS_V1_12 : + api_version == PGSS_V1_13 ? PG_STAT_STATEMENTS_COLS_V1_13 : -1 /* fail if you forget to update this assert */ )); tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); @@ -2671,13 +2710,13 @@ if (e) { \ * Reset entries corresponding to parameters passed. 
*/ static TimestampTz -entry_reset(Oid userid, Oid dbid, uint64 queryid, bool minmax_only) +entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only) { HASH_SEQ_STATUS hash_seq; pgssEntry *entry; FILE *qfile; - long num_entries; - long num_remove = 0; + int64 num_entries; + int64 num_remove = 0; pgssHashKey key; TimestampTz stats_reset; @@ -2691,7 +2730,7 @@ entry_reset(Oid userid, Oid dbid, uint64 queryid, bool minmax_only) stats_reset = GetCurrentTimestamp(); - if (userid != 0 && dbid != 0 && queryid != UINT64CONST(0)) + if (userid != 0 && dbid != 0 && queryid != INT64CONST(0)) { /* If all the parameters are available, use the fast path. */ memset(&key, 0, sizeof(pgssHashKey)); @@ -2714,7 +2753,7 @@ entry_reset(Oid userid, Oid dbid, uint64 queryid, bool minmax_only) SINGLE_ENTRY_RESET(entry); } - else if (userid != 0 || dbid != 0 || queryid != UINT64CONST(0)) + else if (userid != 0 || dbid != 0 || queryid != INT64CONST(0)) { /* Reset entries corresponding to valid parameters. */ hash_seq_init(&hash_seq, pgss_hash); @@ -2810,17 +2849,13 @@ generate_normalized_query(JumbleState *jstate, const char *query, { char *norm_query; int query_len = *query_len_p; - int i, - norm_query_buflen, /* Space allowed for norm_query */ + int norm_query_buflen, /* Space allowed for norm_query */ len_to_wrt, /* Length (in bytes) to write */ quer_loc = 0, /* Source query byte location */ n_quer_loc = 0, /* Normalized query byte location */ last_off = 0, /* Offset from start for previous tok */ last_tok_len = 0; /* Length (in bytes) of that tok */ - bool in_squashed = false; /* in a run of squashed consts? */ - int skipped_constants = 0; /* Position adjustment of later - * constants after squashed ones */ - + int num_constants_replaced = 0; /* * Get constants' lengths (core system only gives us locations). Note @@ -2834,20 +2869,27 @@ generate_normalized_query(JumbleState *jstate, const char *query, * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We * could refine that limit based on the max value of n for the current * query, but it hardly seems worth any extra effort to do so. - * - * Note this also gives enough room for the commented-out ", ..." list - * syntax used by constant squashing. */ norm_query_buflen = query_len + jstate->clocations_count * 10; /* Allocate result buffer */ norm_query = palloc(norm_query_buflen + 1); - for (i = 0; i < jstate->clocations_count; i++) + for (int i = 0; i < jstate->clocations_count; i++) { int off, /* Offset from start for cur tok */ tok_len; /* Length (in bytes) of that tok */ + /* + * If we have an external param at this location, but no lists are + * being squashed across the query, then we skip here; this will make + * us print the characters found in the original query that represent + * the parameter in the next iteration (or after the loop is done), + * which is a bit odd but seems to work okay in most cases. 
+ */ + if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists) + continue; + off = jstate->clocations[i].location; /* Adjust recorded location if we're dealing with partial string */ @@ -2858,67 +2900,24 @@ generate_normalized_query(JumbleState *jstate, const char *query, if (tok_len < 0) continue; /* ignore any duplicates */ + /* Copy next chunk (what precedes the next constant) */ + len_to_wrt = off - last_off; + len_to_wrt -= last_tok_len; + Assert(len_to_wrt >= 0); + memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); + n_quer_loc += len_to_wrt; + /* - * What to do next depends on whether we're squashing constant lists, - * and whether we're already in a run of such constants. + * And insert a param symbol in place of the constant token; and, if + * we have a squashable list, insert a placeholder comment starting + * from the list's second value. */ - if (!jstate->clocations[i].squashed) - { - /* - * This location corresponds to a constant not to be squashed. - * Print what comes before the constant ... - */ - len_to_wrt = off - last_off; - len_to_wrt -= last_tok_len; - - Assert(len_to_wrt >= 0); - - memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); - n_quer_loc += len_to_wrt; - - /* ... and then a param symbol replacing the constant itself */ - n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d", - i + 1 + jstate->highest_extern_param_id - skipped_constants); - - /* In case previous constants were merged away, stop doing that */ - in_squashed = false; - } - else if (!in_squashed) - { - /* - * This location is the start position of a run of constants to be - * squashed, so we need to print the representation of starting a - * group of stashed constants. - * - * Print what comes before the constant ... - */ - len_to_wrt = off - last_off; - len_to_wrt -= last_tok_len; - Assert(len_to_wrt >= 0); - Assert(i + 1 < jstate->clocations_count); - Assert(jstate->clocations[i + 1].squashed); - memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); - n_quer_loc += len_to_wrt; - - /* ... and then start a run of squashed constants */ - n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d /*, ... */", - i + 1 + jstate->highest_extern_param_id - skipped_constants); - - /* The next location will match the block below, to end the run */ - in_squashed = true; - - skipped_constants++; - } - else - { - /* - * The second location of a run of squashable elements; this - * indicates its end. - */ - in_squashed = false; - } + n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s", + num_constants_replaced + 1 + jstate->highest_extern_param_id, + jstate->clocations[i].squashed ? " /*, ... */" : ""); + num_constants_replaced++; - /* Otherwise the constant is squashed away -- move forward */ + /* move forward */ quer_loc = off + tok_len; last_off = off; last_tok_len = tok_len; @@ -2955,9 +2954,8 @@ generate_normalized_query(JumbleState *jstate, const char *query, * have originated from within the authoritative parser, this should not be * a problem. * - * Duplicate constant pointers are possible, and will have their lengths - * marked as '-1', so that they are later ignored. (Actually, we assume the - * lengths were initialized as -1 to start with, and don't change them here.) + * Multiple constants can have the same location. We reset lengths of those + * past the first to -1 so that they can later be ignored. 
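+ * (Stacked casts can record one literal at the same location several
+ * times; the regression tests in select.sql cover this with queries like
+ * "SELECT WHERE '1' IN ('1'::int, '3'::int::text)".)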
 *
 * If query_loc > 0, then "query" has been advanced by that much compared to
 * the original string start, so we need to translate the provided locations
 * accordingly.
@@ -2977,8 +2975,6 @@ fill_in_constant_lengths(JumbleState *jstate, const char *query,
 	core_yy_extra_type yyextra;
 	core_YYSTYPE yylval;
 	YYLTYPE		yylloc;
-	int			last_loc = -1;
-	int			i;
 
 	/*
 	 * Sort the records by location so that we can process them in order while
@@ -2999,20 +2995,29 @@ fill_in_constant_lengths(JumbleState *jstate, const char *query,
 	yyextra.escape_string_warning = false;
 
 	/* Search for each constant, in sequence */
-	for (i = 0; i < jstate->clocations_count; i++)
+	for (int i = 0; i < jstate->clocations_count; i++)
 	{
-		int			loc = locs[i].location;
+		int			loc;
 		int			tok;
 
-		/* Adjust recorded location if we're dealing with partial string */
-		loc -= query_loc;
+		/* Ignore constants after the first one in the same location */
+		if (i > 0 && locs[i].location == locs[i - 1].location)
+		{
+			locs[i].length = -1;
+			continue;
+		}
 
-		Assert(loc >= 0);
+		if (locs[i].squashed)
+			continue;			/* squashable list, ignore */
 
-		if (loc <= last_loc)
-			continue;			/* Duplicate constant, ignore */
+		/* Adjust recorded location if we're dealing with partial string */
+		loc = locs[i].location - query_loc;
+		Assert(loc >= 0);
 
-		/* Lex tokens until we find the desired constant */
+		/*
+		 * We have a valid location for a constant that's not a dupe. Lex
+		 * tokens until we find the desired constant.
+		 */
 		for (;;)
 		{
 			tok = core_yylex(&yylval, &yylloc, yyscanner);
@@ -3058,8 +3063,6 @@ fill_in_constant_lengths(JumbleState *jstate, const char *query,
 		/* If we hit end-of-string, give up, leaving remaining lengths -1 */
 		if (tok == 0)
 			break;
-
-		last_loc = loc;
 	}
 
 	scanner_finish(yyscanner);
diff --git a/contrib/pg_stat_statements/pg_stat_statements.control b/contrib/pg_stat_statements/pg_stat_statements.control
index d45ebc12e3605..2eee0ceffa894 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.control
+++ b/contrib/pg_stat_statements/pg_stat_statements.control
@@ -1,5 +1,5 @@
 # pg_stat_statements extension
 comment = 'track planning and execution statistics of all SQL statements executed'
-default_version = '1.12'
+default_version = '1.13'
 module_pathname = '$libdir/pg_stat_statements'
 relocatable = true
diff --git a/contrib/pg_stat_statements/sql/cursors.sql b/contrib/pg_stat_statements/sql/cursors.sql
index 61738ac470e82..78bb42284331f 100644
--- a/contrib/pg_stat_statements/sql/cursors.sql
+++ b/contrib/pg_stat_statements/sql/cursors.sql
@@ -28,3 +28,46 @@ COMMIT;
 SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
 
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+-- Normalization of FETCH statements
+BEGIN;
+DECLARE pgss_cursor CURSOR FOR SELECT FROM generate_series(1, 10);
+-- implicit directions
+FETCH pgss_cursor;
+FETCH 1 pgss_cursor;
+FETCH 2 pgss_cursor;
+FETCH -1 pgss_cursor;
+-- explicit NEXT
+FETCH NEXT pgss_cursor;
+-- explicit PRIOR
+FETCH PRIOR pgss_cursor;
+-- explicit FIRST
+FETCH FIRST pgss_cursor;
+-- explicit LAST
+FETCH LAST pgss_cursor;
+-- explicit ABSOLUTE
+FETCH ABSOLUTE 1 pgss_cursor;
+FETCH ABSOLUTE 2 pgss_cursor;
+FETCH ABSOLUTE -1 pgss_cursor;
+-- explicit RELATIVE
+FETCH RELATIVE 1 pgss_cursor;
+FETCH RELATIVE 2 pgss_cursor;
+FETCH RELATIVE -1 pgss_cursor;
+-- explicit ALL
+FETCH ALL pgss_cursor;
+-- explicit FORWARD ALL
+FETCH FORWARD ALL pgss_cursor;
+-- explicit FORWARD
+FETCH FORWARD pgss_cursor;
+FETCH FORWARD 1 pgss_cursor;
+FETCH FORWARD 2 pgss_cursor;
+FETCH FORWARD -1 pgss_cursor;
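+-- (each FETCH above that carries a row count should have that count
+-- normalized away, so the variants of one direction are expected to
+-- collapse into a single pg_stat_statements entry below)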
+-- explicit BACKWARD
+FETCH BACKWARD pgss_cursor;
+FETCH BACKWARD 1 pgss_cursor;
+FETCH BACKWARD 2 pgss_cursor;
+FETCH BACKWARD -1 pgss_cursor;
+-- explicit BACKWARD ALL
+FETCH BACKWARD ALL pgss_cursor;
+COMMIT;
+
+SELECT calls, query FROM pg_stat_statements ORDER BY query COLLATE "C";
diff --git a/contrib/pg_stat_statements/sql/extended.sql b/contrib/pg_stat_statements/sql/extended.sql
index 1af0711020c41..9a6518e2f0487 100644
--- a/contrib/pg_stat_statements/sql/extended.sql
+++ b/contrib/pg_stat_statements/sql/extended.sql
@@ -19,3 +19,28 @@ SELECT $1 \bind 'unnamed_val1' \g
 \bind_named stmt1 'stmt1_val1' \g
 
 SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- Various parameter numbering patterns
+-- Unique query IDs with parameter numbers switched.
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT WHERE ($1::int, 7) IN ((8, $2::int), ($3::int, 9)) \bind '1' '2' '3' \g
+SELECT WHERE ($2::int, 10) IN ((11, $3::int), ($1::int, 12)) \bind '1' '2' '3' \g
+SELECT WHERE $1::int IN ($2::int, $3::int) \bind '1' '2' '3' \g
+SELECT WHERE $2::int IN ($3::int, $1::int) \bind '1' '2' '3' \g
+SELECT WHERE $3::int IN ($1::int, $2::int) \bind '1' '2' '3' \g
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+-- Two groups of two queries with the same query ID.
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT WHERE '1'::int IN ($1::int, '2'::int) \bind '1' \g
+SELECT WHERE '4'::int IN ($1::int, '5'::int) \bind '2' \g
+SELECT WHERE $2::int IN ($1::int, '1'::int) \bind '1' '2' \g
+SELECT WHERE $2::int IN ($1::int, '2'::int) \bind '3' '4' \g
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+-- no squashable list, the parameter IDs are kept as-is
+SELECT WHERE $3 = $1 AND $2 = $4 \bind 1 2 1 2 \g
+-- squashable list, so the parameter IDs will be re-assigned
+SELECT WHERE 1 IN (1, 2, 3) AND $3 = $1 AND $2 = $4 \bind 1 2 1 2 \g
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
diff --git a/contrib/pg_stat_statements/sql/oldextversions.sql b/contrib/pg_stat_statements/sql/oldextversions.sql
index 13b8ca28586d1..e416efe9ffbee 100644
--- a/contrib/pg_stat_statements/sql/oldextversions.sql
+++ b/contrib/pg_stat_statements/sql/oldextversions.sql
@@ -63,4 +63,9 @@ AlTER EXTENSION pg_stat_statements UPDATE TO '1.12';
 \d pg_stat_statements
 SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
 
+-- New functions and views for pg_stat_statements in 1.13
+AlTER EXTENSION pg_stat_statements UPDATE TO '1.13';
+\d pg_stat_statements
+SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
+
 DROP EXTENSION pg_stat_statements;
diff --git a/contrib/pg_stat_statements/sql/plancache.sql b/contrib/pg_stat_statements/sql/plancache.sql
new file mode 100644
index 0000000000000..160ced7add368
--- /dev/null
+++ b/contrib/pg_stat_statements/sql/plancache.sql
@@ -0,0 +1,94 @@
+--
+-- Tests with plan cache
+--
+
+-- Setup
+CREATE OR REPLACE FUNCTION select_one_func(int) RETURNS VOID AS $$
+DECLARE
+  ret INT;
+BEGIN
+  SELECT $1 INTO ret;
+END;
+$$ LANGUAGE plpgsql;
+CREATE OR REPLACE PROCEDURE select_one_proc(int) AS $$
+DECLARE
+  ret INT;
+BEGIN
+  SELECT $1 INTO ret;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Prepared statements
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+PREPARE p1 AS SELECT $1 AS a;
+SET plan_cache_mode TO force_generic_plan;
+EXECUTE p1(1);
+SET plan_cache_mode TO force_custom_plan;
+EXECUTE p1(1);
+SELECT calls,
generic_plan_calls, custom_plan_calls, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; +DEALLOCATE p1; + +-- Extended query protocol +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT $1 AS a \parse p1 +SET plan_cache_mode TO force_generic_plan; +\bind_named p1 1 +; +SET plan_cache_mode TO force_custom_plan; +\bind_named p1 1 +; +SELECT calls, generic_plan_calls, custom_plan_calls, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; +\close_prepared p1 + +-- EXPLAIN [ANALYZE] EXECUTE +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +PREPARE p1 AS SELECT $1; +SET plan_cache_mode TO force_generic_plan; +EXPLAIN (COSTS OFF) EXECUTE p1(1); +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1); +SET plan_cache_mode TO force_custom_plan; +EXPLAIN (COSTS OFF) EXECUTE p1(1); +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; +RESET pg_stat_statements.track; +DEALLOCATE p1; + +-- Functions/procedures +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SET plan_cache_mode TO force_generic_plan; +SELECT select_one_func(1); +CALL select_one_proc(1); +SET plan_cache_mode TO force_custom_plan; +SELECT select_one_func(1); +CALL select_one_proc(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + +-- +-- EXPLAIN [ANALYZE] EXECUTE + functions/procedures +-- +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SET plan_cache_mode TO force_generic_plan; +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func(1); +EXPLAIN (COSTS OFF) SELECT select_one_func(1); +CALL select_one_proc(1); +SET plan_cache_mode TO force_custom_plan; +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func(1); +EXPLAIN (COSTS OFF) SELECT select_one_func(1); +CALL select_one_proc(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C", toplevel; + +RESET pg_stat_statements.track; + +-- +-- Cleanup +-- +DROP FUNCTION select_one_func(int); +DROP PROCEDURE select_one_proc(int); diff --git a/contrib/pg_stat_statements/sql/planning.sql b/contrib/pg_stat_statements/sql/planning.sql index 9cfe206b3b049..46f5d9b951c45 100644 --- a/contrib/pg_stat_statements/sql/planning.sql +++ b/contrib/pg_stat_statements/sql/planning.sql @@ -20,11 +20,11 @@ SELECT 42; SELECT 42; SELECT 42; SELECT plans, calls, rows, query FROM pg_stat_statements - WHERE query NOT LIKE 'SELECT COUNT%' ORDER BY query COLLATE "C"; + WHERE query NOT LIKE 'PREPARE%' ORDER BY query COLLATE "C"; -- for the prepared statement we expect at least one replan, but cache -- invalidations could force more SELECT plans >= 2 AND plans <= calls AS plans_ok, calls, rows, query FROM pg_stat_statements - WHERE query LIKE 'SELECT COUNT%' ORDER BY query COLLATE "C"; + WHERE query LIKE 'PREPARE%' ORDER BY query COLLATE "C"; -- Cleanup DROP TABLE stats_plan_test; diff --git a/contrib/pg_stat_statements/sql/select.sql b/contrib/pg_stat_statements/sql/select.sql index c5e0b84ee5bf5..11662cde08c92 100644 --- a/contrib/pg_stat_statements/sql/select.sql +++ b/contrib/pg_stat_statements/sql/select.sql @@ -79,6 +79,22 @@ DEALLOCATE pgss_test; SELECT calls, 
rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
 
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+-- normalization of constants and parameters, with constant locations
+-- recorded one or more times.
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT WHERE '1' IN ('1'::int, '3'::int::text);
+SELECT WHERE (1, 2) IN ((1, 2), (2, 3));
+SELECT WHERE (3, 4) IN ((5, 6), (8, 7));
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- with the last element being an explicit function call with an argument, ensure
+-- the normalization of the squashing interval is correct.
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT WHERE 1 IN (1, int4(1), int4(2));
+SELECT WHERE 1 = ANY (ARRAY[1, int4(1), int4(2)]);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
 --
 -- queries with locking clauses
 --
diff --git a/contrib/pg_stat_statements/sql/squashing.sql b/contrib/pg_stat_statements/sql/squashing.sql
index 03efd4b40c8e7..03b0515f87285 100644
--- a/contrib/pg_stat_statements/sql/squashing.sql
+++ b/contrib/pg_stat_statements/sql/squashing.sql
@@ -3,101 +3,160 @@
 --
 CREATE EXTENSION pg_stat_statements;
 
-CREATE TABLE test_squash (id int, data int);
+--
+-- Simple Lists
+--
 
--- IN queries
+CREATE TABLE test_squash (id int, data int);
 
--- Normal scenario, too many simple constants for an IN query
+-- single element will not be squashed
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 SELECT * FROM test_squash WHERE id IN (1);
+SELECT ARRAY[1];
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- more than 1 element in a list will be squashed
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 SELECT * FROM test_squash WHERE id IN (1, 2, 3);
+SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4);
+SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5);
+SELECT ARRAY[1, 2, 3];
+SELECT ARRAY[1, 2, 3, 4];
+SELECT ARRAY[1, 2, 3, 4, 5];
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- built-in functions will be squashed
+-- the IN and ARRAY forms of this statement will have the same queryId
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT WHERE 1 IN (1, int4(1), int4(2), 2);
+SELECT WHERE 1 = ANY (ARRAY[1, int4(1), int4(2), 2]);
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
 
-SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9);
-SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
-SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+-- external parameters will be squashed
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT * FROM test_squash WHERE id IN ($1, $2, $3, $4, $5) \bind 1 2 3 4 5
+;
+SELECT * FROM test_squash WHERE id::text = ANY(ARRAY[$1, $2, $3, $4, $5]) \bind 1 2 3 4 5
+;
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
 
--- More conditions in the query
+-- prepared statements will also be squashed
+-- the IN and ARRAY forms of this statement will have the same queryId
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+PREPARE p1(int, int, int, int, int) AS
+SELECT * FROM test_squash WHERE id IN ($1, $2, $3, $4, $5);
+EXECUTE p1(1, 2, 3, 4, 5);
+DEALLOCATE p1;
+PREPARE p1(int, int, int, int, int) AS
+SELECT * FROM test_squash WHERE id = ANY(ARRAY[$1, $2, $3, $4, $5]);
+EXECUTE p1(1, 2, 3, 4, 5);
+DEALLOCATE p1;
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
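+-- (the IN and ARRAY forms above normalize identically, so each pair is
+-- expected to surface as a single entry with calls = 2)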
+-- More conditions in the query +SELECT pg_stat_statements_reset() IS NOT NULL AS t; SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9) AND data = 2; SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10) AND data = 2; SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) AND data = 2; +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9]) AND data = 2; +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) AND data = 2; +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) AND data = 2; SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; -- Multiple squashed intervals SELECT pg_stat_statements_reset() IS NOT NULL AS t; - SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9) AND data IN (1, 2, 3, 4, 5, 6, 7, 8, 9); SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10) AND data IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10); SELECT * FROM test_squash WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) AND data IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9]) + AND data = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9]); +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + AND data = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); +SELECT * FROM test_squash WHERE id = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) + AND data = ANY (ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; - --- No constants simplification for OpExpr SELECT pg_stat_statements_reset() IS NOT NULL AS t; --- In the following two queries the operator expressions (+) and (@) have --- different oppno, and will be given different query_id if squashed, even though --- the normalized query will be the same +-- No constants squashing for OpExpr +-- The IN and ARRAY forms of this statement will have the same queryId +SELECT pg_stat_statements_reset() IS NOT NULL AS t; SELECT * FROM test_squash WHERE id IN (1 + 1, 2 + 2, 3 + 3, 4 + 4, 5 + 5, 6 + 6, 7 + 7, 8 + 8, 9 + 9); SELECT * FROM test_squash WHERE id IN (@ '-1', @ '-2', @ '-3', @ '-4', @ '-5', @ '-6', @ '-7', @ '-8', @ '-9'); +SELECT * FROM test_squash WHERE id = ANY(ARRAY + [1 + 1, 2 + 2, 3 + 3, 4 + 4, 5 + 5, 6 + 6, 7 + 7, 8 + 8, 9 + 9]); +SELECT * FROM test_squash WHERE id = ANY(ARRAY + [@ '-1', @ '-2', @ '-3', @ '-4', @ '-5', @ '-6', @ '-7', @ '-8', @ '-9']); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; +-- -- FuncExpr +-- -- Verify multiple type representation end up with the same query_id CREATE TABLE test_float (data float); +-- The casted ARRAY expressions will have the same queryId as the IN clause +-- form of the query SELECT pg_stat_statements_reset() IS NOT NULL AS t; SELECT data FROM test_float WHERE data IN (1, 2); SELECT data FROM test_float WHERE data IN (1, '2'); SELECT data FROM test_float WHERE data IN ('1', 2); SELECT data FROM test_float WHERE data IN ('1', '2'); SELECT data FROM test_float WHERE data IN (1.0, 1.0); +SELECT data FROM test_float WHERE data = ANY(ARRAY['1'::double precision, '2'::double precision]); +SELECT data FROM test_float WHERE data = ANY(ARRAY[1.0::double precision, 1.0::double precision]); +SELECT data FROM test_float WHERE data = ANY(ARRAY[1, 2]); +SELECT data FROM test_float WHERE data = ANY(ARRAY[1, '2']); +SELECT data FROM test_float WHERE data = ANY(ARRAY['1', 2]); SELECT query, calls FROM 
pg_stat_statements ORDER BY query COLLATE "C"; -- Numeric type, implicit cast is squashed CREATE TABLE test_squash_numeric (id int, data numeric(5, 2)); SELECT pg_stat_statements_reset() IS NOT NULL AS t; SELECT * FROM test_squash_numeric WHERE data IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); +SELECT * FROM test_squash_numeric WHERE data = ANY(ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; -- Bigint, implicit cast is squashed CREATE TABLE test_squash_bigint (id int, data bigint); SELECT pg_stat_statements_reset() IS NOT NULL AS t; SELECT * FROM test_squash_bigint WHERE data IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); +SELECT * FROM test_squash_bigint WHERE data = ANY(ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; --- Bigint, explicit cast is not squashed +-- Bigint, explicit cast is squashed SELECT pg_stat_statements_reset() IS NOT NULL AS t; SELECT * FROM test_squash_bigint WHERE data IN (1::bigint, 2::bigint, 3::bigint, 4::bigint, 5::bigint, 6::bigint, 7::bigint, 8::bigint, 9::bigint, 10::bigint, 11::bigint); +SELECT * FROM test_squash_bigint WHERE data = ANY(ARRAY[ + 1::bigint, 2::bigint, 3::bigint, 4::bigint, 5::bigint, 6::bigint, + 7::bigint, 8::bigint, 9::bigint, 10::bigint, 11::bigint]); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; --- Bigint, long tokens with parenthesis +-- Bigint, long tokens with parenthesis, will not squash SELECT pg_stat_statements_reset() IS NOT NULL AS t; SELECT * FROM test_squash_bigint WHERE id IN (abs(100), abs(200), abs(300), abs(400), abs(500), abs(600), abs(700), abs(800), abs(900), abs(1000), ((abs(1100)))); +SELECT * FROM test_squash_bigint WHERE id = ANY(ARRAY[ + abs(100), abs(200), abs(300), abs(400), abs(500), abs(600), abs(700), + abs(800), abs(900), abs(1000), ((abs(1100)))]); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; --- CoerceViaIO, SubLink instead of a Const -CREATE TABLE test_squash_jsonb (id int, data jsonb); +-- Multiple FuncExpr's. 
Will not squash SELECT pg_stat_statements_reset() IS NOT NULL AS t; -SELECT * FROM test_squash_jsonb WHERE data IN - ((SELECT '"1"')::jsonb, (SELECT '"2"')::jsonb, (SELECT '"3"')::jsonb, - (SELECT '"4"')::jsonb, (SELECT '"5"')::jsonb, (SELECT '"6"')::jsonb, - (SELECT '"7"')::jsonb, (SELECT '"8"')::jsonb, (SELECT '"9"')::jsonb, - (SELECT '"10"')::jsonb); +SELECT WHERE 1 IN (1::int::bigint::int, 2::int::bigint::int); +SELECT WHERE 1 = ANY(ARRAY[1::int::bigint::int, 2::int::bigint::int]); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; +-- -- CoerceViaIO +-- -- Create some dummy type to force CoerceViaIO CREATE TYPE casttesttype; @@ -141,19 +200,74 @@ SELECT * FROM test_squash_cast WHERE data IN 4::int4::casttesttype, 5::int4::casttesttype, 6::int4::casttesttype, 7::int4::casttesttype, 8::int4::casttesttype, 9::int4::casttesttype, 10::int4::casttesttype, 11::int4::casttesttype); +SELECT * FROM test_squash_cast WHERE data = ANY (ARRAY + [1::int4::casttesttype, 2::int4::casttesttype, 3::int4::casttesttype, + 4::int4::casttesttype, 5::int4::casttesttype, 6::int4::casttesttype, + 7::int4::casttesttype, 8::int4::casttesttype, 9::int4::casttesttype, + 10::int4::casttesttype, 11::int4::casttesttype]); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; -- Some casting expression are simplified to Const +CREATE TABLE test_squash_jsonb (id int, data jsonb); SELECT pg_stat_statements_reset() IS NOT NULL AS t; SELECT * FROM test_squash_jsonb WHERE data IN (('"1"')::jsonb, ('"2"')::jsonb, ('"3"')::jsonb, ('"4"')::jsonb, - ( '"5"')::jsonb, ( '"6"')::jsonb, ( '"7"')::jsonb, ( '"8"')::jsonb, - ( '"9"')::jsonb, ( '"10"')::jsonb); + ('"5"')::jsonb, ('"6"')::jsonb, ('"7"')::jsonb, ('"8"')::jsonb, + ('"9"')::jsonb, ('"10"')::jsonb); +SELECT * FROM test_squash_jsonb WHERE data = ANY (ARRAY + [('"1"')::jsonb, ('"2"')::jsonb, ('"3"')::jsonb, ('"4"')::jsonb, + ('"5"')::jsonb, ('"6"')::jsonb, ('"7"')::jsonb, ('"8"')::jsonb, + ('"9"')::jsonb, ('"10"')::jsonb]); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; +-- CoerceViaIO, SubLink instead of a Const. Will not squash +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT * FROM test_squash_jsonb WHERE data IN + ((SELECT '"1"')::jsonb, (SELECT '"2"')::jsonb, (SELECT '"3"')::jsonb, + (SELECT '"4"')::jsonb, (SELECT '"5"')::jsonb, (SELECT '"6"')::jsonb, + (SELECT '"7"')::jsonb, (SELECT '"8"')::jsonb, (SELECT '"9"')::jsonb, + (SELECT '"10"')::jsonb); +SELECT * FROM test_squash_jsonb WHERE data = ANY(ARRAY + [(SELECT '"1"')::jsonb, (SELECT '"2"')::jsonb, (SELECT '"3"')::jsonb, + (SELECT '"4"')::jsonb, (SELECT '"5"')::jsonb, (SELECT '"6"')::jsonb, + (SELECT '"7"')::jsonb, (SELECT '"8"')::jsonb, (SELECT '"9"')::jsonb, + (SELECT '"10"')::jsonb]); +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + +-- Multiple CoerceViaIO are squashed +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT WHERE 1 IN (1::text::int::text::int, 1::text::int::text::int); +SELECT WHERE 1 = ANY(ARRAY[1::text::int::text::int, 1::text::int::text::int]); +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + +-- -- RelabelType +-- + SELECT pg_stat_statements_reset() IS NOT NULL AS t; -SELECT * FROM test_squash WHERE id IN (1::oid, 2::oid, 3::oid, 4::oid, 5::oid, 6::oid, 7::oid, 8::oid, 9::oid); +-- However many layers of RelabelType there are, the list will be squashable. 
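+-- (RelabelType is the expression node used for binary-compatible casts
+-- such as int4 -> oid, so peeling off any number of them still exposes
+-- a plain Const underneath)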
+SELECT * FROM test_squash WHERE id IN + (1::oid, 2::oid, 3::oid, 4::oid, 5::oid, 6::oid, 7::oid, 8::oid, 9::oid); +SELECT ARRAY[1::oid, 2::oid, 3::oid, 4::oid, 5::oid, 6::oid, 7::oid, 8::oid, 9::oid]; +SELECT * FROM test_squash WHERE id IN (1::oid, 2::oid::int::oid); +SELECT * FROM test_squash WHERE id = ANY(ARRAY[1::oid, 2::oid::int::oid]); +-- RelabelType together with CoerceViaIO is also squashable +SELECT * FROM test_squash WHERE id = ANY(ARRAY[1::oid::text::int::oid, 2::oid::int::oid]); +SELECT * FROM test_squash WHERE id = ANY(ARRAY[1::text::int::oid, 2::oid::int::oid]); +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + +-- +-- edge cases +-- + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +-- for nested arrays, only constants are squashed +SELECT ARRAY[ + ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + ]; SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; -- Test constants evaluation in a CTE, which was causing issues in the past @@ -163,7 +277,52 @@ WITH cte AS ( SELECT ARRAY['a', 'b', 'c', const::varchar] AS result FROM cte; --- Simple array would be squashed as well SELECT pg_stat_statements_reset() IS NOT NULL AS t; -SELECT ARRAY[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; +-- Rewritten as an OpExpr, so it will not be squashed +select where '1' IN ('1'::int, '2'::int::text); +-- Rewritten as an ArrayExpr, so it will be squashed +select where '1' IN ('1'::int, '2'::int); SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +-- Both of these queries will be rewritten as an ArrayExpr, so they +-- will be squashed, and have a similar queryId +select where '1' IN ('1'::int::text, '2'::int::text); +select where '1' = ANY (array['1'::int::text, '2'::int::text]); +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + +-- composite function with row expansion +create table test_composite(x integer); +CREATE FUNCTION composite_f(a integer[], out x integer, out y integer) returns +record as $$ begin + x = a[1]; + y = a[2]; + end; +$$ language plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT ((composite_f(array[1, 2]))).* FROM test_composite; +SELECT ((composite_f(array[1, 2, 3]))).* FROM test_composite; +SELECT ((composite_f(array[1, 2, 3]))).*, 1, 2, 3, ((composite_f(array[1, 2, 3]))).*, 1, 2 +FROM test_composite +WHERE x IN (1, 2, 3); +SELECT ((composite_f(array[1, $1, 3]))).*, 1 FROM test_composite \bind 1 +; +-- ROW() expression with row expansion +SELECT (ROW(ARRAY[1,2])).*; +SELECT (ROW(ARRAY[1, 2], ARRAY[1, 2, 3])).*; +SELECT 1, 2, (ROW(ARRAY[1, 2], ARRAY[1, 2, 3])).*, 3, 4; +SELECT (ROW(ARRAY[1, 2], ARRAY[1, $1, 3])).*, 1 \bind 1 +; +SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"; + +-- +-- cleanup +-- +DROP TABLE test_squash; +DROP TABLE test_float; +DROP TABLE test_squash_numeric; +DROP TABLE test_squash_bigint; +DROP TABLE test_squash_cast CASCADE; +DROP TABLE test_squash_jsonb; +DROP TABLE test_composite; +DROP FUNCTION composite_f; diff --git a/contrib/pg_surgery/heap_surgery.c b/contrib/pg_surgery/heap_surgery.c index 3e86283beb7cf..1096b05d7825b 100644 --- a/contrib/pg_surgery/heap_surgery.c +++ b/contrib/pg_surgery/heap_surgery.c @@ -356,8 +356,8 @@ heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt) static int32 tidcmp(const void *a, const void *b) { - 
ItemPointer iptr1 = ((const ItemPointer) a); - ItemPointer iptr2 = ((const ItemPointer) b); + const ItemPointerData *iptr1 = a; + const ItemPointerData *iptr2 = b; return ItemPointerCompare(iptr1, iptr2); } diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out index 0b70d9de25624..04da98170ab15 100644 --- a/contrib/pg_trgm/expected/pg_trgm.out +++ b/contrib/pg_trgm/expected/pg_trgm.out @@ -4693,6 +4693,23 @@ select count(*) from test_trgm where t like '%99%' and t like '%qw%'; 19 (1 row) +explain (costs off) +select count(*) from test_trgm where t %> '' and t %> '%qwerty%'; + QUERY PLAN +------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on test_trgm + Recheck Cond: ((t %> ''::text) AND (t %> '%qwerty%'::text)) + -> Bitmap Index Scan on trgm_idx + Index Cond: ((t %> ''::text) AND (t %> '%qwerty%'::text)) +(5 rows) + +select count(*) from test_trgm where t %> '' and t %> '%qwerty%'; + count +------- + 0 +(1 row) + -- ensure that pending-list items are handled correctly, too create temp table t_test_trgm(t text COLLATE "C"); create index t_trgm_idx on t_test_trgm using gin (t gin_trgm_ops); @@ -4731,6 +4748,23 @@ select count(*) from t_test_trgm where t like '%99%' and t like '%qw%'; 1 (1 row) +explain (costs off) +select count(*) from t_test_trgm where t %> '' and t %> '%qwerty%'; + QUERY PLAN +------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on t_test_trgm + Recheck Cond: ((t %> ''::text) AND (t %> '%qwerty%'::text)) + -> Bitmap Index Scan on t_trgm_idx + Index Cond: ((t %> ''::text) AND (t %> '%qwerty%'::text)) +(5 rows) + +select count(*) from t_test_trgm where t %> '' and t %> '%qwerty%'; + count +------- + 0 +(1 row) + -- run the same queries with sequential scan to check the results set enable_bitmapscan=off; set enable_seqscan=on; @@ -4746,6 +4780,12 @@ select count(*) from test_trgm where t like '%99%' and t like '%qw%'; 19 (1 row) +select count(*) from test_trgm where t %> '' and t %> '%qwerty%'; + count +------- + 0 +(1 row) + select count(*) from t_test_trgm where t like '%99%' and t like '%qwerty%'; count ------- @@ -4758,6 +4798,12 @@ select count(*) from t_test_trgm where t like '%99%' and t like '%qw%'; 1 (1 row) +select count(*) from t_test_trgm where t %> '' and t %> '%qwerty%'; + count +------- + 0 +(1 row) + reset enable_bitmapscan; create table test2(t text COLLATE "C"); insert into test2 values ('abcdef'); diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql index 340c9891899f0..44debced6d581 100644 --- a/contrib/pg_trgm/sql/pg_trgm.sql +++ b/contrib/pg_trgm/sql/pg_trgm.sql @@ -80,6 +80,9 @@ select count(*) from test_trgm where t like '%99%' and t like '%qwerty%'; explain (costs off) select count(*) from test_trgm where t like '%99%' and t like '%qw%'; select count(*) from test_trgm where t like '%99%' and t like '%qw%'; +explain (costs off) +select count(*) from test_trgm where t %> '' and t %> '%qwerty%'; +select count(*) from test_trgm where t %> '' and t %> '%qwerty%'; -- ensure that pending-list items are handled correctly, too create temp table t_test_trgm(t text COLLATE "C"); create index t_trgm_idx on t_test_trgm using gin (t gin_trgm_ops); @@ -90,14 +93,19 @@ select count(*) from t_test_trgm where t like '%99%' and t like '%qwerty%'; explain (costs off) select count(*) from t_test_trgm where t like '%99%' and t like '%qw%'; select count(*) from t_test_trgm where t like '%99%' and t like 
'%qw%'; +explain (costs off) +select count(*) from t_test_trgm where t %> '' and t %> '%qwerty%'; +select count(*) from t_test_trgm where t %> '' and t %> '%qwerty%'; -- run the same queries with sequential scan to check the results set enable_bitmapscan=off; set enable_seqscan=on; select count(*) from test_trgm where t like '%99%' and t like '%qwerty%'; select count(*) from test_trgm where t like '%99%' and t like '%qw%'; +select count(*) from test_trgm where t %> '' and t %> '%qwerty%'; select count(*) from t_test_trgm where t like '%99%' and t like '%qwerty%'; select count(*) from t_test_trgm where t like '%99%' and t like '%qw%'; +select count(*) from t_test_trgm where t %> '' and t %> '%qwerty%'; reset enable_bitmapscan; create table test2(t text COLLATE "C"); diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c index 29a52eac7afa4..66ff6adde9978 100644 --- a/contrib/pg_trgm/trgm_gin.c +++ b/contrib/pg_trgm/trgm_gin.c @@ -51,7 +51,7 @@ gin_extract_value_trgm(PG_FUNCTION_ARGS) int32 i; *nentries = trglen; - entries = (Datum *) palloc(sizeof(Datum) * trglen); + entries = palloc_array(Datum, trglen); ptr = GETARR(trg); for (i = 0; i < trglen; i++) @@ -123,7 +123,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) * Pointers, but we just put the same value in each element. */ trglen = ARRNELEM(trg); - *extra_data = (Pointer *) palloc(sizeof(Pointer) * trglen); + *extra_data = palloc_array(Pointer, trglen); for (i = 0; i < trglen; i++) (*extra_data)[i] = (Pointer) graph; } @@ -146,7 +146,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) if (trglen > 0) { - entries = (Datum *) palloc(sizeof(Datum) * trglen); + entries = palloc_array(Datum, trglen); ptr = GETARR(trg); for (i = 0; i < trglen; i++) { @@ -247,8 +247,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) res = true; } else - res = trigramsMatchGraph((TrgmPackedGraph *) extra_data[0], - check); + res = trigramsMatchGraph(extra_data[0], check); break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); @@ -339,11 +338,10 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS) * function, promoting all GIN_MAYBE keys to GIN_TRUE will * give a conservative result. 
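 * (This is conservative: if the graph cannot match even with every
 * GIN_MAYBE key assumed present, then no concrete assignment of those
 * keys can match either, so GIN_FALSE is a safe answer.)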
*/ - boolcheck = (bool *) palloc(sizeof(bool) * nkeys); + boolcheck = palloc_array(bool, nkeys); for (i = 0; i < nkeys; i++) boolcheck[i] = (check[i] != GIN_FALSE); - if (!trigramsMatchGraph((TrgmPackedGraph *) extra_data[0], - boolcheck)) + if (!trigramsMatchGraph(extra_data[0], boolcheck)) res = GIN_FALSE; pfree(boolcheck); } diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index 5ba895217b0a9..5c7deb103a636 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -124,7 +124,7 @@ gtrgm_compress(PG_FUNCTION_ARGS) text *val = DatumGetTextPP(entry->key); res = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val)); - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(res), entry->rel, entry->page, entry->offset, false); @@ -143,7 +143,7 @@ gtrgm_compress(PG_FUNCTION_ARGS) } res = gtrgm_alloc(true, siglen, sign); - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(res), entry->rel, entry->page, entry->offset, false); @@ -163,7 +163,7 @@ gtrgm_decompress(PG_FUNCTION_ARGS) if (key != (text *) DatumGetPointer(entry->key)) { /* need to pass back the decompressed item */ - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(key), entry->rel, entry->page, entry->offset, entry->leafkey); PG_RETURN_POINTER(retval); @@ -820,7 +820,7 @@ gtrgm_picksplit(PG_FUNCTION_ARGS) SPLITCOST *costvector; /* cache the sign data for each existing item */ - cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 1)); + cache = palloc_array(CACHESIGN, maxoff + 1); cache_sign = palloc(siglen * (maxoff + 1)); for (k = FirstOffsetNumber; k <= maxoff; k = OffsetNumberNext(k)) @@ -864,7 +864,7 @@ gtrgm_picksplit(PG_FUNCTION_ARGS) union_r = GETSIGN(datum_r); /* sort before ... */ - costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + costvector = palloc_array(SPLITCOST, maxoff); for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) { costvector[j - 1].pos = j; diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 29b39ec8a4c25..81182a15e07ae 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -452,7 +452,7 @@ make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2) int i, len = len1 + len2; - result = (pos_trgm *) palloc(sizeof(pos_trgm) * len); + result = palloc_array(pos_trgm, len); for (i = 0; i < len1; i++) { @@ -535,7 +535,7 @@ iterate_word_similarity(int *trg2indexes, lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1; /* Memorise last position of each trigram */ - lastpos = (int *) palloc(sizeof(int) * len); + lastpos = palloc_array(int, len); memset(lastpos, -1, sizeof(int) * len); for (i = 0; i < len2; i++) @@ -711,8 +711,8 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2, * Merge positional trigrams array: enumerate each trigram and find its * presence in required word. 
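 * (trg2indexes maps each trigram of the second word to its index in the
 * merged array of unique trigrams, while "found" flags which of those
 * unique trigrams also occur in the first word.)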
*/ - trg2indexes = (int *) palloc(sizeof(int) * len2); - found = (bool *) palloc0(sizeof(bool) * len); + trg2indexes = palloc_array(int, len2); + found = palloc0_array(bool, len); ulen1 = 0; j = 0; @@ -938,7 +938,7 @@ generate_wildcard_trgm(const char *str, int slen) tptr = GETARR(trg); /* Allocate a buffer for blank-padded, but not yet case-folded, words */ - buf = palloc(sizeof(char) * (slen + 4)); + buf = palloc_array(char, slen + 4); /* * Extract trigrams from each substring extracted by get_wildcard_part. @@ -1008,7 +1008,7 @@ show_trgm(PG_FUNCTION_ARGS) int i; trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in)); - d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg))); + d = palloc_array(Datum, 1 + ARRNELEM(trg)); for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++) { @@ -1136,7 +1136,7 @@ trgm_presence_map(TRGM *query, TRGM *key) lenk = ARRNELEM(key), i; - result = (bool *) palloc0(lenq * sizeof(bool)); + result = palloc0_array(bool, lenq); /* for each query trigram, do a binary search in the key array */ for (i = 0; i < lenq; i++) diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 149f9eb259c01..1a76794c42298 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -791,12 +791,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) colorInfo->expandable = true; colorInfo->containsNonWord = false; - colorInfo->wordChars = (trgm_mb_char *) - palloc(sizeof(trgm_mb_char) * charsCount); + colorInfo->wordChars = palloc_array(trgm_mb_char, charsCount); colorInfo->wordCharsCount = 0; /* Extract all the chars in this color */ - chars = (pg_wchar *) palloc(sizeof(pg_wchar) * charsCount); + chars = palloc_array(pg_wchar, charsCount); pg_reg_getcharacters(regex, i, chars, charsCount); /* @@ -1063,7 +1062,7 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key) * original NFA. 
*/ arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate); - arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); + arcs = palloc_array(regex_arc_t, arcsCount); pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount); for (i = 0; i < arcsCount; i++) @@ -1177,7 +1176,7 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key) static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key) { - TrgmStateKey *keyCopy = (TrgmStateKey *) palloc(sizeof(TrgmStateKey)); + TrgmStateKey *keyCopy = palloc_object(TrgmStateKey); memcpy(keyCopy, key, sizeof(TrgmStateKey)); trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy); @@ -1215,7 +1214,7 @@ addArcs(TrgmNFA *trgmNFA, TrgmState *state) TrgmStateKey *key = (TrgmStateKey *) lfirst(cell); arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate); - arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); + arcs = palloc_array(regex_arc_t, arcsCount); pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount); for (i = 0; i < arcsCount; i++) @@ -1311,7 +1310,7 @@ addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, } /* Checks were successful, add new arc */ - arc = (TrgmArc *) palloc(sizeof(TrgmArc)); + arc = palloc_object(TrgmArc); arc->target = getState(trgmNFA, destKey); arc->ctrgm.colors[0] = key->prefix.colors[0]; arc->ctrgm.colors[1] = key->prefix.colors[1]; @@ -1467,7 +1466,7 @@ selectColorTrigrams(TrgmNFA *trgmNFA) int cnumber; /* Collect color trigrams from all arcs */ - colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount); + colorTrgms = palloc0_array(ColorTrgmInfo, arcsCount); trgmNFA->colorTrgms = colorTrgms; i = 0; @@ -1479,7 +1478,7 @@ selectColorTrigrams(TrgmNFA *trgmNFA) foreach(cell, state->arcs) { TrgmArc *arc = (TrgmArc *) lfirst(cell); - TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo)); + TrgmArcInfo *arcInfo = palloc_object(TrgmArcInfo); ColorTrgmInfo *trgmInfo = &colorTrgms[i]; arcInfo->source = state; @@ -1964,8 +1963,7 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) } /* Collect array of all arcs */ - arcs = (TrgmPackArcInfo *) - palloc(sizeof(TrgmPackArcInfo) * trgmNFA->arcsCount); + arcs = palloc_array(TrgmPackArcInfo, trgmNFA->arcsCount); arcIndex = 0; hash_seq_init(&scan_status, trgmNFA->states); while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) @@ -2147,7 +2145,7 @@ printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors) appendStringInfoString(&buf, ";\n"); arcsCount = pg_reg_getnumoutarcs(regex, state); - arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); + arcs = palloc_array(regex_arc_t, arcsCount); pg_reg_getoutarcs(regex, state, arcs, arcsCount); for (i = 0; i < arcsCount; i++) diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index d79ef35006bfa..715f5cdd17c57 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -270,11 +270,8 @@ pg_visibility_map_summary(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); Relation rel; - BlockNumber nblocks; - BlockNumber blkno; - Buffer vmbuffer = InvalidBuffer; - int64 all_visible = 0; - int64 all_frozen = 0; + BlockNumber all_visible = 0; + BlockNumber all_frozen = 0; TupleDesc tupdesc; Datum values[2]; bool nulls[2] = {0}; @@ -284,33 +281,15 @@ pg_visibility_map_summary(PG_FUNCTION_ARGS) /* Only some relkinds have a visibility map */ check_relation_relkind(rel); - nblocks = RelationGetNumberOfBlocks(rel); - - for (blkno = 0; blkno < nblocks; 
++blkno) - { - int32 mapbits; - - /* Make sure we are interruptible. */ - CHECK_FOR_INTERRUPTS(); - - /* Get map info. */ - mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer); - if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0) - ++all_visible; - if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0) - ++all_frozen; - } + visibilitymap_count(rel, &all_visible, &all_frozen); - /* Clean up. */ - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); relation_close(rel, AccessShareLock); if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); - values[0] = Int64GetDatum(all_visible); - values[1] = Int64GetDatum(all_frozen); + values[0] = Int64GetDatum((int64) all_visible); + values[1] = Int64GetDatum((int64) all_frozen); PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); } @@ -741,7 +720,7 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) * number of entries allocated. We'll repurpose these fields before * returning. */ - items = palloc0(sizeof(corrupt_items)); + items = palloc0_object(corrupt_items); items->next = 0; items->count = 64; items->tids = palloc(items->count * sizeof(ItemPointerData)); diff --git a/contrib/pg_visibility/t/002_corrupt_vm.pl b/contrib/pg_visibility/t/002_corrupt_vm.pl index b9b319564669b..e558b2c13dc5b 100644 --- a/contrib/pg_visibility/t/002_corrupt_vm.pl +++ b/contrib/pg_visibility/t/002_corrupt_vm.pl @@ -40,7 +40,7 @@ "SELECT relpages FROM pg_class WHERE relname = 'corruption_test';" ); -ok($npages >= 10, 'table has at least 10 pages'); +cmp_ok($npages, '>=', 10, 'table has at least 10 pages'); my $file = $node->safe_psql("postgres", "SELECT pg_relation_filepath('corruption_test');"); diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out index c010eed8c5d6e..f955ff5d3c52a 100644 --- a/contrib/pg_walinspect/expected/pg_walinspect.out +++ b/contrib/pg_walinspect/expected/pg_walinspect.out @@ -19,14 +19,14 @@ INSERT INTO sample_tbl SELECT * FROM generate_series(3, 4); -- =================================================================== -- Invalid input LSN. SELECT * FROM pg_get_wal_record_info('0/0'); -ERROR: could not read WAL at LSN 0/0 +ERROR: could not read WAL at LSN 0/00000000 -- Invalid start LSN. SELECT * FROM pg_get_wal_records_info('0/0', :'wal_lsn1'); -ERROR: could not read WAL at LSN 0/0 +ERROR: could not read WAL at LSN 0/00000000 SELECT * FROM pg_get_wal_stats('0/0', :'wal_lsn1'); -ERROR: could not read WAL at LSN 0/0 +ERROR: could not read WAL at LSN 0/00000000 SELECT * FROM pg_get_wal_block_info('0/0', :'wal_lsn1'); -ERROR: could not read WAL at LSN 0/0 +ERROR: could not read WAL at LSN 0/00000000 -- Start LSN > End LSN. 
SELECT * FROM pg_get_wal_records_info(:'wal_lsn2', :'wal_lsn1'); ERROR: WAL start LSN must be less than end LSN diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c index 64745564cc249..6945bac1306da 100644 --- a/contrib/pg_walinspect/pg_walinspect.c +++ b/contrib/pg_walinspect/pg_walinspect.c @@ -12,6 +12,7 @@ */ #include "postgres.h" +#include "access/htup_details.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" @@ -82,7 +83,7 @@ GetCurrentLSN(void) else curr_lsn = GetXLogReplayRecPtr(NULL); - Assert(!XLogRecPtrIsInvalid(curr_lsn)); + Assert(XLogRecPtrIsValid(curr_lsn)); return curr_lsn; } @@ -105,11 +106,10 @@ InitXLogReaderState(XLogRecPtr lsn) if (lsn < XLOG_BLCKSZ) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not read WAL at LSN %X/%X", + errmsg("could not read WAL at LSN %X/%08X", LSN_FORMAT_ARGS(lsn)))); - private_data = (ReadLocalXLogPageNoWaitPrivate *) - palloc0(sizeof(ReadLocalXLogPageNoWaitPrivate)); + private_data = palloc0_object(ReadLocalXLogPageNoWaitPrivate); xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.page_read = &read_local_xlog_page_no_wait, @@ -126,10 +126,10 @@ InitXLogReaderState(XLogRecPtr lsn) /* first find a valid recptr to start from */ first_valid_record = XLogFindNextRecord(xlogreader, lsn); - if (XLogRecPtrIsInvalid(first_valid_record)) + if (!XLogRecPtrIsValid(first_valid_record)) ereport(ERROR, - (errmsg("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(lsn)))); + errmsg("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(lsn))); return xlogreader; } @@ -168,12 +168,12 @@ ReadNextXLogRecord(XLogReaderState *xlogreader) if (errormsg) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL at %X/%X: %s", + errmsg("could not read WAL at %X/%08X: %s", LSN_FORMAT_ARGS(xlogreader->EndRecPtr), errormsg))); else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL at %X/%X", + errmsg("could not read WAL at %X/%08X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr)))); } @@ -309,7 +309,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record, /* Construct and save block_fpi_info */ bitcnt = pg_popcount((const char *) &blk->bimg_info, sizeof(uint8)); - flags = (Datum *) palloc0(sizeof(Datum) * bitcnt); + flags = palloc0_array(Datum, bitcnt); if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) != 0) flags[cnt++] = CStringGetTextDatum("HAS_HOLE"); if (blk->apply_image) @@ -479,7 +479,7 @@ pg_get_wal_record_info(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("WAL input LSN must be less than current LSN"), - errdetail("Current WAL LSN on the database system is at %X/%X.", + errdetail("Current WAL LSN on the database system is at %X/%08X.", LSN_FORMAT_ARGS(curr_lsn)))); /* Build a tuple descriptor for our result type. 
*/ @@ -491,7 +491,7 @@ pg_get_wal_record_info(PG_FUNCTION_ARGS) if (!ReadNextXLogRecord(xlogreader)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not read WAL at %X/%X", + errmsg("could not read WAL at %X/%08X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr)))); GetWALRecordInfo(xlogreader, values, nulls, PG_GET_WAL_RECORD_INFO_COLS); @@ -521,7 +521,7 @@ ValidateInputLSNs(XLogRecPtr start_lsn, XLogRecPtr *end_lsn) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("WAL start LSN must be less than current LSN"), - errdetail("Current WAL LSN on the database system is at %X/%X.", + errdetail("Current WAL LSN on the database system is at %X/%08X.", LSN_FORMAT_ARGS(curr_lsn)))); if (start_lsn > *end_lsn) @@ -827,7 +827,7 @@ pg_get_wal_records_info_till_end_of_wal(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("WAL start LSN must be less than current LSN"), - errdetail("Current WAL LSN on the database system is at %X/%X.", + errdetail("Current WAL LSN on the database system is at %X/%08X.", LSN_FORMAT_ARGS(end_lsn)))); GetWALRecordsInfo(fcinfo, start_lsn, end_lsn); @@ -846,7 +846,7 @@ pg_get_wal_stats_till_end_of_wal(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("WAL start LSN must be less than current LSN"), - errdetail("Current WAL LSN on the database system is at %X/%X.", + errdetail("Current WAL LSN on the database system is at %X/%08X.", LSN_FORMAT_ARGS(end_lsn)))); GetWalStats(fcinfo, start_lsn, end_lsn, stats_per_record); diff --git a/contrib/pgcrypto/expected/hmac-md5_2.out b/contrib/pgcrypto/expected/hmac-md5_2.out new file mode 100644 index 0000000000000..08cdf95d53245 --- /dev/null +++ b/contrib/pgcrypto/expected/hmac-md5_2.out @@ -0,0 +1,44 @@ +-- +-- HMAC-MD5 +-- +SELECT hmac( +'Hi There', +'\x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b'::bytea, +'md5'); +ERROR: Cannot use "md5": No such hash algorithm +-- 2 +SELECT hmac( +'Jefe', +'what do ya want for nothing?', +'md5'); +ERROR: Cannot use "md5": No such hash algorithm +-- 3 +SELECT hmac( +'\xdddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd'::bytea, +'\xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'::bytea, +'md5'); +ERROR: Cannot use "md5": No such hash algorithm +-- 4 +SELECT hmac( +'\xcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd'::bytea, +'\x0102030405060708090a0b0c0d0e0f10111213141516171819'::bytea, +'md5'); +ERROR: Cannot use "md5": No such hash algorithm +-- 5 +SELECT hmac( +'Test With Truncation', +'\x0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c'::bytea, +'md5'); +ERROR: Cannot use "md5": No such hash algorithm +-- 6 +SELECT hmac( +'Test Using Larger Than Block-Size Key - Hash Key First', +'\xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'::bytea, +'md5'); +ERROR: Cannot use "md5": No such hash algorithm +-- 7 +SELECT hmac( +'Test Using Larger Than Block-Size Key and Larger Than One Block-Size Data', +'\xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'::bytea, +'md5'); +ERROR: Cannot use "md5": No such hash algorithm diff --git a/contrib/pgcrypto/expected/md5_2.out b/contrib/pgcrypto/expected/md5_2.out new file mode 100644 index 0000000000000..51bdaa86f32b4 --- /dev/null +++ b/contrib/pgcrypto/expected/md5_2.out @@ -0,0 +1,17 
@@ +-- +-- MD5 message digest +-- +SELECT digest('', 'md5'); +ERROR: Cannot use "md5": No such hash algorithm +SELECT digest('a', 'md5'); +ERROR: Cannot use "md5": No such hash algorithm +SELECT digest('abc', 'md5'); +ERROR: Cannot use "md5": No such hash algorithm +SELECT digest('message digest', 'md5'); +ERROR: Cannot use "md5": No such hash algorithm +SELECT digest('abcdefghijklmnopqrstuvwxyz', 'md5'); +ERROR: Cannot use "md5": No such hash algorithm +SELECT digest('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789', 'md5'); +ERROR: Cannot use "md5": No such hash algorithm +SELECT digest('12345678901234567890123456789012345678901234567890123456789012345678901234567890', 'md5'); +ERROR: Cannot use "md5": No such hash algorithm diff --git a/contrib/pgcrypto/mbuf.c b/contrib/pgcrypto/mbuf.c index 99f8957b00414..6a23ad9970664 100644 --- a/contrib/pgcrypto/mbuf.c +++ b/contrib/pgcrypto/mbuf.c @@ -115,7 +115,7 @@ mbuf_create(int len) if (!len) len = 8192; - mbuf = palloc(sizeof *mbuf); + mbuf = palloc_object(MBuf); mbuf->data = palloc(len); mbuf->buf_end = mbuf->data + len; mbuf->data_end = mbuf->data; @@ -132,8 +132,8 @@ mbuf_create_from_data(uint8 *data, int len) { MBuf *mbuf; - mbuf = palloc(sizeof *mbuf); - mbuf->data = (uint8 *) data; + mbuf = palloc_object(MBuf); + mbuf->data = data; mbuf->buf_end = mbuf->data + len; mbuf->data_end = mbuf->data + len; mbuf->read_pos = mbuf->data; @@ -206,7 +206,7 @@ pullf_create(PullFilter **pf_p, const PullFilterOps *op, void *init_arg, PullFil res = 0; } - pf = palloc0(sizeof(*pf)); + pf = palloc0_object(PullFilter); pf->buflen = res; pf->op = op; pf->priv = priv; @@ -372,7 +372,7 @@ pushf_create(PushFilter **mp_p, const PushFilterOps *op, void *init_arg, PushFil res = 0; } - mp = palloc0(sizeof(*mp)); + mp = palloc0_object(PushFilter); mp->block_size = res; mp->op = op; mp->priv = priv; diff --git a/contrib/pgcrypto/openssl.c b/contrib/pgcrypto/openssl.c index f179e80c8429e..d3c12e7fda36a 100644 --- a/contrib/pgcrypto/openssl.c +++ b/contrib/pgcrypto/openssl.c @@ -197,7 +197,7 @@ px_find_digest(const char *name, PX_MD **res) ResourceOwnerRememberOSSLDigest(digest->owner, digest); /* The PX_MD object is allocated in the current memory context. 
*/ - h = palloc(sizeof(*h)); + h = palloc_object(PX_MD); h->result_size = digest_result_size; h->block_size = digest_block_size; h->reset = digest_reset; @@ -813,7 +813,7 @@ px_find_cipher(const char *name, PX_Cipher **res) od->evp_ciph = i->ciph->cipher_func(); /* The PX_Cipher is allocated in current memory context */ - c = palloc(sizeof(*c)); + c = palloc_object(PX_Cipher); c->block_size = gen_ossl_block_size; c->key_size = gen_ossl_key_size; c->iv_size = gen_ossl_iv_size; diff --git a/contrib/pgcrypto/pgp-cfb.c b/contrib/pgcrypto/pgp-cfb.c index de41e825b0ce8..d8f1afc3aba42 100644 --- a/contrib/pgcrypto/pgp-cfb.c +++ b/contrib/pgcrypto/pgp-cfb.c @@ -67,7 +67,7 @@ pgp_cfb_create(PGP_CFB **ctx_p, int algo, const uint8 *key, int key_len, return res; } - ctx = palloc0(sizeof(*ctx)); + ctx = palloc0_object(PGP_CFB); ctx->ciph = ciph; ctx->block_size = px_cipher_block_size(ciph); ctx->resync = resync; diff --git a/contrib/pgcrypto/pgp-compress.c b/contrib/pgcrypto/pgp-compress.c index 961cf21e74891..caa80ecdb4596 100644 --- a/contrib/pgcrypto/pgp-compress.c +++ b/contrib/pgcrypto/pgp-compress.c @@ -80,7 +80,7 @@ compress_init(PushFilter *next, void *init_arg, void **priv_p) /* * init */ - st = palloc0(sizeof(*st)); + st = palloc0_object(struct ZipStat); st->buf_len = ZIP_OUT_BUF; st->stream.zalloc = z_alloc; st->stream.zfree = z_free; @@ -211,7 +211,7 @@ decompress_init(void **priv_p, void *arg, PullFilter *src) && ctx->compress_algo != PGP_COMPR_ZIP) return PXE_PGP_UNSUPPORTED_COMPR; - dec = palloc0(sizeof(*dec)); + dec = palloc0_object(struct DecomprData); dec->buf_len = ZIP_OUT_BUF; *priv_p = dec; diff --git a/contrib/pgcrypto/pgp-decrypt.c b/contrib/pgcrypto/pgp-decrypt.c index e1ea5b3e58dcf..52ca7840c6d1e 100644 --- a/contrib/pgcrypto/pgp-decrypt.c +++ b/contrib/pgcrypto/pgp-decrypt.c @@ -224,7 +224,7 @@ pgp_create_pkt_reader(PullFilter **pf_p, PullFilter *src, int len, int pkttype, PGP_Context *ctx) { int res; - struct PktData *pkt = palloc(sizeof(*pkt)); + struct PktData *pkt = palloc_object(struct PktData); pkt->type = pkttype; pkt->len = len; @@ -448,7 +448,7 @@ mdcbuf_init(void **priv_p, void *arg, PullFilter *src) PGP_Context *ctx = arg; struct MDCBufData *st; - st = palloc0(sizeof(*st)); + st = palloc0_object(struct MDCBufData); st->buflen = sizeof(st->buf); st->ctx = ctx; *priv_p = st; diff --git a/contrib/pgcrypto/pgp-encrypt.c b/contrib/pgcrypto/pgp-encrypt.c index f7467c9b1cb1c..2c05980470625 100644 --- a/contrib/pgcrypto/pgp-encrypt.c +++ b/contrib/pgcrypto/pgp-encrypt.c @@ -178,7 +178,7 @@ encrypt_init(PushFilter *next, void *init_arg, void **priv_p) if (res < 0) return res; - st = palloc0(sizeof(*st)); + st = palloc0_object(struct EncStat); st->ciph = ciph; *priv_p = st; @@ -240,7 +240,7 @@ pkt_stream_init(PushFilter *next, void *init_arg, void **priv_p) { struct PktStreamStat *st; - st = palloc(sizeof(*st)); + st = palloc_object(struct PktStreamStat); st->final_done = 0; st->pkt_block = 1 << STREAM_BLOCK_SHIFT; *priv_p = st; diff --git a/contrib/pgcrypto/pgp-pgsql.c b/contrib/pgcrypto/pgp-pgsql.c index 7c9f4c7b39b88..3e47b9364ab35 100644 --- a/contrib/pgcrypto/pgp-pgsql.c +++ b/contrib/pgcrypto/pgp-pgsql.c @@ -782,8 +782,8 @@ parse_key_value_arrays(ArrayType *key_array, ArrayType *val_array, (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), errmsg("mismatched array dimensions"))); - keys = (char **) palloc(sizeof(char *) * key_count); - values = (char **) palloc(sizeof(char *) * val_count); + keys = palloc_array(char *, key_count); + values = palloc_array(char *, val_count); for 
(i = 0; i < key_count; i++) { @@ -937,7 +937,7 @@ pgp_armor_headers(PG_FUNCTION_ARGS) attinmeta = TupleDescGetAttInMetadata(tupdesc); funcctx->attinmeta = attinmeta; - state = (pgp_armor_headers_state *) palloc(sizeof(pgp_armor_headers_state)); + state = palloc_object(pgp_armor_headers_state); res = pgp_extract_armor_headers((uint8 *) VARDATA_ANY(data), VARSIZE_ANY_EXHDR(data), diff --git a/contrib/pgcrypto/pgp-pubkey.c b/contrib/pgcrypto/pgp-pubkey.c index 9a6561caf9dde..6f1188659178b 100644 --- a/contrib/pgcrypto/pgp-pubkey.c +++ b/contrib/pgcrypto/pgp-pubkey.c @@ -39,7 +39,7 @@ pgp_key_alloc(PGP_PubKey **pk_p) { PGP_PubKey *pk; - pk = palloc0(sizeof(*pk)); + pk = palloc0_object(PGP_PubKey); *pk_p = pk; return 0; } diff --git a/contrib/pgcrypto/px-hmac.c b/contrib/pgcrypto/px-hmac.c index 99174d265517b..68e5cff6d6acd 100644 --- a/contrib/pgcrypto/px-hmac.c +++ b/contrib/pgcrypto/px-hmac.c @@ -157,7 +157,7 @@ px_find_hmac(const char *name, PX_HMAC **res) return PXE_HASH_UNUSABLE_FOR_HMAC; } - h = palloc(sizeof(*h)); + h = palloc_object(PX_HMAC); h->p.ipad = palloc(bs); h->p.opad = palloc(bs); h->md = md; diff --git a/contrib/pgcrypto/px.c b/contrib/pgcrypto/px.c index d35ccca77746d..4d668d4e4969e 100644 --- a/contrib/pgcrypto/px.c +++ b/contrib/pgcrypto/px.c @@ -291,7 +291,7 @@ px_find_combo(const char *name, PX_Combo **res) PX_Combo *cx; - cx = palloc0(sizeof(*cx)); + cx = palloc0_object(PX_Combo); buf = pstrdup(name); err = parse_cipher_name(buf, &s_cipher, &s_pad); diff --git a/contrib/pgrowlocks/pgrowlocks.c b/contrib/pgrowlocks/pgrowlocks.c index b75d80fa7a9c2..f88269332b6be 100644 --- a/contrib/pgrowlocks/pgrowlocks.c +++ b/contrib/pgrowlocks/pgrowlocks.c @@ -141,8 +141,8 @@ pgrowlocks(PG_FUNCTION_ARGS) */ if (htsu == TM_BeingModified) { - values[Atnum_tid] = (char *) DirectFunctionCall1(tidout, - PointerGetDatum(&tuple->t_self)); + values[Atnum_tid] = DatumGetCString(DirectFunctionCall1(tidout, + PointerGetDatum(&tuple->t_self))); values[Atnum_xmax] = palloc(NCHARS * sizeof(char)); snprintf(values[Atnum_xmax], NCHARS, "%u", xmax); diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c index 4b9d76ec4e4df..40823d54fcac0 100644 --- a/contrib/pgstattuple/pgstatindex.c +++ b/contrib/pgstattuple/pgstatindex.c @@ -647,7 +647,7 @@ pgstathashindex(PG_FUNCTION_ARGS) buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, BUFFER_LOCK_SHARE); - page = (Page) BufferGetPage(buf); + page = BufferGetPage(buf); if (PageIsNew(page)) stats.unused_pages++; diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 0d9c2b0b65369..6a7f8cb4a7ca5 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -378,7 +378,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, RBM_NORMAL, hscan->rs_strategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); - stat.free_space += PageGetExactFreeSpace((Page) BufferGetPage(buffer)); + stat.free_space += PageGetExactFreeSpace(BufferGetPage(buffer)); UnlockReleaseBuffer(buffer); block++; } @@ -391,7 +391,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, RBM_NORMAL, hscan->rs_strategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); - stat.free_space += PageGetExactFreeSpace((Page) BufferGetPage(buffer)); + stat.free_space += PageGetExactFreeSpace(BufferGetPage(buffer)); UnlockReleaseBuffer(buffer); block++; } @@ -424,7 +424,7 @@ 
pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, /* fully empty page */ stat->free_space += BLCKSZ; } - else + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(BTPageOpaqueData))) { BTPageOpaque opaque; @@ -458,10 +458,16 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, Buffer buf; Page page; - buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy); + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); + LockBuffer(buf, HASH_READ); page = BufferGetPage(buf); - if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData))) + if (PageIsNew(page)) + { + /* fully empty page */ + stat->free_space += BLCKSZ; + } + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData))) { HashPageOpaque opaque; @@ -502,17 +508,23 @@ pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, GIST_SHARE); - gistcheckpage(rel, buf); page = BufferGetPage(buf); - - if (GistPageIsLeaf(page)) + if (PageIsNew(page)) { - pgstat_index_page(stat, page, FirstOffsetNumber, - PageGetMaxOffsetNumber(page)); + /* fully empty page */ + stat->free_space += BLCKSZ; } - else + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(GISTPageOpaqueData))) { - /* root or node */ + if (GistPageIsLeaf(page)) + { + pgstat_index_page(stat, page, FirstOffsetNumber, + PageGetMaxOffsetNumber(page)); + } + else + { + /* root or node */ + } } UnlockReleaseBuffer(buf); diff --git a/contrib/postgres_fdw/.gitignore b/contrib/postgres_fdw/.gitignore index 5dcb3ff972350..b4903eba657fa 100644 --- a/contrib/postgres_fdw/.gitignore +++ b/contrib/postgres_fdw/.gitignore @@ -1,4 +1,6 @@ # Generated subdirectories /log/ /results/ +/output_iso/ /tmp_check/ +/tmp_check_iso/ diff --git a/contrib/postgres_fdw/Makefile b/contrib/postgres_fdw/Makefile index adfbd2ef758e0..8eaf4d263b688 100644 --- a/contrib/postgres_fdw/Makefile +++ b/contrib/postgres_fdw/Makefile @@ -17,6 +17,8 @@ EXTENSION = postgres_fdw DATA = postgres_fdw--1.0.sql postgres_fdw--1.0--1.1.sql postgres_fdw--1.1--1.2.sql REGRESS = postgres_fdw query_cancel +ISOLATION = eval_plan_qual +ISOLATION_OPTS = --load-extension=postgres_fdw TAP_TESTS = 1 ifdef USE_PGXS diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c index 304f3c20f8356..953c2e0ab8285 100644 --- a/contrib/postgres_fdw/connection.c +++ b/contrib/postgres_fdw/connection.c @@ -16,6 +16,7 @@ #include <poll.h> #endif +#include "access/htup_details.h" #include "access/xact.h" #include "catalog/pg_user_mapping.h" #include "commands/defrem.h" @@ -142,6 +143,8 @@ static void do_sql_command_begin(PGconn *conn, const char *sql); static void do_sql_command_end(PGconn *conn, const char *sql, bool consume_input); static void begin_remote_xact(ConnCacheEntry *entry); +static void pgfdw_report_internal(int elevel, PGresult *res, PGconn *conn, + const char *sql); static void pgfdw_xact_callback(XactEvent event, void *arg); static void pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid, @@ -462,7 +465,7 @@ pgfdw_security_check(const char **keywords, const char **values, UserMapping *us * assume that UseScramPassthrough is also true since SCRAM options are * only set when UseScramPassthrough is enabled.
*/ - if (MyProcPort->has_scram_keys && pgfdw_has_required_scram_options(keywords, values)) + if (MyProcPort != NULL && MyProcPort->has_scram_keys && pgfdw_has_required_scram_options(keywords, values)) return; ereport(ERROR, @@ -568,7 +571,7 @@ connect_pg_server(ForeignServer *server, UserMapping *user) n++; /* Add required SCRAM pass-through connection options if it's enabled. */ - if (MyProcPort->has_scram_keys && UseScramPassthrough(server, user)) + if (MyProcPort != NULL && MyProcPort->has_scram_keys && UseScramPassthrough(server, user)) { int len; int encoded_len; @@ -625,6 +628,9 @@ connect_pg_server(ForeignServer *server, UserMapping *user) server->servername), errdetail_internal("%s", pchomp(PQerrorMessage(conn))))); + PQsetNoticeReceiver(conn, libpqsrv_notice_receiver, + "received message via remote connection"); + /* Perform post-connection security checks. */ pgfdw_security_check(keywords, values, user, conn); @@ -743,7 +749,7 @@ check_conn_params(const char **keywords, const char **values, UserMapping *user) * assume that UseScramPassthrough is also true since SCRAM options are * only set when UseScramPassthrough is enabled. */ - if (MyProcPort->has_scram_keys && pgfdw_has_required_scram_options(keywords, values)) + if (MyProcPort != NULL && MyProcPort->has_scram_keys && pgfdw_has_required_scram_options(keywords, values)) return; ereport(ERROR, @@ -812,7 +818,7 @@ static void do_sql_command_begin(PGconn *conn, const char *sql) { if (!PQsendQuery(conn, sql)) - pgfdw_report_error(ERROR, NULL, conn, false, sql); + pgfdw_report_error(NULL, conn, sql); } static void @@ -827,10 +833,10 @@ do_sql_command_end(PGconn *conn, const char *sql, bool consume_input) * would be large compared to the overhead of PQconsumeInput.) */ if (consume_input && !PQconsumeInput(conn)) - pgfdw_report_error(ERROR, NULL, conn, false, sql); + pgfdw_report_error(NULL, conn, sql); res = pgfdw_get_result(conn); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, conn, true, sql); + pgfdw_report_error(res, conn, sql); PQclear(res); } @@ -963,63 +969,73 @@ pgfdw_get_result(PGconn *conn) /* * Report an error we got from the remote server. * - * elevel: error level to use (typically ERROR, but might be less) - * res: PGresult containing the error + * Callers should use pgfdw_report_error() to throw an error, or use + * pgfdw_report() for lesser message levels. (We make this distinction + * so that pgfdw_report_error() can be marked noreturn.) + * + * res: PGresult containing the error (might be NULL) * conn: connection we did the query on - * clear: if true, PQclear the result (otherwise caller will handle it) * sql: NULL, or text of remote command we tried to execute * + * If "res" is not NULL, it'll be PQclear'ed here (unless we throw error, + * in which case memory context cleanup will clear it eventually). + * * Note: callers that choose not to throw ERROR for a remote error are * responsible for making sure that the associated ConnCacheEntry gets * marked with have_error = true. */ void -pgfdw_report_error(int elevel, PGresult *res, PGconn *conn, - bool clear, const char *sql) +pgfdw_report_error(PGresult *res, PGconn *conn, const char *sql) { - /* If requested, PGresult must be released before leaving this function. 
*/ - PG_TRY(); - { - char *diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); - char *message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); - char *message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); - char *message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); - char *message_context = PQresultErrorField(res, PG_DIAG_CONTEXT); - int sqlstate; - - if (diag_sqlstate) - sqlstate = MAKE_SQLSTATE(diag_sqlstate[0], - diag_sqlstate[1], - diag_sqlstate[2], - diag_sqlstate[3], - diag_sqlstate[4]); - else - sqlstate = ERRCODE_CONNECTION_FAILURE; + pgfdw_report_internal(ERROR, res, conn, sql); + pg_unreachable(); +} - /* - * If we don't get a message from the PGresult, try the PGconn. This - * is needed because for connection-level failures, PQgetResult may - * just return NULL, not a PGresult at all. - */ - if (message_primary == NULL) - message_primary = pchomp(PQerrorMessage(conn)); - - ereport(elevel, - (errcode(sqlstate), - (message_primary != NULL && message_primary[0] != '\0') ? - errmsg_internal("%s", message_primary) : - errmsg("could not obtain message string for remote error"), - message_detail ? errdetail_internal("%s", message_detail) : 0, - message_hint ? errhint("%s", message_hint) : 0, - message_context ? errcontext("%s", message_context) : 0, - sql ? errcontext("remote SQL command: %s", sql) : 0)); - } - PG_FINALLY(); - { - if (clear) - PQclear(res); - } - PG_END_TRY(); +void +pgfdw_report(int elevel, PGresult *res, PGconn *conn, const char *sql) +{ + Assert(elevel < ERROR); /* use pgfdw_report_error for that */ + pgfdw_report_internal(elevel, res, conn, sql); +} + +static void +pgfdw_report_internal(int elevel, PGresult *res, PGconn *conn, + const char *sql) +{ + char *diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); + char *message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); + char *message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); + char *message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); + char *message_context = PQresultErrorField(res, PG_DIAG_CONTEXT); + int sqlstate; + + if (diag_sqlstate) + sqlstate = MAKE_SQLSTATE(diag_sqlstate[0], + diag_sqlstate[1], + diag_sqlstate[2], + diag_sqlstate[3], + diag_sqlstate[4]); + else + sqlstate = ERRCODE_CONNECTION_FAILURE; + + /* + * If we don't get a message from the PGresult, try the PGconn. This is + * needed because for connection-level failures, PQgetResult may just + * return NULL, not a PGresult at all. + */ + if (message_primary == NULL) + message_primary = pchomp(PQerrorMessage(conn)); + + ereport(elevel, + (errcode(sqlstate), + (message_primary != NULL && message_primary[0] != '\0') ? + errmsg_internal("%s", message_primary) : + errmsg("could not obtain message string for remote error"), + message_detail ? errdetail_internal("%s", message_detail) : 0, + message_hint ? errhint("%s", message_hint) : 0, + message_context ? errcontext("%s", message_context) : 0, + sql ? 
errcontext("remote SQL command: %s", sql) : 0)); + PQclear(res); } /* @@ -1542,7 +1558,7 @@ pgfdw_exec_cleanup_query_begin(PGconn *conn, const char *query) */ if (!PQsendQuery(conn, query)) { - pgfdw_report_error(WARNING, NULL, conn, false, query); + pgfdw_report(WARNING, NULL, conn, query); return false; } @@ -1567,7 +1583,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query, */ if (consume_input && !PQconsumeInput(conn)) { - pgfdw_report_error(WARNING, NULL, conn, false, query); + pgfdw_report(WARNING, NULL, conn, query); return false; } @@ -1579,7 +1595,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query, (errmsg("could not get query result due to timeout"), errcontext("remote SQL command: %s", query))); else - pgfdw_report_error(WARNING, NULL, conn, false, query); + pgfdw_report(WARNING, NULL, conn, query); return false; } @@ -1587,7 +1603,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query, /* Issue a warning if not successful. */ if (PQresultStatus(result) != PGRES_COMMAND_OK) { - pgfdw_report_error(WARNING, result, conn, true, query); + pgfdw_report(WARNING, result, conn, query); return ignore_errors; } PQclear(result); @@ -1615,103 +1631,90 @@ pgfdw_get_cleanup_result(PGconn *conn, TimestampTz endtime, PGresult **result, bool *timed_out) { - volatile bool failed = false; - PGresult *volatile last_res = NULL; + bool failed = false; + PGresult *last_res = NULL; + int canceldelta = RETRY_CANCEL_TIMEOUT * 2; *result = NULL; *timed_out = false; - - /* In what follows, do not leak any PGresults on an error. */ - PG_TRY(); + for (;;) { - int canceldelta = RETRY_CANCEL_TIMEOUT * 2; + PGresult *res; - for (;;) + while (PQisBusy(conn)) { - PGresult *res; + int wc; + TimestampTz now = GetCurrentTimestamp(); + long cur_timeout; - while (PQisBusy(conn)) + /* If timeout has expired, give up. */ + if (now >= endtime) { - int wc; - TimestampTz now = GetCurrentTimestamp(); - long cur_timeout; - - /* If timeout has expired, give up. */ - if (now >= endtime) - { - *timed_out = true; - failed = true; - goto exit; - } + *timed_out = true; + failed = true; + goto exit; + } - /* If we need to re-issue the cancel request, do that. */ - if (now >= retrycanceltime) - { - /* We ignore failure to issue the repeated request. */ - (void) libpqsrv_cancel(conn, endtime); + /* If we need to re-issue the cancel request, do that. */ + if (now >= retrycanceltime) + { + /* We ignore failure to issue the repeated request. */ + (void) libpqsrv_cancel(conn, endtime); - /* Recompute "now" in case that took measurable time. */ - now = GetCurrentTimestamp(); + /* Recompute "now" in case that took measurable time. */ + now = GetCurrentTimestamp(); - /* Adjust re-cancel timeout in increasing steps. */ - retrycanceltime = TimestampTzPlusMilliseconds(now, - canceldelta); - canceldelta += canceldelta; - } + /* Adjust re-cancel timeout in increasing steps. */ + retrycanceltime = TimestampTzPlusMilliseconds(now, + canceldelta); + canceldelta += canceldelta; + } - /* If timeout has expired, give up, else get sleep time. */ - cur_timeout = TimestampDifferenceMilliseconds(now, - Min(endtime, - retrycanceltime)); - if (cur_timeout <= 0) - { - *timed_out = true; - failed = true; - goto exit; - } + /* If timeout has expired, give up, else get sleep time. 
*/ + cur_timeout = TimestampDifferenceMilliseconds(now, + Min(endtime, + retrycanceltime)); + if (cur_timeout <= 0) + { + *timed_out = true; + failed = true; + goto exit; + } - /* first time, allocate or get the custom wait event */ - if (pgfdw_we_cleanup_result == 0) - pgfdw_we_cleanup_result = WaitEventExtensionNew("PostgresFdwCleanupResult"); + /* first time, allocate or get the custom wait event */ + if (pgfdw_we_cleanup_result == 0) + pgfdw_we_cleanup_result = WaitEventExtensionNew("PostgresFdwCleanupResult"); - /* Sleep until there's something to do */ - wc = WaitLatchOrSocket(MyLatch, - WL_LATCH_SET | WL_SOCKET_READABLE | - WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, - PQsocket(conn), - cur_timeout, pgfdw_we_cleanup_result); - ResetLatch(MyLatch); + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + PQsocket(conn), + cur_timeout, pgfdw_we_cleanup_result); + ResetLatch(MyLatch); - CHECK_FOR_INTERRUPTS(); + CHECK_FOR_INTERRUPTS(); - /* Data available in socket? */ - if (wc & WL_SOCKET_READABLE) + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(conn)) { - if (!PQconsumeInput(conn)) - { - /* connection trouble */ - failed = true; - goto exit; - } + /* connection trouble */ + failed = true; + goto exit; } } + } - res = PQgetResult(conn); - if (res == NULL) - break; /* query is complete */ + res = PQgetResult(conn); + if (res == NULL) + break; /* query is complete */ - PQclear(last_res); - last_res = res; - } -exit: ; - } - PG_CATCH(); - { PQclear(last_res); - PG_RE_THROW(); + last_res = res; } - PG_END_TRY(); - +exit: if (failed) PQclear(last_res); else @@ -2557,7 +2560,7 @@ pgfdw_has_required_scram_options(const char **keywords, const char **values) } } - has_scram_keys = has_scram_client_key && has_scram_server_key && MyProcPort->has_scram_keys; + has_scram_keys = has_scram_client_key && has_scram_server_key && MyProcPort != NULL && MyProcPort->has_scram_keys; return (has_scram_keys && has_require_auth); } diff --git a/contrib/postgres_fdw/deparse.c b/contrib/postgres_fdw/deparse.c index d9970dd675336..f2fb005184316 100644 --- a/contrib/postgres_fdw/deparse.c +++ b/contrib/postgres_fdw/deparse.c @@ -39,6 +39,7 @@ #include "catalog/pg_aggregate.h" #include "catalog/pg_authid.h" #include "catalog/pg_collation.h" +#include "catalog/pg_database.h" #include "catalog/pg_namespace.h" #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" @@ -160,6 +161,7 @@ static void deparseDistinctExpr(DistinctExpr *node, deparse_expr_cxt *context); static void deparseScalarArrayOpExpr(ScalarArrayOpExpr *node, deparse_expr_cxt *context); static void deparseRelabelType(RelabelType *node, deparse_expr_cxt *context); +static void deparseArrayCoerceExpr(ArrayCoerceExpr *node, deparse_expr_cxt *context); static void deparseBoolExpr(BoolExpr *node, deparse_expr_cxt *context); static void deparseNullTest(NullTest *node, deparse_expr_cxt *context); static void deparseCaseExpr(CaseExpr *node, deparse_expr_cxt *context); @@ -455,6 +457,11 @@ foreign_expr_walker(Node *node, AuthIdRelationId, fpinfo)) return false; break; + case REGDATABASEOID: + if (!is_shippable(DatumGetObjectId(c->constvalue), + DatabaseRelationId, fpinfo)) + return false; + break; } } @@ -696,6 +703,34 @@ foreign_expr_walker(Node *node, state = FDW_COLLATE_UNSAFE; } break; + case T_ArrayCoerceExpr: + { + ArrayCoerceExpr *e = (ArrayCoerceExpr *) node; + + /* + * Recurse to input subexpression. 
+ */ + if (!foreign_expr_walker((Node *) e->arg, + glob_cxt, &inner_cxt, case_arg_cxt)) + return false; + + /* + * T_ArrayCoerceExpr must not introduce a collation not + * derived from an input foreign Var (same logic as for a + * function). + */ + collation = e->resultcollid; + if (collation == InvalidOid) + state = FDW_COLLATE_NONE; + else if (inner_cxt.state == FDW_COLLATE_SAFE && + collation == inner_cxt.collation) + state = FDW_COLLATE_SAFE; + else if (collation == DEFAULT_COLLATION_OID) + state = FDW_COLLATE_NONE; + else + state = FDW_COLLATE_UNSAFE; + } + break; case T_BoolExpr: { BoolExpr *b = (BoolExpr *) node; @@ -1423,10 +1458,8 @@ deparseTargetList(StringInfo buf, first = true; for (i = 1; i <= tupdesc->natts; i++) { - Form_pg_attribute attr = TupleDescAttr(tupdesc, i - 1); - /* Ignore dropped attributes. */ - if (attr->attisdropped) + if (TupleDescCompactAttr(tupdesc, i - 1)->attisdropped) continue; if (have_wholerow || @@ -2115,7 +2148,7 @@ deparseInsertSql(StringInfo buf, RangeTblEntry *rte, foreach(lc, targetAttrs) { int attnum = lfirst_int(lc); - Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum - 1); + CompactAttribute *attr = TupleDescCompactAttr(tupdesc, attnum - 1); if (!first) appendStringInfoString(buf, ", "); @@ -2181,7 +2214,7 @@ rebuildInsertSql(StringInfo buf, Relation rel, foreach(lc, target_attrs) { int attnum = lfirst_int(lc); - Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum - 1); + CompactAttribute *attr = TupleDescCompactAttr(tupdesc, attnum - 1); if (!first) appendStringInfoString(buf, ", "); @@ -2231,7 +2264,7 @@ deparseUpdateSql(StringInfo buf, RangeTblEntry *rte, foreach(lc, targetAttrs) { int attnum = lfirst_int(lc); - Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum - 1); + CompactAttribute *attr = TupleDescCompactAttr(tupdesc, attnum - 1); if (!first) appendStringInfoString(buf, ", "); @@ -2913,6 +2946,9 @@ deparseExpr(Expr *node, deparse_expr_cxt *context) case T_RelabelType: deparseRelabelType((RelabelType *) node, context); break; + case T_ArrayCoerceExpr: + deparseArrayCoerceExpr((ArrayCoerceExpr *) node, context); + break; case T_BoolExpr: deparseBoolExpr((BoolExpr *) node, context); break; @@ -3501,6 +3537,24 @@ deparseRelabelType(RelabelType *node, deparse_expr_cxt *context) node->resulttypmod)); } +/* + * Deparse an ArrayCoerceExpr (array-type conversion) node. + */ +static void +deparseArrayCoerceExpr(ArrayCoerceExpr *node, deparse_expr_cxt *context) +{ + deparseExpr(node->arg, context); + + /* + * It makes no difference how we deparse an explicit cast, but omitting + * an implicit cast from the query is more user-friendly. + */ + if (node->coerceformat != COERCE_IMPLICIT_CAST) + appendStringInfo(context->buf, "::%s", + deparse_type_name(node->resulttype, + node->resulttypmod)); +} + /* * Deparse a BoolExpr node. */ diff --git a/contrib/postgres_fdw/expected/eval_plan_qual.out b/contrib/postgres_fdw/expected/eval_plan_qual.out new file mode 100644 index 0000000000000..5361fe6f32989 --- /dev/null +++ b/contrib/postgres_fdw/expected/eval_plan_qual.out @@ -0,0 +1,131 @@ +Parsed test spec with 2 sessions + +starting permutation: s0_update_l s1_tuplock_l_0 s0_commit s1_commit +step s0_update_l: UPDATE l SET i = i + 1; +step s1_tuplock_l_0: + EXPLAIN (VERBOSE, COSTS OFF) + SELECT l.* FROM l, ft WHERE l.i = ft.i AND l.i = 123 FOR UPDATE OF l; + SELECT l.* FROM l, ft WHERE l.i = ft.i AND l.i = 123 FOR UPDATE OF l; + +step s0_commit: COMMIT; +step s1_tuplock_l_0: <...
completed> +QUERY PLAN +--------------------------------------------------------------------- +LockRows + Output: l.i, l.v, l.ctid, ft.* + -> Nested Loop + Output: l.i, l.v, l.ctid, ft.* + -> Seq Scan on public.l + Output: l.i, l.v, l.ctid + Filter: (l.i = 123) + -> Foreign Scan on public.ft + Output: ft.*, ft.i + Remote SQL: SELECT i, v FROM public.t WHERE ((i = 123)) +(10 rows) + +i|v +-+- +(0 rows) + +step s1_commit: COMMIT; + +starting permutation: s0_update_l s1_tuplock_l_1 s0_commit s1_commit +step s0_update_l: UPDATE l SET i = i + 1; +step s1_tuplock_l_1: + EXPLAIN (VERBOSE, COSTS OFF) + SELECT l.* FROM l, ft WHERE l.i = ft.i AND l.v = 'foo' FOR UPDATE OF l; + SELECT l.* FROM l, ft WHERE l.i = ft.i AND l.v = 'foo' FOR UPDATE OF l; + +step s0_commit: COMMIT; +step s1_tuplock_l_1: <... completed> +QUERY PLAN +----------------------------------------------------------------------------- +LockRows + Output: l.i, l.v, l.ctid, ft.* + -> Nested Loop + Output: l.i, l.v, l.ctid, ft.* + -> Seq Scan on public.l + Output: l.i, l.v, l.ctid + Filter: (l.v = 'foo'::text) + -> Foreign Scan on public.ft + Output: ft.*, ft.i + Remote SQL: SELECT i, v FROM public.t WHERE ((i = $1::integer)) +(10 rows) + +i|v +-+- +(0 rows) + +step s1_commit: COMMIT; + +starting permutation: s0_update_a s1_tuplock_a_0 s0_commit s1_commit +step s0_update_a: UPDATE a SET i = i + 1; +step s1_tuplock_a_0: + EXPLAIN (VERBOSE, COSTS OFF) + SELECT a.i FROM a, fb, fc WHERE a.i = fb.i AND fb.i = fc.i FOR UPDATE OF a; + SELECT a.i FROM a, fb, fc WHERE a.i = fb.i AND fb.i = fc.i FOR UPDATE OF a; + +step s0_commit: COMMIT; +step s1_tuplock_a_0: <... completed> +QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +LockRows + Output: a.i, a.ctid, fb.*, fc.* + -> Nested Loop + Output: a.i, a.ctid, fb.*, fc.* + Join Filter: (fb.i = a.i) + -> Foreign Scan + Output: fb.*, fb.i, fc.*, fc.i + Relations: (public.fb) INNER JOIN (public.fc) + Remote SQL: SELECT CASE WHEN (r2.*)::text IS NOT NULL THEN ROW(r2.i) END, r2.i, CASE WHEN (r3.*)::text IS NOT NULL THEN ROW(r3.i) END, r3.i FROM (public.b r2 INNER JOIN public.c r3 ON (((r2.i = r3.i)))) + -> Nested Loop + Output: fb.*, fb.i, fc.*, fc.i + Join Filter: (fb.i = fc.i) + -> Foreign Scan on public.fb + Output: fb.*, fb.i + Remote SQL: SELECT i FROM public.b ORDER BY i ASC NULLS LAST + -> Foreign Scan on public.fc + Output: fc.*, fc.i + Remote SQL: SELECT i FROM public.c + -> Seq Scan on public.a + Output: a.i, a.ctid +(20 rows) + +i +- +(0 rows) + +step s1_commit: COMMIT; + +starting permutation: s0_update_a s1_tuplock_a_1 s0_commit s1_commit +step s0_update_a: UPDATE a SET i = i + 1; +step s1_tuplock_a_1: + EXPLAIN (VERBOSE, COSTS OFF) + SELECT a.i, + (SELECT 1 FROM fb, fc WHERE a.i = fb.i AND fb.i = fc.i) + FROM a FOR UPDATE; + SELECT a.i, + (SELECT 1 FROM fb, fc WHERE a.i = fb.i AND fb.i = fc.i) + FROM a FOR UPDATE; + +step s0_commit: COMMIT; +step s1_tuplock_a_1: <... 
completed> +QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------- +LockRows + Output: a.i, ((SubPlan expr_1)), a.ctid + -> Seq Scan on public.a + Output: a.i, (SubPlan expr_1), a.ctid + SubPlan expr_1 + -> Foreign Scan + Output: 1 + Relations: (public.fb) INNER JOIN (public.fc) + Remote SQL: SELECT NULL FROM (public.b r1 INNER JOIN public.c r2 ON (((r2.i = $1::integer)) AND ((r1.i = $1::integer)))) +(9 rows) + +i|?column? +-+-------- +2| +(1 row) + +step s1_commit: COMMIT; diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 2185b42bb4f79..48e3185b227de 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -2,23 +2,16 @@ -- create FDW objects -- =================================================================== CREATE EXTENSION postgres_fdw; +SELECT current_database() AS current_database, + current_setting('port') AS current_port +\gset CREATE SERVER testserver1 FOREIGN DATA WRAPPER postgres_fdw; -DO $d$ - BEGIN - EXECUTE $$CREATE SERVER loopback FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (dbname '$$||current_database()||$$', - port '$$||current_setting('port')||$$' - )$$; - EXECUTE $$CREATE SERVER loopback2 FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (dbname '$$||current_database()||$$', - port '$$||current_setting('port')||$$' - )$$; - EXECUTE $$CREATE SERVER loopback3 FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (dbname '$$||current_database()||$$', - port '$$||current_setting('port')||$$' - )$$; - END; -$d$; +CREATE SERVER loopback FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (dbname :'current_database', port :'current_port'); +CREATE SERVER loopback2 FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (dbname :'current_database', port :'current_port'); +CREATE SERVER loopback3 FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (dbname :'current_database', port :'current_port'); CREATE USER MAPPING FOR public SERVER testserver1 OPTIONS (user 'value', password 'value'); CREATE USER MAPPING FOR CURRENT_USER SERVER loopback; @@ -235,12 +228,7 @@ SELECT c3, c4 FROM ft1 ORDER BY c3, c1 LIMIT 1; -- should work ALTER SERVER loopback OPTIONS (SET dbname 'no such database'); SELECT c3, c4 FROM ft1 ORDER BY c3, c1 LIMIT 1; -- should fail ERROR: could not connect to server "loopback" -DO $d$ - BEGIN - EXECUTE $$ALTER SERVER loopback - OPTIONS (SET dbname '$$||current_database()||$$')$$; - END; -$d$; +ALTER SERVER loopback OPTIONS (SET dbname :'current_database'); SELECT c3, c4 FROM ft1 ORDER BY c3, c1 LIMIT 1; -- should work again c3 | c4 -------+------------------------------ @@ -710,12 +698,12 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = -c1; -- Op Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE (("C 1" = (- "C 1"))) (3 rows) -EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c1 IS NOT NULL) IS DISTINCT FROM (c1 IS NOT NULL); -- DistinctExpr - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------- +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL); -- DistinctExpr + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------- Foreign Scan on public.ft1 t1 Output: c1, c2, c3, c4, c5, c6, c7, c8 - Remote 
SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE ((("C 1" IS NOT NULL) IS DISTINCT FROM ("C 1" IS NOT NULL))) + Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE (((c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL))) (3 rows) EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = ANY(ARRAY[c2, 1, c1 + 0]); -- ScalarArrayOpExpr @@ -1180,6 +1168,27 @@ SELECT * FROM ft1 WHERE CASE c3 COLLATE "C" WHEN c6 THEN true ELSE c3 < 'bar' EN Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" (4 rows) +-- Test array type conversion pushdown +SET plan_cache_mode = force_generic_plan; +PREPARE s(varchar[]) AS SELECT count(*) FROM ft2 WHERE c6 = ANY ($1); +EXPLAIN (VERBOSE, COSTS OFF) +EXECUTE s(ARRAY['1','2']); + QUERY PLAN +--------------------------------------------------------------------------------------------- + Foreign Scan + Output: (count(*)) + Relations: Aggregate on (public.ft2) + Remote SQL: SELECT count(*) FROM "S 1"."T 1" WHERE ((c6 = ANY ($1::character varying[]))) +(4 rows) + +EXECUTE s(ARRAY['1','2']); + count +------- + 200 +(1 row) + +DEALLOCATE s; +RESET plan_cache_mode; -- a regconfig constant referring to this text search configuration -- is initially unshippable CREATE TEXT SEARCH CONFIGURATION public.custom_search @@ -2966,9 +2975,9 @@ select sum(t1.c1), count(t2.c1) from ft1 t1 inner join ft2 t2 on (t1.c1 = t2.c1) QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------- Aggregate - Output: sum(t1.c1), count(t2.c1) + Output: sum(t1.c1), count(*) -> Foreign Scan - Output: t1.c1, t2.c1 + Output: t1.c1 Filter: (((((t1.c1 * t2.c1) / (t1.c1 * t2.c1)))::double precision * random()) <= '1'::double precision) Relations: (public.ft1 t1) INNER JOIN (public.ft2 t2) Remote SQL: SELECT r1."C 1", r2."C 1" FROM ("S 1"."T 1" r1 INNER JOIN "S 1"."T 1" r2 ON (((r2."C 1" = r1."C 1")))) @@ -3064,12 +3073,12 @@ select c2 * (random() <= 1)::int as c2 from ft2 group by c2 * (random() <= 1)::i -- GROUP BY clause in various forms, cardinal, alias and constant expression explain (verbose, costs off) select count(c2) w, c2 x, 5 y, 7.0 z from ft1 group by 2, y, 9.0::int order by 2; - QUERY PLAN ------------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------------------- Foreign Scan - Output: (count(c2)), c2, 5, 7.0, 9 + Output: (count(*)), c2, 5, 7.0, 9 Relations: Aggregate on (public.ft1) - Remote SQL: SELECT count(c2), c2, 5, 7.0, 9 FROM "S 1"."T 1" GROUP BY 2, 3, 5 ORDER BY c2 ASC NULLS LAST + Remote SQL: SELECT count(*), c2, 5, 7.0, 9 FROM "S 1"."T 1" GROUP BY 2, 3, 5 ORDER BY c2 ASC NULLS LAST (4 rows) select count(c2) w, c2 x, 5 y, 7.0 z from ft1 group by 2, y, 9.0::int order by 2; @@ -3166,13 +3175,13 @@ select sum(c1) from ft1 group by c2 having avg(c1 * (random() <= 1)::int) > 100 -- of an initplan) can be trouble, per bug #15781 explain (verbose, costs off) select exists(select 1 from pg_enum), sum(c1) from ft1; - QUERY PLAN --------------------------------------------------- + QUERY PLAN +--------------------------------------------------- Foreign Scan - Output: (InitPlan 1).col1, (sum(ft1.c1)) + Output: (InitPlan exists_1).col1, (sum(ft1.c1)) Relations: Aggregate on (public.ft1) Remote SQL: SELECT sum("C 1") FROM "S 1"."T 1" - InitPlan 1 + InitPlan exists_1 -> Seq Scan on pg_catalog.pg_enum (6 
rows) @@ -3187,8 +3196,8 @@ select exists(select 1 from pg_enum), sum(c1) from ft1 group by 1; QUERY PLAN --------------------------------------------------- GroupAggregate - Output: (InitPlan 1).col1, sum(ft1.c1) - InitPlan 1 + Output: (InitPlan exists_1).col1, sum(ft1.c1) + InitPlan exists_1 -> Seq Scan on pg_catalog.pg_enum -> Foreign Scan on public.ft1 Output: ft1.c1 @@ -3347,15 +3356,15 @@ select distinct (select count(*) filter (where t2.c2 = 6 and t2.c1 < 10) from ft QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------ Unique - Output: ((SubPlan 1)) + Output: ((SubPlan expr_1)) -> Sort - Output: ((SubPlan 1)) - Sort Key: ((SubPlan 1)) + Output: ((SubPlan expr_1)) + Sort Key: ((SubPlan expr_1)) -> Foreign Scan - Output: (SubPlan 1) + Output: (SubPlan expr_1) Relations: Aggregate on (public.ft2 t2) Remote SQL: SELECT count(*) FILTER (WHERE ((c2 = 6) AND ("C 1" < 10))) FROM "S 1"."T 1" WHERE (((c2 % 6) = 0)) - SubPlan 1 + SubPlan expr_1 -> Foreign Scan on public.ft1 t1 Output: (count(*) FILTER (WHERE ((t2.c2 = 6) AND (t2.c1 < 10)))) Remote SQL: SELECT NULL FROM "S 1"."T 1" WHERE (("C 1" = 6)) @@ -3370,21 +3379,21 @@ select distinct (select count(*) filter (where t2.c2 = 6 and t2.c1 < 10) from ft -- Inner query is aggregation query explain (verbose, costs off) select distinct (select count(t1.c1) filter (where t2.c2 = 6 and t2.c1 < 10) from ft1 t1 where t1.c1 = 6) from ft2 t2 where t2.c2 % 6 = 0 order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------- Unique - Output: ((SubPlan 1)) + Output: ((SubPlan expr_1)) -> Sort - Output: ((SubPlan 1)) - Sort Key: ((SubPlan 1)) + Output: ((SubPlan expr_1)) + Sort Key: ((SubPlan expr_1)) -> Foreign Scan on public.ft2 t2 - Output: (SubPlan 1) + Output: (SubPlan expr_1) Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE (((c2 % 6) = 0)) - SubPlan 1 + SubPlan expr_1 -> Foreign Scan - Output: (count(t1.c1) FILTER (WHERE ((t2.c2 = 6) AND (t2.c1 < 10)))) + Output: (count(*) FILTER (WHERE ((t2.c2 = 6) AND (t2.c1 < 10)))) Relations: Aggregate on (public.ft1 t1) - Remote SQL: SELECT count("C 1") FILTER (WHERE (($1::integer = 6) AND ($2::integer < 10))) FROM "S 1"."T 1" WHERE (("C 1" = 6)) + Remote SQL: SELECT count(*) FILTER (WHERE (($1::integer = 6) AND ($2::integer < 10))) FROM "S 1"."T 1" WHERE (("C 1" = 6)) (13 rows) select distinct (select count(t1.c1) filter (where t2.c2 = 6 and t2.c1 < 10) from ft1 t1 where t1.c1 = 6) from ft2 t2 where t2.c2 % 6 = 0 order by 1; @@ -3412,14 +3421,14 @@ select sum(c1) filter (where (c1 / c1) * random() <= 1) from ft1 group by c2 ord explain (verbose, costs off) select sum(c2) filter (where c2 in (select c2 from ft1 where c2 < 5)) from ft1; - QUERY PLAN -------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------- Aggregate - Output: sum(ft1.c2) FILTER (WHERE (ANY (ft1.c2 = (hashed SubPlan 1).col1))) + Output: sum(ft1.c2) FILTER (WHERE (ANY (ft1.c2 = (hashed SubPlan any_1).col1))) -> Foreign Scan on public.ft1 Output: ft1.c2 Remote SQL: SELECT c2 FROM "S 1"."T 1" - SubPlan 1 + SubPlan any_1 -> Foreign Scan on public.ft1 ft1_1 Output: ft1_1.c2 
Remote SQL: SELECT c2 FROM "S 1"."T 1" WHERE ((c2 < 5)) @@ -3692,30 +3701,33 @@ select count(t1.c3) from ft2 t1 left join ft2 t2 on (t1.c1 = random() * t2.c2); -- Subquery in FROM clause having aggregate explain (verbose, costs off) select count(*), x.b from ft1, (select c2 a, sum(c1) b from ft1 group by c2) x where ft1.c2 = x.a group by x.b order by 1, 2; - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Sort - Output: (count(*)), x.b - Sort Key: (count(*)), x.b - -> HashAggregate - Output: count(*), x.b - Group Key: x.b - -> Hash Join - Output: x.b - Inner Unique: true - Hash Cond: (ft1.c2 = x.a) - -> Foreign Scan on public.ft1 - Output: ft1.c2 - Remote SQL: SELECT c2 FROM "S 1"."T 1" - -> Hash - Output: x.b, x.a - -> Subquery Scan on x - Output: x.b, x.a - -> Foreign Scan - Output: ft1_1.c2, (sum(ft1_1.c1)) - Relations: Aggregate on (public.ft1 ft1_1) - Remote SQL: SELECT c2, sum("C 1") FROM "S 1"."T 1" GROUP BY 1 -(21 rows) + Output: (count(*)), (sum(ft1_1.c1)) + Sort Key: (count(*)), (sum(ft1_1.c1)) + -> Finalize GroupAggregate + Output: count(*), (sum(ft1_1.c1)) + Group Key: (sum(ft1_1.c1)) + -> Sort + Output: (sum(ft1_1.c1)), (PARTIAL count(*)) + Sort Key: (sum(ft1_1.c1)) + -> Hash Join + Output: (sum(ft1_1.c1)), (PARTIAL count(*)) + Hash Cond: (ft1_1.c2 = ft1.c2) + -> Foreign Scan + Output: ft1_1.c2, (sum(ft1_1.c1)) + Relations: Aggregate on (public.ft1 ft1_1) + Remote SQL: SELECT c2, sum("C 1") FROM "S 1"."T 1" GROUP BY 1 + -> Hash + Output: ft1.c2, (PARTIAL count(*)) + -> Partial HashAggregate + Output: ft1.c2, PARTIAL count(*) + Group Key: ft1.c2 + -> Foreign Scan on public.ft1 + Output: ft1.c2 + Remote SQL: SELECT c2 FROM "S 1"."T 1" +(24 rows) select count(*), x.b from ft1, (select c2 a, sum(c1) b from ft1 group by c2) x where ft1.c2 = x.a group by x.b order by 1, 2; count | b @@ -4600,11 +4612,13 @@ SELECT * FROM ft1 WHERE 'foo' = c8 LIMIT 1; -- with that remote type SELECT * FROM ft1 WHERE c8 LIKE 'foo' LIMIT 1; -- ERROR ERROR: operator does not exist: public.user_enum ~~ unknown -HINT: No operator matches the given name and argument types. You might need to add explicit type casts. +DETAIL: No operator of that name accepts the given argument types. +HINT: You might need to add explicit type casts. CONTEXT: remote SQL command: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE ((c8 ~~ 'foo')) LIMIT 1::bigint SELECT * FROM ft1 WHERE c8::text LIKE 'foo' LIMIT 1; -- ERROR; cast not pushed down ERROR: operator does not exist: public.user_enum ~~ unknown -HINT: No operator matches the given name and argument types. You might need to add explicit type casts. +DETAIL: No operator of that name accepts the given argument types. +HINT: You might need to add explicit type casts. 
CONTEXT: remote SQL command: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE ((c8 ~~ 'foo')) LIMIT 1::bigint ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 TYPE user_enum; -- =================================================================== @@ -5077,13 +5091,13 @@ SELECT ft1.c1 FROM ft1 JOIN ft2 on ft1.c1 = ft2.c1 WHERE -- =================================================================== EXPLAIN (verbose, costs off) INSERT INTO ft2 (c1,c2,c3) SELECT c1+1000,c2+100, c3 || c3 FROM ft2 LIMIT 20; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ Insert on public.ft2 Remote SQL: INSERT INTO "S 1"."T 1"("C 1", c2, c3, c4, c5, c6, c7, c8) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) Batch Size: 1 - -> Subquery Scan on "*SELECT*" - Output: "*SELECT*"."?column?", "*SELECT*"."?column?_1", NULL::integer, "*SELECT*"."?column?_2", NULL::timestamp with time zone, NULL::timestamp without time zone, NULL::character varying(10), 'ft2 '::character(10), NULL::user_enum + -> Subquery Scan on unnamed_subquery + Output: unnamed_subquery."?column?", unnamed_subquery."?column?_1", NULL::integer, unnamed_subquery."?column?_2", NULL::timestamp with time zone, NULL::timestamp without time zone, NULL::character varying(10), 'ft2 '::character(10), NULL::user_enum -> Foreign Scan on public.ft2 ft2_1 Output: (ft2_1.c1 + 1000), (ft2_1.c2 + 100), (ft2_1.c3 || ft2_1.c3) Remote SQL: SELECT "C 1", c2, c3 FROM "S 1"."T 1" LIMIT 20::bigint @@ -6433,14 +6447,14 @@ UPDATE ft2 AS target SET (c2, c7) = ( FROM ft2 AS src WHERE target.c1 = src.c1 ) WHERE c1 > 1100; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------- Update on public.ft2 target Remote SQL: UPDATE "S 1"."T 1" SET c2 = $2, c7 = $3 WHERE ctid = $1 -> Foreign Scan on public.ft2 target - Output: (SubPlan 1).col1, (SubPlan 1).col2, (rescan SubPlan 1), target.ctid, target.* + Output: (SubPlan multiexpr_1).col1, (SubPlan multiexpr_1).col2, (rescan SubPlan multiexpr_1), target.ctid, target.* Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8, ctid FROM "S 1"."T 1" WHERE (("C 1" > 1100)) FOR UPDATE - SubPlan 1 + SubPlan multiexpr_1 -> Foreign Scan on public.ft2 src Output: (src.c2 * 10), src.c7 Remote SQL: SELECT c2, c7 FROM "S 1"."T 1" WHERE (($1::integer = "C 1")) @@ -7148,8 +7162,9 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM ft1 WHERE c2 < 0; Aggregate Output: count(*) -> Result + Replaces: Scan on ft1 One-Time Filter: false -(4 rows) +(5 rows) SELECT count(*) FROM ft1 WHERE c2 < 0; count @@ -7192,8 +7207,9 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM ft1 WHERE c2 >= 0; Aggregate Output: count(*) -> Result + Replaces: Scan on ft1 One-Time Filter: false -(4 rows) +(5 rows) SELECT count(*) FROM ft1 WHERE c2 >= 0; count @@ -8021,8 +8037,9 @@ DELETE FROM rem1 WHERE false; -- currently can't be pushed down Remote SQL: DELETE FROM public.loc1 WHERE ctid = 
$1 -> Result Output: ctid + Replaces: Scan on rem1 One-Time Filter: false -(5 rows) +(6 rows) -- Test with statement-level triggers CREATE TRIGGER trig_stmt_before @@ -8212,6 +8229,119 @@ DELETE FROM rem1; -- can't be pushed down (5 rows) DROP TRIGGER trig_row_after_delete ON rem1; +-- We are allowed to create transition-table triggers on both kinds of +-- inheritance even if they contain foreign tables as children, but currently +-- collecting transition tuples from such foreign tables is not supported. +CREATE TABLE local_tbl (a text, b int); +CREATE FOREIGN TABLE foreign_tbl (a text, b int) + SERVER loopback OPTIONS (table_name 'local_tbl'); +INSERT INTO foreign_tbl VALUES ('AAA', 42); +-- Test case for partition hierarchy +CREATE TABLE parent_tbl (a text, b int) PARTITION BY LIST (a); +ALTER TABLE parent_tbl ATTACH PARTITION foreign_tbl FOR VALUES IN ('AAA'); +CREATE TRIGGER parent_tbl_insert_trig + AFTER INSERT ON parent_tbl REFERENCING NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trigger_func(); +CREATE TRIGGER parent_tbl_update_trig + AFTER UPDATE ON parent_tbl REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trigger_func(); +CREATE TRIGGER parent_tbl_delete_trig + AFTER DELETE ON parent_tbl REFERENCING OLD TABLE AS old_table + FOR EACH STATEMENT EXECUTE PROCEDURE trigger_func(); +INSERT INTO parent_tbl VALUES ('AAA', 42); +ERROR: cannot collect transition tuples from child foreign tables +COPY parent_tbl (a, b) FROM stdin; +ERROR: cannot collect transition tuples from child foreign tables +CONTEXT: COPY parent_tbl, line 1: "AAA 42" +ALTER SERVER loopback OPTIONS (ADD batch_size '10'); +INSERT INTO parent_tbl VALUES ('AAA', 42); +ERROR: cannot collect transition tuples from child foreign tables +COPY parent_tbl (a, b) FROM stdin; +ERROR: cannot collect transition tuples from child foreign tables +CONTEXT: COPY parent_tbl, line 1: "AAA 42" +ALTER SERVER loopback OPTIONS (DROP batch_size); +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE parent_tbl SET b = b + 1; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Update on public.parent_tbl + Foreign Update on public.foreign_tbl parent_tbl_1 + Remote SQL: UPDATE public.local_tbl SET b = $2 WHERE ctid = $1 + -> Foreign Scan on public.foreign_tbl parent_tbl_1 + Output: (parent_tbl_1.b + 1), parent_tbl_1.tableoid, parent_tbl_1.ctid, parent_tbl_1.* + Remote SQL: SELECT a, b, ctid FROM public.local_tbl FOR UPDATE +(6 rows) + +UPDATE parent_tbl SET b = b + 1; +ERROR: cannot collect transition tuples from child foreign tables +EXPLAIN (VERBOSE, COSTS OFF) +DELETE FROM parent_tbl; + QUERY PLAN +------------------------------------------------------------------ + Delete on public.parent_tbl + Foreign Delete on public.foreign_tbl parent_tbl_1 + Remote SQL: DELETE FROM public.local_tbl WHERE ctid = $1 + -> Foreign Scan on public.foreign_tbl parent_tbl_1 + Output: parent_tbl_1.tableoid, parent_tbl_1.ctid + Remote SQL: SELECT ctid FROM public.local_tbl FOR UPDATE +(6 rows) + +DELETE FROM parent_tbl; +ERROR: cannot collect transition tuples from child foreign tables +ALTER TABLE parent_tbl DETACH PARTITION foreign_tbl; +DROP TABLE parent_tbl; +-- Test case for non-partition hierarchy +CREATE TABLE parent_tbl (a text, b int); +ALTER FOREIGN TABLE foreign_tbl INHERIT parent_tbl; +CREATE TRIGGER parent_tbl_update_trig + AFTER UPDATE ON parent_tbl REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE 
PROCEDURE trigger_func(); +CREATE TRIGGER parent_tbl_delete_trig + AFTER DELETE ON parent_tbl REFERENCING OLD TABLE AS old_table + FOR EACH STATEMENT EXECUTE PROCEDURE trigger_func(); +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE parent_tbl SET b = b + 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------ + Update on public.parent_tbl + Update on public.parent_tbl parent_tbl_1 + Foreign Update on public.foreign_tbl parent_tbl_2 + Remote SQL: UPDATE public.local_tbl SET b = $2 WHERE ctid = $1 + -> Result + Output: (parent_tbl.b + 1), parent_tbl.tableoid, parent_tbl.ctid, (NULL::record) + -> Append + -> Seq Scan on public.parent_tbl parent_tbl_1 + Output: parent_tbl_1.b, parent_tbl_1.tableoid, parent_tbl_1.ctid, NULL::record + -> Foreign Scan on public.foreign_tbl parent_tbl_2 + Output: parent_tbl_2.b, parent_tbl_2.tableoid, parent_tbl_2.ctid, parent_tbl_2.* + Remote SQL: SELECT a, b, ctid FROM public.local_tbl FOR UPDATE +(12 rows) + +UPDATE parent_tbl SET b = b + 1; +ERROR: cannot collect transition tuples from child foreign tables +EXPLAIN (VERBOSE, COSTS OFF) +DELETE FROM parent_tbl; + QUERY PLAN +------------------------------------------------------------------------ + Delete on public.parent_tbl + Delete on public.parent_tbl parent_tbl_1 + Foreign Delete on public.foreign_tbl parent_tbl_2 + Remote SQL: DELETE FROM public.local_tbl WHERE ctid = $1 + -> Append + -> Seq Scan on public.parent_tbl parent_tbl_1 + Output: parent_tbl_1.tableoid, parent_tbl_1.ctid + -> Foreign Scan on public.foreign_tbl parent_tbl_2 + Output: parent_tbl_2.tableoid, parent_tbl_2.ctid + Remote SQL: SELECT ctid FROM public.local_tbl FOR UPDATE +(10 rows) + +DELETE FROM parent_tbl; +ERROR: cannot collect transition tuples from child foreign tables +ALTER FOREIGN TABLE foreign_tbl NO INHERIT parent_tbl; +DROP TABLE parent_tbl; +-- Cleanup +DROP FOREIGN TABLE foreign_tbl; +DROP TABLE local_tbl; -- =================================================================== -- test inheritance features -- =================================================================== @@ -10509,14 +10639,8 @@ SHOW is_superuser; (1 row) -- This will be OK, we can create the FDW -DO $d$ - BEGIN - EXECUTE $$CREATE SERVER loopback_nopw FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (dbname '$$||current_database()||$$', - port '$$||current_setting('port')||$$' - )$$; - END; -$d$; +CREATE SERVER loopback_nopw FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (dbname :'current_database', port :'current_port'); -- But creation of user mappings for non-superusers should fail CREATE USER MAPPING FOR public SERVER loopback_nopw; CREATE USER MAPPING FOR CURRENT_USER SERVER loopback_nopw; @@ -11475,6 +11599,11 @@ SELECT * FROM result_tbl ORDER BY a; (3 rows) DELETE FROM result_tbl; +-- Test COPY TO when foreign table is partition +COPY async_pt TO stdout; --error +ERROR: cannot copy from foreign table "async_p1" +DETAIL: Partition "async_p1" is a foreign table in partitioned table "async_pt" +HINT: Try the COPY (SELECT ...) TO variant. 
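-- As the hint suggests, a query-based variant is accepted, since rows are
-- then read through the partitioned table rather than from a foreign
-- partition directly; a minimal sketch, assuming the async_pt setup above:
COPY (SELECT * FROM async_pt) TO stdout;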
DROP FOREIGN TABLE async_p3; DROP TABLE base_tbl3; -- Check case where the partitioned table has local/remote partitions @@ -12011,12 +12140,12 @@ INSERT INTO local_tbl VALUES (1505, 505, 'foo'); ANALYZE local_tbl; EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM local_tbl t1 LEFT JOIN (SELECT *, (SELECT count(*) FROM async_pt WHERE a < 3000) FROM async_pt WHERE a < 3000) t2 ON t1.a = t2.a; - QUERY PLAN ----------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------ Nested Loop Left Join - Output: t1.a, t1.b, t1.c, async_pt.a, async_pt.b, async_pt.c, ((InitPlan 1).col1) + Output: t1.a, t1.b, t1.c, async_pt.a, async_pt.b, async_pt.c, ((InitPlan expr_1).col1) Join Filter: (t1.a = async_pt.a) - InitPlan 1 + InitPlan expr_1 -> Aggregate Output: count(*) -> Append @@ -12028,10 +12157,10 @@ SELECT * FROM local_tbl t1 LEFT JOIN (SELECT *, (SELECT count(*) FROM async_pt W Output: t1.a, t1.b, t1.c -> Append -> Async Foreign Scan on public.async_p1 async_pt_1 - Output: async_pt_1.a, async_pt_1.b, async_pt_1.c, (InitPlan 1).col1 + Output: async_pt_1.a, async_pt_1.b, async_pt_1.c, (InitPlan expr_1).col1 Remote SQL: SELECT a, b, c FROM public.base_tbl1 WHERE ((a < 3000)) -> Async Foreign Scan on public.async_p2 async_pt_2 - Output: async_pt_2.a, async_pt_2.b, async_pt_2.c, (InitPlan 1).col1 + Output: async_pt_2.a, async_pt_2.b, async_pt_2.c, (InitPlan expr_1).col1 Remote SQL: SELECT a, b, c FROM public.base_tbl2 WHERE ((a < 3000)) (20 rows) @@ -12042,7 +12171,7 @@ SELECT * FROM local_tbl t1 LEFT JOIN (SELECT *, (SELECT count(*) FROM async_pt W Nested Loop Left Join (actual rows=1.00 loops=1) Join Filter: (t1.a = async_pt.a) Rows Removed by Join Filter: 399 - InitPlan 1 + InitPlan expr_1 -> Aggregate (actual rows=1.00 loops=1) -> Append (actual rows=400.00 loops=1) -> Async Foreign Scan on async_p1 async_pt_4 (actual rows=200.00 loops=1) @@ -12265,12 +12394,12 @@ CREATE FOREIGN TABLE foreign_tbl2 () INHERITS (foreign_tbl) SERVER loopback OPTIONS (table_name 'base_tbl'); EXPLAIN (VERBOSE, COSTS OFF) SELECT a FROM base_tbl WHERE (a, random() > 0) IN (SELECT a, random() > 0 FROM foreign_tbl); - QUERY PLAN ---------------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------- Seq Scan on public.base_tbl Output: base_tbl.a - Filter: (ANY ((base_tbl.a = (SubPlan 1).col1) AND ((random() > '0'::double precision) = (SubPlan 1).col2))) - SubPlan 1 + Filter: (ANY ((base_tbl.a = (SubPlan any_1).col1) AND ((random() > '0'::double precision) = (SubPlan any_1).col2))) + SubPlan any_1 -> Result Output: base_tbl.a, (random() > '0'::double precision) -> Append @@ -12515,7 +12644,7 @@ ALTER SERVER loopback2 OPTIONS (DROP parallel_abort); -- =================================================================== CREATE TABLE analyze_table (id int, a text, b bigint); CREATE FOREIGN TABLE analyze_ftable (id int, a text, b bigint) - SERVER loopback OPTIONS (table_name 'analyze_rtable1'); + SERVER loopback OPTIONS (table_name 'analyze_table'); INSERT INTO analyze_table (SELECT x FROM generate_series(1,1000) x); ANALYZE analyze_table; SET default_statistics_target = 10; @@ -12523,15 +12652,15 @@ ANALYZE analyze_table; ALTER SERVER loopback OPTIONS (analyze_sampling 'invalid'); ERROR: invalid value for string option "analyze_sampling": 
invalid ALTER SERVER loopback OPTIONS (analyze_sampling 'auto'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; ALTER SERVER loopback OPTIONS (SET analyze_sampling 'system'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; ALTER SERVER loopback OPTIONS (SET analyze_sampling 'bernoulli'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; ALTER SERVER loopback OPTIONS (SET analyze_sampling 'random'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; ALTER SERVER loopback OPTIONS (SET analyze_sampling 'off'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; -- cleanup DROP FOREIGN TABLE analyze_ftable; DROP TABLE analyze_table; diff --git a/contrib/postgres_fdw/meson.build b/contrib/postgres_fdw/meson.build index 8b29be24deeb7..aac89ffdde886 100644 --- a/contrib/postgres_fdw/meson.build +++ b/contrib/postgres_fdw/meson.build @@ -39,7 +39,13 @@ tests += { 'postgres_fdw', 'query_cancel', ], - 'regress_args': ['--dlpath', meson.build_root() / 'src/test/regress'], + 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], + }, + 'isolation': { + 'specs': [ + 'eval_plan_qual', + ], + 'regress_args': ['--load-extension=postgres_fdw'], }, 'tap': { 'tests': [ diff --git a/contrib/postgres_fdw/option.c b/contrib/postgres_fdw/option.c index c2f936640bca8..04788b7e8b35f 100644 --- a/contrib/postgres_fdw/option.c +++ b/contrib/postgres_fdw/option.c @@ -21,6 +21,7 @@ #include "libpq/libpq-be.h" #include "postgres_fdw.h" #include "utils/guc.h" +#include "utils/memutils.h" #include "utils/varlena.h" /* @@ -39,12 +40,6 @@ typedef struct PgFdwOption */ static PgFdwOption *postgres_fdw_options; -/* - * Valid options for libpq. - * Allocated and filled in InitPgFdwOptions. - */ -static PQconninfoOption *libpq_options; - /* * GUC parameters */ @@ -239,6 +234,7 @@ static void InitPgFdwOptions(void) { int num_libpq_opts; + PQconninfoOption *libpq_options; PQconninfoOption *lopt; PgFdwOption *popt; @@ -307,8 +303,8 @@ InitPgFdwOptions(void) * Get list of valid libpq options. * * To avoid unnecessary work, we get the list once and use it throughout - * the lifetime of this backend process. We don't need to care about - * memory context issues, because PQconndefaults allocates with malloc. + * the lifetime of this backend process. Hence, we'll allocate it in + * TopMemoryContext. */ libpq_options = PQconndefaults(); if (!libpq_options) /* assume reason for failure is OOM */ @@ -325,19 +321,11 @@ InitPgFdwOptions(void) /* * Construct an array which consists of all valid options for * postgres_fdw, by appending FDW-specific options to libpq options. - * - * We use plain malloc here to allocate postgres_fdw_options because it - * lives as long as the backend process does. Besides, keeping - * libpq_options in memory allows us to avoid copying every keyword - * string. */ postgres_fdw_options = (PgFdwOption *) - malloc(sizeof(PgFdwOption) * num_libpq_opts + - sizeof(non_libpq_options)); - if (postgres_fdw_options == NULL) - ereport(ERROR, - (errcode(ERRCODE_FDW_OUT_OF_MEMORY), - errmsg("out of memory"))); + MemoryContextAlloc(TopMemoryContext, + sizeof(PgFdwOption) * num_libpq_opts + + sizeof(non_libpq_options)); popt = postgres_fdw_options; for (lopt = libpq_options; lopt->keyword; lopt++) @@ -355,8 +343,8 @@ InitPgFdwOptions(void) if (strncmp(lopt->keyword, "oauth_", strlen("oauth_")) == 0) continue; - /* We don't have to copy keyword string, as described above. 
*/ - popt->keyword = lopt->keyword; + popt->keyword = MemoryContextStrdup(TopMemoryContext, + lopt->keyword); /* * "user" and any secret options are allowed only on user mappings. @@ -371,6 +359,9 @@ InitPgFdwOptions(void) popt++; } + /* Done with libpq's output structure. */ + PQconninfoFree(libpq_options); + /* Append FDW-specific options and dummy terminator. */ memcpy(popt, non_libpq_options, sizeof(non_libpq_options)); } @@ -531,7 +522,7 @@ process_pgfdw_appname(const char *appname) appendStringInfoString(&buf, application_name); break; case 'c': - appendStringInfo(&buf, INT64_HEX_FORMAT ".%x", MyStartTime, MyProcPid); + appendStringInfo(&buf, "%" PRIx64 ".%x", MyStartTime, MyProcPid); break; case 'C': appendStringInfoString(&buf, cluster_name); diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 331f3fc088d1b..5e178c21b390e 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -633,7 +633,7 @@ postgresGetForeignRelSize(PlannerInfo *root, * We use PgFdwRelationInfo to pass various information to subsequent * functions. */ - fpinfo = (PgFdwRelationInfo *) palloc0(sizeof(PgFdwRelationInfo)); + fpinfo = palloc0_object(PgFdwRelationInfo); baserel->fdw_private = fpinfo; /* Base foreign tables need to be pushed down always. */ @@ -1517,7 +1517,7 @@ postgresBeginForeignScan(ForeignScanState *node, int eflags) /* * We'll save private state in node->fdw_state. */ - fsstate = (PgFdwScanState *) palloc0(sizeof(PgFdwScanState)); + fsstate = palloc0_object(PgFdwScanState); node->fdw_state = fsstate; /* @@ -1702,13 +1702,9 @@ postgresReScanForeignScan(ForeignScanState *node) return; } - /* - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. - */ res = pgfdw_exec_query(fsstate->conn, sql, fsstate->conn_state); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, fsstate->conn, true, sql); + pgfdw_report_error(res, fsstate->conn, sql); PQclear(res); /* Now force a fresh FETCH. */ @@ -2667,7 +2663,7 @@ postgresBeginDirectModify(ForeignScanState *node, int eflags) /* * We'll save private state in node->fdw_state. 
*/ - dmstate = (PgFdwDirectModifyState *) palloc0(sizeof(PgFdwDirectModifyState)); + dmstate = palloc0_object(PgFdwDirectModifyState); node->fdw_state = dmstate; /* @@ -2845,7 +2841,7 @@ postgresExplainForeignScan(ForeignScanState *node, ExplainState *es) */ if (list_length(fdw_private) > FdwScanPrivateRelations) { - StringInfo relations; + StringInfoData relations; char *rawrelations; char *ptr; int minrti, @@ -2879,7 +2875,7 @@ postgresExplainForeignScan(ForeignScanState *node, ExplainState *es) rtoffset = bms_next_member(plan->fs_base_relids, -1) - minrti; /* Now we can translate the string */ - relations = makeStringInfo(); + initStringInfo(&relations); ptr = rawrelations; while (*ptr) { @@ -2901,24 +2897,24 @@ postgresExplainForeignScan(ForeignScanState *node, ExplainState *es) char *namespace; namespace = get_namespace_name_or_temp(get_rel_namespace(rte->relid)); - appendStringInfo(relations, "%s.%s", + appendStringInfo(&relations, "%s.%s", quote_identifier(namespace), quote_identifier(relname)); } else - appendStringInfoString(relations, + appendStringInfoString(&relations, quote_identifier(relname)); refname = (char *) list_nth(es->rtable_names, rti - 1); if (refname == NULL) refname = rte->eref->aliasname; if (strcmp(refname, relname) != 0) - appendStringInfo(relations, " %s", + appendStringInfo(&relations, " %s", quote_identifier(refname)); } else - appendStringInfoChar(relations, *ptr++); + appendStringInfoChar(&relations, *ptr++); } - ExplainPropertyText("Relations", relations->data, es); + ExplainPropertyText("Relations", relations.data, es); } /* @@ -3489,6 +3485,13 @@ estimate_path_cost_size(PlannerInfo *root, { Assert(foreignrel->reloptkind == RELOPT_UPPER_REL && fpinfo->stage == UPPERREL_GROUP_AGG); + + /* + * We can only get here when this function is called from + * add_foreign_ordered_paths() or add_foreign_final_paths(); + * in which cases, the passed-in fpextra should not be NULL. + */ + Assert(fpextra); adjust_foreign_grouping_path_cost(root, pathkeys, retrieved_rows, width, fpextra->limit_tuples, @@ -3601,41 +3604,32 @@ get_remote_estimate(const char *sql, PGconn *conn, double *rows, int *width, Cost *startup_cost, Cost *total_cost) { - PGresult *volatile res = NULL; - - /* PGresult must be released before leaving this function. */ - PG_TRY(); - { - char *line; - char *p; - int n; + PGresult *res; + char *line; + char *p; + int n; - /* - * Execute EXPLAIN remotely. - */ - res = pgfdw_exec_query(conn, sql, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, sql); + /* + * Execute EXPLAIN remotely. + */ + res = pgfdw_exec_query(conn, sql, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, sql); - /* - * Extract cost numbers for topmost plan node. Note we search for a - * left paren from the end of the line to avoid being confused by - * other uses of parentheses. - */ - line = PQgetvalue(res, 0, 0); - p = strrchr(line, '('); - if (p == NULL) - elog(ERROR, "could not interpret EXPLAIN output: \"%s\"", line); - n = sscanf(p, "(cost=%lf..%lf rows=%lf width=%d)", - startup_cost, total_cost, rows, width); - if (n != 4) - elog(ERROR, "could not interpret EXPLAIN output: \"%s\"", line); - } - PG_FINALLY(); - { - PQclear(res); - } - PG_END_TRY(); + /* + * Extract cost numbers for topmost plan node. Note we search for a left + * paren from the end of the line to avoid being confused by other uses of + * parentheses. 
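+	 * (The line ends with something like "(cost=0.25..4.27 rows=1 width=8)",
+	 * which is exactly what the sscanf below parses.)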
+ */ + line = PQgetvalue(res, 0, 0); + p = strrchr(line, '('); + if (p == NULL) + elog(ERROR, "could not interpret EXPLAIN output: \"%s\"", line); + n = sscanf(p, "(cost=%lf..%lf rows=%lf width=%d)", + startup_cost, total_cost, rows, width); + if (n != 4) + elog(ERROR, "could not interpret EXPLAIN output: \"%s\"", line); + PQclear(res); } /* @@ -3775,17 +3769,14 @@ create_cursor(ForeignScanState *node) */ if (!PQsendQueryParams(conn, buf.data, numParams, NULL, values, NULL, NULL, 0)) - pgfdw_report_error(ERROR, NULL, conn, false, buf.data); + pgfdw_report_error(NULL, conn, buf.data); /* * Get the result, and check for success. - * - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. */ res = pgfdw_get_result(conn); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, conn, true, fsstate->query); + pgfdw_report_error(res, conn, fsstate->query); PQclear(res); /* Mark the cursor as created, and show no tuples have been retrieved */ @@ -3807,7 +3798,10 @@ static void fetch_more_data(ForeignScanState *node) { PgFdwScanState *fsstate = (PgFdwScanState *) node->fdw_state; - PGresult *volatile res = NULL; + PGconn *conn = fsstate->conn; + PGresult *res; + int numrows; + int i; MemoryContext oldcontext; /* @@ -3818,74 +3812,63 @@ fetch_more_data(ForeignScanState *node) MemoryContextReset(fsstate->batch_cxt); oldcontext = MemoryContextSwitchTo(fsstate->batch_cxt); - /* PGresult must be released before leaving this function. */ - PG_TRY(); + if (fsstate->async_capable) { - PGconn *conn = fsstate->conn; - int numrows; - int i; + Assert(fsstate->conn_state->pendingAreq); - if (fsstate->async_capable) - { - Assert(fsstate->conn_state->pendingAreq); + /* + * The query was already sent by an earlier call to + * fetch_more_data_begin. So now we just fetch the result. + */ + res = pgfdw_get_result(conn); + /* On error, report the original query, not the FETCH. */ + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, fsstate->query); - /* - * The query was already sent by an earlier call to - * fetch_more_data_begin. So now we just fetch the result. - */ - res = pgfdw_get_result(conn); - /* On error, report the original query, not the FETCH. */ - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, fsstate->query); + /* Reset per-connection state */ + fsstate->conn_state->pendingAreq = NULL; + } + else + { + char sql[64]; - /* Reset per-connection state */ - fsstate->conn_state->pendingAreq = NULL; - } - else - { - char sql[64]; + /* This is a regular synchronous fetch. */ + snprintf(sql, sizeof(sql), "FETCH %d FROM c%u", + fsstate->fetch_size, fsstate->cursor_number); - /* This is a regular synchronous fetch. */ - snprintf(sql, sizeof(sql), "FETCH %d FROM c%u", - fsstate->fetch_size, fsstate->cursor_number); + res = pgfdw_exec_query(conn, sql, fsstate->conn_state); + /* On error, report the original query, not the FETCH. */ + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, fsstate->query); + } - res = pgfdw_exec_query(conn, sql, fsstate->conn_state); - /* On error, report the original query, not the FETCH. 
*/ - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, fsstate->query); - } + /* Convert the data into HeapTuples */ + numrows = PQntuples(res); + fsstate->tuples = (HeapTuple *) palloc0(numrows * sizeof(HeapTuple)); + fsstate->num_tuples = numrows; + fsstate->next_tuple = 0; - /* Convert the data into HeapTuples */ - numrows = PQntuples(res); - fsstate->tuples = (HeapTuple *) palloc0(numrows * sizeof(HeapTuple)); - fsstate->num_tuples = numrows; - fsstate->next_tuple = 0; + for (i = 0; i < numrows; i++) + { + Assert(IsA(node->ss.ps.plan, ForeignScan)); - for (i = 0; i < numrows; i++) - { - Assert(IsA(node->ss.ps.plan, ForeignScan)); - - fsstate->tuples[i] = - make_tuple_from_result_row(res, i, - fsstate->rel, - fsstate->attinmeta, - fsstate->retrieved_attrs, - node, - fsstate->temp_cxt); - } + fsstate->tuples[i] = + make_tuple_from_result_row(res, i, + fsstate->rel, + fsstate->attinmeta, + fsstate->retrieved_attrs, + node, + fsstate->temp_cxt); + } - /* Update fetch_ct_2 */ - if (fsstate->fetch_ct_2 < 2) - fsstate->fetch_ct_2++; + /* Update fetch_ct_2 */ + if (fsstate->fetch_ct_2 < 2) + fsstate->fetch_ct_2++; - /* Must be EOF if we didn't get as many tuples as we asked for. */ - fsstate->eof_reached = (numrows < fsstate->fetch_size); - } - PG_FINALLY(); - { - PQclear(res); - } - PG_END_TRY(); + /* Must be EOF if we didn't get as many tuples as we asked for. */ + fsstate->eof_reached = (numrows < fsstate->fetch_size); + + PQclear(res); MemoryContextSwitchTo(oldcontext); } @@ -3959,14 +3942,9 @@ close_cursor(PGconn *conn, unsigned int cursor_number, PGresult *res; snprintf(sql, sizeof(sql), "CLOSE c%u", cursor_number); - - /* - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. - */ res = pgfdw_exec_query(conn, sql, conn_state); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, conn, true, sql); + pgfdw_report_error(res, conn, sql); PQclear(res); } @@ -3999,7 +3977,7 @@ create_foreign_modify(EState *estate, ListCell *lc; /* Begin constructing PgFdwModifyState. */ - fmstate = (PgFdwModifyState *) palloc0(sizeof(PgFdwModifyState)); + fmstate = palloc0_object(PgFdwModifyState); fmstate->rel = rel; /* Identify which user to do the remote access as. */ @@ -4036,7 +4014,7 @@ create_foreign_modify(EState *estate, /* Prepare for output conversion of parameters used in prepared stmt. */ n_params = list_length(fmstate->target_attrs) + 1; - fmstate->p_flinfo = (FmgrInfo *) palloc0(sizeof(FmgrInfo) * n_params); + fmstate->p_flinfo = palloc0_array(FmgrInfo, n_params); fmstate->p_nums = 0; if (operation == CMD_UPDATE || operation == CMD_DELETE) @@ -4174,18 +4152,15 @@ execute_foreign_modify(EState *estate, NULL, NULL, 0)) - pgfdw_report_error(ERROR, NULL, fmstate->conn, false, fmstate->query); + pgfdw_report_error(NULL, fmstate->conn, fmstate->query); /* * Get the result, and check for success. - * - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. */ res = pgfdw_get_result(fmstate->conn); if (PQresultStatus(res) != (fmstate->has_returning ? 
PGRES_TUPLES_OK : PGRES_COMMAND_OK)) - pgfdw_report_error(ERROR, res, fmstate->conn, true, fmstate->query); + pgfdw_report_error(res, fmstate->conn, fmstate->query); /* Check number of rows affected, and fetch RETURNING tuple if any */ if (fmstate->has_returning) @@ -4244,17 +4219,14 @@ prepare_foreign_modify(PgFdwModifyState *fmstate) fmstate->query, 0, NULL)) - pgfdw_report_error(ERROR, NULL, fmstate->conn, false, fmstate->query); + pgfdw_report_error(NULL, fmstate->conn, fmstate->query); /* * Get the result, and check for success. - * - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. */ res = pgfdw_get_result(fmstate->conn); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, fmstate->conn, true, fmstate->query); + pgfdw_report_error(res, fmstate->conn, fmstate->query); PQclear(res); /* This action shows that the prepare has been done. */ @@ -4345,37 +4317,25 @@ convert_prep_stmt_params(PgFdwModifyState *fmstate, /* * store_returning_result * Store the result of a RETURNING clause - * - * On error, be sure to release the PGresult on the way out. Callers do not - * have PG_TRY blocks to ensure this happens. */ static void store_returning_result(PgFdwModifyState *fmstate, TupleTableSlot *slot, PGresult *res) { - PG_TRY(); - { - HeapTuple newtup; + HeapTuple newtup; - newtup = make_tuple_from_result_row(res, 0, - fmstate->rel, - fmstate->attinmeta, - fmstate->retrieved_attrs, - NULL, - fmstate->temp_cxt); + newtup = make_tuple_from_result_row(res, 0, + fmstate->rel, + fmstate->attinmeta, + fmstate->retrieved_attrs, + NULL, + fmstate->temp_cxt); - /* - * The returning slot will not necessarily be suitable to store - * heaptuples directly, so allow for conversion. - */ - ExecForceStoreHeapTuple(newtup, slot, true); - } - PG_CATCH(); - { - PQclear(res); - PG_RE_THROW(); - } - PG_END_TRY(); + /* + * The returning slot will not necessarily be suitable to store heaptuples + * directly, so allow for conversion. + */ + ExecForceStoreHeapTuple(newtup, slot, true); } /* @@ -4411,14 +4371,9 @@ deallocate_query(PgFdwModifyState *fmstate) return; snprintf(sql, sizeof(sql), "DEALLOCATE %s", fmstate->p_name); - - /* - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. - */ res = pgfdw_exec_query(fmstate->conn, sql, fmstate->conn_state); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, fmstate->conn, true, sql); + pgfdw_report_error(res, fmstate->conn, sql); PQclear(res); pfree(fmstate->p_name); fmstate->p_name = NULL; @@ -4586,20 +4541,24 @@ execute_dml_stmt(ForeignScanState *node) */ if (!PQsendQueryParams(dmstate->conn, dmstate->query, numParams, NULL, values, NULL, NULL, 0)) - pgfdw_report_error(ERROR, NULL, dmstate->conn, false, dmstate->query); + pgfdw_report_error(NULL, dmstate->conn, dmstate->query); /* * Get the result, and check for success. - * - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. */ dmstate->result = pgfdw_get_result(dmstate->conn); if (PQresultStatus(dmstate->result) != (dmstate->has_returning ? PGRES_TUPLES_OK : PGRES_COMMAND_OK)) - pgfdw_report_error(ERROR, dmstate->result, dmstate->conn, true, + pgfdw_report_error(dmstate->result, dmstate->conn, dmstate->query); + /* + * The result potentially needs to survive across multiple executor row + * cycles, so move it to the context where the dmstate is. 
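+	 * (get_returning_data() keeps reading rows out of dmstate->result on
+	 * subsequent calls, one row per executor cycle.)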
+ */ + dmstate->result = libpqsrv_PGresultSetParent(dmstate->result, + GetMemoryChunkContext(dmstate)); + /* Get the number of rows affected. */ if (dmstate->has_returning) dmstate->num_tuples = PQntuples(dmstate->result); @@ -4641,30 +4600,16 @@ get_returning_data(ForeignScanState *node) } else { - /* - * On error, be sure to release the PGresult on the way out. Callers - * do not have PG_TRY blocks to ensure this happens. - */ - PG_TRY(); - { - HeapTuple newtup; - - newtup = make_tuple_from_result_row(dmstate->result, - dmstate->next_tuple, - dmstate->rel, - dmstate->attinmeta, - dmstate->retrieved_attrs, - node, - dmstate->temp_cxt); - ExecStoreHeapTuple(newtup, slot, false); - } - PG_CATCH(); - { - PQclear(dmstate->result); - PG_RE_THROW(); - } - PG_END_TRY(); + HeapTuple newtup; + newtup = make_tuple_from_result_row(dmstate->result, + dmstate->next_tuple, + dmstate->rel, + dmstate->attinmeta, + dmstate->retrieved_attrs, + node, + dmstate->temp_cxt); + ExecStoreHeapTuple(newtup, slot, false); /* Get the updated/deleted tuple. */ if (dmstate->rel) resultSlot = slot; @@ -4869,7 +4814,7 @@ prepare_query_params(PlanState *node, Assert(numParams > 0); /* Prepare for output conversion of parameters used in remote query. */ - *param_flinfo = (FmgrInfo *) palloc0(sizeof(FmgrInfo) * numParams); + *param_flinfo = palloc0_array(FmgrInfo, numParams); i = 0; foreach(lc, fdw_exprs) @@ -4950,7 +4895,7 @@ postgresAnalyzeForeignTable(Relation relation, UserMapping *user; PGconn *conn; StringInfoData sql; - PGresult *volatile res = NULL; + PGresult *res; /* Return the row-analysis function pointer */ *func = postgresAcquireSampleRowsFunc; @@ -4976,22 +4921,14 @@ postgresAnalyzeForeignTable(Relation relation, initStringInfo(&sql); deparseAnalyzeSizeSql(&sql, relation); - /* In what follows, do not risk leaking any PGresults. */ - PG_TRY(); - { - res = pgfdw_exec_query(conn, sql.data, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, sql.data); + res = pgfdw_exec_query(conn, sql.data, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, sql.data); - if (PQntuples(res) != 1 || PQnfields(res) != 1) - elog(ERROR, "unexpected result from deparseAnalyzeSizeSql query"); - *totalpages = strtoul(PQgetvalue(res, 0, 0), NULL, 10); - } - PG_FINALLY(); - { - PQclear(res); - } - PG_END_TRY(); + if (PQntuples(res) != 1 || PQnfields(res) != 1) + elog(ERROR, "unexpected result from deparseAnalyzeSizeSql query"); + *totalpages = strtoul(PQgetvalue(res, 0, 0), NULL, 10); + PQclear(res); ReleaseConnection(conn); @@ -5012,9 +4949,9 @@ postgresGetAnalyzeInfoForForeignTable(Relation relation, bool *can_tablesample) UserMapping *user; PGconn *conn; StringInfoData sql; - PGresult *volatile res = NULL; - volatile double reltuples = -1; - volatile char relkind = 0; + PGresult *res; + double reltuples; + char relkind; /* assume the remote relation does not support TABLESAMPLE */ *can_tablesample = false; @@ -5033,24 +4970,15 @@ postgresGetAnalyzeInfoForForeignTable(Relation relation, bool *can_tablesample) initStringInfo(&sql); deparseAnalyzeInfoSql(&sql, relation); - /* In what follows, do not risk leaking any PGresults. 
*/ - PG_TRY(); - { - res = pgfdw_exec_query(conn, sql.data, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, sql.data); + res = pgfdw_exec_query(conn, sql.data, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, sql.data); - if (PQntuples(res) != 1 || PQnfields(res) != 2) - elog(ERROR, "unexpected result from deparseAnalyzeInfoSql query"); - reltuples = strtod(PQgetvalue(res, 0, 0), NULL); - relkind = *(PQgetvalue(res, 0, 1)); - } - PG_FINALLY(); - { - if (res) - PQclear(res); - } - PG_END_TRY(); + if (PQntuples(res) != 1 || PQnfields(res) != 2) + elog(ERROR, "unexpected result from deparseAnalyzeInfoSql query"); + reltuples = strtod(PQgetvalue(res, 0, 0), NULL); + relkind = *(PQgetvalue(res, 0, 1)); + PQclear(res); ReleaseConnection(conn); @@ -5090,10 +5018,12 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel, int server_version_num; PgFdwSamplingMethod method = ANALYZE_SAMPLE_AUTO; /* auto is default */ double sample_frac = -1.0; - double reltuples; + double reltuples = -1.0; unsigned int cursor_number; StringInfoData sql; - PGresult *volatile res = NULL; + PGresult *res; + char fetch_sql[64]; + int fetch_size; ListCell *lc; /* Initialize workspace state */ @@ -5270,91 +5200,76 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel, deparseAnalyzeSql(&sql, relation, method, sample_frac, &astate.retrieved_attrs); - /* In what follows, do not risk leaking any PGresults. */ - PG_TRY(); - { - char fetch_sql[64]; - int fetch_size; - - res = pgfdw_exec_query(conn, sql.data, NULL); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, conn, false, sql.data); - PQclear(res); - res = NULL; + res = pgfdw_exec_query(conn, sql.data, NULL); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + pgfdw_report_error(res, conn, sql.data); + PQclear(res); - /* - * Determine the fetch size. The default is arbitrary, but shouldn't - * be enormous. - */ - fetch_size = 100; - foreach(lc, server->options) - { - DefElem *def = (DefElem *) lfirst(lc); + /* + * Determine the fetch size. The default is arbitrary, but shouldn't be + * enormous. + */ + fetch_size = 100; + foreach(lc, server->options) + { + DefElem *def = (DefElem *) lfirst(lc); - if (strcmp(def->defname, "fetch_size") == 0) - { - (void) parse_int(defGetString(def), &fetch_size, 0, NULL); - break; - } - } - foreach(lc, table->options) + if (strcmp(def->defname, "fetch_size") == 0) { - DefElem *def = (DefElem *) lfirst(lc); - - if (strcmp(def->defname, "fetch_size") == 0) - { - (void) parse_int(defGetString(def), &fetch_size, 0, NULL); - break; - } + (void) parse_int(defGetString(def), &fetch_size, 0, NULL); + break; } + } + foreach(lc, table->options) + { + DefElem *def = (DefElem *) lfirst(lc); - /* Construct command to fetch rows from remote. */ - snprintf(fetch_sql, sizeof(fetch_sql), "FETCH %d FROM c%u", - fetch_size, cursor_number); - - /* Retrieve and process rows a batch at a time. */ - for (;;) + if (strcmp(def->defname, "fetch_size") == 0) { - int numrows; - int i; + (void) parse_int(defGetString(def), &fetch_size, 0, NULL); + break; + } + } - /* Allow users to cancel long query */ - CHECK_FOR_INTERRUPTS(); + /* Construct command to fetch rows from remote. 
*/ + snprintf(fetch_sql, sizeof(fetch_sql), "FETCH %d FROM c%u", + fetch_size, cursor_number); - /* - * XXX possible future improvement: if rowstoskip is large, we - * could issue a MOVE rather than physically fetching the rows, - * then just adjust rowstoskip and samplerows appropriately. - */ + /* Retrieve and process rows a batch at a time. */ + for (;;) + { + int numrows; + int i; - /* Fetch some rows */ - res = pgfdw_exec_query(conn, fetch_sql, NULL); - /* On error, report the original query, not the FETCH. */ - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, sql.data); + /* Allow users to cancel long query */ + CHECK_FOR_INTERRUPTS(); - /* Process whatever we got. */ - numrows = PQntuples(res); - for (i = 0; i < numrows; i++) - analyze_row_processor(res, i, &astate); + /* + * XXX possible future improvement: if rowstoskip is large, we could + * issue a MOVE rather than physically fetching the rows, then just + * adjust rowstoskip and samplerows appropriately. + */ - PQclear(res); - res = NULL; + /* Fetch some rows */ + res = pgfdw_exec_query(conn, fetch_sql, NULL); + /* On error, report the original query, not the FETCH. */ + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, sql.data); - /* Must be EOF if we didn't get all the rows requested. */ - if (numrows < fetch_size) - break; - } + /* Process whatever we got. */ + numrows = PQntuples(res); + for (i = 0; i < numrows; i++) + analyze_row_processor(res, i, &astate); - /* Close the cursor, just to be tidy. */ - close_cursor(conn, cursor_number, NULL); - } - PG_CATCH(); - { PQclear(res); - PG_RE_THROW(); + + /* Must be EOF if we didn't get all the rows requested. */ + if (numrows < fetch_size) + break; } - PG_END_TRY(); + + /* Close the cursor, just to be tidy. */ + close_cursor(conn, cursor_number, NULL); ReleaseConnection(conn); @@ -5466,7 +5381,7 @@ postgresImportForeignSchema(ImportForeignSchemaStmt *stmt, Oid serverOid) UserMapping *mapping; PGconn *conn; StringInfoData buf; - PGresult *volatile res = NULL; + PGresult *res; int numrows, i; ListCell *lc; @@ -5505,243 +5420,231 @@ postgresImportForeignSchema(ImportForeignSchemaStmt *stmt, Oid serverOid) /* Create workspace for strings */ initStringInfo(&buf); - /* In what follows, do not risk leaking any PGresults. 
*/ - PG_TRY(); - { - /* Check that the schema really exists */ - appendStringInfoString(&buf, "SELECT 1 FROM pg_catalog.pg_namespace WHERE nspname = "); - deparseStringLiteral(&buf, stmt->remote_schema); + /* Check that the schema really exists */ + appendStringInfoString(&buf, "SELECT 1 FROM pg_catalog.pg_namespace WHERE nspname = "); + deparseStringLiteral(&buf, stmt->remote_schema); - res = pgfdw_exec_query(conn, buf.data, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, buf.data); + res = pgfdw_exec_query(conn, buf.data, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, buf.data); - if (PQntuples(res) != 1) - ereport(ERROR, - (errcode(ERRCODE_FDW_SCHEMA_NOT_FOUND), - errmsg("schema \"%s\" is not present on foreign server \"%s\"", - stmt->remote_schema, server->servername))); + if (PQntuples(res) != 1) + ereport(ERROR, + (errcode(ERRCODE_FDW_SCHEMA_NOT_FOUND), + errmsg("schema \"%s\" is not present on foreign server \"%s\"", + stmt->remote_schema, server->servername))); - PQclear(res); - res = NULL; - resetStringInfo(&buf); + PQclear(res); + resetStringInfo(&buf); - /* - * Fetch all table data from this schema, possibly restricted by - * EXCEPT or LIMIT TO. (We don't actually need to pay any attention - * to EXCEPT/LIMIT TO here, because the core code will filter the - * statements we return according to those lists anyway. But it - * should save a few cycles to not process excluded tables in the - * first place.) - * - * Import table data for partitions only when they are explicitly - * specified in LIMIT TO clause. Otherwise ignore them and only - * include the definitions of the root partitioned tables to allow - * access to the complete remote data set locally in the schema - * imported. - * - * Note: because we run the connection with search_path restricted to - * pg_catalog, the format_type() and pg_get_expr() outputs will always - * include a schema name for types/functions in other schemas, which - * is what we want. - */ + /* + * Fetch all table data from this schema, possibly restricted by EXCEPT or + * LIMIT TO. (We don't actually need to pay any attention to EXCEPT/LIMIT + * TO here, because the core code will filter the statements we return + * according to those lists anyway. But it should save a few cycles to + * not process excluded tables in the first place.) + * + * Import table data for partitions only when they are explicitly + * specified in LIMIT TO clause. Otherwise ignore them and only include + * the definitions of the root partitioned tables to allow access to the + * complete remote data set locally in the schema imported. + * + * Note: because we run the connection with search_path restricted to + * pg_catalog, the format_type() and pg_get_expr() outputs will always + * include a schema name for types/functions in other schemas, which is + * what we want. 
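+	 * (Names in pg_catalog itself still come out unqualified, which is also
+	 * fine, since pg_catalog is always implicitly visible.)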
+ */ + appendStringInfoString(&buf, + "SELECT relname, " + " attname, " + " format_type(atttypid, atttypmod), " + " attnotnull, " + " pg_get_expr(adbin, adrelid), "); + + /* Generated columns are supported since Postgres 12 */ + if (PQserverVersion(conn) >= 120000) appendStringInfoString(&buf, - "SELECT relname, " - " attname, " - " format_type(atttypid, atttypmod), " - " attnotnull, " - " pg_get_expr(adbin, adrelid), "); - - /* Generated columns are supported since Postgres 12 */ - if (PQserverVersion(conn) >= 120000) - appendStringInfoString(&buf, - " attgenerated, "); - else - appendStringInfoString(&buf, - " NULL, "); - - if (import_collate) - appendStringInfoString(&buf, - " collname, " - " collnsp.nspname "); - else - appendStringInfoString(&buf, - " NULL, NULL "); - + " attgenerated, "); + else appendStringInfoString(&buf, - "FROM pg_class c " - " JOIN pg_namespace n ON " - " relnamespace = n.oid " - " LEFT JOIN pg_attribute a ON " - " attrelid = c.oid AND attnum > 0 " - " AND NOT attisdropped " - " LEFT JOIN pg_attrdef ad ON " - " adrelid = c.oid AND adnum = attnum "); - - if (import_collate) - appendStringInfoString(&buf, - " LEFT JOIN pg_collation coll ON " - " coll.oid = attcollation " - " LEFT JOIN pg_namespace collnsp ON " - " collnsp.oid = collnamespace "); + " NULL, "); + if (import_collate) appendStringInfoString(&buf, - "WHERE c.relkind IN (" - CppAsString2(RELKIND_RELATION) "," - CppAsString2(RELKIND_VIEW) "," - CppAsString2(RELKIND_FOREIGN_TABLE) "," - CppAsString2(RELKIND_MATVIEW) "," - CppAsString2(RELKIND_PARTITIONED_TABLE) ") " - " AND n.nspname = "); - deparseStringLiteral(&buf, stmt->remote_schema); - - /* Partitions are supported since Postgres 10 */ - if (PQserverVersion(conn) >= 100000 && - stmt->list_type != FDW_IMPORT_SCHEMA_LIMIT_TO) - appendStringInfoString(&buf, " AND NOT c.relispartition "); - - /* Apply restrictions for LIMIT TO and EXCEPT */ - if (stmt->list_type == FDW_IMPORT_SCHEMA_LIMIT_TO || - stmt->list_type == FDW_IMPORT_SCHEMA_EXCEPT) + " collname, " + " collnsp.nspname "); + else + appendStringInfoString(&buf, + " NULL, NULL "); + + appendStringInfoString(&buf, + "FROM pg_class c " + " JOIN pg_namespace n ON " + " relnamespace = n.oid " + " LEFT JOIN pg_attribute a ON " + " attrelid = c.oid AND attnum > 0 " + " AND NOT attisdropped " + " LEFT JOIN pg_attrdef ad ON " + " adrelid = c.oid AND adnum = attnum "); + + if (import_collate) + appendStringInfoString(&buf, + " LEFT JOIN pg_collation coll ON " + " coll.oid = attcollation " + " LEFT JOIN pg_namespace collnsp ON " + " collnsp.oid = collnamespace "); + + appendStringInfoString(&buf, + "WHERE c.relkind IN (" + CppAsString2(RELKIND_RELATION) "," + CppAsString2(RELKIND_VIEW) "," + CppAsString2(RELKIND_FOREIGN_TABLE) "," + CppAsString2(RELKIND_MATVIEW) "," + CppAsString2(RELKIND_PARTITIONED_TABLE) ") " + " AND n.nspname = "); + deparseStringLiteral(&buf, stmt->remote_schema); + + /* Partitions are supported since Postgres 10 */ + if (PQserverVersion(conn) >= 100000 && + stmt->list_type != FDW_IMPORT_SCHEMA_LIMIT_TO) + appendStringInfoString(&buf, " AND NOT c.relispartition "); + + /* Apply restrictions for LIMIT TO and EXCEPT */ + if (stmt->list_type == FDW_IMPORT_SCHEMA_LIMIT_TO || + stmt->list_type == FDW_IMPORT_SCHEMA_EXCEPT) + { + bool first_item = true; + + appendStringInfoString(&buf, " AND c.relname "); + if (stmt->list_type == FDW_IMPORT_SCHEMA_EXCEPT) + appendStringInfoString(&buf, "NOT "); + appendStringInfoString(&buf, "IN ("); + + /* Append list of table names within IN clause */ + 
foreach(lc, stmt->table_list) { - bool first_item = true; + RangeVar *rv = (RangeVar *) lfirst(lc); - appendStringInfoString(&buf, " AND c.relname "); - if (stmt->list_type == FDW_IMPORT_SCHEMA_EXCEPT) - appendStringInfoString(&buf, "NOT "); - appendStringInfoString(&buf, "IN ("); + if (first_item) + first_item = false; + else + appendStringInfoString(&buf, ", "); + deparseStringLiteral(&buf, rv->relname); + } + appendStringInfoChar(&buf, ')'); + } - /* Append list of table names within IN clause */ - foreach(lc, stmt->table_list) - { - RangeVar *rv = (RangeVar *) lfirst(lc); + /* Append ORDER BY at the end of query to ensure output ordering */ + appendStringInfoString(&buf, " ORDER BY c.relname, a.attnum"); - if (first_item) - first_item = false; - else - appendStringInfoString(&buf, ", "); - deparseStringLiteral(&buf, rv->relname); - } - appendStringInfoChar(&buf, ')'); - } + /* Fetch the data */ + res = pgfdw_exec_query(conn, buf.data, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, buf.data); - /* Append ORDER BY at the end of query to ensure output ordering */ - appendStringInfoString(&buf, " ORDER BY c.relname, a.attnum"); + /* Process results */ + numrows = PQntuples(res); + /* note: incrementation of i happens in inner loop's while() test */ + for (i = 0; i < numrows;) + { + char *tablename = PQgetvalue(res, i, 0); + bool first_item = true; - /* Fetch the data */ - res = pgfdw_exec_query(conn, buf.data, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, buf.data); + resetStringInfo(&buf); + appendStringInfo(&buf, "CREATE FOREIGN TABLE %s (\n", + quote_identifier(tablename)); - /* Process results */ - numrows = PQntuples(res); - /* note: incrementation of i happens in inner loop's while() test */ - for (i = 0; i < numrows;) + /* Scan all rows for this table */ + do { - char *tablename = PQgetvalue(res, i, 0); - bool first_item = true; + char *attname; + char *typename; + char *attnotnull; + char *attgenerated; + char *attdefault; + char *collname; + char *collnamespace; + + /* If table has no columns, we'll see nulls here */ + if (PQgetisnull(res, i, 1)) + continue; - resetStringInfo(&buf); - appendStringInfo(&buf, "CREATE FOREIGN TABLE %s (\n", - quote_identifier(tablename)); + attname = PQgetvalue(res, i, 1); + typename = PQgetvalue(res, i, 2); + attnotnull = PQgetvalue(res, i, 3); + attdefault = PQgetisnull(res, i, 4) ? NULL : + PQgetvalue(res, i, 4); + attgenerated = PQgetisnull(res, i, 5) ? NULL : + PQgetvalue(res, i, 5); + collname = PQgetisnull(res, i, 6) ? NULL : + PQgetvalue(res, i, 6); + collnamespace = PQgetisnull(res, i, 7) ? NULL : + PQgetvalue(res, i, 7); + + if (first_item) + first_item = false; + else + appendStringInfoString(&buf, ",\n"); - /* Scan all rows for this table */ - do - { - char *attname; - char *typename; - char *attnotnull; - char *attgenerated; - char *attdefault; - char *collname; - char *collnamespace; - - /* If table has no columns, we'll see nulls here */ - if (PQgetisnull(res, i, 1)) - continue; + /* Print column name and type */ + appendStringInfo(&buf, " %s %s", + quote_identifier(attname), + typename); - attname = PQgetvalue(res, i, 1); - typename = PQgetvalue(res, i, 2); - attnotnull = PQgetvalue(res, i, 3); - attdefault = PQgetisnull(res, i, 4) ? NULL : - PQgetvalue(res, i, 4); - attgenerated = PQgetisnull(res, i, 5) ? NULL : - PQgetvalue(res, i, 5); - collname = PQgetisnull(res, i, 6) ? 
NULL : - PQgetvalue(res, i, 6); - collnamespace = PQgetisnull(res, i, 7) ? NULL : - PQgetvalue(res, i, 7); - - if (first_item) - first_item = false; - else - appendStringInfoString(&buf, ",\n"); + /* + * Add column_name option so that renaming the foreign table's + * column doesn't break the association to the underlying column. + */ + appendStringInfoString(&buf, " OPTIONS (column_name "); + deparseStringLiteral(&buf, attname); + appendStringInfoChar(&buf, ')'); - /* Print column name and type */ - appendStringInfo(&buf, " %s %s", - quote_identifier(attname), - typename); + /* Add COLLATE if needed */ + if (import_collate && collname != NULL && collnamespace != NULL) + appendStringInfo(&buf, " COLLATE %s.%s", + quote_identifier(collnamespace), + quote_identifier(collname)); - /* - * Add column_name option so that renaming the foreign table's - * column doesn't break the association to the underlying - * column. - */ - appendStringInfoString(&buf, " OPTIONS (column_name "); - deparseStringLiteral(&buf, attname); - appendStringInfoChar(&buf, ')'); - - /* Add COLLATE if needed */ - if (import_collate && collname != NULL && collnamespace != NULL) - appendStringInfo(&buf, " COLLATE %s.%s", - quote_identifier(collnamespace), - quote_identifier(collname)); - - /* Add DEFAULT if needed */ - if (import_default && attdefault != NULL && - (!attgenerated || !attgenerated[0])) - appendStringInfo(&buf, " DEFAULT %s", attdefault); - - /* Add GENERATED if needed */ - if (import_generated && attgenerated != NULL && - attgenerated[0] == ATTRIBUTE_GENERATED_STORED) - { - Assert(attdefault != NULL); - appendStringInfo(&buf, - " GENERATED ALWAYS AS (%s) STORED", - attdefault); - } + /* Add DEFAULT if needed */ + if (import_default && attdefault != NULL && + (!attgenerated || !attgenerated[0])) + appendStringInfo(&buf, " DEFAULT %s", attdefault); - /* Add NOT NULL if needed */ - if (import_not_null && attnotnull[0] == 't') - appendStringInfoString(&buf, " NOT NULL"); + /* Add GENERATED if needed */ + if (import_generated && attgenerated != NULL && + attgenerated[0] == ATTRIBUTE_GENERATED_STORED) + { + Assert(attdefault != NULL); + appendStringInfo(&buf, + " GENERATED ALWAYS AS (%s) STORED", + attdefault); } - while (++i < numrows && - strcmp(PQgetvalue(res, i, 0), tablename) == 0); - /* - * Add server name and table-level options. We specify remote - * schema and table name as options (the latter to ensure that - * renaming the foreign table doesn't break the association). - */ - appendStringInfo(&buf, "\n) SERVER %s\nOPTIONS (", - quote_identifier(server->servername)); + /* Add NOT NULL if needed */ + if (import_not_null && attnotnull[0] == 't') + appendStringInfoString(&buf, " NOT NULL"); + } + while (++i < numrows && + strcmp(PQgetvalue(res, i, 0), tablename) == 0); - appendStringInfoString(&buf, "schema_name "); - deparseStringLiteral(&buf, stmt->remote_schema); - appendStringInfoString(&buf, ", table_name "); - deparseStringLiteral(&buf, tablename); + /* + * Add server name and table-level options. We specify remote schema + * and table name as options (the latter to ensure that renaming the + * foreign table doesn't break the association). 
+ */ + appendStringInfo(&buf, "\n) SERVER %s\nOPTIONS (", + quote_identifier(server->servername)); - appendStringInfoString(&buf, ");"); + appendStringInfoString(&buf, "schema_name "); + deparseStringLiteral(&buf, stmt->remote_schema); + appendStringInfoString(&buf, ", table_name "); + deparseStringLiteral(&buf, tablename); - commands = lappend(commands, pstrdup(buf.data)); - } - } - PG_FINALLY(); - { - PQclear(res); + appendStringInfoString(&buf, ");"); + + commands = lappend(commands, pstrdup(buf.data)); } - PG_END_TRY(); + PQclear(res); ReleaseConnection(conn); @@ -6394,7 +6297,7 @@ postgresGetForeignJoinPaths(PlannerInfo *root, * if found safe. Once we know that this join can be pushed down, we fill * the entry. */ - fpinfo = (PgFdwRelationInfo *) palloc0(sizeof(PgFdwRelationInfo)); + fpinfo = palloc0_object(PgFdwRelationInfo); fpinfo->pushdown_safe = false; joinrel->fdw_private = fpinfo; /* attrs_used is only for base relations. */ @@ -6763,7 +6666,7 @@ postgresGetForeignUpperPaths(PlannerInfo *root, UpperRelationKind stage, output_rel->fdw_private) return; - fpinfo = (PgFdwRelationInfo *) palloc0(sizeof(PgFdwRelationInfo)); + fpinfo = palloc0_object(PgFdwRelationInfo); fpinfo->pushdown_safe = false; fpinfo->stage = stage; output_rel->fdw_private = fpinfo; @@ -6988,7 +6891,7 @@ add_foreign_ordered_paths(PlannerInfo *root, RelOptInfo *input_rel, fpinfo->pushdown_safe = true; /* Construct PgFdwPathExtraData */ - fpextra = (PgFdwPathExtraData *) palloc0(sizeof(PgFdwPathExtraData)); + fpextra = palloc0_object(PgFdwPathExtraData); fpextra->target = root->upper_targets[UPPERREL_ORDERED]; fpextra->has_final_sort = true; @@ -7222,7 +7125,7 @@ add_foreign_final_paths(PlannerInfo *root, RelOptInfo *input_rel, fpinfo->pushdown_safe = true; /* Construct PgFdwPathExtraData */ - fpextra = (PgFdwPathExtraData *) palloc0(sizeof(PgFdwPathExtraData)); + fpextra = palloc0_object(PgFdwPathExtraData); fpextra->target = root->upper_targets[UPPERREL_FINAL]; fpextra->has_final_sort = has_final_sort; fpextra->has_limit = extra->limit_needed; @@ -7409,7 +7312,7 @@ postgresForeignAsyncNotify(AsyncRequest *areq) /* On error, report the original query, not the FETCH. 
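The query text identifies the scan; the generated FETCH command would not.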
*/ if (!PQconsumeInput(fsstate->conn)) - pgfdw_report_error(ERROR, NULL, fsstate->conn, false, fsstate->query); + pgfdw_report_error(NULL, fsstate->conn, fsstate->query); fetch_more_data(node); @@ -7508,7 +7411,7 @@ fetch_more_data_begin(AsyncRequest *areq) fsstate->fetch_size, fsstate->cursor_number); if (!PQsendQuery(fsstate->conn, sql)) - pgfdw_report_error(ERROR, NULL, fsstate->conn, false, fsstate->query); + pgfdw_report_error(NULL, fsstate->conn, fsstate->query); /* Remember that the request is in process */ fsstate->conn_state->pendingAreq = areq; diff --git a/contrib/postgres_fdw/postgres_fdw.h b/contrib/postgres_fdw/postgres_fdw.h index 81358f3bde7df..e69735298d78f 100644 --- a/contrib/postgres_fdw/postgres_fdw.h +++ b/contrib/postgres_fdw/postgres_fdw.h @@ -15,7 +15,7 @@ #include "foreign/foreign.h" #include "lib/stringinfo.h" -#include "libpq-fe.h" +#include "libpq/libpq-be-fe.h" #include "nodes/execnodes.h" #include "nodes/pathnodes.h" #include "utils/relcache.h" @@ -166,8 +166,10 @@ extern void do_sql_command(PGconn *conn, const char *sql); extern PGresult *pgfdw_get_result(PGconn *conn); extern PGresult *pgfdw_exec_query(PGconn *conn, const char *query, PgFdwConnState *state); -extern void pgfdw_report_error(int elevel, PGresult *res, PGconn *conn, - bool clear, const char *sql); +pg_noreturn extern void pgfdw_report_error(PGresult *res, PGconn *conn, + const char *sql); +extern void pgfdw_report(int elevel, PGresult *res, PGconn *conn, + const char *sql); /* in option.c */ extern int ExtractConnectionOptions(List *defelems, diff --git a/contrib/postgres_fdw/specs/eval_plan_qual.spec b/contrib/postgres_fdw/specs/eval_plan_qual.spec new file mode 100644 index 0000000000000..9f52270db6984 --- /dev/null +++ b/contrib/postgres_fdw/specs/eval_plan_qual.spec @@ -0,0 +1,102 @@ +# Tests for the EvalPlanQual mechanism involving foreign tables + +setup +{ + DO $d$ + BEGIN + EXECUTE $$CREATE SERVER loopback FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (dbname '$$||current_database()||$$', + port '$$||current_setting('port')||$$', + use_remote_estimate 'true' + )$$; + END; + $d$; + CREATE USER MAPPING FOR PUBLIC SERVER loopback; + + CREATE TABLE l (i int, v text); + CREATE TABLE t (i int, v text); + CREATE FOREIGN TABLE ft (i int, v text) SERVER loopback OPTIONS (table_name 't'); + + INSERT INTO l VALUES (123, 'foo'), (456, 'bar'), (789, 'baz'); + INSERT INTO t SELECT i, to_char(i, 'FM0000') FROM generate_series(1, 1000) i; + CREATE INDEX t_idx ON t (i); + ANALYZE l, t; + + CREATE TABLE a (i int); + CREATE TABLE b (i int); + CREATE TABLE c (i int); + CREATE FOREIGN TABLE fb (i int) SERVER loopback OPTIONS (table_name 'b'); + CREATE FOREIGN TABLE fc (i int) SERVER loopback OPTIONS (table_name 'c'); + + INSERT INTO a VALUES (1); + INSERT INTO b VALUES (1); + INSERT INTO c VALUES (1); + ANALYZE a, b, c; +} + +teardown +{ + DROP TABLE l; + DROP TABLE t; + DROP TABLE a; + DROP TABLE b; + DROP TABLE c; + DROP SERVER loopback CASCADE; +} + +session s0 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step s0_update_l { UPDATE l SET i = i + 1; } +step s0_update_a { UPDATE a SET i = i + 1; } +step s0_commit { COMMIT; } + +session s1 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } + +# Test for EPQ with a foreign scan pushing down a qual +step s1_tuplock_l_0 { + EXPLAIN (VERBOSE, COSTS OFF) + SELECT l.* FROM l, ft WHERE l.i = ft.i AND l.i = 123 FOR UPDATE OF l; + SELECT l.* FROM l, ft WHERE l.i = ft.i AND l.i = 123 FOR UPDATE OF l; +} + +# Same test, except that the qual is parameterized +step 
s1_tuplock_l_1 { + EXPLAIN (VERBOSE, COSTS OFF) + SELECT l.* FROM l, ft WHERE l.i = ft.i AND l.v = 'foo' FOR UPDATE OF l; + SELECT l.* FROM l, ft WHERE l.i = ft.i AND l.v = 'foo' FOR UPDATE OF l; +} + +# Test for EPQ with a foreign scan pushing down a join +step s1_tuplock_a_0 { + EXPLAIN (VERBOSE, COSTS OFF) + SELECT a.i FROM a, fb, fc WHERE a.i = fb.i AND fb.i = fc.i FOR UPDATE OF a; + SELECT a.i FROM a, fb, fc WHERE a.i = fb.i AND fb.i = fc.i FOR UPDATE OF a; +} + +# Same test, except that the join is contained in a SubLink sub-select, not +# in the main query +step s1_tuplock_a_1 { + EXPLAIN (VERBOSE, COSTS OFF) + SELECT a.i, + (SELECT 1 FROM fb, fc WHERE a.i = fb.i AND fb.i = fc.i) + FROM a FOR UPDATE; + SELECT a.i, + (SELECT 1 FROM fb, fc WHERE a.i = fb.i AND fb.i = fc.i) + FROM a FOR UPDATE; +} + +step s1_commit { COMMIT; } + +# This test checks the case of rechecking a pushed-down qual. +permutation s0_update_l s1_tuplock_l_0 s0_commit s1_commit + +# This test checks the same case, except that the qual is parameterized. +permutation s0_update_l s1_tuplock_l_1 s0_commit s1_commit + +# This test checks the case of rechecking a pushed-down join. +permutation s0_update_a s1_tuplock_a_0 s0_commit s1_commit + +# This test exercises EvalPlanQual with a SubLink sub-select (which should +# be unaffected by any EPQ recheck behavior in the outer query). +permutation s0_update_a s1_tuplock_a_1 s0_commit s1_commit diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index e534b40de3c76..9a8f9e2813539 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -4,24 +4,17 @@ CREATE EXTENSION postgres_fdw; -CREATE SERVER testserver1 FOREIGN DATA WRAPPER postgres_fdw; -DO $d$ - BEGIN - EXECUTE $$CREATE SERVER loopback FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (dbname '$$||current_database()||$$', - port '$$||current_setting('port')||$$' - )$$; - EXECUTE $$CREATE SERVER loopback2 FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (dbname '$$||current_database()||$$', - port '$$||current_setting('port')||$$' - )$$; - EXECUTE $$CREATE SERVER loopback3 FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (dbname '$$||current_database()||$$', - port '$$||current_setting('port')||$$' - )$$; - END; -$d$; +SELECT current_database() AS current_database, + current_setting('port') AS current_port +\gset +CREATE SERVER testserver1 FOREIGN DATA WRAPPER postgres_fdw; +CREATE SERVER loopback FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (dbname :'current_database', port :'current_port'); +CREATE SERVER loopback2 FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (dbname :'current_database', port :'current_port'); +CREATE SERVER loopback3 FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (dbname :'current_database', port :'current_port'); CREATE USER MAPPING FOR public SERVER testserver1 OPTIONS (user 'value', password 'value'); CREATE USER MAPPING FOR CURRENT_USER SERVER loopback; @@ -233,12 +226,7 @@ ALTER FOREIGN TABLE ft2 ALTER COLUMN c1 OPTIONS (column_name 'C 1'); SELECT c3, c4 FROM ft1 ORDER BY c3, c1 LIMIT 1; -- should work ALTER SERVER loopback OPTIONS (SET dbname 'no such database'); SELECT c3, c4 FROM ft1 ORDER BY c3, c1 LIMIT 1; -- should fail -DO $d$ - BEGIN - EXECUTE $$ALTER SERVER loopback - OPTIONS (SET dbname '$$||current_database()||$$')$$; - END; -$d$; +ALTER SERVER loopback OPTIONS (SET dbname :'current_database'); SELECT c3, c4 FROM ft1 ORDER BY c3, c1 LIMIT 1; -- should work again -- Test that alteration of user mapping options 
causes reconnection @@ -352,7 +340,7 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS NULL; -- Nu EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS NOT NULL; -- NullTest EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE round(abs(c1), 0) = 1; -- FuncExpr EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = -c1; -- OpExpr(l) -EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c1 IS NOT NULL) IS DISTINCT FROM (c1 IS NOT NULL); -- DistinctExpr +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL); -- DistinctExpr EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = ANY(ARRAY[c2, 1, c1 + 0]); -- ScalarArrayOpExpr EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = (ARRAY[c1,c2,3])[1]; -- SubscriptingRef EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c6 = E'foo''s\\bar'; -- check special chars @@ -458,6 +446,15 @@ SELECT * FROM ft1 WHERE CASE c3 WHEN c6 THEN true ELSE c3 < 'bar' END; EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 WHERE CASE c3 COLLATE "C" WHEN c6 THEN true ELSE c3 < 'bar' END; +-- Test array type conversion pushdown +SET plan_cache_mode = force_generic_plan; +PREPARE s(varchar[]) AS SELECT count(*) FROM ft2 WHERE c6 = ANY ($1); +EXPLAIN (VERBOSE, COSTS OFF) +EXECUTE s(ARRAY['1','2']); +EXECUTE s(ARRAY['1','2']); +DEALLOCATE s; +RESET plan_cache_mode; + -- a regconfig constant referring to this text search configuration -- is initially unshippable CREATE TEXT SEARCH CONFIGURATION public.custom_search @@ -2272,6 +2269,84 @@ EXPLAIN (verbose, costs off) DELETE FROM rem1; -- can't be pushed down DROP TRIGGER trig_row_after_delete ON rem1; + +-- We are allowed to create transition-table triggers on both kinds of +-- inheritance even if they contain foreign tables as children, but currently +-- collecting transition tuples from such foreign tables is not supported. + +CREATE TABLE local_tbl (a text, b int); +CREATE FOREIGN TABLE foreign_tbl (a text, b int) + SERVER loopback OPTIONS (table_name 'local_tbl'); + +INSERT INTO foreign_tbl VALUES ('AAA', 42); + +-- Test case for partition hierarchy +CREATE TABLE parent_tbl (a text, b int) PARTITION BY LIST (a); +ALTER TABLE parent_tbl ATTACH PARTITION foreign_tbl FOR VALUES IN ('AAA'); + +CREATE TRIGGER parent_tbl_insert_trig + AFTER INSERT ON parent_tbl REFERENCING NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trigger_func(); +CREATE TRIGGER parent_tbl_update_trig + AFTER UPDATE ON parent_tbl REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trigger_func(); +CREATE TRIGGER parent_tbl_delete_trig + AFTER DELETE ON parent_tbl REFERENCING OLD TABLE AS old_table + FOR EACH STATEMENT EXECUTE PROCEDURE trigger_func(); + +INSERT INTO parent_tbl VALUES ('AAA', 42); + +COPY parent_tbl (a, b) FROM stdin; +AAA 42 +\. + +ALTER SERVER loopback OPTIONS (ADD batch_size '10'); + +INSERT INTO parent_tbl VALUES ('AAA', 42); + +COPY parent_tbl (a, b) FROM stdin; +AAA 42 +\. 
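
The statement-level triggers exercised above rely on transition tables: an AFTER trigger declared with REFERENCING can read every row affected by the triggering statement as if it were an ordinary relation. As a minimal, self-contained sketch of that mechanism — with hypothetical names (demo_tbl, new_rows, dump_new_rows) standing in for the suite's own trigger_func(); the patch itself only verifies that hierarchies containing foreign-table children reject transition-tuple collection:

    -- Sketch only; not part of the patch.
    CREATE TABLE demo_tbl (a text, b int);

    CREATE FUNCTION dump_new_rows() RETURNS trigger
    LANGUAGE plpgsql AS $$
    BEGIN
        -- The transition table is queryable like any relation.
        RAISE NOTICE 'rows inserted: %', (SELECT count(*) FROM new_rows);
        RETURN NULL;  -- return value is ignored for statement-level AFTER triggers
    END;
    $$;

    CREATE TRIGGER demo_tbl_insert_trig
        AFTER INSERT ON demo_tbl
        REFERENCING NEW TABLE AS new_rows
        FOR EACH STATEMENT EXECUTE FUNCTION dump_new_rows();

    INSERT INTO demo_tbl VALUES ('AAA', 42), ('BBB', 43);
    -- NOTICE:  rows inserted: 2
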
+ +ALTER SERVER loopback OPTIONS (DROP batch_size); + +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE parent_tbl SET b = b + 1; +UPDATE parent_tbl SET b = b + 1; + +EXPLAIN (VERBOSE, COSTS OFF) +DELETE FROM parent_tbl; +DELETE FROM parent_tbl; + +ALTER TABLE parent_tbl DETACH PARTITION foreign_tbl; +DROP TABLE parent_tbl; + +-- Test case for non-partition hierarchy +CREATE TABLE parent_tbl (a text, b int); +ALTER FOREIGN TABLE foreign_tbl INHERIT parent_tbl; + +CREATE TRIGGER parent_tbl_update_trig + AFTER UPDATE ON parent_tbl REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trigger_func(); +CREATE TRIGGER parent_tbl_delete_trig + AFTER DELETE ON parent_tbl REFERENCING OLD TABLE AS old_table + FOR EACH STATEMENT EXECUTE PROCEDURE trigger_func(); + +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE parent_tbl SET b = b + 1; +UPDATE parent_tbl SET b = b + 1; + +EXPLAIN (VERBOSE, COSTS OFF) +DELETE FROM parent_tbl; +DELETE FROM parent_tbl; + +ALTER FOREIGN TABLE foreign_tbl NO INHERIT parent_tbl; +DROP TABLE parent_tbl; + +-- Cleanup +DROP FOREIGN TABLE foreign_tbl; +DROP TABLE local_tbl; + -- =================================================================== -- test inheritance features -- =================================================================== @@ -3288,14 +3363,8 @@ SET ROLE regress_nosuper; SHOW is_superuser; -- This will be OK, we can create the FDW -DO $d$ - BEGIN - EXECUTE $$CREATE SERVER loopback_nopw FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (dbname '$$||current_database()||$$', - port '$$||current_setting('port')||$$' - )$$; - END; -$d$; +CREATE SERVER loopback_nopw FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (dbname :'current_database', port :'current_port'); -- But creation of user mappings for non-superusers should fail CREATE USER MAPPING FOR public SERVER loopback_nopw; @@ -3872,6 +3941,9 @@ INSERT INTO result_tbl SELECT * FROM async_pt WHERE b === 505; SELECT * FROM result_tbl ORDER BY a; DELETE FROM result_tbl; +-- Test COPY TO when foreign table is partition +COPY async_pt TO stdout; --error + DROP FOREIGN TABLE async_p3; DROP TABLE base_tbl3; @@ -4278,7 +4350,7 @@ ALTER SERVER loopback2 OPTIONS (DROP parallel_abort); CREATE TABLE analyze_table (id int, a text, b bigint); CREATE FOREIGN TABLE analyze_ftable (id int, a text, b bigint) - SERVER loopback OPTIONS (table_name 'analyze_rtable1'); + SERVER loopback OPTIONS (table_name 'analyze_table'); INSERT INTO analyze_table (SELECT x FROM generate_series(1,1000) x); ANALYZE analyze_table; @@ -4289,19 +4361,19 @@ ANALYZE analyze_table; ALTER SERVER loopback OPTIONS (analyze_sampling 'invalid'); ALTER SERVER loopback OPTIONS (analyze_sampling 'auto'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; ALTER SERVER loopback OPTIONS (SET analyze_sampling 'system'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; ALTER SERVER loopback OPTIONS (SET analyze_sampling 'bernoulli'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; ALTER SERVER loopback OPTIONS (SET analyze_sampling 'random'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; ALTER SERVER loopback OPTIONS (SET analyze_sampling 'off'); -ANALYZE analyze_table; +ANALYZE analyze_ftable; -- cleanup DROP FOREIGN TABLE analyze_ftable; diff --git a/contrib/seg/seg.c b/contrib/seg/seg.c index 151cbb954b9a1..2d3a048c73ecb 100644 --- a/contrib/seg/seg.c +++ b/contrib/seg/seg.c @@ -107,7 +107,7 @@ Datum seg_in(PG_FUNCTION_ARGS) { char *str = PG_GETARG_CSTRING(0); - SEG *result = palloc(sizeof(SEG)); + SEG *result = palloc_object(SEG); yyscan_t 
scanner; seg_scanner_init(str, &scanner); @@ -370,7 +370,7 @@ gseg_picksplit(PG_FUNCTION_ARGS) /* * Emit segments to the left output page, and compute its bounding box. */ - seg_l = (SEG *) palloc(sizeof(SEG)); + seg_l = palloc_object(SEG); memcpy(seg_l, sort_items[0].data, sizeof(SEG)); *left++ = sort_items[0].index; v->spl_nleft++; @@ -388,7 +388,7 @@ gseg_picksplit(PG_FUNCTION_ARGS) /* * Likewise for the right page. */ - seg_r = (SEG *) palloc(sizeof(SEG)); + seg_r = palloc_object(SEG); memcpy(seg_r, sort_items[firstright].data, sizeof(SEG)); *right++ = sort_items[firstright].index; v->spl_nright++; @@ -417,7 +417,7 @@ gseg_same(PG_FUNCTION_ARGS) { bool *result = (bool *) PG_GETARG_POINTER(2); - if (DirectFunctionCall2(seg_same, PG_GETARG_DATUM(0), PG_GETARG_DATUM(1))) + if (DatumGetBool(DirectFunctionCall2(seg_same, PG_GETARG_DATUM(0), PG_GETARG_DATUM(1)))) *result = true; else *result = false; @@ -470,7 +470,7 @@ gseg_leaf_consistent(Datum key, Datum query, StrategyNumber strategy) retval = DirectFunctionCall2(seg_contained, key, query); break; default: - retval = false; + retval = BoolGetDatum(false); } PG_RETURN_DATUM(retval); @@ -632,7 +632,7 @@ seg_union(PG_FUNCTION_ARGS) SEG *b = PG_GETARG_SEG_P(1); SEG *n; - n = (SEG *) palloc(sizeof(*n)); + n = palloc_object(SEG); /* take max of upper endpoints */ if (a->upper > b->upper) @@ -672,7 +672,7 @@ seg_inter(PG_FUNCTION_ARGS) SEG *b = PG_GETARG_SEG_P(1); SEG *n; - n = (SEG *) palloc(sizeof(*n)); + n = palloc_object(SEG); /* take min of upper endpoints */ if (a->upper < b->upper) diff --git a/contrib/seg/segdata.h b/contrib/seg/segdata.h index 4347c31c28e94..7bc7c83dca309 100644 --- a/contrib/seg/segdata.h +++ b/contrib/seg/segdata.h @@ -16,10 +16,7 @@ extern int significant_digits(const char *s); /* for segscan.l and segparse.y */ union YYSTYPE; -#ifndef YY_TYPEDEF_YY_SCANNER_T -#define YY_TYPEDEF_YY_SCANNER_T typedef void *yyscan_t; -#endif /* in segscan.l */ extern int seg_yylex(union YYSTYPE *yylval_param, yyscan_t yyscanner); diff --git a/contrib/sepgsql/.gitignore b/contrib/sepgsql/.gitignore index b1778d05bbd0b..7e240e44c3692 100644 --- a/contrib/sepgsql/.gitignore +++ b/contrib/sepgsql/.gitignore @@ -3,5 +3,7 @@ /sepgsql-regtest.if /sepgsql-regtest.pp /tmp -# Generated by test suite +# Generated subdirectories +/log/ +/results/ /tmp_check/ diff --git a/contrib/sepgsql/database.c b/contrib/sepgsql/database.c index 6eeb429a28c08..c4ed646436990 100644 --- a/contrib/sepgsql/database.c +++ b/contrib/sepgsql/database.c @@ -16,7 +16,6 @@ #include "access/table.h" #include "catalog/dependency.h" #include "catalog/pg_database.h" -#include "commands/dbcommands.h" #include "commands/seclabel.h" #include "sepgsql.h" #include "utils/builtins.h" diff --git a/contrib/sepgsql/expected/ddl.out b/contrib/sepgsql/expected/ddl.out index 7e8deae4f9320..accb903f5cefc 100644 --- a/contrib/sepgsql/expected/ddl.out +++ b/contrib/sepgsql/expected/ddl.out @@ -304,6 +304,8 @@ ALTER TABLE regtest_table_4 ALTER COLUMN y TYPE float; LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="regtest_schema" permissive=0 LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=system_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="public" permissive=0 LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 
tcontext=system_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="pg_catalog" permissive=0 +LINE 1: ALTER TABLE regtest_table_4 ALTER COLUMN y TYPE float; + ^ LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=system_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="pg_catalog" permissive=0 LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="regtest_schema.regtest_table_4.y" permissive=0 LOG: SELinux: allowed { execute } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=system_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="pg_catalog.float8(integer)" permissive=0 @@ -388,7 +390,11 @@ ALTER TABLE regtest_ptable_4 ALTER COLUMN y TYPE float; LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="regtest_schema" permissive=0 LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=system_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="public" permissive=0 LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=system_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="pg_catalog" permissive=0 +LINE 1: ALTER TABLE regtest_ptable_4 ALTER COLUMN y TYPE float; + ^ LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=system_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="pg_catalog" permissive=0 +LINE 1: ALTER TABLE regtest_ptable_4 ALTER COLUMN y TYPE float; + ^ LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=system_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="pg_catalog" permissive=0 LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="regtest_schema.regtest_ptable_4.y" permissive=0 LOG: SELinux: allowed { search } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0 tcontext=system_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="pg_catalog" permissive=0 diff --git a/contrib/sepgsql/label.c b/contrib/sepgsql/label.c index 996ce174454dc..978067e039147 100644 --- a/contrib/sepgsql/label.c +++ b/contrib/sepgsql/label.c @@ -23,7 +23,6 @@ #include "catalog/pg_database.h" #include "catalog/pg_namespace.h" #include "catalog/pg_proc.h" -#include "commands/dbcommands.h" #include "commands/seclabel.h" #include "libpq/auth.h" #include "libpq/libpq-be.h" @@ -146,7 +145,7 @@ sepgsql_set_client_label(const char *new_label) */ oldcxt = MemoryContextSwitchTo(CurTransactionContext); - plabel = palloc0(sizeof(pending_label)); + plabel = palloc0_object(pending_label); plabel->subid = GetCurrentSubTransactionId(); if (new_label) plabel->label = pstrdup(new_label); diff --git a/contrib/sepgsql/uavc.c b/contrib/sepgsql/uavc.c index 65ea8e7946a6e..5e57971bb4dc8 100644 --- a/contrib/sepgsql/uavc.c +++ b/contrib/sepgsql/uavc.c @@ -66,8 +66,8 @@ static char *avc_unlabeled; /* system 'unlabeled' label */ static uint32 sepgsql_avc_hash(const char *scontext, const char *tcontext, uint16 tclass) { - return hash_any((const unsigned char *) scontext, strlen(scontext)) - ^ hash_any((const unsigned char *) tcontext, 
strlen(tcontext)) + return hash_bytes((const unsigned char *) scontext, strlen(scontext)) + ^ hash_bytes((const unsigned char *) tcontext, strlen(tcontext)) ^ tclass; } @@ -257,7 +257,7 @@ sepgsql_avc_compute(const char *scontext, const char *tcontext, uint16 tclass) */ oldctx = MemoryContextSwitchTo(avc_mem_cxt); - cache = palloc0(sizeof(avc_cache)); + cache = palloc0_object(avc_cache); cache->hash = hash; cache->scontext = pstrdup(scontext); diff --git a/contrib/spi/refint.c b/contrib/spi/refint.c index d5e25e07ae9e2..fbbd558ca1eb3 100644 --- a/contrib/spi/refint.c +++ b/contrib/spi/refint.c @@ -321,7 +321,7 @@ check_foreign_key(PG_FUNCTION_ARGS) if (nrefs < 1) /* internal error */ elog(ERROR, "check_foreign_key: %d (< 1) number of references specified", nrefs); - action = tolower((unsigned char) *(args[1])); + action = pg_ascii_tolower((unsigned char) *(args[1])); if (action != 'r' && action != 'c' && action != 's') /* internal error */ elog(ERROR, "check_foreign_key: invalid action %s", args[1]); @@ -651,7 +651,7 @@ find_plan(char *ident, EPlan **eplan, int *nplans) } else { - newp = *eplan = (EPlan *) palloc(sizeof(EPlan)); + newp = *eplan = palloc_object(EPlan); (*nplans) = i = 0; } diff --git a/contrib/sslinfo/sslinfo.c b/contrib/sslinfo/sslinfo.c index da70201119317..2b9eb90b09389 100644 --- a/contrib/sslinfo/sslinfo.c +++ b/contrib/sslinfo/sslinfo.c @@ -374,7 +374,7 @@ ssl_extension_info(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Create a user function context for cross-call persistence */ - fctx = (SSLExtensionInfoContext *) palloc(sizeof(SSLExtensionInfoContext)); + fctx = palloc_object(SSLExtensionInfoContext); /* Construct tuple descriptor */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) diff --git a/contrib/tablefunc/tablefunc.c b/contrib/tablefunc/tablefunc.c index 74afdc0977f47..c40fd36dc9663 100644 --- a/contrib/tablefunc/tablefunc.c +++ b/contrib/tablefunc/tablefunc.c @@ -207,7 +207,7 @@ normal_rand(PG_FUNCTION_ARGS) funcctx->max_calls = num_tuples; /* allocate memory for user context */ - fctx = (normal_rand_fctx *) palloc(sizeof(normal_rand_fctx)); + fctx = palloc_object(normal_rand_fctx); /* * Use fctx to keep track of upper and lower bounds from call to call. 
@@ -766,7 +766,7 @@ load_categories_hash(char *cats_sql, MemoryContext per_query_ctx) SPIcontext = MemoryContextSwitchTo(per_query_ctx); - catdesc = (crosstab_cat_desc *) palloc(sizeof(crosstab_cat_desc)); + catdesc = palloc_object(crosstab_cat_desc); catdesc->catname = catname; catdesc->attidx = i; diff --git a/contrib/tcn/tcn.c b/contrib/tcn/tcn.c index 3158dee0f26a9..9ca47f17caca4 100644 --- a/contrib/tcn/tcn.c +++ b/contrib/tcn/tcn.c @@ -66,12 +66,13 @@ triggered_change_notification(PG_FUNCTION_ARGS) TupleDesc tupdesc; char *channel; char operation; - StringInfo payload = makeStringInfo(); + StringInfoData payload; bool foundPK; List *indexoidlist; ListCell *indexoidscan; + initStringInfo(&payload); /* make sure it's called as a trigger */ if (!CALLED_AS_TRIGGER(fcinfo)) ereport(ERROR, @@ -149,22 +150,22 @@ triggered_change_notification(PG_FUNCTION_ARGS) foundPK = true; - strcpy_quoted(payload, RelationGetRelationName(rel), '"'); - appendStringInfoCharMacro(payload, ','); - appendStringInfoCharMacro(payload, operation); + strcpy_quoted(&payload, RelationGetRelationName(rel), '"'); + appendStringInfoCharMacro(&payload, ','); + appendStringInfoCharMacro(&payload, operation); for (i = 0; i < indnkeyatts; i++) { int colno = index->indkey.values[i]; Form_pg_attribute attr = TupleDescAttr(tupdesc, colno - 1); - appendStringInfoCharMacro(payload, ','); - strcpy_quoted(payload, NameStr(attr->attname), '"'); - appendStringInfoCharMacro(payload, '='); - strcpy_quoted(payload, SPI_getvalue(trigtuple, tupdesc, colno), '\''); + appendStringInfoCharMacro(&payload, ','); + strcpy_quoted(&payload, NameStr(attr->attname), '"'); + appendStringInfoCharMacro(&payload, '='); + strcpy_quoted(&payload, SPI_getvalue(trigtuple, tupdesc, colno), '\''); } - Async_Notify(channel, payload->data); + Async_Notify(channel, payload.data); } ReleaseSysCache(indexTuple); break; diff --git a/contrib/test_decoding/Makefile b/contrib/test_decoding/Makefile index 02e961f4d3144..acbcaed2febfb 100644 --- a/contrib/test_decoding/Makefile +++ b/contrib/test_decoding/Makefile @@ -9,7 +9,7 @@ REGRESS = ddl xact rewrite toast permissions decoding_in_xact \ ISOLATION = mxact delayed_startup ondisk_startup concurrent_ddl_dml \ oldest_xmin snapshot_transfer subxact_without_top concurrent_stream \ twophase_snapshot slot_creation_error catalog_change_snapshot \ - skip_snapshot_restore invalidation_distribution + skip_snapshot_restore invalidation_distribution parallel_session_origin REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/test_decoding/logical.conf ISOLATION_OPTS = --temp-config $(top_srcdir)/contrib/test_decoding/logical.conf diff --git a/contrib/test_decoding/expected/invalidation_distribution.out b/contrib/test_decoding/expected/invalidation_distribution.out index ad0a944cbf303..ae53b1e61de3e 100644 --- a/contrib/test_decoding/expected/invalidation_distribution.out +++ b/contrib/test_decoding/expected/invalidation_distribution.out @@ -1,4 +1,4 @@ -Parsed test spec with 2 sessions +Parsed test spec with 3 sessions starting permutation: s1_insert_tbl1 s1_begin s1_insert_tbl1 s2_alter_pub_add_tbl s1_commit s1_insert_tbl1 s2_get_binary_changes step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); @@ -18,3 +18,24 @@ count stop (1 row) + +starting permutation: s1_begin s1_insert_tbl1 s3_begin s3_insert_tbl1 s2_alter_pub_add_tbl s1_insert_tbl1 s1_commit s3_commit s2_get_binary_changes +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s3_begin: BEGIN; +step s3_insert_tbl1: 
INSERT INTO tbl1 (val1, val2) VALUES (2, 2); +step s2_alter_pub_add_tbl: ALTER PUBLICATION pub ADD TABLE tbl1; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s3_commit: COMMIT; +step s2_get_binary_changes: SELECT count(data) FROM pg_logical_slot_get_binary_changes('isolation_slot', NULL, NULL, 'proto_version', '4', 'publication_names', 'pub') WHERE get_byte(data, 0) = 73; +count +----- + 1 +(1 row) + +?column? +-------- +stop +(1 row) + diff --git a/contrib/test_decoding/expected/parallel_session_origin.out b/contrib/test_decoding/expected/parallel_session_origin.out new file mode 100644 index 0000000000000..e515b39f7ce86 --- /dev/null +++ b/contrib/test_decoding/expected/parallel_session_origin.out @@ -0,0 +1,79 @@ +Parsed test spec with 2 sessions + +starting permutation: s0_setup s0_is_setup s1_setup s1_is_setup s0_add_message s0_store_lsn s1_add_message s1_store_lsn s0_compare s0_reset s1_reset +step s0_setup: SELECT pg_replication_origin_session_setup('origin'); +pg_replication_origin_session_setup +----------------------------------- + +(1 row) + +step s0_is_setup: SELECT pg_replication_origin_session_is_setup(); +pg_replication_origin_session_is_setup +-------------------------------------- +t +(1 row) + +step s1_setup: + SELECT pg_replication_origin_session_setup('origin', pid) + FROM pg_stat_activity + WHERE application_name = 'isolation/parallel_session_origin/s0'; + +pg_replication_origin_session_setup +----------------------------------- + +(1 row) + +step s1_is_setup: SELECT pg_replication_origin_session_is_setup(); +pg_replication_origin_session_is_setup +-------------------------------------- +t +(1 row) + +step s0_add_message: + SELECT 1 + FROM pg_logical_emit_message(true, 'prefix', 'message on s0'); + +?column? +-------- + 1 +(1 row) + +step s0_store_lsn: + INSERT INTO local_lsn_store + SELECT 0, local_lsn FROM pg_replication_origin_status; + +step s1_add_message: + SELECT 1 + FROM pg_logical_emit_message(true, 'prefix', 'message on s1'); + +?column? +-------- + 1 +(1 row) + +step s1_store_lsn: + INSERT INTO local_lsn_store + SELECT 1, local_lsn FROM pg_replication_origin_status; + +step s0_compare: + SELECT s0.lsn < s1.lsn + FROM local_lsn_store as s0, local_lsn_store as s1 + WHERE s0.session = 0 AND s1.session = 1; + +?column? +-------- +t +(1 row) + +step s0_reset: SELECT pg_replication_origin_session_reset(); +pg_replication_origin_session_reset +----------------------------------- + +(1 row) + +step s1_reset: SELECT pg_replication_origin_session_reset(); +pg_replication_origin_session_reset +----------------------------------- + +(1 row) + diff --git a/contrib/test_decoding/expected/replorigin.out b/contrib/test_decoding/expected/replorigin.out index c85e1a01b231c..29a9630c9006b 100644 --- a/contrib/test_decoding/expected/replorigin.out +++ b/contrib/test_decoding/expected/replorigin.out @@ -41,6 +41,9 @@ SELECT pg_replication_origin_create('regress_test_decoding: regression_slot'); SELECT pg_replication_origin_create('regress_test_decoding: regression_slot'); ERROR: duplicate key value violates unique constraint "pg_replication_origin_roname_index" DETAIL: Key (roname)=(regress_test_decoding: regression_slot) already exists. 
+-- ensure inactive origin cannot be set as session one if pid is specified +SELECT pg_replication_origin_session_setup('regress_test_decoding: regression_slot', -1); +ERROR: cannot use PID -1 for inactive replication origin with ID 1 --ensure deletions work (once) SELECT pg_replication_origin_create('regress_test_decoding: temp'); pg_replication_origin_create diff --git a/contrib/test_decoding/expected/stats.out b/contrib/test_decoding/expected/stats.out index de6dc416130a0..a9ead3c41aa31 100644 --- a/contrib/test_decoding/expected/stats.out +++ b/contrib/test_decoding/expected/stats.out @@ -37,12 +37,12 @@ SELECT pg_stat_force_next_flush(); (1 row) -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; - slot_name | spill_txns | spill_count | total_txns | total_bytes -------------------------+------------+-------------+------------+------------- - regression_slot_stats1 | t | t | t | t - regression_slot_stats2 | t | t | t | t - regression_slot_stats3 | t | t | t | t +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; + slot_name | spill_txns | spill_count | total_txns | total_bytes | mem_exceeded_count +------------------------+------------+-------------+------------+-------------+-------------------- + regression_slot_stats1 | t | t | t | t | t + regression_slot_stats2 | t | t | t | t | t + regression_slot_stats3 | t | t | t | t | t (3 rows) RESET logical_decoding_work_mem; @@ -53,12 +53,12 @@ SELECT pg_stat_reset_replication_slot('regression_slot_stats1'); (1 row) -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; - slot_name | spill_txns | spill_count | total_txns | total_bytes -------------------------+------------+-------------+------------+------------- - regression_slot_stats1 | t | t | f | f - regression_slot_stats2 | t | t | t | t - regression_slot_stats3 | t | t | t | t +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; + slot_name | spill_txns | spill_count | total_txns | total_bytes | mem_exceeded_count +------------------------+------------+-------------+------------+-------------+-------------------- + regression_slot_stats1 | t | t | f | f | t + regression_slot_stats2 | t | t | t | t | t + regression_slot_stats3 | t | t | t | t | t (3 rows) -- reset stats for all slots @@ -68,27 +68,27 @@ SELECT pg_stat_reset_replication_slot(NULL); (1 row) -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; - slot_name | spill_txns | spill_count | total_txns | total_bytes -------------------------+------------+-------------+------------+------------- - regression_slot_stats1 | t | t | f | f - regression_slot_stats2 | t | t | f | f - regression_slot_stats3 | t | t | f | f +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS 
total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; + slot_name | spill_txns | spill_count | total_txns | total_bytes | mem_exceeded_count +------------------------+------------+-------------+------------+-------------+-------------------- + regression_slot_stats1 | t | t | f | f | t + regression_slot_stats2 | t | t | f | f | t + regression_slot_stats3 | t | t | f | f | t (3 rows) -- verify accessing/resetting stats for non-existent slot does something reasonable SELECT * FROM pg_stat_get_replication_slot('do-not-exist'); - slot_name | spill_txns | spill_count | spill_bytes | stream_txns | stream_count | stream_bytes | total_txns | total_bytes | stats_reset ---------------+------------+-------------+-------------+-------------+--------------+--------------+------------+-------------+------------- - do-not-exist | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + slot_name | spill_txns | spill_count | spill_bytes | stream_txns | stream_count | stream_bytes | mem_exceeded_count | total_txns | total_bytes | slotsync_skip_count | slotsync_last_skip | stats_reset +--------------+------------+-------------+-------------+-------------+--------------+--------------+--------------------+------------+-------------+---------------------+--------------------+------------- + do-not-exist | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | (1 row) SELECT pg_stat_reset_replication_slot('do-not-exist'); ERROR: replication slot "do-not-exist" does not exist SELECT * FROM pg_stat_get_replication_slot('do-not-exist'); - slot_name | spill_txns | spill_count | spill_bytes | stream_txns | stream_count | stream_bytes | total_txns | total_bytes | stats_reset ---------------+------------+-------------+-------------+-------------+--------------+--------------+------------+-------------+------------- - do-not-exist | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + slot_name | spill_txns | spill_count | spill_bytes | stream_txns | stream_count | stream_bytes | mem_exceeded_count | total_txns | total_bytes | slotsync_skip_count | slotsync_last_skip | stats_reset +--------------+------------+-------------+-------------+-------------+--------------+--------------+--------------------+------------+-------------+---------------------+--------------------+------------- + do-not-exist | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | (1 row) -- spilling the xact @@ -110,12 +110,12 @@ SELECT pg_stat_force_next_flush(); (1 row) -SELECT slot_name, spill_txns > 0 AS spill_txns, spill_count > 0 AS spill_count FROM pg_stat_replication_slots; - slot_name | spill_txns | spill_count -------------------------+------------+------------- - regression_slot_stats1 | t | t - regression_slot_stats2 | f | f - regression_slot_stats3 | f | f +SELECT slot_name, spill_txns > 0 AS spill_txns, spill_count > 0 AS spill_count, mem_exceeded_count > 0 AS mem_exceeded_count FROM pg_stat_replication_slots; + slot_name | spill_txns | spill_count | mem_exceeded_count +------------------------+------------+-------------+-------------------- + regression_slot_stats1 | t | t | t + regression_slot_stats2 | f | f | f + regression_slot_stats3 | f | f | f (3 rows) -- Ensure stats can be repeatedly accessed using the same stats snapshot. See @@ -159,16 +159,19 @@ SELECT count(*) FROM pg_logical_slot_get_changes('regression_slot_stats4_twophas (1 row) -- Verify that the decoding doesn't spill already-aborted transaction's changes. 
+-- Given that there are no concurrent activities that are capturable by logical decoding, +-- mem_exceeded_count should theoretically be 1, but we check for > 0 here since it's +-- more flexible for potential future changes and adequate for testing purposes. SELECT pg_stat_force_next_flush(); pg_stat_force_next_flush -------------------------- (1 row) -SELECT slot_name, spill_txns, spill_count FROM pg_stat_replication_slots WHERE slot_name = 'regression_slot_stats4_twophase'; - slot_name | spill_txns | spill_count ----------------------------------+------------+------------- - regression_slot_stats4_twophase | 0 | 0 +SELECT slot_name, spill_txns, spill_count, mem_exceeded_count > 0 as mem_exceeded_count FROM pg_stat_replication_slots WHERE slot_name = 'regression_slot_stats4_twophase'; + slot_name | spill_txns | spill_count | mem_exceeded_count +---------------------------------+------------+-------------+-------------------- + regression_slot_stats4_twophase | 0 | 0 | t (1 row) DROP TABLE stats_test; diff --git a/contrib/test_decoding/meson.build b/contrib/test_decoding/meson.build index 25f6b8a90826b..99310555e6ce2 100644 --- a/contrib/test_decoding/meson.build +++ b/contrib/test_decoding/meson.build @@ -64,6 +64,7 @@ tests += { 'slot_creation_error', 'skip_snapshot_restore', 'invalidation_distribution', + 'parallel_session_origin', ], 'regress_args': [ '--temp-config', files('logical.conf'), diff --git a/contrib/test_decoding/specs/invalidation_distribution.spec b/contrib/test_decoding/specs/invalidation_distribution.spec index decbed627e327..67d41969ac1d6 100644 --- a/contrib/test_decoding/specs/invalidation_distribution.spec +++ b/contrib/test_decoding/specs/invalidation_distribution.spec @@ -28,5 +28,16 @@ setup { SET synchronous_commit=on; } step "s2_alter_pub_add_tbl" { ALTER PUBLICATION pub ADD TABLE tbl1; } step "s2_get_binary_changes" { SELECT count(data) FROM pg_logical_slot_get_binary_changes('isolation_slot', NULL, NULL, 'proto_version', '4', 'publication_names', 'pub') WHERE get_byte(data, 0) = 73; } +session "s3" +setup { SET synchronous_commit=on; } +step "s3_begin" { BEGIN; } +step "s3_insert_tbl1" { INSERT INTO tbl1 (val1, val2) VALUES (2, 2); } +step "s3_commit" { COMMIT; } + # Expect to get one insert change. LOGICAL_REP_MSG_INSERT = 'I' permutation "s1_insert_tbl1" "s1_begin" "s1_insert_tbl1" "s2_alter_pub_add_tbl" "s1_commit" "s1_insert_tbl1" "s2_get_binary_changes" + +# Expect to get one insert change with LOGICAL_REP_MSG_INSERT = 'I' from +# the second "s1_insert_tbl1" executed after adding the table tbl1 to the +# publication in "s2_alter_pub_add_tbl". +permutation "s1_begin" "s1_insert_tbl1" "s3_begin" "s3_insert_tbl1" "s2_alter_pub_add_tbl" "s1_insert_tbl1" "s1_commit" "s3_commit" "s2_get_binary_changes" diff --git a/contrib/test_decoding/specs/parallel_session_origin.spec b/contrib/test_decoding/specs/parallel_session_origin.spec new file mode 100644 index 0000000000000..c0e5fda07236a --- /dev/null +++ b/contrib/test_decoding/specs/parallel_session_origin.spec @@ -0,0 +1,56 @@ +# Test parallel replication origin manipulations; ensure local_lsn can be +# updated by all attached sessions.
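+#
+# As a sketch of the idea (the session labels below are illustrative, not
+# spec syntax): one backend attaches to an origin normally, and a second
+# backend attaches to the same origin by passing the first backend's PID:
+#
+#   session A: SELECT pg_replication_origin_session_setup('origin');
+#   session B: SELECT pg_replication_origin_session_setup('origin', <pid of A>);
+#
+# Transactions committed by either session then advance the origin's local_lsn.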
+ +setup +{ + SELECT pg_replication_origin_create('origin'); + CREATE UNLOGGED TABLE local_lsn_store (session int, lsn pg_lsn); +} + +teardown +{ + SELECT pg_replication_origin_drop('origin'); + DROP TABLE local_lsn_store; +} + +session "s0" +setup { SET synchronous_commit = on; } +step "s0_setup" { SELECT pg_replication_origin_session_setup('origin'); } +step "s0_is_setup" { SELECT pg_replication_origin_session_is_setup(); } +step "s0_add_message" { + SELECT 1 + FROM pg_logical_emit_message(true, 'prefix', 'message on s0'); +} +step "s0_store_lsn" { + INSERT INTO local_lsn_store + SELECT 0, local_lsn FROM pg_replication_origin_status; +} +step "s0_compare" { + SELECT s0.lsn < s1.lsn + FROM local_lsn_store as s0, local_lsn_store as s1 + WHERE s0.session = 0 AND s1.session = 1; +} +step "s0_reset" { SELECT pg_replication_origin_session_reset(); } + +session "s1" +setup { SET synchronous_commit = on; } +step "s1_setup" { + SELECT pg_replication_origin_session_setup('origin', pid) + FROM pg_stat_activity + WHERE application_name = 'isolation/parallel_session_origin/s0'; +} +step "s1_is_setup" { SELECT pg_replication_origin_session_is_setup(); } +step "s1_add_message" { + SELECT 1 + FROM pg_logical_emit_message(true, 'prefix', 'message on s1'); +} +step "s1_store_lsn" { + INSERT INTO local_lsn_store + SELECT 1, local_lsn FROM pg_replication_origin_status; +} +step "s1_reset" { SELECT pg_replication_origin_session_reset(); } + +# First, s0 attaches to an origin and s1 attaches to the same one. Both sessions +# commit a transaction and store the local_lsn of the replication origin. +# Compare the LSNs and expect the latter transaction (done by s1) to have a larger local_lsn. +permutation "s0_setup" "s0_is_setup" "s1_setup" "s1_is_setup" "s0_add_message" "s0_store_lsn" "s1_add_message" "s1_store_lsn" "s0_compare" "s0_reset" "s1_reset" diff --git a/contrib/test_decoding/sql/replorigin.sql b/contrib/test_decoding/sql/replorigin.sql index e71ee02d050a0..17f2b888238ee 100644 --- a/contrib/test_decoding/sql/replorigin.sql +++ b/contrib/test_decoding/sql/replorigin.sql @@ -26,6 +26,9 @@ SELECT pg_replication_origin_create('regress_test_decoding: regression_slot'); -- ensure duplicate creations fail SELECT pg_replication_origin_create('regress_test_decoding: regression_slot'); +-- ensure inactive origin cannot be set as session one if pid is specified +SELECT pg_replication_origin_session_setup('regress_test_decoding: regression_slot', -1); + --ensure deletions work (once) SELECT pg_replication_origin_create('regress_test_decoding: temp'); SELECT pg_replication_origin_drop('regress_test_decoding: temp'); diff --git a/contrib/test_decoding/sql/stats.sql b/contrib/test_decoding/sql/stats.sql index a022fe1bf0750..6661dbcb85c3a 100644 --- a/contrib/test_decoding/sql/stats.sql +++ b/contrib/test_decoding/sql/stats.sql @@ -15,16 +15,16 @@ SELECT count(*) FROM pg_logical_slot_get_changes('regression_slot_stats1', NULL, SELECT count(*) FROM pg_logical_slot_get_changes('regression_slot_stats2', NULL, NULL, 'skip-empty-xacts', '1'); SELECT count(*) FROM pg_logical_slot_get_changes('regression_slot_stats3', NULL, NULL, 'skip-empty-xacts', '1'); SELECT pg_stat_force_next_flush(); -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0
AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; RESET logical_decoding_work_mem; -- reset stats for one slot, others should be unaffected SELECT pg_stat_reset_replication_slot('regression_slot_stats1'); -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; -- reset stats for all slots SELECT pg_stat_reset_replication_slot(NULL); -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; -- verify accessing/resetting stats for non-existent slot does something reasonable SELECT * FROM pg_stat_get_replication_slot('do-not-exist'); @@ -41,7 +41,7 @@ SELECT count(*) FROM pg_logical_slot_peek_changes('regression_slot_stats1', NULL -- background transaction (say by autovacuum) happens in parallel to the main -- transaction. SELECT pg_stat_force_next_flush(); -SELECT slot_name, spill_txns > 0 AS spill_txns, spill_count > 0 AS spill_count FROM pg_stat_replication_slots; +SELECT slot_name, spill_txns > 0 AS spill_txns, spill_count > 0 AS spill_count, mem_exceeded_count > 0 AS mem_exceeded_count FROM pg_stat_replication_slots; -- Ensure stats can be repeatedly accessed using the same stats snapshot. See -- https://postgr.es/m/20210317230447.c7uc4g3vbs4wi32i%40alap3.anarazel.de @@ -64,8 +64,11 @@ ROLLBACK PREPARED 'test1_abort'; SELECT count(*) FROM pg_logical_slot_get_changes('regression_slot_stats4_twophase', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); -- Verify that the decoding doesn't spill already-aborted transaction's changes. +-- Given that there are no concurrent activities that are capturable by logical decoding, +-- mem_exceeded_count should theoretically be 1, but we check for > 0 here since it's +-- more flexible for potential future changes and adequate for testing purposes.
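+--
+-- (Illustrative sketch only, reusing the slot and the stats_test table
+-- created earlier in this file: the counter is driven by decoding a
+-- transaction larger than logical_decoding_work_mem, e.g.
+--   SET logical_decoding_work_mem = '64kB';
+--   INSERT INTO stats_test SELECT repeat('a', 2000) FROM generate_series(1, 500);
+--   SELECT count(*) FROM pg_logical_slot_peek_changes('regression_slot_stats1', NULL, NULL, 'skip-empty-xacts', '1');
+-- after which mem_exceeded_count for that slot reports nonzero.)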
SELECT pg_stat_force_next_flush(); -SELECT slot_name, spill_txns, spill_count FROM pg_stat_replication_slots WHERE slot_name = 'regression_slot_stats4_twophase'; +SELECT slot_name, spill_txns, spill_count, mem_exceeded_count > 0 as mem_exceeded_count FROM pg_stat_replication_slots WHERE slot_name = 'regression_slot_stats4_twophase'; DROP TABLE stats_test; SELECT pg_drop_replication_slot('regression_slot_stats1'), diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c index bb495563200c3..47094f86f5fe9 100644 --- a/contrib/test_decoding/test_decoding.c +++ b/contrib/test_decoding/test_decoding.c @@ -163,7 +163,7 @@ pg_decode_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, TestDecodingData *data; bool enable_streaming = false; - data = palloc0(sizeof(TestDecodingData)); + data = palloc0_object(TestDecodingData); data->context = AllocSetContextCreate(ctx->context, "text conversion context", ALLOCSET_DEFAULT_SIZES); @@ -340,7 +340,7 @@ pg_decode_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, if (data->include_timestamp) appendStringInfo(ctx->out, " (at %s)", - timestamptz_to_str(txn->xact_time.commit_time)); + timestamptz_to_str(txn->commit_time)); OutputPluginWrite(ctx, true); } @@ -391,7 +391,7 @@ pg_decode_prepare_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, if (data->include_timestamp) appendStringInfo(ctx->out, " (at %s)", - timestamptz_to_str(txn->xact_time.prepare_time)); + timestamptz_to_str(txn->prepare_time)); OutputPluginWrite(ctx, true); } @@ -413,7 +413,7 @@ pg_decode_commit_prepared_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn if (data->include_timestamp) appendStringInfo(ctx->out, " (at %s)", - timestamptz_to_str(txn->xact_time.commit_time)); + timestamptz_to_str(txn->commit_time)); OutputPluginWrite(ctx, true); } @@ -437,7 +437,7 @@ pg_decode_rollback_prepared_txn(LogicalDecodingContext *ctx, if (data->include_timestamp) appendStringInfo(ctx->out, " (at %s)", - timestamptz_to_str(txn->xact_time.commit_time)); + timestamptz_to_str(txn->commit_time)); OutputPluginWrite(ctx, true); } @@ -581,7 +581,7 @@ tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple, bool skip_ /* print data */ if (isnull) appendStringInfoString(s, "null"); - else if (typisvarlena && VARATT_IS_EXTERNAL_ONDISK(origval)) + else if (typisvarlena && VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(origval))) appendStringInfoString(s, "unchanged-toast-datum"); else if (!typisvarlena) print_literal(s, typid, @@ -874,7 +874,7 @@ pg_decode_stream_prepare(LogicalDecodingContext *ctx, if (data->include_timestamp) appendStringInfo(ctx->out, " (at %s)", - timestamptz_to_str(txn->xact_time.prepare_time)); + timestamptz_to_str(txn->prepare_time)); OutputPluginWrite(ctx, true); } @@ -903,7 +903,7 @@ pg_decode_stream_commit(LogicalDecodingContext *ctx, if (data->include_timestamp) appendStringInfo(ctx->out, " (at %s)", - timestamptz_to_str(txn->xact_time.commit_time)); + timestamptz_to_str(txn->commit_time)); OutputPluginWrite(ctx, true); } diff --git a/contrib/tsm_system_rows/tsm_system_rows.c b/contrib/tsm_system_rows/tsm_system_rows.c index f401efa2131fc..ef145fa6747a7 100644 --- a/contrib/tsm_system_rows/tsm_system_rows.c +++ b/contrib/tsm_system_rows/tsm_system_rows.c @@ -163,7 +163,7 @@ system_rows_samplescangetsamplesize(PlannerInfo *root, static void system_rows_initsamplescan(SampleScanState *node, int eflags) { - node->tsm_state = palloc0(sizeof(SystemRowsSamplerData)); + node->tsm_state = 
palloc0_object(SystemRowsSamplerData); /* Note the above leaves tsm_state->step equal to zero */ } diff --git a/contrib/tsm_system_time/tsm_system_time.c b/contrib/tsm_system_time/tsm_system_time.c index c9c71d8c3af39..1041258ea1a08 100644 --- a/contrib/tsm_system_time/tsm_system_time.c +++ b/contrib/tsm_system_time/tsm_system_time.c @@ -179,7 +179,7 @@ system_time_samplescangetsamplesize(PlannerInfo *root, static void system_time_initsamplescan(SampleScanState *node, int eflags) { - node->tsm_state = palloc0(sizeof(SystemTimeSamplerData)); + node->tsm_state = palloc0_object(SystemTimeSamplerData); /* Note the above leaves tsm_state->step equal to zero */ } diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index 336ba31047a4a..68251660887cf 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -60,7 +60,7 @@ placeChar(TrieChar *node, const unsigned char *str, int lenstr, TrieChar *curnode; if (!node) - node = (TrieChar *) palloc0(sizeof(TrieChar) * 256); + node = palloc0_array(TrieChar, 256); Assert(lenstr > 0); /* else str[0] doesn't exist */ @@ -239,7 +239,7 @@ initTrie(const char *filename) if (trgquoted && state > 0) { /* Ignore first and end quotes */ - trgstore = (char *) palloc(sizeof(char) * (trglen - 2)); + trgstore = palloc_array(char, trglen - 2); trgstorelen = 0; for (int i = 1; i < trglen - 1; i++) { @@ -252,7 +252,7 @@ initTrie(const char *filename) } else { - trgstore = (char *) palloc(sizeof(char) * trglen); + trgstore = palloc_array(char, trglen); trgstorelen = trglen; memcpy(trgstore, trg, trgstorelen); } @@ -421,7 +421,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) /* return a result only if we made at least one substitution */ if (buf.data != NULL) { - res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2); + res = palloc0_array(TSLexeme, 2); res->lexeme = buf.data; res->flags = TSL_FILTER; } diff --git a/contrib/xml2/xpath.c b/contrib/xml2/xpath.c index 23d3f332dbaa7..662d7d02f27b3 100644 --- a/contrib/xml2/xpath.c +++ b/contrib/xml2/xpath.c @@ -51,8 +51,8 @@ static text *pgxml_result_to_text(xmlXPathObjectPtr res, xmlChar *toptag, static xmlChar *pgxml_texttoxmlchar(text *textstring); -static xmlXPathObjectPtr pgxml_xpath(text *document, xmlChar *xpath, - xpath_workspace *workspace); +static xpath_workspace *pgxml_xpath(text *document, xmlChar *xpath, + PgXmlErrorContext *xmlerrcxt); static void cleanup_workspace(xpath_workspace *workspace); @@ -88,19 +88,41 @@ Datum xml_encode_special_chars(PG_FUNCTION_ARGS) { text *tin = PG_GETARG_TEXT_PP(0); - text *tout; - xmlChar *ts, - *tt; + text *volatile tout = NULL; + xmlChar *volatile tt = NULL; + PgXmlErrorContext *xmlerrcxt; + + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); + + PG_TRY(); + { + xmlChar *ts; - ts = pgxml_texttoxmlchar(tin); + ts = pgxml_texttoxmlchar(tin); + + tt = xmlEncodeSpecialChars(NULL, ts); + if (tt == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlChar"); + pfree(ts); + + tout = cstring_to_text((char *) tt); + } + PG_CATCH(); + { + if (tt != NULL) + xmlFree(tt); - tt = xmlEncodeSpecialChars(NULL, ts); + pg_xml_done(xmlerrcxt, true); - pfree(ts); + PG_RE_THROW(); + } + PG_END_TRY(); - tout = cstring_to_text((char *) tt); + if (tt != NULL) + xmlFree(tt); - xmlFree(tt); + pg_xml_done(xmlerrcxt, false); PG_RETURN_TEXT_P(tout); } @@ -122,62 +144,89 @@ pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlChar *septagname, xmlChar *plainsep) { - xmlBufferPtr buf; - xmlChar *result; - int i; + volatile 
xmlBufferPtr buf = NULL; + xmlChar *volatile result = NULL; + PgXmlErrorContext *xmlerrcxt; - buf = xmlBufferCreate(); + /* spin up some error handling */ + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); - if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) - { - xmlBufferWriteChar(buf, "<"); - xmlBufferWriteCHAR(buf, toptagname); - xmlBufferWriteChar(buf, ">"); - } - if (nodeset != NULL) + PG_TRY(); { - for (i = 0; i < nodeset->nodeNr; i++) - { - if (plainsep != NULL) - { - xmlBufferWriteCHAR(buf, - xmlXPathCastNodeToString(nodeset->nodeTab[i])); + buf = xmlBufferCreate(); - /* If this isn't the last entry, write the plain sep. */ - if (i < (nodeset->nodeNr) - 1) - xmlBufferWriteChar(buf, (char *) plainsep); - } - else + if (buf == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlBuffer"); + + if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) + { + xmlBufferWriteChar(buf, "<"); + xmlBufferWriteCHAR(buf, toptagname); + xmlBufferWriteChar(buf, ">"); + } + if (nodeset != NULL) + { + for (int i = 0; i < nodeset->nodeNr; i++) { - if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + if (plainsep != NULL) { - xmlBufferWriteChar(buf, "<"); - xmlBufferWriteCHAR(buf, septagname); - xmlBufferWriteChar(buf, ">"); - } - xmlNodeDump(buf, - nodeset->nodeTab[i]->doc, - nodeset->nodeTab[i], - 1, 0); + xmlBufferWriteCHAR(buf, + xmlXPathCastNodeToString(nodeset->nodeTab[i])); - if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + /* If this isn't the last entry, write the plain sep. */ + if (i < (nodeset->nodeNr) - 1) + xmlBufferWriteChar(buf, (char *) plainsep); + } + else { - xmlBufferWriteChar(buf, ""); + if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + { + xmlBufferWriteChar(buf, "<"); + xmlBufferWriteCHAR(buf, septagname); + xmlBufferWriteChar(buf, ">"); + } + xmlNodeDump(buf, + nodeset->nodeTab[i]->doc, + nodeset->nodeTab[i], + 1, 0); + + if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + { + xmlBufferWriteChar(buf, ""); + } } } } - } - if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) + if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) + { + xmlBufferWriteChar(buf, ""); + } + + result = xmlStrdup(xmlBufferContent(buf)); + if (result == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); + } + PG_CATCH(); { - xmlBufferWriteChar(buf, ""); + if (buf) + xmlBufferFree(buf); + + pg_xml_done(xmlerrcxt, true); + + PG_RE_THROW(); } - result = xmlStrdup(buf->content); + PG_END_TRY(); + xmlBufferFree(buf); + pg_xml_done(xmlerrcxt, false); + return result; } @@ -207,17 +256,30 @@ xpath_nodeset(PG_FUNCTION_ARGS) xmlChar *toptag = pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(2)); xmlChar *septag = pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(3)); xmlChar *xpath; - text *xpres; - xmlXPathObjectPtr res; - xpath_workspace workspace; + text *volatile xpres = NULL; + xpath_workspace *volatile workspace = NULL; + PgXmlErrorContext *xmlerrcxt; xpath = pgxml_texttoxmlchar(xpathsupp); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - res = pgxml_xpath(document, xpath, &workspace); + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + xpres = pgxml_result_to_text(workspace->res, toptag, septag, NULL); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - xpres = pgxml_result_to_text(res, toptag, septag, NULL); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - 
cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); pfree(xpath); @@ -239,17 +301,30 @@ xpath_list(PG_FUNCTION_ARGS) text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */ xmlChar *plainsep = pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(2)); xmlChar *xpath; - text *xpres; - xmlXPathObjectPtr res; - xpath_workspace workspace; + text *volatile xpres = NULL; + xpath_workspace *volatile workspace = NULL; + PgXmlErrorContext *xmlerrcxt; xpath = pgxml_texttoxmlchar(xpathsupp); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - res = pgxml_xpath(document, xpath, &workspace); + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + xpres = pgxml_result_to_text(workspace->res, NULL, NULL, plainsep); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - xpres = pgxml_result_to_text(res, NULL, NULL, plainsep); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); pfree(xpath); @@ -268,9 +343,9 @@ xpath_string(PG_FUNCTION_ARGS) text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */ xmlChar *xpath; int32 pathsize; - text *xpres; - xmlXPathObjectPtr res; - xpath_workspace workspace; + text *volatile xpres = NULL; + xpath_workspace *volatile workspace = NULL; + PgXmlErrorContext *xmlerrcxt; pathsize = VARSIZE_ANY_EXHDR(xpathsupp); @@ -286,11 +361,25 @@ xpath_string(PG_FUNCTION_ARGS) xpath[pathsize + 7] = ')'; xpath[pathsize + 8] = '\0'; - res = pgxml_xpath(document, xpath, &workspace); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); + + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + xpres = pgxml_result_to_text(workspace->res, NULL, NULL, NULL); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - xpres = pgxml_result_to_text(res, NULL, NULL, NULL); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); pfree(xpath); @@ -308,24 +397,38 @@ xpath_number(PG_FUNCTION_ARGS) text *document = PG_GETARG_TEXT_PP(0); text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */ xmlChar *xpath; - float4 fRes; - xmlXPathObjectPtr res; - xpath_workspace workspace; + volatile float4 fRes = 0.0; + volatile bool isNull = false; + xpath_workspace *volatile workspace = NULL; + PgXmlErrorContext *xmlerrcxt; xpath = pgxml_texttoxmlchar(xpathsupp); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - res = pgxml_xpath(document, xpath, &workspace); - - pfree(xpath); + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + pfree(xpath); - if (res == NULL) - PG_RETURN_NULL(); + if (workspace->res == NULL) + isNull = true; + else + fRes = xmlXPathCastToNumber(workspace->res); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - fRes = xmlXPathCastToNumber(res); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); - if (xmlXPathIsNaN(fRes)) + if (isNull || xmlXPathIsNaN(fRes)) PG_RETURN_NULL(); PG_RETURN_FLOAT4(fRes); @@ -340,22 +443,35 @@ xpath_bool(PG_FUNCTION_ARGS) text *document = PG_GETARG_TEXT_PP(0); text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */ xmlChar *xpath; - int bRes; - xmlXPathObjectPtr res; - xpath_workspace workspace; + volatile int bRes = 0; + xpath_workspace *volatile 
workspace = NULL; + PgXmlErrorContext *xmlerrcxt; xpath = pgxml_texttoxmlchar(xpathsupp); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - res = pgxml_xpath(document, xpath, &workspace); - - pfree(xpath); + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + pfree(xpath); - if (res == NULL) - PG_RETURN_BOOL(false); + if (workspace->res == NULL) + bRes = 0; + else + bRes = xmlXPathCastToBoolean(workspace->res); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - bRes = xmlXPathCastToBoolean(res); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); PG_RETURN_BOOL(bRes); } @@ -364,57 +480,38 @@ xpath_bool(PG_FUNCTION_ARGS) /* Core function to evaluate XPath query */ -static xmlXPathObjectPtr -pgxml_xpath(text *document, xmlChar *xpath, xpath_workspace *workspace) +static xpath_workspace * +pgxml_xpath(text *document, xmlChar *xpath, PgXmlErrorContext *xmlerrcxt) { int32 docsize = VARSIZE_ANY_EXHDR(document); - PgXmlErrorContext *xmlerrcxt; xmlXPathCompExprPtr comppath; + xpath_workspace *workspace = palloc0_object(xpath_workspace); workspace->doctree = NULL; workspace->ctxt = NULL; workspace->res = NULL; - xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - - PG_TRY(); + workspace->doctree = xmlReadMemory((char *) VARDATA_ANY(document), + docsize, NULL, NULL, + XML_PARSE_NOENT); + if (workspace->doctree != NULL) { - workspace->doctree = xmlReadMemory((char *) VARDATA_ANY(document), - docsize, NULL, NULL, - XML_PARSE_NOENT); - if (workspace->doctree != NULL) - { - workspace->ctxt = xmlXPathNewContext(workspace->doctree); - workspace->ctxt->node = xmlDocGetRootElement(workspace->doctree); - - /* compile the path */ - comppath = xmlXPathCtxtCompile(workspace->ctxt, xpath); - if (comppath == NULL) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, - "XPath Syntax Error"); + workspace->ctxt = xmlXPathNewContext(workspace->doctree); + workspace->ctxt->node = xmlDocGetRootElement(workspace->doctree); - /* Now evaluate the path expression. */ - workspace->res = xmlXPathCompiledEval(comppath, workspace->ctxt); + /* compile the path */ + comppath = xmlXPathCtxtCompile(workspace->ctxt, xpath); + if (comppath == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, + "XPath Syntax Error"); - xmlXPathFreeCompExpr(comppath); - } - } - PG_CATCH(); - { - cleanup_workspace(workspace); - - pg_xml_done(xmlerrcxt, true); + /* Now evaluate the path expression. 
*/ + workspace->res = xmlXPathCompiledEval(comppath, workspace->ctxt); - PG_RE_THROW(); + xmlXPathFreeCompExpr(comppath); } - PG_END_TRY(); - if (workspace->res == NULL) - cleanup_workspace(workspace); - - pg_xml_done(xmlerrcxt, false); - - return workspace->res; + return workspace; } /* Clean up after processing the result of pgxml_xpath() */ @@ -438,35 +535,60 @@ pgxml_result_to_text(xmlXPathObjectPtr res, xmlChar *septag, xmlChar *plainsep) { - xmlChar *xpresstr; - text *xpres; + xmlChar *volatile xpresstr = NULL; + text *volatile xpres = NULL; + PgXmlErrorContext *xmlerrcxt; if (res == NULL) return NULL; - switch (res->type) - { - case XPATH_NODESET: - xpresstr = pgxmlNodeSetToText(res->nodesetval, - toptag, - septag, plainsep); - break; + /* spin some error handling */ + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); - case XPATH_STRING: - xpresstr = xmlStrdup(res->stringval); - break; + PG_TRY(); + { + switch (res->type) + { + case XPATH_NODESET: + xpresstr = pgxmlNodeSetToText(res->nodesetval, + toptag, + septag, plainsep); + break; + + case XPATH_STRING: + xpresstr = xmlStrdup(res->stringval); + if (xpresstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); + break; + + default: + elog(NOTICE, "unsupported XQuery result: %d", res->type); + xpresstr = xmlStrdup((const xmlChar *) ""); + if (xpresstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); + } - default: - elog(NOTICE, "unsupported XQuery result: %d", res->type); - xpresstr = xmlStrdup((const xmlChar *) ""); + /* Now convert this result back to text */ + xpres = cstring_to_text((char *) xpresstr); } + PG_CATCH(); + { + if (xpresstr != NULL) + xmlFree(xpresstr); - /* Now convert this result back to text */ - xpres = cstring_to_text((char *) xpresstr); + pg_xml_done(xmlerrcxt, true); + + PG_RE_THROW(); + } + PG_END_TRY(); /* Free various storage */ xmlFree(xpresstr); + pg_xml_done(xmlerrcxt, false); + return xpres; } @@ -648,11 +770,16 @@ xpath_table(PG_FUNCTION_ARGS) for (j = 0; j < numpaths; j++) { ctxt = xmlXPathNewContext(doctree); + if (ctxt == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, + ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate XPath context"); + ctxt->node = xmlDocGetRootElement(doctree); /* compile the path */ comppath = xmlXPathCtxtCompile(ctxt, xpaths[j]); - if (comppath == NULL) + if (comppath == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, "XPath Syntax Error"); @@ -671,6 +798,10 @@ xpath_table(PG_FUNCTION_ARGS) rownr < res->nodesetval->nodeNr) { resstr = xmlXPathCastNodeToString(res->nodesetval->nodeTab[rownr]); + if (resstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, + ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); had_values = true; } else @@ -680,11 +811,19 @@ xpath_table(PG_FUNCTION_ARGS) case XPATH_STRING: resstr = xmlStrdup(res->stringval); + if (resstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, + ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); break; default: elog(NOTICE, "unsupported XQuery result: %d", res->type); resstr = xmlStrdup((const xmlChar *) ""); + if (resstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, + ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); } /* diff --git a/contrib/xml2/xslt_proc.c 
b/contrib/xml2/xslt_proc.c index b720d89f754ae..2be87bec0cdf7 100644 --- a/contrib/xml2/xslt_proc.c +++ b/contrib/xml2/xslt_proc.c @@ -10,6 +10,7 @@ #include "fmgr.h" #include "utils/builtins.h" #include "utils/xml.h" +#include "varatt.h" #ifdef USE_LIBXSLT @@ -48,7 +49,7 @@ xslt_process(PG_FUNCTION_ARGS) text *doct = PG_GETARG_TEXT_PP(0); text *ssheet = PG_GETARG_TEXT_PP(1); - text *result; + text *volatile result = NULL; text *paramstr; const char **params; PgXmlErrorContext *xmlerrcxt; @@ -58,8 +59,7 @@ xslt_process(PG_FUNCTION_ARGS) volatile xsltSecurityPrefsPtr xslt_sec_prefs = NULL; volatile xsltTransformContextPtr xslt_ctxt = NULL; volatile int resstat = -1; - xmlChar *resstr = NULL; - int reslen = 0; + xmlChar *volatile resstr = NULL; if (fcinfo->nargs == 3) { @@ -69,7 +69,7 @@ xslt_process(PG_FUNCTION_ARGS) else { /* No parameters */ - params = (const char **) palloc(sizeof(char *)); + params = palloc_object(const char *); params[0] = NULL; } @@ -80,13 +80,14 @@ xslt_process(PG_FUNCTION_ARGS) { xmlDocPtr ssdoc; bool xslt_sec_prefs_error; + int reslen = 0; /* Parse document */ doctree = xmlReadMemory((char *) VARDATA_ANY(doct), VARSIZE_ANY_EXHDR(doct), NULL, NULL, XML_PARSE_NOENT); - if (doctree == NULL) + if (doctree == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "error parsing XML document"); @@ -95,14 +96,14 @@ xslt_process(PG_FUNCTION_ARGS) VARSIZE_ANY_EXHDR(ssheet), NULL, NULL, XML_PARSE_NOENT); - if (ssdoc == NULL) + if (ssdoc == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "error parsing stylesheet as XML document"); /* After this call we need not free ssdoc separately */ stylesheet = xsltParseStylesheetDoc(ssdoc); - if (stylesheet == NULL) + if (stylesheet == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, "failed to parse stylesheet"); @@ -137,11 +138,15 @@ xslt_process(PG_FUNCTION_ARGS) restree = xsltApplyStylesheetUser(stylesheet, doctree, params, NULL, NULL, xslt_ctxt); - if (restree == NULL) + if (restree == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, "failed to apply stylesheet"); - resstat = xsltSaveResultToString(&resstr, &reslen, restree, stylesheet); + resstat = xsltSaveResultToString((xmlChar **) &resstr, &reslen, + restree, stylesheet); + + if (resstat >= 0) + result = cstring_to_text_with_len((char *) resstr, reslen); } PG_CATCH(); { @@ -155,6 +160,8 @@ xslt_process(PG_FUNCTION_ARGS) xsltFreeStylesheet(stylesheet); if (doctree != NULL) xmlFreeDoc(doctree); + if (resstr != NULL) + xmlFree(resstr); xsltCleanupGlobals(); pg_xml_done(xmlerrcxt, true); @@ -170,17 +177,15 @@ xslt_process(PG_FUNCTION_ARGS) xmlFreeDoc(doctree); xsltCleanupGlobals(); + if (resstr) + xmlFree(resstr); + pg_xml_done(xmlerrcxt, false); /* XXX this is pretty dubious, really ought to throw error instead */ if (resstat < 0) PG_RETURN_NULL(); - result = cstring_to_text_with_len((char *) resstr, reslen); - - if (resstr) - xmlFree(resstr); - PG_RETURN_TEXT_P(result); #else /* !USE_LIBXSLT */ diff --git a/doc/src/sgml/Makefile b/doc/src/sgml/Makefile index 11aac91381258..b53b2694a6b7e 100644 --- a/doc/src/sgml/Makefile +++ b/doc/src/sgml/Makefile @@ -59,7 +59,7 @@ GENERATED_SGML = version.sgml \ features-supported.sgml features-unsupported.sgml errcodes-table.sgml \ keywords-table.sgml targets-meson.sgml wait_event_types.sgml -ALL_SGML := $(wildcard 
$(srcdir)/*.sgml $(srcdir)/ref/*.sgml) $(GENERATED_SGML) +ALL_SGML := $(wildcard $(srcdir)/*.sgml $(srcdir)/func/*.sgml $(srcdir)/ref/*.sgml) $(GENERATED_SGML) ALL_IMAGES := $(wildcard $(srcdir)/images/*.svg) @@ -263,14 +263,14 @@ endif # sqlmansectnum != 7 # tabs are harmless, but it is best to avoid them in SGML files check-tabs: - @( ! grep ' ' $(wildcard $(srcdir)/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl) ) || \ + @( ! grep ' ' $(wildcard $(srcdir)/*.sgml $(srcdir)/func/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl) ) || \ (echo "Tabs appear in SGML/XML files" 1>&2; exit 1) # Non-breaking spaces are harmless, but it is best to avoid them in SGML files. # Use perl command because non-GNU grep or sed could not have hex escape sequence. check-nbsp: @ ( $(PERL) -ne '/\xC2\xA0/ and print("$$ARGV:$$_"),$$n++; END {exit($$n>0)}' \ - $(wildcard $(srcdir)/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl $(srcdir)/images/*.xsl) ) || \ + $(wildcard $(srcdir)/*.sgml $(srcdir)/func/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl $(srcdir)/images/*.xsl) ) || \ (echo "Non-breaking spaces appear in SGML/XML files" 1>&2; exit 1) ## diff --git a/doc/src/sgml/advanced.sgml b/doc/src/sgml/advanced.sgml index e15a3323dfbfd..451bcb202ec6f 100644 --- a/doc/src/sgml/advanced.sgml +++ b/doc/src/sgml/advanced.sgml @@ -80,18 +80,18 @@ SELECT * FROM myview; - Recall the weather and - cities tables from weather and + cities tables from . Consider the following problem: You want to make sure that no one can insert rows in the - weather table that do not have a matching - entry in the cities table. This is called + weather table that do not have a matching + entry in the cities table. This is called maintaining the referential integrity of your data. In simplistic database systems this would be implemented (if at all) by first looking at the - cities table to check if a matching record + cities table to check if a matching record exists, and then inserting or rejecting the new - weather records. This approach has a + weather records. This approach has a number of problems and is very inconvenient, so PostgreSQL can do this for you. @@ -101,12 +101,12 @@ SELECT * FROM myview; CREATE TABLE cities ( - name varchar(80) primary key, + name varchar(80) PRIMARY KEY, location point ); CREATE TABLE weather ( - city varchar(80) references cities(name), + city varchar(80) REFERENCES cities (name), temp_lo int, temp_hi int, prcp real, @@ -578,8 +578,8 @@ SELECT sum(salary) OVER w, avg(salary) OVER w - Let's create two tables: A table cities - and a table capitals. Naturally, capitals + Let's create two tables: A table cities + and a table capitals. Naturally, capitals are also cities, so you want some way to show the capitals implicitly when you list all cities. If you're really clever you might invent some scheme like this: @@ -625,14 +625,14 @@ CREATE TABLE capitals ( - In this case, a row of capitals + In this case, a row of capitals inherits all columns (name, population, and elevation) from its - parent, cities. The + parent, cities. The type of the column name is text, a native PostgreSQL type for variable length character strings. The - capitals table has + capitals table has an additional column, state, which shows its state abbreviation. In PostgreSQL, a table can inherit from @@ -685,8 +685,8 @@ SELECT name, elevation Here the ONLY before cities indicates that the query should be run over only the - cities table, and not tables below - cities in the inheritance hierarchy. 
Many + cities table, and not tables below + cities in the inheritance hierarchy. Many of the commands that we have already discussed — SELECT, UPDATE, and DELETE — support this ONLY diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 211a0ae1945bb..08006856579ad 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -278,8 +278,8 @@ SET client_min_messages = DEBUG1; TOAST table. - This option is known to be slow. Also, if the toast table or its - index is corrupt, checking it against toast values could conceivably + This option is known to be slow. Also, if the TOAST table or its + index is corrupt, checking it against TOAST values could conceivably crash the server, although in many cases this would just produce an error. @@ -382,7 +382,7 @@ SET client_min_messages = DEBUG1; verification functions is true, an additional phase of verification is performed against the table associated with the target index relation. This consists of a dummy - CREATE INDEX operation, which checks for the + CREATE INDEX CONCURRENTLY operation, which checks for the presence of all hypothetical new index tuples against a temporary, in-memory summarizing structure (this is built when needed during the basic first phase of verification). The summarizing structure diff --git a/doc/src/sgml/arch-dev.sgml b/doc/src/sgml/arch-dev.sgml index 976db1e599984..06b6e2a849356 100644 --- a/doc/src/sgml/arch-dev.sgml +++ b/doc/src/sgml/arch-dev.sgml @@ -445,7 +445,7 @@ join sequence. The planner preferentially considers joins between any two relations for which there exists a corresponding join clause in the WHERE qualification (i.e., for - which a restriction like where rel1.attr1=rel2.attr2 + which a restriction like WHERE rel1.attr1 = rel2.attr2 exists). Join pairs with no join clause are considered only when there is no other choice, that is, a particular relation has no available join clauses to any other relation. All possible plans are generated for diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index 25b8904baf7cd..168444eccc570 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -627,7 +627,7 @@ tar -cf backup.tar /usr/local/pgsql/data character in the command. The simplest useful command is something like: -archive_command = 'test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' # Unix +archive_command = 'test ! -f "/mnt/server/archivedir/%f" && cp "%p" "/mnt/server/archivedir/%f"' # Unix archive_command = 'copy "%p" "C:\\server\\archivedir\\%f"' # Windows which will copy archivable WAL segments to the directory @@ -991,7 +991,7 @@ SELECT pg_backup_start(label => 'label', fast => false); usually preferable as it minimizes the impact on the running system. If you want to start the backup as soon as possible, pass true as the second parameter to pg_backup_start and it will - request an immediate checkpoint, which will finish as fast as possible using + request a fast checkpoint, which will finish as fast as possible using as much I/O as possible. @@ -1294,7 +1294,7 @@ SELECT * FROM pg_backup_stop(wait_for_archive => true); character in the command. The simplest useful command is something like: -restore_command = 'cp /mnt/server/archivedir/%f %p' +restore_command = 'cp "/mnt/server/archivedir/%f" "%p"' which will copy previously archived WAL segments from the directory /mnt/server/archivedir. 
Of course, you can use something @@ -1493,11 +1493,11 @@ restore_command = 'cp /mnt/server/archivedir/%f %p' If archive storage size is a concern, you can use gzip to compress the archive files: -archive_command = 'gzip < %p > /mnt/server/archivedir/%f.gz' +archive_command = 'gzip < "%p" > "/mnt/server/archivedir/%f.gz"' You will then need to use gunzip during recovery: -restore_command = 'gunzip < /mnt/server/archivedir/%f.gz > %p' +restore_command = 'gunzip < "/mnt/server/archivedir/%f.gz" > "%p"' diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml index 3cd5bee7ffaf4..53a982bf60d29 100644 --- a/doc/src/sgml/bki.sgml +++ b/doc/src/sgml/bki.sgml @@ -1042,7 +1042,7 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat - Define indexes and toast tables. + Define indexes and TOAST tables. diff --git a/doc/src/sgml/bloom.sgml b/doc/src/sgml/bloom.sgml index ec5d077679b14..3f6d38f377b7a 100644 --- a/doc/src/sgml/bloom.sgml +++ b/doc/src/sgml/bloom.sgml @@ -101,12 +101,12 @@ CREATE INDEX bloomidx ON tbloom USING bloom (i1,i2,i3) =# CREATE TABLE tbloom AS SELECT - (random() * 1000000)::int as i1, - (random() * 1000000)::int as i2, - (random() * 1000000)::int as i3, - (random() * 1000000)::int as i4, - (random() * 1000000)::int as i5, - (random() * 1000000)::int as i6 + (random() * 1000000)::int AS i1, + (random() * 1000000)::int AS i2, + (random() * 1000000)::int AS i3, + (random() * 1000000)::int AS i4, + (random() * 1000000)::int AS i5, + (random() * 1000000)::int AS i6 FROM generate_series(1,10000000); SELECT 10000000 diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index cbd4e40a320b3..2fc6344298026 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -596,7 +596,10 @@ Approximate average size (in bytes) of the transition state - data, or zero to use a default estimate + data. A positive value provides an estimate; zero means to + use a default estimate. A negative value indicates the state + data can grow unboundedly in size, such as when the aggregate + accumulates input rows (e.g., array_agg, string_agg). @@ -1582,7 +1585,7 @@ rolpassword text - Password (possibly encrypted); null if none. The format depends + Encrypted password; null if none. The format depends on the form of encryption used. @@ -1627,11 +1630,6 @@ SCRAM-SHA-256$<iteration count>:&l ServerKey are in Base64 encoded format. This format is the same as that specified by RFC 5803. - - - A password that does not follow either of those formats is assumed to be - unencrypted. - @@ -1956,7 +1954,7 @@ SCRAM-SHA-256$<iteration count>:&l The OID of the data type that corresponds to this table's row type, - if any; zero for indexes, sequences, and toast tables, which have + if any; zero for indexes, sequences, and TOAST tables, which have no pg_type entry @@ -2629,7 +2627,6 @@ SCRAM-SHA-256$<iteration count>:&l Has the constraint been validated? - Currently, can be false only for foreign keys and CHECK constraints @@ -3164,7 +3161,7 @@ SCRAM-SHA-256$<iteration count>:&l datcollate text - LC_COLLATE for this database + LC_COLLATE for this database (ignored unless datlocprovider is c) @@ -6377,6 +6374,16 @@ SCRAM-SHA-256$<iteration count>:&l + + + puballsequences bool + + + If true, this publication automatically includes all sequences + in the database, including any that will be created in the future. 
+ + + pubinsert bool @@ -6561,7 +6568,7 @@ SCRAM-SHA-256$<iteration count>:&l (references pg_class.oid) - Reference to relation + Reference to table or sequence @@ -7977,7 +7984,7 @@ SCRAM-SHA-256$<iteration count>:&l Finish LSN of the transaction whose changes are to be skipped, if a valid - LSN; otherwise 0/0. + LSN; otherwise 0/0000000. @@ -8088,6 +8095,43 @@ SCRAM-SHA-256$<iteration count>:&l + + + subretaindeadtuples bool + + + If true, the detection of is + enabled and the information (e.g., dead tuples, commit timestamps, and + origins) on the subscriber that is useful for conflict detection is + retained. + + + + + + submaxretention int4 + + + The maximum duration (in milliseconds) for which information (e.g., dead + tuples, commit timestamps, and origins) useful for conflict detection can + be retained. + + + + + + subretentionactive bool + + + The retention status of information (e.g., dead tuples, commit + timestamps, and origins) useful for conflict detection. True if + retain_dead_tuples + is enabled, and the retention duration has not exceeded + max_retention_duration, + when defined. + + + subconninfo text @@ -8155,16 +8199,19 @@ SCRAM-SHA-256$<iteration count>:&l - The catalog pg_subscription_rel contains the - state for each replicated relation in each subscription. This is a - many-to-many mapping. + The catalog pg_subscription_rel stores the + state of each replicated table and sequence for each subscription. This + is a many-to-many mapping. - This catalog only contains tables known to the subscription after running - either CREATE SUBSCRIPTION or - ALTER SUBSCRIPTION ... REFRESH - PUBLICATION. + This catalog contains tables and sequences known to the subscription + after running: + CREATE SUBSCRIPTION, + + ALTER SUBSCRIPTION ... REFRESH PUBLICATION, or + + ALTER SUBSCRIPTION ... REFRESH SEQUENCES. @@ -8198,7 +8245,7 @@ SCRAM-SHA-256$<iteration count>:&l (references pg_class.oid) - Reference to relation + Reference to table or sequence @@ -8207,12 +8254,20 @@ SCRAM-SHA-256$<iteration count>:&l srsubstate char - State code: + State code for the table or sequence. + + + State codes for tables: i = initialize, d = data is being copied, f = finished table copy, s = synchronized, r = ready (normal replication) + + + State codes for sequences: + i = initialize, + r = ready diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 5a0e97f6f3158..3aabc79801222 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -100,7 +100,7 @@ initdb --locale=sv_SE LC_COLLATE - String sort order + String sort order (ignored unless the provider is libc) LC_CTYPE @@ -570,13 +570,13 @@ CREATE COLLATION CREATE COLLATION mycollation5 (provider = icu, deterministic = false, locale = 'en-US-u-kn-ks-level2'); -SELECT 'aB' = 'Ab' COLLATE mycollation5 as result; +SELECT 'aB' = 'Ab' COLLATE mycollation5 AS result; result -------- t (1 row) -SELECT 'N-45' < 'N-123' COLLATE mycollation5 as result; +SELECT 'N-45' < 'N-123' COLLATE mycollation5 AS result; result -------- t @@ -1876,7 +1876,7 @@ ORDER BY c COLLATE ebcdic; GB18030 - National Standard + National Standard, version 2022 Chinese No No diff --git a/doc/src/sgml/client-auth.sgml b/doc/src/sgml/client-auth.sgml index 832b616a7bbff..a347ee18980a5 100644 --- a/doc/src/sgml/client-auth.sgml +++ b/doc/src/sgml/client-auth.sgml @@ -305,7 +305,7 @@ include_dir directory Specifies which database user name(s) this record matches. The value all specifies that it matches all users. 
Otherwise, this is either the name of a specific - database user, a regular expression (when starting with a slash + database user, a regular expression when starting with a slash (/), or a group name preceded by +. (Recall that there is no real distinction between users and groups in PostgreSQL; a + mark really means @@ -889,16 +889,16 @@ host all all 192.168.0.0/16 ident map=omicro # list of names of administrators. Passwords are required in all cases. # # TYPE DATABASE USER ADDRESS METHOD -local sameuser all md5 -local all /^.*helpdesk$ md5 -local all @admins md5 -local all +support md5 +local sameuser all scram-sha-256 +local all /^.*helpdesk$ scram-sha-256 +local all @admins scram-sha-256 +local all +support scram-sha-256 # The last two lines above can be combined into a single line: -local all @admins,+support md5 +local all @admins,+support scram-sha-256 # The database column can also use lists and file names: -local db1,db2,@demodbs all md5 +local db1,db2,@demodbs all scram-sha-256 @@ -1003,8 +1003,9 @@ local db1,db2,@demodbs all md5 the remainder of the field is treated as a regular expression. (See for details of PostgreSQL's regular expression syntax.) The regular - expression can include a single capture, or parenthesized subexpression, - which can then be referenced in the database-username + expression can include a single capture, or parenthesized subexpression. + The portion of the system user name that matched the capture can then + be referenced in the database-username field as \1 (backslash-one). This allows the mapping of multiple user names in a single line, which is particularly useful for simple syntax substitutions. For example, these entries @@ -1022,12 +1023,11 @@ mymap /^(.*)@otherdomain\.com$ guest If the database-username field starts with a slash (/), the remainder of the field is treated - as a regular expression (see - for details of PostgreSQL's regular - expression syntax). It is not possible to use \1 - to use a capture from regular expression on - system-username for a regular expression - on database-username. + as a regular expression. + When the database-username field is a regular + expression, it is not possible to use \1 within it to + refer to a capture from the system-username + field. diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index ca2a567b2b19f..cdfe8e376f0c4 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1234,7 +1234,7 @@ include_dir 'conf.d' - The library/libraries to use for validating OAuth connection tokens. If + Sets the library/libraries to use for validating OAuth connection tokens. If only one validator library is provided, it will be used by default for any OAuth connections; otherwise, all oauth HBA entries @@ -1400,7 +1400,7 @@ include_dir 'conf.d' Specifies a list of cipher suites that are allowed by connections using TLS version 1.3. Multiple cipher suites can be - specified by using a colon separated list. If left blank, the default + specified by using a colon-separated list. If left blank, the default set of cipher suites in OpenSSL will be used. @@ -1680,7 +1680,7 @@ include_dir 'conf.d' This parameter determines whether the passphrase command set by ssl_passphrase_command will also be called during a configuration reload if a key file needs a passphrase. If this - parameter is off (the default), then + parameter is off (the default), then ssl_passphrase_command will be ignored during a reload and the SSL configuration will not be reloaded if a passphrase is needed. 
That setting is appropriate for a command that requires a @@ -1688,6 +1688,12 @@ include_dir 'conf.d' running. Setting this parameter to on might be appropriate if the passphrase is obtained from a file, for example. + + This parameter must be set to on when running on + Windows since all connections + will perform a configuration reload due to the different process model + of that platform. + This parameter can only be set in the postgresql.conf file or on the server command line. @@ -1760,7 +1766,8 @@ include_dir 'conf.d' Controls whether huge pages are requested for the main shared memory area. Valid values are try (the default), - on, and off. With + on, and off. + This parameter can only be set at server start. With huge_pages set to try, the server will try to request huge pages, but fall back to the default if that fails. With on, failure to request huge pages @@ -2068,7 +2075,7 @@ include_dir 'conf.d' Specifies the maximum amount of memory to be used by logical decoding, before some of the decoded changes are written to local disk. This - limits the amount of memory used by logical streaming replication + limits the amount of memory used by streaming logical replication connections. It defaults to 64 megabytes (64MB). Since each replication connection only uses a single buffer of this size, and an installation normally doesn't have many such connections @@ -2273,6 +2280,7 @@ include_dir 'conf.d' platform, is generally discouraged because it typically requires non-default kernel settings to allow for large allocations (see ). + This parameter can only be set at server start. @@ -2300,6 +2308,7 @@ include_dir 'conf.d' however, it may be useful for debugging, when the pg_dynshmem directory is stored on a RAM disk, or when other shared memory facilities are not available. + This parameter can only be set at server start. @@ -2363,7 +2372,7 @@ include_dir 'conf.d' - + file_copy_method (enum) file_copy_method configuration parameter @@ -2413,6 +2422,7 @@ include_dir 'conf.d' / queue. The default value is 1048576. For 8 KB pages it allows to consume up to 8 GB of disk space. + This parameter can only be set at server start. @@ -2432,8 +2442,8 @@ include_dir 'conf.d' - Sets the maximum number of open files each server subprocess is - allowed to open simultaneously; files already opened in the + Sets the maximum number of files each server subprocess is + allowed to have open simultaneously; files already opened in the postmaster are not counted toward this limit. The default is one thousand files. @@ -2694,9 +2704,9 @@ include_dir 'conf.d' Controls the largest I/O size in operations that combine I/O, and silently limits the user-settable parameter io_combine_limit. - This parameter can only be set in - the postgresql.conf file or on the server - command line. + This parameter can only be set at server start. + If this value is specified without units, it is taken as blocks, + that is BLCKSZ bytes, typically 8kB. The maximum possible size depends on the operating system and block size, but is typically 1MB on Unix and 128kB on Windows. The default is 128kB. @@ -2716,6 +2726,8 @@ include_dir 'conf.d' higher than the io_max_combine_limit parameter, the lower value will silently be used instead, so both may need to be raised to increase the I/O size. + If this value is specified without units, it is taken as blocks, + that is BLCKSZ bytes, typically 8kB. The maximum possible size depends on the operating system and block size, but is typically 1MB on Unix and 128kB on Windows. 
The default is 128kB. @@ -2779,6 +2791,7 @@ include_dir 'conf.d' + The default is worker. This parameter can only be set at server start. @@ -2787,7 +2800,7 @@ include_dir 'conf.d' - io_workers (int) + io_workers (integer) io_workers configuration parameter @@ -2893,7 +2906,8 @@ include_dir 'conf.d' Sets the maximum number of parallel workers that can be started by a single utility command. Currently, the parallel utility commands that support the use of parallel workers are - CREATE INDEX when building a B-tree or BRIN index, + CREATE INDEX when building a B-tree, + GIN, or BRIN index, and VACUUM without FULL option. Parallel workers are taken from the pool of processes established by , limited @@ -3032,6 +3046,17 @@ include_dir 'conf.d' many UPDATE and DELETE statements are executed. + + It is important to note that when wal_level is set to + replica, the effective WAL level can automatically change + based on the presence of + logical replication slots. The system automatically increases the + effective WAL level to logical when creating the first + logical replication slot, and decreases it back to replica + when dropping or invalidating the last logical replication slot. The current + effective WAL level can be monitored through + parameter. + In releases prior to 9.6, this parameter also allowed the values archive and hot_standby. @@ -3397,8 +3422,9 @@ include_dir 'conf.d' This parameter enables compression of WAL using the specified compression method. When enabled, the PostgreSQL - server compresses full page images written to WAL when - is on or during a base backup. + server compresses full page images written to WAL (e.g. when + is on, during a base backup, + etc.). A compressed page image will be decompressed during WAL replay. The supported methods are pglz, lz4 (if PostgreSQL @@ -3785,7 +3811,7 @@ include_dir 'conf.d' difference between the two modes, but when set to always the WAL archiver is enabled also during archive recovery or standby mode. In always mode, all files restored from the archive - or streamed with streaming replication will be archived (again). See + or streamed with streaming physical replication will be archived (again). See for details. @@ -3891,7 +3917,7 @@ include_dir 'conf.d' full files. Therefore, it is unwise to use a very short archive_timeout — it will bloat your archive storage. archive_timeout settings of a minute or so are - usually reasonable. You should consider using streaming replication, + usually reasonable. You should consider using streaming physical replication, instead of archiving, if you want data to be copied off the primary server more quickly than that. If this value is specified without units, it is taken as seconds. @@ -3916,7 +3942,7 @@ include_dir 'conf.d' This section describes the settings that apply to recovery in general, - affecting crash recovery, streaming replication and archive-based + affecting crash recovery, streaming physical replication and archive-based replication. @@ -3960,6 +3986,7 @@ include_dir 'conf.d' blocks to prefetch. If this value is specified without units, it is taken as bytes. The default is 512kB. + This parameter can only be set at server start. @@ -4026,7 +4053,7 @@ include_dir 'conf.d' The local shell command to execute to retrieve an archived segment of the WAL file series. This parameter is required for archive recovery, - but optional for streaming replication. + but optional for streaming physical replication. 
Any %f in the string is replaced by the name of the file to retrieve from the archive, and any %p is replaced by the copy destination path name @@ -4034,7 +4061,7 @@ include_dir 'conf.d' (The path name is relative to the current working directory, i.e., the cluster's data directory.) Any %r is replaced by the name of the file containing the - last valid restart point. That is the earliest file that must be kept + last valid restartpoint. That is the earliest file that must be kept to allow a restore to be restartable, so this information can be used to truncate the archive to just the minimum required to support restarting from the current restore. %r is typically only @@ -4049,7 +4076,7 @@ include_dir 'conf.d' names that are not present in the archive; it must return nonzero when so asked. Examples: -restore_command = 'cp /mnt/server/archivedir/%f "%p"' +restore_command = 'cp "/mnt/server/archivedir/%f" "%p"' restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows An exception is that if the command was terminated by a signal (other @@ -4079,7 +4106,7 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows cleaning up old archived WAL files that are no longer needed by the standby server. Any %r is replaced by the name of the file containing the - last valid restart point. + last valid restartpoint. That is the earliest file that must be kept to allow a restore to be restartable, and so all files earlier than %r may be safely removed. @@ -4088,7 +4115,7 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows The module is often used in archive_cleanup_command for single-standby configurations, for example: -archive_cleanup_command = 'pg_archivecleanup /mnt/server/archivedir %r' +archive_cleanup_command = 'pg_archivecleanup /mnt/server/archivedir "%r"' Note however that if multiple standby servers are restoring from the same archive directory, you will need to ensure that you do not delete WAL files until they are no longer needed by any of the servers. @@ -4123,7 +4150,7 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows recovery_end_command is to provide a mechanism for cleanup following replication or recovery. Any %r is replaced by the name of the file containing the - last valid restart point, like in . + last valid restartpoint, like in . If the command returns a nonzero exit status then a warning log @@ -4452,15 +4479,16 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows Replication - These settings control the behavior of the built-in - streaming replication feature (see - ), and the built-in - logical replication feature (see + These settings control the behavior of + streaming replication, + both physical replication + (see ) and + logical replication (see ). - For streaming replication, servers will be either a + For physical replication, servers will be either a primary or a standby server. Primaries can send data, while standbys are always receivers of replicated data. When cascading replication (see ) is used, standby servers @@ -4616,10 +4644,12 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows - Invalidate replication slots that have remained idle longer than this - duration. If this value is specified without units, it is taken as - minutes. A value of zero (the default) disables the idle timeout - invalidation mechanism. 
This parameter can only be set in the + Invalidate replication slots that have remained inactive (not used by + a replication connection) + for longer than this duration. + If this value is specified without units, it is taken as seconds. + A value of zero (the default) disables the idle timeout + invalidation mechanism. This parameter can only be set in the postgresql.conf file or on the server command line. @@ -4683,9 +4713,9 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows - Record commit time of transactions. This parameter - can only be set in postgresql.conf file or on the server - command line. The default value is off. + Record commit time of transactions. + This parameter can only be set at server start. + The default value is off. @@ -4889,7 +4919,7 @@ ANY num_sync ( standby server that is - to receive replication data. Their values on the primary server + to receive physical replication data. Their values on the primary server are irrelevant. @@ -4961,6 +4991,8 @@ ANY num_sync ( num_sync ( . max_standby_streaming_delay applies when WAL data is - being received via streaming replication. + being received via streaming physical replication. If this value is specified without units, it is taken as milliseconds. The default is 30 seconds. A value of -1 allows the standby to wait forever for conflicting @@ -5163,7 +5195,7 @@ ANY num_sync ( num_sync ( num_sync ( num_sync ( num_sync ( num_sync ( + enable_eager_aggregate (boolean) + + enable_eager_aggregate configuration parameter + + + + + Enables or disables the query planner's ability to partially push + aggregation past a join, and finalize it once all the relations are + joined. The default is on. + + + + enable_gathermerge (boolean) @@ -5764,7 +5815,7 @@ ANY num_sync ( + enable_self_join_elimination (boolean) enable_self_join_elimination configuration parameter @@ -5902,24 +5953,24 @@ ANY num_sync ( ( + min_eager_agg_group_size (floating point) + + min_eager_agg_group_size configuration parameter + + + + + Sets the minimum average group size required to consider applying + eager aggregation. This helps avoid the overhead of eager + aggregation when it does not offer significant row count reduction. + The default is 8. + + + + jit_above_cost (floating point) @@ -6356,8 +6423,8 @@ ANY num_sync ( + debug_print_raw_parse (boolean) + + debug_print_raw_parse configuration parameter + + debug_print_parse (boolean) debug_print_parse configuration parameter @@ -7394,8 +7465,8 @@ local0.* /var/log/postgresql These parameters enable various debugging output to be emitted. - When set, they print the resulting parse tree, the query rewriter - output, or the execution plan for each executed query. + When set, they print the resulting raw parse tree, the parse tree, the query + rewriter output, or the execution plan for each executed query. These messages are emitted at LOG message level, so by default they will appear in the server log but will not be sent to the client. You can change that by adjusting @@ -7415,7 +7486,8 @@ local0.* /var/log/postgresql When set, debug_pretty_print indents the messages - produced by debug_print_parse, + produced by debug_print_raw_parse, + debug_print_parse, debug_print_rewritten, or debug_print_plan. 
This results in more readable but much longer output than the compact format used when @@ -7433,17 +7505,44 @@ local0.* /var/log/postgresql - Causes each action executed by autovacuum to be logged if it ran for at + Causes vacuum action executed by autovacuum to be logged if it ran for at + least the specified amount of time. Setting this to zero logs + all vacuum actions by autovacuum. -1 disables logging + vacuum actions by autovacuum. If this value is specified without units, + it is taken as milliseconds. For example, if you set this to + 250ms then all automatic vacuums that run + 250ms or longer will be logged. In addition, when this parameter is + set to any value other than -1, a message will be + logged if a vacuum action by autovacuum is skipped due to a conflicting lock or a + concurrently dropped relation. The default is 10min. + Enabling this parameter can be helpful in tracking vacuum activity by autovacuum. + This parameter can only be set in the postgresql.conf + file or on the server command line; but the setting can be overridden for + individual tables by changing table storage parameters. + + + + + + log_autoanalyze_min_duration (integer) + + log_autoanalyze_min_duration + configuration parameter + + + + + Causes analyze action executed by autovacuum to be logged if it ran for at least the specified amount of time. Setting this to zero logs - all autovacuum actions. -1 disables logging autovacuum - actions. If this value is specified without units, it is taken as milliseconds. - For example, if you set this to - 250ms then all automatic vacuums and analyzes that run + all analyze actions by autovacuum. -1 disables logging + analyze actions by autovacuum. If this value is specified without units, + it is taken as milliseconds. For example, if you set this to + 250ms then all automatic analyzes that run 250ms or longer will be logged. In addition, when this parameter is set to any value other than -1, a message will be - logged if an autovacuum action is skipped due to a conflicting lock or a + logged if an analyze action by autovacuum is skipped due to a conflicting lock or a concurrently dropped relation. The default is 10min. - Enabling this parameter can be helpful in tracking autovacuum activity. + Enabling this parameter can be helpful in tracking analyze activity by autovacuum. This parameter can only be set in the postgresql.conf file or on the server command line; but the setting can be overridden for individual tables by changing table storage parameters. @@ -7527,12 +7626,12 @@ local0.* /var/log/postgresql setup_durations Logs the time spent establishing the connection and setting up the - backend at the time the connection is ready to execute its first - query. The log message includes the total setup duration, starting - from the postmaster accepting the incoming connection and ending - when the connection is ready for query. It also includes the time - it took to fork the new backend and the time it took to - authenticate the user. + backend until the connection is ready to execute its first + query. The log message includes three durations: the total + setup duration (starting from the postmaster accepting the + incoming connection and ending when the connection is ready + for query), the time it took to fork the new backend, and + the time it took to authenticate the user. @@ -7916,17 +8015,17 @@ log_line_prefix = '%m [%p] %q%u@%d/%a ' Controls whether a log message is produced when a session waits longer than to acquire a lock. 
This is useful in determining if lock waits are causing - poor performance. The default is off. + poor performance. The default is on. Only superusers and users with the appropriate SET privilege can change this setting. - - log_lock_failure (boolean) + + log_lock_failures (boolean) - log_lock_failure configuration parameter + log_lock_failures configuration parameter @@ -8600,7 +8699,8 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; timing information is displayed in pg_stat_progress_vacuum, pg_stat_progress_analyze, - in the output of when the + in the output of and + when the VERBOSE option is used, and by autovacuum for auto-vacuums and auto-analyzes when is set. @@ -9338,7 +9438,8 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; vacuum_truncate (boolean) - vacuum_truncate configuration parameter + vacuum_truncate + configuration parameter @@ -9542,7 +9643,8 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; vacuum_max_eager_freeze_failure_rate (floating point) - vacuum_max_eager_freeze_failure_rate configuration parameter + vacuum_max_eager_freeze_failure_rate + configuration parameter @@ -11049,6 +11151,12 @@ extension_control_path = 'C:\tools\postgresql;H:\my_project\share;$system' string, the default '$system' is also assumed. + + If extensions with equal names are present in multiple directories in + the configured path, only the instance found first in the path will be + used. + + This parameter can be changed at run time by superusers and users with the appropriate SET privilege, but a @@ -11737,6 +11845,55 @@ dynamic_library_path = '/usr/local/lib/postgresql:$libdir' + + debug_exec_backend (boolean) + + debug_exec_backend configuration parameter + + + + + Reports whether PostgreSQL has been built + with EXEC_BACKEND enabled. That is the case on + Windows or if the + macro EXEC_BACKEND is defined + when PostgreSQL is built. + + + + + + effective_wal_level (enum) + + effective_wal_level configuration parameter + + + + + Reports the actual WAL logging level currently in effect in the + system. This parameter shares the same set of values as + , but reflects the operational WAL + level rather than the configured setting. For descriptions of + possible values, refer to the wal_level + parameter documentation. + + + The effective WAL level can differ from the configured + wal_level in certain situations. For example, + when wal_level is set to replica + and the system has one or more logical replication slots, + effective_wal_level will show logical + to indicate that the system is maintaining WAL records at + logical level equivalent. + + + On standby servers, effective_wal_level matches + the value of effective_wal_level from the most + upstream server in the replication chain. + + + + huge_pages_status (enum) @@ -12189,6 +12346,7 @@ dynamic_library_path = '/usr/local/lib/postgresql:$libdir' main data files, wal for WAL files, and wal_init for WAL files when being initially allocated. + This parameter can only be set at server start. 
Some operating systems and file systems do not support direct I/O, so diff --git a/doc/src/sgml/cube.sgml b/doc/src/sgml/cube.sgml index 0fb7080748673..a11c0cbd767c9 100644 --- a/doc/src/sgml/cube.sgml +++ b/doc/src/sgml/cube.sgml @@ -249,7 +249,7 @@ For example, the nearest neighbor of the 3-D point (0.5, 0.5, 0.5) could be found efficiently with: -SELECT c FROM test ORDER BY c <-> cube(array[0.5,0.5,0.5]) LIMIT 1; +SELECT c FROM test ORDER BY c <-> cube(ARRAY[0.5, 0.5, 0.5]) LIMIT 1; @@ -540,7 +540,7 @@ SELECT c FROM test ORDER BY c ~> 3 DESC LIMIT 5; This union: -select cube_union('(0,5,2),(2,3,1)', '0'); +SELECT cube_union('(0,5,2),(2,3,1)', '0'); cube_union ------------------- (0, 0, 0),(2, 5, 2) @@ -552,7 +552,7 @@ cube_union -select cube_inter('(0,-1),(1,1)', '(-2),(2)'); +SELECT cube_inter('(0,-1),(1,1)', '(-2),(2)'); cube_inter ------------- (0, 0),(1, 0) @@ -579,7 +579,7 @@ cube_inter('(0,-1),(1,1)','(-2,0),(2,0)'); -select cube_contains('(0,0),(1,1)', '0.5,0.5'); +SELECT cube_contains('(0,0),(1,1)', '0.5,0.5'); cube_contains -------------- t diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 09309ba0390b7..e5267a8e4be61 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -117,7 +117,7 @@ double precision - float8 + float, float8 double precision floating-point number (8 bytes) @@ -315,6 +315,7 @@ character varying, character, varchar, date, double precision, integer, interval, + json, numeric, decimal, real, smallint, time (with or without time zone), timestamp (with or without time zone), @@ -717,7 +718,7 @@ NUMERIC(3, 5) SELECT x, round(x::numeric) AS num_round, round(x::double precision) AS dbl_round -FROM generate_series(-3.5, 3.5, 1) as x; +FROM generate_series(-3.5, 3.5, 1) AS x; x | num_round | dbl_round ------+-----------+----------- -3.5 | -4 | -4 @@ -1259,7 +1260,7 @@ SELECT '52093.89'::money::numeric::float8; semantically insignificant and disregarded when comparing two values of type character. In collations where whitespace is significant, this behavior can produce unexpected results; - for example SELECT 'a '::CHAR(2) collate "C" < + for example SELECT 'a '::CHAR(2) COLLATE "C" < E'a\n'::CHAR(2) returns true, even though C locale would consider a space to be greater than a newline. Trailing spaces are removed when converting a character value @@ -2054,8 +2055,6 @@ MINUTE TO SECOND
Time Input - - Example @@ -4737,6 +4736,10 @@ INSERT INTO mytable VALUES(-1); -- fails regconfig + + regdatabase + + regdictionary @@ -4878,6 +4881,13 @@ SELECT * FROM pg_attribute english + + regdatabase + pg_database + database name + template1 + + regdictionary pg_ts_dict @@ -5049,8 +5059,8 @@ WHERE ... be dropped without first removing the default expression. The alternative of nextval('my_seq'::text) does not create a dependency. - (regrole is an exception to this property. Constants of this - type are not allowed in stored expressions.) + (regdatabase and regrole are exceptions to this + property. Constants of these types are not allowed in stored expressions.) @@ -5110,7 +5120,7 @@ WHERE ... +(pg_lsn,numeric) and -(pg_lsn,numeric) operators, respectively. Note that the calculated LSN should be in the range of pg_lsn type, - i.e., between 0/0 and + i.e., between 0/00000000 and FFFFFFFF/FFFFFFFF. @@ -5234,8 +5244,8 @@ WHERE ...
Pseudo-Types - - + + Name diff --git a/doc/src/sgml/datetime.sgml b/doc/src/sgml/datetime.sgml index 3e24170acbfcc..5905f5fa5506a 100644 --- a/doc/src/sgml/datetime.sgml +++ b/doc/src/sgml/datetime.sgml @@ -942,17 +942,17 @@ $ cal 9 1752 definition when you need it: do the arithmetic in time zone UTC+12. For example, -=> SELECT extract(julian from '2021-06-23 7:00:00-04'::timestamptz at time zone 'UTC+12'); +=> SELECT extract(julian FROM '2021-06-23 7:00:00-04'::timestamptz AT TIME ZONE 'UTC+12'); extract ------------------------------ 2459388.95833333333333333333 (1 row) -=> SELECT extract(julian from '2021-06-23 8:00:00-04'::timestamptz at time zone 'UTC+12'); +=> SELECT extract(julian FROM '2021-06-23 8:00:00-04'::timestamptz AT TIME ZONE 'UTC+12'); extract -------------------------------------- 2459389.0000000000000000000000000000 (1 row) -=> SELECT extract(julian from date '2021-06-23'); +=> SELECT extract(julian FROM date '2021-06-23'); extract --------- 2459389 diff --git a/doc/src/sgml/dblink.sgml b/doc/src/sgml/dblink.sgml index 808c690985b73..dd6778d22a84a 100644 --- a/doc/src/sgml/dblink.sgml +++ b/doc/src/sgml/dblink.sgml @@ -444,7 +444,7 @@ dblink(text sql [, bool fail_on_error]) returns setof record The SQL query that you wish to execute in the remote database, - for example select * from foo. + for example SELECT * FROM foo. @@ -478,7 +478,7 @@ dblink(text sql [, bool fail_on_error]) returns setof record SELECT * FROM dblink('dbname=mydb options=-csearch_path=', - 'select proname, prosrc from pg_proc') + 'SELECT proname, prosrc FROM pg_proc') AS t1(proname name, prosrc text) WHERE proname LIKE 'bytea%'; @@ -513,7 +513,7 @@ SELECT * CREATE VIEW myremote_pg_proc AS SELECT * FROM dblink('dbname=postgres options=-csearch_path=', - 'select proname, prosrc from pg_proc') + 'SELECT proname, prosrc FROM pg_proc') AS t1(proname name, prosrc text); SELECT * FROM myremote_pg_proc WHERE proname LIKE 'bytea%'; @@ -525,7 +525,7 @@ SELECT * FROM myremote_pg_proc WHERE proname LIKE 'bytea%'; SELECT * FROM dblink('dbname=postgres options=-csearch_path=', - 'select proname, prosrc from pg_proc') + 'SELECT proname, prosrc FROM pg_proc') AS t1(proname name, prosrc text) WHERE proname LIKE 'bytea%'; proname | prosrc ------------+------------ @@ -549,7 +549,7 @@ SELECT dblink_connect('dbname=postgres options=-csearch_path='); OK (1 row) -SELECT * FROM dblink('select proname, prosrc from pg_proc') +SELECT * FROM dblink('SELECT proname, prosrc FROM pg_proc') AS t1(proname name, prosrc text) WHERE proname LIKE 'bytea%'; proname | prosrc ------------+------------ @@ -573,7 +573,7 @@ SELECT dblink_connect('myconn', 'dbname=regression options=-csearch_path='); OK (1 row) -SELECT * FROM dblink('myconn', 'select proname, prosrc from pg_proc') +SELECT * FROM dblink('myconn', 'SELECT proname, prosrc FROM pg_proc') AS t1(proname name, prosrc text) WHERE proname LIKE 'bytea%'; proname | prosrc ------------+------------ @@ -666,7 +666,7 @@ dblink_exec(text sql [, bool fail_on_error]) returns text The SQL command that you wish to execute in the remote database, for example - insert into foo values(0, 'a', '{"a0","b0","c0"}'). + INSERT INTO foo VALUES (0, 'a', '{"a0","b0","c0"}'). @@ -793,7 +793,7 @@ dblink_open(text connname, text cursorname, text sql [, bool fail_on_error]) ret The SELECT statement that you wish to execute in the remote - database, for example select * from pg_class. + database, for example SELECT * FROM pg_class. 
@@ -848,7 +848,7 @@ SELECT dblink_connect('dbname=postgres options=-csearch_path='); OK (1 row) -SELECT dblink_open('foo', 'select proname, prosrc from pg_proc'); +SELECT dblink_open('foo', 'SELECT proname, prosrc FROM pg_proc'); dblink_open ------------- OK @@ -969,7 +969,7 @@ SELECT dblink_connect('dbname=postgres options=-csearch_path='); OK (1 row) -SELECT dblink_open('foo', 'select proname, prosrc from pg_proc where proname like ''bytea%'''); +SELECT dblink_open('foo', 'SELECT proname, prosrc FROM pg_proc WHERE proname LIKE ''bytea%'''); dblink_open ------------- OK @@ -1106,7 +1106,7 @@ SELECT dblink_connect('dbname=postgres options=-csearch_path='); OK (1 row) -SELECT dblink_open('foo', 'select proname, prosrc from pg_proc'); +SELECT dblink_open('foo', 'SELECT proname, prosrc FROM pg_proc'); dblink_open ------------- OK @@ -1301,7 +1301,7 @@ dblink_send_query(text connname, text sql) returns int The SQL statement that you wish to execute in the remote database, - for example select * from pg_class. + for example SELECT * FROM pg_class. @@ -1583,7 +1583,7 @@ contrib_regression=# SELECT dblink_connect('dtest1', 'dbname=contrib_regression' (1 row) contrib_regression=# SELECT * FROM -contrib_regression-# dblink_send_query('dtest1', 'select * from foo where f1 < 3') AS t1; +contrib_regression-# dblink_send_query('dtest1', 'SELECT * FROM foo WHERE f1 < 3') AS t1; t1 ---- 1 @@ -1603,7 +1603,7 @@ contrib_regression=# SELECT * FROM dblink_get_result('dtest1') AS t1(f1 int, f2 (0 rows) contrib_regression=# SELECT * FROM -contrib_regression-# dblink_send_query('dtest1', 'select * from foo where f1 < 3; select * from foo where f1 > 6') AS t1; +contrib_regression-# dblink_send_query('dtest1', 'SELECT * FROM foo WHERE f1 < 3; SELECT * FROM foo WHERE f1 > 6') AS t1; t1 ---- 1 diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index fcd1cb85352fc..9070aaa5a7cd8 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -419,6 +419,16 @@ CREATE TABLE people ( tableoid. + + + A virtual generated column cannot have a user-defined type, and the + generation expression of a virtual generated column must not reference + user-defined functions or types, that is, it can only use built-in + functions or types. This applies also indirectly, such as for functions + or types that underlie operators or casts. (This restriction does not + exist for stored generated columns.) + + A generated column cannot have a column default or an identity definition. @@ -1343,7 +1353,7 @@ CREATE TABLE posts ( ); Without the specification of the column, the foreign key would also set - the column tenant_id to null, but that column is still + the column tenant_id to null, but that column is still required as part of the primary key. @@ -1548,7 +1558,7 @@ CREATE TABLE circles ( locate the row version very quickly, a row's ctid will change if it is updated or moved by VACUUM FULL. Therefore - ctid is useless as a long-term row + ctid should not be used as a row identifier. A primary key should be used to identify logical rows. @@ -1575,6 +1585,297 @@ CREATE TABLE circles ( + + Temporal Tables + + + temporal + + + + Temporal tables allow users to track different + dimensions of history. Application time tracks the + history of a thing out in the world, and system time + tracks the history of the database itself. (A database that does both is + also called bitemporal.) This section describes how + to express and manage such histories in temporal tables. 
+ + + + Application Time + + + application time + + + + Application time refers to a history of the entity + described by a table. In a typical non-temporal table, there is a single + row for each entity. In a temporal table, an entity may have multiple + rows, as long as those rows describe non-overlapping periods from its + history. Application time requires each row to have a start and end time, + expressing when the row is applicable. + + + + The following SQL creates a temporal table that can store application time: + +CREATE TABLE products ( + product_no integer, + price numeric, + valid_at daterange +); + + + + + Records in a temporal table can be imagined on a timeline, as in . Here we show three records + describing two products. Each record is a tuple with three attributes: + the product number, the price, and the application time. So product 5 was + first offered for a price of 5.00 starting January 1, 2020, but then + became 8.00 starting January 1, 2022. Its second record has no specified + end time, indicating that it is true indefinitely, or for all future time. + The last record shows that product 6 was introduced January 1, 2021 for + 9.00, then canceled January 1, 2024. + + +
+ [Figure: Application Time Example]
+ + + In a table, these records would be: + + product_no | price | valid_at +------------+-------+------------------------- + 5 | 5.00 | [2020-01-01,2022-01-01) + 5 | 8.00 | [2022-01-01,) + 6 | 9.00 | [2021-01-01,2024-01-01) + + + + + We show the application time using range-type notation, because it is + stored as a single column (either a range or multirange). Ranges include + their start point but exclude their end point. That way two adjacent + ranges cover all points without overlapping. See for more information about range types. + + + + In principle, a table with application-time ranges/multiranges is + equivalent to a table that stores application-time + instants: one for each second, millisecond, nanosecond, or + whatever finest granularity is available. But such a table would contain + far too many rows, so ranges/multiranges offer an optimization to + represent the same information in a compact form. In addition, ranges and + multiranges offer a more convenient interface for typical temporal + operations, where records change infrequently enough that separate + versions persist for extended periods of time. + + + + Temporal Primary Keys and Unique Constraints + + + A table with application time has a different concept of entity + uniqueness than a non-temporal table. Temporal entity uniqueness can be + enforced with a temporal primary key. A regular primary key has at least + one column, all columns are NOT NULL, and the combined + value of all columns is unique. A temporal primary key also has at least + one such column, but in addition it has a final column that is of a range + type or multirange type that shows when the row is applicable. The + regular parts of the key must be unique for any moment in time, but + non-unique rows are allowed if their application time does not overlap. + + + + The syntax to create a temporal primary key is as follows: + + +CREATE TABLE products ( + product_no integer, + price numeric, + valid_at daterange, + PRIMARY KEY (product_no, valid_at WITHOUT OVERLAPS) +); + + + In this example, product_no is the non-temporal part + of the key, and valid_at is a range column containing + the application time. + + + + The WITHOUT OVERLAPS column is implicitly NOT + NULL (like the other parts of the key). In addition it may not + contain empty values, that is, a range of 'empty' or a + multirange of {}. An empty application time would + have no meaning. + + + + It is also possible to create a temporal unique constraint that is + not a primary key. The syntax is similar: + + +CREATE TABLE products ( + product_no integer, + price numeric, + valid_at daterange, + UNIQUE (product_no, valid_at WITHOUT OVERLAPS) +); + + + Temporal unique constraints also forbid empty ranges/multiranges for + their application time, but that column is permitted to be null (like the + other columns of the unique constraint). + + + + Temporal primary keys and unique constraints are backed by GiST indexes + (see ) rather than B-Tree indexes. In practice, + creating a temporal primary key or constraint requires installing the + extension, so that the database has GiST + operator classes for the non-temporal parts of the key. + + + + Temporal primary keys and unique constraints have the same behavior as + exclusion constraints (see ), + where each regular key part is compared with equality, and the + application time is compared with overlaps, for example EXCLUDE + USING gist (id WITH =, valid_at WITH &&). The only + difference is that they also forbid an empty application time. 
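As a brief illustration of the overlap rule (a sketch reusing the products table defined above; the exact error message may vary), adjacent application times for the same product are accepted, while overlapping ones are rejected:

INSERT INTO products VALUES (5, 5.00, '[2020-01-01,2022-01-01)');
INSERT INTO products VALUES (5, 8.00, '[2022-01-01,)');           -- accepted: ranges are adjacent, not overlapping
INSERT INTO products VALUES (5, 6.00, '[2021-06-01,2021-09-01)'); -- rejected: overlaps the first row for product 5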
+ + + + + Temporal Foreign Keys + + + A temporal foreign key is a reference from one application-time table to + another application-time table. Just as a non-temporal reference + requires a referenced key to exist, so a temporal reference requires a + referenced key to exist, but during whatever history the reference exists + (at least). So if the products table is referenced by + a variants table, and a variant of product 5 has an + application-time of [2020-01-01,2026-01-01), then + product 5 must exist throughout that period. + + + + We can create the variants table with the following + schema (without a foreign key yet): + + +CREATE TABLE variants ( + id integer, + product_no integer, + name text, + valid_at daterange, + PRIMARY KEY (id, valid_at WITHOUT OVERLAPS) +); + + + We have included a temporal primary key as a best practice, but it is not + strictly required by foreign keys. + + + + plots product 5 (in green) + and two variants referencing it (in yellow) on the same timeline. + Variant 8 (Medium) was introduced first, then variant 9 (XXL). Both + satisfy the foreign key constraint, because the referenced product exists + throughout their entire history. + + +
+ [Figure: Temporal Foreign Key Example]
+ + + In a table, these records would be: + + id | product_no | name | valid_at +----+------------+--------+------------------------- + 8 | 5 | Medium | [2021-01-01,2023-06-01) + 9 | 5 | XXL | [2022-03-01,2024-06-01) + + + + + Note that a temporal reference need not be fulfilled by a single row in + the referenced table. Product 5 had a price change in the middle of + variant 8's history, but the reference is still valid. The combination + of all matching rows is used to test whether the referenced history + contains the referencing row. + + + + The syntax to add a temporal foreign key to our table is: + + +CREATE TABLE variants ( + id integer, + product_no integer, + name text, + valid_at daterange, + PRIMARY KEY (id, valid_at WITHOUT OVERLAPS), + FOREIGN KEY (product_no, PERIOD valid_at) REFERENCES products (product_no, PERIOD valid_at) +); + + + Note that the keyword PERIOD must be used for the + application-time column in both the referencing and referenced table. + + + + A temporal primary key or unique constraint matching the referenced columns + must exist on the referenced table. + + + + PostgreSQL supports temporal foreign keys with + action NO ACTION, but not RESTRICT, + CASCADE, SET NULL, or SET + DEFAULT. + +
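As a sketch of how the constraint behaves (using the products and variants tables above, with the example data shown earlier): product 6 is applicable only during [2021-01-01,2024-01-01), so a variant covered by that history is accepted, while one that outlives it is rejected:

INSERT INTO variants VALUES (10, 6, 'Small', '[2021-06-01,2023-01-01)'); -- accepted: within product 6's history
INSERT INTO variants VALUES (11, 6, 'Large', '[2023-01-01,2025-01-01)'); -- rejected: extends past 2024-01-01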
+ + + System Time + + + system time + + + + System time refers to the history of the database + table, not the entity it describes. It captures when each row was + inserted/updated/deleted. + + + + PostgreSQL does not currently support system + time, but it could be emulated using triggers, and there are external + extensions that provide such functionality. + + +
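A minimal sketch of such trigger-based emulation follows (hypothetical names, reusing the products table above; production-grade extensions handle concurrency and many more cases). Each row is stamped with the system time at which it became current, and superseded versions are copied to a history table:

ALTER TABLE products ADD COLUMN sys_start timestamptz NOT NULL DEFAULT now();

CREATE TABLE products_history (
    LIKE products,
    sys_end timestamptz NOT NULL
);

CREATE FUNCTION products_track_history() RETURNS trigger
LANGUAGE plpgsql AS $$
BEGIN
    -- Record the outgoing row version, closing its system-time period.
    INSERT INTO products_history SELECT OLD.*, now();
    IF TG_OP = 'UPDATE' THEN
        NEW.sys_start := now();
        RETURN NEW;
    END IF;
    RETURN OLD;  -- for DELETE, let the deletion proceed
END;
$$;

CREATE TRIGGER products_history_trigger
    BEFORE UPDATE OR DELETE ON products
    FOR EACH ROW EXECUTE FUNCTION products_track_history();

Rows in products_history, bounded by sys_start and sys_end, can then answer questions about what the table contained at a past moment.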
+ Modifying Tables @@ -1949,6 +2250,8 @@ ALTER TABLE table_name OWNER TO new_owne Superusers can always do this; ordinary roles can only do it if they are both the current owner of the object (or inherit the privileges of the owning role) and able to SET ROLE to the new owning role. + All object privileges of the old owner are transferred to the new owner + along with the ownership. @@ -2223,8 +2526,9 @@ REVOKE ALL ON accounts FROM PUBLIC; Allows VACUUM, ANALYZE, CLUSTER, REFRESH MATERIALIZED VIEW, - REINDEX, and LOCK TABLE on a - relation. + REINDEX, LOCK TABLE, + and database object statistics manipulation functions + (see ) on a relation. @@ -2528,7 +2832,7 @@ REVOKE ALL ON accounts FROM PUBLIC; As an example, suppose that user miriam creates - table mytable and does: + table mytable and does: GRANT SELECT ON mytable TO PUBLIC; GRANT SELECT, UPDATE, INSERT ON mytable TO admin; @@ -2756,7 +3060,7 @@ CREATE POLICY user_mod_policy ON users Below is a larger example of how this feature can be used in production - environments. The table passwd emulates a Unix password + environments. The table passwd emulates a Unix password file: @@ -2823,9 +3127,9 @@ GRANT UPDATE -- admin can view all rows and fields -postgres=> set role admin; +postgres=> SET ROLE admin; SET -postgres=> table passwd; +postgres=> TABLE passwd; user_name | pwhash | uid | gid | real_name | home_phone | extra_info | home_dir | shell -----------+--------+-----+-----+-----------+--------------+------------+-------------+----------- admin | xxx | 0 | 0 | Admin | 111-222-3333 | | /root | /bin/dash @@ -2834,11 +3138,11 @@ postgres=> table passwd; (3 rows) -- Test what Alice is able to do -postgres=> set role alice; +postgres=> SET ROLE alice; SET -postgres=> table passwd; +postgres=> TABLE passwd; ERROR: permission denied for table passwd -postgres=> select user_name,real_name,home_phone,extra_info,home_dir,shell from passwd; +postgres=> SELECT user_name, real_name, home_phone, extra_info, home_dir, shell FROM passwd; user_name | real_name | home_phone | extra_info | home_dir | shell -----------+-----------+--------------+------------+-------------+----------- admin | Admin | 111-222-3333 | | /root | /bin/dash @@ -2846,21 +3150,21 @@ postgres=> select user_name,real_name,home_phone,extra_info,home_dir,shell fr alice | Alice | 098-765-4321 | | /home/alice | /bin/zsh (3 rows) -postgres=> update passwd set user_name = 'joe'; +postgres=> UPDATE passwd SET user_name = 'joe'; ERROR: permission denied for table passwd -- Alice is allowed to change her own real_name, but no others -postgres=> update passwd set real_name = 'Alice Doe'; +postgres=> UPDATE passwd SET real_name = 'Alice Doe'; UPDATE 1 -postgres=> update passwd set real_name = 'John Doe' where user_name = 'admin'; +postgres=> UPDATE passwd SET real_name = 'John Doe' WHERE user_name = 'admin'; UPDATE 0 -postgres=> update passwd set shell = '/bin/xx'; +postgres=> UPDATE passwd SET shell = '/bin/xx'; ERROR: new row violates WITH CHECK OPTION for "passwd" -postgres=> delete from passwd; +postgres=> DELETE FROM passwd; ERROR: permission denied for table passwd -postgres=> insert into passwd (user_name) values ('xxx'); +postgres=> INSERT INTO passwd (user_name) VALUES ('xxx'); ERROR: permission denied for table passwd -- Alice can change her own password; RLS silently prevents updating other rows -postgres=> update passwd set pwhash = 'abc'; +postgres=> UPDATE passwd SET pwhash = 'abc'; UPDATE 1 @@ -2893,7 +3197,7 @@ CREATE POLICY admin_local_only ON passwd AS RESTRICTIVE TO admin 
admin (1 row) -=> select inet_client_addr(); +=> SELECT inet_client_addr(); inet_client_addr ------------------ 127.0.0.1 @@ -2904,7 +3208,7 @@ CREATE POLICY admin_local_only ON passwd AS RESTRICTIVE TO admin -----------+--------+-----+-----+-----------+------------+------------+----------+------- (0 rows) -=> UPDATE passwd set pwhash = NULL; +=> UPDATE passwd SET pwhash = NULL; UPDATE 0 @@ -4441,6 +4745,44 @@ ALTER INDEX measurement_city_id_logdate_key ... + + + There is also an option for merging multiple table partitions into + a single partition using the + ALTER TABLE ... MERGE PARTITIONS. + This feature simplifies the management of partitioned tables by allowing + users to combine partitions that are no longer needed as + separate entities. It's important to note that this operation is not + supported for hash-partitioned tables and acquires an + ACCESS EXCLUSIVE lock, which could impact high-load + systems due to the lock's restrictive nature. For example, we can + merge three monthly partitions into one quarter partition: + +ALTER TABLE measurement + MERGE PARTITIONS (measurement_y2006m01, + measurement_y2006m02, + measurement_y2006m03) INTO measurement_y2006q1; + + + + + Similarly to merging multiple table partitions, there is an option for + splitting a single partition into multiple using the + ALTER TABLE ... SPLIT PARTITION. + This feature could come in handy when one partition grows too big + and needs to be split into multiple. It's important to note that + this operation is not supported for hash-partitioned tables and acquires + an ACCESS EXCLUSIVE lock, which could impact high-load + systems due to the lock's restrictive nature. For example, we can split + the quarter partition back to monthly partitions: + +ALTER TABLE measurement SPLIT PARTITION measurement_y2006q1 INTO + (PARTITION measurement_y2006m01 FOR VALUES FROM ('2006-01-01') TO ('2006-02-01'), + PARTITION measurement_y2006m02 FOR VALUES FROM ('2006-02-01') TO ('2006-03-01'), + PARTITION measurement_y2006m03 FOR VALUES FROM ('2006-03-01') TO ('2006-04-01')); + + + diff --git a/doc/src/sgml/dfunc.sgml b/doc/src/sgml/dfunc.sgml index b94aefcd0ca6c..3778efc83ebfa 100644 --- a/doc/src/sgml/dfunc.sgml +++ b/doc/src/sgml/dfunc.sgml @@ -157,19 +157,12 @@ ld -Bshareable -o foo.so foo.o The compiler flag to create PIC is - with the Sun compiler and with GCC. To link shared libraries, the compiler option is - with either compiler or alternatively with GCC. 
-cc -KPIC -c foo.c -cc -G -o foo.so foo.o - - or - gcc -fPIC -c foo.c -gcc -G -o foo.so foo.o +gcc -shared -o foo.so foo.o diff --git a/doc/src/sgml/dict-int.sgml b/doc/src/sgml/dict-int.sgml index 8dd07b9bc1270..b4ce54848232f 100644 --- a/doc/src/sgml/dict-int.sgml +++ b/doc/src/sgml/dict-int.sgml @@ -80,7 +80,7 @@ ALTER TEXT SEARCH DICTIONARY To test the dictionary, you can try -mydb# select ts_lexize('intdict', '12345678'); +mydb# SELECT ts_lexize('intdict', '12345678'); ts_lexize ----------- {123456} diff --git a/doc/src/sgml/dml.sgml b/doc/src/sgml/dml.sgml index 458aee788b7fb..61c64cf6c498c 100644 --- a/doc/src/sgml/dml.sgml +++ b/doc/src/sgml/dml.sgml @@ -317,7 +317,7 @@ DELETE FROM products; column to provide unique identifiers, RETURNING can return the ID assigned to a new row: -CREATE TABLE users (firstname text, lastname text, id serial primary key); +CREATE TABLE users (firstname text, lastname text, id serial PRIMARY KEY); INSERT INTO users (firstname, lastname) VALUES ('Joe', 'Cool') RETURNING id; diff --git a/doc/src/sgml/docguide.sgml b/doc/src/sgml/docguide.sgml index db4bcce56eac6..7b61b4841aa03 100644 --- a/doc/src/sgml/docguide.sgml +++ b/doc/src/sgml/docguide.sgml @@ -60,9 +60,7 @@ maintained by the OASIS group. The official DocBook site has good introductory and reference documentation and - a complete O'Reilly book for your online reading pleasure. The - - NewbieDoc Docbook Guide is very helpful for beginners. + a complete O'Reilly book for your online reading pleasure. The FreeBSD Documentation Project also uses DocBook and has some good information, including a number of style guidelines that might be diff --git a/doc/src/sgml/ecpg.sgml b/doc/src/sgml/ecpg.sgml index e7a53f3c9d00d..e6e3a82905c07 100644 --- a/doc/src/sgml/ecpg.sgml +++ b/doc/src/sgml/ecpg.sgml @@ -1823,11 +1823,11 @@ while (1) representation of that type is (%f,%f), which is defined in the functions complex_in() - and complex_out() functions + and complex_out() in . The following example inserts the complex type values (1,1) and (3,3) into the - columns a and b, and select + columns a and b, and select them from the table after that. @@ -2042,7 +2042,7 @@ EXEC SQL EXECUTE mystmt INTO :v1, :v2, :v3 USING 37; EXEC SQL BEGIN DECLARE SECTION; char dbaname[128]; char datname[128]; -char *stmt = "SELECT u.usename as dbaname, d.datname " +char *stmt = "SELECT u.usename AS dbaname, d.datname " " FROM pg_database d, pg_user u " " WHERE d.datdba = u.usesysid"; EXEC SQL END DECLARE SECTION; @@ -6685,7 +6685,7 @@ EXEC SQL CONNECT TO 'unix:postgresql://localhost/connectdb' AS main USER :user; EXEC SQL CONNECT TO :db AS :id; EXEC SQL CONNECT TO :db USER connectuser USING :pw; EXEC SQL CONNECT TO @localhost AS main USER connectdb; -EXEC SQL CONNECT TO REGRESSDB1 as main; +EXEC SQL CONNECT TO REGRESSDB1 AS main; EXEC SQL CONNECT TO AS main USER connectdb; EXEC SQL CONNECT TO connectdb AS :id; EXEC SQL CONNECT TO connectdb AS main USER connectuser/connectdb; @@ -6959,7 +6959,7 @@ EXEC SQL [ AT connection_name ] DEC The namespace of the declaration is the precompile unit, and multiple declarations to the same SQL statement identifier are not allowed. Note that if the precompiler runs in Informix compatibility mode and - some SQL statement is declared, "database" can not be used as a cursor + some SQL statement is declared, "database" cannot be used as a cursor name. 
diff --git a/doc/src/sgml/event-trigger.sgml b/doc/src/sgml/event-trigger.sgml index 1bd9abb667650..c10627554bdbd 100644 --- a/doc/src/sgml/event-trigger.sgml +++ b/doc/src/sgml/event-trigger.sgml @@ -433,7 +433,7 @@ $$ --- DECLARE table_oid oid := pg_event_trigger_table_rewrite_oid(); - current_hour integer := extract('hour' from current_time); + current_hour integer := extract('hour' FROM current_time); pages integer; max_pages integer := 100; BEGIN diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml index b80320504d695..c6d66414b8ea7 100644 --- a/doc/src/sgml/fdwhandler.sgml +++ b/doc/src/sgml/fdwhandler.sgml @@ -1320,7 +1320,7 @@ ExplainForeignModify(ModifyTableState *mtstate, ResultRelInfo *rinfo, List *fdw_private, int subplan_index, - struct ExplainState *es); + ExplainState *es); Print additional EXPLAIN output for a foreign table update. diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index fef9584f908ec..ac66fcbdb5727 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -17,7 +17,10 @@ - + + +%allfiles_func; + @@ -180,7 +183,7 @@ - + diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml deleted file mode 100644 index c67688cbf5f98..0000000000000 --- a/doc/src/sgml/func.sgml +++ /dev/null @@ -1,32009 +0,0 @@ - - - - Functions and Operators - - - function - - - - operator - - - - PostgreSQL provides a large number of - functions and operators for the built-in data types. This chapter - describes most of them, although additional special-purpose functions - appear in relevant sections of the manual. Users can also - define their own functions and operators, as described in - . The - psql commands \df and - \do can be used to list all - available functions and operators, respectively. - - - - The notation used throughout this chapter to describe the argument and - result data types of a function or operator is like this: - -repeat ( text, integer ) text - - which says that the function repeat takes one text and - one integer argument and returns a result of type text. The right arrow - is also used to indicate the result of an example, thus: - -repeat('Pg', 4) PgPgPgPg - - - - - If you are concerned about portability then note that most of - the functions and operators described in this chapter, with the - exception of the most trivial arithmetic and comparison operators - and some explicitly marked functions, are not specified by the - SQL standard. Some of this extended functionality - is present in other SQL database management - systems, and in many cases this functionality is compatible and - consistent between the various implementations. - - - - - Logical Operators - - - operator - logical - - - - Boolean - operators - operators, logical - - - - The usual logical operators are available: - - - AND (operator) - - - - OR (operator) - - - - NOT (operator) - - - - conjunction - - - - disjunction - - - - negation - - - -boolean AND boolean boolean -boolean OR boolean boolean -NOT boolean boolean - - - SQL uses a three-valued logic system with true, - false, and null, which represents unknown. - Observe the following truth tables: - - - -
 a     | b     | a AND b | a OR b
-------+-------+---------+--------
 TRUE  | TRUE  | TRUE    | TRUE
 TRUE  | FALSE | FALSE   | TRUE
 TRUE  | NULL  | NULL    | TRUE
 FALSE | FALSE | FALSE   | FALSE
 FALSE | NULL  | FALSE   | NULL
 NULL  | NULL  | NULL    | NULL

 a     | NOT a
-------+-------
 TRUE  | FALSE
 FALSE | TRUE
 NULL  | NULL

The operators AND and OR are commutative, that is, you can
switch the left and right operands without affecting the result. (However,
it is not guaranteed that the left operand is evaluated before the right
operand; see the discussion of the order of evaluation of subexpressions
for more information.)

Comparison Functions and Operators

The usual comparison operators are available, as shown in the following table.
Comparison Operators

 Operator                            | Description
-------------------------------------+------------------------
 datatype <  datatype → boolean      | Less than
 datatype >  datatype → boolean      | Greater than
 datatype <= datatype → boolean      | Less than or equal to
 datatype >= datatype → boolean      | Greater than or equal to
 datatype =  datatype → boolean      | Equal
 datatype <> datatype → boolean      | Not equal
 datatype != datatype → boolean      | Not equal
- - - - <> is the standard SQL notation for not - equal. != is an alias, which is converted - to <> at a very early stage of parsing. - Hence, it is not possible to implement != - and <> operators that do different things. - - - - - These comparison operators are available for all built-in data types - that have a natural ordering, including numeric, string, and date/time - types. In addition, arrays, composite types, and ranges can be compared - if their component data types are comparable. - - - - It is usually possible to compare values of related data - types as well; for example integer > - bigint will work. Some cases of this sort are implemented - directly by cross-type comparison operators, but if no - such operator is available, the parser will coerce the less-general type - to the more-general type and apply the latter's comparison operator. - - - - As shown above, all comparison operators are binary operators that - return values of type boolean. Thus, expressions like - 1 < 2 < 3 are not valid (because there is - no < operator to compare a Boolean value with - 3). Use the BETWEEN predicates - shown below to perform range tests. - - - - There are also some comparison predicates, as shown in . These behave much like - operators, but have special syntax mandated by the SQL standard. - - - - Comparison Predicates - - - - - Predicate - - - Description - - - Example(s) - - - - - - - - datatype BETWEEN datatype AND datatype - boolean - - - Between (inclusive of the range endpoints). - - - 2 BETWEEN 1 AND 3 - t - - - 2 BETWEEN 3 AND 1 - f - - - - - - datatype NOT BETWEEN datatype AND datatype - boolean - - - Not between (the negation of BETWEEN). - - - 2 NOT BETWEEN 1 AND 3 - f - - - - - - datatype BETWEEN SYMMETRIC datatype AND datatype - boolean - - - Between, after sorting the two endpoint values. - - - 2 BETWEEN SYMMETRIC 3 AND 1 - t - - - - - - datatype NOT BETWEEN SYMMETRIC datatype AND datatype - boolean - - - Not between, after sorting the two endpoint values. - - - 2 NOT BETWEEN SYMMETRIC 3 AND 1 - f - - - - - - datatype IS DISTINCT FROM datatype - boolean - - - Not equal, treating null as a comparable value. - - - 1 IS DISTINCT FROM NULL - t (rather than NULL) - - - NULL IS DISTINCT FROM NULL - f (rather than NULL) - - - - - - datatype IS NOT DISTINCT FROM datatype - boolean - - - Equal, treating null as a comparable value. - - - 1 IS NOT DISTINCT FROM NULL - f (rather than NULL) - - - NULL IS NOT DISTINCT FROM NULL - t (rather than NULL) - - - - - - datatype IS NULL - boolean - - - Test whether value is null. - - - 1.5 IS NULL - f - - - - - - datatype IS NOT NULL - boolean - - - Test whether value is not null. - - - 'null' IS NOT NULL - t - - - - - - datatype ISNULL - boolean - - - Test whether value is null (nonstandard syntax). - - - - - - datatype NOTNULL - boolean - - - Test whether value is not null (nonstandard syntax). - - - - - - boolean IS TRUE - boolean - - - Test whether boolean expression yields true. - - - true IS TRUE - t - - - NULL::boolean IS TRUE - f (rather than NULL) - - - - - - boolean IS NOT TRUE - boolean - - - Test whether boolean expression yields false or unknown. - - - true IS NOT TRUE - f - - - NULL::boolean IS NOT TRUE - t (rather than NULL) - - - - - - boolean IS FALSE - boolean - - - Test whether boolean expression yields false. - - - true IS FALSE - f - - - NULL::boolean IS FALSE - f (rather than NULL) - - - - - - boolean IS NOT FALSE - boolean - - - Test whether boolean expression yields true or unknown. 
- - - true IS NOT FALSE - t - - - NULL::boolean IS NOT FALSE - t (rather than NULL) - - - - - - boolean IS UNKNOWN - boolean - - - Test whether boolean expression yields unknown. - - - true IS UNKNOWN - f - - - NULL::boolean IS UNKNOWN - t (rather than NULL) - - - - - - boolean IS NOT UNKNOWN - boolean - - - Test whether boolean expression yields true or false. - - - true IS NOT UNKNOWN - t - - - NULL::boolean IS NOT UNKNOWN - f (rather than NULL) - - - - -
- - - - BETWEEN - - - BETWEEN SYMMETRIC - - The BETWEEN predicate simplifies range tests: - -a BETWEEN x AND y - - is equivalent to - -a >= x AND a <= y - - Notice that BETWEEN treats the endpoint values as included - in the range. - BETWEEN SYMMETRIC is like BETWEEN - except there is no requirement that the argument to the left of - AND be less than or equal to the argument on the right. - If it is not, those two arguments are automatically swapped, so that - a nonempty range is always implied. - - - - The various variants of BETWEEN are implemented in - terms of the ordinary comparison operators, and therefore will work for - any data type(s) that can be compared. - - - - - The use of AND in the BETWEEN - syntax creates an ambiguity with the use of AND as a - logical operator. To resolve this, only a limited set of expression - types are allowed as the second argument of a BETWEEN - clause. If you need to write a more complex sub-expression - in BETWEEN, write parentheses around the - sub-expression. - - - - - - IS DISTINCT FROM - - - IS NOT DISTINCT FROM - - Ordinary comparison operators yield null (signifying unknown), - not true or false, when either input is null. For example, - 7 = NULL yields null, as does 7 <> NULL. When - this behavior is not suitable, use the - IS NOT DISTINCT FROM predicates: - -a IS DISTINCT FROM b -a IS NOT DISTINCT FROM b - - For non-null inputs, IS DISTINCT FROM is - the same as the <> operator. However, if both - inputs are null it returns false, and if only one input is - null it returns true. Similarly, IS NOT DISTINCT - FROM is identical to = for non-null - inputs, but it returns true when both inputs are null, and false when only - one input is null. Thus, these predicates effectively act as though null - were a normal data value, rather than unknown. - - - - - IS NULL - - - IS NOT NULL - - - ISNULL - - - NOTNULL - - To check whether a value is or is not null, use the predicates: - -expression IS NULL -expression IS NOT NULL - - or the equivalent, but nonstandard, predicates: - -expression ISNULL -expression NOTNULL - - null valuecomparing - - - - Do not write - expression = NULL - because NULL is not equal to - NULL. (The null value represents an unknown value, - and it is not known whether two unknown values are equal.) - - - - - Some applications might expect that - expression = NULL - returns true if expression evaluates to - the null value. It is highly recommended that these applications - be modified to comply with the SQL standard. However, if that - cannot be done the - configuration variable is available. If it is enabled, - PostgreSQL will convert x = - NULL clauses to x IS NULL. - - - - - If the expression is row-valued, then - IS NULL is true when the row expression itself is null - or when all the row's fields are null, while - IS NOT NULL is true when the row expression itself is non-null - and all the row's fields are non-null. Because of this behavior, - IS NULL and IS NOT NULL do not always return - inverse results for row-valued expressions; in particular, a row-valued - expression that contains both null and non-null fields will return false - for both tests. 
For example: - - -SELECT ROW(1,2.5,'this is a test') = ROW(1, 3, 'not the same'); - -SELECT ROW(table.*) IS NULL FROM table; -- detect all-null rows - -SELECT ROW(table.*) IS NOT NULL FROM table; -- detect all-non-null rows - -SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in rows - - - In some cases, it may be preferable to - write row IS DISTINCT FROM NULL - or row IS NOT DISTINCT FROM NULL, - which will simply check whether the overall row value is null without any - additional tests on the row fields. - - - - - IS TRUE - - - IS NOT TRUE - - - IS FALSE - - - IS NOT FALSE - - - IS UNKNOWN - - - IS NOT UNKNOWN - - Boolean values can also be tested using the predicates - -boolean_expression IS TRUE -boolean_expression IS NOT TRUE -boolean_expression IS FALSE -boolean_expression IS NOT FALSE -boolean_expression IS UNKNOWN -boolean_expression IS NOT UNKNOWN - - These will always return true or false, never a null value, even when the - operand is null. - A null input is treated as the logical value unknown. - Notice that IS UNKNOWN and IS NOT UNKNOWN are - effectively the same as IS NULL and - IS NOT NULL, respectively, except that the input - expression must be of Boolean type. - - - - Some comparison-related functions are also available, as shown in . - - - - Comparison Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - num_nonnulls - - num_nonnulls ( VARIADIC "any" ) - integer - - - Returns the number of non-null arguments. - - - num_nonnulls(1, NULL, 2) - 2 - - - - - - num_nulls - - num_nulls ( VARIADIC "any" ) - integer - - - Returns the number of null arguments. - - - num_nulls(1, NULL, 2) - 1 - - - - -
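A brief side-by-side sketch of the null-comparison rules described in this
section (illustrative only; note that psql prints a null result as an empty
string unless \pset null is set):

SELECT 7 = NULL;
Result: NULL

SELECT 7 <> NULL;
Result: NULL

SELECT 7 IS DISTINCT FROM NULL;
Result: t

SELECT NULL IS NOT DISTINCT FROM NULL;
Result: t

Ordinary comparisons propagate the unknown, while the DISTINCT FROM forms
treat null as a comparable value.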
- - - - - Mathematical Functions and Operators - - - Mathematical operators are provided for many - PostgreSQL types. For types without - standard mathematical conventions - (e.g., date/time types) we - describe the actual behavior in subsequent sections. - - - - shows the mathematical - operators that are available for the standard numeric types. - Unless otherwise noted, operators shown as - accepting numeric_type are available for all - the types smallint, integer, - bigint, numeric, real, - and double precision. - Operators shown as accepting integral_type - are available for the types smallint, integer, - and bigint. - Except where noted, each form of an operator returns the same data type - as its argument(s). Calls involving multiple argument data types, such - as integer + numeric, - are resolved by using the type appearing later in these lists. - - - - Mathematical Operators - - - - - - Operator - - - Description - - - Example(s) - - - - - - - - numeric_type + numeric_type - numeric_type - - - Addition - - - 2 + 3 - 5 - - - - - - + numeric_type - numeric_type - - - Unary plus (no operation) - - - + 3.5 - 3.5 - - - - - - numeric_type - numeric_type - numeric_type - - - Subtraction - - - 2 - 3 - -1 - - - - - - - numeric_type - numeric_type - - - Negation - - - - (-4) - 4 - - - - - - numeric_type * numeric_type - numeric_type - - - Multiplication - - - 2 * 3 - 6 - - - - - - numeric_type / numeric_type - numeric_type - - - Division (for integral types, division truncates the result towards - zero) - - - 5.0 / 2 - 2.5000000000000000 - - - 5 / 2 - 2 - - - (-5) / 2 - -2 - - - - - - numeric_type % numeric_type - numeric_type - - - Modulo (remainder); available for smallint, - integer, bigint, and numeric - - - 5 % 4 - 1 - - - - - - numeric ^ numeric - numeric - - - double precision ^ double precision - double precision - - - Exponentiation - - - 2 ^ 3 - 8 - - - Unlike typical mathematical practice, multiple uses of - ^ will associate left to right by default: - - - 2 ^ 3 ^ 3 - 512 - - - 2 ^ (3 ^ 3) - 134217728 - - - - - - |/ double precision - double precision - - - Square root - - - |/ 25.0 - 5 - - - - - - ||/ double precision - double precision - - - Cube root - - - ||/ 64.0 - 4 - - - - - - @ numeric_type - numeric_type - - - Absolute value - - - @ -5.0 - 5.0 - - - - - - integral_type & integral_type - integral_type - - - Bitwise AND - - - 91 & 15 - 11 - - - - - - integral_type | integral_type - integral_type - - - Bitwise OR - - - 32 | 3 - 35 - - - - - - integral_type # integral_type - integral_type - - - Bitwise exclusive OR - - - 17 # 5 - 20 - - - - - - ~ integral_type - integral_type - - - Bitwise NOT - - - ~1 - -2 - - - - - - integral_type << integer - integral_type - - - Bitwise shift left - - - 1 << 4 - 16 - - - - - - integral_type >> integer - integral_type - - - Bitwise shift right - - - 8 >> 2 - 2 - - - - - -
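To make the integral division and modulo rules above concrete, here is a
small illustrative sketch (not part of the original table; results assume a
stock installation). Division truncates toward zero, and the sign of a
% result follows the dividend:

SELECT 5 / 2;
Result: 2

SELECT (-5) / 2;
Result: -2

SELECT 5 % 4;
Result: 1

SELECT (-5) % 4;
Result: -1

Because the quotient truncates toward zero, the identity
(a / b) * b + (a % b) = a holds for all integral inputs.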
- - - shows the available - mathematical functions. - Many of these functions are provided in multiple forms with different - argument types. - Except where noted, any given form of a function returns the same - data type as its argument(s); cross-type cases are resolved in the - same way as explained above for operators. - The functions working with double precision data are mostly - implemented on top of the host system's C library; accuracy and behavior in - boundary cases can therefore vary depending on the host system. - - - - Mathematical Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - abs - - abs ( numeric_type ) - numeric_type - - - Absolute value - - - abs(-17.4) - 17.4 - - - - - - - cbrt - - cbrt ( double precision ) - double precision - - - Cube root - - - cbrt(64.0) - 4 - - - - - - - ceil - - ceil ( numeric ) - numeric - - - ceil ( double precision ) - double precision - - - Nearest integer greater than or equal to argument - - - ceil(42.2) - 43 - - - ceil(-42.8) - -42 - - - - - - - ceiling - - ceiling ( numeric ) - numeric - - - ceiling ( double precision ) - double precision - - - Nearest integer greater than or equal to argument (same - as ceil) - - - ceiling(95.3) - 96 - - - - - - - degrees - - degrees ( double precision ) - double precision - - - Converts radians to degrees - - - degrees(0.5) - 28.64788975654116 - - - - - - - div - - div ( y numeric, - x numeric ) - numeric - - - Integer quotient of y/x - (truncates towards zero) - - - div(9, 4) - 2 - - - - - - - erf - - erf ( double precision ) - double precision - - - Error function - - - erf(1.0) - 0.8427007929497149 - - - - - - - erfc - - erfc ( double precision ) - double precision - - - Complementary error function (1 - erf(x), without - loss of precision for large inputs) - - - erfc(1.0) - 0.15729920705028513 - - - - - - - exp - - exp ( numeric ) - numeric - - - exp ( double precision ) - double precision - - - Exponential (e raised to the given power) - - - exp(1.0) - 2.7182818284590452 - - - - - - - factorial - - factorial ( bigint ) - numeric - - - Factorial - - - factorial(5) - 120 - - - - - - - floor - - floor ( numeric ) - numeric - - - floor ( double precision ) - double precision - - - Nearest integer less than or equal to argument - - - floor(42.8) - 42 - - - floor(-42.8) - -43 - - - - - - - gamma - - gamma ( double precision ) - double precision - - - Gamma function - - - gamma(0.5) - 1.772453850905516 - - - gamma(6) - 120 - - - - - - - gcd - - gcd ( numeric_type, numeric_type ) - numeric_type - - - Greatest common divisor (the largest positive number that divides both - inputs with no remainder); returns 0 if both inputs - are zero; available for integer, bigint, - and numeric - - - gcd(1071, 462) - 21 - - - - - - - lcm - - lcm ( numeric_type, numeric_type ) - numeric_type - - - Least common multiple (the smallest strictly positive number that is - an integral multiple of both inputs); returns 0 if - either input is zero; available for integer, - bigint, and numeric - - - lcm(1071, 462) - 23562 - - - - - - - lgamma - - lgamma ( double precision ) - double precision - - - Natural logarithm of the absolute value of the gamma function - - - lgamma(1000) - 5905.220423209181 - - - - - - - ln - - ln ( numeric ) - numeric - - - ln ( double precision ) - double precision - - - Natural logarithm - - - ln(2.0) - 0.6931471805599453 - - - - - - - log - - log ( numeric ) - numeric - - - log ( double precision ) - double precision - - - Base 10 logarithm - - - log(100) - 2 - - - - - - 
- log10 - - log10 ( numeric ) - numeric - - - log10 ( double precision ) - double precision - - - Base 10 logarithm (same as log) - - - log10(1000) - 3 - - - - - - log ( b numeric, - x numeric ) - numeric - - - Logarithm of x to base b - - - log(2.0, 64.0) - 6.0000000000000000 - - - - - - - min_scale - - min_scale ( numeric ) - integer - - - Minimum scale (number of fractional decimal digits) needed - to represent the supplied value precisely - - - min_scale(8.4100) - 2 - - - - - - - mod - - mod ( y numeric_type, - x numeric_type ) - numeric_type - - - Remainder of y/x; - available for smallint, integer, - bigint, and numeric - - - mod(9, 4) - 1 - - - - - - - pi - - pi ( ) - double precision - - - Approximate value of π - - - pi() - 3.141592653589793 - - - - - - - power - - power ( a numeric, - b numeric ) - numeric - - - power ( a double precision, - b double precision ) - double precision - - - a raised to the power of b - - - power(9, 3) - 729 - - - - - - - radians - - radians ( double precision ) - double precision - - - Converts degrees to radians - - - radians(45.0) - 0.7853981633974483 - - - - - - - round - - round ( numeric ) - numeric - - - round ( double precision ) - double precision - - - Rounds to nearest integer. For numeric, ties are - broken by rounding away from zero. For double precision, - the tie-breaking behavior is platform dependent, but - round to nearest even is the most common rule. - - - round(42.4) - 42 - - - - - - round ( v numeric, s integer ) - numeric - - - Rounds v to s decimal - places. Ties are broken by rounding away from zero. - - - round(42.4382, 2) - 42.44 - - - round(1234.56, -1) - 1230 - - - - - - - scale - - scale ( numeric ) - integer - - - Scale of the argument (the number of decimal digits in the fractional part) - - - scale(8.4100) - 4 - - - - - - - sign - - sign ( numeric ) - numeric - - - sign ( double precision ) - double precision - - - Sign of the argument (-1, 0, or +1) - - - sign(-8.4) - -1 - - - - - - - sqrt - - sqrt ( numeric ) - numeric - - - sqrt ( double precision ) - double precision - - - Square root - - - sqrt(2) - 1.4142135623730951 - - - - - - - trim_scale - - trim_scale ( numeric ) - numeric - - - Reduces the value's scale (number of fractional decimal digits) by - removing trailing zeroes - - - trim_scale(8.4100) - 8.41 - - - - - - - trunc - - trunc ( numeric ) - numeric - - - trunc ( double precision ) - double precision - - - Truncates to integer (towards zero) - - - trunc(42.8) - 42 - - - trunc(-42.8) - -42 - - - - - - trunc ( v numeric, s integer ) - numeric - - - Truncates v to s - decimal places - - - trunc(42.4382, 2) - 42.43 - - - - - - - width_bucket - - width_bucket ( operand numeric, low numeric, high numeric, count integer ) - integer - - - width_bucket ( operand double precision, low double precision, high double precision, count integer ) - integer - - - Returns the number of the bucket in - which operand falls in a histogram - having count equal-width buckets spanning the - range low to high. - Returns 0 - or count+1 for an input - outside that range. - - - width_bucket(5.35, 0.024, 10.06, 5) - 3 - - - - - - width_bucket ( operand anycompatible, thresholds anycompatiblearray ) - integer - - - Returns the number of the bucket in - which operand falls given an array listing the - lower bounds of the buckets. Returns 0 for an - input less than the first lower - bound. operand and the array elements can be - of any type having standard comparison operators. 
- The thresholds array must be - sorted, smallest first, or unexpected results will be - obtained. - - - width_bucket(now(), array['yesterday', 'today', 'tomorrow']::timestamptz[]) - 2 - - - - -
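The rounding and truncation entries above differ in ways that are easy to
misread, so a brief sketch may help (illustrative only; the double
precision result is platform dependent, as noted in the table):

SELECT round(2.5);
Result: 3

SELECT round(-2.5);
Result: -3

SELECT round(2.5::double precision);
Result: 2 (on platforms that round ties to nearest even)

SELECT trunc(2.7);
Result: 2

SELECT trunc(-2.7);
Result: -2

That is, numeric rounding breaks ties away from zero, while trunc always
discards the fractional part, moving toward zero.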
- - - shows functions for - generating random numbers. - - - - Random Functions - - - - - - Function - - - Description - - - Example(s) - - - - - - - - - random - - random ( ) - double precision - - - Returns a random value in the range 0.0 <= x < 1.0 - - - random() - 0.897124072839091 - - - - - - - random - - random ( min integer, max integer ) - integer - - - random ( min bigint, max bigint ) - bigint - - - random ( min numeric, max numeric ) - numeric - - - Returns a random value in the range - min <= x <= max. - For type numeric, the result will have the same number of - fractional decimal digits as min or - max, whichever has more. - - - random(1, 10) - 7 - - - random(-0.499, 0.499) - 0.347 - - - - - - - random_normal - - - random_normal ( - mean double precision - , stddev double precision ) - double precision - - - Returns a random value from the normal distribution with the given - parameters; mean defaults to 0.0 - and stddev defaults to 1.0 - - - random_normal(0.0, 1.0) - 0.051285419 - - - - - - - setseed - - setseed ( double precision ) - void - - - Sets the seed for subsequent random() and - random_normal() calls; - argument must be between -1.0 and 1.0, inclusive - - - setseed(0.12345) - - - - -
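As a sketch of the reproducibility property explained in the note that
follows (the value returned is arbitrary; the point is that the two draws
match after re-seeding):

SELECT setseed(0.42);
SELECT random() AS first_draw;

SELECT setseed(0.42);
SELECT random() AS second_draw;  -- same value as first_draw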
- - - The random() and random_normal() - functions listed in use a - deterministic pseudo-random number generator. - It is fast but not suitable for cryptographic - applications; see the module for a more - secure alternative. - If setseed() is called, the series of results of - subsequent calls to these functions in the current session - can be repeated by re-issuing setseed() with the same - argument. - Without any prior setseed() call in the same - session, the first call to any of these functions obtains a seed - from a platform-dependent source of random bits. - - - - shows the - available trigonometric functions. Each of these functions comes in - two variants, one that measures angles in radians and one that - measures angles in degrees. - - - - Trigonometric Functions - - - - - - Function - - - Description - - - Example(s) - - - - - - - - - acos - - acos ( double precision ) - double precision - - - Inverse cosine, result in radians - - - acos(1) - 0 - - - - - - - acosd - - acosd ( double precision ) - double precision - - - Inverse cosine, result in degrees - - - acosd(0.5) - 60 - - - - - - - asin - - asin ( double precision ) - double precision - - - Inverse sine, result in radians - - - asin(1) - 1.5707963267948966 - - - - - - - asind - - asind ( double precision ) - double precision - - - Inverse sine, result in degrees - - - asind(0.5) - 30 - - - - - - - atan - - atan ( double precision ) - double precision - - - Inverse tangent, result in radians - - - atan(1) - 0.7853981633974483 - - - - - - - atand - - atand ( double precision ) - double precision - - - Inverse tangent, result in degrees - - - atand(1) - 45 - - - - - - - atan2 - - atan2 ( y double precision, - x double precision ) - double precision - - - Inverse tangent of - y/x, - result in radians - - - atan2(1, 0) - 1.5707963267948966 - - - - - - - atan2d - - atan2d ( y double precision, - x double precision ) - double precision - - - Inverse tangent of - y/x, - result in degrees - - - atan2d(1, 0) - 90 - - - - - - - cos - - cos ( double precision ) - double precision - - - Cosine, argument in radians - - - cos(0) - 1 - - - - - - - cosd - - cosd ( double precision ) - double precision - - - Cosine, argument in degrees - - - cosd(60) - 0.5 - - - - - - - cot - - cot ( double precision ) - double precision - - - Cotangent, argument in radians - - - cot(0.5) - 1.830487721712452 - - - - - - - cotd - - cotd ( double precision ) - double precision - - - Cotangent, argument in degrees - - - cotd(45) - 1 - - - - - - - sin - - sin ( double precision ) - double precision - - - Sine, argument in radians - - - sin(1) - 0.8414709848078965 - - - - - - - sind - - sind ( double precision ) - double precision - - - Sine, argument in degrees - - - sind(30) - 0.5 - - - - - - - tan - - tan ( double precision ) - double precision - - - Tangent, argument in radians - - - tan(1) - 1.5574077246549023 - - - - - - - tand - - tand ( double precision ) - double precision - - - Tangent, argument in degrees - - - tand(45) - 1 - - - - -
- - - - Another way to work with angles measured in degrees is to use the unit - transformation functions radians() - and degrees() shown earlier. - However, using the degree-based trigonometric functions is preferred, - as that way avoids round-off error for special cases such - as sind(30). - - - - - shows the - available hyperbolic functions. - - - - Hyperbolic Functions - - - - - - Function - - - Description - - - Example(s) - - - - - - - - - sinh - - sinh ( double precision ) - double precision - - - Hyperbolic sine - - - sinh(1) - 1.1752011936438014 - - - - - - - cosh - - cosh ( double precision ) - double precision - - - Hyperbolic cosine - - - cosh(0) - 1 - - - - - - - tanh - - tanh ( double precision ) - double precision - - - Hyperbolic tangent - - - tanh(1) - 0.7615941559557649 - - - - - - - asinh - - asinh ( double precision ) - double precision - - - Inverse hyperbolic sine - - - asinh(1) - 0.881373587019543 - - - - - - - acosh - - acosh ( double precision ) - double precision - - - Inverse hyperbolic cosine - - - acosh(1) - 0 - - - - - - - atanh - - atanh ( double precision ) - double precision - - - Inverse hyperbolic tangent - - - atanh(0.5) - 0.5493061443340548 - - - - -
- -
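The earlier recommendation to prefer the degree-based trigonometric
functions can be made concrete with a short sketch (hypothetical session
output; the inexact result shown is typical of IEEE 754 platforms but may
vary):

SELECT sind(30);
Result: 0.5

SELECT sin(radians(30));
Result: 0.49999999999999994

The degree-based variant handles special angles such as 30 exactly, whereas
converting to radians first introduces floating-point round-off.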
- - - - String Functions and Operators - - - This section describes functions and operators for examining and - manipulating string values. Strings in this context include values - of the types character, character varying, - and text. Except where noted, these functions and operators - are declared to accept and return type text. They will - interchangeably accept character varying arguments. - Values of type character will be converted - to text before the function or operator is applied, resulting - in stripping any trailing spaces in the character value. - - - - SQL defines some string functions that use - key words, rather than commas, to separate - arguments. Details are in - . - PostgreSQL also provides versions of these functions - that use the regular function invocation syntax - (see ). - - - - - The string concatenation operator (||) will accept - non-string input, so long as at least one input is of string type, as shown - in . For other cases, inserting an - explicit coercion to text can be used to have non-string input - accepted. - - - - - <acronym>SQL</acronym> String Functions and Operators - - - - - Function/Operator - - - Description - - - Example(s) - - - - - - - - - character string - concatenation - - text || text - text - - - Concatenates the two strings. - - - 'Post' || 'greSQL' - PostgreSQL - - - - - - text || anynonarray - text - - - anynonarray || text - text - - - Converts the non-string input to text, then concatenates the two - strings. (The non-string input cannot be of an array type, because - that would create ambiguity with the array || - operators. If you want to concatenate an array's text equivalent, - cast it to text explicitly.) - - - 'Value: ' || 42 - Value: 42 - - - - - - - btrim - - btrim ( string text - , characters text ) - text - - - Removes the longest string containing only characters - in characters (a space by default) - from the start and end of string. - - - btrim('xyxtrimyyx', 'xyz') - trim - - - - - - - normalized - - - Unicode normalization - - text IS NOT form NORMALIZED - boolean - - - Checks whether the string is in the specified Unicode normalization - form. The optional form key word specifies the - form: NFC (the default), NFD, - NFKC, or NFKD. This expression can - only be used when the server encoding is UTF8. Note - that checking for normalization using this expression is often faster - than normalizing possibly already normalized strings. - - - U&'\0061\0308bc' IS NFD NORMALIZED - t - - - - - - - bit_length - - bit_length ( text ) - integer - - - Returns number of bits in the string (8 - times the octet_length). - - - bit_length('jose') - 32 - - - - - - - char_length - - - character string - length - - - length - of a character string - character string, length - - char_length ( text ) - integer - - - - character_length - - character_length ( text ) - integer - - - Returns number of characters in the string. - - - char_length('josé') - 4 - - - - - - - lower - - lower ( text ) - text - - - Converts the string to all lower case, according to the rules of the - database's locale. - - - lower('TOM') - tom - - - - - - - lpad - - lpad ( string text, - length integer - , fill text ) - text - - - Extends the string to length - length by prepending the characters - fill (a space by default). If the - string is already longer than - length then it is truncated (on the right). 
- - - lpad('hi', 5, 'xy') - xyxhi - - - - - - - ltrim - - ltrim ( string text - , characters text ) - text - - - Removes the longest string containing only characters in - characters (a space by default) from the start of - string. - - - ltrim('zzzytest', 'xyz') - test - - - - - - - normalize - - - Unicode normalization - - normalize ( text - , form ) - text - - - Converts the string to the specified Unicode - normalization form. The optional form key word - specifies the form: NFC (the default), - NFD, NFKC, or - NFKD. This function can only be used when the - server encoding is UTF8. - - - normalize(U&'\0061\0308bc', NFC) - U&'\00E4bc' - - - - - - - octet_length - - octet_length ( text ) - integer - - - Returns number of bytes in the string. - - - octet_length('josé') - 5 (if server encoding is UTF8) - - - - - - - octet_length - - octet_length ( character ) - integer - - - Returns number of bytes in the string. Since this version of the - function accepts type character directly, it will not - strip trailing spaces. - - - octet_length('abc '::character(4)) - 4 - - - - - - - overlay - - overlay ( string text PLACING newsubstring text FROM start integer FOR count integer ) - text - - - Replaces the substring of string that starts at - the start'th character and extends - for count characters - with newsubstring. - If count is omitted, it defaults to the length - of newsubstring. - - - overlay('Txxxxas' placing 'hom' from 2 for 4) - Thomas - - - - - - - position - - position ( substring text IN string text ) - integer - - - Returns first starting index of the specified - substring within - string, or zero if it's not present. - - - position('om' in 'Thomas') - 3 - - - - - - - rpad - - rpad ( string text, - length integer - , fill text ) - text - - - Extends the string to length - length by appending the characters - fill (a space by default). If the - string is already longer than - length then it is truncated. - - - rpad('hi', 5, 'xy') - hixyx - - - - - - - rtrim - - rtrim ( string text - , characters text ) - text - - - Removes the longest string containing only characters in - characters (a space by default) from the end of - string. - - - rtrim('testxxzx', 'xyz') - test - - - - - - - substring - - substring ( string text FROM start integer FOR count integer ) - text - - - Extracts the substring of string starting at - the start'th character if that is specified, - and stopping after count characters if that is - specified. Provide at least one of start - and count. - - - substring('Thomas' from 2 for 3) - hom - - - substring('Thomas' from 3) - omas - - - substring('Thomas' for 2) - Th - - - - - - substring ( string text FROM pattern text ) - text - - - Extracts the first substring matching POSIX regular expression; see - . - - - substring('Thomas' from '...$') - mas - - - - - - substring ( string text SIMILAR pattern text ESCAPE escape text ) - text - - - substring ( string text FROM pattern text FOR escape text ) - text - - - Extracts the first substring matching SQL regular expression; - see . The first form has - been specified since SQL:2003; the second form was only in SQL:1999 - and should be considered obsolete. - - - substring('Thomas' similar '%#"o_a#"_' escape '#') - oma - - - - - - - trim - - trim ( LEADING | TRAILING | BOTH - characters text FROM - string text ) - text - - - Removes the longest string containing only characters in - characters (a space by default) from the - start, end, or both ends (BOTH is the default) - of string. 
- - - trim(both 'xyz' from 'yxTomxx') - Tom - - - - - - trim ( LEADING | TRAILING | BOTH FROM - string text , - characters text ) - text - - - This is a non-standard syntax for trim(). - - - trim(both from 'yxTomxx', 'xyz') - Tom - - - - - - - unicode_assigned - - unicode_assigned ( text ) - boolean - - - Returns true if all characters in the string are - assigned Unicode codepoints; false otherwise. This - function can only be used when the server encoding is - UTF8. - - - - - - - upper - - upper ( text ) - text - - - Converts the string to all upper case, according to the rules of the - database's locale. - - - upper('tom') - TOM - - - - -
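As noted in the introduction to this section, the SQL keyword-based
functions above also have regular-invocation counterparts; a quick sketch
of the two spellings side by side (illustrative only, using substr from the
additional functions listed below):

SELECT substring('Thomas' FROM 2 FOR 3);
Result: hom

SELECT substr('Thomas', 2, 3);
Result: hom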
- - - Additional string manipulation functions and operators are available - and are listed in . (Some of - these are used internally to implement - the SQL-standard string functions listed in - .) - There are also pattern-matching operators, which are described in - , and operators for full-text - search, which are described in . - - - - Other String Functions and Operators - - - - - Function/Operator - - - Description - - - Example(s) - - - - - - - - - character string - prefix test - - text ^@ text - boolean - - - Returns true if the first string starts with the second string - (equivalent to the starts_with() function). - - - 'alphabet' ^@ 'alph' - t - - - - - - - ascii - - ascii ( text ) - integer - - - Returns the numeric code of the first character of the argument. - In UTF8 encoding, returns the Unicode code point - of the character. In other multibyte encodings, the argument must - be an ASCII character. - - - ascii('x') - 120 - - - - - - - chr - - chr ( integer ) - text - - - Returns the character with the given code. In UTF8 - encoding the argument is treated as a Unicode code point. In other - multibyte encodings the argument must designate - an ASCII character. chr(0) is - disallowed because text data types cannot store that character. - - - chr(65) - A - - - - - - - concat - - concat ( val1 "any" - , val2 "any" , ... ) - text - - - Concatenates the text representations of all the arguments. - NULL arguments are ignored. - - - concat('abcde', 2, NULL, 22) - abcde222 - - - - - - - concat_ws - - concat_ws ( sep text, - val1 "any" - , val2 "any" , ... ) - text - - - Concatenates all but the first argument, with separators. The first - argument is used as the separator string, and should not be NULL. - Other NULL arguments are ignored. - - - concat_ws(',', 'abcde', 2, NULL, 22) - abcde,2,22 - - - - - - - format - - format ( formatstr text - , formatarg "any" , ... ) - text - - - Formats arguments according to a format string; - see . - This function is similar to the C function sprintf. - - - format('Hello %s, %1$s', 'World') - Hello World, World - - - - - - - initcap - - initcap ( text ) - text - - - Converts the first letter of each word to upper case and the - rest to lower case. Words are sequences of alphanumeric - characters separated by non-alphanumeric characters. - - - initcap('hi THOMAS') - Hi Thomas - - - - - - - casefold - - casefold ( text ) - text - - - Performs case folding of the input string according to the collation. - Case folding is similar to case conversion, but the purpose of case - folding is to facilitate case-insensitive matching of strings, - whereas the purpose of case conversion is to convert to a particular - cased form. This function can only be used when the server encoding - is UTF8. - - - Ordinarily, case folding simply converts to lowercase, but there may - be exceptions depending on the collation. For instance, some - characters have more than two lowercase variants, or fold to uppercase. - - - Case folding may change the length of the string. For instance, in - the PG_UNICODE_FAST collation, ß - (U+00DF) folds to ss. - - - casefold can be used for Unicode Default Caseless - Matching. It does not always preserve the normalized form of the - input string (see ). - - - The libc provider doesn't support case folding, so - casefold is identical to . - - - - - - - left - - left ( string text, - n integer ) - text - - - Returns first n characters in the - string, or when n is negative, returns - all but last |n| characters. 
- - - left('abcde', 2) - ab - - - - - - - length - - length ( text ) - integer - - - Returns the number of characters in the string. - - - length('jose') - 4 - - - - - - - md5 - - md5 ( text ) - text - - - Computes the MD5 hash of - the argument, with the result written in hexadecimal. - - - md5('abc') - 900150983cd24fb0&zwsp;d6963f7d28e17f72 - - - - - - - parse_ident - - parse_ident ( qualified_identifier text - , strict_mode boolean DEFAULT true ) - text[] - - - Splits qualified_identifier into an array of - identifiers, removing any quoting of individual identifiers. By - default, extra characters after the last identifier are considered an - error; but if the second parameter is false, then such - extra characters are ignored. (This behavior is useful for parsing - names for objects like functions.) Note that this function does not - truncate over-length identifiers. If you want truncation you can cast - the result to name[]. - - - parse_ident('"SomeSchema".someTable') - {SomeSchema,sometable} - - - - - - - pg_client_encoding - - pg_client_encoding ( ) - name - - - Returns current client encoding name. - - - pg_client_encoding() - UTF8 - - - - - - - quote_ident - - quote_ident ( text ) - text - - - Returns the given string suitably quoted to be used as an identifier - in an SQL statement string. - Quotes are added only if necessary (i.e., if the string contains - non-identifier characters or would be case-folded). - Embedded quotes are properly doubled. - See also . - - - quote_ident('Foo bar') - "Foo bar" - - - - - - - quote_literal - - quote_literal ( text ) - text - - - Returns the given string suitably quoted to be used as a string literal - in an SQL statement string. - Embedded single-quotes and backslashes are properly doubled. - Note that quote_literal returns null on null - input; if the argument might be null, - quote_nullable is often more suitable. - See also . - - - quote_literal(E'O\'Reilly') - 'O''Reilly' - - - - - - quote_literal ( anyelement ) - text - - - Converts the given value to text and then quotes it as a literal. - Embedded single-quotes and backslashes are properly doubled. - - - quote_literal(42.5) - '42.5' - - - - - - - quote_nullable - - quote_nullable ( text ) - text - - - Returns the given string suitably quoted to be used as a string literal - in an SQL statement string; or, if the argument - is null, returns NULL. - Embedded single-quotes and backslashes are properly doubled. - See also . - - - quote_nullable(NULL) - NULL - - - - - - quote_nullable ( anyelement ) - text - - - Converts the given value to text and then quotes it as a literal; - or, if the argument is null, returns NULL. - Embedded single-quotes and backslashes are properly doubled. - - - quote_nullable(42.5) - '42.5' - - - - - - - regexp_count - - regexp_count ( string text, pattern text - , start integer - , flags text ) - integer - - - Returns the number of times the POSIX regular - expression pattern matches in - the string; see - . - - - regexp_count('123456789012', '\d\d\d', 2) - 3 - - - - - - - regexp_instr - - regexp_instr ( string text, pattern text - , start integer - , N integer - , endoption integer - , flags text - , subexpr integer ) - integer - - - Returns the position within string where - the N'th match of the POSIX regular - expression pattern occurs, or zero if there is - no such match; see . 
- - - regexp_instr('ABCDEF', 'c(.)(..)', 1, 1, 0, 'i') - 3 - - - regexp_instr('ABCDEF', 'c(.)(..)', 1, 1, 0, 'i', 2) - 5 - - - - - - - regexp_like - - regexp_like ( string text, pattern text - , flags text ) - boolean - - - Checks whether a match of the POSIX regular - expression pattern occurs - within string; see - . - - - regexp_like('Hello World', 'world$', 'i') - t - - - - - - - regexp_match - - regexp_match ( string text, pattern text , flags text ) - text[] - - - Returns substrings within the first match of the POSIX regular - expression pattern to - the string; see - . - - - regexp_match('foobarbequebaz', '(bar)(beque)') - {bar,beque} - - - - - - - regexp_matches - - regexp_matches ( string text, pattern text , flags text ) - setof text[] - - - Returns substrings within the first match of the POSIX regular - expression pattern to - the string, or substrings within all - such matches if the g flag is used; - see . - - - regexp_matches('foobarbequebaz', 'ba.', 'g') - - - {bar} - {baz} - - - - - - - - regexp_replace - - regexp_replace ( string text, pattern text, replacement text - , flags text ) - text - - - Replaces the substring that is the first match to the POSIX - regular expression pattern, or all such - matches if the g flag is used; see - . - - - regexp_replace('Thomas', '.[mN]a.', 'M') - ThM - - - - - - regexp_replace ( string text, pattern text, replacement text, - start integer - , N integer - , flags text ) - text - - - Replaces the substring that is the N'th - match to the POSIX regular expression pattern, - or all such matches if N is zero, with the - search beginning at the start'th character - of string. If N is - omitted, it defaults to 1. See - . - - - regexp_replace('Thomas', '.', 'X', 3, 2) - ThoXas - - - regexp_replace(string=>'hello world', pattern=>'l', replacement=>'XX', start=>1, "N"=>2) - helXXo world - - - - - - - regexp_split_to_array - - regexp_split_to_array ( string text, pattern text , flags text ) - text[] - - - Splits string using a POSIX regular - expression as the delimiter, producing an array of results; see - . - - - regexp_split_to_array('hello world', '\s+') - {hello,world} - - - - - - - regexp_split_to_table - - regexp_split_to_table ( string text, pattern text , flags text ) - setof text - - - Splits string using a POSIX regular - expression as the delimiter, producing a set of results; see - . - - - regexp_split_to_table('hello world', '\s+') - - - hello - world - - - - - - - - regexp_substr - - regexp_substr ( string text, pattern text - , start integer - , N integer - , flags text - , subexpr integer ) - text - - - Returns the substring within string that - matches the N'th occurrence of the POSIX - regular expression pattern, - or NULL if there is no such match; see - . - - - regexp_substr('ABCDEF', 'c(.)(..)', 1, 1, 'i') - CDEF - - - regexp_substr('ABCDEF', 'c(.)(..)', 1, 1, 'i', 2) - EF - - - - - - - repeat - - repeat ( string text, number integer ) - text - - - Repeats string the specified - number of times. - - - repeat('Pg', 4) - PgPgPgPg - - - - - - - replace - - replace ( string text, - from text, - to text ) - text - - - Replaces all occurrences in string of - substring from with - substring to. - - - replace('abcdefabcdef', 'cd', 'XX') - abXXefabXXef - - - - - - - reverse - - reverse ( text ) - text - - - Reverses the order of the characters in the string. 
- - - reverse('abcde') - edcba - - - - - - - right - - right ( string text, - n integer ) - text - - - Returns last n characters in the string, - or when n is negative, returns all but - first |n| characters. - - - right('abcde', 2) - de - - - - - - - split_part - - split_part ( string text, - delimiter text, - n integer ) - text - - - Splits string at occurrences - of delimiter and returns - the n'th field (counting from one), - or when n is negative, returns - the |n|'th-from-last field. - - - split_part('abc~@~def~@~ghi', '~@~', 2) - def - - - split_part('abc,def,ghi,jkl', ',', -2) - ghi - - - - - - - starts_with - - starts_with ( string text, prefix text ) - boolean - - - Returns true if string starts - with prefix. - - - starts_with('alphabet', 'alph') - t - - - - - - - string_to_array - - string_to_array ( string text, delimiter text , null_string text ) - text[] - - - Splits the string at occurrences - of delimiter and forms the resulting fields - into a text array. - If delimiter is NULL, - each character in the string will become a - separate element in the array. - If delimiter is an empty string, then - the string is treated as a single field. - If null_string is supplied and is - not NULL, fields matching that string are - replaced by NULL. - See also array_to_string. - - - string_to_array('xx~~yy~~zz', '~~', 'yy') - {xx,NULL,zz} - - - - - - - string_to_table - - string_to_table ( string text, delimiter text , null_string text ) - setof text - - - Splits the string at occurrences - of delimiter and returns the resulting fields - as a set of text rows. - If delimiter is NULL, - each character in the string will become a - separate row of the result. - If delimiter is an empty string, then - the string is treated as a single field. - If null_string is supplied and is - not NULL, fields matching that string are - replaced by NULL. - - - string_to_table('xx~^~yy~^~zz', '~^~', 'yy') - - - xx - NULL - zz - - - - - - - - strpos - - strpos ( string text, substring text ) - integer - - - Returns first starting index of the specified substring - within string, or zero if it's not present. - (Same as position(substring in - string), but note the reversed - argument order.) - - - strpos('high', 'ig') - 2 - - - - - - - substr - - substr ( string text, start integer , count integer ) - text - - - Extracts the substring of string starting at - the start'th character, - and extending for count characters if that is - specified. (Same - as substring(string - from start - for count).) - - - substr('alphabet', 3) - phabet - - - substr('alphabet', 3, 2) - ph - - - - - - - to_ascii - - to_ascii ( string text ) - text - - - to_ascii ( string text, - encoding name ) - text - - - to_ascii ( string text, - encoding integer ) - text - - - Converts string to ASCII - from another encoding, which may be identified by name or number. - If encoding is omitted the database encoding - is assumed (which in practice is the only useful case). - The conversion consists primarily of dropping accents. - Conversion is only supported - from LATIN1, LATIN2, - LATIN9, and WIN1250 encodings. - (See the module for another, more flexible - solution.) - - - to_ascii('Karél') - Karel - - - - - - - to_bin - - to_bin ( integer ) - text - - - to_bin ( bigint ) - text - - - Converts the number to its equivalent two's complement binary - representation. 
- - - to_bin(2147483647) - 1111111111111111111111111111111 - - - to_bin(-1234) - 11111111111111111111101100101110 - - - - - - - to_hex - - to_hex ( integer ) - text - - - to_hex ( bigint ) - text - - - Converts the number to its equivalent two's complement hexadecimal - representation. - - - to_hex(2147483647) - 7fffffff - - - to_hex(-1234) - fffffb2e - - - - - - - to_oct - - to_oct ( integer ) - text - - - to_oct ( bigint ) - text - - - Converts the number to its equivalent two's complement octal - representation. - - - to_oct(2147483647) - 17777777777 - - - to_oct(-1234) - 37777775456 - - - - - - - translate - - translate ( string text, - from text, - to text ) - text - - - Replaces each character in string that - matches a character in the from set with the - corresponding character in the to - set. If from is longer than - to, occurrences of the extra characters in - from are deleted. - - - translate('12345', '143', 'ax') - a2x5 - - - - - - - unistr - - unistr ( text ) - text - - - Evaluate escaped Unicode characters in the argument. Unicode characters - can be specified as - \XXXX (4 hexadecimal - digits), \+XXXXXX (6 - hexadecimal digits), - \uXXXX (4 hexadecimal - digits), or \UXXXXXXXX - (8 hexadecimal digits). To specify a backslash, write two - backslashes. All other characters are taken literally. - - - - If the server encoding is not UTF-8, the Unicode code point identified - by one of these escape sequences is converted to the actual server - encoding; an error is reported if that's not possible. - - - - This function provides a (non-standard) alternative to string - constants with Unicode escapes (see ). - - - - unistr('d\0061t\+000061') - data - - - unistr('d\u0061t\U00000061') - data - - - - - -
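The paragraph that follows explains that concat, concat_ws, and format are
variadic; as a minimal sketch of that calling convention, the arguments can
also be supplied as a single array marked VARIADIC (illustrative only):

SELECT concat_ws(',', VARIADIC ARRAY['a', 'b', 'c']);
Result: a,b,c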
- - - The concat, concat_ws and - format functions are variadic, so it is possible to - pass the values to be concatenated or formatted as an array marked with - the VARIADIC keyword (see ). The array's elements are - treated as if they were separate ordinary arguments to the function. - If the variadic array argument is NULL, concat - and concat_ws return NULL, but - format treats a NULL as a zero-element array. - - - - See also the aggregate function string_agg in - , and the functions for - converting between strings and the bytea type in - . - - - - <function>format</function> - - - format - - - - The function format produces output formatted according to - a format string, in a style similar to the C function - sprintf. - - - - -format(formatstr text , formatarg "any" , ... ) - - formatstr is a format string that specifies how the - result should be formatted. Text in the format string is copied - directly to the result, except where format specifiers are - used. Format specifiers act as placeholders in the string, defining how - subsequent function arguments should be formatted and inserted into the - result. Each formatarg argument is converted to text - according to the usual output rules for its data type, and then formatted - and inserted into the result string according to the format specifier(s). - - - - Format specifiers are introduced by a % character and have - the form - -%[position][flags][width]type - - where the component fields are: - - - - position (optional) - - - A string of the form n$ where - n is the index of the argument to print. - Index 1 means the first argument after - formatstr. If the position is - omitted, the default is to use the next argument in sequence. - - - - - - flags (optional) - - - Additional options controlling how the format specifier's output is - formatted. Currently the only supported flag is a minus sign - (-) which will cause the format specifier's output to be - left-justified. This has no effect unless the width - field is also specified. - - - - - - width (optional) - - - Specifies the minimum number of characters to use to - display the format specifier's output. The output is padded on the - left or right (depending on the - flag) with spaces as - needed to fill the width. A too-small width does not cause - truncation of the output, but is simply ignored. The width may be - specified using any of the following: a positive integer; an - asterisk (*) to use the next function argument as the - width; or a string of the form *n$ to - use the nth function argument as the width. - - - - If the width comes from a function argument, that argument is - consumed before the argument that is used for the format specifier's - value. If the width argument is negative, the result is left - aligned (as if the - flag had been specified) within a - field of length abs(width). - - - - - - type (required) - - - The type of format conversion to use to produce the format - specifier's output. The following types are supported: - - - - s formats the argument value as a simple - string. A null value is treated as an empty string. - - - - - I treats the argument value as an SQL - identifier, double-quoting it if necessary. - It is an error for the value to be null (equivalent to - quote_ident). - - - - - L quotes the argument value as an SQL literal. - A null value is displayed as the string NULL, without - quotes (equivalent to quote_nullable). 
- - - - - - - - - - - In addition to the format specifiers described above, the special sequence - %% may be used to output a literal % character. - - - - Here are some examples of the basic format conversions: - - -SELECT format('Hello %s', 'World'); -Result: Hello World - -SELECT format('Testing %s, %s, %s, %%', 'one', 'two', 'three'); -Result: Testing one, two, three, % - -SELECT format('INSERT INTO %I VALUES(%L)', 'Foo bar', E'O\'Reilly'); -Result: INSERT INTO "Foo bar" VALUES('O''Reilly') - -SELECT format('INSERT INTO %I VALUES(%L)', 'locations', 'C:\Program Files'); -Result: INSERT INTO locations VALUES('C:\Program Files') - - - - - Here are examples using width fields - and the - flag: - - -SELECT format('|%10s|', 'foo'); -Result: | foo| - -SELECT format('|%-10s|', 'foo'); -Result: |foo | - -SELECT format('|%*s|', 10, 'foo'); -Result: | foo| - -SELECT format('|%*s|', -10, 'foo'); -Result: |foo | - -SELECT format('|%-*s|', 10, 'foo'); -Result: |foo | - -SELECT format('|%-*s|', -10, 'foo'); -Result: |foo | - - - - - These examples show use of position fields: - - -SELECT format('Testing %3$s, %2$s, %1$s', 'one', 'two', 'three'); -Result: Testing three, two, one - -SELECT format('|%*2$s|', 'foo', 10, 'bar'); -Result: | bar| - -SELECT format('|%1$*2$s|', 'foo', 10, 'bar'); -Result: | foo| - - - - - Unlike the standard C function sprintf, - PostgreSQL's format function allows format - specifiers with and without position fields to be mixed - in the same format string. A format specifier without a - position field always uses the next argument after the - last argument consumed. - In addition, the format function does not require all - function arguments to be used in the format string. - For example: - - -SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three'); -Result: Testing three, two, three - - - - - The %I and %L format specifiers are particularly - useful for safely constructing dynamic SQL statements. See - . - - - -
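To round out the point about safely constructing dynamic SQL, here is a
brief PL/pgSQL sketch (illustrative only; the table name "my table" and the
DO block are not part of the original examples):

DO $$
BEGIN
    -- %I double-quotes the identifier because it contains a space;
    -- %L renders the value as a properly escaped SQL literal
    EXECUTE format('CREATE TEMP TABLE %I (author text)', 'my table');
    EXECUTE format('INSERT INTO %I VALUES (%L)', 'my table', E'O\'Reilly');
END
$$;

Building statements this way avoids the SQL-injection hazards that arise
from concatenating untrusted identifiers or values directly into a query
string.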
- - - - Binary String Functions and Operators - - - binary data - functions - - - - This section describes functions and operators for examining and - manipulating binary strings, that is values of type bytea. - Many of these are equivalent, in purpose and syntax, to the - text-string functions described in the previous section. - - - - SQL defines some string functions that use - key words, rather than commas, to separate - arguments. Details are in - . - PostgreSQL also provides versions of these functions - that use the regular function invocation syntax - (see ). - - - - <acronym>SQL</acronym> Binary String Functions and Operators - - - - - Function/Operator - - - Description - - - Example(s) - - - - - - - - - binary string - concatenation - - bytea || bytea - bytea - - - Concatenates the two binary strings. - - - '\x123456'::bytea || '\x789a00bcde'::bytea - \x123456789a00bcde - - - - - - - bit_length - - bit_length ( bytea ) - integer - - - Returns number of bits in the binary string (8 - times the octet_length). - - - bit_length('\x123456'::bytea) - 24 - - - - - - - btrim - - btrim ( bytes bytea, - bytesremoved bytea ) - bytea - - - Removes the longest string containing only bytes appearing in - bytesremoved from the start and end of - bytes. - - - btrim('\x1234567890'::bytea, '\x9012'::bytea) - \x345678 - - - - - - - ltrim - - ltrim ( bytes bytea, - bytesremoved bytea ) - bytea - - - Removes the longest string containing only bytes appearing in - bytesremoved from the start of - bytes. - - - ltrim('\x1234567890'::bytea, '\x9012'::bytea) - \x34567890 - - - - - - - octet_length - - octet_length ( bytea ) - integer - - - Returns number of bytes in the binary string. - - - octet_length('\x123456'::bytea) - 3 - - - - - - - overlay - - overlay ( bytes bytea PLACING newsubstring bytea FROM start integer FOR count integer ) - bytea - - - Replaces the substring of bytes that starts at - the start'th byte and extends - for count bytes - with newsubstring. - If count is omitted, it defaults to the length - of newsubstring. - - - overlay('\x1234567890'::bytea placing '\002\003'::bytea from 2 for 3) - \x12020390 - - - - - - - position - - position ( substring bytea IN bytes bytea ) - integer - - - Returns first starting index of the specified - substring within - bytes, or zero if it's not present. - - - position('\x5678'::bytea in '\x1234567890'::bytea) - 3 - - - - - - - rtrim - - rtrim ( bytes bytea, - bytesremoved bytea ) - bytea - - - Removes the longest string containing only bytes appearing in - bytesremoved from the end of - bytes. - - - rtrim('\x1234567890'::bytea, '\x9012'::bytea) - \x12345678 - - - - - - - substring - - substring ( bytes bytea FROM start integer FOR count integer ) - bytea - - - Extracts the substring of bytes starting at - the start'th byte if that is specified, - and stopping after count bytes if that is - specified. Provide at least one of start - and count. - - - substring('\x1234567890'::bytea from 3 for 2) - \x5678 - - - - - - - trim - - trim ( LEADING | TRAILING | BOTH - bytesremoved bytea FROM - bytes bytea ) - bytea - - - Removes the longest string containing only bytes appearing in - bytesremoved from the start, - end, or both ends (BOTH is the default) - of bytes. - - - trim('\x9012'::bytea from '\x1234567890'::bytea) - \x345678 - - - - - - trim ( LEADING | TRAILING | BOTH FROM - bytes bytea, - bytesremoved bytea ) - bytea - - - This is a non-standard syntax for trim(). - - - trim(both from '\x1234567890'::bytea, '\x9012'::bytea) - \x345678 - - - - -
- - - Additional binary string manipulation functions are available and - are listed in . Some - of them are used internally to implement the - SQL-standard string functions listed in . - - - - Other Binary String Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - bit_count - - - popcount - bit_count - - bit_count ( bytes bytea ) - bigint - - - Returns the number of bits set in the binary string (also known as - popcount). - - - bit_count('\x1234567890'::bytea) - 15 - - - - - - - crc32 - - crc32 ( bytea ) - bigint - - - Computes the CRC-32 value of the binary string. - - - crc32('abc'::bytea) - 891568578 - - - - - - - crc32c - - crc32c ( bytea ) - bigint - - - Computes the CRC-32C value of the binary string. - - - crc32c('abc'::bytea) - 910901175 - - - - - - - get_bit - - get_bit ( bytes bytea, - n bigint ) - integer - - - Extracts n'th bit - from binary string. - - - get_bit('\x1234567890'::bytea, 30) - 1 - - - - - - - get_byte - - get_byte ( bytes bytea, - n integer ) - integer - - - Extracts n'th byte - from binary string. - - - get_byte('\x1234567890'::bytea, 4) - 144 - - - - - - - length - - - binary string - length - - - length - of a binary string - binary strings, length - - length ( bytea ) - integer - - - Returns the number of bytes in the binary string. - - - length('\x1234567890'::bytea) - 5 - - - - - - length ( bytes bytea, - encoding name ) - integer - - - Returns the number of characters in the binary string, assuming - that it is text in the given encoding. - - - length('jose'::bytea, 'UTF8') - 4 - - - - - - - md5 - - md5 ( bytea ) - text - - - Computes the MD5 hash of - the binary string, with the result written in hexadecimal. - - - md5('Th\000omas'::bytea) - 8ab2d3c9689aaf18&zwsp;b4958c334c82d8b1 - - - - - - - reverse - - reverse ( bytea ) - bytea - - - Reverses the order of the bytes in the binary string. - - - reverse('\xabcd'::bytea) - \xcdab - - - - - - - set_bit - - set_bit ( bytes bytea, - n bigint, - newvalue integer ) - bytea - - - Sets n'th bit in - binary string to newvalue. - - - set_bit('\x1234567890'::bytea, 30, 0) - \x1234563890 - - - - - - - set_byte - - set_byte ( bytes bytea, - n integer, - newvalue integer ) - bytea - - - Sets n'th byte in - binary string to newvalue. - - - set_byte('\x1234567890'::bytea, 4, 64) - \x1234567840 - - - - - - - sha224 - - sha224 ( bytea ) - bytea - - - Computes the SHA-224 hash - of the binary string. - - - sha224('abc'::bytea) - \x23097d223405d8228642a477bda2&zwsp;55b32aadbce4bda0b3f7e36c9da7 - - - - - - - sha256 - - sha256 ( bytea ) - bytea - - - Computes the SHA-256 hash - of the binary string. - - - sha256('abc'::bytea) - \xba7816bf8f01cfea414140de5dae2223&zwsp;b00361a396177a9cb410ff61f20015ad - - - - - - - sha384 - - sha384 ( bytea ) - bytea - - - Computes the SHA-384 hash - of the binary string. - - - sha384('abc'::bytea) - \xcb00753f45a35e8bb5a03d699ac65007&zwsp;272c32ab0eded1631a8b605a43ff5bed&zwsp;8086072ba1e7cc2358baeca134c825a7 - - - - - - - sha512 - - sha512 ( bytea ) - bytea - - - Computes the SHA-512 hash - of the binary string. - - - sha512('abc'::bytea) - \xddaf35a193617abacc417349ae204131&zwsp;12e6fa4e89a97ea20a9eeee64b55d39a&zwsp;2192992a274fc1a836ba3c23a3feebbd&zwsp;454d4423643ce80e2a9ac94fa54ca49f - - - - - - - substr - - substr ( bytes bytea, start integer , count integer ) - bytea - - - Extracts the substring of bytes starting at - the start'th byte, - and extending for count bytes if that is - specified. (Same - as substring(bytes - from start - for count).) 
- - - substr('\x1234567890'::bytea, 3, 2) - \x5678 - - - - -
- - - Functions get_byte and set_byte - number the first byte of a binary string as byte 0. - Functions get_bit and set_bit - number bits from the right within each byte; for example bit 0 is the least - significant bit of the first byte, and bit 15 is the most significant bit - of the second byte. - - - - For historical reasons, the function md5 - returns a hex-encoded value of type text whereas the SHA-2 - functions return type bytea. Use the functions - encode - and decode to - convert between the two. For example write encode(sha256('abc'), - 'hex') to get a hex-encoded text representation, - or decode(md5('abc'), 'hex') to get - a bytea value. - - - - - character string - converting to binary string - - - binary string - converting to character string - - Functions for converting strings between different character sets - (encodings), and for representing arbitrary binary data in textual - form, are shown in - . For these - functions, an argument or result of type text is expressed - in the database's default encoding, while arguments or results of - type bytea are in an encoding named by another argument. - - - - Text/Binary String Conversion Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - convert - - convert ( bytes bytea, - src_encoding name, - dest_encoding name ) - bytea - - - Converts a binary string representing text in - encoding src_encoding - to a binary string in encoding dest_encoding - (see for - available conversions). - - - convert('text_in_utf8', 'UTF8', 'LATIN1') - \x746578745f696e5f75746638 - - - - - - - convert_from - - convert_from ( bytes bytea, - src_encoding name ) - text - - - Converts a binary string representing text in - encoding src_encoding - to text in the database encoding - (see for - available conversions). - - - convert_from('text_in_utf8', 'UTF8') - text_in_utf8 - - - - - - - convert_to - - convert_to ( string text, - dest_encoding name ) - bytea - - - Converts a text string (in the database encoding) to a - binary string encoded in encoding dest_encoding - (see for - available conversions). - - - convert_to('some_text', 'UTF8') - \x736f6d655f74657874 - - - - - - - encode - - encode ( bytes bytea, - format text ) - text - - - Encodes binary data into a textual representation; supported - format values are: - base64, - escape, - hex. - - - encode('123\000\001', 'base64') - MTIzAAE= - - - - - - - decode - - decode ( string text, - format text ) - bytea - - - Decodes binary data from a textual representation; supported - format values are the same as - for encode. - - - decode('MTIzAAE=', 'base64') - \x3132330001 - - - - -
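
   For instance, a round trip through convert_to and
   convert_from (the results shown assume a UTF8
   database encoding) looks like this:

SELECT convert_to('José', 'LATIN1');
Result: \x4a6f73e9

SELECT convert_from('\x4a6f73e9'::bytea, 'LATIN1');
Result: José
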
- - - The encode and decode - functions support the following textual formats: - - - - base64 - - base64 format - - - - The base64 format is that - of RFC - 2045 Section 6.8. As per the RFC, encoded lines are - broken at 76 characters. However instead of the MIME CRLF - end-of-line marker, only a newline is used for end-of-line. - The decode function ignores carriage-return, - newline, space, and tab characters. Otherwise, an error is - raised when decode is supplied invalid - base64 data — including when trailing padding is incorrect. - - - - - - escape - - escape format - - - - The escape format converts zero bytes and - bytes with the high bit set into octal escape sequences - (\nnn), and it doubles - backslashes. Other byte values are represented literally. - The decode function will raise an error if a - backslash is not followed by either a second backslash or three - octal digits; it accepts other byte values unchanged. - - - - - - hex - - hex format - - - - The hex format represents each 4 bits of - data as one hexadecimal digit, 0 - through f, writing the higher-order digit of - each byte first. The encode function outputs - the a-f hex digits in lower - case. Because the smallest unit of data is 8 bits, there are - always an even number of characters returned - by encode. - The decode function - accepts the a-f characters in - either upper or lower case. An error is raised - when decode is given invalid hex data - — including when given an odd number of characters. - - - - - - - - In addition, it is possible to cast integral values to and from type - bytea. Casting an integer to bytea produces - 2, 4, or 8 bytes, depending on the width of the integer type. The result - is the two's complement representation of the integer, with the most - significant byte first. Some examples: - -1234::smallint::bytea \x04d2 -cast(1234 as bytea) \x000004d2 -cast(-1234 as bytea) \xfffffb2e -'\x8000'::bytea::smallint -32768 -'\x8000'::bytea::integer 32768 - - Casting a bytea to an integer will raise an error if the - length of the bytea exceeds the width of the integer type. - - - - See also the aggregate function string_agg in - and the large object functions - in . - -
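
   To round out the formats described above, a brief illustration of the
   escape and hex formats (the input
   bytes are arbitrary):

SELECT encode('\x6162005c'::bytea, 'escape');
Result: ab\000\\

SELECT encode('abc'::bytea, 'hex');
Result: 616263

SELECT decode('616263', 'hex');
Result: \x616263
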
- - - - Bit String Functions and Operators - - - bit strings - functions - - - - This section describes functions and operators for examining and - manipulating bit strings, that is values of the types - bit and bit varying. (While only - type bit is mentioned in these tables, values of - type bit varying can be used interchangeably.) - Bit strings support the usual comparison operators shown in - , as well as the - operators shown in . - - - - Bit String Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - bit || bit - bit - - - Concatenation - - - B'10001' || B'011' - 10001011 - - - - - - bit & bit - bit - - - Bitwise AND (inputs must be of equal length) - - - B'10001' & B'01101' - 00001 - - - - - - bit | bit - bit - - - Bitwise OR (inputs must be of equal length) - - - B'10001' | B'01101' - 11101 - - - - - - bit # bit - bit - - - Bitwise exclusive OR (inputs must be of equal length) - - - B'10001' # B'01101' - 11100 - - - - - - ~ bit - bit - - - Bitwise NOT - - - ~ B'10001' - 01110 - - - - - - bit << integer - bit - - - Bitwise shift left - (string length is preserved) - - - B'10001' << 3 - 01000 - - - - - - bit >> integer - bit - - - Bitwise shift right - (string length is preserved) - - - B'10001' >> 2 - 00100 - - - - -
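
   These operators can be combined with the integer-to-bit casts described
   at the end of this section; a small illustration:

SELECT 44::bit(8) | 3::bit(8);
Result: 00101111

SELECT ~ 44::bit(8);
Result: 11010011

SELECT B'101' & B'01';   -- fails, since the inputs differ in length
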
- - - Some of the functions available for binary strings are also available - for bit strings, as shown in . - - - - Bit String Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - bit_count - - bit_count ( bit ) - bigint - - - Returns the number of bits set in the bit string (also known as - popcount). - - - bit_count(B'10111') - 4 - - - - - - - bit_length - - bit_length ( bit ) - integer - - - Returns number of bits in the bit string. - - - bit_length(B'10111') - 5 - - - - - - - length - - - bit string - length - - length ( bit ) - integer - - - Returns number of bits in the bit string. - - - length(B'10111') - 5 - - - - - - - octet_length - - octet_length ( bit ) - integer - - - Returns number of bytes in the bit string. - - - octet_length(B'1011111011') - 2 - - - - - - - overlay - - overlay ( bits bit PLACING newsubstring bit FROM start integer FOR count integer ) - bit - - - Replaces the substring of bits that starts at - the start'th bit and extends - for count bits - with newsubstring. - If count is omitted, it defaults to the length - of newsubstring. - - - overlay(B'01010101010101010' placing B'11111' from 2 for 3) - 0111110101010101010 - - - - - - - position - - position ( substring bit IN bits bit ) - integer - - - Returns first starting index of the specified substring - within bits, or zero if it's not present. - - - position(B'010' in B'000001101011') - 8 - - - - - - - substring - - substring ( bits bit FROM start integer FOR count integer ) - bit - - - Extracts the substring of bits starting at - the start'th bit if that is specified, - and stopping after count bits if that is - specified. Provide at least one of start - and count. - - - substring(B'110010111111' from 3 for 2) - 00 - - - - - - - get_bit - - get_bit ( bits bit, - n integer ) - integer - - - Extracts n'th bit - from bit string; the first (leftmost) bit is bit 0. - - - get_bit(B'101010101010101010', 6) - 1 - - - - - - - set_bit - - set_bit ( bits bit, - n integer, - newvalue integer ) - bit - - - Sets n'th bit in - bit string to newvalue; - the first (leftmost) bit is bit 0. - - - set_bit(B'101010101010101010', 6, 0) - 101010001010101010 - - - - -
- - - In addition, it is possible to cast integral values to and from type - bit. - Casting an integer to bit(n) copies the rightmost - n bits. Casting an integer to a bit string width wider - than the integer itself will sign-extend on the left. - Some examples: - -44::bit(10) 0000101100 -44::bit(3) 100 -cast(-44 as bit(12)) 111111010100 -'1110'::bit(4)::integer 14 - - Note that casting to just bit means casting to - bit(1), and so will deliver only the least significant - bit of the integer. - -
- - - - Pattern Matching - - - pattern matching - - - - There are three separate approaches to pattern matching provided - by PostgreSQL: the traditional - SQL LIKE operator, the - more recent SIMILAR TO operator (added in - SQL:1999), and POSIX-style regular - expressions. Aside from the basic does this string match - this pattern? operators, functions are available to extract - or replace matching substrings and to split a string at matching - locations. - - - - - If you have pattern matching needs that go beyond this, - consider writing a user-defined function in Perl or Tcl. - - - - - - While most regular-expression searches can be executed very quickly, - regular expressions can be contrived that take arbitrary amounts of - time and memory to process. Be wary of accepting regular-expression - search patterns from hostile sources. If you must do so, it is - advisable to impose a statement timeout. - - - - Searches using SIMILAR TO patterns have the same - security hazards, since SIMILAR TO provides many - of the same capabilities as POSIX-style regular - expressions. - - - - LIKE searches, being much simpler than the other - two options, are safer to use with possibly-hostile pattern sources. - - - - - SIMILAR TO and POSIX-style regular - expressions do not support nondeterministic collations. If required, use - LIKE or apply a different collation to the expression - to work around this limitation. - - - - <function>LIKE</function> - - - LIKE - - - -string LIKE pattern ESCAPE escape-character -string NOT LIKE pattern ESCAPE escape-character - - - - The LIKE expression returns true if the - string matches the supplied - pattern. (As - expected, the NOT LIKE expression returns - false if LIKE returns true, and vice versa. - An equivalent expression is - NOT (string LIKE - pattern).) - - - - If pattern does not contain percent - signs or underscores, then the pattern only represents the string - itself; in that case LIKE acts like the - equals operator. An underscore (_) in - pattern stands for (matches) any single - character; a percent sign (%) matches any sequence - of zero or more characters. - - - - Some examples: - -'abc' LIKE 'abc' true -'abc' LIKE 'a%' true -'abc' LIKE '_b_' true -'abc' LIKE 'c' false - - - - - LIKE pattern matching supports nondeterministic - collations (see ), such as - case-insensitive collations or collations that, say, ignore punctuation. - So with a case-insensitive collation, one could have: - -'AbC' LIKE 'abc' COLLATE case_insensitive true -'AbC' LIKE 'a%' COLLATE case_insensitive true - - With collations that ignore certain characters or in general that consider - strings of different lengths equal, the semantics can become a bit more - complicated. Consider these examples: - -'.foo.' LIKE 'foo' COLLATE ign_punct true -'.foo.' LIKE 'f_o' COLLATE ign_punct true -'.foo.' LIKE '_oo' COLLATE ign_punct false - - The way the matching works is that the pattern is partitioned into - sequences of wildcards and non-wildcard strings (wildcards being - _ and %). For example, the pattern - f_o is partitioned into f, _, o, the - pattern _oo is partitioned into _, - oo. The input string matches the pattern if it can be - partitioned in such a way that the wildcards match one character or any - number of characters respectively and the non-wildcard partitions are - equal under the applicable collation. So for example, '.foo.' - LIKE 'f_o' COLLATE ign_punct is true because one can partition - .foo. 
into .f, o, o., and then - '.f' = 'f' COLLATE ign_punct, 'o' - matches the _ wildcard, and 'o.' = 'o' COLLATE - ign_punct. But '.foo.' LIKE '_oo' COLLATE - ign_punct is false because .foo. cannot be - partitioned in a way that the first character is any character and the - rest of the string compares equal to oo. (Note that - the single-character wildcard always matches exactly one character, - independent of the collation. So in this example, the - _ would match ., but then the rest - of the input string won't match the rest of the pattern.) - - - - LIKE pattern matching always covers the entire - string. Therefore, if it's desired to match a sequence anywhere within - a string, the pattern must start and end with a percent sign. - - - - To match a literal underscore or percent sign without matching - other characters, the respective character in - pattern must be - preceded by the escape character. The default escape - character is the backslash but a different one can be selected by - using the ESCAPE clause. To match the escape - character itself, write two escape characters. - - - - - If you have turned off, - any backslashes you write in literal string constants will need to be - doubled. See for more information. - - - - - It's also possible to select no escape character by writing - ESCAPE ''. This effectively disables the - escape mechanism, which makes it impossible to turn off the - special meaning of underscore and percent signs in the pattern. - - - - According to the SQL standard, omitting ESCAPE - means there is no escape character (rather than defaulting to a - backslash), and a zero-length ESCAPE value is - disallowed. PostgreSQL's behavior in - this regard is therefore slightly nonstandard. - - - - The key word ILIKE can be used instead of - LIKE to make the match case-insensitive according to the - active locale. (But this does not support nondeterministic collations.) - This is not in the SQL standard but is a - PostgreSQL extension. - - - - The operator ~~ is equivalent to - LIKE, and ~~* corresponds to - ILIKE. There are also - !~~ and !~~* operators that - represent NOT LIKE and NOT - ILIKE, respectively. All of these operators are - PostgreSQL-specific. You may see these - operator names in EXPLAIN output and similar - places, since the parser actually translates LIKE - et al. to these operators. - - - - The phrases LIKE, ILIKE, - NOT LIKE, and NOT ILIKE are - generally treated as operators - in PostgreSQL syntax; for example they can - be used in expression - operator ANY - (subquery) constructs, although - an ESCAPE clause cannot be included there. In some - obscure cases it may be necessary to use the underlying operator names - instead. - - - - Also see the starts-with operator ^@ and the - corresponding starts_with() function, which are - useful in cases where simply matching the beginning of a string is - needed. - - - - - - <function>SIMILAR TO</function> Regular Expressions - - - regular expression - - - - - SIMILAR TO - - - substring - - - -string SIMILAR TO pattern ESCAPE escape-character -string NOT SIMILAR TO pattern ESCAPE escape-character - - - - The SIMILAR TO operator returns true or - false depending on whether its pattern matches the given string. - It is similar to LIKE, except that it - interprets the pattern using the SQL standard's definition of a - regular expression. SQL regular expressions are a curious cross - between LIKE notation and common (POSIX) regular - expression notation. 
- - - - Like LIKE, the SIMILAR TO - operator succeeds only if its pattern matches the entire string; - this is unlike common regular expression behavior where the pattern - can match any part of the string. - Also like - LIKE, SIMILAR TO uses - _ and % as wildcard characters denoting - any single character and any string, respectively (these are - comparable to . and .* in POSIX regular - expressions). - - - - In addition to these facilities borrowed from LIKE, - SIMILAR TO supports these pattern-matching - metacharacters borrowed from POSIX regular expressions: - - - - - | denotes alternation (either of two alternatives). - - - - - * denotes repetition of the previous item zero - or more times. - - - - - + denotes repetition of the previous item one - or more times. - - - - - ? denotes repetition of the previous item zero - or one time. - - - - - {m} denotes repetition - of the previous item exactly m times. - - - - - {m,} denotes repetition - of the previous item m or more times. - - - - - {m,n} - denotes repetition of the previous item at least m and - not more than n times. - - - - - Parentheses () can be used to group items into - a single logical item. - - - - - A bracket expression [...] specifies a character - class, just as in POSIX regular expressions. - - - - - Notice that the period (.) is not a metacharacter - for SIMILAR TO. - - - - As with LIKE, a backslash disables the special - meaning of any of these metacharacters. A different escape character - can be specified with ESCAPE, or the escape - capability can be disabled by writing ESCAPE ''. - - - - According to the SQL standard, omitting ESCAPE - means there is no escape character (rather than defaulting to a - backslash), and a zero-length ESCAPE value is - disallowed. PostgreSQL's behavior in - this regard is therefore slightly nonstandard. - - - - Another nonstandard extension is that following the escape character - with a letter or digit provides access to the escape sequences - defined for POSIX regular expressions; see - , - , and - below. - - - - Some examples: - -'abc' SIMILAR TO 'abc' true -'abc' SIMILAR TO 'a' false -'abc' SIMILAR TO '%(b|d)%' true -'abc' SIMILAR TO '(b|c)%' false -'-abc-' SIMILAR TO '%\mabc\M%' true -'xabcy' SIMILAR TO '%\mabc\M%' false - - - - - The substring function with three parameters - provides extraction of a substring that matches an SQL - regular expression pattern. The function can be written according - to standard SQL syntax: - -substring(string similar pattern escape escape-character) - - or using the now obsolete SQL:1999 syntax: - -substring(string from pattern for escape-character) - - or as a plain three-argument function: - -substring(string, pattern, escape-character) - - As with SIMILAR TO, the - specified pattern must match the entire data string, or else the - function fails and returns null. To indicate the part of the - pattern for which the matching data sub-string is of interest, - the pattern should contain - two occurrences of the escape character followed by a double quote - ("). - The text matching the portion of the pattern - between these separators is returned when the match is successful. - - - - The escape-double-quote separators actually - divide substring's pattern into three independent - regular expressions; for example, a vertical bar (|) - in any of the three sections affects only that section. 
Also, the first - and third of these regular expressions are defined to match the smallest - possible amount of text, not the largest, when there is any ambiguity - about how much of the data string matches which pattern. (In POSIX - parlance, the first and third regular expressions are forced to be - non-greedy.) - - - - As an extension to the SQL standard, PostgreSQL - allows there to be just one escape-double-quote separator, in which case - the third regular expression is taken as empty; or no separators, in which - case the first and third regular expressions are taken as empty. - - - - Some examples, with #" delimiting the return string: - -substring('foobar' similar '%#"o_b#"%' escape '#') oob -substring('foobar' similar '#"o_b#"%' escape '#') NULL - - - - - - <acronym>POSIX</acronym> Regular Expressions - - - regular expression - pattern matching - - - substring - - - regexp_count - - - regexp_instr - - - regexp_like - - - regexp_match - - - regexp_matches - - - regexp_replace - - - regexp_split_to_table - - - regexp_split_to_array - - - regexp_substr - - - - lists the available - operators for pattern matching using POSIX regular expressions. - - - - Regular Expression Match Operators - - - - - - Operator - - - Description - - - Example(s) - - - - - - - - text ~ text - boolean - - - String matches regular expression, case sensitively - - - 'thomas' ~ 't.*ma' - t - - - - - - text ~* text - boolean - - - String matches regular expression, case-insensitively - - - 'thomas' ~* 'T.*ma' - t - - - - - - text !~ text - boolean - - - String does not match regular expression, case sensitively - - - 'thomas' !~ 't.*max' - t - - - - - - text !~* text - boolean - - - String does not match regular expression, case-insensitively - - - 'thomas' !~* 'T.*ma' - f - - - - -
- - - POSIX regular expressions provide a more - powerful means for pattern matching than the LIKE and - SIMILAR TO operators. - Many Unix tools such as egrep, - sed, or awk use a pattern - matching language that is similar to the one described here. - - - - A regular expression is a character sequence that is an - abbreviated definition of a set of strings (a regular - set). A string is said to match a regular expression - if it is a member of the regular set described by the regular - expression. As with LIKE, pattern characters - match string characters exactly unless they are special characters - in the regular expression language — but regular expressions use - different special characters than LIKE does. - Unlike LIKE patterns, a - regular expression is allowed to match anywhere within a string, unless - the regular expression is explicitly anchored to the beginning or - end of the string. - - - - Some examples: - -'abcd' ~ 'bc' true -'abcd' ~ 'a.c' true — dot matches any character -'abcd' ~ 'a.*d' true — * repeats the preceding pattern item -'abcd' ~ '(b|x)' true — | means OR, parentheses group -'abcd' ~ '^a' true — ^ anchors to start of string -'abcd' ~ '^(b|c)' false — would match except for anchoring - - - - - The POSIX pattern language is described in much - greater detail below. - - - - The substring function with two parameters, - substring(string from - pattern), provides extraction of a - substring - that matches a POSIX regular expression pattern. It returns null if - there is no match, otherwise the first portion of the text that matched the - pattern. But if the pattern contains any parentheses, the portion - of the text that matched the first parenthesized subexpression (the - one whose left parenthesis comes first) is - returned. You can put parentheses around the whole expression - if you want to use parentheses within it without triggering this - exception. If you need parentheses in the pattern before the - subexpression you want to extract, see the non-capturing parentheses - described below. - - - - Some examples: - -substring('foobar' from 'o.b') oob -substring('foobar' from 'o(.)b') o - - - - - The regexp_count function counts the number of - places where a POSIX regular expression pattern matches a string. - It has the syntax - regexp_count(string, - pattern - , start - , flags - ). - pattern is searched for - in string, normally from the beginning of - the string, but if the start parameter is - provided then beginning from that character index. - The flags parameter is an optional text - string containing zero or more single-letter flags that change the - function's behavior. For example, including i in - flags specifies case-insensitive matching. - Supported flags are described in - . - - - - Some examples: - -regexp_count('ABCABCAXYaxy', 'A.') 3 -regexp_count('ABCABCAXYaxy', 'A.', 1, 'i') 4 - - - - - The regexp_instr function returns the starting or - ending position of the N'th match of a - POSIX regular expression pattern to a string, or zero if there is no - such match. It has the syntax - regexp_instr(string, - pattern - , start - , N - , endoption - , flags - , subexpr - ). - pattern is searched for - in string, normally from the beginning of - the string, but if the start parameter is - provided then beginning from that character index. - If N is specified - then the N'th match of the pattern - is located, otherwise the first match is located. 
- If the endoption parameter is omitted or - specified as zero, the function returns the position of the first - character of the match. Otherwise, endoption - must be one, and the function returns the position of the character - following the match. - The flags parameter is an optional text - string containing zero or more single-letter flags that change the - function's behavior. Supported flags are described - in . - For a pattern containing parenthesized - subexpressions, subexpr is an integer - indicating which subexpression is of interest: the result identifies - the position of the substring matching that subexpression. - Subexpressions are numbered in the order of their leading parentheses. - When subexpr is omitted or zero, the result - identifies the position of the whole match regardless of - parenthesized subexpressions. - - - - Some examples: - -regexp_instr('number of your street, town zip, FR', '[^,]+', 1, 2) - 23 -regexp_instr(string=>'ABCDEFGHI', pattern=>'(c..)(...)', start=>1, "N"=>1, endoption=>0, flags=>'i', subexpr=>2) - 6 - - - - - The regexp_like function checks whether a match - of a POSIX regular expression pattern occurs within a string, - returning boolean true or false. It has the syntax - regexp_like(string, - pattern - , flags ). - The flags parameter is an optional text - string containing zero or more single-letter flags that change the - function's behavior. Supported flags are described - in . - This function has the same results as the ~ - operator if no flags are specified. If only the i - flag is specified, it has the same results as - the ~* operator. - - - - Some examples: - -regexp_like('Hello World', 'world') false -regexp_like('Hello World', 'world', 'i') true - - - - - The regexp_match function returns a text array of - matching substring(s) within the first match of a POSIX - regular expression pattern to a string. It has the syntax - regexp_match(string, - pattern , flags ). - If there is no match, the result is NULL. - If a match is found, and the pattern contains no - parenthesized subexpressions, then the result is a single-element text - array containing the substring matching the whole pattern. - If a match is found, and the pattern contains - parenthesized subexpressions, then the result is a text array - whose n'th element is the substring matching - the n'th parenthesized subexpression of - the pattern (not counting non-capturing - parentheses; see below for details). - The flags parameter is an optional text string - containing zero or more single-letter flags that change the function's - behavior. Supported flags are described - in . - - - - Some examples: - -SELECT regexp_match('foobarbequebaz', 'bar.*que'); - regexp_match --------------- - {barbeque} -(1 row) - -SELECT regexp_match('foobarbequebaz', '(bar)(beque)'); - regexp_match --------------- - {bar,beque} -(1 row) - - - - - - In the common case where you just want the whole matching substring - or NULL for no match, the best solution is to - use regexp_substr(). - However, regexp_substr() only exists - in PostgreSQL version 15 and up. When - working in older versions, you can extract the first element - of regexp_match()'s result, for example: - -SELECT (regexp_match('foobarbequebaz', 'bar.*que'))[1]; - regexp_match --------------- - barbeque -(1 row) - - - - - - The regexp_matches function returns a set of text arrays - of matching substring(s) within matches of a POSIX regular - expression pattern to a string. It has the same syntax as - regexp_match. 
- This function returns no rows if there is no match, one row if there is - a match and the g flag is not given, or N - rows if there are N matches and the g flag - is given. Each returned row is a text array containing the whole - matched substring or the substrings matching parenthesized - subexpressions of the pattern, just as described above - for regexp_match. - regexp_matches accepts all the flags shown - in , plus - the g flag which commands it to return all matches, not - just the first one. - - - - Some examples: - -SELECT regexp_matches('foo', 'not there'); - regexp_matches ----------------- -(0 rows) - -SELECT regexp_matches('foobarbequebazilbarfbonk', '(b[^b]+)(b[^b]+)', 'g'); - regexp_matches ----------------- - {bar,beque} - {bazil,barf} -(2 rows) - - - - - - In most cases regexp_matches() should be used with - the g flag, since if you only want the first match, it's - easier and more efficient to use regexp_match(). - However, regexp_match() only exists - in PostgreSQL version 10 and up. When working in older - versions, a common trick is to place a regexp_matches() - call in a sub-select, for example: - -SELECT col1, (SELECT regexp_matches(col2, '(bar)(beque)')) FROM tab; - - This produces a text array if there's a match, or NULL if - not, the same as regexp_match() would do. Without the - sub-select, this query would produce no output at all for table rows - without a match, which is typically not the desired behavior. - - - - - The regexp_replace function provides substitution of - new text for substrings that match POSIX regular expression patterns. - It has the syntax - regexp_replace(string, - pattern, replacement - , flags ) - or - regexp_replace(string, - pattern, replacement, - start - , N - , flags ). - The source string is returned unchanged if - there is no match to the pattern. If there is a - match, the string is returned with the - replacement string substituted for the matching - substring. The replacement string can contain - \n, where n is 1 - through 9, to indicate that the source substring matching the - n'th parenthesized subexpression of the pattern should be - inserted, and it can contain \& to indicate that the - substring matching the entire pattern should be inserted. Write - \\ if you need to put a literal backslash in the replacement - text. - pattern is searched for - in string, normally from the beginning of - the string, but if the start parameter is - provided then beginning from that character index. - By default, only the first match of the pattern is replaced. - If N is specified and is greater than zero, - then the N'th match of the pattern - is replaced. - If the g flag is given, or - if N is specified and is zero, then all - matches at or after the start position are - replaced. (The g flag is ignored - when N is specified.) - The flags parameter is an optional text - string containing zero or more single-letter flags that change the - function's behavior. Supported flags (though - not g) are - described in . 
- - - - Some examples: - -regexp_replace('foobarbaz', 'b..', 'X') - fooXbaz -regexp_replace('foobarbaz', 'b..', 'X', 'g') - fooXX -regexp_replace('foobarbaz', 'b(..)', 'X\1Y', 'g') - fooXarYXazY -regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 0, 'i') - X PXstgrXSQL fXnctXXn -regexp_replace(string=>'A PostgreSQL function', pattern=>'a|e|i|o|u', replacement=>'X', start=>1, "N"=>3, flags=>'i') - A PostgrXSQL function - - - - - The regexp_split_to_table function splits a string using a POSIX - regular expression pattern as a delimiter. It has the syntax - regexp_split_to_table(string, pattern - , flags ). - If there is no match to the pattern, the function returns the - string. If there is at least one match, for each match it returns - the text from the end of the last match (or the beginning of the string) - to the beginning of the match. When there are no more matches, it - returns the text from the end of the last match to the end of the string. - The flags parameter is an optional text string containing - zero or more single-letter flags that change the function's behavior. - regexp_split_to_table supports the flags described in - . - - - - The regexp_split_to_array function behaves the same as - regexp_split_to_table, except that regexp_split_to_array - returns its result as an array of text. It has the syntax - regexp_split_to_array(string, pattern - , flags ). - The parameters are the same as for regexp_split_to_table. - - - - Some examples: - -SELECT foo FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', '\s+') AS foo; - foo -------- - the - quick - brown - fox - jumps - over - the - lazy - dog -(9 rows) - -SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', '\s+'); - regexp_split_to_array ------------------------------------------------ - {the,quick,brown,fox,jumps,over,the,lazy,dog} -(1 row) - -SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; - foo ------ - t - h - e - q - u - i - c - k - b - r - o - w - n - f - o - x -(16 rows) - - - - - As the last example demonstrates, the regexp split functions ignore - zero-length matches that occur at the start or end of the string - or immediately after a previous match. This is contrary to the strict - definition of regexp matching that is implemented by - the other regexp functions, but is usually the most convenient behavior - in practice. Other software systems such as Perl use similar definitions. - - - - The regexp_substr function returns the substring - that matches a POSIX regular expression pattern, - or NULL if there is no match. It has the syntax - regexp_substr(string, - pattern - , start - , N - , flags - , subexpr - ). - pattern is searched for - in string, normally from the beginning of - the string, but if the start parameter is - provided then beginning from that character index. - If N is specified - then the N'th match of the pattern - is returned, otherwise the first match is returned. - The flags parameter is an optional text - string containing zero or more single-letter flags that change the - function's behavior. Supported flags are described - in . - For a pattern containing parenthesized - subexpressions, subexpr is an integer - indicating which subexpression is of interest: the result is the - substring matching that subexpression. - Subexpressions are numbered in the order of their leading parentheses. - When subexpr is omitted or zero, the result - is the whole match regardless of parenthesized subexpressions. 
- - - - Some examples: - -regexp_substr('number of your street, town zip, FR', '[^,]+', 1, 2) - town zip -regexp_substr('ABCDEFGHI', '(c..)(...)', 1, 1, 'i', 2) - FGH - - - - - - - Regular Expression Details - - - PostgreSQL's regular expressions are implemented - using a software package written by Henry Spencer. Much of - the description of regular expressions below is copied verbatim from his - manual. - - - - Regular expressions (REs), as defined in - POSIX 1003.2, come in two forms: - extended REs or EREs - (roughly those of egrep), and - basic REs or BREs - (roughly those of ed). - PostgreSQL supports both forms, and - also implements some extensions - that are not in the POSIX standard, but have become widely used - due to their availability in programming languages such as Perl and Tcl. - REs using these non-POSIX extensions are called - advanced REs or AREs - in this documentation. AREs are almost an exact superset of EREs, - but BREs have several notational incompatibilities (as well as being - much more limited). - We first describe the ARE and ERE forms, noting features that apply - only to AREs, and then describe how BREs differ. - - - - - PostgreSQL always initially presumes that a regular - expression follows the ARE rules. However, the more limited ERE or - BRE rules can be chosen by prepending an embedded option - to the RE pattern, as described in . - This can be useful for compatibility with applications that expect - exactly the POSIX 1003.2 rules. - - - - - A regular expression is defined as one or more - branches, separated by - |. It matches anything that matches one of the - branches. - - - - A branch is zero or more quantified atoms or - constraints, concatenated. - It matches a match for the first, followed by a match for the second, etc.; - an empty branch matches the empty string. - - - - A quantified atom is an atom possibly followed - by a single quantifier. - Without a quantifier, it matches a match for the atom. - With a quantifier, it can match some number of matches of the atom. - An atom can be any of the possibilities - shown in . - The possible quantifiers and their meanings are shown in - . - - - - A constraint matches an empty string, but matches only when - specific conditions are met. A constraint can be used where an atom - could be used, except it cannot be followed by a quantifier. - The simple constraints are shown in - ; - some more constraints are described later. - - - - - Regular Expression Atoms - - - - - Atom - Description - - - - - - (re) - (where re is any regular expression) - matches a match for - re, with the match noted for possible reporting - - - - (?:re) - as above, but the match is not noted for reporting - (a non-capturing set of parentheses) - (AREs only) - - - - . - matches any single character - - - - [chars] - a bracket expression, - matching any one of the chars (see - for more detail) - - - - \k - (where k is a non-alphanumeric character) - matches that character taken as an ordinary character, - e.g., \\ matches a backslash character - - - - \c - where c is alphanumeric - (possibly followed by other characters) - is an escape, see - (AREs only; in EREs and BREs, this matches c) - - - - { - when followed by a character other than a digit, - matches the left-brace character {; - when followed by a digit, it is the beginning of a - bound (see below) - - - - x - where x is a single character with no other - significance, matches that character - - - -
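
   To see the effect of non-capturing parentheses, compare these two calls
   of regexp_match (described earlier in this section):

SELECT regexp_match('foobar', '(foo)(bar)');
Result: {foo,bar}

SELECT regexp_match('foobar', '(?:foo)(bar)');
Result: {bar}
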
- - - An RE cannot end with a backslash (\). - - - - - If you have turned off, - any backslashes you write in literal string constants will need to be - doubled. See for more information. - - - - - Regular Expression Quantifiers - - - - - Quantifier - Matches - - - - - - * - a sequence of 0 or more matches of the atom - - - - + - a sequence of 1 or more matches of the atom - - - - ? - a sequence of 0 or 1 matches of the atom - - - - {m} - a sequence of exactly m matches of the atom - - - - {m,} - a sequence of m or more matches of the atom - - - - - {m,n} - a sequence of m through n - (inclusive) matches of the atom; m cannot exceed - n - - - - *? - non-greedy version of * - - - - +? - non-greedy version of + - - - - ?? - non-greedy version of ? - - - - {m}? - non-greedy version of {m} - - - - {m,}? - non-greedy version of {m,} - - - - - {m,n}? - non-greedy version of {m,n} - - - -
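
   For example, a greedy quantifier prefers the longest possible match,
   while its non-greedy counterpart prefers the shortest:

SELECT regexp_match('xy1234', '[0-9]+');
Result: {1234}

SELECT regexp_match('xy1234', '[0-9]+?');
Result: {1}
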
- - - The forms using {...} - are known as bounds. - The numbers m and n within a bound are - unsigned decimal integers with permissible values from 0 to 255 inclusive. - - - - Non-greedy quantifiers (available in AREs only) match the - same possibilities as their corresponding normal (greedy) - counterparts, but prefer the smallest number rather than the largest - number of matches. - See for more detail. - - - - - A quantifier cannot immediately follow another quantifier, e.g., - ** is invalid. - A quantifier cannot - begin an expression or subexpression or follow - ^ or |. - - - - - Regular Expression Constraints - - - - - Constraint - Description - - - - - - ^ - matches at the beginning of the string - - - - $ - matches at the end of the string - - - - (?=re) - positive lookahead matches at any point - where a substring matching re begins - (AREs only) - - - - (?!re) - negative lookahead matches at any point - where no substring matching re begins - (AREs only) - - - - (?<=re) - positive lookbehind matches at any point - where a substring matching re ends - (AREs only) - - - - (?<!re) - negative lookbehind matches at any point - where no substring matching re ends - (AREs only) - - - -
 - - - Lookahead and lookbehind constraints cannot contain back references (see ), and all parentheses within them are considered non-capturing. - 
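
   Some illustrative examples of lookahead and lookbehind:

'foobar' ~ 'foo(?=bar)'     true
'foobaz' ~ 'foo(?=bar)'     false
'foobar' ~ '(?<=foo)bar'    true
'foobar' ~ '(?<!foo)bar'    false
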
- - - Bracket Expressions - - - A bracket expression is a list of - characters enclosed in []. It normally matches - any single character from the list (but see below). If the list - begins with ^, it matches any single character - not from the rest of the list. - If two characters - in the list are separated by -, this is - shorthand for the full range of characters between those two - (inclusive) in the collating sequence, - e.g., [0-9] in ASCII matches - any decimal digit. It is illegal for two ranges to share an - endpoint, e.g., a-c-e. Ranges are very - collating-sequence-dependent, so portable programs should avoid - relying on them. - - - - To include a literal ] in the list, make it the - first character (after ^, if that is used). To - include a literal -, make it the first or last - character, or the second endpoint of a range. To use a literal - - as the first endpoint of a range, enclose it - in [. and .] to make it a - collating element (see below). With the exception of these characters, - some combinations using [ - (see next paragraphs), and escapes (AREs only), all other special - characters lose their special significance within a bracket expression. - In particular, \ is not special when following - ERE or BRE rules, though it is special (as introducing an escape) - in AREs. - - - - Within a bracket expression, a collating element (a character, a - multiple-character sequence that collates as if it were a single - character, or a collating-sequence name for either) enclosed in - [. and .] stands for the - sequence of characters of that collating element. The sequence is - treated as a single element of the bracket expression's list. This - allows a bracket - expression containing a multiple-character collating element to - match more than one character, e.g., if the collating sequence - includes a ch collating element, then the RE - [[.ch.]]*c matches the first five characters of - chchcc. - - - - - PostgreSQL currently does not support multi-character collating - elements. This information describes possible future behavior. - - - - - Within a bracket expression, a collating element enclosed in - [= and =] is an equivalence - class, standing for the sequences of characters of all collating - elements equivalent to that one, including itself. (If there are - no other equivalent collating elements, the treatment is as if the - enclosing delimiters were [. and - .].) For example, if o and - ^ are the members of an equivalence class, then - [[=o=]], [[=^=]], and - [o^] are all synonymous. An equivalence class - cannot be an endpoint of a range. - - - - Within a bracket expression, the name of a character class - enclosed in [: and :] stands - for the list of all characters belonging to that class. A character - class cannot be used as an endpoint of a range. - The POSIX standard defines these character class - names: - alnum (letters and numeric digits), - alpha (letters), - blank (space and tab), - cntrl (control characters), - digit (numeric digits), - graph (printable characters except space), - lower (lower-case letters), - print (printable characters including space), - punct (punctuation), - space (any white space), - upper (upper-case letters), - and xdigit (hexadecimal digits). - The behavior of these standard character classes is generally - consistent across platforms for characters in the 7-bit ASCII set. 
- Whether a given non-ASCII character is considered to belong to one - of these classes depends on the collation - that is used for the regular-expression function or operator - (see ), or by default on the - database's LC_CTYPE locale setting (see - ). The classification of non-ASCII - characters can vary across platforms even in similarly-named - locales. (But the C locale never considers any - non-ASCII characters to belong to any of these classes.) - In addition to these standard character - classes, PostgreSQL defines - the word character class, which is the same as - alnum plus the underscore (_) - character, and - the ascii character class, which contains exactly - the 7-bit ASCII set. - - - - There are two special cases of bracket expressions: the bracket - expressions [[:<:]] and - [[:>:]] are constraints, - matching empty strings at the beginning - and end of a word respectively. A word is defined as a sequence - of word characters that is neither preceded nor followed by word - characters. A word character is any character belonging to the - word character class, that is, any letter, digit, - or underscore. This is an extension, compatible with but not - specified by POSIX 1003.2, and should be used with - caution in software intended to be portable to other systems. - The constraint escapes described below are usually preferable; they - are no more standard, but are easier to type. - - - - - Regular Expression Escapes - - - Escapes are special sequences beginning with \ - followed by an alphanumeric character. Escapes come in several varieties: - character entry, class shorthands, constraint escapes, and back references. - A \ followed by an alphanumeric character but not constituting - a valid escape is illegal in AREs. - In EREs, there are no escapes: outside a bracket expression, - a \ followed by an alphanumeric character merely stands for - that character as an ordinary character, and inside a bracket expression, - \ is an ordinary character. - (The latter is the one actual incompatibility between EREs and AREs.) - - - - Character-entry escapes exist to make it easier to specify - non-printing and other inconvenient characters in REs. They are - shown in . - - - - Class-shorthand escapes provide shorthands for certain - commonly-used character classes. They are - shown in . - - - - A constraint escape is a constraint, - matching the empty string if specific conditions are met, - written as an escape. They are - shown in . - - - - A back reference (\n) matches the - same string matched by the previous parenthesized subexpression specified - by the number n - (see ). For example, - ([bc])\1 matches bb or cc - but not bc or cb. - The subexpression must entirely precede the back reference in the RE. - Subexpressions are numbered in the order of their leading parentheses. - Non-capturing parentheses do not define subexpressions. - The back reference considers only the string characters matched by the - referenced subexpression, not any constraints contained in it. For - example, (^\d)\1 will match 22. 
- - - - Regular Expression Character-Entry Escapes - - - - - Escape - Description - - - - - - \a - alert (bell) character, as in C - - - - \b - backspace, as in C - - - - \B - synonym for backslash (\) to help reduce the need for backslash - doubling - - - - \cX - (where X is any character) the character whose - low-order 5 bits are the same as those of - X, and whose other bits are all zero - - - - \e - the character whose collating-sequence name - is ESC, - or failing that, the character with octal value 033 - - - - \f - form feed, as in C - - - - \n - newline, as in C - - - - \r - carriage return, as in C - - - - \t - horizontal tab, as in C - - - - \uwxyz - (where wxyz is exactly four hexadecimal digits) - the character whose hexadecimal value is - 0xwxyz - - - - - \Ustuvwxyz - (where stuvwxyz is exactly eight hexadecimal - digits) - the character whose hexadecimal value is - 0xstuvwxyz - - - - - \v - vertical tab, as in C - - - - \xhhh - (where hhh is any sequence of hexadecimal - digits) - the character whose hexadecimal value is - 0xhhh - (a single character no matter how many hexadecimal digits are used) - - - - - \0 - the character whose value is 0 (the null byte) - - - - \xy - (where xy is exactly two octal digits, - and is not a back reference) - the character whose octal value is - 0xy - - - - \xyz - (where xyz is exactly three octal digits, - and is not a back reference) - the character whose octal value is - 0xyz - - - -
- - - Hexadecimal digits are 0-9, - a-f, and A-F. - Octal digits are 0-7. - - - - Numeric character-entry escapes specifying values outside the ASCII range - (0–127) have meanings dependent on the database encoding. When the - encoding is UTF-8, escape values are equivalent to Unicode code points, - for example \u1234 means the character U+1234. - For other multibyte encodings, character-entry escapes usually just - specify the concatenation of the byte values for the character. If the - escape value does not correspond to any legal character in the database - encoding, no error will be raised, but it will never match any data. - - - - The character-entry escapes are always taken as ordinary characters. - For example, \135 is ] in ASCII, but - \135 does not terminate a bracket expression. - - - - Regular Expression Class-Shorthand Escapes - - - - - Escape - Description - - - - - - \d - matches any digit, like - [[:digit:]] - - - - \s - matches any whitespace character, like - [[:space:]] - - - - \w - matches any word character, like - [[:word:]] - - - - \D - matches any non-digit, like - [^[:digit:]] - - - - \S - matches any non-whitespace character, like - [^[:space:]] - - - - \W - matches any non-word character, like - [^[:word:]] - - - -
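
   A few examples of the class-shorthand escapes in use:

'user_1' ~ '^\w+$'     true
'ab cd' ~ '\s'         true
'2025' ~ '^\D'         false
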
- - - The class-shorthand escapes also work within bracket expressions, - although the definitions shown above are not quite syntactically - valid in that context. - For example, [a-c\d] is equivalent to - [a-c[:digit:]]. - - - - Regular Expression Constraint Escapes - - - - - Escape - Description - - - - - - \A - matches only at the beginning of the string - (see for how this differs from - ^) - - - - \m - matches only at the beginning of a word - - - - \M - matches only at the end of a word - - - - \y - matches only at the beginning or end of a word - - - - \Y - matches only at a point that is not the beginning or end of a - word - - - - \Z - matches only at the end of the string - (see for how this differs from - $) - - - -
- - - A word is defined as in the specification of - [[:<:]] and [[:>:]] above. - Constraint escapes are illegal within bracket expressions. - - - - Regular Expression Back References - - - - - Escape - Description - - - - - - \m - (where m is a nonzero digit) - a back reference to the m'th subexpression - - - - \mnn - (where m is a nonzero digit, and - nn is some more digits, and the decimal value - mnn is not greater than the number of closing capturing - parentheses seen so far) - a back reference to the mnn'th subexpression - - - -
- - - - There is an inherent ambiguity between octal character-entry - escapes and back references, which is resolved by the following heuristics, - as hinted at above. - A leading zero always indicates an octal escape. - A single non-zero digit, not followed by another digit, - is always taken as a back reference. - A multi-digit sequence not starting with a zero is taken as a back - reference if it comes after a suitable subexpression - (i.e., the number is in the legal range for a back reference), - and otherwise is taken as octal. - - -
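- These heuristics can be observed directly: \061 has a leading zero
- and so is the octal escape for the character 1, while \1
- after a parenthesized subexpression is a back reference:
-SELECT '1' ~ '\061';
-Result: true
-SELECT 'abab' ~ '(ab)\1';
-Result: true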
- - - Regular Expression Metasyntax - - - In addition to the main syntax described above, there are some special - forms and miscellaneous syntactic facilities available. - - - - An RE can begin with one of two special director prefixes. - If an RE begins with ***:, - the rest of the RE is taken as an ARE. (This normally has no effect in - PostgreSQL, since REs are assumed to be AREs; - but it does have an effect if ERE or BRE mode had been specified by - the flags parameter to a regex function.) - If an RE begins with ***=, - the rest of the RE is taken to be a literal string, - with all characters considered ordinary characters. - - - - An ARE can begin with embedded options: - a sequence (?xyz) - (where xyz is one or more alphabetic characters) - specifies options affecting the rest of the RE. - These options override any previously determined options — - in particular, they can override the case-sensitivity behavior implied by - a regex operator, or the flags parameter to a regex - function. - The available option letters are - shown in . - Note that these same option letters are used in the flags - parameters of regex functions. - - - - ARE Embedded-Option Letters - - - - - Option - Description - - - - - - b - rest of RE is a BRE - - - - c - case-sensitive matching (overrides operator type) - - - - e - rest of RE is an ERE - - - - i - case-insensitive matching (see - ) (overrides operator type) - - - - m - historical synonym for n - - - - n - newline-sensitive matching (see - ) - - - - p - partial newline-sensitive matching (see - ) - - - - q - rest of RE is a literal (quoted) string, all ordinary - characters - - - - s - non-newline-sensitive matching (default) - - - - t - tight syntax (default; see below) - - - - w - inverse partial newline-sensitive (weird) matching - (see ) - - - - x - expanded syntax (see below) - - - -
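- For example, the i option makes an otherwise case-sensitive
- operator match case-insensitively:
-SELECT 'QUICK' ~ 'quick';
-Result: false
-SELECT 'QUICK' ~ '(?i)quick';
-Result: true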
- - - Embedded options take effect at the ) terminating the sequence. - They can appear only at the start of an ARE (after the - ***: director if any). - - - - In addition to the usual (tight) RE syntax, in which all - characters are significant, there is an expanded syntax, - available by specifying the embedded x option. - In the expanded syntax, - white-space characters in the RE are ignored, as are - all characters between a # - and the following newline (or the end of the RE). This - permits paragraphing and commenting a complex RE. - There are three exceptions to that basic rule: - - - - - a white-space character or # preceded by \ is - retained - - - - - white space or # within a bracket expression is retained - - - - - white space and comments cannot appear within multi-character symbols, - such as (?: - - - - - For this purpose, white-space characters are blank, tab, newline, and - any character that belongs to the space character class. - - - - Finally, in an ARE, outside bracket expressions, the sequence - (?#ttt) - (where ttt is any text not containing a )) - is a comment, completely ignored. - Again, this is not allowed between the characters of - multi-character symbols, like (?:. - Such comments are more a historical artifact than a useful facility, - and their use is deprecated; use the expanded syntax instead. - - - - None of these metasyntax extensions is available if - an initial ***= director - has specified that the user's input be treated as a literal string - rather than as an RE. - -
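- To illustrate the expanded syntax described above, the following
- pattern ignores its internal white space and trailing comment:
-SELECT 'abc123' ~ '(?x) [a-z]+ \d+  # letters then digits';
-Result: true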
- - - Regular Expression Matching Rules - - - In the event that an RE could match more than one substring of a given - string, the RE matches the one starting earliest in the string. - If the RE could match more than one substring starting at that point, - either the longest possible match or the shortest possible match will - be taken, depending on whether the RE is greedy or - non-greedy. - - - - Whether an RE is greedy or not is determined by the following rules: - - - - Most atoms, and all constraints, have no greediness attribute (because - they cannot match variable amounts of text anyway). - - - - - Adding parentheses around an RE does not change its greediness. - - - - - A quantified atom with a fixed-repetition quantifier - ({m} - or - {m}?) - has the same greediness (possibly none) as the atom itself. - - - - - A quantified atom with other normal quantifiers (including - {m,n} - with m equal to n) - is greedy (prefers longest match). - - - - - A quantified atom with a non-greedy quantifier (including - {m,n}? - with m equal to n) - is non-greedy (prefers shortest match). - - - - - A branch — that is, an RE that has no top-level - | operator — has the same greediness as the first - quantified atom in it that has a greediness attribute. - - - - - An RE consisting of two or more branches connected by the - | operator is always greedy. - - - - - - - The above rules associate greediness attributes not only with individual - quantified atoms, but with branches and entire REs that contain quantified - atoms. What that means is that the matching is done in such a way that - the branch, or whole RE, matches the longest or shortest possible - substring as a whole. Once the length of the entire match - is determined, the part of it that matches any particular subexpression - is determined on the basis of the greediness attribute of that - subexpression, with subexpressions starting earlier in the RE taking - priority over ones starting later. - - - - An example of what this means: - -SELECT SUBSTRING('XY1234Z', 'Y*([0-9]{1,3})'); -Result: 123 -SELECT SUBSTRING('XY1234Z', 'Y*?([0-9]{1,3})'); -Result: 1 - - In the first case, the RE as a whole is greedy because Y* - is greedy. It can match beginning at the Y, and it matches - the longest possible string starting there, i.e., Y123. - The output is the parenthesized part of that, or 123. - In the second case, the RE as a whole is non-greedy because Y*? - is non-greedy. It can match beginning at the Y, and it matches - the shortest possible string starting there, i.e., Y1. - The subexpression [0-9]{1,3} is greedy but it cannot change - the decision as to the overall match length; so it is forced to match - just 1. - - - - In short, when an RE contains both greedy and non-greedy subexpressions, - the total match length is either as long as possible or as short as - possible, according to the attribute assigned to the whole RE. The - attributes assigned to the subexpressions only affect how much of that - match they are allowed to eat relative to each other. - - - - The quantifiers {1,1} and {1,1}? - can be used to force greediness or non-greediness, respectively, - on a subexpression or a whole RE. - This is useful when you need the whole RE to have a greediness attribute - different from what's deduced from its elements. As an example, - suppose that we are trying to separate a string containing some digits - into the digits and the parts before and after them. 
We might try to - do that like this: - -SELECT regexp_match('abc01234xyz', '(.*)(\d+)(.*)'); -Result: {abc0123,4,xyz} - - That didn't work: the first .* is greedy so - it eats as much as it can, leaving the \d+ to - match at the last possible place, the last digit. We might try to fix - that by making it non-greedy: - -SELECT regexp_match('abc01234xyz', '(.*?)(\d+)(.*)'); -Result: {abc,0,""} - - That didn't work either, because now the RE as a whole is non-greedy - and so it ends the overall match as soon as possible. We can get what - we want by forcing the RE as a whole to be greedy: - -SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); -Result: {abc,01234,xyz} - - Controlling the RE's overall greediness separately from its components' - greediness allows great flexibility in handling variable-length patterns. - - - - When deciding what is a longer or shorter match, - match lengths are measured in characters, not collating elements. - An empty string is considered longer than no match at all. - For example: - bb* - matches the three middle characters of abbbc; - (week|wee)(night|knights) - matches all ten characters of weeknights; - when (.*).* - is matched against abc the parenthesized subexpression - matches all three characters; and when - (a*)* is matched against bc - both the whole RE and the parenthesized - subexpression match an empty string. - - - - If case-independent matching is specified, - the effect is much as if all case distinctions had vanished from the - alphabet. - When an alphabetic that exists in multiple cases appears as an - ordinary character outside a bracket expression, it is effectively - transformed into a bracket expression containing both cases, - e.g., x becomes [xX]. - When it appears inside a bracket expression, all case counterparts - of it are added to the bracket expression, e.g., - [x] becomes [xX] - and [^x] becomes [^xX]. - - - - If newline-sensitive matching is specified, . - and bracket expressions using ^ - will never match the newline character - (so that matches will not cross lines unless the RE - explicitly includes a newline) - and ^ and $ - will match the empty string after and before a newline - respectively, in addition to matching at beginning and end of string - respectively. - But the ARE escapes \A and \Z - continue to match beginning or end of string only. - Also, the character class shorthands \D - and \W will match a newline regardless of this mode. - (Before PostgreSQL 14, they did not match - newlines when in newline-sensitive mode. - Write [^[:digit:]] - or [^[:word:]] to get the old behavior.) - - - - If partial newline-sensitive matching is specified, - this affects . and bracket expressions - as with newline-sensitive matching, but not ^ - and $. - - - - If inverse partial newline-sensitive matching is specified, - this affects ^ and $ - as with newline-sensitive matching, but not . - and bracket expressions. - This isn't very useful but is provided for symmetry. - - - - - Limits and Compatibility - - - No particular limit is imposed on the length of REs in this - implementation. However, - programs intended to be highly portable should not employ REs longer - than 256 bytes, - as a POSIX-compliant implementation can refuse to accept such REs. - - - - The only feature of AREs that is actually incompatible with - POSIX EREs is that \ does not lose its special - significance inside bracket expressions. 
- All other ARE features use syntax which is illegal or has - undefined or unspecified effects in POSIX EREs; - the *** syntax of directors likewise is outside the POSIX - syntax for both BREs and EREs. - - - - Many of the ARE extensions are borrowed from Perl, but some have - been changed to clean them up, and a few Perl extensions are not present. - Incompatibilities of note include \b, \B, - the lack of special treatment for a trailing newline, - the addition of complemented bracket expressions to the things - affected by newline-sensitive matching, - the restrictions on parentheses and back references in lookahead/lookbehind - constraints, and the longest/shortest-match (rather than first-match) - matching semantics. - - - - - Basic Regular Expressions - - - BREs differ from EREs in several respects. - In BREs, |, +, and ? - are ordinary characters and there is no equivalent - for their functionality. - The delimiters for bounds are - \{ and \}, - with { and } - by themselves ordinary characters. - The parentheses for nested subexpressions are - \( and \), - with ( and ) by themselves ordinary characters. - ^ is an ordinary character except at the beginning of the - RE or the beginning of a parenthesized subexpression, - $ is an ordinary character except at the end of the - RE or the end of a parenthesized subexpression, - and * is an ordinary character if it appears at the beginning - of the RE or the beginning of a parenthesized subexpression - (after a possible leading ^). - Finally, single-digit back references are available, and - \< and \> - are synonyms for - [[:<:]] and [[:>:]] - respectively; no other escapes are available in BREs. - - - - - - - Differences from SQL Standard and XQuery - - - LIKE_REGEX - - - - OCCURRENCES_REGEX - - - - POSITION_REGEX - - - - SUBSTRING_REGEX - - - - TRANSLATE_REGEX - - - - XQuery regular expressions - - - - Since SQL:2008, the SQL standard includes regular expression operators - and functions that perform pattern - matching according to the XQuery regular expression - standard: - - LIKE_REGEX - OCCURRENCES_REGEX - POSITION_REGEX - SUBSTRING_REGEX - TRANSLATE_REGEX - - PostgreSQL does not currently implement these - operators and functions. You can get approximately equivalent - functionality in each case as shown in . (Various optional clauses on - both sides have been omitted in this table.) - - - - Regular Expression Functions Equivalencies - - - - - SQL standard - PostgreSQL - - - - - - string LIKE_REGEX pattern - regexp_like(string, pattern) or string ~ pattern - - - - OCCURRENCES_REGEX(pattern IN string) - regexp_count(string, pattern) - - - - POSITION_REGEX(pattern IN string) - regexp_instr(string, pattern) - - - - SUBSTRING_REGEX(pattern IN string) - regexp_substr(string, pattern) - - - - TRANSLATE_REGEX(pattern IN string WITH replacement) - regexp_replace(string, pattern, replacement) - - - -
- - - Regular expression functions similar to those provided by PostgreSQL are - also available in a number of other SQL implementations, whereas the - SQL-standard functions are not as widely implemented. Some of the - details of the regular expression syntax will likely differ in each - implementation. - - - - The SQL-standard operators and functions use XQuery regular expressions, - which are quite close to the ARE syntax described above. - Notable differences between the existing POSIX-based - regular-expression feature and XQuery regular expressions include: - - - - - XQuery character class subtraction is not supported. An example of - this feature is using the following to match only English - consonants: [a-z-[aeiou]]. - - - - - XQuery character class shorthands \c, - \C, \i, - and \I are not supported. - - - - - XQuery character class elements - using \p{UnicodeProperty} or the - inverse \P{UnicodeProperty} are not supported. - - - - - POSIX interprets character classes such as \w - (see ) - according to the prevailing locale (which you can control by - attaching a COLLATE clause to the operator or - function). XQuery specifies these classes by reference to Unicode - character properties, so equivalent behavior is obtained only with - a locale that follows the Unicode rules. - - - - - The SQL standard (not XQuery itself) attempts to cater for more - variants of newline than POSIX does. The - newline-sensitive matching options described above consider only - ASCII NL (\n) to be a newline, but SQL would have - us treat CR (\r), CRLF (\r\n) - (a Windows-style newline), and some Unicode-only characters like - LINE SEPARATOR (U+2028) as newlines as well. - Notably, . and \s should - count \r\n as one character not two according to - SQL. - - - - - Of the character-entry escapes described in - , - XQuery supports only \n, \r, - and \t. - - - - - XQuery does not support - the [:name:] syntax - for character classes within bracket expressions. - - - - - XQuery does not have lookahead or lookbehind constraints, - nor any of the constraint escapes described in - . - - - - - The metasyntax forms described in - do not exist in XQuery. - - - - - The regular expression flag letters defined by XQuery are - related to but not the same as the option letters for POSIX - (). While the - i and q options behave the - same, others do not: - - - - XQuery's s (allow dot to match newline) - and m (allow ^ - and $ to match at newlines) flags provide - access to the same behaviors as - POSIX's n, p - and w flags, but they - do not match the behavior of - POSIX's s and m flags. - Note in particular that dot-matches-newline is the default - behavior in POSIX but not XQuery. - - - - - XQuery's x (ignore whitespace in pattern) flag - is noticeably different from POSIX's expanded-mode flag. - POSIX's x flag also - allows # to begin a comment in the pattern, - and POSIX will not ignore a whitespace character after a - backslash. - - - - - - - - -
-
-
- - - - Data Type Formatting Functions - - - formatting - - - - The PostgreSQL formatting functions - provide a powerful set of tools for converting various data types - (date/time, integer, floating point, numeric) to formatted strings - and for converting from formatted strings to specific data types. - lists them. - These functions all follow a common calling convention: the first - argument is the value to be formatted and the second argument is a - template that defines the output or input format. - - - - Formatting Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - to_char - - to_char ( timestamp, text ) - text - - - to_char ( timestamp with time zone, text ) - text - - - Converts time stamp to string according to the given format. - - - to_char(timestamp '2002-04-20 17:31:12.66', 'HH12:MI:SS') - 05:31:12 - - - - - - to_char ( interval, text ) - text - - - Converts interval to string according to the given format. - - - to_char(interval '15h 2m 12s', 'HH24:MI:SS') - 15:02:12 - - - - - - to_char ( numeric_type, text ) - text - - - Converts number to string according to the given format; available - for integer, bigint, numeric, - real, double precision. - - - to_char(125, '999') - 125 - - - to_char(125.8::real, '999D9') - 125.8 - - - to_char(-125.8, '999D99S') - 125.80- - - - - - - - to_date - - to_date ( text, text ) - date - - - Converts string to date according to the given format. - - - to_date('05 Dec 2000', 'DD Mon YYYY') - 2000-12-05 - - - - - - - to_number - - to_number ( text, text ) - numeric - - - Converts string to numeric according to the given format. - - - to_number('12,454.8-', '99G999D9S') - -12454.8 - - - - - - - to_timestamp - - to_timestamp ( text, text ) - timestamp with time zone - - - Converts string to time stamp according to the given format. - (See also to_timestamp(double precision) in - .) - - - to_timestamp('05 Dec 2000', 'DD Mon YYYY') - 2000-12-05 00:00:00-05 - - - - -
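- As a simple illustration (day and month names are English unless the
- TM modifier is used):
-SELECT to_char(timestamp '2002-04-20 17:31:12', 'FMDay, DD Mon YYYY');
-Result: Saturday, 20 Apr 2002
-SELECT to_date('20 Apr 2002', 'DD Mon YYYY');
-Result: 2002-04-20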
- - - - to_timestamp and to_date - exist to handle input formats that cannot be converted by - simple casting. For most standard date/time formats, simply casting the - source string to the required data type works, and is much easier. - Similarly, to_number is unnecessary for standard numeric - representations. - - - - - In a to_char output template string, there are certain - patterns that are recognized and replaced with appropriately-formatted - data based on the given value. Any text that is not a template pattern is - simply copied verbatim. Similarly, in an input template string (for the - other functions), template patterns identify the values to be supplied by - the input data string. If there are characters in the template string - that are not template patterns, the corresponding characters in the input - data string are simply skipped over (whether or not they are equal to the - template string characters). - - - - shows the - template patterns available for formatting date and time values. - - - - Template Patterns for Date/Time Formatting - - - - Pattern - Description - - - - - HH - hour of day (01–12) - - - HH12 - hour of day (01–12) - - - HH24 - hour of day (00–23) - - - MI - minute (00–59) - - - SS - second (00–59) - - - MS - millisecond (000–999) - - - US - microsecond (000000–999999) - - - FF1 - tenth of second (0–9) - - - FF2 - hundredth of second (00–99) - - - FF3 - millisecond (000–999) - - - FF4 - tenth of a millisecond (0000–9999) - - - FF5 - hundredth of a millisecond (00000–99999) - - - FF6 - microsecond (000000–999999) - - - SSSS, SSSSS - seconds past midnight (0–86399) - - - AM, am, - PM or pm - meridiem indicator (without periods) - - - A.M., a.m., - P.M. or p.m. - meridiem indicator (with periods) - - - Y,YYY - year (4 or more digits) with comma - - - YYYY - year (4 or more digits) - - - YYY - last 3 digits of year - - - YY - last 2 digits of year - - - Y - last digit of year - - - IYYY - ISO 8601 week-numbering year (4 or more digits) - - - IYY - last 3 digits of ISO 8601 week-numbering year - - - IY - last 2 digits of ISO 8601 week-numbering year - - - I - last digit of ISO 8601 week-numbering year - - - BC, bc, - AD or ad - era indicator (without periods) - - - B.C., b.c., - A.D. or a.d. 
- era indicator (with periods) - - - MONTH - full upper case month name (blank-padded to 9 chars) - - - Month - full capitalized month name (blank-padded to 9 chars) - - - month - full lower case month name (blank-padded to 9 chars) - - - MON - abbreviated upper case month name (3 chars in English, localized lengths vary) - - - Mon - abbreviated capitalized month name (3 chars in English, localized lengths vary) - - - mon - abbreviated lower case month name (3 chars in English, localized lengths vary) - - - MM - month number (01–12) - - - DAY - full upper case day name (blank-padded to 9 chars) - - - Day - full capitalized day name (blank-padded to 9 chars) - - - day - full lower case day name (blank-padded to 9 chars) - - - DY - abbreviated upper case day name (3 chars in English, localized lengths vary) - - - Dy - abbreviated capitalized day name (3 chars in English, localized lengths vary) - - - dy - abbreviated lower case day name (3 chars in English, localized lengths vary) - - - DDD - day of year (001–366) - - - IDDD - day of ISO 8601 week-numbering year (001–371; day 1 of the year is Monday of the first ISO week) - - - DD - day of month (01–31) - - - D - day of the week, Sunday (1) to Saturday (7) - - - ID - ISO 8601 day of the week, Monday (1) to Sunday (7) - - - W - week of month (1–5) (the first week starts on the first day of the month) - - - WW - week number of year (1–53) (the first week starts on the first day of the year) - - - IW - week number of ISO 8601 week-numbering year (01–53; the first Thursday of the year is in week 1) - - - CC - century (2 digits) (the twenty-first century starts on 2001-01-01) - - - J - Julian Date (integer days since November 24, 4714 BC at local - midnight; see ) - - - Q - quarter - - - RM - month in upper case Roman numerals (I–XII; I=January) - - - rm - month in lower case Roman numerals (i–xii; i=January) - - - TZ - upper case time-zone abbreviation - - - tz - lower case time-zone abbreviation - - - TZH - time-zone hours - - - TZM - time-zone minutes - - - OF - time-zone offset from UTC (HH - or HH:MM) - - - -
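- Near a year boundary, the ISO week-numbering patterns can yield a
- different year than the Gregorian ones; for example, 2005-01-01 falls
- in ISO week 53 of ISO year 2004:
-SELECT to_char(date '2005-01-01', 'IYYY-IW-ID');
-Result: 2004-53-6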
- - - Modifiers can be applied to any template pattern to alter its - behavior. For example, FMMonth - is the Month pattern with the - FM modifier. - shows the - modifier patterns for date/time formatting. - - - - Template Pattern Modifiers for Date/Time Formatting - - - - Modifier - Description - Example - - - - - FM prefix - fill mode (suppress leading zeroes and padding blanks) - FMMonth - - - TH suffix - upper case ordinal number suffix - DDTH, e.g., 12TH - - - th suffix - lower case ordinal number suffix - DDth, e.g., 12th - - - FX prefix - fixed format global option (see usage notes) - FX Month DD Day - - - TM prefix - translation mode (use localized day and month names based on - ) - TMMonth - - - SP suffix - spell mode (not implemented) - DDSP - - - -
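- For example, combining the FM prefix and the th suffix:
-SELECT to_char(date '2002-04-20', 'FMDDth FMMonth YYYY');
-Result: 20th April 2002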
- - - Usage notes for date/time formatting: - - - - - FM suppresses leading zeroes and trailing blanks - that would otherwise be added to make the output of a pattern be - fixed-width. In PostgreSQL, - FM modifies only the next specification, while in - Oracle FM affects all subsequent - specifications, and repeated FM modifiers - toggle fill mode on and off. - - - - - - TM suppresses trailing blanks whether or - not FM is specified. - - - - - - to_timestamp and to_date - ignore letter case in the input; so for - example MON, Mon, - and mon all accept the same strings. When using - the TM modifier, case-folding is done according to - the rules of the function's input collation (see - ). - - - - - - to_timestamp and to_date - skip multiple blank spaces at the beginning of the input string and - around date and time values unless the FX option is used. For example, - to_timestamp(' 2000    JUN', 'YYYY MON') and - to_timestamp('2000 - JUN', 'YYYY-MON') work, but - to_timestamp('2000    JUN', 'FXYYYY MON') returns an error - because to_timestamp expects only a single space. - FX must be specified as the first item in - the template. - - - - - - A separator (a space or non-letter/non-digit character) in the template string of - to_timestamp and to_date - matches any single separator in the input string or is skipped, - unless the FX option is used. - For example, to_timestamp('2000JUN', 'YYYY///MON') and - to_timestamp('2000/JUN', 'YYYY MON') work, but - to_timestamp('2000//JUN', 'YYYY/MON') - returns an error because the number of separators in the input string - exceeds the number of separators in the template. - - - If FX is specified, a separator in the template string - matches exactly one character in the input string. But note that the - input string character is not required to be the same as the separator from the template string. - For example, to_timestamp('2000/JUN', 'FXYYYY MON') - works, but to_timestamp('2000/JUN', 'FXYYYY  MON') - returns an error because the second space in the template string consumes - the letter J from the input string. - - - - - - A TZH template pattern can match a signed number. - Without the FX option, minus signs may be ambiguous, - and could be interpreted as a separator. - This ambiguity is resolved as follows: If the number of separators before - TZH in the template string is less than the number of - separators before the minus sign in the input string, the minus sign - is interpreted as part of TZH. - Otherwise, the minus sign is considered to be a separator between values. - For example, to_timestamp('2000 -10', 'YYYY TZH') matches - -10 to TZH, but - to_timestamp('2000 -10', 'YYYY  TZH') - matches 10 to TZH. - - - - - - Ordinary text is allowed in to_char - templates and will be output literally. You can put a substring - in double quotes to force it to be interpreted as literal text - even if it contains template patterns. For example, in - '"Hello Year "YYYY', the YYYY - will be replaced by the year data, but the single Y in Year - will not be. - In to_date, to_number, - and to_timestamp, literal text and double-quoted - strings result in skipping the number of characters contained in the - string; for example "XX" skips two input characters - (whether or not they are XX). - - - - Prior to PostgreSQL 12, it was possible to - skip arbitrary text in the input string using non-letter or non-digit - characters. For example, - to_timestamp('2000y6m1d', 'yyyy-MM-DD') used to - work. Now you can only use letter characters for this purpose. 
For example, - to_timestamp('2000y6m1d', 'yyyytMMtDDt') and - to_timestamp('2000y6m1d', 'yyyy"y"MM"m"DD"d"') - skip y, m, and - d. - - - - - - - If you want to have a double quote in the output you must - precede it with a backslash, for example '\"YYYY - Month\"'. - Backslashes are not otherwise special outside of double-quoted - strings. Within a double-quoted string, a backslash causes the - next character to be taken literally, whatever it is (but this - has no special effect unless the next character is a double quote - or another backslash). - - - - - - In to_timestamp and to_date, - if the year format specification is less than four digits, e.g., - YYY, and the supplied year is less than four digits, - the year will be adjusted to be nearest to the year 2020, e.g., - 95 becomes 1995. - - - - - - In to_timestamp and to_date, - negative years are treated as signifying BC. If you write both a - negative year and an explicit BC field, you get AD - again. An input of year zero is treated as 1 BC. - - - - - - In to_timestamp and to_date, - the YYYY conversion has a restriction when - processing years with more than 4 digits. You must - use some non-digit character or template after YYYY, - otherwise the year is always interpreted as 4 digits. For example - (with the year 20000): - to_date('200001130', 'YYYYMMDD') will be - interpreted as a 4-digit year; instead use a non-digit - separator after the year, like - to_date('20000-1130', 'YYYY-MMDD') or - to_date('20000Nov30', 'YYYYMonDD'). - - - - - - In to_timestamp and to_date, - the CC (century) field is accepted but ignored - if there is a YYY, YYYY or - Y,YYY field. If CC is used with - YY or Y then the result is - computed as that year in the specified century. If the century is - specified but the year is not, the first year of the century - is assumed. - - - - - - In to_timestamp and to_date, - weekday names or numbers (DAY, D, - and related field types) are accepted but are ignored for purposes of - computing the result. The same is true for quarter - (Q) fields. - - - - - - In to_timestamp and to_date, - an ISO 8601 week-numbering date (as distinct from a Gregorian date) - can be specified in one of two ways: - - - - Year, week number, and weekday: for - example to_date('2006-42-4', 'IYYY-IW-ID') - returns the date 2006-10-19. - If you omit the weekday it is assumed to be 1 (Monday). - - - - - Year and day of year: for example to_date('2006-291', - 'IYYY-IDDD') also returns 2006-10-19. - - - - - - Attempting to enter a date using a mixture of ISO 8601 week-numbering - fields and Gregorian date fields is nonsensical, and will cause an - error. In the context of an ISO 8601 week-numbering year, the - concept of a month or day of month has no - meaning. In the context of a Gregorian year, the ISO week has no - meaning. - - - - While to_date will reject a mixture of - Gregorian and ISO week-numbering date - fields, to_char will not, since output format - specifications like YYYY-MM-DD (IYYY-IDDD) can be - useful. But avoid writing something like IYYY-MM-DD; - that would yield surprising results near the start of the year. - (See for more - information.) - - - - - - - In to_timestamp, millisecond - (MS) or microsecond (US) - fields are used as the - seconds digits after the decimal point. For example - to_timestamp('12.3', 'SS.MS') is not 3 milliseconds, - but 300, because the conversion treats it as 12 + 0.3 seconds. - So, for the format SS.MS, the input values - 12.3, 12.30, - and 12.300 specify the - same number of milliseconds. 
To get three milliseconds, one must write - 12.003, which the conversion treats as - 12 + 0.003 = 12.003 seconds. - - - - Here is a more - complex example: - to_timestamp('15:12:02.020.001230', 'HH24:MI:SS.MS.US') - is 15 hours, 12 minutes, and 2 seconds + 20 milliseconds + - 1230 microseconds = 2.021230 seconds. - - - - - - to_char(..., 'ID')'s day of the week numbering - matches the extract(isodow from ...) function, but - to_char(..., 'D')'s does not match - extract(dow from ...)'s day numbering. - - - - - - to_char(interval) formats HH and - HH12 as shown on a 12-hour clock, for example zero hours - and 36 hours both output as 12, while HH24 - outputs the full hour value, which can exceed 23 in - an interval value. - - - - - - - - shows the - template patterns available for formatting numeric values. - - - - Template Patterns for Numeric Formatting - - - - Pattern - Description - - - - - 9 - digit position (can be dropped if insignificant) - - - 0 - digit position (will not be dropped, even if insignificant) - - - . (period) - decimal point - - - , (comma) - group (thousands) separator - - - PR - negative value in angle brackets - - - S - sign anchored to number (uses locale) - - - L - currency symbol (uses locale) - - - D - decimal point (uses locale) - - - G - group separator (uses locale) - - - MI - minus sign in specified position (if number < 0) - - - PL - plus sign in specified position (if number > 0) - - - SG - plus/minus sign in specified position - - - RN or rn - Roman numeral (values between 1 and 3999) - - - TH or th - ordinal number suffix - - - V - shift specified number of digits (see notes) - - - EEEE - exponent for scientific notation - - - -
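- For example, a 0 digit position is always printed while a
- 9 is not (FM is used here to suppress the padding
- blanks that would otherwise obscure the difference):
-SELECT to_char(12, 'FM9999');
-Result: 12
-SELECT to_char(12, 'FM0999');
-Result: 0012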
- - - Usage notes for numeric formatting: - - - - - 0 specifies a digit position that will always be printed, - even if it contains a leading/trailing zero. 9 also - specifies a digit position, but if it is a leading zero then it will - be replaced by a space, while if it is a trailing zero and fill mode - is specified then it will be deleted. (For to_number(), - these two pattern characters are equivalent.) - - - - - - If the format provides fewer fractional digits than the number being - formatted, to_char() will round the number to - the specified number of fractional digits. - - - - - - The pattern characters S, L, D, - and G represent the sign, currency symbol, decimal point, - and thousands separator characters defined by the current locale - (see - and ). The pattern characters period - and comma represent those exact characters, with the meanings of - decimal point and thousands separator, regardless of locale. - - - - - - If no explicit provision is made for a sign - in to_char()'s pattern, one column will be reserved for - the sign, and it will be anchored to (appear just left of) the - number. If S appears just left of some 9's, - it will likewise be anchored to the number. - - - - - - A sign formatted using SG, PL, or - MI is not anchored to - the number; for example, - to_char(-12, 'MI9999') produces '-  12' - but to_char(-12, 'S9999') produces '  -12'. - (The Oracle implementation does not allow the use of - MI before 9, but rather - requires that 9 precede - MI.) - - - - - - TH does not convert values less than zero - and does not convert fractional numbers. - - - - - - PL, SG, and - TH are PostgreSQL - extensions. - - - - - - In to_number, if non-data template patterns such - as L or TH are used, the - corresponding number of input characters are skipped, whether or not - they match the template pattern, unless they are data characters - (that is, digits, sign, decimal point, or comma). For - example, TH would skip two non-data characters. - - - - - - V with to_char - multiplies the input values by - 10^n, where - n is the number of digits following - V. V with - to_number divides in a similar manner. - The V can be thought of as marking the position - of an implicit decimal point in the input or output string. - to_char and to_number - do not support the use of - V combined with a decimal point - (e.g., 99.9V99 is not allowed). - - - - - - EEEE (scientific notation) cannot be used in - combination with any of the other formatting patterns or - modifiers other than digit and decimal point patterns, and must be at the end of the format string - (e.g., 9.99EEEE is a valid pattern). - - - - - - In to_number(), the RN - pattern converts Roman numerals (in standard form) to numbers. - Input is case-insensitive, so RN - and rn are equivalent. RN - cannot be used in combination with any other formatting patterns or - modifiers except FM, which is applicable only - in to_char() and is ignored - in to_number(). - - - - - - - Certain modifiers can be applied to any template pattern to alter its - behavior. For example, FM99.99 - is the 99.99 pattern with the - FM modifier. - shows the - modifier patterns for numeric formatting. - - - - Template Pattern Modifiers for Numeric Formatting - - - - Modifier - Description - Example - - - - - FM prefix - fill mode (suppress trailing zeroes and padding blanks) - FM99.99 - - - TH suffix - upper case ordinal number suffix - 999TH - - - th suffix - lower case ordinal number suffix - 999th - - - -
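- For example, the RN pattern in to_number accepts Roman
- numerals in either case:
-SELECT to_number('cdlxxxv', 'RN');
-Result: 485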
- - - shows some - examples of the use of the to_char function. - - - - <function>to_char</function> Examples - - - - Expression - Result - - - - - to_char(current_timestamp, 'Day, DD  HH12:MI:SS') - 'Tuesday  , 06  05:39:18' - - - to_char(current_timestamp, 'FMDay, FMDD  HH12:MI:SS') - 'Tuesday, 6  05:39:18' - - - to_char(current_timestamp AT TIME ZONE - 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS"Z"') - '2022-12-06T05:39:18Z', - ISO 8601 extended format - - - to_char(-0.1, '99.99') - '  -.10' - - - to_char(-0.1, 'FM9.99') - '-.1' - - - to_char(-0.1, 'FM90.99') - '-0.1' - - - to_char(0.1, '0.9') - ' 0.1' - - - to_char(12, '9990999.9') - '    0012.0' - - - to_char(12, 'FM9990999.9') - '0012.' - - - to_char(485, '999') - ' 485' - - - to_char(-485, '999') - '-485' - - - to_char(485, '9 9 9') - ' 4 8 5' - - - to_char(1485, '9,999') - ' 1,485' - - - to_char(1485, '9G999') - ' 1 485' - - - to_char(148.5, '999.999') - ' 148.500' - - - to_char(148.5, 'FM999.999') - '148.5' - - - to_char(148.5, 'FM999.990') - '148.500' - - - to_char(148.5, '999D999') - ' 148,500' - - - to_char(3148.5, '9G999D999') - ' 3 148,500' - - - to_char(-485, '999S') - '485-' - - - to_char(-485, '999MI') - '485-' - - - to_char(485, '999MI') - '485 ' - - - to_char(485, 'FM999MI') - '485' - - - to_char(485, 'PL999') - '+485' - - - to_char(485, 'SG999') - '+485' - - - to_char(-485, 'SG999') - '-485' - - - to_char(-485, '9SG99') - '4-85' - - - to_char(-485, '999PR') - '<485>' - - - to_char(485, 'L999') - 'DM 485' - - - to_char(485, 'RN') - '        CDLXXXV' - - - to_char(485, 'FMRN') - 'CDLXXXV' - - - to_char(5.2, 'FMRN') - 'V' - - - to_char(482, '999th') - ' 482nd' - - - to_char(485, '"Good number:"999') - 'Good number: 485' - - - to_char(485.8, '"Pre:"999" Post:" .999') - 'Pre: 485 Post: .800' - - - to_char(12, '99V999') - ' 12000' - - - to_char(12.4, '99V999') - ' 12400' - - - to_char(12.45, '99V9') - ' 125' - - - to_char(0.0004859, '9.99EEEE') - ' 4.86e-04' - - - -
- -
- - - - Date/Time Functions and Operators - - - shows the available - functions for date/time value processing, with details appearing in - the following subsections. illustrates the behaviors of - the basic arithmetic operators (+, - *, etc.). For formatting functions, refer to - . You should be familiar with - the background information on date/time data types from . - - - - In addition, the usual comparison operators shown in - are available for the - date/time types. Dates and timestamps (with or without time zone) are - all comparable, while times (with or without time zone) and intervals - can only be compared to other values of the same data type. When - comparing a timestamp without time zone to a timestamp with time zone, - the former value is assumed to be given in the time zone specified by - the configuration parameter, and is - rotated to UTC for comparison to the latter value (which is already - in UTC internally). Similarly, a date value is assumed to represent - midnight in the TimeZone zone when comparing it - to a timestamp. - - - - All the functions and operators described below that take time or timestamp - inputs actually come in two variants: one that takes time with time zone or timestamp - with time zone, and one that takes time without time zone or timestamp without time zone. - For brevity, these variants are not shown separately. Also, the - + and * operators come in commutative pairs (for - example both date + integer - and integer + date); we show - only one of each such pair. - - - - Date/Time Operators - - - - - - Operator - - - Description - - - Example(s) - - - - - - - - date + integer - date - - - Add a number of days to a date - - - date '2001-09-28' + 7 - 2001-10-05 - - - - - - date + interval - timestamp - - - Add an interval to a date - - - date '2001-09-28' + interval '1 hour' - 2001-09-28 01:00:00 - - - - - - date + time - timestamp - - - Add a time-of-day to a date - - - date '2001-09-28' + time '03:00' - 2001-09-28 03:00:00 - - - - - - interval + interval - interval - - - Add intervals - - - interval '1 day' + interval '1 hour' - 1 day 01:00:00 - - - - - - timestamp + interval - timestamp - - - Add an interval to a timestamp - - - timestamp '2001-09-28 01:00' + interval '23 hours' - 2001-09-29 00:00:00 - - - - - - time + interval - time - - - Add an interval to a time - - - time '01:00' + interval '3 hours' - 04:00:00 - - - - - - - interval - interval - - - Negate an interval - - - - interval '23 hours' - -23:00:00 - - - - - - date - date - integer - - - Subtract dates, producing the number of days elapsed - - - date '2001-10-01' - date '2001-09-28' - 3 - - - - - - date - integer - date - - - Subtract a number of days from a date - - - date '2001-10-01' - 7 - 2001-09-24 - - - - - - date - interval - timestamp - - - Subtract an interval from a date - - - date '2001-09-28' - interval '1 hour' - 2001-09-27 23:00:00 - - - - - - time - time - interval - - - Subtract times - - - time '05:00' - time '03:00' - 02:00:00 - - - - - - time - interval - time - - - Subtract an interval from a time - - - time '05:00' - interval '2 hours' - 03:00:00 - - - - - - timestamp - interval - timestamp - - - Subtract an interval from a timestamp - - - timestamp '2001-09-28 23:00' - interval '23 hours' - 2001-09-28 00:00:00 - - - - - - interval - interval - interval - - - Subtract intervals - - - interval '1 day' - interval '1 hour' - 1 day -01:00:00 - - - - - - timestamp - timestamp - interval - - - Subtract timestamps (converting 24-hour intervals into days, - similarly 
to justify_hours()) - - - timestamp '2001-09-29 03:00' - timestamp '2001-07-27 12:00' - 63 days 15:00:00 - - - - - - interval * double precision - interval - - - Multiply an interval by a scalar - - - interval '1 second' * 900 - 00:15:00 - - - interval '1 day' * 21 - 21 days - - - interval '1 hour' * 3.5 - 03:30:00 - - - - - - interval / double precision - interval - - - Divide an interval by a scalar - - - interval '1 hour' / 1.5 - 00:40:00 - - - - -
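- As noted above, a timestamp without time zone is taken to be in the
- TimeZone zone when compared with a timestamp with time zone; for
- example, with the session time zone set to UTC:
-SET timezone = 'UTC';
-SELECT timestamp '2001-01-01 00:00' = timestamptz '2001-01-01 00:00+00';
-Result: true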
- - - Date/Time Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - age - - age ( timestamp, timestamp ) - interval - - - Subtract arguments, producing a symbolic result that - uses years and months, rather than just days - - - age(timestamp '2001-04-10', timestamp '1957-06-13') - 43 years 9 mons 27 days - - - - - - age ( timestamp ) - interval - - - Subtract argument from current_date (at midnight) - - - age(timestamp '1957-06-13') - 62 years 6 mons 10 days - - - - - - - clock_timestamp - - clock_timestamp ( ) - timestamp with time zone - - - Current date and time (changes during statement execution); - see - - - clock_timestamp() - 2019-12-23 14:39:53.662522-05 - - - - - - - current_date - - current_date - date - - - Current date; see - - - current_date - 2019-12-23 - - - - - - - current_time - - current_time - time with time zone - - - Current time of day; see - - - current_time - 14:39:53.662522-05 - - - - - - current_time ( integer ) - time with time zone - - - Current time of day, with limited precision; - see - - - current_time(2) - 14:39:53.66-05 - - - - - - - current_timestamp - - current_timestamp - timestamp with time zone - - - Current date and time (start of current transaction); - see - - - current_timestamp - 2019-12-23 14:39:53.662522-05 - - - - - - current_timestamp ( integer ) - timestamp with time zone - - - Current date and time (start of current transaction), with limited precision; - see - - - current_timestamp(0) - 2019-12-23 14:39:53-05 - - - - - - - date_add - - date_add ( timestamp with time zone, interval , text ) - timestamp with time zone - - - Add an interval to a timestamp with time - zone, computing times of day and daylight-savings adjustments - according to the time zone named by the third argument, or the - current setting if that is omitted. - The form with two arguments is equivalent to the timestamp with - time zone + interval operator. - - - date_add('2021-10-31 00:00:00+02'::timestamptz, '1 day'::interval, 'Europe/Warsaw') - 2021-10-31 23:00:00+00 - - - - - - date_bin ( interval, timestamp, timestamp ) - timestamp - - - Bin input into specified interval aligned with specified origin; see - - - date_bin('15 minutes', timestamp '2001-02-16 20:38:40', timestamp '2001-02-16 20:05:00') - 2001-02-16 20:35:00 - - - - - - - date_part - - date_part ( text, timestamp ) - double precision - - - Get timestamp subfield (equivalent to extract); - see - - - date_part('hour', timestamp '2001-02-16 20:38:40') - 20 - - - - - - date_part ( text, interval ) - double precision - - - Get interval subfield (equivalent to extract); - see - - - date_part('month', interval '2 years 3 months') - 3 - - - - - - - date_subtract - - date_subtract ( timestamp with time zone, interval , text ) - timestamp with time zone - - - Subtract an interval from a timestamp with time - zone, computing times of day and daylight-savings adjustments - according to the time zone named by the third argument, or the - current setting if that is omitted. - The form with two arguments is equivalent to the timestamp with - time zone - interval operator. 
- - - date_subtract('2021-11-01 00:00:00+01'::timestamptz, '1 day'::interval, 'Europe/Warsaw') - 2021-10-30 22:00:00+00 - - - - - - - date_trunc - - date_trunc ( text, timestamp ) - timestamp - - - Truncate to specified precision; see - - - date_trunc('hour', timestamp '2001-02-16 20:38:40') - 2001-02-16 20:00:00 - - - - - - date_trunc ( text, timestamp with time zone, text ) - timestamp with time zone - - - Truncate to specified precision in the specified time zone; see - - - - date_trunc('day', timestamptz '2001-02-16 20:38:40+00', 'Australia/Sydney') - 2001-02-16 13:00:00+00 - - - - - - date_trunc ( text, interval ) - interval - - - Truncate to specified precision; see - - - - date_trunc('hour', interval '2 days 3 hours 40 minutes') - 2 days 03:00:00 - - - - - - - extract - - extract ( field from timestamp ) - numeric - - - Get timestamp subfield; see - - - extract(hour from timestamp '2001-02-16 20:38:40') - 20 - - - - - - extract ( field from interval ) - numeric - - - Get interval subfield; see - - - extract(month from interval '2 years 3 months') - 3 - - - - - - - isfinite - - isfinite ( date ) - boolean - - - Test for finite date (not +/-infinity) - - - isfinite(date '2001-02-16') - true - - - - - - isfinite ( timestamp ) - boolean - - - Test for finite timestamp (not +/-infinity) - - - isfinite(timestamp 'infinity') - false - - - - - - isfinite ( interval ) - boolean - - - Test for finite interval (not +/-infinity) - - - isfinite(interval '4 hours') - true - - - - - - - justify_days - - justify_days ( interval ) - interval - - - Adjust interval, converting 30-day time periods to months - - - justify_days(interval '1 year 65 days') - 1 year 2 mons 5 days - - - - - - - justify_hours - - justify_hours ( interval ) - interval - - - Adjust interval, converting 24-hour time periods to days - - - justify_hours(interval '50 hours 10 minutes') - 2 days 02:10:00 - - - - - - - justify_interval - - justify_interval ( interval ) - interval - - - Adjust interval using justify_days - and justify_hours, with additional sign - adjustments - - - justify_interval(interval '1 mon -1 hour') - 29 days 23:00:00 - - - - - - - localtime - - localtime - time - - - Current time of day; - see - - - localtime - 14:39:53.662522 - - - - - - localtime ( integer ) - time - - - Current time of day, with limited precision; - see - - - localtime(0) - 14:39:53 - - - - - - - localtimestamp - - localtimestamp - timestamp - - - Current date and time (start of current transaction); - see - - - localtimestamp - 2019-12-23 14:39:53.662522 - - - - - - localtimestamp ( integer ) - timestamp - - - Current date and time (start of current - transaction), with limited precision; - see - - - localtimestamp(2) - 2019-12-23 14:39:53.66 - - - - - - - make_date - - make_date ( year int, - month int, - day int ) - date - - - Create date from year, month and day fields - (negative years signify BC) - - - make_date(2013, 7, 15) - 2013-07-15 - - - - - - make_interval - - make_interval ( years int - , months int - , weeks int - , days int - , hours int - , mins int - , secs double precision - ) - interval - - - Create interval from years, months, weeks, days, hours, minutes and - seconds fields, each of which can default to zero - - - make_interval(days => 10) - 10 days - - - - - - - make_time - - make_time ( hour int, - min int, - sec double precision ) - time - - - Create time from hour, minute and seconds fields - - - make_time(8, 15, 23.5) - 08:15:23.5 - - - - - - - make_timestamp - - make_timestamp ( year int, - month int, - day int, 
- hour int, - min int, - sec double precision ) - timestamp - - - Create timestamp from year, month, day, hour, minute and seconds fields - (negative years signify BC) - - - make_timestamp(2013, 7, 15, 8, 15, 23.5) - 2013-07-15 08:15:23.5 - - - - - - - make_timestamptz - - make_timestamptz ( year int, - month int, - day int, - hour int, - min int, - sec double precision - , timezone text ) - timestamp with time zone - - - Create timestamp with time zone from year, month, day, hour, minute - and seconds fields (negative years signify BC). - If timezone is not - specified, the current time zone is used; the examples assume the - session time zone is Europe/London - - - make_timestamptz(2013, 7, 15, 8, 15, 23.5) - 2013-07-15 08:15:23.5+01 - - - make_timestamptz(2013, 7, 15, 8, 15, 23.5, 'America/New_York') - 2013-07-15 13:15:23.5+01 - - - - - - - now - - now ( ) - timestamp with time zone - - - Current date and time (start of current transaction); - see - - - now() - 2019-12-23 14:39:53.662522-05 - - - - - - - statement_timestamp - - statement_timestamp ( ) - timestamp with time zone - - - Current date and time (start of current statement); - see - - - statement_timestamp() - 2019-12-23 14:39:53.662522-05 - - - - - - - timeofday - - timeofday ( ) - text - - - Current date and time - (like clock_timestamp, but as a text string); - see - - - timeofday() - Mon Dec 23 14:39:53.662522 2019 EST - - - - - - - transaction_timestamp - - transaction_timestamp ( ) - timestamp with time zone - - - Current date and time (start of current transaction); - see - - - transaction_timestamp() - 2019-12-23 14:39:53.662522-05 - - - - - - - to_timestamp - - to_timestamp ( double precision ) - timestamp with time zone - - - Convert Unix epoch (seconds since 1970-01-01 00:00:00+00) to - timestamp with time zone - - - to_timestamp(1284352323) - 2010-09-13 04:32:03+00 - - - - -
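- The fields of make_interval can be combined using named arguments;
- for example:
-SELECT make_interval(years => 1, months => 2, days => 3);
-Result: 1 year 2 mons 3 days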
- - - - OVERLAPS - - In addition to these functions, the SQL OVERLAPS operator is - supported: - -(start1, end1) OVERLAPS (start2, end2) -(start1, length1) OVERLAPS (start2, length2) - - This expression yields true when two time periods (defined by their - endpoints) overlap, false when they do not overlap. The endpoints - can be specified as pairs of dates, times, or time stamps; or as - a date, time, or time stamp followed by an interval. When a pair - of values is provided, either the start or the end can be written - first; OVERLAPS automatically takes the earlier value - of the pair as the start. Each time period is considered to - represent the half-open interval start <= - time < end, unless - start and end are equal in which case it - represents that single time instant. This means for instance that two - time periods with only an endpoint in common do not overlap. - - - -SELECT (DATE '2001-02-16', DATE '2001-12-21') OVERLAPS - (DATE '2001-10-30', DATE '2002-10-30'); -Result: true -SELECT (DATE '2001-02-16', INTERVAL '100 days') OVERLAPS - (DATE '2001-10-30', DATE '2002-10-30'); -Result: false -SELECT (DATE '2001-10-29', DATE '2001-10-30') OVERLAPS - (DATE '2001-10-30', DATE '2001-10-31'); -Result: false -SELECT (DATE '2001-10-30', DATE '2001-10-30') OVERLAPS - (DATE '2001-10-30', DATE '2001-10-31'); -Result: true - - - - When adding an interval value to (or subtracting an - interval value from) a timestamp - or timestamp with time zone value, the months, days, and - microseconds fields of the interval value are handled in turn. - First, a nonzero months field advances or decrements the date of the - timestamp by the indicated number of months, keeping the day of month the - same unless it would be past the end of the new month, in which case the - last day of that month is used. (For example, March 31 plus 1 month - becomes April 30, but March 31 plus 2 months becomes May 31.) - Then the days field advances or decrements the date of the timestamp by - the indicated number of days. In both these steps the local time of day - is kept the same. Finally, if there is a nonzero microseconds field, it - is added or subtracted literally. - When doing arithmetic on a timestamp with time zone value in - a time zone that recognizes DST, this means that adding or subtracting - (say) interval '1 day' does not necessarily have the - same result as adding or subtracting interval '24 - hours'. - For example, with the session time zone set - to America/Denver: - -SELECT timestamp with time zone '2005-04-02 12:00:00-07' + interval '1 day'; -Result: 2005-04-03 12:00:00-06 -SELECT timestamp with time zone '2005-04-02 12:00:00-07' + interval '24 hours'; -Result: 2005-04-03 13:00:00-06 - - This happens because an hour was skipped due to a change in daylight saving - time at 2005-04-03 02:00:00 in time zone - America/Denver. - - - - Note there can be ambiguity in the months field returned by - age because different months have different numbers of - days. PostgreSQL's approach uses the month from the - earlier of the two dates when calculating partial months. For example, - age('2004-06-01', '2004-04-30') uses April to yield - 1 mon 1 day, while using May would yield 1 mon 2 - days because May has 31 days, while April has only 30. - - - - Subtraction of dates and timestamps can also be complex. 
One conceptually - simple way to perform subtraction is to convert each value to a number - of seconds using EXTRACT(EPOCH FROM ...), then subtract the - results; this produces the - number of seconds between the two values. This will adjust - for the number of days in each month, timezone changes, and daylight - saving time adjustments. Subtraction of date or timestamp - values with the - operator - returns the number of days (24-hours) and hours/minutes/seconds - between the values, making the same adjustments. The age - function returns years, months, days, and hours/minutes/seconds, - performing field-by-field subtraction and then adjusting for negative - field values. The following queries illustrate the differences in these - approaches. The sample results were produced with timezone - = 'US/Eastern'; there is a daylight saving time change between the - two dates used: - - - -SELECT EXTRACT(EPOCH FROM timestamptz '2013-07-01 12:00:00') - - EXTRACT(EPOCH FROM timestamptz '2013-03-01 12:00:00'); -Result: 10537200.000000 -SELECT (EXTRACT(EPOCH FROM timestamptz '2013-07-01 12:00:00') - - EXTRACT(EPOCH FROM timestamptz '2013-03-01 12:00:00')) - / 60 / 60 / 24; -Result: 121.9583333333333333 -SELECT timestamptz '2013-07-01 12:00:00' - timestamptz '2013-03-01 12:00:00'; -Result: 121 days 23:00:00 -SELECT age(timestamptz '2013-07-01 12:00:00', timestamptz '2013-03-01 12:00:00'); -Result: 4 mons - - - - <function>EXTRACT</function>, <function>date_part</function> - - - date_part - - - extract - - - -EXTRACT(field FROM source) - - - - The extract function retrieves subfields - such as year or hour from date/time values. - source must be a value expression of - type timestamp, date, time, - or interval. (Timestamps and times can be with or - without time zone.) - field is an identifier or - string that selects what field to extract from the source value. - Not all fields are valid for every input data type; for example, fields - smaller than a day cannot be extracted from a date, while - fields of a day or more cannot be extracted from a time. - The extract function returns values of type - numeric. - - - - The following are valid field names: - - - - - century - - - The century; for interval values, the year field - divided by 100 - - - -SELECT EXTRACT(CENTURY FROM TIMESTAMP '2000-12-16 12:21:13'); -Result: 20 -SELECT EXTRACT(CENTURY FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 21 -SELECT EXTRACT(CENTURY FROM DATE '0001-01-01 AD'); -Result: 1 -SELECT EXTRACT(CENTURY FROM DATE '0001-12-31 BC'); -Result: -1 -SELECT EXTRACT(CENTURY FROM INTERVAL '2001 years'); -Result: 20 - - - - - - day - - - The day of the month (1–31); for interval - values, the number of days - - - -SELECT EXTRACT(DAY FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 16 -SELECT EXTRACT(DAY FROM INTERVAL '40 days 1 minute'); -Result: 40 - - - - - - - decade - - - The year field divided by 10 - - - -SELECT EXTRACT(DECADE FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 200 - - - - - - dow - - - The day of the week as Sunday (0) to - Saturday (6) - - - -SELECT EXTRACT(DOW FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 5 - - - Note that extract's day of the week numbering - differs from that of the to_char(..., - 'D') function. 
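- For example, for a Friday the two numberings give different results:
-SELECT EXTRACT(DOW FROM DATE '2001-02-16');
-Result: 5
-SELECT to_char(date '2001-02-16', 'D');
-Result: 6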
- - - - - - - doy - - - The day of the year (1–365/366) - - - -SELECT EXTRACT(DOY FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 47 - - - - - - epoch - - - For timestamp with time zone values, the - number of seconds since 1970-01-01 00:00:00 UTC (negative for - timestamps before that); - for date and timestamp values, the - nominal number of seconds since 1970-01-01 00:00:00, - without regard to timezone or daylight-savings rules; - for interval values, the total number - of seconds in the interval - - - -SELECT EXTRACT(EPOCH FROM TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40.12-08'); -Result: 982384720.120000 -SELECT EXTRACT(EPOCH FROM TIMESTAMP '2001-02-16 20:38:40.12'); -Result: 982355920.120000 -SELECT EXTRACT(EPOCH FROM INTERVAL '5 days 3 hours'); -Result: 442800.000000 - - - - You can convert an epoch value back to a timestamp with time zone - with to_timestamp: - - -SELECT to_timestamp(982384720.12); -Result: 2001-02-17 04:38:40.12+00 - - - - Beware that applying to_timestamp to an epoch - extracted from a date or timestamp value - could produce a misleading result: the result will effectively - assume that the original value had been given in UTC, which might - not be the case. - - - - - - hour - - - The hour field (0–23 in timestamps, unrestricted in - intervals) - - - -SELECT EXTRACT(HOUR FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 20 - - - - - - isodow - - - The day of the week as Monday (1) to - Sunday (7) - - - -SELECT EXTRACT(ISODOW FROM TIMESTAMP '2001-02-18 20:38:40'); -Result: 7 - - - This is identical to dow except for Sunday. This - matches the ISO 8601 day of the week numbering. - - - - - - - isoyear - - - The ISO 8601 week-numbering year that the date - falls in - - - -SELECT EXTRACT(ISOYEAR FROM DATE '2006-01-01'); -Result: 2005 -SELECT EXTRACT(ISOYEAR FROM DATE '2006-01-02'); -Result: 2006 - - - - Each ISO 8601 week-numbering year begins with the - Monday of the week containing the 4th of January, so in early - January or late December the ISO year may be - different from the Gregorian year. See the week - field for more information. - - - - - - julian - - - The Julian Date corresponding to the - date or timestamp. Timestamps - that are not local midnight result in a fractional value. See - for more information. - - - -SELECT EXTRACT(JULIAN FROM DATE '2006-01-01'); -Result: 2453737 -SELECT EXTRACT(JULIAN FROM TIMESTAMP '2006-01-01 12:00'); -Result: 2453737.50000000000000000000 - - - - - - microseconds - - - The seconds field, including fractional parts, multiplied by 1 - 000 000; note that this includes full seconds - - - -SELECT EXTRACT(MICROSECONDS FROM TIME '17:12:28.5'); -Result: 28500000 - - - - - - millennium - - - The millennium; for interval values, the year field - divided by 1000 - - - -SELECT EXTRACT(MILLENNIUM FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 3 -SELECT EXTRACT(MILLENNIUM FROM INTERVAL '2001 years'); -Result: 2 - - - - Years in the 1900s are in the second millennium. - The third millennium started January 1, 2001. - - - - - - milliseconds - - - The seconds field, including fractional parts, multiplied by - 1000. Note that this includes full seconds. 
- - - -SELECT EXTRACT(MILLISECONDS FROM TIME '17:12:28.5'); -Result: 28500.000 - - - - - - minute - - - The minutes field (0–59) - - - -SELECT EXTRACT(MINUTE FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 38 - - - - - - month - - - The number of the month within the year (1–12); - for interval values, the number of months modulo 12 - (0–11) - - - -SELECT EXTRACT(MONTH FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 2 -SELECT EXTRACT(MONTH FROM INTERVAL '2 years 3 months'); -Result: 3 -SELECT EXTRACT(MONTH FROM INTERVAL '2 years 13 months'); -Result: 1 - - - - - - quarter - - - The quarter of the year (1–4) that the date is in; - for interval values, the month field divided by 3 - plus 1 - - - -SELECT EXTRACT(QUARTER FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 1 -SELECT EXTRACT(QUARTER FROM INTERVAL '1 year 6 months'); -Result: 3 - - - - - - second - - - The seconds field, including any fractional seconds - - - -SELECT EXTRACT(SECOND FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 40.000000 -SELECT EXTRACT(SECOND FROM TIME '17:12:28.5'); -Result: 28.500000 - - - - - timezone - - - The time zone offset from UTC, measured in seconds. Positive values - correspond to time zones east of UTC, negative values to - zones west of UTC. (Technically, - PostgreSQL does not use UTC because - leap seconds are not handled.) - - - - - - timezone_hour - - - The hour component of the time zone offset - - - - - - timezone_minute - - - The minute component of the time zone offset - - - - - - week - - - The number of the ISO 8601 week-numbering week of - the year. By definition, ISO weeks start on Mondays and the first - week of a year contains January 4 of that year. In other words, the - first Thursday of a year is in week 1 of that year. - - - In the ISO week-numbering system, it is possible for early-January - dates to be part of the 52nd or 53rd week of the previous year, and for - late-December dates to be part of the first week of the next year. - For example, 2005-01-01 is part of the 53rd week of year - 2004, and 2006-01-01 is part of the 52nd week of year - 2005, while 2012-12-31 is part of the first week of 2013. - It's recommended to use the isoyear field together with - week to get consistent results. - - - - For interval values, the week field is simply the number - of integral days divided by 7. - - - -SELECT EXTRACT(WEEK FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 7 -SELECT EXTRACT(WEEK FROM INTERVAL '13 days 24 hours'); -Result: 1 - - - - - - year - - - The year field. Keep in mind there is no 0 AD, so subtracting - BC years from AD years should be done with care. - - - -SELECT EXTRACT(YEAR FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 2001 - - - - - - - - - When processing an interval value, - the extract function produces field values that - match the interpretation used by the interval output function. This - can produce surprising results if one starts with a non-normalized - interval representation, for example: - -SELECT INTERVAL '80 minutes'; -Result: 01:20:00 -SELECT EXTRACT(MINUTES FROM INTERVAL '80 minutes'); -Result: 20 - - - - - - When the input value is +/-Infinity, extract returns - +/-Infinity for monotonically-increasing fields (epoch, - julian, year, isoyear, - decade, century, and millennium - for timestamp inputs; epoch, hour, - day, year, decade, - century, and millennium for - interval inputs). - For other fields, NULL is returned. PostgreSQL - versions before 9.6 returned zero for all cases of infinite input. 
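-
- For example (an illustrative sketch, not part of the documented
- example set; psql would display the NULL result as an empty string):
-
-SELECT EXTRACT(EPOCH FROM TIMESTAMP 'infinity');
-Result: Infinity
-SELECT EXTRACT(MONTH FROM TIMESTAMP 'infinity');
-Result: NULL
-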
- - - - - The extract function is primarily intended - for computational processing. For formatting date/time values for - display, see . - - - - The date_part function is modeled on the traditional - Ingres equivalent to the - SQL-standard function extract: - -date_part('field', source) - - Note that here the field parameter needs to - be a string value, not a name. The valid field names for - date_part are the same as for - extract. - For historical reasons, the date_part function - returns values of type double precision. This can result in - a loss of precision in certain uses. Using extract - is recommended instead. - - - -SELECT date_part('day', TIMESTAMP '2001-02-16 20:38:40'); -Result: 16 -SELECT date_part('hour', INTERVAL '4 hours 3 minutes'); -Result: 4 - - - - - - <function>date_trunc</function> - - - date_trunc - - - - The function date_trunc is conceptually - similar to the trunc function for numbers. - - - - -date_trunc(field, source , time_zone ) - - source is a value expression of type - timestamp, timestamp with time zone, - or interval. - (Values of type date and - time are cast automatically to timestamp or - interval, respectively.) - field selects to which precision to - truncate the input value. The return value is likewise of type - timestamp, timestamp with time zone, - or interval, - and it has all fields that are less significant than the - selected one set to zero (or one, for day and month). - - - - Valid values for field are: - - microseconds - milliseconds - second - minute - hour - day - week - month - quarter - year - decade - century - millennium - - - - - When the input value is of type timestamp with time zone, - the truncation is performed with respect to a particular time zone; - for example, truncation to day produces a value that - is midnight in that zone. By default, truncation is done with respect - to the current setting, but the - optional time_zone argument can be provided - to specify a different time zone. The time zone name can be specified - in any of the ways described in . - - - - A time zone cannot be specified when processing timestamp without - time zone or interval inputs. These are always - taken at face value. - - - - Examples (assuming the local time zone is America/New_York): - -SELECT date_trunc('hour', TIMESTAMP '2001-02-16 20:38:40'); -Result: 2001-02-16 20:00:00 -SELECT date_trunc('year', TIMESTAMP '2001-02-16 20:38:40'); -Result: 2001-01-01 00:00:00 -SELECT date_trunc('day', TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40+00'); -Result: 2001-02-16 00:00:00-05 -SELECT date_trunc('day', TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40+00', 'Australia/Sydney'); -Result: 2001-02-16 08:00:00-05 -SELECT date_trunc('hour', INTERVAL '3 days 02:47:33'); -Result: 3 days 02:00:00 - - - - - - <function>date_bin</function> - - - date_bin - - - - The function date_bin bins the input - timestamp into the specified interval (the stride) - aligned with a specified origin. - - - - -date_bin(stride, source, origin) - - source is a value expression of type - timestamp or timestamp with time zone. (Values - of type date are cast automatically to - timestamp.) stride is a value - expression of type interval. The return value is likewise - of type timestamp or timestamp with time zone, - and it marks the beginning of the bin into which the - source is placed. 
- - - - Examples: - -SELECT date_bin('15 minutes', TIMESTAMP '2020-02-11 15:44:17', TIMESTAMP '2001-01-01'); -Result: 2020-02-11 15:30:00 -SELECT date_bin('15 minutes', TIMESTAMP '2020-02-11 15:44:17', TIMESTAMP '2001-01-01 00:02:30'); -Result: 2020-02-11 15:32:30 - - - - - In the case of full units (1 minute, 1 hour, etc.), it gives the same result as - the analogous date_trunc call, but the difference is - that date_bin can truncate to an arbitrary interval. - - - - The stride interval must be greater than zero and - cannot contain units of month or larger. - - - - - <literal>AT TIME ZONE</literal> and <literal>AT LOCAL</literal> - - - time zone - conversion - - - - AT TIME ZONE - - - - AT LOCAL - - - - The AT TIME ZONE operator converts time - stamp without time zone to/from - time stamp with time zone, and - time with time zone values to different time - zones. shows its - variants. - - - - <literal>AT TIME ZONE</literal> and <literal>AT LOCAL</literal> Variants - - - - - Operator - - - Description - - - Example(s) - - - - - - - - timestamp without time zone AT TIME ZONE zone - timestamp with time zone - - - Converts given time stamp without time zone to - time stamp with time zone, assuming the given - value is in the named time zone. - - - timestamp '2001-02-16 20:38:40' at time zone 'America/Denver' - 2001-02-17 03:38:40+00 - - - - - - timestamp without time zone AT LOCAL - timestamp with time zone - - - Converts given time stamp without time zone to - time stamp with the session's - TimeZone value as time zone. - - - timestamp '2001-02-16 20:38:40' at local - 2001-02-17 03:38:40+00 - - - - - - timestamp with time zone AT TIME ZONE zone - timestamp without time zone - - - Converts given time stamp with time zone to - time stamp without time zone, as the time would - appear in that zone. - - - timestamp with time zone '2001-02-16 20:38:40-05' at time zone 'America/Denver' - 2001-02-16 18:38:40 - - - - - - timestamp with time zone AT LOCAL - timestamp without time zone - - - Converts given time stamp with time zone to - time stamp without time zone, as the time would - appear with the session's TimeZone value as time zone. - - - timestamp with time zone '2001-02-16 20:38:40-05' at local - 2001-02-16 18:38:40 - - - - - - time with time zone AT TIME ZONE zone - time with time zone - - - Converts given time with time zone to a new time - zone. Since no date is supplied, this uses the currently active UTC - offset for the named destination zone. - - - time with time zone '05:34:17-05' at time zone 'UTC' - 10:34:17+00 - - - - - - time with time zone AT LOCAL - time with time zone - - - Converts given time with time zone to a new time - zone. Since no date is supplied, this uses the currently active UTC - offset for the session's TimeZone value. - - - Assuming the session's TimeZone is set to UTC: - - - time with time zone '05:34:17-05' at local - 10:34:17+00 - - - - -
- - - In these expressions, the desired time zone zone can be - specified either as a text value (e.g., 'America/Los_Angeles') - or as an interval (e.g., INTERVAL '-08:00'). - In the text case, a time zone name can be specified in any of the ways - described in . - The interval case is only useful for zones that have fixed offsets from - UTC, so it is not very common in practice. - - - - The syntax AT LOCAL may be used as shorthand for - AT TIME ZONE local, where - local is the session's - TimeZone value. - - - - Examples (assuming the current setting - is America/Los_Angeles): - -SELECT TIMESTAMP '2001-02-16 20:38:40' AT TIME ZONE 'America/Denver'; -Result: 2001-02-16 19:38:40-08 -SELECT TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40-05' AT TIME ZONE 'America/Denver'; -Result: 2001-02-16 18:38:40 -SELECT TIMESTAMP '2001-02-16 20:38:40' AT TIME ZONE 'Asia/Tokyo' AT TIME ZONE 'America/Chicago'; -Result: 2001-02-16 05:38:40 -SELECT TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40-05' AT LOCAL; -Result: 2001-02-16 17:38:40 -SELECT TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40-05' AT TIME ZONE '+05'; -Result: 2001-02-16 20:38:40 -SELECT TIME WITH TIME ZONE '20:38:40-05' AT LOCAL; -Result: 17:38:40 - - The first example adds a time zone to a value that lacks it, and - displays the value using the current TimeZone - setting. The second example shifts the time stamp with time zone value - to the specified time zone, and returns the value without a time zone. - This allows storage and display of values different from the current - TimeZone setting. The third example converts - Tokyo time to Chicago time. The fourth example shifts the time stamp - with time zone value to the time zone currently specified by the - TimeZone setting and returns the value without a - time zone. The fifth example demonstrates that the sign in a POSIX-style - time zone specification has the opposite meaning of the sign in an - ISO-8601 datetime literal, as described in - and . - - - - The sixth example is a cautionary tale. Because there is no date - associated with the input value, the conversion is made using the - current date of the session. Therefore, this static example may show a - wrong result depending on the time of year at which it is run, because - 'America/Los_Angeles' observes daylight saving time. - - - - The function timezone(zone, - timestamp) is equivalent to the SQL-conforming construct - timestamp AT TIME ZONE - zone. - - - - The function timezone(zone, - time) is equivalent to the SQL-conforming construct - time AT TIME ZONE - zone. - - - - The function timezone(timestamp) - is equivalent to the SQL-conforming construct timestamp - AT LOCAL. - - - - The function timezone(time) - is equivalent to the SQL-conforming construct time - AT LOCAL. -
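-
- To illustrate these equivalences concretely (a small sketch, assuming
- the session TimeZone is UTC so that the result is displayed with a
- +00 offset):
-
-SELECT timezone('America/Denver', TIMESTAMP '2001-02-16 20:38:40');
-Result: 2001-02-17 03:38:40+00
-SELECT TIMESTAMP '2001-02-16 20:38:40' AT TIME ZONE 'America/Denver';
-Result: 2001-02-17 03:38:40+00
-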
- - - Current Date/Time - - - date - current - - - - time - current - - - - PostgreSQL provides a number of functions - that return values related to the current date and time. These - SQL-standard functions all return values based on the start time of - the current transaction: - -CURRENT_DATE -CURRENT_TIME -CURRENT_TIMESTAMP -CURRENT_TIME(precision) -CURRENT_TIMESTAMP(precision) -LOCALTIME -LOCALTIMESTAMP -LOCALTIME(precision) -LOCALTIMESTAMP(precision) - - - - - CURRENT_TIME and - CURRENT_TIMESTAMP deliver values with time zone; - LOCALTIME and - LOCALTIMESTAMP deliver values without time zone. - - - - CURRENT_TIME, - CURRENT_TIMESTAMP, - LOCALTIME, and - LOCALTIMESTAMP - can optionally take - a precision parameter, which causes the result to be rounded - to that many fractional digits in the seconds field. Without a precision parameter, - the result is given to the full available precision. - - - - Some examples: - -SELECT CURRENT_TIME; -Result: 14:39:53.662522-05 -SELECT CURRENT_DATE; -Result: 2019-12-23 -SELECT CURRENT_TIMESTAMP; -Result: 2019-12-23 14:39:53.662522-05 -SELECT CURRENT_TIMESTAMP(2); -Result: 2019-12-23 14:39:53.66-05 -SELECT LOCALTIMESTAMP; -Result: 2019-12-23 14:39:53.662522 - - - - - Since these functions return - the start time of the current transaction, their values do not - change during the transaction. This is considered a feature: - the intent is to allow a single transaction to have a consistent - notion of the current time, so that multiple - modifications within the same transaction bear the same - time stamp. - - - - - Other database systems might advance these values more - frequently. - - - - - PostgreSQL also provides functions that - return the start time of the current statement, as well as the actual - current time at the instant the function is called. The complete list - of non-SQL-standard time functions is: - -transaction_timestamp() -statement_timestamp() -clock_timestamp() -timeofday() -now() - - - - - transaction_timestamp() is equivalent to - CURRENT_TIMESTAMP, but is named to clearly reflect - what it returns. - statement_timestamp() returns the start time of the current - statement (more specifically, the time of receipt of the latest command - message from the client). - statement_timestamp() and transaction_timestamp() - return the same value during the first command of a transaction, but might - differ during subsequent commands. - clock_timestamp() returns the actual current time, and - therefore its value changes even within a single SQL command. - timeofday() is a historical - PostgreSQL function. Like - clock_timestamp(), it returns the actual current time, - but as a formatted text string rather than a timestamp - with time zone value. - now() is a traditional PostgreSQL - equivalent to transaction_timestamp(). - - - - All the date/time data types also accept the special literal value - now to specify the current date and time (again, - interpreted as the transaction start time). Thus, - the following three all return the same result: - -SELECT CURRENT_TIMESTAMP; -SELECT now(); -SELECT TIMESTAMP 'now'; -- but see tip below - - - - - - Do not use the third form when specifying a value to be evaluated later, - for example in a DEFAULT clause for a table column. - The system will convert now - to a timestamp as soon as the constant is parsed, so that when - the default value is needed, - the time of the table creation would be used! 
The first two - forms will not be evaluated until the default value is used, - because they are function calls. Thus they will give the desired - behavior of defaulting to the time of row insertion. - (See also .) - - - - - - Delaying Execution - - - pg_sleep - - - pg_sleep_for - - - pg_sleep_until - - - sleep - - - delay - - - - The following functions are available to delay execution of the server - process: - -pg_sleep ( double precision ) -pg_sleep_for ( interval ) -pg_sleep_until ( timestamp with time zone ) - - - pg_sleep makes the current session's process - sleep until the given number of seconds have - elapsed. Fractional-second delays can be specified. - pg_sleep_for is a convenience function to - allow the sleep time to be specified as an interval. - pg_sleep_until is a convenience function for when - a specific wake-up time is desired. - For example: - - -SELECT pg_sleep(1.5); -SELECT pg_sleep_for('5 minutes'); -SELECT pg_sleep_until('tomorrow 03:00'); - - - - - - The effective resolution of the sleep interval is platform-specific; - 0.01 seconds is a common value. The sleep delay will be at least as long - as specified. It might be longer depending on factors such as server load. - In particular, pg_sleep_until is not guaranteed to - wake up exactly at the specified time, but it will not wake up any earlier. - - - - - - Make sure that your session does not hold more locks than necessary - when calling pg_sleep or its variants. Otherwise - other sessions might have to wait for your sleeping process, slowing down - the entire system. - - - - -
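-
- As a rough sketch tying the sleep functions back to the current-time
- functions above (exact timings will vary, and the sleep is kept short
- per the preceding caution), the fixed transaction timestamp can be
- contrasted with the advancing wall clock:
-
-BEGIN;
-SELECT now() AS txn_time, clock_timestamp() AS wall_time;
-SELECT pg_sleep(2);
-SELECT now() AS txn_time, clock_timestamp() AS wall_time;
-COMMIT;
-
- In the second query, now() is unchanged while clock_timestamp() reads
- about two seconds later.
-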
- - - - Enum Support Functions - - - For enum types (described in ), - there are several functions that allow cleaner programming without - hard-coding particular values of an enum type. - These are listed in . The examples - assume an enum type created as: - - -CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple'); - - - - - - Enum Support Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - enum_first - - enum_first ( anyenum ) - anyenum - - - Returns the first value of the input enum type. - - - enum_first(null::rainbow) - red - - - - - - enum_last - - enum_last ( anyenum ) - anyenum - - - Returns the last value of the input enum type. - - - enum_last(null::rainbow) - purple - - - - - - enum_range - - enum_range ( anyenum ) - anyarray - - - Returns all values of the input enum type in an ordered array. - - - enum_range(null::rainbow) - {red,orange,yellow,&zwsp;green,blue,purple} - - - - - enum_range ( anyenum, anyenum ) - anyarray - - - Returns the range between the two given enum values, as an ordered - array. The values must be from the same enum type. If the first - parameter is null, the result will start with the first value of - the enum type. - If the second parameter is null, the result will end with the last - value of the enum type. - - - enum_range('orange'::rainbow, 'green'::rainbow) - {orange,yellow,green} - - - enum_range(NULL, 'green'::rainbow) - {red,orange,&zwsp;yellow,green} - - - enum_range('orange'::rainbow, NULL) - {orange,yellow,green,&zwsp;blue,purple} - - - - -
- - - Notice that except for the two-argument form of enum_range, - these functions disregard the specific value passed to them; they care - only about its declared data type. Either null or a specific value of - the type can be passed, with the same result. It is more common to - apply these functions to a table column or function argument than to - a hardwired type name as used in the examples. - -
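-
- As a further usage sketch building on the rainbow type above, the
- array returned by enum_range can be subscripted to fetch a value by
- ordinal position:
-
-SELECT (enum_range(NULL::rainbow))[3];
-Result: yellow
-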
- - - Geometric Functions and Operators - - The geometric types point, box, - lseg, line, path, - polygon, and circle have a large set of - native support functions and operators, shown in , , and . - - - - Geometric Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - geometric_type + point - geometric_type - - - Adds the coordinates of the second point to those of each - point of the first argument, thus performing translation. - Available for point, box, path, - circle. - - - box '(1,1),(0,0)' + point '(2,0)' - (3,1),(2,0) - - - - - - path + path - path - - - Concatenates two open paths (returns NULL if either path is closed). - - - path '[(0,0),(1,1)]' + path '[(2,2),(3,3),(4,4)]' - [(0,0),(1,1),(2,2),(3,3),(4,4)] - - - - - - geometric_type - point - geometric_type - - - Subtracts the coordinates of the second point from those - of each point of the first argument, thus performing translation. - Available for point, box, path, - circle. - - - box '(1,1),(0,0)' - point '(2,0)' - (-1,1),(-2,0) - - - - - - geometric_type * point - geometric_type - - - Multiplies each point of the first argument by the second - point (treating a point as being a complex number - represented by real and imaginary parts, and performing standard - complex multiplication). If one interprets - the second point as a vector, this is equivalent to - scaling the object's size and distance from the origin by the length - of the vector, and rotating it counterclockwise around the origin by - the vector's angle from the x axis. - Available for point, box, path, circle. - (Rotating a box with these operators only moves its corner points: the - box is still considered to have sides parallel to the axes. Hence the - box's size is not preserved, as a true rotation would do.) - - - path '((0,0),(1,0),(1,1))' * point '(3.0,0)' - ((0,0),(3,0),(3,3)) - - - path '((0,0),(1,0),(1,1))' * point(cosd(45), sind(45)) - ((0,0),&zwsp;(0.7071067811865475,0.7071067811865475),&zwsp;(0,1.414213562373095)) - - - - - - geometric_type / point - geometric_type - - - Divides each point of the first argument by the second - point (treating a point as being a complex number - represented by real and imaginary parts, and performing standard - complex division). If one interprets - the second point as a vector, this is equivalent to - scaling the object's size and distance from the origin down by the - length of the vector, and rotating it clockwise around the origin by - the vector's angle from the x axis. - Available for point, box, path, - circle. - - - path '((0,0),(1,0),(1,1))' / point '(2.0,0)' - ((0,0),(0.5,0),(0.5,0.5)) - - - path '((0,0),(1,0),(1,1))' / point(cosd(45), sind(45)) - ((0,0),&zwsp;(0.7071067811865476,-0.7071067811865476),&zwsp;(1.4142135623730951,0)) - - - - - - @-@ geometric_type - double precision - - - Computes the total length. - Available for lseg, path. - - - @-@ path '[(0,0),(1,0),(1,1)]' - 2 - - - - - - @@ geometric_type - point - - - Computes the center point. - Available for box, lseg, - polygon, circle. - - - @@ box '(2,2),(0,0)' - (1,1) - - - - - - # geometric_type - integer - - - Returns the number of points. - Available for path, polygon. - - - # path '((1,0),(0,1),(-1,0))' - 3 - - - - - - geometric_type # geometric_type - point - - - Computes the point of intersection, or NULL if there is none. - Available for lseg, line. - - - lseg '[(0,0),(1,1)]' # lseg '[(1,0),(0,1)]' - (0.5,0.5) - - - - - - box # box - box - - - Computes the intersection of two boxes, or NULL if there is none.
- - - box '(2,2),(-1,-1)' # box '(1,1),(-2,-2)' - (1,1),(-1,-1) - - - - - - geometric_type ## geometric_type - point - - - Computes the closest point to the first object on the second object. - Available for these pairs of types: - (point, box), - (point, lseg), - (point, line), - (lseg, box), - (lseg, lseg), - (line, lseg). - - - point '(0,0)' ## lseg '[(2,0),(0,2)]' - (1,1) - - - - - - geometric_type <-> geometric_type - double precision - - - Computes the distance between the objects. - Available for all seven geometric types, for all combinations - of point with another geometric type, and for - these additional pairs of types: - (box, lseg), - (lseg, line), - (polygon, circle) - (and the commutator cases). - - - circle '<(0,0),1>' <-> circle '<(5,0),1>' - 3 - - - - - - geometric_type @> geometric_type - boolean - - - Does first object contain second? - Available for these pairs of types: - (box, point), - (box, box), - (path, point), - (polygon, point), - (polygon, polygon), - (circle, point), - (circle, circle). - - - circle '<(0,0),2>' @> point '(1,1)' - t - - - - - - geometric_type <@ geometric_type - boolean - - - Is first object contained in or on second? - Available for these pairs of types: - (point, box), - (point, lseg), - (point, line), - (point, path), - (point, polygon), - (point, circle), - (box, box), - (lseg, box), - (lseg, line), - (polygon, polygon), - (circle, circle). - - - point '(1,1)' <@ circle '<(0,0),2>' - t - - - - - - geometric_type && geometric_type - boolean - - - Do these objects overlap? (One point in common makes this true.) - Available for box, polygon, - circle. - - - box '(1,1),(0,0)' && box '(2,2),(0,0)' - t - - - - - - geometric_type << geometric_type - boolean - - - Is first object strictly left of second? - Available for point, box, - polygon, circle. - - - circle '<(0,0),1>' << circle '<(5,0),1>' - t - - - - - - geometric_type >> geometric_type - boolean - - - Is first object strictly right of second? - Available for point, box, - polygon, circle. - - - circle '<(5,0),1>' >> circle '<(0,0),1>' - t - - - - - - geometric_type &< geometric_type - boolean - - - Does first object not extend to the right of second? - Available for box, polygon, - circle. - - - box '(1,1),(0,0)' &< box '(2,2),(0,0)' - t - - - - - - geometric_type &> geometric_type - boolean - - - Does first object not extend to the left of second? - Available for box, polygon, - circle. - - - box '(3,3),(0,0)' &> box '(2,2),(0,0)' - t - - - - - - geometric_type <<| geometric_type - boolean - - - Is first object strictly below second? - Available for point, box, polygon, - circle. - - - box '(3,3),(0,0)' <<| box '(5,5),(3,4)' - t - - - - - - geometric_type |>> geometric_type - boolean - - - Is first object strictly above second? - Available for point, box, polygon, - circle. - - - box '(5,5),(3,4)' |>> box '(3,3),(0,0)' - t - - - - - - geometric_type &<| geometric_type - boolean - - - Does first object not extend above second? - Available for box, polygon, - circle. - - - box '(1,1),(0,0)' &<| box '(2,2),(0,0)' - t - - - - - - geometric_type |&> geometric_type - boolean - - - Does first object not extend below second? - Available for box, polygon, - circle. - - - box '(3,3),(0,0)' |&> box '(2,2),(0,0)' - t - - - - - - box <^ box - boolean - - - Is first object below second (allows edges to touch)? - - - box '((1,1),(0,0))' <^ box '((2,2),(1,1))' - t - - - - - - box >^ box - boolean - - - Is first object above second (allows edges to touch)? 
- - - box '((2,2),(1,1))' >^ box '((1,1),(0,0))' - t - - - - - - geometric_type ?# geometric_type - boolean - - - Do these objects intersect? - Available for these pairs of types: - (box, box), - (lseg, box), - (lseg, lseg), - (lseg, line), - (line, box), - (line, line), - (path, path). - - - lseg '[(-1,0),(1,0)]' ?# box '(2,2),(-2,-2)' - t - - - - - - ?- line - boolean - - - ?- lseg - boolean - - - Is line horizontal? - - - ?- lseg '[(-1,0),(1,0)]' - t - - - - - - point ?- point - boolean - - - Are points horizontally aligned (that is, have same y coordinate)? - - - point '(1,0)' ?- point '(0,0)' - t - - - - - - ?| line - boolean - - - ?| lseg - boolean - - - Is line vertical? - - - ?| lseg '[(-1,0),(1,0)]' - f - - - - - - point ?| point - boolean - - - Are points vertically aligned (that is, have same x coordinate)? - - - point '(0,1)' ?| point '(0,0)' - t - - - - - - line ?-| line - boolean - - - lseg ?-| lseg - boolean - - - Are lines perpendicular? - - - lseg '[(0,0),(0,1)]' ?-| lseg '[(0,0),(1,0)]' - t - - - - - - line ?|| line - boolean - - - lseg ?|| lseg - boolean - - - Are lines parallel? - - - lseg '[(-1,0),(1,0)]' ?|| lseg '[(-1,2),(1,2)]' - t - - - - - - geometric_type ~= geometric_type - boolean - - - Are these objects the same? - Available for point, box, - polygon, circle. - - - polygon '((0,0),(1,1))' ~= polygon '((1,1),(0,0))' - t - - - - -
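-
- As the note below explains, = compares only areas for these types,
- while ~= tests true equality; a quick sketch with two equal-area but
- distinct boxes:
-
-SELECT box '(2,2),(0,0)' = box '(3,3),(1,1)';
-Result: t
-SELECT box '(2,2),(0,0)' ~= box '(3,3),(1,1)';
-Result: f
-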
- - - - Note that the same as operator, ~=, - represents the usual notion of equality for the point, - box, polygon, and circle types. - Some of the geometric types also have an = operator, but - = compares for equal areas only. - The other scalar comparison operators (<= and so - on), where available for these types, likewise compare areas. - - - - - - Before PostgreSQL 14, the point - is strictly below/above comparison operators point - <<| point and point - |>> point were respectively - called <^ and >^. These - names are still available, but are deprecated and will eventually be - removed. - - - - - Geometric Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - area - - area ( geometric_type ) - double precision - - - Computes area. - Available for box, path, circle. - A path input must be closed, else NULL is returned. - Also, if the path is self-intersecting, the result may be - meaningless. - - - area(box '(2,2),(0,0)') - 4 - - - - - - - center - - center ( geometric_type ) - point - - - Computes center point. - Available for box, circle. - - - center(box '(1,2),(0,0)') - (0.5,1) - - - - - - - diagonal - - diagonal ( box ) - lseg - - - Extracts box's diagonal as a line segment - (same as lseg(box)). - - - diagonal(box '(1,2),(0,0)') - [(1,2),(0,0)] - - - - - - - diameter - - diameter ( circle ) - double precision - - - Computes diameter of circle. - - - diameter(circle '<(0,0),2>') - 4 - - - - - - - height - - height ( box ) - double precision - - - Computes vertical size of box. - - - height(box '(1,2),(0,0)') - 2 - - - - - - - isclosed - - isclosed ( path ) - boolean - - - Is path closed? - - - isclosed(path '((0,0),(1,1),(2,0))') - t - - - - - - - isopen - - isopen ( path ) - boolean - - - Is path open? - - - isopen(path '[(0,0),(1,1),(2,0)]') - t - - - - - - - length - - length ( geometric_type ) - double precision - - - Computes the total length. - Available for lseg, path. - - - length(path '((-1,0),(1,0))') - 4 - - - - - - - npoints - - npoints ( geometric_type ) - integer - - - Returns the number of points. - Available for path, polygon. - - - npoints(path '[(0,0),(1,1),(2,0)]') - 3 - - - - - - - pclose - - pclose ( path ) - path - - - Converts path to closed form. - - - pclose(path '[(0,0),(1,1),(2,0)]') - ((0,0),(1,1),(2,0)) - - - - - - - popen - - popen ( path ) - path - - - Converts path to open form. - - - popen(path '((0,0),(1,1),(2,0))') - [(0,0),(1,1),(2,0)] - - - - - - - radius - - radius ( circle ) - double precision - - - Computes radius of circle. - - - radius(circle '<(0,0),2>') - 2 - - - - - - - slope - - slope ( point, point ) - double precision - - - Computes slope of a line drawn through the two points. - - - slope(point '(0,0)', point '(2,1)') - 0.5 - - - - - - - width - - width ( box ) - double precision - - - Computes horizontal size of box. - - - width(box '(1,2),(0,0)') - 1 - - - - -
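-
- These functions compose naturally; for example (a small sketch),
- closing an open path and then testing it:
-
-SELECT isclosed(pclose(path '[(0,0),(1,1),(2,0)]'));
-Result: t
-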
- - - Geometric Type Conversion Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - box - - box ( circle ) - box - - - Computes box inscribed within the circle. - - - box(circle '<(0,0),2>') - (1.414213562373095,1.414213562373095),&zwsp;(-1.414213562373095,-1.414213562373095) - - - - - - box ( point ) - box - - - Converts point to empty box. - - - box(point '(1,0)') - (1,0),(1,0) - - - - - - box ( point, point ) - box - - - Converts any two corner points to box. - - - box(point '(0,1)', point '(1,0)') - (1,1),(0,0) - - - - - - box ( polygon ) - box - - - Computes bounding box of polygon. - - - box(polygon '((0,0),(1,1),(2,0))') - (2,1),(0,0) - - - - - - - bound_box - - bound_box ( box, box ) - box - - - Computes bounding box of two boxes. - - - bound_box(box '(1,1),(0,0)', box '(4,4),(3,3)') - (4,4),(0,0) - - - - - - - circle - - circle ( box ) - circle - - - Computes smallest circle enclosing box. - - - circle(box '(1,1),(0,0)') - <(0.5,0.5),0.7071067811865476> - - - - - - circle ( point, double precision ) - circle - - - Constructs circle from center and radius. - - - circle(point '(0,0)', 2.0) - <(0,0),2> - - - - - - circle ( polygon ) - circle - - - Converts polygon to circle. The circle's center is the mean of the - positions of the polygon's points, and the radius is the average - distance of the polygon's points from that center. - - - circle(polygon '((0,0),(1,3),(2,0))') - <(1,1),1.6094757082487299> - - - - - - - line - - line ( point, point ) - line - - - Converts two points to the line through them. - - - line(point '(-1,0)', point '(1,0)') - {0,-1,0} - - - - - - - lseg - - lseg ( box ) - lseg - - - Extracts box's diagonal as a line segment. - - - lseg(box '(1,0),(-1,0)') - [(1,0),(-1,0)] - - - - - - lseg ( point, point ) - lseg - - - Constructs line segment from two endpoints. - - - lseg(point '(-1,0)', point '(1,0)') - [(-1,0),(1,0)] - - - - - - - path - - path ( polygon ) - path - - - Converts polygon to a closed path with the same list of points. - - - path(polygon '((0,0),(1,1),(2,0))') - ((0,0),(1,1),(2,0)) - - - - - - - point - - point ( double precision, double precision ) - point - - - Constructs point from its coordinates. - - - point(23.4, -44.5) - (23.4,-44.5) - - - - - - point ( box ) - point - - - Computes center of box. - - - point(box '(1,0),(-1,0)') - (0,0) - - - - - - point ( circle ) - point - - - Computes center of circle. - - - point(circle '<(0,0),2>') - (0,0) - - - - - - point ( lseg ) - point - - - Computes center of line segment. - - - point(lseg '[(-1,0),(1,0)]') - (0,0) - - - - - - point ( polygon ) - point - - - Computes center of polygon (the mean of the - positions of the polygon's points). - - - point(polygon '((0,0),(1,1),(2,0))') - (1,0.3333333333333333) - - - - - - - polygon - - polygon ( box ) - polygon - - - Converts box to a 4-point polygon. - - - polygon(box '(1,1),(0,0)') - ((0,0),(0,1),(1,1),(1,0)) - - - - - - polygon ( circle ) - polygon - - - Converts circle to a 12-point polygon. 
- - - polygon(circle '<(0,0),2>') - ((-2,0),&zwsp;(-1.7320508075688774,0.9999999999999999),&zwsp;(-1.0000000000000002,1.7320508075688772),&zwsp;(-1.2246063538223773e-16,2),&zwsp;(0.9999999999999996,1.7320508075688774),&zwsp;(1.732050807568877,1.0000000000000007),&zwsp;(2,2.4492127076447545e-16),&zwsp;(1.7320508075688776,-0.9999999999999994),&zwsp;(1.0000000000000009,-1.7320508075688767),&zwsp;(3.673819061467132e-16,-2),&zwsp;(-0.9999999999999987,-1.732050807568878),&zwsp;(-1.7320508075688767,-1.0000000000000009)) - - - - - - polygon ( integer, circle ) - polygon - - - Converts circle to an n-point polygon. - - - polygon(4, circle '<(3,0),1>') - ((2,0),&zwsp;(3,1),&zwsp;(4,1.2246063538223773e-16),&zwsp;(3,-1)) - - - - - - polygon ( path ) - polygon - - - Converts closed path to a polygon with the same list of points. - - - polygon(path '((0,0),(1,1),(2,0))') - ((0,0),(1,1),(2,0)) - - - - - -
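-
- The conversion functions likewise combine with the functions shown
- earlier; a brief sketch:
-
-SELECT npoints(polygon(8, circle '<(0,0),1>'));
-Result: 8
-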
- - - It is possible to access the two component numbers of a point - as though the point were an array with indexes 0 and 1. For example, if - t.p is a point column then - SELECT p[0] FROM t retrieves the X coordinate and - UPDATE t SET p[1] = ... changes the Y coordinate. - In the same way, a value of type box or lseg can be treated - as an array of two point values. - - -
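-
- A minimal sketch of this subscripting, applied to literals rather than
- a table column:
-
-SELECT (point '(1,2)')[0];
-Result: 1
-SELECT (lseg '[(1,2),(3,4)]')[1];
-Result: (3,4)
-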
- - - - Network Address Functions and Operators - - - The IP network address types, cidr and inet, - support the usual comparison operators shown in - - as well as the specialized operators and functions shown in - and - . - - - - Any cidr value can be cast to inet implicitly; - therefore, the operators and functions shown below as operating on - inet also work on cidr values. (Where there are - separate functions for inet and cidr, it is - because the behavior should be different for the two cases.) - Also, it is permitted to cast an inet value - to cidr. When this is done, any bits to the right of the - netmask are silently zeroed to create a valid cidr value. - - - - IP Address Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - inet << inet - boolean - - - Is subnet strictly contained by subnet? - This operator, and the next four, test for subnet inclusion. They - consider only the network parts of the two addresses (ignoring any - bits to the right of the netmasks) and determine whether one network - is identical to or a subnet of the other. - - - inet '192.168.1.5' << inet '192.168.1/24' - t - - - inet '192.168.0.5' << inet '192.168.1/24' - f - - - inet '192.168.1/24' << inet '192.168.1/24' - f - - - - - - inet <<= inet - boolean - - - Is subnet contained by or equal to subnet? - - - inet '192.168.1/24' <<= inet '192.168.1/24' - t - - - - - - inet >> inet - boolean - - - Does subnet strictly contain subnet? - - - inet '192.168.1/24' >> inet '192.168.1.5' - t - - - - - - inet >>= inet - boolean - - - Does subnet contain or equal subnet? - - - inet '192.168.1/24' >>= inet '192.168.1/24' - t - - - - - - inet && inet - boolean - - - Does either subnet contain or equal the other? - - - inet '192.168.1/24' && inet '192.168.1.80/28' - t - - - inet '192.168.1/24' && inet '192.168.2.0/28' - f - - - - - - ~ inet - inet - - - Computes bitwise NOT. - - - ~ inet '192.168.1.6' - 63.87.254.249 - - - - - - inet & inet - inet - - - Computes bitwise AND. - - - inet '192.168.1.6' & inet '0.0.0.255' - 0.0.0.6 - - - - - - inet | inet - inet - - - Computes bitwise OR. - - - inet '192.168.1.6' | inet '0.0.0.255' - 192.168.1.255 - - - - - - inet + bigint - inet - - - Adds an offset to an address. - - - inet '192.168.1.6' + 25 - 192.168.1.31 - - - - - - bigint + inet - inet - - - Adds an offset to an address. - - - 200 + inet '::ffff:fff0:1' - ::ffff:255.240.0.201 - - - - - - inet - bigint - inet - - - Subtracts an offset from an address. - - - inet '192.168.1.43' - 36 - 192.168.1.7 - - - - - - inet - inet - bigint - - - Computes the difference of two addresses. - - - inet '192.168.1.43' - inet '192.168.1.19' - 24 - - - inet '::1' - inet '::ffff:1' - -4294901760 - - - - -
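-
- A quick sketch of the inet-to-cidr cast described above, which zeroes
- the bits to the right of the netmask:
-
-SELECT (inet '192.168.1.5/24')::cidr;
-Result: 192.168.1.0/24
-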
- - - IP Address Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - abbrev - - abbrev ( inet ) - text - - - Creates an abbreviated display format as text. - (The result is the same as the inet output function - produces; it is abbreviated only in comparison to the - result of an explicit cast to text, which for historical - reasons will never suppress the netmask part.) - - - abbrev(inet '10.1.0.0/32') - 10.1.0.0 - - - - - - abbrev ( cidr ) - text - - - Creates an abbreviated display format as text. - (The abbreviation consists of dropping all-zero octets to the right - of the netmask; more examples are in - .) - - - abbrev(cidr '10.1.0.0/16') - 10.1/16 - - - - - - - broadcast - - broadcast ( inet ) - inet - - - Computes the broadcast address for the address's network. - - - broadcast(inet '192.168.1.5/24') - 192.168.1.255/24 - - - - - - - family - - family ( inet ) - integer - - - Returns the address's family: 4 for IPv4, - 6 for IPv6. - - - family(inet '::1') - 6 - - - - - - - host - - host ( inet ) - text - - - Returns the IP address as text, ignoring the netmask. - - - host(inet '192.168.1.0/24') - 192.168.1.0 - - - - - - - hostmask - - hostmask ( inet ) - inet - - - Computes the host mask for the address's network. - - - hostmask(inet '192.168.23.20/30') - 0.0.0.3 - - - - - - - inet_merge - - inet_merge ( inet, inet ) - cidr - - - Computes the smallest network that includes both of the given networks. - - - inet_merge(inet '192.168.1.5/24', inet '192.168.2.5/24') - 192.168.0.0/22 - - - - - - - inet_same_family - - inet_same_family ( inet, inet ) - boolean - - - Tests whether the addresses belong to the same IP family. - - - inet_same_family(inet '192.168.1.5/24', inet '::1') - f - - - - - - - masklen - - masklen ( inet ) - integer - - - Returns the netmask length in bits. - - - masklen(inet '192.168.1.5/24') - 24 - - - - - - - netmask - - netmask ( inet ) - inet - - - Computes the network mask for the address's network. - - - netmask(inet '192.168.1.5/24') - 255.255.255.0 - - - - - - - network - - network ( inet ) - cidr - - - Returns the network part of the address, zeroing out - whatever is to the right of the netmask. - (This is equivalent to casting the value to cidr.) - - - network(inet '192.168.1.5/24') - 192.168.1.0/24 - - - - - - - set_masklen - - set_masklen ( inet, integer ) - inet - - - Sets the netmask length for an inet value. - The address part does not change. - - - set_masklen(inet '192.168.1.5/24', 16) - 192.168.1.5/16 - - - - - - set_masklen ( cidr, integer ) - cidr - - - Sets the netmask length for a cidr value. - Address bits to the right of the new netmask are set to zero. - - - set_masklen(cidr '192.168.1.0/24', 16) - 192.168.0.0/16 - - - - - - - text - - text ( inet ) - text - - - Returns the unabbreviated IP address and netmask length as text. - (This has the same result as an explicit cast to text.) - - - text(inet '192.168.1.5') - 192.168.1.5/32 - - - - -
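-
- These functions can be chained; for example (a sketch), extracting the
- network address as bare text:
-
-SELECT host(network(inet '192.168.1.5/24'));
-Result: 192.168.1.0
-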
- - - - The abbrev, host, - and text functions are primarily intended to offer - alternative display formats for IP addresses. - - - - - The MAC address types, macaddr and macaddr8, - support the usual comparison operators shown in - - as well as the specialized functions shown in - . - In addition, they support the bitwise logical operators - ~, & and | - (NOT, AND and OR), just as shown above for IP addresses. - - - - MAC Address Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - trunc - - trunc ( macaddr ) - macaddr - - - Sets the last 3 bytes of the address to zero. The remaining prefix - can be associated with a particular manufacturer (using data not - included in PostgreSQL). - - - trunc(macaddr '12:34:56:78:90:ab') - 12:34:56:00:00:00 - - - - - - trunc ( macaddr8 ) - macaddr8 - - - Sets the last 5 bytes of the address to zero. The remaining prefix - can be associated with a particular manufacturer (using data not - included in PostgreSQL). - - - trunc(macaddr8 '12:34:56:78:90:ab:cd:ef') - 12:34:56:00:00:00:00:00 - - - - - - - macaddr8_set7bit - - macaddr8_set7bit ( macaddr8 ) - macaddr8 - - - Sets the 7th bit of the address to one, creating what is known as - modified EUI-64, for inclusion in an IPv6 address. - - - macaddr8_set7bit(macaddr8 '00:34:56:ab:cd:ef') - 02:34:56:ff:fe:ab:cd:ef - - - - -
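-
- For instance, the bitwise operators mentioned above can reproduce the
- effect of trunc with an explicit mask (a small sketch):
-
-SELECT macaddr '12:34:56:78:90:ab' & macaddr 'ff:ff:ff:00:00:00';
-Result: 12:34:56:00:00:00
-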
- -
- - - - Text Search Functions and Operators - - - full text search - functions and operators - - - - text search - functions and operators - - - - , - and - - summarize the functions and operators that are provided - for full text searching. See for a detailed - explanation of PostgreSQL's text search - facility. - - - - Text Search Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - tsvector @@ tsquery - boolean - - - tsquery @@ tsvector - boolean - - - Does tsvector match tsquery? - (The arguments can be given in either order.) - - - to_tsvector('fat cats ate rats') @@ to_tsquery('cat & rat') - t - - - - - - text @@ tsquery - boolean - - - Does text string, after implicit invocation - of to_tsvector(), match tsquery? - - - 'fat cats ate rats' @@ to_tsquery('cat & rat') - t - - - - - - tsvector || tsvector - tsvector - - - Concatenates two tsvectors. If both inputs contain - lexeme positions, the second input's positions are adjusted - accordingly. - - - 'a:1 b:2'::tsvector || 'c:1 d:2 b:3'::tsvector - 'a':1 'b':2,5 'c':3 'd':4 - - - - - - tsquery && tsquery - tsquery - - - ANDs two tsquerys together, producing a query that - matches documents that match both input queries. - - - 'fat | rat'::tsquery && 'cat'::tsquery - ( 'fat' | 'rat' ) & 'cat' - - - - - - tsquery || tsquery - tsquery - - - ORs two tsquerys together, producing a query that - matches documents that match either input query. - - - 'fat | rat'::tsquery || 'cat'::tsquery - 'fat' | 'rat' | 'cat' - - - - - - !! tsquery - tsquery - - - Negates a tsquery, producing a query that matches - documents that do not match the input query. - - - !! 'cat'::tsquery - !'cat' - - - - - - tsquery <-> tsquery - tsquery - - - Constructs a phrase query, which matches if the two input queries - match at successive lexemes. - - - to_tsquery('fat') <-> to_tsquery('rat') - 'fat' <-> 'rat' - - - - - - tsquery @> tsquery - boolean - - - Does first tsquery contain the second? (This considers - only whether all the lexemes appearing in one query appear in the - other, ignoring the combining operators.) - - - 'cat'::tsquery @> 'cat & rat'::tsquery - f - - - - - - tsquery <@ tsquery - boolean - - - Is first tsquery contained in the second? (This - considers only whether all the lexemes appearing in one query appear - in the other, ignoring the combining operators.) - - - 'cat'::tsquery <@ 'cat & rat'::tsquery - t - - - 'cat'::tsquery <@ '!cat & rat'::tsquery - t - - - - -
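-
- As an additional sketch (assuming the default english configuration),
- a phrase query built with <-> matches only when its operands appear
- at successive positions:
-
-SELECT to_tsvector('fat cats ate rats') @@ to_tsquery('fat <-> cat');
-Result: t
-SELECT to_tsvector('fat cats ate rats') @@ to_tsquery('fat <-> rat');
-Result: f
-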
- - - In addition to these specialized operators, the usual comparison - operators shown in are - available for types tsvector and tsquery. - These are not very - useful for text searching but allow, for example, unique indexes to be - built on columns of these types. - - - - Text Search Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - array_to_tsvector - - array_to_tsvector ( text[] ) - tsvector - - - Converts an array of text strings to a tsvector. - The given strings are used as lexemes as-is, without further - processing. Array elements must not be empty strings - or NULL. - - - array_to_tsvector('{fat,cat,rat}'::text[]) - 'cat' 'fat' 'rat' - - - - - - - get_current_ts_config - - get_current_ts_config ( ) - regconfig - - - Returns the OID of the current default text search configuration - (as set by ). - - - get_current_ts_config() - english - - - - - - - length - - length ( tsvector ) - integer - - - Returns the number of lexemes in the tsvector. - - - length('fat:2,4 cat:3 rat:5A'::tsvector) - 3 - - - - - - - numnode - - numnode ( tsquery ) - integer - - - Returns the number of lexemes plus operators in - the tsquery. - - - numnode('(fat & rat) | cat'::tsquery) - 5 - - - - - - - plainto_tsquery - - plainto_tsquery ( - config regconfig, - query text ) - tsquery - - - Converts text to a tsquery, normalizing words according to - the specified or default configuration. Any punctuation in the string - is ignored (it does not determine query operators). The resulting - query matches documents containing all non-stopwords in the text. - - - plainto_tsquery('english', 'The Fat Rats') - 'fat' & 'rat' - - - - - - - phraseto_tsquery - - phraseto_tsquery ( - config regconfig, - query text ) - tsquery - - - Converts text to a tsquery, normalizing words according to - the specified or default configuration. Any punctuation in the string - is ignored (it does not determine query operators). The resulting - query matches phrases containing all non-stopwords in the text. - - - phraseto_tsquery('english', 'The Fat Rats') - 'fat' <-> 'rat' - - - phraseto_tsquery('english', 'The Cat and Rats') - 'cat' <2> 'rat' - - - - - - - websearch_to_tsquery - - websearch_to_tsquery ( - config regconfig, - query text ) - tsquery - - - Converts text to a tsquery, normalizing words according - to the specified or default configuration. Quoted word sequences are - converted to phrase tests. The word or is understood - as producing an OR operator, and a dash produces a NOT operator; - other punctuation is ignored. - This approximates the behavior of some common web search tools. - - - websearch_to_tsquery('english', '"fat rat" or cat dog') - 'fat' <-> 'rat' | 'cat' & 'dog' - - - - - - - querytree - - querytree ( tsquery ) - text - - - Produces a representation of the indexable portion of - a tsquery. A result that is empty or - just T indicates a non-indexable query. - - - querytree('foo & ! bar'::tsquery) - 'foo' - - - - - - - setweight - - setweight ( vector tsvector, weight "char" ) - tsvector - - - Assigns the specified weight to each element - of the vector. - - - setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A') - 'cat':3A 'fat':2A,4A 'rat':5A - - - - - - - setweight - setweight for specific lexeme(s) - - setweight ( vector tsvector, weight "char", lexemes text[] ) - tsvector - - - Assigns the specified weight to elements - of the vector that are listed - in lexemes. - The strings in lexemes are taken as lexemes - as-is, without further processing. 
Strings that do not match any - lexeme in vector are ignored. - - - setweight('fat:2,4 cat:3 rat:5,6B'::tsvector, 'A', '{cat,rat}') - 'cat':3A 'fat':2,4 'rat':5A,6A - - - - - - - strip - - strip ( tsvector ) - tsvector - - - Removes positions and weights from the tsvector. - - - strip('fat:2,4 cat:3 rat:5A'::tsvector) - 'cat' 'fat' 'rat' - - - - - - - to_tsquery - - to_tsquery ( - config regconfig, - query text ) - tsquery - - - Converts text to a tsquery, normalizing words according to - the specified or default configuration. The words must be combined - by valid tsquery operators. - - - to_tsquery('english', 'The & Fat & Rats') - 'fat' & 'rat' - - - - - - - to_tsvector - - to_tsvector ( - config regconfig, - document text ) - tsvector - - - Converts text to a tsvector, normalizing words according - to the specified or default configuration. Position information is - included in the result. - - - to_tsvector('english', 'The Fat Rats') - 'fat':2 'rat':3 - - - - - - to_tsvector ( - config regconfig, - document json ) - tsvector - - - to_tsvector ( - config regconfig, - document jsonb ) - tsvector - - - Converts each string value in the JSON document to - a tsvector, normalizing words according to the specified - or default configuration. The results are then concatenated in - document order to produce the output. Position information is - generated as though one stopword exists between each pair of string - values. (Beware that document order of the fields of a - JSON object is implementation-dependent when the input - is jsonb; observe the difference in the examples.) - - - to_tsvector('english', '{"aa": "The Fat Rats", "b": "dog"}'::json) - 'dog':5 'fat':2 'rat':3 - - - to_tsvector('english', '{"aa": "The Fat Rats", "b": "dog"}'::jsonb) - 'dog':1 'fat':4 'rat':5 - - - - - - - json_to_tsvector - - json_to_tsvector ( - config regconfig, - document json, - filter jsonb ) - tsvector - - - - jsonb_to_tsvector - - jsonb_to_tsvector ( - config regconfig, - document jsonb, - filter jsonb ) - tsvector - - - Selects each item in the JSON document that is requested by - the filter and converts each one to - a tsvector, normalizing words according to the specified - or default configuration. The results are then concatenated in - document order to produce the output. Position information is - generated as though one stopword exists between each pair of selected - items. (Beware that document order of the fields of a - JSON object is implementation-dependent when the input - is jsonb.) - The filter must be a jsonb - array containing zero or more of these keywords: - "string" (to include all string values), - "numeric" (to include all numeric values), - "boolean" (to include all boolean values), - "key" (to include all keys), or - "all" (to include all the above). - As a special case, the filter can also be a - simple JSON value that is one of these keywords. - - - json_to_tsvector('english', '{"a": "The Fat Rats", "b": 123}'::json, '["string", "numeric"]') - '123':5 'fat':2 'rat':3 - - - json_to_tsvector('english', '{"cat": "The Fat Rats", "dog": 123}'::json, '"all"') - '123':9 'cat':1 'dog':7 'fat':4 'rat':5 - - - - - - - ts_delete - - ts_delete ( vector tsvector, lexeme text ) - tsvector - - - Removes any occurrence of the given lexeme - from the vector. - The lexeme string is treated as a lexeme as-is, - without further processing. 
- - - ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat') - 'cat':3 'rat':5A - - - - - - ts_delete ( vector tsvector, lexemes text[] ) - tsvector - - - Removes any occurrences of the lexemes - in lexemes - from the vector. - The strings in lexemes are taken as lexemes - as-is, without further processing. Strings that do not match any - lexeme in vector are ignored. - - - ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat']) - 'cat':3 - - - - - - - ts_filter - - ts_filter ( vector tsvector, weights "char"[] ) - tsvector - - - Selects only elements with the given weights - from the vector. - - - ts_filter('fat:2,4 cat:3b,7c rat:5A'::tsvector, '{a,b}') - 'cat':3B 'rat':5A - - - - - - - ts_headline - - ts_headline ( - config regconfig, - document text, - query tsquery - , options text ) - text - - - Displays, in an abbreviated form, the match(es) for - the query in - the document, which must be raw text not - a tsvector. Words in the document are normalized - according to the specified or default configuration before matching to - the query. Use of this function is discussed in - , which also describes the - available options. - - - ts_headline('The fat cat ate the rat.', 'cat') - The fat <b>cat</b> ate the rat. - - - - - - ts_headline ( - config regconfig, - document json, - query tsquery - , options text ) - text - - - ts_headline ( - config regconfig, - document jsonb, - query tsquery - , options text ) - text - - - Displays, in an abbreviated form, match(es) for - the query that occur in string values - within the JSON document. - See for more details. - - - ts_headline('{"cat":"raining cats and dogs"}'::jsonb, 'cat') - {"cat": "raining <b>cats</b> and dogs"} - - - - - - - ts_rank - - ts_rank ( - weights real[], - vector tsvector, - query tsquery - , normalization integer ) - real - - - Computes a score showing how well - the vector matches - the query. See - for details. - - - ts_rank(to_tsvector('raining cats and dogs'), 'cat') - 0.06079271 - - - - - - - ts_rank_cd - - ts_rank_cd ( - weights real[], - vector tsvector, - query tsquery - , normalization integer ) - real - - - Computes a score showing how well - the vector matches - the query, using a cover density - algorithm. See for details. - - - ts_rank_cd(to_tsvector('raining cats and dogs'), 'cat') - 0.1 - - - - - - - ts_rewrite - - ts_rewrite ( query tsquery, - target tsquery, - substitute tsquery ) - tsquery - - - Replaces occurrences of target - with substitute - within the query. - See for details. - - - ts_rewrite('a & b'::tsquery, 'a'::tsquery, 'foo|bar'::tsquery) - 'b' & ( 'foo' | 'bar' ) - - - - - - ts_rewrite ( query tsquery, - select text ) - tsquery - - - Replaces portions of the query according to - target(s) and substitute(s) obtained by executing - a SELECT command. - See for details. - - - SELECT ts_rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases') - 'b' & ( 'foo' | 'bar' ) - - - - - - - tsquery_phrase - - tsquery_phrase ( query1 tsquery, query2 tsquery ) - tsquery - - - Constructs a phrase query that searches - for matches of query1 - and query2 at successive lexemes (same - as <-> operator). - - - tsquery_phrase(to_tsquery('fat'), to_tsquery('cat')) - 'fat' <-> 'cat' - - - - - - tsquery_phrase ( query1 tsquery, query2 tsquery, distance integer ) - tsquery - - - Constructs a phrase query that searches - for matches of query1 and - query2 that occur exactly - distance lexemes apart. 
- - - tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10) - 'fat' <10> 'cat' - - - - - - - tsvector_to_array - - tsvector_to_array ( tsvector ) - text[] - - - Converts a tsvector to an array of lexemes. - - - tsvector_to_array('fat:2,4 cat:3 rat:5A'::tsvector) - {cat,fat,rat} - - - - - - - unnest - for tsvector - - unnest ( tsvector ) - setof record - ( lexeme text, - positions smallint[], - weights text ) - - - Expands a tsvector into a set of rows, one per lexeme. - - - select * from unnest('cat:3 fat:2,4 rat:5A'::tsvector) - - - lexeme | positions | weights ---------+-----------+--------- - cat | {3} | {D} - fat | {2,4} | {D,D} - rat | {5} | {A} - - - - - -
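In practice these functions are combined in a single query: a document is converted with to_tsvector, the search terms with to_tsquery, and ts_rank orders the matches. The following minimal sketch shows the pattern; the documents table and its title column are hypothetical and used only for illustration:

-- documents(title text) is a hypothetical table used only for this sketch
SELECT title,
       ts_rank(to_tsvector('english', title), query) AS rank
  FROM documents,
       to_tsquery('english', 'fat & rat') AS query
 WHERE to_tsvector('english', title) @@ query
 ORDER BY rank DESC;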
- - - - All the text search functions that accept an optional regconfig - argument will use the configuration specified by - - when that argument is omitted. - - - - - The functions in - - are listed separately because they are not usually used in everyday text - searching operations. They are primarily helpful for development and - debugging of new text search configurations. - - - - Text Search Debugging Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - ts_debug - - ts_debug ( - config regconfig, - document text ) - setof record - ( alias text, - description text, - token text, - dictionaries regdictionary[], - dictionary regdictionary, - lexemes text[] ) - - - Extracts and normalizes tokens from - the document according to the specified or - default text search configuration, and returns information about how - each token was processed. - See for details. - - - ts_debug('english', 'The Brightest supernovaes') - (asciiword,"Word, all ASCII",The,{english_stem},english_stem,{}) ... - - - - - - - ts_lexize - - ts_lexize ( dict regdictionary, token text ) - text[] - - - Returns an array of replacement lexemes if the input token is known to - the dictionary, or an empty array if the token is known to the - dictionary but it is a stop word, or NULL if it is not a known word. - See for details. - - - ts_lexize('english_stem', 'stars') - {star} - - - - - - - ts_parse - - ts_parse ( parser_name text, - document text ) - setof record - ( tokid integer, - token text ) - - - Extracts tokens from the document using the - named parser. - See for details. - - - ts_parse('default', 'foo - bar') - (1,foo) ... - - - - - - ts_parse ( parser_oid oid, - document text ) - setof record - ( tokid integer, - token text ) - - - Extracts tokens from the document using a - parser specified by OID. - See for details. - - - ts_parse(3722, 'foo - bar') - (1,foo) ... - - - - - - - ts_token_type - - ts_token_type ( parser_name text ) - setof record - ( tokid integer, - alias text, - description text ) - - - Returns a table that describes each type of token the named parser can - recognize. - See for details. - - - ts_token_type('default') - (1,asciiword,"Word, all ASCII") ... - - - - - - ts_token_type ( parser_oid oid ) - setof record - ( tokid integer, - alias text, - description text ) - - - Returns a table that describes each type of token a parser specified - by OID can recognize. - See for details. - - - ts_token_type(3722) - (1,asciiword,"Word, all ASCII") ... - - - - - - - ts_stat - - ts_stat ( sqlquery text - , weights text ) - setof record - ( word text, - ndoc integer, - nentry integer ) - - - Executes the sqlquery, which must return a - single tsvector column, and returns statistics about each - distinct lexeme contained in the data. - See for details. - - - ts_stat('SELECT vector FROM apod') - (foo,10,15) ... - - - - -
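ts_lexize in particular distinguishes stop words from unknown words: a recognized stop word yields an empty array rather than NULL. A quick illustration, assuming the built-in english_stem dictionary:

SELECT ts_lexize('english_stem', 'a');  -- 'a' is a stop word for this dictionary
 ts_lexize
-----------
 {}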
- - - UUID Functions - - - UUID - generating - - - - gen_random_uuid - - - - uuidv4 - - - - uuidv7 - - - - uuid_extract_timestamp - - - - uuid_extract_version - - - - shows the PostgreSQL - functions that can be used to generate UUIDs. - - - - <acronym>UUID</acronym> Generation Functions - - - - - - Function - - - Description - - - Example(s) - - - - - - - - - - gen_random_uuid - uuid - - - uuidv4 - uuid - - - Generate a version 4 (random) UUID. - - - gen_random_uuid() - 5b30857f-0bfa-48b5-ac0b-5c64e28078d1 - - - uuidv4() - b42410ee-132f-42ee-9e4f-09a6485c95b8 - - - - - - - uuidv7 - ( shift interval ) - uuid - - - Generate a version 7 (time-ordered) UUID. The timestamp is computed using UNIX timestamp - with millisecond precision + sub-millisecond timestamp + random. The optional parameter - shift will shift the computed timestamp by the given interval. - - - uuidv7() - 019535d9-3df7-79fb-b466-fa907fa17f9e - - - - - -
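As a small sketch of the optional shift parameter, the following backdates the timestamp embedded in a freshly generated version 7 UUID by one hour; the random bits still change on every call, so the exact output varies:

SELECT uuidv7(INTERVAL '-1 hour');  -- leading timestamp bits are one hour in the past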
The uuid-ossp module provides additional functions that implement other standard algorithms for generating UUIDs.

The following table shows the PostgreSQL functions that can be used to extract information from UUIDs.

UUID Extraction Functions

uuid_extract_timestamp ( uuid ) → timestamp with time zone

Extracts a timestamp with time zone from UUID versions 1 and 7. For other versions, this function returns null. Note that the extracted timestamp is not necessarily exactly equal to the time the UUID was generated; this depends on the implementation that generated the UUID.

uuid_extract_timestamp('019535d9-3df7-79fb-b466-fa907fa17f9e'::uuid)
→ 2025-02-23 21:46:24.503-05

uuid_extract_version ( uuid ) → smallint

Extracts the version from a UUID of the variant described by RFC 9562. For other variants, this function returns null. For example, for a UUID generated by gen_random_uuid, this function will return 4.

uuid_extract_version('41db1265-8bc1-4ab3-992f-885799a4af1d'::uuid)
→ 4

uuid_extract_version('019535d9-3df7-79fb-b466-fa907fa17f9e'::uuid)
→ 7
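The extraction functions pair naturally with the generators above. In this sketch, a version 7 UUID generated now should report version 7 and a timestamp close to the current time:

WITH g(u) AS (SELECT uuidv7())             -- generate one v7 UUID
SELECT uuid_extract_version(u) AS version,  -- returns 7
       uuid_extract_timestamp(u) AS created -- approximately now()
  FROM g;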
- - - PostgreSQL also provides the usual comparison - operators shown in for - UUIDs. - - - See for details on the data type - uuid in PostgreSQL. - -
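Because a version 7 UUID begins with its timestamp and uuid comparison is bytewise, sorting such UUIDs with the ordinary comparison operators approximates generation order. In this sketch the backdated UUID sorts first:

SELECT u
  FROM (VALUES (uuidv7()), (uuidv7(INTERVAL '-1 hour'))) AS v(u)  -- second value is backdated
 ORDER BY u;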
XML Functions

The functions and function-like expressions described in this section operate on values of type xml. See for information about the xml type. The function-like expressions xmlparse and xmlserialize for converting to and from type xml are documented there, not in this section.

Use of most of these functions requires PostgreSQL to have been built with configure --with-libxml.

Producing XML Content

A set of functions and function-like expressions is available for producing XML content from SQL data. As such, they are particularly suitable for formatting query results into XML documents for processing in client applications.

xmltext

xmltext ( text ) → xml

The function xmltext returns an XML value with a single text node containing the input argument as its content. Predefined entities like ampersand (&), left and right angle brackets (< and >), and quotation marks (") are escaped.

Example:

SELECT xmltext('< foo & bar >');

         xmltext
-------------------------
 &lt; foo &amp; bar &gt;

xmlcomment

xmlcomment ( text ) → xml

The function xmlcomment creates an XML value containing an XML comment with the specified text as content. The text cannot contain -- or end with a -; otherwise the resulting construct would not be a valid XML comment. If the argument is null, the result is null.

Example:

SELECT xmlcomment('hello');

  xmlcomment
--------------
 <!--hello-->

xmlconcat

xmlconcat ( xml [, ...] ) → xml

The function xmlconcat concatenates a list of individual XML values to create a single value containing an XML content fragment. Null values are omitted; the result is only null if there are no nonnull arguments.

Example:

SELECT xmlconcat('<abc/>', '<bar>foo</bar>');

      xmlconcat
----------------------
 <abc/><bar>foo</bar>

XML declarations, if present, are combined as follows. If all argument values have the same XML version declaration, that version is used in the result, else no version is used. If all argument values have the standalone declaration value yes, then that value is used in the result. If all argument values have a standalone declaration value and at least one is no, then that is used in the result. Else the result will have no standalone declaration. If the result is determined to require a standalone declaration but no version declaration, a version declaration with version 1.0 will be used, because XML requires an XML declaration to contain a version declaration. Encoding declarations are ignored and removed in all cases.

Example:

SELECT xmlconcat('<?xml version="1.1"?><foo/>', '<?xml version="1.1" standalone="no"?><bar/>');

             xmlconcat
-----------------------------------
 <?xml version="1.1"?><foo/><bar/>

xmlelement

xmlelement ( NAME name [, XMLATTRIBUTES ( attvalue [ AS attname ] [, ...] ) ] [, content [, ...] ] ) → xml

The xmlelement expression produces an XML element with the given name, attributes, and content. The name and attname items shown in the syntax are simple identifiers, not values. The attvalue and content items are expressions, which can yield any PostgreSQL data type. The argument(s) within XMLATTRIBUTES generate attributes of the XML element; the content value(s) are concatenated to form its content.
Examples:

SELECT xmlelement(name foo, xmlattributes('xyz' as bar));

    xmlelement
------------------
 <foo bar="xyz"/>

SELECT xmlelement(name foo, xmlattributes(current_date as bar), 'cont', 'ent');

             xmlelement
-------------------------------------
 <foo bar="2007-01-26">content</foo>

Element and attribute names that are not valid XML names are escaped by replacing the offending characters by the sequence _xHHHH_, where HHHH is the character's Unicode codepoint in hexadecimal notation. For example:

SELECT xmlelement(name "foo$bar", xmlattributes('xyz' as "a&b"));

            xmlelement
----------------------------------
 <foo_x0024_bar a_x0026_b="xyz"/>

An explicit attribute name need not be specified if the attribute value is a column reference, in which case the column's name will be used as the attribute name by default. In other cases, the attribute must be given an explicit name. So this example is valid:

CREATE TABLE test (a xml, b xml);
SELECT xmlelement(name test, xmlattributes(a, b)) FROM test;

But these are not:

SELECT xmlelement(name test, xmlattributes('constant'), a, b) FROM test;
SELECT xmlelement(name test, xmlattributes(func(a, b))) FROM test;

Element content, if specified, will be formatted according to its data type. If the content is itself of type xml, complex XML documents can be constructed. For example:

SELECT xmlelement(name foo, xmlattributes('xyz' as bar),
                            xmlelement(name abc),
                            xmlcomment('test'),
                            xmlelement(name xyz));

                  xmlelement
----------------------------------------------
 <foo bar="xyz"><abc/><!--test--><xyz/></foo>

Content of other types will be formatted into valid XML character data. This means in particular that the characters <, >, and & will be converted to entities. Binary data (data type bytea) will be represented in base64 or hex encoding, depending on the setting of the configuration parameter xmlbinary. The particular behavior for individual data types is expected to evolve in order to align the PostgreSQL mappings with those specified in SQL:2006 and later, as discussed in .

xmlforest

xmlforest ( content [ AS name ] [, ...] ) → xml

The xmlforest expression produces an XML forest (sequence) of elements using the given names and content. As for xmlelement, each name must be a simple identifier, while the content expressions can have any data type.

Examples:

SELECT xmlforest('abc' AS foo, 123 AS bar);

          xmlforest
------------------------------
 <foo>abc</foo><bar>123</bar>

SELECT xmlforest(table_name, column_name)
FROM information_schema.columns
WHERE table_schema = 'pg_catalog';

                                xmlforest
------------------------------------------------------------------------
 <table_name>pg_authid</table_name><column_name>rolname</column_name>
 <table_name>pg_authid</table_name><column_name>rolsuper</column_name>
 ...

As seen in the second example, the element name can be omitted if the content value is a column reference, in which case the column name is used by default. Otherwise, a name must be specified.

Element names that are not valid XML names are escaped as shown for xmlelement above. Similarly, content data is escaped to make valid XML content, unless it is already of type xml.

Note that XML forests are not valid XML documents if they consist of more than one element, so it might be useful to wrap xmlforest expressions in xmlelement.

xmlpi

xmlpi ( NAME name [, content ] ) → xml

The xmlpi expression creates an XML processing instruction. As for xmlelement, the name must be a simple identifier, while the content expression can have any data type. The content, if present, must not contain the character sequence ?>.
Example:

SELECT xmlpi(name php, 'echo "hello world";');

            xmlpi
-----------------------------
 <?php echo "hello world";?>

xmlroot

xmlroot ( xml, VERSION {text|NO VALUE} [, STANDALONE {YES|NO|NO VALUE} ] ) → xml

The xmlroot expression alters the properties of the root node of an XML value. If a version is specified, it replaces the value in the root node's version declaration; if a standalone setting is specified, it replaces the value in the root node's standalone declaration.

SELECT xmlroot(xmlparse(document '<?xml version="1.1"?><content>abc</content>'),
               version '1.0', standalone yes);

                        xmlroot
--------------------------------------------------------------
 <?xml version="1.0" standalone="yes"?><content>abc</content>

xmlagg

xmlagg ( xml ) → xml

The function xmlagg is, unlike the other functions described here, an aggregate function. It concatenates the input values to the aggregate function call, much like xmlconcat does, except that concatenation occurs across rows rather than across expressions in a single row. See for additional information about aggregate functions.

Example:

CREATE TABLE test (y int, x xml);
INSERT INTO test VALUES (1, '<foo>abc</foo>');
INSERT INTO test VALUES (2, '<bar/>');
SELECT xmlagg(x) FROM test;
        xmlagg
----------------------
 <foo>abc</foo><bar/>

To determine the order of the concatenation, an ORDER BY clause may be added to the aggregate call as described in . For example:

SELECT xmlagg(x ORDER BY y DESC) FROM test;
        xmlagg
----------------------
 <bar/><foo>abc</foo>

The following non-standard approach used to be recommended in previous versions, and may still be useful in specific cases:

SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab;
        xmlagg
----------------------
 <bar/><foo>abc</foo>

XML Predicates

The expressions described in this section check properties of xml values.

IS DOCUMENT

xml IS DOCUMENT → boolean

The expression IS DOCUMENT returns true if the argument XML value is a proper XML document, false if it is not (that is, it is a content fragment), or null if the argument is null. See about the difference between documents and content fragments.

IS NOT DOCUMENT

xml IS NOT DOCUMENT → boolean

The expression IS NOT DOCUMENT returns false if the argument XML value is a proper XML document, true if it is not (that is, it is a content fragment), or null if the argument is null.

XMLEXISTS

XMLEXISTS ( text PASSING [ BY {REF|VALUE} ] xml [ BY {REF|VALUE} ] ) → boolean

The function xmlexists evaluates an XPath 1.0 expression (the first argument), with the passed XML value as its context item. The function returns false if the result of that evaluation yields an empty node-set, true if it yields any other value. The function returns null if any argument is null. A nonnull value passed as the context item must be an XML document, not a content fragment or any non-XML value.

Example:

SELECT xmlexists('//town[text() = ''Toronto'']'
                 PASSING BY VALUE '<towns><town>Toronto</town><town>Ottawa</town></towns>');

 xmlexists
------------
 t
(1 row)

The BY REF and BY VALUE clauses are accepted in PostgreSQL, but are ignored, as discussed in .

In the SQL standard, the xmlexists function evaluates an expression in the XML Query language, but PostgreSQL allows only an XPath 1.0 expression, as discussed in .

xml_is_well_formed

xml_is_well_formed ( text ) → boolean
xml_is_well_formed_document ( text ) → boolean
xml_is_well_formed_content ( text ) → boolean

These functions check whether a text string represents well-formed XML, returning a Boolean result.
xml_is_well_formed_document checks for a well-formed document, while xml_is_well_formed_content checks for well-formed content. xml_is_well_formed does the former if the xmloption configuration parameter is set to DOCUMENT, or the latter if it is set to CONTENT. This means that xml_is_well_formed is useful for seeing whether a simple cast to type xml will succeed, whereas the other two functions are useful for seeing whether the corresponding variants of XMLPARSE will succeed.

Examples:

SET xmloption TO DOCUMENT;
SELECT xml_is_well_formed('<>');
 xml_is_well_formed
--------------------
 f
(1 row)

SELECT xml_is_well_formed('<abc/>');
 xml_is_well_formed
--------------------
 t
(1 row)

SET xmloption TO CONTENT;
SELECT xml_is_well_formed('abc');
 xml_is_well_formed
--------------------
 t
(1 row)

SELECT xml_is_well_formed_document('<pg:foo xmlns:pg="http://postgresql.org/stuff">bar</pg:foo>');
 xml_is_well_formed_document
-----------------------------
 t
(1 row)

SELECT xml_is_well_formed_document('<pg:foo xmlns:pg="http://postgresql.org/stuff">bar</my:foo>');
 xml_is_well_formed_document
-----------------------------
 f
(1 row)

The last example shows that the checks include whether namespaces are correctly matched.

Processing XML

To process values of data type xml, PostgreSQL offers the functions xpath and xpath_exists, which evaluate XPath 1.0 expressions, and the XMLTABLE table function.

xpath

xpath ( xpath text, xml xml [, nsarray text[] ] ) → xml[]

The function xpath evaluates the XPath 1.0 expression xpath (given as text) against the XML value xml. It returns an array of XML values corresponding to the node-set produced by the XPath expression. If the XPath expression returns a scalar value rather than a node-set, a single-element array is returned.

The second argument must be a well-formed XML document. In particular, it must have a single root node element.

The optional third argument of the function is an array of namespace mappings. This array should be a two-dimensional text array with the length of the second axis being equal to 2 (i.e., it should be an array of arrays, each of which consists of exactly 2 elements). The first element of each array entry is the namespace name (alias), the second the namespace URI. It is not required that aliases provided in this array be the same as those being used in the XML document itself (in other words, both in the XML document and in the xpath function context, aliases are local).

Example:

SELECT xpath('/my:a/text()', '<my:a xmlns:my="http://example.com">test</my:a>',
             ARRAY[ARRAY['my', 'http://example.com']]);

 xpath
--------
 {test}
(1 row)

To deal with default (anonymous) namespaces, do something like this:

SELECT xpath('//mydefns:b/text()', '<a xmlns="http://example.com"><b>test</b></a>',
             ARRAY[ARRAY['mydefns', 'http://example.com']]);

 xpath
--------
 {test}
(1 row)

xpath_exists

xpath_exists ( xpath text, xml xml [, nsarray text[] ] ) → boolean

The function xpath_exists is a specialized form of the xpath function. Instead of returning the individual XML values that satisfy the XPath 1.0 expression, this function returns a Boolean indicating whether the query was satisfied or not (specifically, whether it produced any value other than an empty node-set). This function is equivalent to the XMLEXISTS predicate, except that it also offers support for a namespace mapping argument.
Example:

SELECT xpath_exists('/my:a/text()', '<my:a xmlns:my="http://example.com">test</my:a>',
                    ARRAY[ARRAY['my', 'http://example.com']]);

 xpath_exists
--------------
 t
(1 row)

xmltable

XMLTABLE (
    [ XMLNAMESPACES ( namespace_uri AS namespace_name [, ...] ), ]
    row_expression PASSING [ BY {REF|VALUE} ] document_expression [ BY {REF|VALUE} ]
    COLUMNS name { type [ PATH column_expression ] [ DEFAULT default_expression ] [ NOT NULL | NULL ]
                 | FOR ORDINALITY }
            [, ...]
) → setof record

The xmltable expression produces a table based on an XML value, an XPath filter to extract rows, and a set of column definitions. Although it syntactically resembles a function, it can only appear as a table in a query's FROM clause.

The optional XMLNAMESPACES clause gives a comma-separated list of namespace definitions, where each namespace_uri is a text expression and each namespace_name is a simple identifier. It specifies the XML namespaces used in the document and their aliases. A default namespace specification is not currently supported.

The required row_expression argument is an XPath 1.0 expression (given as text) that is evaluated, passing the XML value document_expression as its context item, to obtain a set of XML nodes. These nodes are what xmltable transforms into output rows. No rows will be produced if the document_expression is null, nor if the row_expression produces an empty node-set or any value other than a node-set.

document_expression provides the context item for the row_expression. It must be a well-formed XML document; fragments/forests are not accepted. The BY REF and BY VALUE clauses are accepted but ignored, as discussed in .

In the SQL standard, the xmltable function evaluates expressions in the XML Query language, but PostgreSQL allows only XPath 1.0 expressions, as discussed in .

The required COLUMNS clause specifies the column(s) that will be produced in the output table. See the syntax summary above for the format. A name is required for each column, as is a data type (unless FOR ORDINALITY is specified, in which case type integer is implicit). The path, default and nullability clauses are optional.

A column marked FOR ORDINALITY will be populated with row numbers, starting with 1, in the order of nodes retrieved from the row_expression's result node-set. At most one column may be marked FOR ORDINALITY.

XPath 1.0 does not specify an order for nodes in a node-set, so code that relies on a particular order of the results will be implementation-dependent. Details can be found in .

The column_expression for a column is an XPath 1.0 expression that is evaluated for each row, with the current node from the row_expression result as its context item, to find the value of the column. If no column_expression is given, then the column name is used as an implicit path.

If a column's XPath expression returns a non-XML value (which is limited to string, boolean, or double in XPath 1.0) and the column has a PostgreSQL type other than xml, the column will be set as if by assigning the value's string representation to the PostgreSQL type. (If the value is a boolean, its string representation is taken to be 1 or 0 if the output column's type category is numeric, otherwise true or false.)
- - - - If a column's XPath expression returns a non-empty set of XML nodes - and the column's PostgreSQL type is xml, the column will - be assigned the expression result exactly, if it is of document or - content form. - - - A result containing more than one element node at the top level, or - non-whitespace text outside of an element, is an example of content form. - An XPath result can be of neither form, for example if it returns an - attribute node selected from the element that contains it. Such a result - will be put into content form with each such disallowed node replaced by - its string value, as defined for the XPath 1.0 - string function. - - - - - - A non-XML result assigned to an xml output column produces - content, a single text node with the string value of the result. - An XML result assigned to a column of any other type may not have more than - one node, or an error is raised. If there is exactly one node, the column - will be set as if by assigning the node's string - value (as defined for the XPath 1.0 string function) - to the PostgreSQL type. - - - - The string value of an XML element is the concatenation, in document order, - of all text nodes contained in that element and its descendants. The string - value of an element with no descendant text nodes is an - empty string (not NULL). - Any xsi:nil attributes are ignored. - Note that the whitespace-only text() node between two non-text - elements is preserved, and that leading whitespace on a text() - node is not flattened. - The XPath 1.0 string function may be consulted for the - rules defining the string value of other XML node types and non-XML values. - - - - The conversion rules presented here are not exactly those of the SQL - standard, as discussed in . - - - - If the path expression returns an empty node-set - (typically, when it does not match) - for a given row, the column will be set to NULL, unless - a default_expression is specified; then the - value resulting from evaluating that expression is used. - - - - A default_expression, rather than being - evaluated immediately when xmltable is called, - is evaluated each time a default is needed for the column. - If the expression qualifies as stable or immutable, the repeat - evaluation may be skipped. - This means that you can usefully use volatile functions like - nextval in - default_expression. - - - - Columns may be marked NOT NULL. If the - column_expression for a NOT - NULL column does not match anything and there is - no DEFAULT or - the default_expression also evaluates to null, - an error is reported. 
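As noted above, a default_expression is evaluated each time a default is needed, so a volatile default can number rows as they are produced. A minimal sketch of this behavior follows; the sequence ordnum is hypothetical, and the path none deliberately matches nothing so that the default is used on every row:

CREATE SEQUENCE ordnum;  -- hypothetical sequence, used only for this sketch

SELECT *
  FROM XMLTABLE('/items/item'
                PASSING '<items><item/><item/></items>'::xml
                COLUMNS id FOR ORDINALITY,
                        seq int PATH 'none' DEFAULT nextval('ordnum'));

 id | seq
----+-----
  1 |   1
  2 |   2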
Examples:

CREATE TABLE xmldata AS SELECT
xml $$
<ROWS>
  <ROW id="1">
    <COUNTRY_ID>AU</COUNTRY_ID>
    <COUNTRY_NAME>Australia</COUNTRY_NAME>
  </ROW>
  <ROW id="5">
    <COUNTRY_ID>JP</COUNTRY_ID>
    <COUNTRY_NAME>Japan</COUNTRY_NAME>
    <PREMIER_NAME>Shinzo Abe</PREMIER_NAME>
    <SIZE unit="sq_mi">145935</SIZE>
  </ROW>
  <ROW id="6">
    <COUNTRY_ID>SG</COUNTRY_ID>
    <COUNTRY_NAME>Singapore</COUNTRY_NAME>
    <SIZE unit="sq_km">697</SIZE>
  </ROW>
</ROWS>
$$ AS data;

SELECT xmltable.*
  FROM xmldata,
       XMLTABLE('//ROWS/ROW'
                PASSING data
                COLUMNS id int PATH '@id',
                        ordinality FOR ORDINALITY,
                        "COUNTRY_NAME" text,
                        country_id text PATH 'COUNTRY_ID',
                        size_sq_km float PATH 'SIZE[@unit = "sq_km"]',
                        size_other text PATH
                             'concat(SIZE[@unit!="sq_km"], " ", SIZE[@unit!="sq_km"]/@unit)',
                        premier_name text PATH 'PREMIER_NAME' DEFAULT 'not specified');

 id | ordinality | COUNTRY_NAME | country_id | size_sq_km |  size_other  | premier_name
----+------------+--------------+------------+------------+--------------+---------------
  1 |          1 | Australia    | AU         |            |              | not specified
  5 |          2 | Japan        | JP         |            | 145935 sq_mi | Shinzo Abe
  6 |          3 | Singapore    | SG         |        697 |              | not specified

The following example shows concatenation of multiple text() nodes, usage of the column name as XPath filter, and the treatment of whitespace, XML comments and processing instructions:

CREATE TABLE xmlelements AS SELECT
xml $$
  <root>
   <element>  Hello<!-- xyxxz -->2a2<?aaaaa?> <!--x-->  bbb<x>xxx</x>CC  </element>
  </root>
$$ AS data;

SELECT xmltable.*
  FROM xmlelements, XMLTABLE('/root' PASSING data COLUMNS element text);
         element
-------------------------
   Hello2a2   bbbxxxCC

The following example illustrates how the XMLNAMESPACES clause can be used to specify a list of namespaces used in the XML document as well as in the XPath expressions:

WITH xmldata(data) AS (VALUES ('
<example xmlns="http://example.com/myns" xmlns:B="http://example.com/b">
 <item foo="1" B:bar="2"/>
 <item foo="3" B:bar="4"/>
 <item foo="4" B:bar="5"/>
</example>'::xml)
)
SELECT xmltable.*
  FROM XMLTABLE(XMLNAMESPACES('http://example.com/myns' AS x,
                              'http://example.com/b' AS "B"),
                '/x:example/x:item'
                PASSING (SELECT data FROM xmldata)
                COLUMNS foo int PATH '@foo',
                        bar int PATH '@B:bar');
 foo | bar
-----+-----
   1 |   2
   3 |   4
   4 |   5
(3 rows)

Mapping Tables to XML

The following functions map the contents of relational tables to XML values. They can be thought of as XML export functionality:

table_to_xml ( table regclass, nulls boolean,
               tableforest boolean, targetns text ) → xml
query_to_xml ( query text, nulls boolean,
               tableforest boolean, targetns text ) → xml
cursor_to_xml ( cursor refcursor, count integer, nulls boolean,
                tableforest boolean, targetns text ) → xml

table_to_xml maps the content of the named table, passed as parameter table. The regclass type accepts strings identifying tables using the usual notation, including optional schema qualification and double quotes (see for details). query_to_xml executes the query whose text is passed as parameter query and maps the result set. cursor_to_xml fetches the indicated number of rows from the cursor specified by the parameter cursor. This variant is recommended if large tables have to be mapped, because the result value is built up in memory by each function.

If tableforest is false, then the resulting XML document looks like this:

<tablename>
  <row>
    <columnname1>data</columnname1>
    <columnname2>data</columnname2>
  </row>

  <row>
    ...
  </row>

  ...
</tablename>

If tableforest is true, the result is an XML content fragment that looks like this:

<tablename>
  <columnname1>data</columnname1>
  <columnname2>data</columnname2>
</tablename>

<tablename>
  ...
</tablename>

...

If no table name is available, that is, when mapping a query or a cursor, the string table is used in the first format, row in the second format.

The choice between these formats is up to the user. The first format is a proper XML document, which will be important in many applications. The second format tends to be more useful in the cursor_to_xml function if the result values are to be reassembled into one document later on. The functions for producing XML content discussed above, in particular xmlelement, can be used to alter the results to taste.

The data values are mapped in the same way as described for the function xmlelement above.

The parameter nulls determines whether null values should be included in the output. If true, null values in columns are represented as:

<columnname xsi:nil="true"/>

where xsi is the XML namespace prefix for XML Schema Instance. An appropriate namespace declaration will be added to the result value. If false, columns containing null values are simply omitted from the output.

The parameter targetns specifies the desired XML namespace of the result. If no particular namespace is wanted, an empty string should be passed.

The following functions return XML Schema documents describing the mappings performed by the corresponding functions above:

table_to_xmlschema ( table regclass, nulls boolean,
                     tableforest boolean, targetns text ) → xml
query_to_xmlschema ( query text, nulls boolean,
                     tableforest boolean, targetns text ) → xml
cursor_to_xmlschema ( cursor refcursor, nulls boolean,
                      tableforest boolean, targetns text ) → xml

It is essential that the same parameters are passed in order to obtain matching XML data mappings and XML Schema documents.

The following functions produce XML data mappings and the corresponding XML Schema in one document (or forest), linked together. They can be useful where self-contained and self-describing results are wanted:

table_to_xml_and_xmlschema ( table regclass, nulls boolean,
                             tableforest boolean, targetns text ) → xml
query_to_xml_and_xmlschema ( query text, nulls boolean,
                             tableforest boolean, targetns text ) → xml

In addition, the following functions are available to produce analogous mappings of entire schemas or the entire current database:

schema_to_xml ( schema name, nulls boolean,
                tableforest boolean, targetns text ) → xml
schema_to_xmlschema ( schema name, nulls boolean,
                      tableforest boolean, targetns text ) → xml
schema_to_xml_and_xmlschema ( schema name, nulls boolean,
                              tableforest boolean, targetns text ) → xml

database_to_xml ( nulls boolean, tableforest boolean, targetns text ) → xml
database_to_xmlschema ( nulls boolean, tableforest boolean, targetns text ) → xml
database_to_xml_and_xmlschema ( nulls boolean, tableforest boolean, targetns text ) → xml

These functions ignore tables that are not readable by the current user. The database-wide functions additionally ignore schemas that the current user does not have USAGE (lookup) privilege for.

Note that these potentially produce a lot of data, which needs to be built up in memory. When requesting content mappings of large schemas or databases, it might be worthwhile to consider mapping the tables separately instead, possibly even through a cursor.

The result of a schema content mapping looks like this:

<schemaname>

table1-mapping

table2-mapping

...

</schemaname>

where the format of a table mapping depends on the tableforest parameter as explained above.

The result of a database content mapping looks like this:

<dbname>

<schema1name>
  ...
</schema1name>

<schema2name>
  ...
</schema2name>

...

</dbname>

where the schema mapping is as above.

As an example of using the output produced by these functions, the example below shows an XSLT stylesheet that converts the output of table_to_xml_and_xmlschema to an HTML document containing a tabular rendition of the table data.
In a similar manner, the results from these functions can be converted into other XML-based formats.

XSLT Stylesheet for Converting SQL/XML Output to HTML

<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xsd="http://www.w3.org/2001/XMLSchema"
    xmlns="http://www.w3.org/1999/xhtml"
>

  <xsl:output method="xml"
      doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
      doctype-public="-//W3C/DTD XHTML 1.0 Strict//EN" indent="yes"/>

  <xsl:template match="/*">
    <xsl:variable name="schema" select="//xsd:schema"/>
    <xsl:variable name="tabletypename"
                  select="$schema/xsd:element[@name=name(current())]/@type"/>
    <xsl:variable name="rowtypename"
                  select="$schema/xsd:complexType[@name=$tabletypename]/xsd:sequence/xsd:element[@name='row']/@type"/>

    <html>
      <head>
        <title><xsl:value-of select="name(current())"/></title>
      </head>
      <body>
        <table>
          <tr>
            <xsl:for-each select="$schema/xsd:complexType[@name=$rowtypename]/xsd:sequence/xsd:element/@name">
              <th><xsl:value-of select="."/></th>
            </xsl:for-each>
          </tr>

          <xsl:for-each select="row">
            <tr>
              <xsl:for-each select="*">
                <td><xsl:value-of select="."/></td>
              </xsl:for-each>
            </tr>
          </xsl:for-each>
        </table>
      </body>
    </html>
  </xsl:template>

</xsl:stylesheet>
- - - JSON Functions and Operators - - - JSON - functions and operators - - - SQL/JSON - functions and expressions - - - - This section describes: - - - - - functions and operators for processing and creating JSON data - - - - - the SQL/JSON path language - - - - - the SQL/JSON query functions - - - - - - - To provide native support for JSON data types within the SQL environment, - PostgreSQL implements the - SQL/JSON data model. - This model comprises sequences of items. Each item can hold SQL scalar - values, with an additional SQL/JSON null value, and composite data structures - that use JSON arrays and objects. The model is a formalization of the implied - data model in the JSON specification - RFC 7159. - - - - SQL/JSON allows you to handle JSON data alongside regular SQL data, - with transaction support, including: - - - - - Uploading JSON data into the database and storing it in - regular SQL columns as character or binary strings. - - - - - Generating JSON objects and arrays from relational data. - - - - - Querying JSON data using SQL/JSON query functions and - SQL/JSON path language expressions. - - - - - - - To learn more about the SQL/JSON standard, see - . For details on JSON types - supported in PostgreSQL, - see . - - - - Processing and Creating JSON Data - - - shows the operators that - are available for use with JSON data types (see ). - In addition, the usual comparison operators shown in are available for - jsonb, though not for json. The comparison - operators follow the ordering rules for B-tree operations outlined in - . - See also for the aggregate - function json_agg which aggregates record - values as JSON, the aggregate function - json_object_agg which aggregates pairs of values - into a JSON object, and their jsonb equivalents, - jsonb_agg and jsonb_object_agg. - - - - <type>json</type> and <type>jsonb</type> Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - json -> integer - json - - - jsonb -> integer - jsonb - - - Extracts n'th element of JSON array - (array elements are indexed from zero, but negative integers count - from the end). - - - '[{"a":"foo"},{"b":"bar"},{"c":"baz"}]'::json -> 2 - {"c":"baz"} - - - '[{"a":"foo"},{"b":"bar"},{"c":"baz"}]'::json -> -3 - {"a":"foo"} - - - - - - json -> text - json - - - jsonb -> text - jsonb - - - Extracts JSON object field with the given key. - - - '{"a": {"b":"foo"}}'::json -> 'a' - {"b":"foo"} - - - - - - json ->> integer - text - - - jsonb ->> integer - text - - - Extracts n'th element of JSON array, - as text. - - - '[1,2,3]'::json ->> 2 - 3 - - - - - - json ->> text - text - - - jsonb ->> text - text - - - Extracts JSON object field with the given key, as text. - - - '{"a":1,"b":2}'::json ->> 'b' - 2 - - - - - - json #> text[] - json - - - jsonb #> text[] - jsonb - - - Extracts JSON sub-object at the specified path, where path elements - can be either field keys or array indexes. - - - '{"a": {"b": ["foo","bar"]}}'::json #> '{a,b,1}' - "bar" - - - - - - json #>> text[] - text - - - jsonb #>> text[] - text - - - Extracts JSON sub-object at the specified path as text. - - - '{"a": {"b": ["foo","bar"]}}'::json #>> '{a,b,1}' - bar - - - - -
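These operators are commonly chained: each -> step descends one level into the structure, and a final ->> extracts the result as text. A small sketch:

SELECT '{"a": {"b": {"c": "hello"}}}'::jsonb -> 'a' -> 'b' ->> 'c';
 ?column?
----------
 hello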
- - - - The field/element/path extraction operators return NULL, rather than - failing, if the JSON input does not have the right structure to match - the request; for example if no such key or array element exists. - - - - - Some further operators exist only for jsonb, as shown - in . - - describes how these operators can be used to effectively search indexed - jsonb data. - - - - Additional <type>jsonb</type> Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - jsonb @> jsonb - boolean - - - Does the first JSON value contain the second? - (See for details about containment.) - - - '{"a":1, "b":2}'::jsonb @> '{"b":2}'::jsonb - t - - - - - - jsonb <@ jsonb - boolean - - - Is the first JSON value contained in the second? - - - '{"b":2}'::jsonb <@ '{"a":1, "b":2}'::jsonb - t - - - - - - jsonb ? text - boolean - - - Does the text string exist as a top-level key or array element within - the JSON value? - - - '{"a":1, "b":2}'::jsonb ? 'b' - t - - - '["a", "b", "c"]'::jsonb ? 'b' - t - - - - - - jsonb ?| text[] - boolean - - - Do any of the strings in the text array exist as top-level keys or - array elements? - - - '{"a":1, "b":2, "c":3}'::jsonb ?| array['b', 'd'] - t - - - - - - jsonb ?& text[] - boolean - - - Do all of the strings in the text array exist as top-level keys or - array elements? - - - '["a", "b", "c"]'::jsonb ?& array['a', 'b'] - t - - - - - - jsonb || jsonb - jsonb - - - Concatenates two jsonb values. - Concatenating two arrays generates an array containing all the - elements of each input. Concatenating two objects generates an - object containing the union of their - keys, taking the second object's value when there are duplicate keys. - All other cases are treated by converting a non-array input into a - single-element array, and then proceeding as for two arrays. - Does not operate recursively: only the top-level array or object - structure is merged. - - - '["a", "b"]'::jsonb || '["a", "d"]'::jsonb - ["a", "b", "a", "d"] - - - '{"a": "b"}'::jsonb || '{"c": "d"}'::jsonb - {"a": "b", "c": "d"} - - - '[1, 2]'::jsonb || '3'::jsonb - [1, 2, 3] - - - '{"a": "b"}'::jsonb || '42'::jsonb - [{"a": "b"}, 42] - - - To append an array to another array as a single entry, wrap it - in an additional layer of array, for example: - - - '[1, 2]'::jsonb || jsonb_build_array('[3, 4]'::jsonb) - [1, 2, [3, 4]] - - - - - - jsonb - text - jsonb - - - Deletes a key (and its value) from a JSON object, or matching string - value(s) from a JSON array. - - - '{"a": "b", "c": "d"}'::jsonb - 'a' - {"c": "d"} - - - '["a", "b", "c", "b"]'::jsonb - 'b' - ["a", "c"] - - - - - - jsonb - text[] - jsonb - - - Deletes all matching keys or array elements from the left operand. - - - '{"a": "b", "c": "d"}'::jsonb - '{a,c}'::text[] - {} - - - - - - jsonb - integer - jsonb - - - Deletes the array element with specified index (negative - integers count from the end). Throws an error if JSON value - is not an array. - - - '["a", "b"]'::jsonb - 1 - ["a"] - - - - - - jsonb #- text[] - jsonb - - - Deletes the field or array element at the specified path, where path - elements can be either field keys or array indexes. - - - '["a", {"b":1}]'::jsonb #- '{1,b}' - ["a", {}] - - - - - - jsonb @? jsonpath - boolean - - - Does JSON path return any item for the specified JSON value? - (This is useful only with SQL-standard JSON path expressions, not - predicate check - expressions, since those always return a value.) - - - '{"a":[1,2,3,4,5]}'::jsonb @? '$.a[*] ? 
(@ > 2)' - t - - - - - - jsonb @@ jsonpath - boolean - - - Returns the result of a JSON path predicate check for the - specified JSON value. - (This is useful only - with predicate - check expressions, not SQL-standard JSON path expressions, - since it will return NULL if the path result is - not a single boolean value.) - - - '{"a":[1,2,3,4,5]}'::jsonb @@ '$.a[*] > 2' - t - - - - -
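The two path operators are easy to confuse: @? asks whether the path yields at least one item, while @@ evaluates a predicate expression and returns its boolean result. A sketch of the same test written both ways:

SELECT '{"a":[1,2,3,4,5]}'::jsonb @? '$.a[*] ? (@ > 2)' AS any_match,  -- filter form
       '{"a":[1,2,3,4,5]}'::jsonb @@ '$.a[*] > 2'       AS predicate;  -- predicate form
 any_match | predicate
-----------+-----------
 t         | t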
- - - - The jsonpath operators @? - and @@ suppress the following errors: missing object - field or array element, unexpected JSON item type, datetime and numeric - errors. The jsonpath-related functions described below can - also be told to suppress these types of errors. This behavior might be - helpful when searching JSON document collections of varying structure. - - - - - shows the functions that are - available for constructing json and jsonb values. - Some functions in this table have a RETURNING clause, - which specifies the data type returned. It must be one of json, - jsonb, bytea, a character string type (text, - char, or varchar), or a type - that can be cast to json. - By default, the json type is returned. - - - - JSON Creation Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - to_json - - to_json ( anyelement ) - json - - - - to_jsonb - - to_jsonb ( anyelement ) - jsonb - - - Converts any SQL value to json or jsonb. - Arrays and composites are converted recursively to arrays and - objects (multidimensional arrays become arrays of arrays in JSON). - Otherwise, if there is a cast from the SQL data type - to json, the cast function will be used to perform the - conversion; - - For example, the extension has a cast - from hstore to json, so that - hstore values converted via the JSON creation functions - will be represented as JSON objects, not as primitive string values. - - - otherwise, a scalar JSON value is produced. For any scalar other than - a number, a Boolean, or a null value, the text representation will be - used, with escaping as necessary to make it a valid JSON string value. - - - to_json('Fred said "Hi."'::text) - "Fred said \"Hi.\"" - - - to_jsonb(row(42, 'Fred said "Hi."'::text)) - {"f1": 42, "f2": "Fred said \"Hi.\""} - - - - - - - array_to_json - - array_to_json ( anyarray , boolean ) - json - - - Converts an SQL array to a JSON array. The behavior is the same - as to_json except that line feeds will be added - between top-level array elements if the optional boolean parameter is - true. - - - array_to_json('{{1,5},{99,100}}'::int[]) - [[1,5],[99,100]] - - - - - - - json_array - json_array ( - { value_expression FORMAT JSON } , ... - { NULL | ABSENT } ON NULL - RETURNING data_type FORMAT JSON ENCODING UTF8 ) - - - json_array ( - query_expression - RETURNING data_type FORMAT JSON ENCODING UTF8 ) - - - Constructs a JSON array from either a series of - value_expression parameters or from the results - of query_expression, - which must be a SELECT query returning a single column. If - ABSENT ON NULL is specified, NULL values are ignored. - This is always the case if a - query_expression is used. - - - json_array(1,true,json '{"a":null}') - [1, true, {"a":null}] - - - json_array(SELECT * FROM (VALUES(1),(2)) t) - [1, 2] - - - - - - - row_to_json - - row_to_json ( record , boolean ) - json - - - Converts an SQL composite value to a JSON object. The behavior is the - same as to_json except that line feeds will be - added between top-level elements if the optional boolean parameter is - true. - - - row_to_json(row(1,'foo')) - {"f1":1,"f2":"foo"} - - - - - - - json_build_array - - json_build_array ( VARIADIC "any" ) - json - - - - jsonb_build_array - - jsonb_build_array ( VARIADIC "any" ) - jsonb - - - Builds a possibly-heterogeneously-typed JSON array out of a variadic - argument list. Each argument is converted as - per to_json or to_jsonb. 
- - - json_build_array(1, 2, 'foo', 4, 5) - [1, 2, "foo", 4, 5] - - - - - - - json_build_object - - json_build_object ( VARIADIC "any" ) - json - - - - jsonb_build_object - - jsonb_build_object ( VARIADIC "any" ) - jsonb - - - Builds a JSON object out of a variadic argument list. By convention, - the argument list consists of alternating keys and values. Key - arguments are coerced to text; value arguments are converted as - per to_json or to_jsonb. - - - json_build_object('foo', 1, 2, row(3,'bar')) - {"foo" : 1, "2" : {"f1":3,"f2":"bar"}} - - - - - - json_object - json_object ( - { key_expression { VALUE | ':' } - value_expression FORMAT JSON ENCODING UTF8 }, ... - { NULL | ABSENT } ON NULL - { WITH | WITHOUT } UNIQUE KEYS - RETURNING data_type FORMAT JSON ENCODING UTF8 ) - - - Constructs a JSON object of all the key/value pairs given, - or an empty object if none are given. - key_expression is a scalar expression - defining the JSON key, which is - converted to the text type. - It cannot be NULL nor can it - belong to a type that has a cast to the json type. - If WITH UNIQUE KEYS is specified, there must not - be any duplicate key_expression. - Any pair for which the value_expression - evaluates to NULL is omitted from the output - if ABSENT ON NULL is specified; - if NULL ON NULL is specified or the clause - omitted, the key is included with value NULL. - - - json_object('code' VALUE 'P123', 'title': 'Jaws') - {"code" : "P123", "title" : "Jaws"} - - - - - - - json_object - - json_object ( text[] ) - json - - - - jsonb_object - - jsonb_object ( text[] ) - jsonb - - - Builds a JSON object out of a text array. The array must have either - exactly one dimension with an even number of members, in which case - they are taken as alternating key/value pairs, or two dimensions - such that each inner array has exactly two elements, which - are taken as a key/value pair. All values are converted to JSON - strings. - - - json_object('{a, 1, b, "def", c, 3.5}') - {"a" : "1", "b" : "def", "c" : "3.5"} - - json_object('{{a, 1}, {b, "def"}, {c, 3.5}}') - {"a" : "1", "b" : "def", "c" : "3.5"} - - - - - - json_object ( keys text[], values text[] ) - json - - - jsonb_object ( keys text[], values text[] ) - jsonb - - - This form of json_object takes keys and values - pairwise from separate text arrays. Otherwise it is identical to - the one-argument form. - - - json_object('{a,b}', '{1,2}') - {"a": "1", "b": "2"} - - - - - - json constructor - json ( - expression - FORMAT JSON ENCODING UTF8 - { WITH | WITHOUT } UNIQUE KEYS ) - json - - - Converts a given expression specified as text or - bytea string (in UTF8 encoding) into a JSON - value. If expression is NULL, an - SQL null value is returned. - If WITH UNIQUE is specified, the - expression must not contain any duplicate - object keys. - - - json('{"a":123, "b":[true,"foo"], "a":"bar"}') - {"a":123, "b":[true,"foo"], "a":"bar"} - - - - - - - json_scalar - json_scalar ( expression ) - - - Converts a given SQL scalar value into a JSON scalar value. - If the input is NULL, an SQL null is returned. If - the input is number or a boolean value, a corresponding JSON number - or boolean value is returned. For any other value, a JSON string is - returned. - - - json_scalar(123.45) - 123.45 - - - json_scalar(CURRENT_TIMESTAMP) - "2022-05-10T10:51:04.62128-04:00" - - - - - - json_serialize ( - expression FORMAT JSON ENCODING UTF8 - RETURNING data_type FORMAT JSON ENCODING UTF8 ) - - - Converts an SQL/JSON expression into a character or binary string. 
The - expression can be of any JSON type, any - character string type, or bytea in UTF8 encoding. - The returned type used in RETURNING can be any - character string type or bytea. The default is - text. - - - json_serialize('{ "a" : 1 } ' RETURNING bytea) - \x7b20226122203a2031207d20 - - - - -
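As a small sketch of the RETURNING clause described above, a constructor can be told to emit jsonb directly instead of the default json:

SELECT json_array(1, 2, 'foo' RETURNING jsonb);
   json_array
---------------
 [1, 2, "foo"]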
- - - details SQL/JSON - facilities for testing JSON. - - - - SQL/JSON Testing Functions - - - - - Function signature - - - Description - - - Example(s) - - - - - - - IS JSON - expression IS NOT JSON - { VALUE | SCALAR | ARRAY | OBJECT } - { WITH | WITHOUT } UNIQUE KEYS - - - This predicate tests whether expression can be - parsed as JSON, possibly of a specified type. - If SCALAR or ARRAY or - OBJECT is specified, the - test is whether or not the JSON is of that particular type. If - WITH UNIQUE KEYS is specified, then any object in the - expression is also tested to see if it - has duplicate keys. - - - -SELECT js, - js IS JSON "json?", - js IS JSON SCALAR "scalar?", - js IS JSON OBJECT "object?", - js IS JSON ARRAY "array?" -FROM (VALUES - ('123'), ('"abc"'), ('{"a": "b"}'), ('[1,2]'),('abc')) foo(js); - js | json? | scalar? | object? | array? -------------+-------+---------+---------+-------- - 123 | t | t | f | f - "abc" | t | t | f | f - {"a": "b"} | t | f | t | f - [1,2] | t | f | f | t - abc | f | f | f | f - - - - -SELECT js, - js IS JSON OBJECT "object?", - js IS JSON ARRAY "array?", - js IS JSON ARRAY WITH UNIQUE KEYS "array w. UK?", - js IS JSON ARRAY WITHOUT UNIQUE KEYS "array w/o UK?" -FROM (VALUES ('[{"a":"1"}, - {"b":"2","b":"3"}]')) foo(js); --[ RECORD 1 ]-+-------------------- -js | [{"a":"1"}, + - | {"b":"2","b":"3"}] -object? | f -array? | t -array w. UK? | f -array w/o UK? | t - - - - - -
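A common use of this predicate, sketched below with a hypothetical values list t(payload), is to filter out rows that would fail a cast to jsonb:

SELECT payload::jsonb
  FROM (VALUES ('{"ok": true}'), ('not json')) AS t(payload)  -- t(payload) is illustrative
 WHERE payload IS JSON;  -- the second row is rejected by the predicate
   payload
--------------
 {"ok": true}
(1 row)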
- - - shows the functions that - are available for processing json and jsonb values. - - - - JSON Processing Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - json_array_elements - - json_array_elements ( json ) - setof json - - - - jsonb_array_elements - - jsonb_array_elements ( jsonb ) - setof jsonb - - - Expands the top-level JSON array into a set of JSON values. - - - select * from json_array_elements('[1,true, [2,false]]') - - - value ------------ - 1 - true - [2,false] - - - - - - - - json_array_elements_text - - json_array_elements_text ( json ) - setof text - - - - jsonb_array_elements_text - - jsonb_array_elements_text ( jsonb ) - setof text - - - Expands the top-level JSON array into a set of text values. - - - select * from json_array_elements_text('["foo", "bar"]') - - - value ------------ - foo - bar - - - - - - - - json_array_length - - json_array_length ( json ) - integer - - - - jsonb_array_length - - jsonb_array_length ( jsonb ) - integer - - - Returns the number of elements in the top-level JSON array. - - - json_array_length('[1,2,3,{"f1":1,"f2":[5,6]},4]') - 5 - - - jsonb_array_length('[]') - 0 - - - - - - - json_each - - json_each ( json ) - setof record - ( key text, - value json ) - - - - jsonb_each - - jsonb_each ( jsonb ) - setof record - ( key text, - value jsonb ) - - - Expands the top-level JSON object into a set of key/value pairs. - - - select * from json_each('{"a":"foo", "b":"bar"}') - - - key | value ------+------- - a | "foo" - b | "bar" - - - - - - - - json_each_text - - json_each_text ( json ) - setof record - ( key text, - value text ) - - - - jsonb_each_text - - jsonb_each_text ( jsonb ) - setof record - ( key text, - value text ) - - - Expands the top-level JSON object into a set of key/value pairs. - The returned values will be of - type text. - - - select * from json_each_text('{"a":"foo", "b":"bar"}') - - - key | value ------+------- - a | foo - b | bar - - - - - - - - json_extract_path - - json_extract_path ( from_json json, VARIADIC path_elems text[] ) - json - - - - jsonb_extract_path - - jsonb_extract_path ( from_json jsonb, VARIADIC path_elems text[] ) - jsonb - - - Extracts JSON sub-object at the specified path. - (This is functionally equivalent to the #> - operator, but writing the path out as a variadic list can be more - convenient in some cases.) - - - json_extract_path('{"f2":{"f3":1},"f4":{"f5":99,"f6":"foo"}}', 'f4', 'f6') - "foo" - - - - - - - json_extract_path_text - - json_extract_path_text ( from_json json, VARIADIC path_elems text[] ) - text - - - - jsonb_extract_path_text - - jsonb_extract_path_text ( from_json jsonb, VARIADIC path_elems text[] ) - text - - - Extracts JSON sub-object at the specified path as text. - (This is functionally equivalent to the #>> - operator.) - - - json_extract_path_text('{"f2":{"f3":1},"f4":{"f5":99,"f6":"foo"}}', 'f4', 'f6') - foo - - - - - - - json_object_keys - - json_object_keys ( json ) - setof text - - - - jsonb_object_keys - - jsonb_object_keys ( jsonb ) - setof text - - - Returns the set of keys in the top-level JSON object. 
- - - select * from json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}') - - - json_object_keys ------------------- - f1 - f2 - - - - - - - - json_populate_record - - json_populate_record ( base anyelement, from_json json ) - anyelement - - - - jsonb_populate_record - - jsonb_populate_record ( base anyelement, from_json jsonb ) - anyelement - - - Expands the top-level JSON object to a row having the composite type - of the base argument. The JSON object - is scanned for fields whose names match column names of the output row - type, and their values are inserted into those columns of the output. - (Fields that do not correspond to any output column name are ignored.) - In typical use, the value of base is just - NULL, which means that any output columns that do - not match any object field will be filled with nulls. However, - if base isn't NULL then - the values it contains will be used for unmatched columns. - - - To convert a JSON value to the SQL type of an output column, the - following rules are applied in sequence: - - - - A JSON null value is converted to an SQL null in all cases. - - - - - If the output column is of type json - or jsonb, the JSON value is just reproduced exactly. - - - - - If the output column is a composite (row) type, and the JSON value - is a JSON object, the fields of the object are converted to columns - of the output row type by recursive application of these rules. - - - - - Likewise, if the output column is an array type and the JSON value - is a JSON array, the elements of the JSON array are converted to - elements of the output array by recursive application of these - rules. - - - - - Otherwise, if the JSON value is a string, the contents of the - string are fed to the input conversion function for the column's - data type. - - - - - Otherwise, the ordinary text representation of the JSON value is - fed to the input conversion function for the column's data type. - - - - - - While the example below uses a constant JSON value, typical use would - be to reference a json or jsonb column - laterally from another table in the query's FROM - clause. Writing json_populate_record in - the FROM clause is good practice, since all of the - extracted columns are available for use without duplicate function - calls. - - - create type subrowtype as (d int, e text); - create type myrowtype as (a int, b text[], c subrowtype); - - - select * from json_populate_record(null::myrowtype, - '{"a": 1, "b": ["2", "a b"], "c": {"d": 4, "e": "a b c"}, "x": "foo"}') - - - a | b | c ----+-----------+------------- - 1 | {2,"a b"} | (4,"a b c") - - - - - - - - jsonb_populate_record_valid - - jsonb_populate_record_valid ( base anyelement, from_json json ) - boolean - - - Function for testing jsonb_populate_record. Returns - true if the input jsonb_populate_record - would finish without an error for the given input JSON object; that is, it's - valid input, false otherwise. 
- - - create type jsb_char2 as (a char(2)); - - - select jsonb_populate_record_valid(NULL::jsb_char2, '{"a": "aaa"}'); - - - jsonb_populate_record_valid ------------------------------ - f -(1 row) - - - select * from jsonb_populate_record(NULL::jsb_char2, '{"a": "aaa"}') q; - - -ERROR: value too long for type character(2) - - select jsonb_populate_record_valid(NULL::jsb_char2, '{"a": "aa"}'); - - - jsonb_populate_record_valid ------------------------------ - t -(1 row) - - - select * from jsonb_populate_record(NULL::jsb_char2, '{"a": "aa"}') q; - - - a ----- - aa -(1 row) - - - - - - - - json_populate_recordset - - json_populate_recordset ( base anyelement, from_json json ) - setof anyelement - - - - jsonb_populate_recordset - - jsonb_populate_recordset ( base anyelement, from_json jsonb ) - setof anyelement - - - Expands the top-level JSON array of objects to a set of rows having - the composite type of the base argument. - Each element of the JSON array is processed as described above - for json[b]_populate_record. - - - create type twoints as (a int, b int); - - - select * from json_populate_recordset(null::twoints, '[{"a":1,"b":2}, {"a":3,"b":4}]') - - - a | b ----+--- - 1 | 2 - 3 | 4 - - - - - - - - json_to_record - - json_to_record ( json ) - record - - - - jsonb_to_record - - jsonb_to_record ( jsonb ) - record - - - Expands the top-level JSON object to a row having the composite type - defined by an AS clause. (As with all functions - returning record, the calling query must explicitly - define the structure of the record with an AS - clause.) The output record is filled from fields of the JSON object, - in the same way as described above - for json[b]_populate_record. Since there is no - input record value, unmatched columns are always filled with nulls. - - - create type myrowtype as (a int, b text); - - - select * from json_to_record('{"a":1,"b":[1,2,3],"c":[1,2,3],"e":"bar","r": {"a": 123, "b": "a b c"}}') as x(a int, b text, c int[], d text, r myrowtype) - - - a | b | c | d | r ----+---------+---------+---+--------------- - 1 | [1,2,3] | {1,2,3} | | (123,"a b c") - - - - - - - - json_to_recordset - - json_to_recordset ( json ) - setof record - - - - jsonb_to_recordset - - jsonb_to_recordset ( jsonb ) - setof record - - - Expands the top-level JSON array of objects to a set of rows having - the composite type defined by an AS clause. (As - with all functions returning record, the calling query - must explicitly define the structure of the record with - an AS clause.) Each element of the JSON array is - processed as described above - for json[b]_populate_record. - - - select * from json_to_recordset('[{"a":1,"b":"foo"}, {"a":"2","c":"bar"}]') as x(a int, b text) - - - a | b ----+----- - 1 | foo - 2 | - - - - - - - - jsonb_set - - jsonb_set ( target jsonb, path text[], new_value jsonb , create_if_missing boolean ) - jsonb - - - Returns target - with the item designated by path - replaced by new_value, or with - new_value added if - create_if_missing is true (which is the - default) and the item designated by path - does not exist. - All earlier steps in the path must exist, or - the target is returned unchanged. - As with the path oriented operators, negative integers that - appear in the path count from the end - of JSON arrays. - If the last path step is an array index that is out of range, - and create_if_missing is true, the new - value is added at the beginning of the array if the index is negative, - or at the end of the array if it is positive. 
- - - jsonb_set('[{"f1":1,"f2":null},2,null,3]', '{0,f1}', '[2,3,4]', false) - [{"f1": [2, 3, 4], "f2": null}, 2, null, 3] - - - jsonb_set('[{"f1":1,"f2":null},2]', '{0,f3}', '[2,3,4]') - [{"f1": 1, "f2": null, "f3": [2, 3, 4]}, 2] - - - - - - - jsonb_set_lax - - jsonb_set_lax ( target jsonb, path text[], new_value jsonb , create_if_missing boolean , null_value_treatment text ) - jsonb - - - If new_value is not NULL, - behaves identically to jsonb_set. Otherwise behaves - according to the value - of null_value_treatment which must be one - of 'raise_exception', - 'use_json_null', 'delete_key', or - 'return_target'. The default is - 'use_json_null'. - - - jsonb_set_lax('[{"f1":1,"f2":null},2,null,3]', '{0,f1}', null) - [{"f1": null, "f2": null}, 2, null, 3] - - - jsonb_set_lax('[{"f1":99,"f2":null},2]', '{0,f3}', null, true, 'return_target') - [{"f1": 99, "f2": null}, 2] - - - - - - - jsonb_insert - - jsonb_insert ( target jsonb, path text[], new_value jsonb , insert_after boolean ) - jsonb - - - Returns target - with new_value inserted. If the item - designated by the path is an array - element, new_value will be inserted before - that item if insert_after is false (which - is the default), or after it - if insert_after is true. If the item - designated by the path is an object - field, new_value will be inserted only if - the object does not already contain that key. - All earlier steps in the path must exist, or - the target is returned unchanged. - As with the path oriented operators, negative integers that - appear in the path count from the end - of JSON arrays. - If the last path step is an array index that is out of range, the new - value is added at the beginning of the array if the index is negative, - or at the end of the array if it is positive. - - - jsonb_insert('{"a": [0,1,2]}', '{a, 1}', '"new_value"') - {"a": [0, "new_value", 1, 2]} - - - jsonb_insert('{"a": [0,1,2]}', '{a, 1}', '"new_value"', true) - {"a": [0, 1, "new_value", 2]} - - - - - - - json_strip_nulls - - json_strip_nulls ( target json ,strip_in_arrays boolean ) - json - - - - jsonb_strip_nulls - - jsonb_strip_nulls ( target jsonb ,strip_in_arrays boolean ) - jsonb - - - Deletes all object fields that have null values from the given JSON - value, recursively. - If strip_in_arrays is true (the default is false), - null array elements are also stripped. - Otherwise they are not stripped. Bare null values are never stripped. - - - json_strip_nulls('[{"f1":1, "f2":null}, 2, null, 3]') - [{"f1":1},2,null,3] - - - jsonb_strip_nulls('[1,2,null,3,4]', true); - [1,2,3,4] - - - - - - - - jsonb_path_exists - - jsonb_path_exists ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - boolean - - - Checks whether the JSON path returns any item for the specified JSON - value. - (This is useful only with SQL-standard JSON path expressions, not - predicate check - expressions, since those always return a value.) - If the vars argument is specified, it must - be a JSON object, and its fields provide named values to be - substituted into the jsonpath expression. - If the silent argument is specified and - is true, the function suppresses the same errors - as the @? and @@ operators do. - - - jsonb_path_exists('{"a":[1,2,3,4,5]}', '$.a[*] ? (@ >= $min && @ <= $max)', '{"min":2, "max":4}') - t - - - - - - - jsonb_path_match - - jsonb_path_match ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - boolean - - - Returns the SQL boolean result of a JSON path predicate check - for the specified JSON value. 
- (This is useful only - with predicate - check expressions, not SQL-standard JSON path expressions, - since it will either fail or return NULL if the - path result is not a single boolean value.) - The optional vars - and silent arguments act the same as - for jsonb_path_exists. - - - jsonb_path_match('{"a":[1,2,3,4,5]}', 'exists($.a[*] ? (@ >= $min && @ <= $max))', '{"min":2, "max":4}') - t - - - - - - - jsonb_path_query - - jsonb_path_query ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - setof jsonb - - - Returns all JSON items returned by the JSON path for the specified - JSON value. - For SQL-standard JSON path expressions it returns the JSON - values selected from target. - For predicate - check expressions it returns the result of the predicate - check: true, false, - or null. - The optional vars - and silent arguments act the same as - for jsonb_path_exists. - - - select * from jsonb_path_query('{"a":[1,2,3,4,5]}', '$.a[*] ? (@ >= $min && @ <= $max)', '{"min":2, "max":4}') - - - jsonb_path_query ------------------- - 2 - 3 - 4 - - - - - - - - jsonb_path_query_array - - jsonb_path_query_array ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - jsonb - - - Returns all JSON items returned by the JSON path for the specified - JSON value, as a JSON array. - The parameters are the same as - for jsonb_path_query. - - - jsonb_path_query_array('{"a":[1,2,3,4,5]}', '$.a[*] ? (@ >= $min && @ <= $max)', '{"min":2, "max":4}') - [2, 3, 4] - - - - - - - jsonb_path_query_first - - jsonb_path_query_first ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - jsonb - - - Returns the first JSON item returned by the JSON path for the - specified JSON value, or NULL if there are no - results. - The parameters are the same as - for jsonb_path_query. - - - jsonb_path_query_first('{"a":[1,2,3,4,5]}', '$.a[*] ? (@ >= $min && @ <= $max)', '{"min":2, "max":4}') - 2 - - - - - - - jsonb_path_exists_tz - - jsonb_path_exists_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - boolean - - - - jsonb_path_match_tz - - jsonb_path_match_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - boolean - - - - jsonb_path_query_tz - - jsonb_path_query_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - setof jsonb - - - - jsonb_path_query_array_tz - - jsonb_path_query_array_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - jsonb - - - - jsonb_path_query_first_tz - - jsonb_path_query_first_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) - jsonb - - - These functions act like their counterparts described above without - the _tz suffix, except that these functions support - comparisons of date/time values that require timezone-aware - conversions. The example below requires interpretation of the - date-only value 2015-08-02 as a timestamp with time - zone, so the result depends on the current - setting. Due to this dependency, these - functions are marked as stable, which means these functions cannot be - used in indexes. Their counterparts are immutable, and so can be used - in indexes; but they will throw errors if asked to make such - comparisons. - - - jsonb_path_exists_tz('["2015-08-01 12:00:00-05"]', '$[*] ? (@.datetime() < "2015-08-02".datetime())') - t - - - - - - - jsonb_pretty - - jsonb_pretty ( jsonb ) - text - - - Converts the given JSON value to pretty-printed, indented text. 
- - - jsonb_pretty('[{"f1":1,"f2":null}, 2]') - - -[ - { - "f1": 1, - "f2": null - }, - 2 -] - - - - - - - - json_typeof - - json_typeof ( json ) - text - - - - jsonb_typeof - - jsonb_typeof ( jsonb ) - text - - - Returns the type of the top-level JSON value as a text string. - Possible types are - object, array, - string, number, - boolean, and null. - (The null result should not be confused - with an SQL NULL; see the examples.) - - - json_typeof('-123.4') - number - - - json_typeof('null'::json) - null - - - json_typeof(NULL::json) IS NULL - t - - - - -
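As a supplement to the json[b]_populate_record entry above, the recommended lateral FROM-clause style can be sketched as follows. The table events, the type payload_t, and the sample rows are hypothetical, invented only for this illustration:

create table events (id int, payload jsonb);  -- hypothetical scratch table
insert into events values
  (1, '{"a": 10, "b": ["x", "y"]}'),
  (2, '{"a": 20}');
create type payload_t as (a int, b text[]);   -- hypothetical row type

select e.id, p.a, p.b
from events as e,
     jsonb_populate_record(null::payload_t, e.payload) as p;

 id | a  |   b
----+----+-------
  1 | 10 | {x,y}
  2 | 20 |
(2 rows)

Because the function appears in the FROM clause, both extracted columns are available without invoking the function once per column.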
The SQL/JSON Path Language

SQL/JSON path expressions specify item(s) to be retrieved from a JSON value, similarly to XPath expressions used for access to XML content. In PostgreSQL, path expressions are implemented as the jsonpath data type and can use any elements described for that type.

JSON query functions and operators pass the provided path expression to the path engine for evaluation. If the expression matches the queried JSON data, the corresponding JSON item, or set of items, is returned. If there is no match, the result will be NULL, false, or an error, depending on the function. Path expressions are written in the SQL/JSON path language and can include arithmetic expressions and functions.

A path expression consists of a sequence of elements allowed by the jsonpath data type. The path expression is normally evaluated from left to right, but you can use parentheses to change the order of operations. If the evaluation is successful, a sequence of JSON items is produced, and the evaluation result is returned to the JSON query function that completes the specified computation.

To refer to the JSON value being queried (the context item), use the $ variable in the path expression. The first element of a path must always be $. It can be followed by one or more accessor operators, which go down the JSON structure level by level to retrieve sub-items of the context item. Each accessor operator acts on the result(s) of the previous evaluation step, producing zero, one, or more output items from each input item.

For example, suppose you have some JSON data from a GPS tracker that you would like to parse, such as:

SELECT '{
  "track": {
    "segments": [
      {
        "location":   [ 47.763, 13.4034 ],
        "start time": "2018-10-14 10:05:14",
        "HR": 73
      },
      {
        "location":   [ 47.706, 13.2635 ],
        "start time": "2018-10-14 10:39:21",
        "HR": 135
      }
    ]
  }
}' AS json \gset

(The above example can be copied-and-pasted into psql to set things up for the following examples. Then psql will expand :'json' into a suitably-quoted string constant containing the JSON value.)

To retrieve the available track segments, you need to use the .key accessor operator to descend through surrounding JSON objects, for example:

=> select jsonb_path_query(:'json', '$.track.segments');
                                                                      jsonb_path_query
--------------------------------------------------------------------------------------------------------------------------------------------------------------------
 [{"HR": 73, "location": [47.763, 13.4034], "start time": "2018-10-14 10:05:14"}, {"HR": 135, "location": [47.706, 13.2635], "start time": "2018-10-14 10:39:21"}]

To retrieve the contents of an array, you typically use the [*] operator.
- The following example will return the location coordinates for all - the available track segments: - -=> select jsonb_path_query(:'json', '$.track.segments[*].location'); - jsonb_path_query -------------------- - [47.763, 13.4034] - [47.706, 13.2635] - - Here we started with the whole JSON input value ($), - then the .track accessor selected the JSON object - associated with the "track" object key, then - the .segments accessor selected the JSON array - associated with the "segments" key within that - object, then the [*] accessor selected each element - of that array (producing a series of items), then - the .location accessor selected the JSON array - associated with the "location" key within each of - those objects. In this example, each of those objects had - a "location" key; but if any of them did not, - the .location accessor would have simply produced no - output for that input item. - - - - To return the coordinates of the first segment only, you can - specify the corresponding subscript in the [] - accessor operator. Recall that JSON array indexes are 0-relative: - -=> select jsonb_path_query(:'json', '$.track.segments[0].location'); - jsonb_path_query -------------------- - [47.763, 13.4034] - - - - - The result of each path evaluation step can be processed - by one or more of the jsonpath operators and methods - listed in . - Each method name must be preceded by a dot. For example, - you can get the size of an array: - -=> select jsonb_path_query(:'json', '$.track.segments.size()'); - jsonb_path_query ------------------- - 2 - - More examples of using jsonpath operators - and methods within path expressions appear below in - . - - - - A path can also contain - filter expressions that work similarly to the - WHERE clause in SQL. A filter expression begins with - a question mark and provides a condition in parentheses: - - -? (condition) - - - - - Filter expressions must be written just after the path evaluation step - to which they should apply. The result of that step is filtered to include - only those items that satisfy the provided condition. SQL/JSON defines - three-valued logic, so the condition can - produce true, false, - or unknown. The unknown value - plays the same role as SQL NULL and can be tested - for with the is unknown predicate. Further path - evaluation steps use only those items for which the filter expression - returned true. - - - - The functions and operators that can be used in filter expressions are - listed in . Within a - filter expression, the @ variable denotes the value - being considered (i.e., one result of the preceding path step). You can - write accessor operators after @ to retrieve component - items. - - - - For example, suppose you would like to retrieve all heart rate values higher - than 130. You can achieve this as follows: - -=> select jsonb_path_query(:'json', '$.track.segments[*].HR ? (@ > 130)'); - jsonb_path_query ------------------- - 135 - - - - - To get the start times of segments with such values, you have to - filter out irrelevant segments before selecting the start times, so the - filter expression is applied to the previous step, and the path used - in the condition is different: - -=> select jsonb_path_query(:'json', '$.track.segments[*] ? (@.HR > 130)."start time"'); - jsonb_path_query ------------------------ - "2018-10-14 10:39:21" - - - - - You can use several filter expressions in sequence, if required. 
- The following example selects start times of all segments that - contain locations with relevant coordinates and high heart rate values: - -=> select jsonb_path_query(:'json', '$.track.segments[*] ? (@.location[1] < 13.4) ? (@.HR > 130)."start time"'); - jsonb_path_query ------------------------ - "2018-10-14 10:39:21" - - - - - Using filter expressions at different nesting levels is also allowed. - The following example first filters all segments by location, and then - returns high heart rate values for these segments, if available: - -=> select jsonb_path_query(:'json', '$.track.segments[*] ? (@.location[1] < 13.4).HR ? (@ > 130)'); - jsonb_path_query ------------------- - 135 - - - - - You can also nest filter expressions within each other. - This example returns the size of the track if it contains any - segments with high heart rate values, or an empty sequence otherwise: - -=> select jsonb_path_query(:'json', '$.track ? (exists(@.segments[*] ? (@.HR > 130))).segments.size()'); - jsonb_path_query ------------------- - 2 - - - - - Deviations from the SQL Standard - - PostgreSQL's implementation of the SQL/JSON path - language has the following deviations from the SQL/JSON standard. - - - - Boolean Predicate Check Expressions - - As an extension to the SQL standard, - a PostgreSQL path expression can be a - Boolean predicate, whereas the SQL standard allows predicates only within - filters. While SQL-standard path expressions return the relevant - element(s) of the queried JSON value, predicate check expressions - return the single three-valued jsonb result of the - predicate: true, - false, or null. - For example, we could write this SQL-standard filter expression: - -=> select jsonb_path_query(:'json', '$.track.segments ?(@[*].HR > 130)'); - jsonb_path_query ------------------------------------------------------------&zwsp;---------------------- - {"HR": 135, "location": [47.706, 13.2635], "start time": "2018-10-14 10:39:21"} - - The similar predicate check expression simply - returns true, indicating that a match exists: - -=> select jsonb_path_query(:'json', '$.track.segments[*].HR > 130'); - jsonb_path_query ------------------- - true - - - - - - Predicate check expressions are required in the - @@ operator (and the - jsonb_path_match function), and should not be used - with the @? operator (or the - jsonb_path_exists function). - - - - - - Regular Expression Interpretation - - There are minor differences in the interpretation of regular - expression patterns used in like_regex filters, as - described in . - - - - - - Strict and Lax Modes - - When you query JSON data, the path expression may not match the - actual JSON data structure. An attempt to access a non-existent - member of an object or element of an array is defined as a - structural error. SQL/JSON path expressions have two modes - of handling structural errors: - - - - - - lax (default) — the path engine implicitly adapts - the queried data to the specified path. - Any structural errors that cannot be fixed as described below - are suppressed, producing no match. - - - - - strict — if a structural error occurs, an error is raised. - - - - - - Lax mode facilitates matching of a JSON document and path - expression when the JSON data does not conform to the expected schema. - If an operand does not match the requirements of a particular operation, - it can be automatically wrapped as an SQL/JSON array, or unwrapped by - converting its elements into an SQL/JSON sequence before performing - the operation. 
Also, comparison operators automatically unwrap their - operands in lax mode, so you can compare SQL/JSON arrays - out-of-the-box. An array of size 1 is considered equal to its sole element. - Automatic unwrapping is not performed when: - - - - The path expression contains type() or - size() methods that return the type - and the number of elements in the array, respectively. - - - - - The queried JSON data contain nested arrays. In this case, only - the outermost array is unwrapped, while all the inner arrays - remain unchanged. Thus, implicit unwrapping can only go one - level down within each path evaluation step. - - - - - - - For example, when querying the GPS data listed above, you can - abstract from the fact that it stores an array of segments - when using lax mode: - -=> select jsonb_path_query(:'json', 'lax $.track.segments.location'); - jsonb_path_query -------------------- - [47.763, 13.4034] - [47.706, 13.2635] - - - - - In strict mode, the specified path must exactly match the structure of - the queried JSON document, so using this path - expression will cause an error: - -=> select jsonb_path_query(:'json', 'strict $.track.segments.location'); -ERROR: jsonpath member accessor can only be applied to an object - - To get the same result as in lax mode, you have to explicitly unwrap the - segments array: - -=> select jsonb_path_query(:'json', 'strict $.track.segments[*].location'); - jsonb_path_query -------------------- - [47.763, 13.4034] - [47.706, 13.2635] - - - - - The unwrapping behavior of lax mode can lead to surprising results. For - instance, the following query using the .** accessor - selects every HR value twice: - -=> select jsonb_path_query(:'json', 'lax $.**.HR'); - jsonb_path_query ------------------- - 73 - 135 - 73 - 135 - - This happens because the .** accessor selects both - the segments array and each of its elements, while - the .HR accessor automatically unwraps arrays when - using lax mode. To avoid surprising results, we recommend using - the .** accessor only in strict mode. The - following query selects each HR value just once: - -=> select jsonb_path_query(:'json', 'strict $.**.HR'); - jsonb_path_query ------------------- - 73 - 135 - - - - - The unwrapping of arrays can also lead to unexpected results. Consider this - example, which selects all the location arrays: - -=> select jsonb_path_query(:'json', 'lax $.track.segments[*].location'); - jsonb_path_query -------------------- - [47.763, 13.4034] - [47.706, 13.2635] -(2 rows) - - As expected it returns the full arrays. But applying a filter expression - causes the arrays to be unwrapped to evaluate each item, returning only the - items that match the expression: - -=> select jsonb_path_query(:'json', 'lax $.track.segments[*].location ?(@[*] > 15)'); - jsonb_path_query ------------------- - 47.763 - 47.706 -(2 rows) - - This despite the fact that the full arrays are selected by the path - expression. Use strict mode to restore selecting the arrays: - -=> select jsonb_path_query(:'json', 'strict $.track.segments[*].location ?(@[*] > 15)'); - jsonb_path_query -------------------- - [47.763, 13.4034] - [47.706, 13.2635] -(2 rows) - - - - - - SQL/JSON Path Operators and Methods - - - shows the operators and - methods available in jsonpath. Note that while the unary - operators and methods can be applied to multiple values resulting from a - preceding path step, the binary operators (addition etc.) can only be - applied to single values. 
In lax mode, methods applied to an array will be - executed for each value in the array. The exceptions are - .type() and .size(), which apply to - the array itself. - - - - <type>jsonpath</type> Operators and Methods - - - - - Operator/Method - - - Description - - - Example(s) - - - - - - - - number + number - number - - - Addition - - - jsonb_path_query('[2]', '$[0] + 3') - 5 - - - - - - + number - number - - - Unary plus (no operation); unlike addition, this can iterate over - multiple values - - - jsonb_path_query_array('{"x": [2,3,4]}', '+ $.x') - [2, 3, 4] - - - - - - number - number - number - - - Subtraction - - - jsonb_path_query('[2]', '7 - $[0]') - 5 - - - - - - - number - number - - - Negation; unlike subtraction, this can iterate over - multiple values - - - jsonb_path_query_array('{"x": [2,3,4]}', '- $.x') - [-2, -3, -4] - - - - - - number * number - number - - - Multiplication - - - jsonb_path_query('[4]', '2 * $[0]') - 8 - - - - - - number / number - number - - - Division - - - jsonb_path_query('[8.5]', '$[0] / 2') - 4.2500000000000000 - - - - - - number % number - number - - - Modulo (remainder) - - - jsonb_path_query('[32]', '$[0] % 10') - 2 - - - - - - value . type() - string - - - Type of the JSON item (see json_typeof) - - - jsonb_path_query_array('[1, "2", {}]', '$[*].type()') - ["number", "string", "object"] - - - - - - value . size() - number - - - Size of the JSON item (number of array elements, or 1 if not an - array) - - - jsonb_path_query('{"m": [11, 15]}', '$.m.size()') - 2 - - - - - - value . boolean() - boolean - - - Boolean value converted from a JSON boolean, number, or string - - - jsonb_path_query_array('[1, "yes", false]', '$[*].boolean()') - [true, true, false] - - - - - - value . string() - string - - - String value converted from a JSON boolean, number, string, or - datetime - - - jsonb_path_query_array('[1.23, "xyz", false]', '$[*].string()') - ["1.23", "xyz", "false"] - - - jsonb_path_query('"2023-08-15 12:34:56"', '$.timestamp().string()') - "2023-08-15T12:34:56" - - - - - - value . double() - number - - - Approximate floating-point number converted from a JSON number or - string - - - jsonb_path_query('{"len": "1.9"}', '$.len.double() * 2') - 3.8 - - - - - - number . ceiling() - number - - - Nearest integer greater than or equal to the given number - - - jsonb_path_query('{"h": 1.3}', '$.h.ceiling()') - 2 - - - - - - number . floor() - number - - - Nearest integer less than or equal to the given number - - - jsonb_path_query('{"h": 1.7}', '$.h.floor()') - 1 - - - - - - number . abs() - number - - - Absolute value of the given number - - - jsonb_path_query('{"z": -0.3}', '$.z.abs()') - 0.3 - - - - - - value . bigint() - bigint - - - Big integer value converted from a JSON number or string - - - jsonb_path_query('{"len": "9876543219"}', '$.len.bigint()') - 9876543219 - - - - - - value . decimal( [ precision [ , scale ] ] ) - decimal - - - Rounded decimal value converted from a JSON number or string - (precision and scale must be - integer values) - - - jsonb_path_query('1234.5678', '$.decimal(6, 2)') - 1234.57 - - - - - - value . integer() - integer - - - Integer value converted from a JSON number or string - - - jsonb_path_query('{"len": "12345"}', '$.len.integer()') - 12345 - - - - - - value . number() - numeric - - - Numeric value converted from a JSON number or string - - - jsonb_path_query('{"len": "123.45"}', '$.len.number()') - 123.45 - - - - - - string . 
datetime() - datetime_type - (see note) - - - Date/time value converted from a string - - - jsonb_path_query('["2015-8-1", "2015-08-12"]', '$[*] ? (@.datetime() < "2015-08-2".datetime())') - "2015-8-1" - - - - - - string . datetime(template) - datetime_type - (see note) - - - Date/time value converted from a string using the - specified to_timestamp template - - - jsonb_path_query_array('["12:30", "18:40"]', '$[*].datetime("HH24:MI")') - ["12:30:00", "18:40:00"] - - - - - - string . date() - date - - - Date value converted from a string - - - jsonb_path_query('"2023-08-15"', '$.date()') - "2023-08-15" - - - - - - string . time() - time without time zone - - - Time without time zone value converted from a string - - - jsonb_path_query('"12:34:56"', '$.time()') - "12:34:56" - - - - - - string . time(precision) - time without time zone - - - Time without time zone value converted from a string, with fractional - seconds adjusted to the given precision - - - jsonb_path_query('"12:34:56.789"', '$.time(2)') - "12:34:56.79" - - - - - - string . time_tz() - time with time zone - - - Time with time zone value converted from a string - - - jsonb_path_query('"12:34:56 +05:30"', '$.time_tz()') - "12:34:56+05:30" - - - - - - string . time_tz(precision) - time with time zone - - - Time with time zone value converted from a string, with fractional - seconds adjusted to the given precision - - - jsonb_path_query('"12:34:56.789 +05:30"', '$.time_tz(2)') - "12:34:56.79+05:30" - - - - - - string . timestamp() - timestamp without time zone - - - Timestamp without time zone value converted from a string - - - jsonb_path_query('"2023-08-15 12:34:56"', '$.timestamp()') - "2023-08-15T12:34:56" - - - - - - string . timestamp(precision) - timestamp without time zone - - - Timestamp without time zone value converted from a string, with - fractional seconds adjusted to the given precision - - - jsonb_path_query('"2023-08-15 12:34:56.789"', '$.timestamp(2)') - "2023-08-15T12:34:56.79" - - - - - - string . timestamp_tz() - timestamp with time zone - - - Timestamp with time zone value converted from a string - - - jsonb_path_query('"2023-08-15 12:34:56 +05:30"', '$.timestamp_tz()') - "2023-08-15T12:34:56+05:30" - - - - - - string . timestamp_tz(precision) - timestamp with time zone - - - Timestamp with time zone value converted from a string, with fractional - seconds adjusted to the given precision - - - jsonb_path_query('"2023-08-15 12:34:56.789 +05:30"', '$.timestamp_tz(2)') - "2023-08-15T12:34:56.79+05:30" - - - - - - object . keyvalue() - array - - - The object's key-value pairs, represented as an array of objects - containing three fields: "key", - "value", and "id"; - "id" is a unique identifier of the object the - key-value pair belongs to - - - jsonb_path_query_array('{"x": "20", "y": 32}', '$.keyvalue()') - [{"id": 0, "key": "x", "value": "20"}, {"id": 0, "key": "y", "value": 32}] - - - - -
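To make the single-value restriction on binary operators concrete (a brief sketch, with invented values), compare the unary and binary cases:

=> select jsonb_path_query_array('{"x": [2,3,4]}', '- $.x');
 → [-2, -3, -4]
=> select jsonb_path_query('{"x": [2,3,4]}', '$.x[0] + 1');
 → 3

Applying the binary + directly to the whole array, as in '$.x + 1', instead raises an error, because the left operand is not a single numeric value.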
- - - - The result type of the datetime() and - datetime(template) - methods can be date, timetz, time, - timestamptz, or timestamp. - Both methods determine their result type dynamically. - - - The datetime() method sequentially tries to - match its input string to the ISO formats - for date, timetz, time, - timestamptz, and timestamp. It stops on - the first matching format and emits the corresponding data type. - - - The datetime(template) - method determines the result type according to the fields used in the - provided template string. - - - The datetime() and - datetime(template) methods - use the same parsing rules as the to_timestamp SQL - function does (see ), with three - exceptions. First, these methods don't allow unmatched template - patterns. Second, only the following separators are allowed in the - template string: minus sign, period, solidus (slash), comma, apostrophe, - semicolon, colon and space. Third, separators in the template string - must exactly match the input string. - - - If different date/time types need to be compared, an implicit cast is - applied. A date value can be cast to timestamp - or timestamptz, timestamp can be cast to - timestamptz, and time to timetz. - However, all but the first of these conversions depend on the current - setting, and thus can only be performed - within timezone-aware jsonpath functions. Similarly, other - date/time-related methods that convert strings to date/time types - also do this casting, which may involve the current - setting. Therefore, these conversions can - also only be performed within timezone-aware jsonpath - functions. - - - - - shows the available - filter expression elements. - - - - <type>jsonpath</type> Filter Expression Elements - - - - - Predicate/Value - - - Description - - - Example(s) - - - - - - - - value == value - boolean - - - Equality comparison (this, and the other comparison operators, work on - all JSON scalar values) - - - jsonb_path_query_array('[1, "a", 1, 3]', '$[*] ? (@ == 1)') - [1, 1] - - - jsonb_path_query_array('[1, "a", 1, 3]', '$[*] ? (@ == "a")') - ["a"] - - - - - - value != value - boolean - - - value <> value - boolean - - - Non-equality comparison - - - jsonb_path_query_array('[1, 2, 1, 3]', '$[*] ? (@ != 1)') - [2, 3] - - - jsonb_path_query_array('["a", "b", "c"]', '$[*] ? (@ <> "b")') - ["a", "c"] - - - - - - value < value - boolean - - - Less-than comparison - - - jsonb_path_query_array('[1, 2, 3]', '$[*] ? (@ < 2)') - [1] - - - - - - value <= value - boolean - - - Less-than-or-equal-to comparison - - - jsonb_path_query_array('["a", "b", "c"]', '$[*] ? (@ <= "b")') - ["a", "b"] - - - - - - value > value - boolean - - - Greater-than comparison - - - jsonb_path_query_array('[1, 2, 3]', '$[*] ? (@ > 2)') - [3] - - - - - - value >= value - boolean - - - Greater-than-or-equal-to comparison - - - jsonb_path_query_array('[1, 2, 3]', '$[*] ? (@ >= 2)') - [2, 3] - - - - - - true - boolean - - - JSON constant true - - - jsonb_path_query('[{"name": "John", "parent": false}, {"name": "Chris", "parent": true}]', '$[*] ? (@.parent == true)') - {"name": "Chris", "parent": true} - - - - - - false - boolean - - - JSON constant false - - - jsonb_path_query('[{"name": "John", "parent": false}, {"name": "Chris", "parent": true}]', '$[*] ? 
(@.parent == false)') - {"name": "John", "parent": false} - - - - - - null - value - - - JSON constant null (note that, unlike in SQL, - comparison to null works normally) - - - jsonb_path_query('[{"name": "Mary", "job": null}, {"name": "Michael", "job": "driver"}]', '$[*] ? (@.job == null) .name') - "Mary" - - - - - - boolean && boolean - boolean - - - Boolean AND - - - jsonb_path_query('[1, 3, 7]', '$[*] ? (@ > 1 && @ < 5)') - 3 - - - - - - boolean || boolean - boolean - - - Boolean OR - - - jsonb_path_query('[1, 3, 7]', '$[*] ? (@ < 1 || @ > 5)') - 7 - - - - - - ! boolean - boolean - - - Boolean NOT - - - jsonb_path_query('[1, 3, 7]', '$[*] ? (!(@ < 5))') - 7 - - - - - - boolean is unknown - boolean - - - Tests whether a Boolean condition is unknown. - - - jsonb_path_query('[-1, 2, 7, "foo"]', '$[*] ? ((@ > 0) is unknown)') - "foo" - - - - - - string like_regex string flag string - boolean - - - Tests whether the first operand matches the regular expression - given by the second operand, optionally with modifications - described by a string of flag characters (see - ). - - - jsonb_path_query_array('["abc", "abd", "aBdC", "abdacb", "babc"]', '$[*] ? (@ like_regex "^ab.*c")') - ["abc", "abdacb"] - - - jsonb_path_query_array('["abc", "abd", "aBdC", "abdacb", "babc"]', '$[*] ? (@ like_regex "^ab.*c" flag "i")') - ["abc", "aBdC", "abdacb"] - - - - - - string starts with string - boolean - - - Tests whether the second operand is an initial substring of the first - operand. - - - jsonb_path_query('["John Smith", "Mary Stone", "Bob Johnson"]', '$[*] ? (@ starts with "John")') - "John Smith" - - - - - - exists ( path_expression ) - boolean - - - Tests whether a path expression matches at least one SQL/JSON item. - Returns unknown if the path expression would result - in an error; the second example uses this to avoid a no-such-key error - in strict mode. - - - jsonb_path_query('{"x": [1, 2], "y": [2, 4]}', 'strict $.* ? (exists (@ ? (@[*] > 2)))') - [2, 4] - - - jsonb_path_query_array('{"value": 41}', 'strict $ ? (exists (@.name)) .name') - [] - - - - -
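A brief illustration of how unknown behaves in a plain filter, using the same array as the is unknown example above: the comparison (@ > 0) yields unknown for the string "foo", so that item is quietly dropped rather than causing an error:

=> select jsonb_path_query_array('[-1, 2, 7, "foo"]', '$[*] ? (@ > 0)');
 → [2, 7]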
- - - SQL/JSON Regular Expressions - - - LIKE_REGEX - in SQL/JSON - - - - SQL/JSON path expressions allow matching text to a regular expression - with the like_regex filter. For example, the - following SQL/JSON path query would case-insensitively match all - strings in an array that start with an English vowel: - -$[*] ? (@ like_regex "^[aeiou]" flag "i") - - - - - The optional flag string may include one or more of - the characters - i for case-insensitive match, - m to allow ^ - and $ to match at newlines, - s to allow . to match a newline, - and q to quote the whole pattern (reducing the - behavior to a simple substring match). - - - - The SQL/JSON standard borrows its definition for regular expressions - from the LIKE_REGEX operator, which in turn uses the - XQuery standard. PostgreSQL does not currently support the - LIKE_REGEX operator. Therefore, - the like_regex filter is implemented using the - POSIX regular expression engine described in - . This leads to various minor - discrepancies from standard SQL/JSON behavior, which are cataloged in - . - Note, however, that the flag-letter incompatibilities described there - do not apply to SQL/JSON, as it translates the XQuery flag letters to - match what the POSIX engine expects. - - - - Keep in mind that the pattern argument of like_regex - is a JSON path string literal, written according to the rules given in - . This means in particular that any - backslashes you want to use in the regular expression must be doubled. - For example, to match string values of the root document that contain - only digits: - -$.* ? (@ like_regex "^\\d+$") - - - -
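Putting this together in an actual SQL call (a sketch with invented data; it assumes the default standard_conforming_strings = on, so the backslashes in the single-quoted SQL literal need no further doubling at the SQL level):

=> select jsonb_path_query_array('["42", "abc", "7"]', '$[*] ? (@ like_regex "^\\d+$")');
 jsonb_path_query_array
------------------------
 ["42", "7"]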
- - - SQL/JSON Query Functions - - SQL/JSON functions JSON_EXISTS(), - JSON_QUERY(), and JSON_VALUE() - described in can be used - to query JSON documents. Each of these functions apply a - path_expression (an SQL/JSON path query) to a - context_item (the document). See - for more details on what - the path_expression can contain. The - path_expression can also reference variables, - whose values are specified with their respective names in the - PASSING clause that is supported by each function. - context_item can be a jsonb value - or a character string that can be successfully cast to jsonb. - - - - SQL/JSON Query Functions - - - - - Function signature - - - Description - - - Example(s) - - - - - - - json_exists - -JSON_EXISTS ( -context_item, path_expression - PASSING { value AS varname } , ... -{ TRUE | FALSE | UNKNOWN | ERROR } ON ERROR ) boolean - - - - - - Returns true if the SQL/JSON path_expression - applied to the context_item yields any - items, false otherwise. - - - - - The ON ERROR clause specifies the behavior if - an error occurs during path_expression - evaluation. Specifying ERROR will cause an error to - be thrown with the appropriate message. Other options include - returning boolean values FALSE or - TRUE or the value UNKNOWN which - is actually an SQL NULL. The default when no ON ERROR - clause is specified is to return the boolean value - FALSE. - - - - - Examples: - - - JSON_EXISTS(jsonb '{"key1": [1,2,3]}', 'strict $.key1[*] ? (@ > $x)' PASSING 2 AS x) - t - - - JSON_EXISTS(jsonb '{"a": [1,2,3]}', 'lax $.a[5]' ERROR ON ERROR) - f - - - JSON_EXISTS(jsonb '{"a": [1,2,3]}', 'strict $.a[5]' ERROR ON ERROR) - - -ERROR: jsonpath array subscript is out of bounds - - - - - - json_query - -JSON_QUERY ( -context_item, path_expression - PASSING { value AS varname } , ... - RETURNING data_type FORMAT JSON ENCODING UTF8 - { WITHOUT | WITH { CONDITIONAL | UNCONDITIONAL } } ARRAY WRAPPER - { KEEP | OMIT } QUOTES ON SCALAR STRING - { ERROR | NULL | EMPTY { ARRAY | OBJECT } | DEFAULT expression } ON EMPTY - { ERROR | NULL | EMPTY { ARRAY | OBJECT } | DEFAULT expression } ON ERROR ) jsonb - - - - - - Returns the result of applying the SQL/JSON - path_expression to the - context_item. - - - - - By default, the result is returned as a value of type jsonb, - though the RETURNING clause can be used to return - as some other type to which it can be successfully coerced. - - - - - If the path expression may return multiple values, it might be necessary - to wrap those values using the WITH WRAPPER clause to - make it a valid JSON string, because the default behavior is to not wrap - them, as if WITHOUT WRAPPER were specified. The - WITH WRAPPER clause is by default taken to mean - WITH UNCONDITIONAL WRAPPER, which means that even a - single result value will be wrapped. To apply the wrapper only when - multiple values are present, specify WITH CONDITIONAL WRAPPER. - Getting multiple values in result will be treated as an error if - WITHOUT WRAPPER is specified. - - - - - If the result is a scalar string, by default, the returned value will - be surrounded by quotes, making it a valid JSON value. It can be made - explicit by specifying KEEP QUOTES. Conversely, - quotes can be omitted by specifying OMIT QUOTES. - To ensure that the result is a valid JSON value, OMIT QUOTES - cannot be specified when WITH WRAPPER is also - specified. - - - - - The ON EMPTY clause specifies the behavior if - evaluating path_expression yields an empty - set. 
The ON ERROR clause specifies the behavior - if an error occurs when evaluating path_expression, - when coercing the result value to the RETURNING type, - or when evaluating the ON EMPTY expression if the - path_expression evaluation returns an empty - set. - - - - - For both ON EMPTY and ON ERROR, - specifying ERROR will cause an error to be thrown with - the appropriate message. Other options include returning an SQL NULL, an - empty array (EMPTY ARRAY), - an empty object (EMPTY OBJECT), or a user-specified - expression (DEFAULT expression) - that can be coerced to jsonb or the type specified in RETURNING. - The default when ON EMPTY or ON ERROR - is not specified is to return an SQL NULL value. - - - - - Examples: - - - JSON_QUERY(jsonb '[1,[2,3],null]', 'lax $[*][$off]' PASSING 1 AS off WITH CONDITIONAL WRAPPER) - 3 - - - JSON_QUERY(jsonb '{"a": "[1, 2]"}', 'lax $.a' OMIT QUOTES) - [1, 2] - - - JSON_QUERY(jsonb '{"a": "[1, 2]"}', 'lax $.a' RETURNING int[] OMIT QUOTES ERROR ON ERROR) - - -ERROR: malformed array literal: "[1, 2]" -DETAIL: Missing "]" after array dimensions. - - - - - - - json_value - -JSON_VALUE ( -context_item, path_expression - PASSING { value AS varname } , ... - RETURNING data_type - { ERROR | NULL | DEFAULT expression } ON EMPTY - { ERROR | NULL | DEFAULT expression } ON ERROR ) text - - - - - - Returns the result of applying the SQL/JSON - path_expression to the - context_item. - - - - - Only use JSON_VALUE() if the extracted value is - expected to be a single SQL/JSON scalar item; - getting multiple values will be treated as an error. If you expect that - extracted value might be an object or an array, use the - JSON_QUERY function instead. - - - - - By default, the result, which must be a single scalar value, is - returned as a value of type text, though the - RETURNING clause can be used to return as some - other type to which it can be successfully coerced. - - - - - The ON ERROR and ON EMPTY - clauses have similar semantics as mentioned in the description of - JSON_QUERY, except the set of values returned in - lieu of throwing an error is different. - - - - - Note that scalar strings returned by JSON_VALUE - always have their quotes removed, equivalent to specifying - OMIT QUOTES in JSON_QUERY. - - - - - Examples: - - - JSON_VALUE(jsonb '"123.45"', '$' RETURNING float) - 123.45 - - - JSON_VALUE(jsonb '"03:04 2015-02-01"', '$.datetime("HH24:MI YYYY-MM-DD")' RETURNING date) - 2015-02-01 - - - JSON_VALUE(jsonb '[1,2]', 'strict $[$off]' PASSING 1 as off) - 2 - - - JSON_VALUE(jsonb '[1,2]', 'strict $[*]' DEFAULT 9 ON ERROR) - 9 - - - - - -
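The wrapper rules can be compared side by side in this sketch (values invented):

JSON_QUERY(jsonb '{"a": [1, 2]}', '$.a[*]' WITH WRAPPER) → [1, 2]
JSON_QUERY(jsonb '{"a": [1, 2]}', '$.a[0]' WITH CONDITIONAL WRAPPER) → 1
JSON_QUERY(jsonb '{"a": [1, 2]}', '$.a[0]' WITH UNCONDITIONAL WRAPPER) → [1]
JSON_QUERY(jsonb '{"a": [1, 2]}', '$.a[*]') → NULL

The last call returns SQL NULL because multiple results without a wrapper are treated as an error, and the default ON ERROR behavior is to return NULL.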
- - - The context_item expression is converted to - jsonb by an implicit cast if the expression is not already of - type jsonb. Note, however, that any parsing errors that occur - during that conversion are thrown unconditionally, that is, are not - handled according to the (specified or implicit) ON ERROR - clause. - - - - - JSON_VALUE() returns an SQL NULL if - path_expression returns a JSON - null, whereas JSON_QUERY() returns - the JSON null as is. - - -
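A minimal demonstration of this difference:

JSON_QUERY(jsonb '{"a": null}', '$.a') → null (a JSON null)
JSON_VALUE(jsonb '{"a": null}', '$.a') IS NULL → t (an SQL NULL)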
- - - JSON_TABLE - - json_table - - - - JSON_TABLE is an SQL/JSON function which - queries JSON data - and presents the results as a relational view, which can be accessed as a - regular SQL table. You can use JSON_TABLE inside - the FROM clause of a SELECT, - UPDATE, or DELETE and as data source - in a MERGE statement. - - - - Taking JSON data as input, JSON_TABLE uses a JSON path - expression to extract a part of the provided data to use as a - row pattern for the constructed view. Each SQL/JSON - value given by the row pattern serves as source for a separate row in the - constructed view. - - - - To split the row pattern into columns, JSON_TABLE - provides the COLUMNS clause that defines the - schema of the created view. For each column, a separate JSON path expression - can be specified to be evaluated against the row pattern to get an SQL/JSON - value that will become the value for the specified column in a given output - row. - - - - JSON data stored at a nested level of the row pattern can be extracted using - the NESTED PATH clause. Each - NESTED PATH clause can be used to generate one or more - columns using the data from a nested level of the row pattern. Those - columns can be specified using a COLUMNS clause that - looks similar to the top-level COLUMNS clause. Rows constructed from - NESTED COLUMNS are called child rows and are joined - against the row constructed from the columns specified in the parent - COLUMNS clause to get the row in the final view. Child - columns themselves may contain a NESTED PATH - specification thus allowing to extract data located at arbitrary nesting - levels. Columns produced by multiple NESTED PATHs at the - same level are considered to be siblings of each - other and their rows after joining with the parent row are combined using - UNION. - - - - The rows produced by JSON_TABLE are laterally - joined to the row that generated them, so you do not have to explicitly join - the constructed view with the original table holding JSON - data. - - - - The syntax is: - - - -JSON_TABLE ( - context_item, path_expression AS json_path_name PASSING { value AS varname } , ... - COLUMNS ( json_table_column , ... ) - { ERROR | EMPTY ARRAY} ON ERROR -) - - -where json_table_column is: - - name FOR ORDINALITY - | name type - FORMAT JSON ENCODING UTF8 - PATH path_expression - { WITHOUT | WITH { CONDITIONAL | UNCONDITIONAL } } ARRAY WRAPPER - { KEEP | OMIT } QUOTES ON SCALAR STRING - { ERROR | NULL | EMPTY { ARRAY | OBJECT } | DEFAULT expression } ON EMPTY - { ERROR | NULL | EMPTY { ARRAY | OBJECT } | DEFAULT expression } ON ERROR - | name type EXISTS PATH path_expression - { ERROR | TRUE | FALSE | UNKNOWN } ON ERROR - | NESTED PATH path_expression AS json_path_name COLUMNS ( json_table_column , ... ) - - - - Each syntax element is described below in more detail. - - - - - - context_item, path_expression AS json_path_name PASSING { value AS varname } , ... - - - - The context_item specifies the input document - to query, the path_expression is an SQL/JSON - path expression defining the query, and json_path_name - is an optional name for the path_expression. - The optional PASSING clause provides data values for - the variables mentioned in the path_expression. - The result of the input data evaluation using the aforementioned elements - is called the row pattern, which is used as the - source for row values in the constructed view. - - - - - - - COLUMNS ( json_table_column , ... ) - - - - - The COLUMNS clause defining the schema of the - constructed view. 
In this clause, you can specify each column to be - filled with an SQL/JSON value obtained by applying a JSON path expression - against the row pattern. json_table_column has - the following variants: - - - - - - name FOR ORDINALITY - - - - Adds an ordinality column that provides sequential row numbering starting - from 1. Each NESTED PATH (see below) gets its own - counter for any nested ordinality columns. - - - - - - - name type - FORMAT JSON ENCODING UTF8 - PATH path_expression - - - - Inserts an SQL/JSON value obtained by applying - path_expression against the row pattern into - the view's output row after coercing it to specified - type. - - - Specifying FORMAT JSON makes it explicit that you - expect the value to be a valid json object. It only - makes sense to specify FORMAT JSON if - type is one of bpchar, - bytea, character varying, name, - json, jsonb, text, or a domain over - these types. - - - Optionally, you can specify WRAPPER and - QUOTES clauses to format the output. Note that - specifying OMIT QUOTES overrides - FORMAT JSON if also specified, because unquoted - literals do not constitute valid json values. - - - Optionally, you can use ON EMPTY and - ON ERROR clauses to specify whether to throw the error - or return the specified value when the result of JSON path evaluation is - empty and when an error occurs during JSON path evaluation or when - coercing the SQL/JSON value to the specified type, respectively. The - default for both is to return a NULL value. - - - - This clause is internally turned into and has the same semantics as - JSON_VALUE or JSON_QUERY. - The latter if the specified type is not a scalar type or if either of - FORMAT JSON, WRAPPER, or - QUOTES clause is present. - - - - - - - - name type - EXISTS PATH path_expression - - - - Inserts a boolean value obtained by applying - path_expression against the row pattern - into the view's output row after coercing it to specified - type. - - - The value corresponds to whether applying the PATH - expression to the row pattern yields any values. - - - The specified type should have a cast from the - boolean type. - - - Optionally, you can use ON ERROR to specify whether to - throw the error or return the specified value when an error occurs during - JSON path evaluation or when coercing SQL/JSON value to the specified - type. The default is to return a boolean value - FALSE. - - - - This clause is internally turned into and has the same semantics as - JSON_EXISTS. - - - - - - - - NESTED PATH path_expression AS json_path_name - COLUMNS ( json_table_column , ... ) - - - - - Extracts SQL/JSON values from nested levels of the row pattern, - generates one or more columns as defined by the COLUMNS - subclause, and inserts the extracted SQL/JSON values into those - columns. The json_table_column - expression in the COLUMNS subclause uses the same - syntax as in the parent COLUMNS clause. - - - - The NESTED PATH syntax is recursive, - so you can go down multiple nested levels by specifying several - NESTED PATH subclauses within each other. - It allows to unnest the hierarchy of JSON objects and arrays - in a single function invocation rather than chaining several - JSON_TABLE expressions in an SQL statement. - - - - - - - - In each variant of json_table_column described - above, if the PATH clause is omitted, path expression - $.name is used, where - name is the provided column name. - - - - - - - - - AS json_path_name - - - - - The optional json_path_name serves as an - identifier of the provided path_expression. 
- The name must be unique and distinct from the column names. - - - - - - - { ERROR | EMPTY } ON ERROR - - - - - The optional ON ERROR can be used to specify how to - handle errors when evaluating the top-level - path_expression. Use ERROR - if you want the errors to be thrown and EMPTY to - return an empty table, that is, a table containing 0 rows. Note that - this clause does not affect the errors that occur when evaluating - columns, for which the behavior depends on whether the - ON ERROR clause is specified against a given column. - - - - - - Examples - - - In the examples that follow, the following table containing JSON data - will be used: - - -CREATE TABLE my_films ( js jsonb ); - -INSERT INTO my_films VALUES ( -'{ "favorites" : [ - { "kind" : "comedy", "films" : [ - { "title" : "Bananas", - "director" : "Woody Allen"}, - { "title" : "The Dinner Game", - "director" : "Francis Veber" } ] }, - { "kind" : "horror", "films" : [ - { "title" : "Psycho", - "director" : "Alfred Hitchcock" } ] }, - { "kind" : "thriller", "films" : [ - { "title" : "Vertigo", - "director" : "Alfred Hitchcock" } ] }, - { "kind" : "drama", "films" : [ - { "title" : "Yojimbo", - "director" : "Akira Kurosawa" } ] } - ] }'); - - - - - The following query shows how to use JSON_TABLE to - turn the JSON objects in the my_films table - to a view containing columns for the keys kind, - title, and director contained in - the original JSON along with an ordinality column: - - -SELECT jt.* FROM - my_films, - JSON_TABLE (js, '$.favorites[*]' COLUMNS ( - id FOR ORDINALITY, - kind text PATH '$.kind', - title text PATH '$.films[*].title' WITH WRAPPER, - director text PATH '$.films[*].director' WITH WRAPPER)) AS jt; - - - - id | kind | title | director -----+----------+--------------------------------+---------------------------------- - 1 | comedy | ["Bananas", "The Dinner Game"] | ["Woody Allen", "Francis Veber"] - 2 | horror | ["Psycho"] | ["Alfred Hitchcock"] - 3 | thriller | ["Vertigo"] | ["Alfred Hitchcock"] - 4 | drama | ["Yojimbo"] | ["Akira Kurosawa"] -(4 rows) - - - - - The following is a modified version of the above query to show the - usage of PASSING arguments in the filter specified in - the top-level JSON path expression and the various options for the - individual columns: - - -SELECT jt.* FROM - my_films, - JSON_TABLE (js, '$.favorites[*] ? (@.films[*].director == $filter)' - PASSING 'Alfred Hitchcock' AS filter - COLUMNS ( - id FOR ORDINALITY, - kind text PATH '$.kind', - title text FORMAT JSON PATH '$.films[*].title' OMIT QUOTES, - director text PATH '$.films[*].director' KEEP QUOTES)) AS jt; - - - - id | kind | title | director -----+----------+---------+-------------------- - 1 | horror | Psycho | "Alfred Hitchcock" - 2 | thriller | Vertigo | "Alfred Hitchcock" -(2 rows) - - - - - The following is a modified version of the above query to show the usage - of NESTED PATH for populating title and director - columns, illustrating how they are joined to the parent columns id and - kind: - - -SELECT jt.* FROM - my_films, - JSON_TABLE ( js, '$.favorites[*] ? 
(@.films[*].director == $filter)' - PASSING 'Alfred Hitchcock' AS filter - COLUMNS ( - id FOR ORDINALITY, - kind text PATH '$.kind', - NESTED PATH '$.films[*]' COLUMNS ( - title text FORMAT JSON PATH '$.title' OMIT QUOTES, - director text PATH '$.director' KEEP QUOTES))) AS jt; - - - - id | kind | title | director -----+----------+---------+-------------------- - 1 | horror | Psycho | "Alfred Hitchcock" - 2 | thriller | Vertigo | "Alfred Hitchcock" -(2 rows) - - - - - - The following is the same query but without the filter in the root - path: - - -SELECT jt.* FROM - my_films, - JSON_TABLE ( js, '$.favorites[*]' - COLUMNS ( - id FOR ORDINALITY, - kind text PATH '$.kind', - NESTED PATH '$.films[*]' COLUMNS ( - title text FORMAT JSON PATH '$.title' OMIT QUOTES, - director text PATH '$.director' KEEP QUOTES))) AS jt; - - - - id | kind | title | director -----+----------+-----------------+-------------------- - 1 | comedy | Bananas | "Woody Allen" - 1 | comedy | The Dinner Game | "Francis Veber" - 2 | horror | Psycho | "Alfred Hitchcock" - 3 | thriller | Vertigo | "Alfred Hitchcock" - 4 | drama | Yojimbo | "Akira Kurosawa" -(5 rows) - - - - - - The following shows another query using a different JSON - object as input. It shows the UNION "sibling join" between - NESTED paths $.movies[*] and - $.books[*] and also the usage of - FOR ORDINALITY column at NESTED - levels (columns movie_id, book_id, - and author_id): - - -SELECT * FROM JSON_TABLE ( -'{"favorites": - [{"movies": - [{"name": "One", "director": "John Doe"}, - {"name": "Two", "director": "Don Joe"}], - "books": - [{"name": "Mystery", "authors": [{"name": "Brown Dan"}]}, - {"name": "Wonder", "authors": [{"name": "Jun Murakami"}, {"name":"Craig Doe"}]}] -}]}'::json, '$.favorites[*]' -COLUMNS ( - user_id FOR ORDINALITY, - NESTED '$.movies[*]' - COLUMNS ( - movie_id FOR ORDINALITY, - mname text PATH '$.name', - director text), - NESTED '$.books[*]' - COLUMNS ( - book_id FOR ORDINALITY, - bname text PATH '$.name', - NESTED '$.authors[*]' - COLUMNS ( - author_id FOR ORDINALITY, - author_name text PATH '$.name')))); - - - - user_id | movie_id | mname | director | book_id | bname | author_id | author_name ----------+----------+-------+----------+---------+---------+-----------+-------------- - 1 | 1 | One | John Doe | | | | - 1 | 2 | Two | Don Joe | | | | - 1 | | | | 1 | Mystery | 1 | Brown Dan - 1 | | | | 2 | Wonder | 1 | Jun Murakami - 1 | | | | 2 | Wonder | 2 | Craig Doe -(5 rows) - - - - -
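The rule that an omitted PATH clause defaults to $.name, where name is the column name (used implicitly by the director text column in the last example above), can be seen in this minimal sketch with invented data:

SELECT * FROM JSON_TABLE(jsonb '[{"kind": "comedy"}, {"kind": "horror"}]', '$[*]'
                         COLUMNS (kind text)) AS jt;

  kind
--------
 comedy
 horror
(2 rows)

Here kind text behaves exactly like kind text PATH '$.kind'.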
Sequence Manipulation Functions

This section describes functions for operating on sequence objects, also called sequence generators or just sequences. Sequence objects are special single-row tables created with CREATE SEQUENCE. Sequence objects are commonly used to generate unique identifiers for rows of a table. The sequence functions, listed below, provide simple, multiuser-safe methods for obtaining successive sequence values from sequence objects.

Sequence Functions

nextval ( regclass ) → bigint

Advances the sequence object to its next value and returns that value. This is done atomically: even if multiple sessions execute nextval concurrently, each will safely receive a distinct sequence value. If the sequence object has been created with default parameters, successive nextval calls will return successive values beginning with 1. Other behaviors can be obtained by using appropriate parameters in the CREATE SEQUENCE command.

This function requires USAGE or UPDATE privilege on the sequence.

setval ( regclass, bigint [, boolean ] ) → bigint

Sets the sequence object's current value, and optionally its is_called flag. The two-parameter form sets the sequence's last_value field to the specified value and sets its is_called field to true, meaning that the next nextval will advance the sequence before returning a value. The value that will be reported by currval is also set to the specified value. In the three-parameter form, is_called can be set to either true or false. true has the same effect as the two-parameter form. If it is set to false, the next nextval will return exactly the specified value, and sequence advancement commences with the following nextval. Furthermore, the value reported by currval is not changed in this case. For example,

SELECT setval('myseq', 42);            -- Next nextval will return 43
SELECT setval('myseq', 42, true);      -- Same as above
SELECT setval('myseq', 42, false);     -- Next nextval will return 42

The result returned by setval is just the value of its second argument.

This function requires UPDATE privilege on the sequence.

currval ( regclass ) → bigint

Returns the value most recently obtained by nextval for this sequence in the current session. (An error is reported if nextval has never been called for this sequence in this session.) Because this is returning a session-local value, it gives a predictable answer whether or not other sessions have executed nextval since the current session did.

This function requires USAGE or SELECT privilege on the sequence.

lastval () → bigint

Returns the value most recently returned by nextval in the current session. This function is identical to currval, except that instead of taking the sequence name as an argument it refers to whichever sequence nextval was most recently applied to in the current session. It is an error to call lastval if nextval has not yet been called in the current session.

This function requires USAGE or SELECT privilege on the last used sequence.
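The following self-contained sketch exercises the functions above; serial_demo is a throwaway sequence invented for this illustration:

CREATE SEQUENCE serial_demo;               -- hypothetical, default parameters

SELECT nextval('serial_demo');             -- 1
SELECT nextval('serial_demo');             -- 2
SELECT currval('serial_demo');             -- 2 (session-local)
SELECT lastval();                          -- 2 (last sequence used in this session)
SELECT setval('serial_demo', 10, false);   -- is_called = false
SELECT nextval('serial_demo');             -- 10, exactly the value passed to setval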
- - - - To avoid blocking concurrent transactions that obtain numbers from - the same sequence, the value obtained by nextval - is not reclaimed for re-use if the calling transaction later aborts. - This means that transaction aborts or database crashes can result in - gaps in the sequence of assigned values. That can happen without a - transaction abort, too. For example an INSERT with - an ON CONFLICT clause will compute the to-be-inserted - tuple, including doing any required nextval - calls, before detecting any conflict that would cause it to follow - the ON CONFLICT rule instead. - Thus, PostgreSQL sequence - objects cannot be used to obtain gapless - sequences. - - - - Likewise, sequence state changes made by setval - are immediately visible to other transactions, and are not undone if - the calling transaction rolls back. - - - - If the database cluster crashes before committing a transaction - containing a nextval - or setval call, the sequence state change might - not have made its way to persistent storage, so that it is uncertain - whether the sequence will have its original or updated state after the - cluster restarts. This is harmless for usage of the sequence within - the database, since other effects of uncommitted transactions will not - be visible either. However, if you wish to use a sequence value for - persistent outside-the-database purposes, make sure that the - nextval call has been committed before doing so. - - - - - The sequence to be operated on by a sequence function is specified by - a regclass argument, which is simply the OID of the sequence in the - pg_class system catalog. You do not have to look up the - OID by hand, however, since the regclass data type's input - converter will do the work for you. See - for details. - -
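Continuing the hypothetical serial_demo example, the gap behavior described above looks like this:

BEGIN;
SELECT nextval('serial_demo');   -- 11
ROLLBACK;
SELECT nextval('serial_demo');   -- 12; the value consumed by the aborted
                                 -- transaction is not reused

And because the argument is of type regclass, an explicit cast such as nextval('serial_demo'::regclass) is equivalent to the plain string form shown above; the input converter performs the catalog lookup either way.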
- - - - Conditional Expressions - - - CASE - - - - conditional expression - - - - This section describes the SQL-compliant conditional expressions - available in PostgreSQL. - - - - - If your needs go beyond the capabilities of these conditional - expressions, you might want to consider writing a server-side function - in a more expressive programming language. - - - - - - Although COALESCE, GREATEST, and - LEAST are syntactically similar to functions, they are - not ordinary functions, and thus cannot be used with explicit - VARIADIC array arguments. - - - - - <literal>CASE</literal> - - - The SQL CASE expression is a - generic conditional expression, similar to if/else statements in - other programming languages: - - -CASE WHEN condition THEN result - WHEN ... - ELSE result -END - - - CASE clauses can be used wherever - an expression is valid. Each condition is an - expression that returns a boolean result. If the condition's - result is true, the value of the CASE expression is the - result that follows the condition, and the - remainder of the CASE expression is not processed. If the - condition's result is not true, any subsequent WHEN clauses - are examined in the same manner. If no WHEN - condition yields true, the value of the - CASE expression is the result of the - ELSE clause. If the ELSE clause is - omitted and no condition is true, the result is null. - - - - An example: - -SELECT * FROM test; - - a ---- - 1 - 2 - 3 - - -SELECT a, - CASE WHEN a=1 THEN 'one' - WHEN a=2 THEN 'two' - ELSE 'other' - END - FROM test; - - a | case ----+------- - 1 | one - 2 | two - 3 | other - - - - - The data types of all the result - expressions must be convertible to a single output type. - See for more details. - - - - There is a simple form of CASE expression - that is a variant of the general form above: - - -CASE expression - WHEN value THEN result - WHEN ... - ELSE result -END - - - The first - expression is computed, then compared to - each of the value expressions in the - WHEN clauses until one is found that is equal to it. If - no match is found, the result of the - ELSE clause (or a null value) is returned. This is similar - to the switch statement in C. - - - - The example above can be written using the simple - CASE syntax: - -SELECT a, - CASE a WHEN 1 THEN 'one' - WHEN 2 THEN 'two' - ELSE 'other' - END - FROM test; - - a | case ----+------- - 1 | one - 2 | two - 3 | other - - - - - A CASE expression does not evaluate any subexpressions - that are not needed to determine the result. For example, this is a - possible way of avoiding a division-by-zero failure: - -SELECT ... WHERE CASE WHEN x <> 0 THEN y/x > 1.5 ELSE false END; - - - - - - As described in , there are various - situations in which subexpressions of an expression are evaluated at - different times, so that the principle that CASE - evaluates only necessary subexpressions is not ironclad. For - example a constant 1/0 subexpression will usually result in - a division-by-zero failure at planning time, even if it's within - a CASE arm that would never be entered at run time. - - - - - - <literal>COALESCE</literal> - - - COALESCE - - - - NVL - - - - IFNULL - - - -COALESCE(value , ...) - - - - The COALESCE function returns the first of its - arguments that is not null. Null is returned only if all arguments - are null. It is often used to substitute a default value for - null values when data is retrieved for display, for example: - -SELECT COALESCE(description, short_description, '(none)') ... 
- - This returns description if it is not null, otherwise - short_description if it is not null, otherwise (none). - - - - The arguments must all be convertible to a common data type, which - will be the type of the result (see - for details). - - - - Like a CASE expression, COALESCE only - evaluates the arguments that are needed to determine the result; - that is, arguments to the right of the first non-null argument are - not evaluated. This SQL-standard function provides capabilities similar - to NVL and IFNULL, which are used in some other - database systems. - - - - - <literal>NULLIF</literal> - - - NULLIF - - - -NULLIF(value1, value2) - - - - The NULLIF function returns a null value if - value1 equals value2; - otherwise it returns value1. - This can be used to perform the inverse operation of the - COALESCE example given above: - -SELECT NULLIF(value, '(none)') ... - - In this example, if value is (none), - null is returned, otherwise the value of value - is returned. - - - - The two arguments must be of comparable types. - To be specific, they are compared exactly as if you had - written value1 - = value2, so there must be a - suitable = operator available. - - - - The result has the same type as the first argument — but there is - a subtlety. What is actually returned is the first argument of the - implied = operator, and in some cases that will have - been promoted to match the second argument's type. For - example, NULLIF(1, 2.2) yields numeric, - because there is no integer = - numeric operator, - only numeric = numeric. - - - - - - <literal>GREATEST</literal> and <literal>LEAST</literal> - - - GREATEST - - - LEAST - - - -GREATEST(value , ...) - - -LEAST(value , ...) - - - - The GREATEST and LEAST functions select the - largest or smallest value from a list of any number of expressions. - The expressions must all be convertible to a common data type, which - will be the type of the result - (see for details). - - - - NULL values in the argument list are ignored. The result will be NULL - only if all the expressions evaluate to NULL. (This is a deviation from - the SQL standard. According to the standard, the return value is NULL if - any argument is NULL. Some other databases behave this way.) - - - - - - Array Functions and Operators - - - shows the specialized operators - available for array types. - In addition to those, the usual comparison operators shown in are available for - arrays. The comparison operators compare the array contents - element-by-element, using the default B-tree comparison function for - the element data type, and sort based on the first difference. - In multidimensional arrays the elements are visited in row-major order - (last subscript varies most rapidly). - If the contents of two arrays are equal but the dimensionality is - different, the first difference in the dimensionality information - determines the sort order. - - - - Array Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - anyarray @> anyarray - boolean - - - Does the first array contain the second, that is, does each element - appearing in the second array equal some element of the first array? - (Duplicates are not treated specially, - thus ARRAY[1] and ARRAY[1,1] are - each considered to contain the other.) - - - ARRAY[1,4,3] @> ARRAY[3,1,3] - t - - - - - - anyarray <@ anyarray - boolean - - - Is the first array contained by the second? 
- - - ARRAY[2,2,7] <@ ARRAY[1,7,4,2,6] - t - - - - - - anyarray && anyarray - boolean - - - Do the arrays overlap, that is, have any elements in common? - - - ARRAY[1,4,3] && ARRAY[2,1] - t - - - - - - anycompatiblearray || anycompatiblearray - anycompatiblearray - - - Concatenates the two arrays. Concatenating a null or empty array is a - no-op; otherwise the arrays must have the same number of dimensions - (as illustrated by the first example) or differ in number of - dimensions by one (as illustrated by the second). - If the arrays are not of identical element types, they will be coerced - to a common type (see ). - - - ARRAY[1,2,3] || ARRAY[4,5,6,7] - {1,2,3,4,5,6,7} - - - ARRAY[1,2,3] || ARRAY[[4,5,6],[7,8,9.9]] - {{1,2,3},{4,5,6},{7,8,9.9}} - - - - - - anycompatible || anycompatiblearray - anycompatiblearray - - - Concatenates an element onto the front of an array (which must be - empty or one-dimensional). - - - 3 || ARRAY[4,5,6] - {3,4,5,6} - - - - - - anycompatiblearray || anycompatible - anycompatiblearray - - - Concatenates an element onto the end of an array (which must be - empty or one-dimensional). - - - ARRAY[4,5,6] || 7 - {4,5,6,7} - - - - -
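As an illustration of how these operators are typically used in queries, suppose a table holds an array of tags per row (the posts table and its text[] column tags are hypothetical):

SELECT * FROM posts WHERE tags && ARRAY['sql', 'arrays'];  -- any tag in common
SELECT * FROM posts WHERE tags @> ARRAY['sql', 'arrays'];  -- must contain both tags

The containment and overlap operators can be accelerated by a GIN index on the array column; see the note just below about indexed operations.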
- - - See for more details about array operator - behavior. See for more details about - which operators support indexed operations. - - - - shows the functions - available for use with array types. See - for more information and examples of the use of these functions. - - - - Array Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - array_append - - array_append ( anycompatiblearray, anycompatible ) - anycompatiblearray - - - Appends an element to the end of an array (same as - the anycompatiblearray || anycompatible - operator). - - - array_append(ARRAY[1,2], 3) - {1,2,3} - - - - - - - array_cat - - array_cat ( anycompatiblearray, anycompatiblearray ) - anycompatiblearray - - - Concatenates two arrays (same as - the anycompatiblearray || anycompatiblearray - operator). - - - array_cat(ARRAY[1,2,3], ARRAY[4,5]) - {1,2,3,4,5} - - - - - - - array_dims - - array_dims ( anyarray ) - text - - - Returns a text representation of the array's dimensions. - - - array_dims(ARRAY[[1,2,3], [4,5,6]]) - [1:2][1:3] - - - - - - - array_fill - - array_fill ( anyelement, integer[] - , integer[] ) - anyarray - - - Returns an array filled with copies of the given value, having - dimensions of the lengths specified by the second argument. - The optional third argument supplies lower-bound values for each - dimension (which default to all 1). - - - array_fill(11, ARRAY[2,3]) - {{11,11,11},{11,11,11}} - - - array_fill(7, ARRAY[3], ARRAY[2]) - [2:4]={7,7,7} - - - - - - - array_length - - array_length ( anyarray, integer ) - integer - - - Returns the length of the requested array dimension. - (Produces NULL instead of 0 for empty or missing array dimensions.) - - - array_length(array[1,2,3], 1) - 3 - - - array_length(array[]::int[], 1) - NULL - - - array_length(array['text'], 2) - NULL - - - - - - - array_lower - - array_lower ( anyarray, integer ) - integer - - - Returns the lower bound of the requested array dimension. - - - array_lower('[0:2]={1,2,3}'::integer[], 1) - 0 - - - - - - - array_ndims - - array_ndims ( anyarray ) - integer - - - Returns the number of dimensions of the array. - - - array_ndims(ARRAY[[1,2,3], [4,5,6]]) - 2 - - - - - - - array_position - - array_position ( anycompatiblearray, anycompatible , integer ) - integer - - - Returns the subscript of the first occurrence of the second argument - in the array, or NULL if it's not present. - If the third argument is given, the search begins at that subscript. - The array must be one-dimensional. - Comparisons are done using IS NOT DISTINCT FROM - semantics, so it is possible to search for NULL. - - - array_position(ARRAY['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat'], 'mon') - 2 - - - - - - - array_positions - - array_positions ( anycompatiblearray, anycompatible ) - integer[] - - - Returns an array of the subscripts of all occurrences of the second - argument in the array given as first argument. - The array must be one-dimensional. - Comparisons are done using IS NOT DISTINCT FROM - semantics, so it is possible to search for NULL. - NULL is returned only if the array - is NULL; if the value is not found in the array, an - empty array is returned. - - - array_positions(ARRAY['A','A','B','A'], 'A') - {1,2,4} - - - - - - - array_prepend - - array_prepend ( anycompatible, anycompatiblearray ) - anycompatiblearray - - - Prepends an element to the beginning of an array (same as - the anycompatible || anycompatiblearray - operator). 
- - - array_prepend(1, ARRAY[2,3]) - {1,2,3} - - - - - - - array_remove - - array_remove ( anycompatiblearray, anycompatible ) - anycompatiblearray - - - Removes all elements equal to the given value from the array. - The array must be one-dimensional. - Comparisons are done using IS NOT DISTINCT FROM - semantics, so it is possible to remove NULLs. - - - array_remove(ARRAY[1,2,3,2], 2) - {1,3} - - - - - - - array_replace - - array_replace ( anycompatiblearray, anycompatible, anycompatible ) - anycompatiblearray - - - Replaces each array element equal to the second argument with the - third argument. - - - array_replace(ARRAY[1,2,5,4], 5, 3) - {1,2,3,4} - - - - - - - array_reverse - - array_reverse ( anyarray ) - anyarray - - - Reverses the first dimension of the array. - - - array_reverse(ARRAY[[1,2],[3,4],[5,6]]) - {{5,6},{3,4},{1,2}} - - - - - - - array_sample - - array_sample ( array anyarray, n integer ) - anyarray - - - Returns an array of n items randomly selected - from array. n may not - exceed the length of array's first dimension. - If array is multi-dimensional, - an item is a slice having a given first subscript. - - - array_sample(ARRAY[1,2,3,4,5,6], 3) - {2,6,1} - - - array_sample(ARRAY[[1,2],[3,4],[5,6]], 2) - {{5,6},{1,2}} - - - - - - - array_shuffle - - array_shuffle ( anyarray ) - anyarray - - - Randomly shuffles the first dimension of the array. - - - array_shuffle(ARRAY[[1,2],[3,4],[5,6]]) - {{5,6},{1,2},{3,4}} - - - - - - - array_sort - - array_sort ( - array anyarray - , descending boolean - , nulls_first boolean - ) - anyarray - - - Sorts the first dimension of the array. - The sort order is determined by the default sort ordering of the - array's element type; however, if the element type is collatable, - the collation to use can be specified by adding - a COLLATE clause to - the array argument. - - - If descending is true then sort in - descending order, otherwise ascending order. If omitted, the - default is ascending order. - If nulls_first is true then nulls appear - before non-null values, otherwise nulls appear after non-null - values. - If omitted, nulls_first is taken to have - the same value as descending. - - - array_sort(ARRAY[[2,4],[2,1],[6,5]]) - {{2,1},{2,4},{6,5}} - - - - - - - array_to_string - - array_to_string ( array anyarray, delimiter text , null_string text ) - text - - - Converts each array element to its text representation, and - concatenates those separated by - the delimiter string. - If null_string is given and is - not NULL, then NULL array - entries are represented by that string; otherwise, they are omitted. - See also string_to_array. - - - array_to_string(ARRAY[1, 2, 3, NULL, 5], ',', '*') - 1,2,3,*,5 - - - - - - - array_upper - - array_upper ( anyarray, integer ) - integer - - - Returns the upper bound of the requested array dimension. - - - array_upper(ARRAY[1,8,3,7], 1) - 4 - - - - - - - cardinality - - cardinality ( anyarray ) - integer - - - Returns the total number of elements in the array, or 0 if the array - is empty. - - - cardinality(ARRAY[[1,2],[3,4]]) - 4 - - - - - - - trim_array - - trim_array ( array anyarray, n integer ) - anyarray - - - Trims an array by removing the last n elements. - If the array is multidimensional, only the first dimension is trimmed. - - - trim_array(ARRAY[1,2,3,4,5,6], 2) - {1,2,3,4} - - - - - - - unnest - - unnest ( anyarray ) - setof anyelement - - - Expands an array into a set of rows. - The array's elements are read out in storage order. 
- - - unnest(ARRAY[1,2]) - - - 1 - 2 - - - - unnest(ARRAY[['foo','bar'],['baz','quux']]) - - - foo - bar - baz - quux - - - - - - - unnest ( anyarray, anyarray , ... ) - setof anyelement, anyelement [, ... ] - - - Expands multiple arrays (possibly of different data types) into a set of - rows. If the arrays are not all the same length then the shorter ones - are padded with NULLs. This form is only allowed - in a query's FROM clause; see . - - - select * from unnest(ARRAY[1,2], ARRAY['foo','bar','baz']) as x(a,b) - - - a | b ----+----- - 1 | foo - 2 | bar - | baz - - - - - -
See also the aggregate function array_agg, described under Aggregate Functions below, for use with arrays.
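As a further illustration, unnest and the aggregate array_agg can be combined to rebuild an array element by element; the following (purely illustrative) query deduplicates and sorts the elements of a constant array:

SELECT array_agg(DISTINCT x ORDER BY x)
FROM unnest(ARRAY[3, 1, 3, 2]) AS t(x);

 array_agg
-----------
 {1,2,3}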
- - - Range/Multirange Functions and Operators - - - See for an overview of range types. - - - - shows the specialized operators - available for range types. - shows the specialized operators - available for multirange types. - In addition to those, the usual comparison operators shown in - are available for range - and multirange types. The comparison operators order first by the range lower - bounds, and only if those are equal do they compare the upper bounds. The - multirange operators compare each range until one is unequal. This - does not usually result in a useful overall ordering, but the operators are - provided to allow unique indexes to be constructed on ranges. - - - - Range Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - anyrange @> anyrange - boolean - - - Does the first range contain the second? - - - int4range(2,4) @> int4range(2,3) - t - - - - - - anyrange @> anyelement - boolean - - - Does the range contain the element? - - - '[2011-01-01,2011-03-01)'::tsrange @> '2011-01-10'::timestamp - t - - - - - - anyrange <@ anyrange - boolean - - - Is the first range contained by the second? - - - int4range(2,4) <@ int4range(1,7) - t - - - - - - anyelement <@ anyrange - boolean - - - Is the element contained in the range? - - - 42 <@ int4range(1,7) - f - - - - - - anyrange && anyrange - boolean - - - Do the ranges overlap, that is, have any elements in common? - - - int8range(3,7) && int8range(4,12) - t - - - - - - anyrange << anyrange - boolean - - - Is the first range strictly left of the second? - - - int8range(1,10) << int8range(100,110) - t - - - - - - anyrange >> anyrange - boolean - - - Is the first range strictly right of the second? - - - int8range(50,60) >> int8range(20,30) - t - - - - - - anyrange &< anyrange - boolean - - - Does the first range not extend to the right of the second? - - - int8range(1,20) &< int8range(18,20) - t - - - - - - anyrange &> anyrange - boolean - - - Does the first range not extend to the left of the second? - - - int8range(7,20) &> int8range(5,10) - t - - - - - - anyrange -|- anyrange - boolean - - - Are the ranges adjacent? - - - numrange(1.1,2.2) -|- numrange(2.2,3.3) - t - - - - - - anyrange + anyrange - anyrange - - - Computes the union of the ranges. The ranges must overlap or be - adjacent, so that the union is a single range (but - see range_merge()). - - - numrange(5,15) + numrange(10,20) - [5,20) - - - - - - anyrange * anyrange - anyrange - - - Computes the intersection of the ranges. - - - int8range(5,15) * int8range(10,20) - [10,15) - - - - - - anyrange - anyrange - anyrange - - - Computes the difference of the ranges. The second range must not be - contained in the first in such a way that the difference would not be - a single range. - - - int8range(5,15) - int8range(10,20) - [5,10) - - - - -
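These operators work directly on literal values, for example:

SELECT numrange(1.0, 14.0) @> 3.5;   -- t

The overlap operator is the usual tool for interval queries against a table (in this sketch, the reservation table and its during column of type tsrange are hypothetical):

SELECT * FROM reservation
WHERE during && tsrange('2024-01-10 14:00', '2024-01-10 15:00');

Such queries can be supported by a GiST index on the range column.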
- - - Multirange Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - anymultirange @> anymultirange - boolean - - - Does the first multirange contain the second? - - - '{[2,4)}'::int4multirange @> '{[2,3)}'::int4multirange - t - - - - - - anymultirange @> anyrange - boolean - - - Does the multirange contain the range? - - - '{[2,4)}'::int4multirange @> int4range(2,3) - t - - - - - - anymultirange @> anyelement - boolean - - - Does the multirange contain the element? - - - '{[2011-01-01,2011-03-01)}'::tsmultirange @> '2011-01-10'::timestamp - t - - - - - - anyrange @> anymultirange - boolean - - - Does the range contain the multirange? - - - '[2,4)'::int4range @> '{[2,3)}'::int4multirange - t - - - - - - anymultirange <@ anymultirange - boolean - - - Is the first multirange contained by the second? - - - '{[2,4)}'::int4multirange <@ '{[1,7)}'::int4multirange - t - - - - - - anymultirange <@ anyrange - boolean - - - Is the multirange contained by the range? - - - '{[2,4)}'::int4multirange <@ int4range(1,7) - t - - - - - - anyrange <@ anymultirange - boolean - - - Is the range contained by the multirange? - - - int4range(2,4) <@ '{[1,7)}'::int4multirange - t - - - - - - anyelement <@ anymultirange - boolean - - - Is the element contained by the multirange? - - - 4 <@ '{[1,7)}'::int4multirange - t - - - - - - anymultirange && anymultirange - boolean - - - Do the multiranges overlap, that is, have any elements in common? - - - '{[3,7)}'::int8multirange && '{[4,12)}'::int8multirange - t - - - - - - anymultirange && anyrange - boolean - - - Does the multirange overlap the range? - - - '{[3,7)}'::int8multirange && int8range(4,12) - t - - - - - - anyrange && anymultirange - boolean - - - Does the range overlap the multirange? - - - int8range(3,7) && '{[4,12)}'::int8multirange - t - - - - - - anymultirange << anymultirange - boolean - - - Is the first multirange strictly left of the second? - - - '{[1,10)}'::int8multirange << '{[100,110)}'::int8multirange - t - - - - - - anymultirange << anyrange - boolean - - - Is the multirange strictly left of the range? - - - '{[1,10)}'::int8multirange << int8range(100,110) - t - - - - - - anyrange << anymultirange - boolean - - - Is the range strictly left of the multirange? - - - int8range(1,10) << '{[100,110)}'::int8multirange - t - - - - - - anymultirange >> anymultirange - boolean - - - Is the first multirange strictly right of the second? - - - '{[50,60)}'::int8multirange >> '{[20,30)}'::int8multirange - t - - - - - - anymultirange >> anyrange - boolean - - - Is the multirange strictly right of the range? - - - '{[50,60)}'::int8multirange >> int8range(20,30) - t - - - - - - anyrange >> anymultirange - boolean - - - Is the range strictly right of the multirange? - - - int8range(50,60) >> '{[20,30)}'::int8multirange - t - - - - - - anymultirange &< anymultirange - boolean - - - Does the first multirange not extend to the right of the second? - - - '{[1,20)}'::int8multirange &< '{[18,20)}'::int8multirange - t - - - - - - anymultirange &< anyrange - boolean - - - Does the multirange not extend to the right of the range? - - - '{[1,20)}'::int8multirange &< int8range(18,20) - t - - - - - - anyrange &< anymultirange - boolean - - - Does the range not extend to the right of the multirange? - - - int8range(1,20) &< '{[18,20)}'::int8multirange - t - - - - - - anymultirange &> anymultirange - boolean - - - Does the first multirange not extend to the left of the second? 
- - - '{[7,20)}'::int8multirange &> '{[5,10)}'::int8multirange - t - - - - - - anymultirange &> anyrange - boolean - - - Does the multirange not extend to the left of the range? - - - '{[7,20)}'::int8multirange &> int8range(5,10) - t - - - - - - anyrange &> anymultirange - boolean - - - Does the range not extend to the left of the multirange? - - - int8range(7,20) &> '{[5,10)}'::int8multirange - t - - - - - - anymultirange -|- anymultirange - boolean - - - Are the multiranges adjacent? - - - '{[1.1,2.2)}'::nummultirange -|- '{[2.2,3.3)}'::nummultirange - t - - - - - - anymultirange -|- anyrange - boolean - - - Is the multirange adjacent to the range? - - - '{[1.1,2.2)}'::nummultirange -|- numrange(2.2,3.3) - t - - - - - - anyrange -|- anymultirange - boolean - - - Is the range adjacent to the multirange? - - - numrange(1.1,2.2) -|- '{[2.2,3.3)}'::nummultirange - t - - - - - - anymultirange + anymultirange - anymultirange - - - Computes the union of the multiranges. The multiranges need not overlap - or be adjacent. - - - '{[5,10)}'::nummultirange + '{[15,20)}'::nummultirange - {[5,10), [15,20)} - - - - - - anymultirange * anymultirange - anymultirange - - - Computes the intersection of the multiranges. - - - '{[5,15)}'::int8multirange * '{[10,20)}'::int8multirange - {[10,15)} - - - - - - anymultirange - anymultirange - anymultirange - - - Computes the difference of the multiranges. - - - '{[5,20)}'::int8multirange - '{[10,15)}'::int8multirange - {[5,10), [15,20)} - - - - -
- - - The left-of/right-of/adjacent operators always return false when an empty - range or multirange is involved; that is, an empty range is not considered to - be either before or after any other range. - - - - Elsewhere empty ranges and multiranges are treated as the additive identity: - anything unioned with an empty value is itself. Anything minus an empty - value is itself. An empty multirange has exactly the same points as an empty - range. Every range contains the empty range. Every multirange contains as many - empty ranges as you like. - - - - The range union and difference operators will fail if the resulting range would - need to contain two disjoint sub-ranges, as such a range cannot be - represented. There are separate operators for union and difference that take - multirange parameters and return a multirange, and they do not fail even if - their arguments are disjoint. So if you need a union or difference operation - for ranges that may be disjoint, you can avoid errors by first casting your - ranges to multiranges. - - - - shows the functions - available for use with range types. - shows the functions - available for use with multirange types. - - - - Range Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - lower - - lower ( anyrange ) - anyelement - - - Extracts the lower bound of the range (NULL if the - range is empty or has no lower bound). - - - lower(numrange(1.1,2.2)) - 1.1 - - - - - - - upper - - upper ( anyrange ) - anyelement - - - Extracts the upper bound of the range (NULL if the - range is empty or has no upper bound). - - - upper(numrange(1.1,2.2)) - 2.2 - - - - - - - isempty - - isempty ( anyrange ) - boolean - - - Is the range empty? - - - isempty(numrange(1.1,2.2)) - f - - - - - - - lower_inc - - lower_inc ( anyrange ) - boolean - - - Is the range's lower bound inclusive? - - - lower_inc(numrange(1.1,2.2)) - t - - - - - - - upper_inc - - upper_inc ( anyrange ) - boolean - - - Is the range's upper bound inclusive? - - - upper_inc(numrange(1.1,2.2)) - f - - - - - - - lower_inf - - lower_inf ( anyrange ) - boolean - - - Does the range have no lower bound? (A lower bound of - -Infinity returns false.) - - - lower_inf('(,)'::daterange) - t - - - - - - - upper_inf - - upper_inf ( anyrange ) - boolean - - - Does the range have no upper bound? (An upper bound of - Infinity returns false.) - - - upper_inf('(,)'::daterange) - t - - - - - - - range_merge - - range_merge ( anyrange, anyrange ) - anyrange - - - Computes the smallest range that includes both of the given ranges. - - - range_merge('[1,2)'::int4range, '[3,4)'::int4range) - [1,4) - - - - -
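As noted above, a range union that would produce a disjoint result raises an error, while the multirange equivalent does not; the multirange function shown in the next table provides an easy conversion. A minimal sketch:

SELECT int4range(1, 3) + int4range(5, 8);
-- fails: the result would not be a single contiguous range

SELECT multirange(int4range(1, 3)) + multirange(int4range(5, 8));
-- {[1,3), [5,8)}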
- - - Multirange Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - lower - - lower ( anymultirange ) - anyelement - - - Extracts the lower bound of the multirange (NULL if the - multirange is empty or has no lower bound). - - - lower('{[1.1,2.2)}'::nummultirange) - 1.1 - - - - - - - upper - - upper ( anymultirange ) - anyelement - - - Extracts the upper bound of the multirange (NULL if the - multirange is empty or has no upper bound). - - - upper('{[1.1,2.2)}'::nummultirange) - 2.2 - - - - - - - isempty - - isempty ( anymultirange ) - boolean - - - Is the multirange empty? - - - isempty('{[1.1,2.2)}'::nummultirange) - f - - - - - - - lower_inc - - lower_inc ( anymultirange ) - boolean - - - Is the multirange's lower bound inclusive? - - - lower_inc('{[1.1,2.2)}'::nummultirange) - t - - - - - - - upper_inc - - upper_inc ( anymultirange ) - boolean - - - Is the multirange's upper bound inclusive? - - - upper_inc('{[1.1,2.2)}'::nummultirange) - f - - - - - - - lower_inf - - lower_inf ( anymultirange ) - boolean - - - Does the multirange have no lower bound? (A lower bound of - -Infinity returns false.) - - - lower_inf('{(,)}'::datemultirange) - t - - - - - - - upper_inf - - upper_inf ( anymultirange ) - boolean - - - Does the multirange have no upper bound? (An upper bound of - Infinity returns false.) - - - upper_inf('{(,)}'::datemultirange) - t - - - - - - - range_merge - - range_merge ( anymultirange ) - anyrange - - - Computes the smallest range that includes the entire multirange. - - - range_merge('{[1,2), [3,4)}'::int4multirange) - [1,4) - - - - - - - multirange (function) - - multirange ( anyrange ) - anymultirange - - - Returns a multirange containing just the given range. - - - multirange('[1,2)'::int4range) - {[1,2)} - - - - - - - unnest - for multirange - - unnest ( anymultirange ) - setof anyrange - - - Expands a multirange into a set of ranges in ascending order. - - - unnest('{[1,2), [3,4)}'::int4multirange) - - - [1,2) - [3,4) - - - - - -
- - - The lower_inc, upper_inc, - lower_inf, and upper_inf - functions all return false for an empty range or multirange. - -
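A quick sketch of the empty-range behavior just described, together with bound extraction:

SELECT isempty('empty'::int4range);    -- t
SELECT lower_inc('empty'::int4range);  -- f
SELECT lower('empty'::int4range);      -- NULL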
- - - Aggregate Functions - - - aggregate function - built-in - - - - Aggregate functions compute a single result - from a set of input values. The built-in general-purpose aggregate - functions are listed in - while statistical aggregates are in . - The built-in within-group ordered-set aggregate functions - are listed in - while the built-in within-group hypothetical-set ones are in . Grouping operations, - which are closely related to aggregate functions, are listed in - . - The special syntax considerations for aggregate - functions are explained in . - Consult for additional introductory - information. - - - - Aggregate functions that support Partial Mode - are eligible to participate in various optimizations, such as parallel - aggregation. - - - - While all aggregates below accept an optional - ORDER BY clause (as outlined in ), the clause has only been added to - aggregates whose output is affected by ordering. - - - - General-Purpose Aggregate Functions - - - - - - - Function - - - Description - - Partial Mode - - - - - - - - any_value - - any_value ( anyelement ) - same as input type - - - Returns an arbitrary value from the non-null input values. - - Yes - - - - - - array_agg - - array_agg ( anynonarray ORDER BY input_sort_columns ) - anyarray - - - Collects all the input values, including nulls, into an array. - - Yes - - - - - array_agg ( anyarray ORDER BY input_sort_columns ) - anyarray - - - Concatenates all the input arrays into an array of one higher - dimension. (The inputs must all have the same dimensionality, and - cannot be empty or null.) - - Yes - - - - - - average - - - avg - - avg ( smallint ) - numeric - - - avg ( integer ) - numeric - - - avg ( bigint ) - numeric - - - avg ( numeric ) - numeric - - - avg ( real ) - double precision - - - avg ( double precision ) - double precision - - - avg ( interval ) - interval - - - Computes the average (arithmetic mean) of all the non-null input - values. - - Yes - - - - - - bit_and - - bit_and ( smallint ) - smallint - - - bit_and ( integer ) - integer - - - bit_and ( bigint ) - bigint - - - bit_and ( bit ) - bit - - - Computes the bitwise AND of all non-null input values. - - Yes - - - - - - bit_or - - bit_or ( smallint ) - smallint - - - bit_or ( integer ) - integer - - - bit_or ( bigint ) - bigint - - - bit_or ( bit ) - bit - - - Computes the bitwise OR of all non-null input values. - - Yes - - - - - - bit_xor - - bit_xor ( smallint ) - smallint - - - bit_xor ( integer ) - integer - - - bit_xor ( bigint ) - bigint - - - bit_xor ( bit ) - bit - - - Computes the bitwise exclusive OR of all non-null input values. - Can be useful as a checksum for an unordered set of values. - - Yes - - - - - - bool_and - - bool_and ( boolean ) - boolean - - - Returns true if all non-null input values are true, otherwise false. - - Yes - - - - - - bool_or - - bool_or ( boolean ) - boolean - - - Returns true if any non-null input value is true, otherwise false. - - Yes - - - - - - count - - count ( * ) - bigint - - - Computes the number of input rows. - - Yes - - - - - count ( "any" ) - bigint - - - Computes the number of input rows in which the input value is not - null. - - Yes - - - - - - every - - every ( boolean ) - boolean - - - This is the SQL standard's equivalent to bool_and. - - Yes - - - - - - json_agg - - json_agg ( anyelement ORDER BY input_sort_columns ) - json - - - - jsonb_agg - - jsonb_agg ( anyelement ORDER BY input_sort_columns ) - jsonb - - - Collects all the input values, including nulls, into a JSON array. 
- Values are converted to JSON as per to_json - or to_jsonb. - - No - - - - - - json_agg_strict - - json_agg_strict ( anyelement ) - json - - - - jsonb_agg_strict - - jsonb_agg_strict ( anyelement ) - jsonb - - - Collects all the input values, skipping nulls, into a JSON array. - Values are converted to JSON as per to_json - or to_jsonb. - - No - - - - - json_arrayagg - json_arrayagg ( - value_expression - ORDER BY sort_expression - { NULL | ABSENT } ON NULL - RETURNING data_type FORMAT JSON ENCODING UTF8 ) - - - Behaves in the same way as json_array - but as an aggregate function so it only takes one - value_expression parameter. - If ABSENT ON NULL is specified, any NULL - values are omitted. - If ORDER BY is specified, the elements will - appear in the array in that order rather than in the input order. - - - SELECT json_arrayagg(v) FROM (VALUES(2),(1)) t(v) - [2, 1] - - No - - - - - json_objectagg - json_objectagg ( - { key_expression { VALUE | ':' } value_expression } - { NULL | ABSENT } ON NULL - { WITH | WITHOUT } UNIQUE KEYS - RETURNING data_type FORMAT JSON ENCODING UTF8 ) - - - Behaves like json_object, but as an - aggregate function, so it only takes one - key_expression and one - value_expression parameter. - - - SELECT json_objectagg(k:v) FROM (VALUES ('a'::text,current_date),('b',current_date + 1)) AS t(k,v) - { "a" : "2022-05-10", "b" : "2022-05-11" } - - No - - - - - - json_object_agg - - json_object_agg ( key - "any", value - "any" - ORDER BY input_sort_columns ) - json - - - - jsonb_object_agg - - jsonb_object_agg ( key - "any", value - "any" - ORDER BY input_sort_columns ) - jsonb - - - Collects all the key/value pairs into a JSON object. Key arguments - are coerced to text; value arguments are converted as per - to_json or to_jsonb. - Values can be null, but keys cannot. - - No - - - - - - json_object_agg_strict - - json_object_agg_strict ( - key "any", - value "any" ) - json - - - - jsonb_object_agg_strict - - jsonb_object_agg_strict ( - key "any", - value "any" ) - jsonb - - - Collects all the key/value pairs into a JSON object. Key arguments - are coerced to text; value arguments are converted as per - to_json or to_jsonb. - The key can not be null. If the - value is null then the entry is skipped, - - No - - - - - - json_object_agg_unique - - json_object_agg_unique ( - key "any", - value "any" ) - json - - - - jsonb_object_agg_unique - - jsonb_object_agg_unique ( - key "any", - value "any" ) - jsonb - - - Collects all the key/value pairs into a JSON object. Key arguments - are coerced to text; value arguments are converted as per - to_json or to_jsonb. - Values can be null, but keys cannot. - If there is a duplicate key an error is thrown. - - No - - - - - - json_object_agg_unique_strict - - json_object_agg_unique_strict ( - key "any", - value "any" ) - json - - - - jsonb_object_agg_unique_strict - - jsonb_object_agg_unique_strict ( - key "any", - value "any" ) - jsonb - - - Collects all the key/value pairs into a JSON object. Key arguments - are coerced to text; value arguments are converted as per - to_json or to_jsonb. - The key can not be null. If the - value is null then the entry is skipped. - If there is a duplicate key an error is thrown. - - No - - - - - - max - - max ( see text ) - same as input type - - - Computes the maximum of the non-null input - values. Available for any numeric, string, date/time, or enum type, - as well as bytea, inet, interval, - money, oid, pg_lsn, - tid, xid8, - and also arrays and composite types containing sortable data types. 
- - Yes - - - - - - min - - min ( see text ) - same as input type - - - Computes the minimum of the non-null input - values. Available for any numeric, string, date/time, or enum type, - as well as bytea, inet, interval, - money, oid, pg_lsn, - tid, xid8, - and also arrays and composite types containing sortable data types. - - Yes - - - - - - range_agg - - range_agg ( value - anyrange ) - anymultirange - - - range_agg ( value - anymultirange ) - anymultirange - - - Computes the union of the non-null input values. - - No - - - - - - range_intersect_agg - - range_intersect_agg ( value - anyrange ) - anyrange - - - range_intersect_agg ( value - anymultirange ) - anymultirange - - - Computes the intersection of the non-null input values. - - No - - - - - - string_agg - - string_agg ( value - text, delimiter text ) - text - - - string_agg ( value - bytea, delimiter bytea - ORDER BY input_sort_columns ) - bytea - - - Concatenates the non-null input values into a string. Each value - after the first is preceded by the - corresponding delimiter (if it's not null). - - Yes - - - - - - sum - - sum ( smallint ) - bigint - - - sum ( integer ) - bigint - - - sum ( bigint ) - numeric - - - sum ( numeric ) - numeric - - - sum ( real ) - real - - - sum ( double precision ) - double precision - - - sum ( interval ) - interval - - - sum ( money ) - money - - - Computes the sum of the non-null input values. - - Yes - - - - - - xmlagg - - xmlagg ( xml ORDER BY input_sort_columns ) - xml - - - Concatenates the non-null XML input values (see - ). - - No - - - -
It should be noted that except for count, these functions return a null value when no rows are selected. In particular, sum of no rows returns null, not zero as one might expect, and array_agg returns null rather than an empty array when there are no input rows. The coalesce function can be used to substitute zero or an empty array for null when necessary.

The aggregate functions array_agg, json_agg, jsonb_agg, json_agg_strict, jsonb_agg_strict, json_object_agg, jsonb_object_agg, json_object_agg_strict, jsonb_object_agg_strict, json_object_agg_unique, jsonb_object_agg_unique, json_object_agg_unique_strict, jsonb_object_agg_unique_strict, string_agg, and xmlagg, as well as similar user-defined aggregate functions, produce meaningfully different result values depending on the order of the input values. This ordering is unspecified by default, but can be controlled by writing an ORDER BY clause within the aggregate call, as shown in the syntax documentation for aggregate expressions. Alternatively, supplying the input values from a sorted subquery will usually work. For example:

SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;

Beware that this approach can fail if the outer query level contains additional processing, such as a join, because that might cause the subquery's output to be reordered before the aggregate is computed.

The boolean aggregates bool_and and bool_or correspond to the standard SQL aggregates every and any or some. PostgreSQL supports every, but not any or some, because there is an ambiguity built into the standard syntax:

SELECT b1 = ANY((SELECT b2 FROM t2 ...)) FROM t1 ...;

Here ANY can be considered either as introducing a subquery, or as being an aggregate function, if the subquery returns one row with a Boolean value. Thus the standard name cannot be given to these aggregates.

Users accustomed to working with other SQL database management systems might be disappointed by the performance of the count aggregate when it is applied to the entire table. A query like:

SELECT count(*) FROM sometable;

will require effort proportional to the size of the table: PostgreSQL will need to scan either the entire table or the entirety of an index that includes all rows in the table.

The table below shows aggregate functions typically used in statistical analysis. (These are separated out merely to avoid cluttering the listing of more-commonly-used aggregates.) Functions shown as accepting numeric_type are available for all the types smallint, integer, bigint, numeric, real, and double precision. Where the description mentions N, it means the number of input rows for which all the input expressions are non-null. In all cases, null is returned if the computation is meaningless, for example when N is zero.

Aggregate Functions for Statistics

Function - Description - Partial Mode

corr ( Y double precision, X double precision ) - double precision
Computes the correlation coefficient.
- Yes

covar_pop ( Y double precision, X double precision ) - double precision
Computes the population covariance.
- Yes

covar_samp ( Y double precision, X double precision ) - double precision
Computes the sample covariance.
- - Yes - - - - - - regr_avgx - - regr_avgx ( Y double precision, X double precision ) - double precision - - - Computes the average of the independent variable, - sum(X)/N. - - Yes - - - - - - regr_avgy - - regr_avgy ( Y double precision, X double precision ) - double precision - - - Computes the average of the dependent variable, - sum(Y)/N. - - Yes - - - - - - regr_count - - regr_count ( Y double precision, X double precision ) - bigint - - - Computes the number of rows in which both inputs are non-null. - - Yes - - - - - - regression intercept - - - regr_intercept - - regr_intercept ( Y double precision, X double precision ) - double precision - - - Computes the y-intercept of the least-squares-fit linear equation - determined by the - (X, Y) pairs. - - Yes - - - - - - regr_r2 - - regr_r2 ( Y double precision, X double precision ) - double precision - - - Computes the square of the correlation coefficient. - - Yes - - - - - - regression slope - - - regr_slope - - regr_slope ( Y double precision, X double precision ) - double precision - - - Computes the slope of the least-squares-fit linear equation determined - by the (X, Y) - pairs. - - Yes - - - - - - regr_sxx - - regr_sxx ( Y double precision, X double precision ) - double precision - - - Computes the sum of squares of the independent - variable, - sum(X^2) - sum(X)^2/N. - - Yes - - - - - - regr_sxy - - regr_sxy ( Y double precision, X double precision ) - double precision - - - Computes the sum of products of independent times - dependent variables, - sum(X*Y) - sum(X) * sum(Y)/N. - - Yes - - - - - - regr_syy - - regr_syy ( Y double precision, X double precision ) - double precision - - - Computes the sum of squares of the dependent - variable, - sum(Y^2) - sum(Y)^2/N. - - Yes - - - - - - standard deviation - - - stddev - - stddev ( numeric_type ) - double precision - for real or double precision, - otherwise numeric - - - This is a historical alias for stddev_samp. - - Yes - - - - - - standard deviation - population - - - stddev_pop - - stddev_pop ( numeric_type ) - double precision - for real or double precision, - otherwise numeric - - - Computes the population standard deviation of the input values. - - Yes - - - - - - standard deviation - sample - - - stddev_samp - - stddev_samp ( numeric_type ) - double precision - for real or double precision, - otherwise numeric - - - Computes the sample standard deviation of the input values. - - Yes - - - - - - variance - - variance ( numeric_type ) - double precision - for real or double precision, - otherwise numeric - - - This is a historical alias for var_samp. - - Yes - - - - - - variance - population - - - var_pop - - var_pop ( numeric_type ) - double precision - for real or double precision, - otherwise numeric - - - Computes the population variance of the input values (square of the - population standard deviation). - - Yes - - - - - - variance - sample - - - var_samp - - var_samp ( numeric_type ) - double precision - for real or double precision, - otherwise numeric - - - Computes the sample variance of the input values (square of the sample - standard deviation). - - Yes - - - -
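For example, the linear-regression aggregates compute a least-squares fit in a single pass over the data (the three sample points below are illustrative, and the commented results are approximate):

SELECT regr_slope(y, x) AS slope,
       regr_intercept(y, x) AS intercept,
       corr(y, x) AS r
FROM (VALUES (1.0, 2.0), (2.0, 3.9), (3.0, 6.1)) AS t(x, y);

-- slope approximately 2.05, intercept approximately -0.10, r approximately 0.999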
- - - shows some - aggregate functions that use the ordered-set aggregate - syntax. These functions are sometimes referred to as inverse - distribution functions. Their aggregated input is introduced by - ORDER BY, and they may also take a direct - argument that is not aggregated, but is computed only once. - All these functions ignore null values in their aggregated input. - For those that take a fraction parameter, the - fraction value must be between 0 and 1; an error is thrown if not. - However, a null fraction value simply produces a - null result. - - - - ordered-set aggregate - built-in - - - inverse distribution - - - - Ordered-Set Aggregate Functions - - - - - - - Function - - - Description - - Partial Mode - - - - - - - - mode - statistical - - mode () WITHIN GROUP ( ORDER BY anyelement ) - anyelement - - - Computes the mode, the most frequent - value of the aggregated argument (arbitrarily choosing the first one - if there are multiple equally-frequent values). The aggregated - argument must be of a sortable type. - - No - - - - - - percentile - continuous - - percentile_cont ( fraction double precision ) WITHIN GROUP ( ORDER BY double precision ) - double precision - - - percentile_cont ( fraction double precision ) WITHIN GROUP ( ORDER BY interval ) - interval - - - Computes the continuous percentile, a value - corresponding to the specified fraction - within the ordered set of aggregated argument values. This will - interpolate between adjacent input items if needed. - - No - - - - - percentile_cont ( fractions double precision[] ) WITHIN GROUP ( ORDER BY double precision ) - double precision[] - - - percentile_cont ( fractions double precision[] ) WITHIN GROUP ( ORDER BY interval ) - interval[] - - - Computes multiple continuous percentiles. The result is an array of - the same dimensions as the fractions - parameter, with each non-null element replaced by the (possibly - interpolated) value corresponding to that percentile. - - No - - - - - - percentile - discrete - - percentile_disc ( fraction double precision ) WITHIN GROUP ( ORDER BY anyelement ) - anyelement - - - Computes the discrete percentile, the first - value within the ordered set of aggregated argument values whose - position in the ordering equals or exceeds the - specified fraction. The aggregated - argument must be of a sortable type. - - No - - - - - percentile_disc ( fractions double precision[] ) WITHIN GROUP ( ORDER BY anyelement ) - anyarray - - - Computes multiple discrete percentiles. The result is an array of the - same dimensions as the fractions parameter, - with each non-null element replaced by the input value corresponding - to that percentile. - The aggregated argument must be of a sortable type. - - No - - - -
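To illustrate the difference between the continuous and discrete percentiles, here is a small self-contained example:

SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY x) AS cont_median,
       percentile_disc(0.5) WITHIN GROUP (ORDER BY x) AS disc_median
FROM (VALUES (1), (2), (3), (100)) AS t(x);

 cont_median | disc_median
-------------+-------------
         2.5 |           2

percentile_cont interpolates between the two middle values, while percentile_disc returns the first input value whose position in the ordering reaches the requested fraction.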
- - - hypothetical-set aggregate - built-in - - - - Each of the hypothetical-set aggregates listed in - is associated with a - window function of the same name defined in - . In each case, the aggregate's result - is the value that the associated window function would have - returned for the hypothetical row constructed from - args, if such a row had been added to the sorted - group of rows represented by the sorted_args. - For each of these functions, the list of direct arguments - given in args must match the number and types of - the aggregated arguments given in sorted_args. - Unlike most built-in aggregates, these aggregates are not strict, that is - they do not drop input rows containing nulls. Null values sort according - to the rule specified in the ORDER BY clause. - - - - Hypothetical-Set Aggregate Functions - - - - - - - Function - - - Description - - Partial Mode - - - - - - - - rank - hypothetical - - rank ( args ) WITHIN GROUP ( ORDER BY sorted_args ) - bigint - - - Computes the rank of the hypothetical row, with gaps; that is, the row - number of the first row in its peer group. - - No - - - - - - dense_rank - hypothetical - - dense_rank ( args ) WITHIN GROUP ( ORDER BY sorted_args ) - bigint - - - Computes the rank of the hypothetical row, without gaps; this function - effectively counts peer groups. - - No - - - - - - percent_rank - hypothetical - - percent_rank ( args ) WITHIN GROUP ( ORDER BY sorted_args ) - double precision - - - Computes the relative rank of the hypothetical row, that is - (rank - 1) / (total rows - 1). - The value thus ranges from 0 to 1 inclusive. - - No - - - - - - cume_dist - hypothetical - - cume_dist ( args ) WITHIN GROUP ( ORDER BY sorted_args ) - double precision - - - Computes the cumulative distribution, that is (number of rows - preceding or peers with hypothetical row) / (total rows). The value - thus ranges from 1/N to 1. - - No - - - -
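For example, to ask where a new score would rank among existing scores without actually inserting it (a small self-contained sketch):

SELECT rank(72) WITHIN GROUP (ORDER BY score DESC) AS would_rank
FROM (VALUES (93), (80), (72), (65)) AS t(score);

 would_rank
------------
          3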
- - - Grouping Operations - - - - - Function - - - Description - - - - - - - - - GROUPING - - GROUPING ( group_by_expression(s) ) - integer - - - Returns a bit mask indicating which GROUP BY - expressions are not included in the current grouping set. - Bits are assigned with the rightmost argument corresponding to the - least-significant bit; each bit is 0 if the corresponding expression - is included in the grouping criteria of the grouping set generating - the current result row, and 1 if it is not included. - - - - -
- - - The grouping operations shown in - are used in conjunction with - grouping sets (see ) to distinguish - result rows. The arguments to the GROUPING function - are not actually evaluated, but they must exactly match expressions given - in the GROUP BY clause of the associated query level. - For example: - -=> SELECT * FROM items_sold; - make | model | sales --------+-------+------- - Foo | GT | 10 - Foo | Tour | 20 - Bar | City | 15 - Bar | Sport | 5 -(4 rows) - -=> SELECT make, model, GROUPING(make,model), sum(sales) FROM items_sold GROUP BY ROLLUP(make,model); - make | model | grouping | sum --------+-------+----------+----- - Foo | GT | 0 | 10 - Foo | Tour | 0 | 20 - Bar | City | 0 | 15 - Bar | Sport | 0 | 5 - Foo | | 1 | 30 - Bar | | 1 | 20 - | | 3 | 50 -(7 rows) - - Here, the grouping value 0 in the - first four rows shows that those have been grouped normally, over both the - grouping columns. The value 1 indicates - that model was not grouped by in the next-to-last two - rows, and the value 3 indicates that - neither make nor model was grouped - by in the last row (which therefore is an aggregate over all the input - rows). - - -
- - - Window Functions - - - window function - built-in - - - - Window functions provide the ability to perform - calculations across sets of rows that are related to the current query - row. See for an introduction to this - feature, and for syntax - details. - - - - The built-in window functions are listed in - . Note that these functions - must be invoked using window function syntax, i.e., an - OVER clause is required. - - - - In addition to these functions, any built-in or user-defined - ordinary aggregate (i.e., not ordered-set or hypothetical-set aggregates) - can be used as a window function; see - for a list of the built-in aggregates. - Aggregate functions act as window functions only when an OVER - clause follows the call; otherwise they act as plain aggregates - and return a single row for the entire set. - - - - General-Purpose Window Functions - - - - - Function - - - Description - - - - - - - - - row_number - - row_number () - bigint - - - Returns the number of the current row within its partition, counting - from 1. - - - - - - - rank - - rank () - bigint - - - Returns the rank of the current row, with gaps; that is, - the row_number of the first row in its peer - group. - - - - - - - dense_rank - - dense_rank () - bigint - - - Returns the rank of the current row, without gaps; this function - effectively counts peer groups. - - - - - - - percent_rank - - percent_rank () - double precision - - - Returns the relative rank of the current row, that is - (rank - 1) / (total partition rows - 1). - The value thus ranges from 0 to 1 inclusive. - - - - - - - cume_dist - - cume_dist () - double precision - - - Returns the cumulative distribution, that is (number of partition rows - preceding or peers with current row) / (total partition rows). - The value thus ranges from 1/N to 1. - - - - - - - ntile - - ntile ( num_buckets integer ) - integer - - - Returns an integer ranging from 1 to the argument value, dividing the - partition as equally as possible. - - - - - - - lag - - lag ( value anycompatible - , offset integer - , default anycompatible ) - anycompatible - - - Returns value evaluated at - the row that is offset - rows before the current row within the partition; if there is no such - row, instead returns default - (which must be of a type compatible with - value). - Both offset and - default are evaluated - with respect to the current row. If omitted, - offset defaults to 1 and - default to NULL. - - - - - - - lead - - lead ( value anycompatible - , offset integer - , default anycompatible ) - anycompatible - - - Returns value evaluated at - the row that is offset - rows after the current row within the partition; if there is no such - row, instead returns default - (which must be of a type compatible with - value). - Both offset and - default are evaluated - with respect to the current row. If omitted, - offset defaults to 1 and - default to NULL. - - - - - - - first_value - - first_value ( value anyelement ) - anyelement - - - Returns value evaluated - at the row that is the first row of the window frame. - - - - - - - last_value - - last_value ( value anyelement ) - anyelement - - - Returns value evaluated - at the row that is the last row of the window frame. - - - - - - - nth_value - - nth_value ( value anyelement, n integer ) - anyelement - - - Returns value evaluated - at the row that is the n'th - row of the window frame (counting from 1); - returns NULL if there is no such row. - - - - -
- - - All of the functions listed in - depend on the sort ordering - specified by the ORDER BY clause of the associated window - definition. Rows that are not distinct when considering only the - ORDER BY columns are said to be peers. - The four ranking functions (including cume_dist) are - defined so that they give the same answer for all rows of a peer group. - - - - Note that first_value, last_value, and - nth_value consider only the rows within the window - frame, which by default contains the rows from the start of the - partition through the last peer of the current row. This is - likely to give unhelpful results for last_value and - sometimes also nth_value. You can redefine the frame by - adding a suitable frame specification (RANGE, - ROWS or GROUPS) to - the OVER clause. - See for more information - about frame specifications. - - - - When an aggregate function is used as a window function, it aggregates - over the rows within the current row's window frame. - An aggregate used with ORDER BY and the default window frame - definition produces a running sum type of behavior, which may or - may not be what's wanted. To obtain - aggregation over the whole partition, omit ORDER BY or use - ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING. - Other frame specifications can be used to obtain other effects. - - - - - The SQL standard defines a RESPECT NULLS or - IGNORE NULLS option for lead, lag, - first_value, last_value, and - nth_value. This is not implemented in - PostgreSQL: the behavior is always the - same as the standard's default, namely RESPECT NULLS. - Likewise, the standard's FROM FIRST or FROM LAST - option for nth_value is not implemented: only the - default FROM FIRST behavior is supported. (You can achieve - the result of FROM LAST by reversing the ORDER BY - ordering.) - - - -
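A compact example tying these points together (the values are illustrative); note how the default frame gives sum a running-total behavior:

SELECT x,
       row_number() OVER w AS rn,
       lag(x) OVER w AS prev,
       sum(x) OVER w AS running_sum
FROM (VALUES (10), (20), (30)) AS t(x)
WINDOW w AS (ORDER BY x);

 x  | rn | prev | running_sum
----+----+------+-------------
 10 |  1 |      |          10
 20 |  2 |   10 |          30
 30 |  3 |   20 |          60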
- - - Merge Support Functions - - - MERGE - RETURNING - - - - PostgreSQL includes one merge support function - that may be used in the RETURNING list of a - command to identify the action taken for each - row; see . - - - - Merge Support Functions - - - - - - Function - - - Description - - - - - - - - - merge_action - - merge_action ( ) - text - - - Returns the merge action command executed for the current row. This - will be 'INSERT', 'UPDATE', or - 'DELETE'. - - - - -
Example:

MERGE INTO products p
  USING stock s ON p.product_id = s.product_id
  WHEN MATCHED AND s.quantity > 0 THEN
    UPDATE SET in_stock = true, quantity = s.quantity
  WHEN MATCHED THEN
    UPDATE SET in_stock = false, quantity = 0
  WHEN NOT MATCHED THEN
    INSERT (product_id, in_stock, quantity)
    VALUES (s.product_id, true, s.quantity)
  RETURNING merge_action(), p.*;

 merge_action | product_id | in_stock | quantity
--------------+------------+----------+----------
 UPDATE       |       1001 | t        |       50
 UPDATE       |       1002 | f        |        0
 INSERT       |       1003 | t        |       10

Note that this function can only be used in the RETURNING list of a MERGE command. It is an error to use it in any other part of a query.
- - - Subquery Expressions - - - EXISTS - - - - IN - - - - NOT IN - - - - ANY - - - - ALL - - - - SOME - - - - subquery - - - - This section describes the SQL-compliant subquery - expressions available in PostgreSQL. - All of the expression forms documented in this section return - Boolean (true/false) results. - - - - <literal>EXISTS</literal> - - -EXISTS (subquery) - - - - The argument of EXISTS is an arbitrary SELECT statement, - or subquery. The - subquery is evaluated to determine whether it returns any rows. - If it returns at least one row, the result of EXISTS is - true; if the subquery returns no rows, the result of EXISTS - is false. - - - - The subquery can refer to variables from the surrounding query, - which will act as constants during any one evaluation of the subquery. - - - - The subquery will generally only be executed long enough to determine - whether at least one row is returned, not all the way to completion. - It is unwise to write a subquery that has side effects (such as - calling sequence functions); whether the side effects occur - might be unpredictable. - - - - Since the result depends only on whether any rows are returned, - and not on the contents of those rows, the output list of the - subquery is normally unimportant. A common coding convention is - to write all EXISTS tests in the form - EXISTS(SELECT 1 WHERE ...). There are exceptions to - this rule however, such as subqueries that use INTERSECT. - - - - This simple example is like an inner join on col2, but - it produces at most one output row for each tab1 row, - even if there are several matching tab2 rows: - -SELECT col1 -FROM tab1 -WHERE EXISTS (SELECT 1 FROM tab2 WHERE col2 = tab1.col2); - - - - - - <literal>IN</literal> - - -expression IN (subquery) - - - - The right-hand side is a parenthesized - subquery, which must return exactly one column. The left-hand expression - is evaluated and compared to each row of the subquery result. - The result of IN is true if any equal subquery row is found. - The result is false if no equal row is found (including the - case where the subquery returns no rows). - - - - Note that if the left-hand expression yields null, or if there are - no equal right-hand values and at least one right-hand row yields - null, the result of the IN construct will be null, not false. - This is in accordance with SQL's normal rules for Boolean combinations - of null values. - - - - As with EXISTS, it's unwise to assume that the subquery will - be evaluated completely. - - - -row_constructor IN (subquery) - - - - The left-hand side of this form of IN is a row constructor, - as described in . - The right-hand side is a parenthesized - subquery, which must return exactly as many columns as there are - expressions in the left-hand row. The left-hand expressions are - evaluated and compared row-wise to each row of the subquery result. - The result of IN is true if any equal subquery row is found. - The result is false if no equal row is found (including the - case where the subquery returns no rows). - - - - As usual, null values in the rows are combined per - the normal rules of SQL Boolean expressions. Two rows are considered - equal if all their corresponding members are non-null and equal; the rows - are unequal if any corresponding members are non-null and unequal; - otherwise the result of that row comparison is unknown (null). - If all the per-row results are either unequal or null, with at least one - null, then the result of IN is null. 
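To make the null-combination rule concrete, here is a minimal sketch:

SELECT 2 IN (SELECT unnest(ARRAY[1, 2, NULL]));  -- true
SELECT 3 IN (SELECT unnest(ARRAY[1, 2, NULL]));  -- null, not false

The same rule is what makes NOT IN behave surprisingly in the presence of nulls, as described next.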
- - - - - <literal>NOT IN</literal> - - -expression NOT IN (subquery) - - - - The right-hand side is a parenthesized - subquery, which must return exactly one column. The left-hand expression - is evaluated and compared to each row of the subquery result. - The result of NOT IN is true if only unequal subquery rows - are found (including the case where the subquery returns no rows). - The result is false if any equal row is found. - - - - Note that if the left-hand expression yields null, or if there are - no equal right-hand values and at least one right-hand row yields - null, the result of the NOT IN construct will be null, not true. - This is in accordance with SQL's normal rules for Boolean combinations - of null values. - - - - As with EXISTS, it's unwise to assume that the subquery will - be evaluated completely. - - - -row_constructor NOT IN (subquery) - - - - The left-hand side of this form of NOT IN is a row constructor, - as described in . - The right-hand side is a parenthesized - subquery, which must return exactly as many columns as there are - expressions in the left-hand row. The left-hand expressions are - evaluated and compared row-wise to each row of the subquery result. - The result of NOT IN is true if only unequal subquery rows - are found (including the case where the subquery returns no rows). - The result is false if any equal row is found. - - - - As usual, null values in the rows are combined per - the normal rules of SQL Boolean expressions. Two rows are considered - equal if all their corresponding members are non-null and equal; the rows - are unequal if any corresponding members are non-null and unequal; - otherwise the result of that row comparison is unknown (null). - If all the per-row results are either unequal or null, with at least one - null, then the result of NOT IN is null. - - - - - <literal>ANY</literal>/<literal>SOME</literal> - - -expression operator ANY (subquery) -expression operator SOME (subquery) - - - - The right-hand side is a parenthesized - subquery, which must return exactly one column. The left-hand expression - is evaluated and compared to each row of the subquery result using the - given operator, which must yield a Boolean - result. - The result of ANY is true if any true result is obtained. - The result is false if no true result is found (including the - case where the subquery returns no rows). - - - - SOME is a synonym for ANY. - IN is equivalent to = ANY. - - - - Note that if there are no successes and at least one right-hand row yields - null for the operator's result, the result of the ANY construct - will be null, not false. - This is in accordance with SQL's normal rules for Boolean combinations - of null values. - - - - As with EXISTS, it's unwise to assume that the subquery will - be evaluated completely. - - - -row_constructor operator ANY (subquery) -row_constructor operator SOME (subquery) - - - - The left-hand side of this form of ANY is a row constructor, - as described in . - The right-hand side is a parenthesized - subquery, which must return exactly as many columns as there are - expressions in the left-hand row. The left-hand expressions are - evaluated and compared row-wise to each row of the subquery result, - using the given operator. - The result of ANY is true if the comparison - returns true for any subquery row. - The result is false if the comparison returns false for every - subquery row (including the case where the subquery returns no - rows). 
- The result is NULL if no comparison with a subquery row returns true, - and at least one comparison returns NULL. - - - - See for details about the meaning - of a row constructor comparison. - - - - - <literal>ALL</literal> - - -expression operator ALL (subquery) - - - - The right-hand side is a parenthesized - subquery, which must return exactly one column. The left-hand expression - is evaluated and compared to each row of the subquery result using the - given operator, which must yield a Boolean - result. - The result of ALL is true if all rows yield true - (including the case where the subquery returns no rows). - The result is false if any false result is found. - The result is NULL if no comparison with a subquery row returns false, - and at least one comparison returns NULL. - - - - NOT IN is equivalent to <> ALL. - - - - As with EXISTS, it's unwise to assume that the subquery will - be evaluated completely. - - - -row_constructor operator ALL (subquery) - - - - The left-hand side of this form of ALL is a row constructor, - as described in . - The right-hand side is a parenthesized - subquery, which must return exactly as many columns as there are - expressions in the left-hand row. The left-hand expressions are - evaluated and compared row-wise to each row of the subquery result, - using the given operator. - The result of ALL is true if the comparison - returns true for all subquery rows (including the - case where the subquery returns no rows). - The result is false if the comparison returns false for any - subquery row. - The result is NULL if no comparison with a subquery row returns false, - and at least one comparison returns NULL. - - - - See for details about the meaning - of a row constructor comparison. - - - - - Single-Row Comparison - - - comparison - subquery result row - - - -row_constructor operator (subquery) - - - - The left-hand side is a row constructor, - as described in . - The right-hand side is a parenthesized subquery, which must return exactly - as many columns as there are expressions in the left-hand row. Furthermore, - the subquery cannot return more than one row. (If it returns zero rows, - the result is taken to be null.) The left-hand side is evaluated and - compared row-wise to the single subquery result row. - - - - See for details about the meaning - of a row constructor comparison. - - - - - - - Row and Array Comparisons - - - IN - - - - NOT IN - - - - ANY - - - - ALL - - - - SOME - - - - composite type - comparison - - - - row-wise comparison - - - - comparison - composite type - - - - comparison - row constructor - - - - IS DISTINCT FROM - - - - IS NOT DISTINCT FROM - - - - This section describes several specialized constructs for making - multiple comparisons between groups of values. These forms are - syntactically related to the subquery forms of the previous section, - but do not involve subqueries. - The forms involving array subexpressions are - PostgreSQL extensions; the rest are - SQL-compliant. - All of the expression forms documented in this section return - Boolean (true/false) results. - - - - <literal>IN</literal> - - -expression IN (value , ...) - - - - The right-hand side is a parenthesized list - of expressions. The result is true if the left-hand expression's - result is equal to any of the right-hand expressions. This is a shorthand - notation for - - -expression = value1 -OR -expression = value2 -OR -... 
- - - - - Note that if the left-hand expression yields null, or if there are - no equal right-hand values and at least one right-hand expression yields - null, the result of the IN construct will be null, not false. - This is in accordance with SQL's normal rules for Boolean combinations - of null values. - - - - - <literal>NOT IN</literal> - - -expression NOT IN (value , ...) - - - - The right-hand side is a parenthesized list - of expressions. The result is true if the left-hand expression's - result is unequal to all of the right-hand expressions. This is a shorthand - notation for - - -expression <> value1 -AND -expression <> value2 -AND -... - - - - - Note that if the left-hand expression yields null, or if there are - no equal right-hand values and at least one right-hand expression yields - null, the result of the NOT IN construct will be null, not true - as one might naively expect. - This is in accordance with SQL's normal rules for Boolean combinations - of null values. - - - - - x NOT IN y is equivalent to NOT (x IN y) in all - cases. However, null values are much more likely to trip up the novice when - working with NOT IN than when working with IN. - It is best to express your condition positively if possible. - - - - - - <literal>ANY</literal>/<literal>SOME</literal> (array) - - -expression operator ANY (array expression) -expression operator SOME (array expression) - - - - The right-hand side is a parenthesized expression, which must yield an - array value. - The left-hand expression - is evaluated and compared to each element of the array using the - given operator, which must yield a Boolean - result. - The result of ANY is true if any true result is obtained. - The result is false if no true result is found (including the - case where the array has zero elements). - - - - If the array expression yields a null array, the result of - ANY will be null. If the left-hand expression yields null, - the result of ANY is ordinarily null (though a non-strict - comparison operator could possibly yield a different result). - Also, if the right-hand array contains any null elements and no true - comparison result is obtained, the result of ANY - will be null, not false (again, assuming a strict comparison operator). - This is in accordance with SQL's normal rules for Boolean combinations - of null values. - - - - SOME is a synonym for ANY. - - - - - <literal>ALL</literal> (array) - - -expression operator ALL (array expression) - - - - The right-hand side is a parenthesized expression, which must yield an - array value. - The left-hand expression - is evaluated and compared to each element of the array using the - given operator, which must yield a Boolean - result. - The result of ALL is true if all comparisons yield true - (including the case where the array has zero elements). - The result is false if any false result is found. - - - - If the array expression yields a null array, the result of - ALL will be null. If the left-hand expression yields null, - the result of ALL is ordinarily null (though a non-strict - comparison operator could possibly yield a different result). - Also, if the right-hand array contains any null elements and no false - comparison result is obtained, the result of ALL - will be null, not true (again, assuming a strict comparison operator). - This is in accordance with SQL's normal rules for Boolean combinations - of null values. 
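-
-   For example, with the strict built-in = operator, a minimal sketch of
-   the rules above:
-
-SELECT 1 = ANY (ARRAY[1, NULL]);    -- true
-SELECT 1 = ANY (ARRAY[2, NULL]);    -- null, not false
-SELECT 1 = ALL (ARRAY[1, NULL]);    -- null, not true
-SELECT 1 = ALL (ARRAY[]::int[]);    -- true: zero elements
-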
- - - - - Row Constructor Comparison - - -row_constructor operator row_constructor - - - - Each side is a row constructor, - as described in . - The two row constructors must have the same number of fields. - The given operator is applied to each pair - of corresponding fields. (Since the fields could be of different - types, this means that a different specific operator could be selected - for each pair.) - All the selected operators must be members of some B-tree operator - class, or be the negator of an = member of a B-tree - operator class, meaning that row constructor comparison is only - possible when the operator is - =, - <>, - <, - <=, - >, or - >=, - or has semantics similar to one of these. - - - - The = and <> cases work slightly differently - from the others. Two rows are considered - equal if all their corresponding members are non-null and equal; the rows - are unequal if any corresponding members are non-null and unequal; - otherwise the result of the row comparison is unknown (null). - - - - For the <, <=, > and - >= cases, the row elements are compared left-to-right, - stopping as soon as an unequal or null pair of elements is found. - If either of this pair of elements is null, the result of the - row comparison is unknown (null); otherwise comparison of this pair - of elements determines the result. For example, - ROW(1,2,NULL) < ROW(1,3,0) - yields true, not null, because the third pair of elements are not - considered. - - - -row_constructor IS DISTINCT FROM row_constructor - - - - This construct is similar to a <> row comparison, - but it does not yield null for null inputs. Instead, any null value is - considered unequal to (distinct from) any non-null value, and any two - nulls are considered equal (not distinct). Thus the result will - either be true or false, never null. - - - -row_constructor IS NOT DISTINCT FROM row_constructor - - - - This construct is similar to a = row comparison, - but it does not yield null for null inputs. Instead, any null value is - considered unequal to (distinct from) any non-null value, and any two - nulls are considered equal (not distinct). Thus the result will always - be either true or false, never null. - - - - - - Composite Type Comparison - - -record operator record - - - - The SQL specification requires row-wise comparison to return NULL if the - result depends on comparing two NULL values or a NULL and a non-NULL. - PostgreSQL does this only when comparing the - results of two row constructors (as in - ) or comparing a row constructor - to the output of a subquery (as in ). - In other contexts where two composite-type values are compared, two - NULL field values are considered equal, and a NULL is considered larger - than a non-NULL. This is necessary in order to have consistent sorting - and indexing behavior for composite types. - - - - Each side is evaluated and they are compared row-wise. Composite type - comparisons are allowed when the operator is - =, - <>, - <, - <=, - > or - >=, - or has semantics similar to one of these. (To be specific, an operator - can be a row comparison operator if it is a member of a B-tree operator - class, or is the negator of the = member of a B-tree operator - class.) The default behavior of the above operators is the same as for - IS [ NOT ] DISTINCT FROM for row constructors (see - ). 
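-
-   To make the contrast concrete, the following sketch first compares
-   row constructors directly and then compares values of a composite
-   type; two_ints is a hypothetical type created only for this example:
-
-SELECT ROW(1, NULL) = ROW(1, NULL);                     -- null
-SELECT ROW(1, NULL) IS NOT DISTINCT FROM ROW(1, NULL);  -- true
-
-CREATE TYPE two_ints AS (a int, b int);
-SELECT '(1,)'::two_ints = '(1,)'::two_ints;             -- true: null fields compare equal
-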
- - - - To support matching of rows which include elements without a default - B-tree operator class, the following operators are defined for composite - type comparison: - *=, - *<>, - *<, - *<=, - *>, and - *>=. - These operators compare the internal binary representation of the two - rows. Two rows might have a different binary representation even - though comparisons of the two rows with the equality operator is true. - The ordering of rows under these comparison operators is deterministic - but not otherwise meaningful. These operators are used internally - for materialized views and might be useful for other specialized - purposes such as replication and B-Tree deduplication (see ). They are not intended to be - generally useful for writing queries, though. - - - - - - Set Returning Functions - - - set returning functions - functions - - - - This section describes functions that possibly return more than one row. - The most widely used functions in this class are series generating - functions, as detailed in and - . Other, more specialized - set-returning functions are described elsewhere in this manual. - See for ways to combine multiple - set-returning functions. - - - - Series Generating Functions - - - - - Function - - - Description - - - - - - - - - generate_series - - generate_series ( start integer, stop integer , step integer ) - setof integer - - - generate_series ( start bigint, stop bigint , step bigint ) - setof bigint - - - generate_series ( start numeric, stop numeric , step numeric ) - setof numeric - - - Generates a series of values from start - to stop, with a step size - of step. step - defaults to 1. - - - - - - generate_series ( start timestamp, stop timestamp, step interval ) - setof timestamp - - - generate_series ( start timestamp with time zone, stop timestamp with time zone, step interval , timezone text ) - setof timestamp with time zone - - - Generates a series of values from start - to stop, with a step size - of step. - In the timezone-aware form, times of day and daylight-savings - adjustments are computed according to the time zone named by - the timezone argument, or the current - setting if that is omitted. - - - - -
- - - When step is positive, zero rows are returned if - start is greater than stop. - Conversely, when step is negative, zero rows are - returned if start is less than stop. - Zero rows are also returned if any input is NULL. - It is an error - for step to be zero. Some examples follow: - -SELECT * FROM generate_series(2,4); - generate_series ------------------ - 2 - 3 - 4 -(3 rows) - -SELECT * FROM generate_series(5,1,-2); - generate_series ------------------ - 5 - 3 - 1 -(3 rows) - -SELECT * FROM generate_series(4,3); - generate_series ------------------ -(0 rows) - -SELECT generate_series(1.1, 4, 1.3); - generate_series ------------------ - 1.1 - 2.4 - 3.7 -(3 rows) - --- this example relies on the date-plus-integer operator: -SELECT current_date + s.a AS dates FROM generate_series(0,14,7) AS s(a); - dates ------------- - 2004-02-05 - 2004-02-12 - 2004-02-19 -(3 rows) - -SELECT * FROM generate_series('2008-03-01 00:00'::timestamp, - '2008-03-04 12:00', '10 hours'); - generate_series ---------------------- - 2008-03-01 00:00:00 - 2008-03-01 10:00:00 - 2008-03-01 20:00:00 - 2008-03-02 06:00:00 - 2008-03-02 16:00:00 - 2008-03-03 02:00:00 - 2008-03-03 12:00:00 - 2008-03-03 22:00:00 - 2008-03-04 08:00:00 -(9 rows) - --- this example assumes that TimeZone is set to UTC; note the DST transition: -SELECT * FROM generate_series('2001-10-22 00:00 -04:00'::timestamptz, - '2001-11-01 00:00 -05:00'::timestamptz, - '1 day'::interval, 'America/New_York'); - generate_series ------------------------- - 2001-10-22 04:00:00+00 - 2001-10-23 04:00:00+00 - 2001-10-24 04:00:00+00 - 2001-10-25 04:00:00+00 - 2001-10-26 04:00:00+00 - 2001-10-27 04:00:00+00 - 2001-10-28 04:00:00+00 - 2001-10-29 05:00:00+00 - 2001-10-30 05:00:00+00 - 2001-10-31 05:00:00+00 - 2001-11-01 05:00:00+00 -(11 rows) - - - - - Subscript Generating Functions - - - - - Function - - - Description - - - - - - - - - generate_subscripts - - generate_subscripts ( array anyarray, dim integer ) - setof integer - - - Generates a series comprising the valid subscripts of - the dim'th dimension of the given array. - - - - - - generate_subscripts ( array anyarray, dim integer, reverse boolean ) - setof integer - - - Generates a series comprising the valid subscripts of - the dim'th dimension of the given array. - When reverse is true, returns the series in - reverse order. - - - - -
- - - generate_subscripts is a convenience function that generates - the set of valid subscripts for the specified dimension of the given - array. - Zero rows are returned for arrays that do not have the requested dimension, - or if any input is NULL. - Some examples follow: - --- basic usage: -SELECT generate_subscripts('{NULL,1,NULL,2}'::int[], 1) AS s; - s ---- - 1 - 2 - 3 - 4 -(4 rows) - --- presenting an array, the subscript and the subscripted --- value requires a subquery: -SELECT * FROM arrays; - a --------------------- - {-1,-2} - {100,200,300} -(2 rows) - -SELECT a AS array, s AS subscript, a[s] AS value -FROM (SELECT generate_subscripts(a, 1) AS s, a FROM arrays) foo; - array | subscript | value ----------------+-----------+------- - {-1,-2} | 1 | -1 - {-1,-2} | 2 | -2 - {100,200,300} | 1 | 100 - {100,200,300} | 2 | 200 - {100,200,300} | 3 | 300 -(5 rows) - --- unnest a 2D array: -CREATE OR REPLACE FUNCTION unnest2(anyarray) -RETURNS SETOF anyelement AS $$ -select $1[i][j] - from generate_subscripts($1,1) g1(i), - generate_subscripts($1,2) g2(j); -$$ LANGUAGE sql IMMUTABLE; -CREATE FUNCTION -SELECT * FROM unnest2(ARRAY[[1,2],[3,4]]); - unnest2 ---------- - 1 - 2 - 3 - 4 -(4 rows) - - - - - ordinality - - - - When a function in the FROM clause is suffixed - by WITH ORDINALITY, a bigint column is - appended to the function's output column(s), which starts from 1 and - increments by 1 for each row of the function's output. - This is most useful in the case of set returning - functions such as unnest(). - - --- set returning function WITH ORDINALITY: -SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n); - ls | n ------------------+---- - pg_serial | 1 - pg_twophase | 2 - postmaster.opts | 3 - pg_notify | 4 - postgresql.conf | 5 - pg_tblspc | 6 - logfile | 7 - base | 8 - postmaster.pid | 9 - pg_ident.conf | 10 - global | 11 - pg_xact | 12 - pg_snapshots | 13 - pg_multixact | 14 - PG_VERSION | 15 - pg_wal | 16 - pg_hba.conf | 17 - pg_stat_tmp | 18 - pg_subtrans | 19 -(19 rows) - - - -
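-
-   Since unnest() is the most common companion of WITH
-   ORDINALITY, here is a minimal illustration:
-
--- each element paired with its position:
-SELECT * FROM unnest(ARRAY['a','b','c']) WITH ORDINALITY AS t(elem, n);
- elem | n
-------+---
- a    | 1
- b    | 2
- c    | 3
-(3 rows)
-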
- - - System Information Functions and Operators - - - The functions described in this section are used to obtain various - information about a PostgreSQL installation. - - - - Session Information Functions - - - shows several - functions that extract session and system information. - - - - In addition to the functions listed in this section, there are a number of - functions related to the statistics system that also provide system - information. See for more - information. - - - - Session Information Functions - - - - - Function - - - Description - - - - - - - - - current_catalog - - current_catalog - name - - - - current_database - - current_database () - name - - - Returns the name of the current database. (Databases are - called catalogs in the SQL standard, - so current_catalog is the standard's - spelling.) - - - - - - - current_query - - current_query () - text - - - Returns the text of the currently executing query, as submitted - by the client (which might contain more than one statement). - - - - - - - current_role - - current_role - name - - - This is equivalent to current_user. - - - - - - - current_schema - - - schema - current - - current_schema - name - - - current_schema () - name - - - Returns the name of the schema that is first in the search path (or a - null value if the search path is empty). This is the schema that will - be used for any tables or other named objects that are created without - specifying a target schema. - - - - - - - current_schemas - - - search path - current - - current_schemas ( include_implicit boolean ) - name[] - - - Returns an array of the names of all schemas presently in the - effective search path, in their priority order. (Items in the current - setting that do not correspond to - existing, searchable schemas are omitted.) If the Boolean argument - is true, then implicitly-searched system schemas - such as pg_catalog are included in the result. - - - - - - - current_user - - - user - current - - current_user - name - - - Returns the user name of the current execution context. - - - - - - - inet_client_addr - - inet_client_addr () - inet - - - Returns the IP address of the current client, - or NULL if the current connection is via a - Unix-domain socket. - - - - - - - inet_client_port - - inet_client_port () - integer - - - Returns the IP port number of the current client, - or NULL if the current connection is via a - Unix-domain socket. - - - - - - - inet_server_addr - - inet_server_addr () - inet - - - Returns the IP address on which the server accepted the current - connection, - or NULL if the current connection is via a - Unix-domain socket. - - - - - - - inet_server_port - - inet_server_port () - integer - - - Returns the IP port number on which the server accepted the current - connection, - or NULL if the current connection is via a - Unix-domain socket. - - - - - - - pg_backend_pid - - pg_backend_pid () - integer - - - Returns the process ID of the server process attached to the current - session. - - - - - - - pg_blocking_pids - - pg_blocking_pids ( integer ) - integer[] - - - Returns an array of the process ID(s) of the sessions that are - blocking the server process with the specified process ID from - acquiring a lock, or an empty array if there is no such server process - or it is not blocked. 
- - - One server process blocks another if it either holds a lock that - conflicts with the blocked process's lock request (hard block), or is - waiting for a lock that would conflict with the blocked process's lock - request and is ahead of it in the wait queue (soft block). When using - parallel queries the result always lists client-visible process IDs - (that is, pg_backend_pid results) even if the - actual lock is held or awaited by a child worker process. As a result - of that, there may be duplicated PIDs in the result. Also note that - when a prepared transaction holds a conflicting lock, it will be - represented by a zero process ID. - - - Frequent calls to this function could have some impact on database - performance, because it needs exclusive access to the lock manager's - shared state for a short time. - - - - - - - pg_conf_load_time - - pg_conf_load_time () - timestamp with time zone - - - Returns the time when the server configuration files were last loaded. - If the current session was alive at the time, this will be the time - when the session itself re-read the configuration files (so the - reading will vary a little in different sessions). Otherwise it is - the time when the postmaster process re-read the configuration files. - - - - - - - pg_current_logfile - - - Logging - pg_current_logfile function - - - current_logfiles - and the pg_current_logfile function - - - Logging - current_logfiles file and the pg_current_logfile - function - - pg_current_logfile ( text ) - text - - - Returns the path name of the log file currently in use by the logging - collector. The path includes the - directory and the individual log file name. The result - is NULL if the logging collector is disabled. - When multiple log files exist, each in a different - format, pg_current_logfile without an argument - returns the path of the file having the first format found in the - ordered list: stderr, - csvlog, jsonlog. - NULL is returned if no log file has any of these - formats. - To request information about a specific log file format, supply - either csvlog, jsonlog or - stderr as the - value of the optional parameter. The result is NULL - if the log format requested is not configured in - . - The result reflects the contents of - the current_logfiles file. - - - This function is restricted to superusers and roles with privileges of - the pg_monitor role by default, but other users can - be granted EXECUTE to run the function. - - - - - - - pg_get_loaded_modules - - pg_get_loaded_modules () - setof record - ( module_name text, - version text, - file_name text ) - - - Returns a list of the loadable modules that are loaded into the - current server session. The module_name - and version fields are NULL unless the - module author supplied values for them using - the PG_MODULE_MAGIC_EXT macro. - The file_name field gives the file - name of the module (shared library). - - - - - - - pg_my_temp_schema - - pg_my_temp_schema () - oid - - - Returns the OID of the current session's temporary schema, or zero if - it has none (because it has not created any temporary tables). - - - - - - - pg_is_other_temp_schema - - pg_is_other_temp_schema ( oid ) - boolean - - - Returns true if the given OID is the OID of another session's - temporary schema. (This can be useful, for example, to exclude other - sessions' temporary tables from a catalog display.) 
- - - - - - - pg_jit_available - - pg_jit_available () - boolean - - - Returns true if a JIT compiler extension is - available (see ) and the - configuration parameter is set to - on. - - - - - - - pg_numa_available - - pg_numa_available () - boolean - - - Returns true if the server has been compiled with NUMA support. - - - - - - - pg_listening_channels - - pg_listening_channels () - setof text - - - Returns the set of names of asynchronous notification channels that - the current session is listening to. - - - - - - - pg_notification_queue_usage - - pg_notification_queue_usage () - double precision - - - Returns the fraction (0–1) of the asynchronous notification - queue's maximum size that is currently occupied by notifications that - are waiting to be processed. - See and - for more information. - - - - - - - pg_postmaster_start_time - - pg_postmaster_start_time () - timestamp with time zone - - - Returns the time when the server started. - - - - - - - pg_safe_snapshot_blocking_pids - - pg_safe_snapshot_blocking_pids ( integer ) - integer[] - - - Returns an array of the process ID(s) of the sessions that are blocking - the server process with the specified process ID from acquiring a safe - snapshot, or an empty array if there is no such server process or it - is not blocked. - - - A session running a SERIALIZABLE transaction blocks - a SERIALIZABLE READ ONLY DEFERRABLE transaction - from acquiring a snapshot until the latter determines that it is safe - to avoid taking any predicate locks. See - for more information about - serializable and deferrable transactions. - - - Frequent calls to this function could have some impact on database - performance, because it needs access to the predicate lock manager's - shared state for a short time. - - - - - - - pg_trigger_depth - - pg_trigger_depth () - integer - - - Returns the current nesting level - of PostgreSQL triggers (0 if not called, - directly or indirectly, from inside a trigger). - - - - - - - session_user - - session_user - name - - - Returns the session user's name. - - - - - - - system_user - - system_user - text - - - Returns the authentication method and the identity (if any) that the - user presented during the authentication cycle before they were - assigned a database role. It is represented as - auth_method:identity or - NULL if the user has not been authenticated (for - example if Trust authentication has - been used). - - - - - - - user - - user - name - - - This is equivalent to current_user. - - - - -
- - - - current_catalog, - current_role, - current_schema, - current_user, - session_user, - and user have special syntactic status - in SQL: they must be called without trailing - parentheses. In PostgreSQL, parentheses can optionally be used with - current_schema, but not with the others. - - - - - The session_user is normally the user who initiated - the current database connection; but superusers can change this setting - with . - The current_user is the user identifier - that is applicable for permission checking. Normally it is equal - to the session user, but it can be changed with - . - It also changes during the execution of - functions with the attribute SECURITY DEFINER. - In Unix parlance, the session user is the real user and - the current user is the effective user. - current_role and user are - synonyms for current_user. (The SQL standard draws - a distinction between current_role - and current_user, but PostgreSQL - does not, since it unifies users and roles into a single kind of entity.) - - -
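-
-   For example (a sketch assuming a session user alice who has been
-   granted a role auditor; both names are hypothetical):
-
-SELECT session_user, current_user;   -- alice | alice
-SET ROLE auditor;
-SELECT session_user, current_user;   -- alice | auditor
-RESET ROLE;
-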
- - - Access Privilege Inquiry Functions - - - privilege - querying - - - - lists functions that - allow querying object access privileges programmatically. - (See for more information about - privileges.) - In these functions, the user whose privileges are being inquired about - can be specified by name or by OID - (pg_authid.oid), or if - the name is given as public then the privileges of the - PUBLIC pseudo-role are checked. Also, the user - argument can be omitted entirely, in which case - the current_user is assumed. - The object that is being inquired about can be specified either by name or - by OID, too. When specifying by name, a schema name can be included if - relevant. - The access privilege of interest is specified by a text string, which must - evaluate to one of the appropriate privilege keywords for the object's type - (e.g., SELECT). Optionally, WITH GRANT - OPTION can be added to a privilege type to test whether the - privilege is held with grant option. Also, multiple privilege types can be - listed separated by commas, in which case the result will be true if any of - the listed privileges is held. (Case of the privilege string is not - significant, and extra whitespace is allowed between but not within - privilege names.) - Some examples: - -SELECT has_table_privilege('myschema.mytable', 'select'); -SELECT has_table_privilege('joe', 'mytable', 'INSERT, SELECT WITH GRANT OPTION'); - - - - - Access Privilege Inquiry Functions - - - - - Function - - - Description - - - - - - - - - has_any_column_privilege - - has_any_column_privilege ( - user name or oid, - table text or oid, - privilege text ) - boolean - - - Does user have privilege for any column of table? - This succeeds either if the privilege is held for the whole table, or - if there is a column-level grant of the privilege for at least one - column. - Allowable privilege types are - SELECT, INSERT, - UPDATE, and REFERENCES. - - - - - - - has_column_privilege - - has_column_privilege ( - user name or oid, - table text or oid, - column text or smallint, - privilege text ) - boolean - - - Does user have privilege for the specified table column? - This succeeds either if the privilege is held for the whole table, or - if there is a column-level grant of the privilege for the column. - The column can be specified by name or by attribute number - (pg_attribute.attnum). - Allowable privilege types are - SELECT, INSERT, - UPDATE, and REFERENCES. - - - - - - - has_database_privilege - - has_database_privilege ( - user name or oid, - database text or oid, - privilege text ) - boolean - - - Does user have privilege for database? - Allowable privilege types are - CREATE, - CONNECT, - TEMPORARY, and - TEMP (which is equivalent to - TEMPORARY). - - - - - - - has_foreign_data_wrapper_privilege - - has_foreign_data_wrapper_privilege ( - user name or oid, - fdw text or oid, - privilege text ) - boolean - - - Does user have privilege for foreign-data wrapper? - The only allowable privilege type is USAGE. - - - - - - - has_function_privilege - - has_function_privilege ( - user name or oid, - function text or oid, - privilege text ) - boolean - - - Does user have privilege for function? - The only allowable privilege type is EXECUTE. - - - When specifying a function by name rather than by OID, the allowed - input is the same as for the regprocedure data type (see - ). 
- An example is: - -SELECT has_function_privilege('joeuser', 'myfunc(int, text)', 'execute'); - - - - - - - - has_language_privilege - - has_language_privilege ( - user name or oid, - language text or oid, - privilege text ) - boolean - - - Does user have privilege for language? - The only allowable privilege type is USAGE. - - - - - - - has_largeobject_privilege - - has_largeobject_privilege ( - user name or oid, - largeobject oid, - privilege text ) - boolean - - - Does user have privilege for large object? - Allowable privilege types are - SELECT and UPDATE. - - - - - - - has_parameter_privilege - - has_parameter_privilege ( - user name or oid, - parameter text, - privilege text ) - boolean - - - Does user have privilege for configuration parameter? - The parameter name is case-insensitive. - Allowable privilege types are SET - and ALTER SYSTEM. - - - - - - - has_schema_privilege - - has_schema_privilege ( - user name or oid, - schema text or oid, - privilege text ) - boolean - - - Does user have privilege for schema? - Allowable privilege types are - CREATE and - USAGE. - - - - - - - has_sequence_privilege - - has_sequence_privilege ( - user name or oid, - sequence text or oid, - privilege text ) - boolean - - - Does user have privilege for sequence? - Allowable privilege types are - USAGE, - SELECT, and - UPDATE. - - - - - - - has_server_privilege - - has_server_privilege ( - user name or oid, - server text or oid, - privilege text ) - boolean - - - Does user have privilege for foreign server? - The only allowable privilege type is USAGE. - - - - - - - has_table_privilege - - has_table_privilege ( - user name or oid, - table text or oid, - privilege text ) - boolean - - - Does user have privilege for table? - Allowable privilege types - are SELECT, INSERT, - UPDATE, DELETE, - TRUNCATE, REFERENCES, - TRIGGER, and MAINTAIN. - - - - - - - has_tablespace_privilege - - has_tablespace_privilege ( - user name or oid, - tablespace text or oid, - privilege text ) - boolean - - - Does user have privilege for tablespace? - The only allowable privilege type is CREATE. - - - - - - - has_type_privilege - - has_type_privilege ( - user name or oid, - type text or oid, - privilege text ) - boolean - - - Does user have privilege for data type? - The only allowable privilege type is USAGE. - When specifying a type by name rather than by OID, the allowed input - is the same as for the regtype data type (see - ). - - - - - - - pg_has_role - - pg_has_role ( - user name or oid, - role text or oid, - privilege text ) - boolean - - - Does user have privilege for role? - Allowable privilege types are - MEMBER, USAGE, - and SET. - MEMBER denotes direct or indirect membership in - the role without regard to what specific privileges may be conferred. - USAGE denotes whether the privileges of the role - are immediately available without doing SET ROLE, - while SET denotes whether it is possible to change - to the role using the SET ROLE command. - WITH ADMIN OPTION or WITH GRANT - OPTION can be added to any of these privilege types to - test whether the ADMIN privilege is held (all - six spellings test the same thing). - This function does not allow the special case of - setting user to public, - because the PUBLIC pseudo-role can never be a member of real roles. - - - - - - - row_security_active - - row_security_active ( - table text or oid ) - boolean - - - Is row-level security active for the specified table in the context of - the current user and current environment? - - - - -
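-
-   When the user argument is omitted, current_user
-   is assumed, so self-checks such as the following are common (a
-   minimal sketch using only built-in objects):
-
-SELECT pg_has_role('pg_monitor', 'USAGE');
-SELECT has_database_privilege(current_database(), 'CONNECT');
-SELECT has_schema_privilege('pg_catalog', 'USAGE');
-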
- - - shows the operators - available for the aclitem type, which is the catalog - representation of access privileges. See - for information about how to read access privilege values. - - - - <type>aclitem</type> Operators - - - - - Operator - - - Description - - - Example(s) - - - - - - - - - aclitemeq - - aclitem = aclitem - boolean - - - Are aclitems equal? (Notice that - type aclitem lacks the usual set of comparison - operators; it has only equality. In turn, aclitem - arrays can only be compared for equality.) - - - 'calvin=r*w/hobbes'::aclitem = 'calvin=r*w*/hobbes'::aclitem - f - - - - - - - aclcontains - - aclitem[] @> aclitem - boolean - - - Does array contain the specified privileges? (This is true if there - is an array entry that matches the aclitem's grantee and - grantor, and has at least the specified set of privileges.) - - - '{calvin=r*w/hobbes,hobbes=r*w*/postgres}'::aclitem[] @> 'calvin=r*/hobbes'::aclitem - t - - - - - - aclitem[] ~ aclitem - boolean - - - This is a deprecated alias for @>. - - - '{calvin=r*w/hobbes,hobbes=r*w*/postgres}'::aclitem[] ~ 'calvin=r*/hobbes'::aclitem - t - - - - -
- - - shows some additional - functions to manage the aclitem type. - - - - <type>aclitem</type> Functions - - - - - Function - - - Description - - - - - - - - - acldefault - - acldefault ( - type "char", - ownerId oid ) - aclitem[] - - - Constructs an aclitem array holding the default access - privileges for an object of type type belonging - to the role with OID ownerId. This represents - the access privileges that will be assumed when an object's - ACL entry is null. (The default access privileges - are described in .) - The type parameter must be one of - 'c' for COLUMN, - 'r' for TABLE and table-like objects, - 's' for SEQUENCE, - 'd' for DATABASE, - 'f' for FUNCTION or PROCEDURE, - 'l' for LANGUAGE, - 'L' for LARGE OBJECT, - 'n' for SCHEMA, - 'p' for PARAMETER, - 't' for TABLESPACE, - 'F' for FOREIGN DATA WRAPPER, - 'S' for FOREIGN SERVER, - or - 'T' for TYPE or DOMAIN. - - - - - - - aclexplode - - aclexplode ( aclitem[] ) - setof record - ( grantor oid, - grantee oid, - privilege_type text, - is_grantable boolean ) - - - Returns the aclitem array as a set of rows. - If the grantee is the pseudo-role PUBLIC, it is represented by zero in - the grantee column. Each granted privilege is - represented as SELECT, INSERT, - etc (see for a full list). - Note that each privilege is broken out as a separate row, so - only one keyword appears in the privilege_type - column. - - - - - - - makeaclitem - - makeaclitem ( - grantee oid, - grantor oid, - privileges text, - is_grantable boolean ) - aclitem - - - Constructs an aclitem with the given properties. - privileges is a comma-separated list of - privilege names such as SELECT, - INSERT, etc, all of which are set in the - result. (Case of the privilege string is not significant, and - extra whitespace is allowed between but not within privilege - names.) - - - - -
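-
-   For example, the implicit default ACL for a table can be unpacked by
-   feeding acldefault's result to aclexplode (a sketch: OID 10 is the
-   bootstrap superuser in a standard installation, and the makeaclitem
-   call merely constructs a value granting SELECT to PUBLIC, whose
-   grantee OID is zero):
-
-SELECT * FROM aclexplode(acldefault('r', 10));
-SELECT makeaclitem(0, 10, 'SELECT', false);
-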
- -
- - - Schema Visibility Inquiry Functions - - - shows functions that - determine whether a certain object is visible in the - current schema search path. - For example, a table is said to be visible if its - containing schema is in the search path and no table of the same - name appears earlier in the search path. This is equivalent to the - statement that the table can be referenced by name without explicit - schema qualification. Thus, to list the names of all visible tables: - -SELECT relname FROM pg_class WHERE pg_table_is_visible(oid); - - For functions and operators, an object in the search path is said to be - visible if there is no object of the same name and argument data - type(s) earlier in the path. For operator classes and families, - both the name and the associated index access method are considered. - - - - search path - object visibility - - - - Schema Visibility Inquiry Functions - - - - - Function - - - Description - - - - - - - - - pg_collation_is_visible - - pg_collation_is_visible ( collation oid ) - boolean - - - Is collation visible in search path? - - - - - - - pg_conversion_is_visible - - pg_conversion_is_visible ( conversion oid ) - boolean - - - Is conversion visible in search path? - - - - - - - pg_function_is_visible - - pg_function_is_visible ( function oid ) - boolean - - - Is function visible in search path? - (This also works for procedures and aggregates.) - - - - - - - pg_opclass_is_visible - - pg_opclass_is_visible ( opclass oid ) - boolean - - - Is operator class visible in search path? - - - - - - - pg_operator_is_visible - - pg_operator_is_visible ( operator oid ) - boolean - - - Is operator visible in search path? - - - - - - - pg_opfamily_is_visible - - pg_opfamily_is_visible ( opclass oid ) - boolean - - - Is operator family visible in search path? - - - - - - - pg_statistics_obj_is_visible - - pg_statistics_obj_is_visible ( stat oid ) - boolean - - - Is statistics object visible in search path? - - - - - - - pg_table_is_visible - - pg_table_is_visible ( table oid ) - boolean - - - Is table visible in search path? - (This works for all types of relations, including views, materialized - views, indexes, sequences and foreign tables.) - - - - - - - pg_ts_config_is_visible - - pg_ts_config_is_visible ( config oid ) - boolean - - - Is text search configuration visible in search path? - - - - - - - pg_ts_dict_is_visible - - pg_ts_dict_is_visible ( dict oid ) - boolean - - - Is text search dictionary visible in search path? - - - - - - - pg_ts_parser_is_visible - - pg_ts_parser_is_visible ( parser oid ) - boolean - - - Is text search parser visible in search path? - - - - - - - pg_ts_template_is_visible - - pg_ts_template_is_visible ( template oid ) - boolean - - - Is text search template visible in search path? - - - - - - - pg_type_is_visible - - pg_type_is_visible ( type oid ) - boolean - - - Is type (or domain) visible in search path? - - - - -
- - - All these functions require object OIDs to identify the object to be - checked. If you want to test an object by name, it is convenient to use - the OID alias types (regclass, regtype, - regprocedure, regoperator, regconfig, - or regdictionary), - for example: - -SELECT pg_type_is_visible('myschema.widget'::regtype); - - Note that it would not make much sense to test a non-schema-qualified - type name in this way — if the name can be recognized at all, it must be visible. - - -
- - - System Catalog Information Functions - - - lists functions that - extract information from the system catalogs. - - - - System Catalog Information Functions - - - - - Function - - - Description - - - - - - - - - format_type - - format_type ( type oid, typemod integer ) - text - - - Returns the SQL name for a data type that is identified by its type - OID and possibly a type modifier. Pass NULL for the type modifier if - no specific modifier is known. - - - - - - - pg_basetype - - pg_basetype ( regtype ) - regtype - - - Returns the OID of the base type of a domain identified by its - type OID. If the argument is the OID of a non-domain type, - returns the argument as-is. Returns NULL if the argument is - not a valid type OID. If there's a chain of domain dependencies, - it will recurse until finding the base type. - - - Assuming CREATE DOMAIN mytext AS text: - - - pg_basetype('mytext'::regtype) - text - - - - - - - pg_char_to_encoding - - pg_char_to_encoding ( encoding name ) - integer - - - Converts the supplied encoding name into an integer representing the - internal identifier used in some system catalog tables. - Returns -1 if an unknown encoding name is provided. - - - - - - - pg_encoding_to_char - - pg_encoding_to_char ( encoding integer ) - name - - - Converts the integer used as the internal identifier of an encoding in some - system catalog tables into a human-readable string. - Returns an empty string if an invalid encoding number is provided. - - - - - - - pg_get_catalog_foreign_keys - - pg_get_catalog_foreign_keys () - setof record - ( fktable regclass, - fkcols text[], - pktable regclass, - pkcols text[], - is_array boolean, - is_opt boolean ) - - - Returns a set of records describing the foreign key relationships - that exist within the PostgreSQL system - catalogs. - The fktable column contains the name of the - referencing catalog, and the fkcols column - contains the name(s) of the referencing column(s). Similarly, - the pktable column contains the name of the - referenced catalog, and the pkcols column - contains the name(s) of the referenced column(s). - If is_array is true, the last referencing - column is an array, each of whose elements should match some entry - in the referenced catalog. - If is_opt is true, the referencing column(s) - are allowed to contain zeroes instead of a valid reference. - - - - - - - pg_get_constraintdef - - pg_get_constraintdef ( constraint oid , pretty boolean ) - text - - - Reconstructs the creating command for a constraint. - (This is a decompiled reconstruction, not the original text - of the command.) - - - - - - - pg_get_expr - - pg_get_expr ( expr pg_node_tree, relation oid , pretty boolean ) - text - - - Decompiles the internal form of an expression stored in the system - catalogs, such as the default value for a column. If the expression - might contain Vars, specify the OID of the relation they refer to as - the second parameter; if no Vars are expected, passing zero is - sufficient. - - - - - - - pg_get_functiondef - - pg_get_functiondef ( func oid ) - text - - - Reconstructs the creating command for a function or procedure. - (This is a decompiled reconstruction, not the original text - of the command.) - The result is a complete CREATE OR REPLACE FUNCTION - or CREATE OR REPLACE PROCEDURE statement. 
- - - - - - - pg_get_function_arguments - - pg_get_function_arguments ( func oid ) - text - - - Reconstructs the argument list of a function or procedure, in the form - it would need to appear in within CREATE FUNCTION - (including default values). - - - - - - - pg_get_function_identity_arguments - - pg_get_function_identity_arguments ( func oid ) - text - - - Reconstructs the argument list necessary to identify a function or - procedure, in the form it would need to appear in within commands such - as ALTER FUNCTION. This form omits default values. - - - - - - - pg_get_function_result - - pg_get_function_result ( func oid ) - text - - - Reconstructs the RETURNS clause of a function, in - the form it would need to appear in within CREATE - FUNCTION. Returns NULL for a procedure. - - - - - - - pg_get_indexdef - - pg_get_indexdef ( index oid , column integer, pretty boolean ) - text - - - Reconstructs the creating command for an index. - (This is a decompiled reconstruction, not the original text - of the command.) If column is supplied and is - not zero, only the definition of that column is reconstructed. - - - - - - - pg_get_keywords - - pg_get_keywords () - setof record - ( word text, - catcode "char", - barelabel boolean, - catdesc text, - baredesc text ) - - - Returns a set of records describing the SQL keywords recognized by the - server. The word column contains the - keyword. The catcode column contains a - category code: U for an unreserved - keyword, C for a keyword that can be a column - name, T for a keyword that can be a type or - function name, or R for a fully reserved keyword. - The barelabel column - contains true if the keyword can be used as - a bare column label in SELECT lists, - or false if it can only be used - after AS. - The catdesc column contains a - possibly-localized string describing the keyword's category. - The baredesc column contains a - possibly-localized string describing the keyword's column label status. - - - - - - - pg_get_partkeydef - - pg_get_partkeydef ( table oid ) - text - - - Reconstructs the definition of a partitioned table's partition - key, in the form it would have in the PARTITION - BY clause of CREATE TABLE. - (This is a decompiled reconstruction, not the original text - of the command.) - - - - - - - pg_get_ruledef - - pg_get_ruledef ( rule oid , pretty boolean ) - text - - - Reconstructs the creating command for a rule. - (This is a decompiled reconstruction, not the original text - of the command.) - - - - - - - pg_get_serial_sequence - - pg_get_serial_sequence ( table text, column text ) - text - - - Returns the name of the sequence associated with a column, - or NULL if no sequence is associated with the column. - If the column is an identity column, the associated sequence is the - sequence internally created for that column. - For columns created using one of the serial types - (serial, smallserial, bigserial), - it is the sequence created for that serial column definition. - In the latter case, the association can be modified or removed - with ALTER SEQUENCE OWNED BY. - (This function probably should have been - called pg_get_owned_sequence; its current name - reflects the fact that it has historically been used with serial-type - columns.) The first parameter is a table name with optional - schema, and the second parameter is a column name. Because the first - parameter potentially contains both schema and table names, it is - parsed per usual SQL rules, meaning it is lower-cased by default. 
- The second parameter, being just a column name, is treated literally - and so has its case preserved. The result is suitably formatted - for passing to the sequence functions (see - ). - - - A typical use is in reading the current value of the sequence for an - identity or serial column, for example: - -SELECT currval(pg_get_serial_sequence('sometable', 'id')); - - - - - - - - pg_get_statisticsobjdef - - pg_get_statisticsobjdef ( statobj oid ) - text - - - Reconstructs the creating command for an extended statistics object. - (This is a decompiled reconstruction, not the original text - of the command.) - - - - - - - pg_get_triggerdef - -pg_get_triggerdef ( trigger oid , pretty boolean ) - text - - - Reconstructs the creating command for a trigger. - (This is a decompiled reconstruction, not the original text - of the command.) - - - - - - - pg_get_userbyid - - pg_get_userbyid ( role oid ) - name - - - Returns a role's name given its OID. - - - - - - - pg_get_viewdef - - pg_get_viewdef ( view oid , pretty boolean ) - text - - - Reconstructs the underlying SELECT command for a - view or materialized view. (This is a decompiled reconstruction, not - the original text of the command.) - - - - - - pg_get_viewdef ( view oid, wrap_column integer ) - text - - - Reconstructs the underlying SELECT command for a - view or materialized view. (This is a decompiled reconstruction, not - the original text of the command.) In this form of the function, - pretty-printing is always enabled, and long lines are wrapped to try - to keep them shorter than the specified number of columns. - - - - - - pg_get_viewdef ( view text , pretty boolean ) - text - - - Reconstructs the underlying SELECT command for a - view or materialized view, working from a textual name for the view - rather than its OID. (This is deprecated; use the OID variant - instead.) - - - - - - - pg_index_column_has_property - - pg_index_column_has_property ( index regclass, column integer, property text ) - boolean - - - Tests whether an index column has the named property. - Common index column properties are listed in - . - (Note that extension access methods can define additional property - names for their indexes.) - NULL is returned if the property name is not known - or does not apply to the particular object, or if the OID or column - number does not identify a valid object. - - - - - - - pg_index_has_property - - pg_index_has_property ( index regclass, property text ) - boolean - - - Tests whether an index has the named property. - Common index properties are listed in - . - (Note that extension access methods can define additional property - names for their indexes.) - NULL is returned if the property name is not known - or does not apply to the particular object, or if the OID does not - identify a valid object. - - - - - - - pg_indexam_has_property - - pg_indexam_has_property ( am oid, property text ) - boolean - - - Tests whether an index access method has the named property. - Access method properties are listed in - . - NULL is returned if the property name is not known - or does not apply to the particular object, or if the OID does not - identify a valid object. - - - - - - - pg_options_to_table - - pg_options_to_table ( options_array text[] ) - setof record - ( option_name text, - option_value text ) - - - Returns the set of storage options represented by a value from - pg_class.reloptions or - pg_attribute.attoptions. 
- - - - - - - pg_settings_get_flags - - pg_settings_get_flags ( guc text ) - text[] - - - Returns an array of the flags associated with the given GUC, or - NULL if it does not exist. The result is - an empty array if the GUC exists but there are no flags to show. - Only the most useful flags listed in - are exposed. - - - - - - - pg_tablespace_databases - - pg_tablespace_databases ( tablespace oid ) - setof oid - - - Returns the set of OIDs of databases that have objects stored in the - specified tablespace. If this function returns any rows, the - tablespace is not empty and cannot be dropped. To identify the specific - objects populating the tablespace, you will need to connect to the - database(s) identified by pg_tablespace_databases - and query their pg_class catalogs. - - - - - - - pg_tablespace_location - - pg_tablespace_location ( tablespace oid ) - text - - - Returns the file system path that this tablespace is located in. - - - - - - - pg_typeof - - pg_typeof ( "any" ) - regtype - - - Returns the OID of the data type of the value that is passed to it. - This can be helpful for troubleshooting or dynamically constructing - SQL queries. The function is declared as - returning regtype, which is an OID alias type (see - ); this means that it is the same as an - OID for comparison purposes but displays as a type name. - - - pg_typeof(33) - integer - - - - - - - COLLATION FOR - - COLLATION FOR ( "any" ) - text - - - Returns the name of the collation of the value that is passed to it. - The value is quoted and schema-qualified if necessary. If no - collation was derived for the argument expression, - then NULL is returned. If the argument is not of a - collatable data type, then an error is raised. - - - collation for ('foo'::text) - "default" - - - collation for ('foo' COLLATE "de_DE") - "de_DE" - - - - - - - to_regclass - - to_regclass ( text ) - regclass - - - Translates a textual relation name to its OID. A similar result is - obtained by casting the string to type regclass (see - ); however, this function will return - NULL rather than throwing an error if the name is - not found. - - - - - - - to_regcollation - - to_regcollation ( text ) - regcollation - - - Translates a textual collation name to its OID. A similar result is - obtained by casting the string to type regcollation (see - ); however, this function will return - NULL rather than throwing an error if the name is - not found. - - - - - - - to_regnamespace - - to_regnamespace ( text ) - regnamespace - - - Translates a textual schema name to its OID. A similar result is - obtained by casting the string to type regnamespace (see - ); however, this function will return - NULL rather than throwing an error if the name is - not found. - - - - - - - to_regoper - - to_regoper ( text ) - regoper - - - Translates a textual operator name to its OID. A similar result is - obtained by casting the string to type regoper (see - ); however, this function will return - NULL rather than throwing an error if the name is - not found or is ambiguous. - - - - - - - to_regoperator - - to_regoperator ( text ) - regoperator - - - Translates a textual operator name (with parameter types) to its OID. A similar result is - obtained by casting the string to type regoperator (see - ); however, this function will return - NULL rather than throwing an error if the name is - not found. - - - - - - - to_regproc - - to_regproc ( text ) - regproc - - - Translates a textual function or procedure name to its OID. 
A similar result is - obtained by casting the string to type regproc (see - ); however, this function will return - NULL rather than throwing an error if the name is - not found or is ambiguous. - - - - - - - to_regprocedure - - to_regprocedure ( text ) - regprocedure - - - Translates a textual function or procedure name (with argument types) to its OID. A similar result is - obtained by casting the string to type regprocedure (see - ); however, this function will return - NULL rather than throwing an error if the name is - not found. - - - - - - - to_regrole - - to_regrole ( text ) - regrole - - - Translates a textual role name to its OID. A similar result is - obtained by casting the string to type regrole (see - ); however, this function will return - NULL rather than throwing an error if the name is - not found. - - - - - - - to_regtype - - to_regtype ( text ) - regtype - - - Parses a string of text, extracts a potential type name from it, - and translates that name into a type OID. A syntax error in the - string will result in an error; but if the string is a - syntactically valid type name that happens not to be found in the - catalogs, the result is NULL. A similar result - is obtained by casting the string to type regtype - (see ), except that the cast will throw an - error if the name is not found. - - - - - - - to_regtypemod - - to_regtypemod ( text ) - integer - - - Parses a string of text, extracts a potential type name from it, - and translates its type modifier, if any. A syntax error in the - string will result in an error; but if the string is a - syntactically valid type name that happens not to be found in the - catalogs, the result is NULL. The result is - -1 if no type modifier is present. - - - to_regtypemod can be combined with - to produce appropriate inputs for - , allowing a string representing a - type name to be canonicalized. - - - format_type(to_regtype('varchar(32)'), to_regtypemod('varchar(32)')) - character varying(32) - - - - -
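The practical difference between these functions and a plain cast is easiest to see with a name that does not exist; the table name below is hypothetical:

SELECT to_regclass('no_such_table');   -- returns NULL
SELECT 'no_such_table'::regclass;      -- raises an error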
- - - Most of the functions that reconstruct (decompile) database objects - have an optional pretty flag, which - if true causes the result to - be pretty-printed. Pretty-printing suppresses unnecessary - parentheses and adds whitespace for legibility. - The pretty-printed format is more readable, but the default format - is more likely to be interpreted the same way by future versions of - PostgreSQL; so avoid using pretty-printed output - for dump purposes. Passing false for - the pretty parameter yields the same result as - omitting the parameter. - - - - Index Column Properties - - - NameDescription - - - - asc - Does the column sort in ascending order on a forward scan? - - - - desc - Does the column sort in descending order on a forward scan? - - - - nulls_first - Does the column sort with nulls first on a forward scan? - - - - nulls_last - Does the column sort with nulls last on a forward scan? - - - - orderable - Does the column possess any defined sort ordering? - - - - distance_orderable - Can the column be scanned in order by a distance - operator, for example ORDER BY col <-> constant ? - - - - returnable - Can the column value be returned by an index-only scan? - - - - search_array - Does the column natively support col = ANY(array) - searches? - - - - search_nulls - Does the column support IS NULL and - IS NOT NULL searches? - - - - -
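For example, the first column of the system index pg_class_oid_index is a btree column that sorts in ascending order on a forward scan:

SELECT pg_index_column_has_property('pg_class_oid_index'::regclass, 1, 'asc');
 pg_index_column_has_property
------------------------------
 t
(1 row)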
- - - Index Properties - - - NameDescription - - - - clusterable - Can the index be used in a CLUSTER command? - - - - index_scan - Does the index support plain (non-bitmap) scans? - - - - bitmap_scan - Does the index support bitmap scans? - - - - backward_scan - Can the scan direction be changed in mid-scan (to - support FETCH BACKWARD on a cursor without - needing materialization)? - - - - -
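For example, btree indexes support plain index scans, which can be confirmed for the same system index:

SELECT pg_index_has_property('pg_class_oid_index'::regclass, 'index_scan');
 pg_index_has_property
------------------------
 t
(1 row)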
- - - Index Access Method Properties - - - NameDescription - - - - can_order - Does the access method support ASC, - DESC and related keywords in - CREATE INDEX? - - - - can_unique - Does the access method support unique indexes? - - - - can_multi_col - Does the access method support indexes with multiple columns? - - - - can_exclude - Does the access method support exclusion constraints? - - - - can_include - Does the access method support the INCLUDE - clause of CREATE INDEX? - - - - -
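For example, the following query reports, for each installed index access method, whether it supports ordered output; the exact rows returned depend on which access methods are installed:

SELECT a.amname, pg_indexam_has_property(a.oid, 'can_order') AS can_order
FROM pg_am a
WHERE a.amtype = 'i'
ORDER BY a.amname;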
- - - GUC Flags - - - FlagDescription - - - - EXPLAIN - Parameters with this flag are included in - EXPLAIN (SETTINGS) commands. - - - - NO_SHOW_ALL - Parameters with this flag are excluded from - SHOW ALL commands. - - - - NO_RESET - Parameters with this flag do not support - RESET commands. - - - - NO_RESET_ALL - Parameters with this flag are excluded from - RESET ALL commands. - - - - NOT_IN_SAMPLE - Parameters with this flag are not included in - postgresql.conf by default. - - - - RUNTIME_COMPUTED - Parameters with this flag are runtime-computed ones. - - - - -
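For example, work_mem influences query plans and is therefore marked with the EXPLAIN flag:

SELECT pg_settings_get_flags('work_mem');
 pg_settings_get_flags
-----------------------
 {EXPLAIN}
(1 row)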
- -
- - - Object Information and Addressing Functions - - - lists functions related to - database object identification and addressing. - - - - Object Information and Addressing Functions - - - - - Function - - - Description - - - - - - - - - pg_get_acl - - pg_get_acl ( classid oid, objid oid, objsubid integer ) - aclitem[] - - - Returns the ACL for a database object, specified - by catalog OID, object OID and sub-object ID. This function returns - NULL values for undefined objects. - - - - - - - pg_describe_object - - pg_describe_object ( classid oid, objid oid, objsubid integer ) - text - - - Returns a textual description of a database object identified by - catalog OID, object OID, and sub-object ID (such as a column number - within a table; the sub-object ID is zero when referring to a whole - object). This description is intended to be human-readable, and might - be translated, depending on server configuration. This is especially - useful to determine the identity of an object referenced in the - pg_depend catalog. This function returns - NULL values for undefined objects. - - - - - - - pg_identify_object - - pg_identify_object ( classid oid, objid oid, objsubid integer ) - record - ( type text, - schema text, - name text, - identity text ) - - - Returns a row containing enough information to uniquely identify the - database object specified by catalog OID, object OID and sub-object - ID. - This information is intended to be machine-readable, and is never - translated. - type identifies the type of database object; - schema is the schema name that the object - belongs in, or NULL for object types that do not - belong to schemas; - name is the name of the object, quoted if - necessary, if the name (along with schema name, if pertinent) is - sufficient to uniquely identify the object, - otherwise NULL; - identity is the complete object identity, with - the precise format depending on object type, and each name within the - format being schema-qualified and quoted as necessary. Undefined - objects are identified with NULL values. - - - - - - - pg_identify_object_as_address - - pg_identify_object_as_address ( classid oid, objid oid, objsubid integer ) - record - ( type text, - object_names text[], - object_args text[] ) - - - Returns a row containing enough information to uniquely identify the - database object specified by catalog OID, object OID and sub-object - ID. - The returned information is independent of the current server, that - is, it could be used to identify an identically named object in - another server. - type identifies the type of database object; - object_names and - object_args - are text arrays that together form a reference to the object. - These three values can be passed - to pg_get_object_address to obtain the internal - address of the object. - - - - - - - pg_get_object_address - - pg_get_object_address ( type text, object_names text[], object_args text[] ) - record - ( classid oid, - objid oid, - objsubid integer ) - - - Returns a row containing enough information to uniquely identify the - database object specified by a type code and object name and argument - arrays. - The returned values are the ones that would be used in system catalogs - such as pg_depend; they can be passed to - other system functions such as pg_describe_object - or pg_identify_object. - classid is the OID of the system catalog - containing the object; - objid is the OID of the object itself, and - objsubid is the sub-object ID, or zero if none. 
- This function is the inverse - of pg_identify_object_as_address. - Undefined objects are identified with NULL values. - - - - -
- - - pg_get_acl is useful for retrieving and inspecting - the privileges associated with database objects without looking at - specific catalogs. For example, to retrieve all the granted privileges - on objects in the current database: - -postgres=# SELECT - (pg_identify_object(s.classid,s.objid,s.objsubid)).*, - pg_catalog.pg_get_acl(s.classid,s.objid,s.objsubid) AS acl -FROM pg_catalog.pg_shdepend AS s -JOIN pg_catalog.pg_database AS d - ON d.datname = current_database() AND - d.oid = s.dbid -JOIN pg_catalog.pg_authid AS a - ON a.oid = s.refobjid AND - s.refclassid = 'pg_authid'::regclass -WHERE s.deptype = 'a'; --[ RECORD 1 ]----------------------------------------- -type | table -schema | public -name | testtab -identity | public.testtab -acl | {postgres=arwdDxtm/postgres,foo=r/postgres} - - - -
- - - Comment Information Functions - - - comment - about database objects - - - - The functions shown in - extract comments previously stored with the - command. A null value is returned if no - comment could be found for the specified parameters. - - - - Comment Information Functions - - - - - Function - - - Description - - - - - - - - - col_description - - col_description ( table oid, column integer ) - text - - - Returns the comment for a table column, which is specified by the OID - of its table and its column number. - (obj_description cannot be used for table - columns, since columns do not have OIDs of their own.) - - - - - - - obj_description - - obj_description ( object oid, catalog name ) - text - - - Returns the comment for a database object specified by its OID and the - name of the containing system catalog. For - example, obj_description(123456, 'pg_class') would - retrieve the comment for the table with OID 123456. - - - - - - obj_description ( object oid ) - text - - - Returns the comment for a database object specified by its OID alone. - This is deprecated since there is no guarantee - that OIDs are unique across different system catalogs; therefore, the - wrong comment might be returned. - - - - - - - shobj_description - - shobj_description ( object oid, catalog name ) - text - - - Returns the comment for a shared database object specified by its OID - and the name of the containing system catalog. This is just - like obj_description except that it is used for - retrieving comments on shared objects (that is, databases, roles, and - tablespaces). Some system catalogs are global to all databases within - each cluster, and the descriptions for objects in them are stored - globally as well. - - - - -
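For example, assuming a table mytable (hypothetical) whose first column is id, a column comment can be stored and read back as follows:

COMMENT ON COLUMN mytable.id IS 'primary key';
SELECT col_description('mytable'::regclass, 1);
 col_description
-----------------
 primary key
(1 row)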
- -
- - - Data Validity Checking Functions - - - The functions shown in - can be helpful for checking validity of proposed input data. - - - - Data Validity Checking Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - pg_input_is_valid - - pg_input_is_valid ( - string text, - type text - ) - boolean - - - Tests whether the given string is valid - input for the specified data type, returning true or false. - - - This function will only work as desired if the data type's input - function has been updated to report invalid input as - a soft error. Otherwise, invalid input will abort - the transaction, just as if the string had been cast to the type - directly. - - - pg_input_is_valid('42', 'integer') - t - - - pg_input_is_valid('42000000000', 'integer') - f - - - pg_input_is_valid('1234.567', 'numeric(7,4)') - f - - - - - - pg_input_error_info - - pg_input_error_info ( - string text, - type text - ) - record - ( message text, - detail text, - hint text, - sql_error_code text ) - - - Tests whether the given string is valid - input for the specified data type; if not, returns the details of - the error that would have been thrown. If the input is valid, the - results are NULL. The inputs are the same as - for pg_input_is_valid. - - - This function will only work as desired if the data type's input - function has been updated to report invalid input as - a soft error. Otherwise, invalid input will abort - the transaction, just as if the string had been cast to the type - directly. - - - SELECT * FROM pg_input_error_info('42000000000', 'integer') - - - message | detail | hint | sql_error_code -------------------------------------------------------+--------+------+---------------- - value "42000000000" is out of range for type integer | | | 22003 - - - - - -
- -
- - - Transaction ID and Snapshot Information Functions - - - The functions shown in - provide server transaction information in an exportable form. The main - use of these functions is to determine which transactions were committed - between two snapshots. - - - - Transaction ID and Snapshot Information Functions - - - - - Function - - - Description - - - - - - - - - age - - age ( xid ) - integer - - - Returns the number of transactions between the supplied - transaction ID and the current transaction counter. - - - - - - - mxid_age - - mxid_age ( xid ) - integer - - - Returns the number of multixact IDs between the supplied - multixact ID and the current multixact counter. - - - - - - - pg_current_xact_id - - pg_current_xact_id () - xid8 - - - Returns the current transaction's ID. It will assign a new one if the - current transaction does not have one already (because it has not - performed any database updates); see for details. If executed in a - subtransaction, this will return the top-level transaction ID; - see for details. - - - - - - - pg_current_xact_id_if_assigned - - pg_current_xact_id_if_assigned () - xid8 - - - Returns the current transaction's ID, or NULL if no - ID is assigned yet. (It's best to use this variant if the transaction - might otherwise be read-only, to avoid unnecessary consumption of an - XID.) - If executed in a subtransaction, this will return the top-level - transaction ID. - - - - - - - pg_xact_status - - pg_xact_status ( xid8 ) - text - - - Reports the commit status of a recent transaction. - The result is one of in progress, - committed, or aborted, - provided that the transaction is recent enough that the system retains - the commit status of that transaction. - If it is old enough that no references to the transaction survive in - the system and the commit status information has been discarded, the - result is NULL. - Applications might use this function, for example, to determine - whether their transaction committed or aborted after the application - and database server become disconnected while - a COMMIT is in progress. - Note that prepared transactions are reported as in - progress; applications must check pg_prepared_xacts - if they need to determine whether a transaction ID belongs to a - prepared transaction. - - - - - - - pg_current_snapshot - - pg_current_snapshot () - pg_snapshot - - - Returns a current snapshot, a data structure - showing which transaction IDs are now in-progress. - Only top-level transaction IDs are included in the snapshot; - subtransaction IDs are not shown; see - for details. - - - - - - - pg_snapshot_xip - - pg_snapshot_xip ( pg_snapshot ) - setof xid8 - - - Returns the set of in-progress transaction IDs contained in a snapshot. - - - - - - - pg_snapshot_xmax - - pg_snapshot_xmax ( pg_snapshot ) - xid8 - - - Returns the xmax of a snapshot. - - - - - - - pg_snapshot_xmin - - pg_snapshot_xmin ( pg_snapshot ) - xid8 - - - Returns the xmin of a snapshot. - - - - - - - pg_visible_in_snapshot - - pg_visible_in_snapshot ( xid8, pg_snapshot ) - boolean - - - Is the given transaction ID visible according - to this snapshot (that is, was it completed before the snapshot was - taken)? Note that this function will not give the correct answer for - a subtransaction ID (subxid); see for - details. - - - - -
- - - The internal transaction ID type xid is 32 bits wide and - wraps around every 4 billion transactions. However, - the functions shown in , except - age and mxid_age, use a - 64-bit type xid8 that does not wrap around during the life - of an installation and can be converted to xid by casting if - required; see for details. - The data type pg_snapshot stores information about - transaction ID visibility at a particular moment in time. Its components - are described in . - pg_snapshot's textual representation is - xmin:xmax:xip_list. - For example 10:20:10,14,15 means - xmin=10, xmax=20, xip_list=10, 14, 15. - - - - Snapshot Components - - - - Name - Description - - - - - - xmin - - Lowest transaction ID that was still active. All transaction IDs - less than xmin are either committed and visible, - or rolled back and dead. - - - - - xmax - - One past the highest completed transaction ID. All transaction IDs - greater than or equal to xmax had not yet - completed as of the time of the snapshot, and thus are invisible. - - - - - xip_list - - Transactions in progress at the time of the snapshot. A transaction - ID that is xmin <= X < - xmax and not in this list was already completed at the time - of the snapshot, and thus is either visible or dead according to its - commit status. This list does not include the transaction IDs of - subtransactions (subxids). - - - - -
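For example, the boundaries of the current snapshot can be extracted directly; the values returned depend entirely on concurrent activity, so no sample output is shown:

SELECT pg_snapshot_xmin(pg_current_snapshot()) AS xmin,
       pg_snapshot_xmax(pg_current_snapshot()) AS xmax;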
- - - In releases of PostgreSQL before 13 there was - no xid8 type, so variants of these functions were provided - that used bigint to represent a 64-bit XID, with a - correspondingly distinct snapshot data type txid_snapshot. - These older functions have txid in their names. They - are still supported for backward compatibility, but may be removed from a - future release. See . - - - - Deprecated Transaction ID and Snapshot Information Functions - - - - - Function - - - Description - - - - - - - - - - txid_current - - txid_current () - bigint - - - See pg_current_xact_id(). - - - - - - - txid_current_if_assigned - - txid_current_if_assigned () - bigint - - - See pg_current_xact_id_if_assigned(). - - - - - - - txid_current_snapshot - - txid_current_snapshot () - txid_snapshot - - - See pg_current_snapshot(). - - - - - - - txid_snapshot_xip - - txid_snapshot_xip ( txid_snapshot ) - setof bigint - - - See pg_snapshot_xip(). - - - - - - - txid_snapshot_xmax - - txid_snapshot_xmax ( txid_snapshot ) - bigint - - - See pg_snapshot_xmax(). - - - - - - - txid_snapshot_xmin - - txid_snapshot_xmin ( txid_snapshot ) - bigint - - - See pg_snapshot_xmin(). - - - - - - - txid_visible_in_snapshot - - txid_visible_in_snapshot ( bigint, txid_snapshot ) - boolean - - - See pg_visible_in_snapshot(). - - - - - - - txid_status - - txid_status ( bigint ) - text - - - See pg_xact_status(). - - - - -
- -
- - - Committed Transaction Information Functions - - - The functions shown in - provide information about when past transactions were committed. - They only provide useful data when the - configuration option is - enabled, and only for transactions that were committed after it was - enabled. Commit timestamp information is routinely removed during - vacuum. - - - - Committed Transaction Information Functions - - - - - Function - - - Description - - - - - - - - - pg_xact_commit_timestamp - - pg_xact_commit_timestamp ( xid ) - timestamp with time zone - - - Returns the commit timestamp of a transaction. - - - - - - - pg_xact_commit_timestamp_origin - - pg_xact_commit_timestamp_origin ( xid ) - record - ( timestamp timestamp with time zone, - roident oid) - - - Returns the commit timestamp and replication origin of a transaction. - - - - - - - pg_last_committed_xact - - pg_last_committed_xact () - record - ( xid xid, - timestamp timestamp with time zone, - roident oid ) - - - Returns the transaction ID, commit timestamp and replication origin - of the latest committed transaction. - - - - -
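For example, with track_commit_timestamp enabled, the commit time of the transaction that last modified a row can be looked up through the row's xmin system column (mytable is a hypothetical table):

SELECT pg_xact_commit_timestamp(xmin) FROM mytable WHERE id = 1;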
- -
- - - Control Data Functions - - - The functions shown in - print information initialized during initdb, such - as the catalog version. They also show information about write-ahead - logging and checkpoint processing. This information is cluster-wide, - not specific to any one database. These functions provide most of the same - information, from the same source, as the - application. - - - - Control Data Functions - - - - - Function - - - Description - - - - - - - - - pg_control_checkpoint - - pg_control_checkpoint () - record - - - Returns information about current checkpoint state, as shown in - . - - - - - - - pg_control_system - - pg_control_system () - record - - - Returns information about current control file state, as shown in - . - - - - - - - pg_control_init - - pg_control_init () - record - - - Returns information about cluster initialization state, as shown in - . - - - - - - - pg_control_recovery - - pg_control_recovery () - record - - - Returns information about recovery state, as shown in - . - - - - -
- - - <function>pg_control_checkpoint</function> Output Columns - - - - Column Name - Data Type - - - - - - - checkpoint_lsn - pg_lsn - - - - redo_lsn - pg_lsn - - - - redo_wal_file - text - - - - timeline_id - integer - - - - prev_timeline_id - integer - - - - full_page_writes - boolean - - - - next_xid - text - - - - next_oid - oid - - - - next_multixact_id - xid - - - - next_multi_offset - xid - - - - oldest_xid - xid - - - - oldest_xid_dbid - oid - - - - oldest_active_xid - xid - - - - oldest_multi_xid - xid - - - - oldest_multi_dbid - oid - - - - oldest_commit_ts_xid - xid - - - - newest_commit_ts_xid - xid - - - - checkpoint_time - timestamp with time zone - - - - -
- - - <function>pg_control_system</function> Output Columns - - - - Column Name - Data Type - - - - - - - pg_control_version - integer - - - - catalog_version_no - integer - - - - system_identifier - bigint - - - - pg_control_last_modified - timestamp with time zone - - - - -
- - - <function>pg_control_init</function> Output Columns - - - - Column Name - Data Type - - - - - - - max_data_alignment - integer - - - - database_block_size - integer - - - - blocks_per_segment - integer - - - - wal_block_size - integer - - - - bytes_per_wal_segment - integer - - - - max_identifier_length - integer - - - - max_index_columns - integer - - - - max_toast_chunk_size - integer - - - - large_object_chunk_size - integer - - - - float8_pass_by_value - boolean - - - - data_page_checksum_version - integer - - - - default_char_signedness - boolean - - - - -
- - - <function>pg_control_recovery</function> Output Columns - - - - Column Name - Data Type - - - - - - - min_recovery_end_lsn - pg_lsn - - - - min_recovery_end_timeline - integer - - - - backup_start_lsn - pg_lsn - - - - backup_end_lsn - pg_lsn - - - - end_of_backup_record_required - boolean - - - - -
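For example, the location and time of the last checkpoint can be read without running pg_controldata:

SELECT checkpoint_lsn, redo_lsn, checkpoint_time FROM pg_control_checkpoint();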
- -
- - - Version Information Functions - - - The functions shown in - print version information. - - - - Version Information Functions - - - - - Function - - - Description - - - - - - - - - version - - version () - text - - - Returns a string describing the PostgreSQL - server's version. You can also get this information from - , or for a machine-readable - version use . Software - developers should use server_version_num (available - since 8.2) or instead of - parsing the text version. - - - - - - - unicode_version - - unicode_version () - text - - - Returns a string representing the version of Unicode used by - PostgreSQL. - - - - - - icu_unicode_version - - icu_unicode_version () - text - - - Returns a string representing the version of Unicode used by ICU, if - the server was built with ICU support; otherwise returns - NULL. - - - -
- -
- - - WAL Summarization Information Functions - - - The functions shown in - print information about the status of WAL summarization. - See . - - - - WAL Summarization Information Functions - - - - - Function - - - Description - - - - - - - - - pg_available_wal_summaries - - pg_available_wal_summaries () - setof record - ( tli bigint, - start_lsn pg_lsn, - end_lsn pg_lsn ) - - - Returns information about the WAL summary files present in the - data directory, under pg_wal/summaries. - One row will be returned per WAL summary file. Each file summarizes - WAL on the indicated TLI within the indicated LSN range. This function - might be useful to determine whether enough WAL summaries are present - on the server to take an incremental backup based on some prior - backup whose start LSN is known. - - - - - - - pg_wal_summary_contents - - pg_wal_summary_contents ( tli bigint, start_lsn pg_lsn, end_lsn pg_lsn ) - setof record - ( relfilenode oid, - reltablespace oid, - reldatabase oid, - relforknumber smallint, - relblocknumber bigint, - is_limit_block boolean ) - - - Returns information about the contents of a single WAL summary file - identified by TLI and starting and ending LSNs. Each row with - is_limit_block false indicates that the block - identified by the remaining output columns was modified by at least - one WAL record within the range of records summarized by this file. - Each row with is_limit_block true indicates either - that (a) the relation fork was truncated to the length given by - relblocknumber within the relevant range of WAL - records or (b) that the relation fork was created or dropped within - the relevant range of WAL records; in such cases, - relblocknumber will be zero. - - - - - - - pg_get_wal_summarizer_state - - pg_get_wal_summarizer_state () - record - ( summarized_tli bigint, - summarized_lsn pg_lsn, - pending_lsn pg_lsn, - summarizer_pid int ) - - - Returns information about the progress of the WAL summarizer. If the - WAL summarizer has never run since the instance was started, then - summarized_tli and summarized_lsn - will be 0 and 0/0 respectively; - otherwise, they will be the TLI and ending LSN of the last WAL summary - file written to disk. If the WAL summarizer is currently running, - pending_lsn will be the ending LSN of the last - record that it has consumed, which must always be greater than or - equal to summarized_lsn; if the WAL summarizer is - not running, it will be equal to summarized_lsn. - summarizer_pid is the PID of the WAL summarizer - process, if it is running, and otherwise NULL. - - - As a special exception, the WAL summarizer will refuse to generate - WAL summary files if run on WAL generated under - wal_level=minimal, since such summaries would be - unsafe to use as the basis for an incremental backup. In this case, - the fields above will continue to advance as if summaries were being - generated, but nothing will be written to disk. Once the summarizer - reaches WAL generated while wal_level was set - to replica or higher, it will resume writing - summaries to disk. - - - - -
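For example, the most recent WAL summary can be located as follows; this assumes summarize_wal is enabled, since otherwise no summary files exist:

SELECT * FROM pg_available_wal_summaries()
ORDER BY tli DESC, end_lsn DESC
LIMIT 1;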
- -
- -
- - - System Administration Functions - - - The functions described in this section are used to control and - monitor a PostgreSQL installation. - - - - Configuration Settings Functions - - - SET - - - - SHOW - - - - configuration - of the server - functions - - - - shows the functions - available to query and alter run-time configuration parameters. - - - - Configuration Settings Functions - - - - - Function - - - Description - - - Example(s) - - - - - - - - - current_setting - - current_setting ( setting_name text , missing_ok boolean ) - text - - - Returns the current value of the - setting setting_name. If there is no such - setting, current_setting throws an error - unless missing_ok is supplied and - is true (in which case NULL is returned). - This function corresponds to - the SQL command . - - - current_setting('datestyle') - ISO, MDY - - - - - - - set_config - - set_config ( - setting_name text, - new_value text, - is_local boolean ) - text - - - Sets the parameter setting_name - to new_value, and returns that value. - If is_local is true, the new - value will only apply during the current transaction. If you want the - new value to apply for the rest of the current session, - use false instead. This function corresponds to - the SQL command . - - - set_config accepts the NULL value for - new_value, but as settings cannot be null, it - is interpreted as a request to reset the setting to its default value. - - - set_config('log_statement_stats', 'off', false) - off - - - - -
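For example, passing true for is_local changes a setting only for the current transaction; once the transaction ends, the previous value is restored:

BEGIN;
SELECT set_config('maintenance_work_mem', '1GB', true);
SELECT current_setting('maintenance_work_mem');  -- 1GB
COMMIT;
SELECT current_setting('maintenance_work_mem');  -- previous value again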
- -
- - - Server Signaling Functions - - - signal - backend processes - - - - The functions shown in send control signals to - other server processes. Use of these functions is restricted to - superusers by default but access may be granted to others using - GRANT, with noted exceptions. - - - - Each of these functions returns true if - the signal was successfully sent and false - if sending the signal failed. - - - - Server Signaling Functions - - - - - Function - - - Description - - - - - - - - - pg_cancel_backend - - pg_cancel_backend ( pid integer ) - boolean - - - Cancels the current query of the session whose backend process has the - specified process ID. This is also allowed if the - calling role is a member of the role whose backend is being canceled or - the calling role has privileges of pg_signal_backend; - however, only superusers can cancel superuser backends. - As an exception, roles with privileges of - pg_signal_autovacuum_worker are permitted to - cancel autovacuum worker processes, which are otherwise considered - superuser backends. - - - - - - - pg_log_backend_memory_contexts - - pg_log_backend_memory_contexts ( pid integer ) - boolean - - - Requests to log the memory contexts of the backend with the - specified process ID. This function can send the request to - backends and auxiliary processes except logger. These memory contexts - will be logged at - LOG message level. They will appear in - the server log based on the log configuration set - (see for more information), - but will not be sent to the client regardless of - . - - - - - - - pg_reload_conf - - pg_reload_conf () - boolean - - - Causes all processes of the PostgreSQL - server to reload their configuration files. (This is initiated by - sending a SIGHUP signal to the postmaster - process, which in turn sends SIGHUP to each - of its children.) You can use the - pg_file_settings, - pg_hba_file_rules and - pg_ident_file_mappings views - to check the configuration files for possible errors, before reloading. - - - - - - - pg_rotate_logfile - - pg_rotate_logfile () - boolean - - - Signals the log-file manager to switch to a new output file - immediately. This works only when the built-in log collector is - running, since otherwise there is no log-file manager subprocess. - - - - - - - pg_terminate_backend - - pg_terminate_backend ( pid integer, timeout bigint DEFAULT 0 ) - boolean - - - Terminates the session whose backend process has the - specified process ID. This is also allowed if the calling role - is a member of the role whose backend is being terminated or the - calling role has privileges of pg_signal_backend; - however, only superusers can terminate superuser backends. - As an exception, roles with privileges of - pg_signal_autovacuum_worker are permitted to - terminate autovacuum worker processes, which are otherwise considered - superuser backends. - - - If timeout is not specified or zero, this - function returns true whether the process actually - terminates or not, indicating only that the sending of the signal was - successful. If the timeout is specified (in - milliseconds) and greater than zero, the function waits until the - process is actually terminated or until the given time has passed. If - the process is terminated, the function - returns true. On timeout, a warning is emitted and - false is returned. - - - - -
- - - pg_cancel_backend and pg_terminate_backend - send signals (SIGINT or SIGTERM - respectively) to backend processes identified by process ID. - The process ID of an active backend can be found from - the pid column of the - pg_stat_activity view, or by listing the - postgres processes on the server (using - ps on Unix or the Task - Manager on Windows). - The role of an active backend can be found from the - usename column of the - pg_stat_activity view. - - - - pg_log_backend_memory_contexts can be used - to log the memory contexts of a backend process. For example: - -postgres=# SELECT pg_log_backend_memory_contexts(pg_backend_pid()); - pg_log_backend_memory_contexts --------------------------------- - t -(1 row) - -One message for each memory context will be logged. For example: - -LOG: logging memory contexts of PID 10377 -STATEMENT: SELECT pg_log_backend_memory_contexts(pg_backend_pid()); -LOG: level: 1; TopMemoryContext: 80800 total in 6 blocks; 14432 free (5 chunks); 66368 used -LOG: level: 2; pgstat TabStatusArray lookup hash table: 8192 total in 1 blocks; 1408 free (0 chunks); 6784 used -LOG: level: 2; TopTransactionContext: 8192 total in 1 blocks; 7720 free (1 chunks); 472 used -LOG: level: 2; RowDescriptionContext: 8192 total in 1 blocks; 6880 free (0 chunks); 1312 used -LOG: level: 2; MessageContext: 16384 total in 2 blocks; 5152 free (0 chunks); 11232 used -LOG: level: 2; Operator class cache: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used -LOG: level: 2; smgr relation table: 16384 total in 2 blocks; 4544 free (3 chunks); 11840 used -LOG: level: 2; TransactionAbortContext: 32768 total in 1 blocks; 32504 free (0 chunks); 264 used -... -LOG: level: 2; ErrorContext: 8192 total in 1 blocks; 7928 free (3 chunks); 264 used -LOG: Grand total: 1651920 bytes in 201 blocks; 622360 free (88 chunks); 1029560 used - - If there are more than 100 child contexts under the same parent, the first - 100 child contexts are logged, along with a summary of the remaining contexts. - Note that frequent calls to this function could incur significant overhead, - because it may generate a large number of log messages. - - -
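As an illustration, the following query cancels every active query, other than the current session's own, that has been running for more than five minutes; the threshold is arbitrary:

SELECT pg_cancel_backend(pid)
FROM pg_stat_activity
WHERE state = 'active'
  AND query_start < now() - interval '5 minutes'
  AND pid <> pg_backend_pid();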
- - - Backup Control Functions - - - backup - - - - The functions shown in assist in making on-line backups. - These functions cannot be executed during recovery (except - pg_backup_start, - pg_backup_stop, - and pg_wal_lsn_diff). - - - - For details about proper usage of these functions, see - . - - - - Backup Control Functions - - - - - Function - - - Description - - - - - - - - - pg_create_restore_point - - pg_create_restore_point ( name text ) - pg_lsn - - - Creates a named marker record in the write-ahead log that can later be - used as a recovery target, and returns the corresponding write-ahead - log location. The given name can then be used with - to specify the point up to - which recovery will proceed. Avoid creating multiple restore points - with the same name, since recovery will stop at the first one whose - name matches the recovery target. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - - - - pg_current_wal_flush_lsn - - pg_current_wal_flush_lsn () - pg_lsn - - - Returns the current write-ahead log flush location (see notes below). - - - - - - - pg_current_wal_insert_lsn - - pg_current_wal_insert_lsn () - pg_lsn - - - Returns the current write-ahead log insert location (see notes below). - - - - - - - pg_current_wal_lsn - - pg_current_wal_lsn () - pg_lsn - - - Returns the current write-ahead log write location (see notes below). - - - - - - - pg_backup_start - - pg_backup_start ( - label text - , fast boolean - ) - pg_lsn - - - Prepares the server to begin an on-line backup. The only required - parameter is an arbitrary user-defined label for the backup. - (Typically this would be the name under which the backup dump file - will be stored.) - If the optional second parameter is given as true, - it specifies executing pg_backup_start as quickly - as possible. This forces an immediate checkpoint which will cause a - spike in I/O operations, slowing any concurrently executing queries. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - - - - pg_backup_stop - - pg_backup_stop ( - wait_for_archive boolean - ) - record - ( lsn pg_lsn, - labelfile text, - spcmapfile text ) - - - Finishes performing an on-line backup. The desired contents of the - backup label file and the tablespace map file are returned as part of - the result of the function and must be written to files in the - backup area. These files must not be written to the live data directory - (doing so will cause PostgreSQL to fail to restart in the event of a - crash). - - - There is an optional parameter of type boolean. - If false, the function will return immediately after the backup is - completed, without waiting for WAL to be archived. This behavior is - only useful with backup software that independently monitors WAL - archiving. Otherwise, WAL required to make the backup consistent might - be missing and make the backup useless. By default or when this - parameter is true, pg_backup_stop will wait for - WAL to be archived when archiving is enabled. (On a standby, this - means that it will wait only when archive_mode = - always. If write activity on the primary is low, - it may be useful to run pg_switch_wal on the - primary in order to trigger an immediate segment switch.) - - - When executed on a primary, this function also creates a backup - history file in the write-ahead log archive area. 
The history file - includes the label given to pg_backup_start, the - starting and ending write-ahead log locations for the backup, and the - starting and ending times of the backup. After recording the ending - location, the current write-ahead log insertion point is automatically - advanced to the next write-ahead log file, so that the ending - write-ahead log file can be archived immediately to complete the - backup. - - - The result of the function is a single record. - The lsn column holds the backup's ending - write-ahead log location (which again can be ignored). The second - column returns the contents of the backup label file, and the third - column returns the contents of the tablespace map file. These must be - stored as part of the backup and are required as part of the restore - process. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - - - - pg_switch_wal - - pg_switch_wal () - pg_lsn - - - Forces the server to switch to a new write-ahead log file, which - allows the current file to be archived (assuming you are using - continuous archiving). The result is the ending write-ahead log - location plus 1 within the just-completed write-ahead log file. If - there has been no write-ahead log activity since the last write-ahead - log switch, pg_switch_wal does nothing and - returns the start location of the write-ahead log file currently in - use. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - - - - pg_walfile_name - - pg_walfile_name ( lsn pg_lsn ) - text - - - Converts a write-ahead log location to the name of the WAL file - holding that location. - - - - - - - pg_walfile_name_offset - - pg_walfile_name_offset ( lsn pg_lsn ) - record - ( file_name text, - file_offset integer ) - - - Converts a write-ahead log location to a WAL file name and byte offset - within that file. - - - - - - - pg_split_walfile_name - - pg_split_walfile_name ( file_name text ) - record - ( segment_number numeric, - timeline_id bigint ) - - - Extracts the sequence number and timeline ID from a WAL file - name. - - - - - - - pg_wal_lsn_diff - - pg_wal_lsn_diff ( lsn1 pg_lsn, lsn2 pg_lsn ) - numeric - - - Calculates the difference in bytes (lsn1 - lsn2) between two write-ahead log - locations. This can be used - with pg_stat_replication or some of the - functions shown in to - get the replication lag. - - - - -
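A minimal sketch of the low-level backup sequence, assuming the actual file copy is performed by an external tool and that the label nightly is arbitrary:

SELECT pg_backup_start('nightly', false);
-- copy the cluster's data directory with an external tool
SELECT lsn, labelfile, spcmapfile FROM pg_backup_stop(true);
-- store labelfile and spcmapfile in the backup, never in the live data directory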
- - - pg_current_wal_lsn displays the current write-ahead - log write location in the same format used by the above functions. - Similarly, pg_current_wal_insert_lsn displays the - current write-ahead log insertion location - and pg_current_wal_flush_lsn displays the current - write-ahead log flush location. The insertion location is - the logical end of the write-ahead log at any instant, - while the write location is the end of what has actually been written out - from the server's internal buffers, and the flush location is the last - location known to be written to durable storage. The write location is the - end of what can be examined from outside the server, and is usually what - you want if you are interested in archiving partially-complete write-ahead - log files. The insertion and flush locations are made available primarily - for server debugging purposes. These are all read-only operations and do - not require superuser permissions. - - - - You can use pg_walfile_name_offset to extract the - corresponding write-ahead log file name and byte offset from - a pg_lsn value. For example: - -postgres=# SELECT * FROM pg_walfile_name_offset((pg_backup_stop()).lsn); - file_name | file_offset ---------------------------+------------- - 00000001000000000000000D | 4039624 -(1 row) - - Similarly, pg_walfile_name extracts just the write-ahead log file name. - - - - pg_split_walfile_name is useful to compute an - LSN from a file offset and WAL file name, for example: - -postgres=# \set file_name '000000010000000100C000AB' -postgres=# \set offset 256 -postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset AS lsn - FROM pg_split_walfile_name(:'file_name') pd, - pg_show_all_settings() ps - WHERE ps.name = 'wal_segment_size'; - lsn ---------------- - C001/AB000100 -(1 row) - - - -
- - - Recovery Control Functions - - - The functions shown in provide information - about the current status of a standby server. - These functions may be executed both during recovery and in normal running. - - - - Recovery Information Functions - - - - - Function - - - Description - - - - - - - - - pg_is_in_recovery - - pg_is_in_recovery () - boolean - - - Returns true if recovery is still in progress. - - - - - - - pg_last_wal_receive_lsn - - pg_last_wal_receive_lsn () - pg_lsn - - - Returns the last write-ahead log location that has been received and - synced to disk by streaming replication. While streaming replication - is in progress this will increase monotonically. If recovery has - completed then this will remain static at the location of the last WAL - record received and synced to disk during recovery. If streaming - replication is disabled, or if it has not yet started, the function - returns NULL. - - - - - - - pg_last_wal_replay_lsn - - pg_last_wal_replay_lsn () - pg_lsn - - - Returns the last write-ahead log location that has been replayed - during recovery. If recovery is still in progress this will increase - monotonically. If recovery has completed then this will remain - static at the location of the last WAL record applied during recovery. - When the server has been started normally without recovery, the - function returns NULL. - - - - - - - pg_last_xact_replay_timestamp - - pg_last_xact_replay_timestamp () - timestamp with time zone - - - Returns the time stamp of the last transaction replayed during - recovery. This is the time at which the commit or abort WAL record - for that transaction was generated on the primary. If no transactions - have been replayed during recovery, the function - returns NULL. Otherwise, if recovery is still in - progress this will increase monotonically. If recovery has completed - then this will remain static at the time of the last transaction - applied during recovery. When the server has been started normally - without recovery, the function returns NULL. - - - - - - - pg_get_wal_resource_managers - - pg_get_wal_resource_managers () - setof record - ( rm_id integer, - rm_name text, - rm_builtin boolean ) - - - Returns the currently-loaded WAL resource managers in the system. The - column rm_builtin indicates whether it's a - built-in resource manager, or a custom resource manager loaded by an - extension. - - - - -
- - - The functions shown in control the progress of recovery. - These functions may be executed only during recovery. - - - - Recovery Control Functions - - - - - Function - - - Description - - - - - - - - - pg_is_wal_replay_paused - - pg_is_wal_replay_paused () - boolean - - - Returns true if recovery pause is requested. - - - - - - - pg_get_wal_replay_pause_state - - pg_get_wal_replay_pause_state () - text - - - Returns recovery pause state. The return values are - not paused if pause is not requested, - pause requested if pause is requested but recovery is - not yet paused, and paused if the recovery is - actually paused. - - - - - - - pg_promote - - pg_promote ( wait boolean DEFAULT true, wait_seconds integer DEFAULT 60 ) - boolean - - - Promotes a standby server to primary status. - With wait set to true (the - default), the function waits until promotion is completed - or wait_seconds seconds have passed, and - returns true if promotion is successful - and false otherwise. - If wait is set to false, the - function returns true immediately after sending a - SIGUSR1 signal to the postmaster to trigger - promotion. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - - - - pg_wal_replay_pause - - pg_wal_replay_pause () - void - - - Request to pause recovery. A request doesn't mean that recovery stops - right away. If you want a guarantee that recovery is actually paused, - you need to check for the recovery pause state returned by - pg_get_wal_replay_pause_state(). Note that - pg_is_wal_replay_paused() returns whether a request - is made. While recovery is paused, no further database changes are applied. - If hot standby is active, all new queries will see the same consistent - snapshot of the database, and no further query conflicts will be generated - until recovery is resumed. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - - - - pg_wal_replay_resume - - pg_wal_replay_resume () - void - - - Restarts recovery if it was paused. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - -
- - - pg_wal_replay_pause and - pg_wal_replay_resume cannot be executed while - a promotion is ongoing. If a promotion is triggered while recovery - is paused, the paused state ends and promotion continues. - - - - If streaming replication is disabled, the paused state may continue - indefinitely without a problem. If streaming replication is in - progress then WAL records will continue to be received, which will - eventually fill available disk space, depending upon the duration of - the pause, the rate of WAL generation and available disk space. - - -
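For example, pausing replay on a standby and verifying that the pause has actually taken effect might look like this:

SELECT pg_wal_replay_pause();
SELECT pg_get_wal_replay_pause_state();  -- 'pause requested', then 'paused'
SELECT pg_wal_replay_resume();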
- - - Snapshot Synchronization Functions - - - PostgreSQL allows database sessions to synchronize their - snapshots. A snapshot determines which data is visible to the - transaction that is using the snapshot. Synchronized snapshots are - necessary when two or more sessions need to see identical content in the - database. If two sessions just start their transactions independently, - there is always a possibility that some third transaction commits - between the executions of the two START TRANSACTION commands, - so that one session sees the effects of that transaction and the other - does not. - - - - To solve this problem, PostgreSQL allows a transaction to - export the snapshot it is using. As long as the exporting - transaction remains open, other transactions can import its - snapshot, and thereby be guaranteed that they see exactly the same view - of the database that the first transaction sees. But note that any - database changes made by any one of these transactions remain invisible - to the other transactions, as is usual for changes made by uncommitted - transactions. So the transactions are synchronized with respect to - pre-existing data, but act normally for changes they make themselves. - - - - Snapshots are exported with the pg_export_snapshot function, - shown in , and - imported with the command. - - - - Snapshot Synchronization Functions - - - - - Function - - - Description - - - - - - - - - pg_export_snapshot - - pg_export_snapshot () - text - - - Saves the transaction's current snapshot and returns - a text string identifying the snapshot. This string must - be passed (outside the database) to clients that want to import the - snapshot. The snapshot is available for import only until the end of - the transaction that exported it. - - - A transaction can export more than one snapshot, if needed. Note that - doing so is only useful in READ COMMITTED - transactions, since in REPEATABLE READ and higher - isolation levels, transactions use the same snapshot throughout their - lifetime. Once a transaction has exported any snapshots, it cannot be - prepared with . - - - - - - pg_log_standby_snapshot - - pg_log_standby_snapshot () - pg_lsn - - - Take a snapshot of running transactions and write it to WAL, without - having to wait for bgwriter or checkpointer to log one. This is useful - for logical decoding on standby, as logical slot creation has to wait - until such a record is replayed on the standby. - - - - -
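For example, one session exports a snapshot inside an open transaction:

BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
SELECT pg_export_snapshot();
 pg_export_snapshot
---------------------
 00000003-0000001B-1
(1 row)

and, while that transaction remains open, a second session imports it (the snapshot identifier shown is illustrative):

BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
SET TRANSACTION SNAPSHOT '00000003-0000001B-1';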
- -
- - - Replication Management Functions - - - The functions shown - in are for - controlling and interacting with replication features. - See , - , and - - for information about the underlying features. - Use of functions for replication origin is only allowed to the - superuser by default, but may be allowed to other users by using the - GRANT command. - Use of functions for replication slots is restricted to superusers - and users having REPLICATION privilege. - - - - Many of these functions have equivalent commands in the replication - protocol; see . - - - - The functions described in - , - , and - - are also relevant for replication. - - - - Replication Management Functions - - - - - Function - - - Description - - - - - - - - - pg_create_physical_replication_slot - - pg_create_physical_replication_slot ( slot_name name , immediately_reserve boolean, temporary boolean ) - record - ( slot_name name, - lsn pg_lsn ) - - - Creates a new physical replication slot named - slot_name. The optional second parameter, - when true, specifies that the LSN for this - replication slot be reserved immediately; otherwise - the LSN is reserved on first connection from a streaming - replication client. Streaming changes from a physical slot is only - possible with the streaming-replication protocol — - see . The optional third - parameter, temporary, when set to true, specifies that - the slot should not be permanently stored to disk and is only meant - for use by the current session. Temporary slots are also - released upon any error. This function corresponds - to the replication protocol command CREATE_REPLICATION_SLOT - ... PHYSICAL. - - - - - - - pg_drop_replication_slot - - pg_drop_replication_slot ( slot_name name ) - void - - - Drops the physical or logical replication slot - named slot_name. Same as replication protocol - command DROP_REPLICATION_SLOT. - - - - - - - pg_create_logical_replication_slot - - pg_create_logical_replication_slot ( slot_name name, plugin name , temporary boolean, twophase boolean, failover boolean ) - record - ( slot_name name, - lsn pg_lsn ) - - - Creates a new logical (decoding) replication slot named - slot_name using the output plugin - plugin. The optional third - parameter, temporary, when set to true, specifies that - the slot should not be permanently stored to disk and is only meant - for use by the current session. Temporary slots are also - released upon any error. The optional fourth parameter, - twophase, when set to true, specifies - that the decoding of prepared transactions is enabled for this - slot. The optional fifth parameter, - failover, when set to true, - specifies that this slot is enabled to be synced to the - standbys so that logical replication can be resumed after - failover. A call to this function has the same effect as - the replication protocol command - CREATE_REPLICATION_SLOT ... LOGICAL. - - - - - - - pg_copy_physical_replication_slot - - pg_copy_physical_replication_slot ( src_slot_name name, dst_slot_name name , temporary boolean ) - record - ( slot_name name, - lsn pg_lsn ) - - - Copies an existing physical replication slot named src_slot_name - to a physical replication slot named dst_slot_name. - The copied physical slot starts to reserve WAL from the same LSN as the - source slot. - temporary is optional. If temporary - is omitted, the same value as the source slot is used. Copy of an - invalidated slot is not allowed. 
- - - - - - - pg_copy_logical_replication_slot - - pg_copy_logical_replication_slot ( src_slot_name name, dst_slot_name name , temporary boolean , plugin name ) - record - ( slot_name name, - lsn pg_lsn ) - - - Copies an existing logical replication slot - named src_slot_name to a logical replication - slot named dst_slot_name, optionally changing - the output plugin and persistence. The copied logical slot starts - from the same LSN as the source logical slot. Both - temporary and plugin are - optional; if they are omitted, the values of the source slot are used. - The failover option of the source logical slot - is not copied and is set to false by default. This - is to avoid the risk of being unable to continue logical replication - after failover to standby where the slot is being synchronized. Copy of - an invalidated slot is not allowed. - - - - - - - pg_logical_slot_get_changes - - pg_logical_slot_get_changes ( slot_name name, upto_lsn pg_lsn, upto_nchanges integer, VARIADIC options text[] ) - setof record - ( lsn pg_lsn, - xid xid, - data text ) - - - Returns changes in the slot slot_name, starting - from the point from which changes have been consumed last. If - upto_lsn - and upto_nchanges are NULL, - logical decoding will continue until end of WAL. If - upto_lsn is non-NULL, decoding will include only - those transactions which commit prior to the specified LSN. If - upto_nchanges is non-NULL, decoding will - stop when the number of rows produced by decoding exceeds - the specified value. Note, however, that the actual number of - rows returned may be larger, since this limit is only checked after - adding the rows produced when decoding each new transaction commit. - If the specified slot is a logical failover slot then the function will - not return until all physical slots specified in - synchronized_standby_slots - have confirmed WAL receipt. - - - - - - - pg_logical_slot_peek_changes - - pg_logical_slot_peek_changes ( slot_name name, upto_lsn pg_lsn, upto_nchanges integer, VARIADIC options text[] ) - setof record - ( lsn pg_lsn, - xid xid, - data text ) - - - Behaves just like - the pg_logical_slot_get_changes() function, - except that changes are not consumed; that is, they will be returned - again on future calls. - - - - - - - pg_logical_slot_get_binary_changes - - pg_logical_slot_get_binary_changes ( slot_name name, upto_lsn pg_lsn, upto_nchanges integer, VARIADIC options text[] ) - setof record - ( lsn pg_lsn, - xid xid, - data bytea ) - - - Behaves just like - the pg_logical_slot_get_changes() function, - except that changes are returned as bytea. - - - - - - - pg_logical_slot_peek_binary_changes - - pg_logical_slot_peek_binary_changes ( slot_name name, upto_lsn pg_lsn, upto_nchanges integer, VARIADIC options text[] ) - setof record - ( lsn pg_lsn, - xid xid, - data bytea ) - - - Behaves just like - the pg_logical_slot_peek_changes() function, - except that changes are returned as bytea. - - - - - - - pg_replication_slot_advance - - pg_replication_slot_advance ( slot_name name, upto_lsn pg_lsn ) - record - ( slot_name name, - end_lsn pg_lsn ) - - - Advances the current confirmed position of a replication slot named - slot_name. The slot will not be moved backwards, - and it will not be moved beyond the current insert location. Returns - the name of the slot and the actual position that it was advanced to. - The updated slot position information is written out at the next - checkpoint if any advancing is done. 
So in the event of a crash, the - slot may return to an earlier position. If the specified slot is a - logical failover slot then the function will not return until all - physical slots specified in - synchronized_standby_slots - have confirmed WAL receipt. - - - - - - - pg_replication_origin_create - - pg_replication_origin_create ( node_name text ) - oid - - - Creates a replication origin with the given external - name, and returns the internal ID assigned to it. - The name must be no longer than 512 bytes. - - - - - - - pg_replication_origin_drop - - pg_replication_origin_drop ( node_name text ) - void - - - Deletes a previously-created replication origin, including any - associated replay progress. - - - - - - - pg_replication_origin_oid - - pg_replication_origin_oid ( node_name text ) - oid - - - Looks up a replication origin by name and returns the internal ID. If - no such replication origin is found, NULL is - returned. - - - - - - - pg_replication_origin_session_setup - - pg_replication_origin_session_setup ( node_name text ) - void - - - Marks the current session as replaying from the given - origin, allowing replay progress to be tracked. - Can only be used if no origin is currently selected. - Use pg_replication_origin_session_reset to undo. - - - - - - - pg_replication_origin_session_reset - - pg_replication_origin_session_reset () - void - - - Cancels the effects - of pg_replication_origin_session_setup(). - - - - - - - pg_replication_origin_session_is_setup - - pg_replication_origin_session_is_setup () - boolean - - - Returns true if a replication origin has been selected in the - current session. - - - - - - - pg_replication_origin_session_progress - - pg_replication_origin_session_progress ( flush boolean ) - pg_lsn - - - Returns the replay location for the replication origin selected in - the current session. The parameter flush - determines whether the corresponding local transaction will be - guaranteed to have been flushed to disk or not. - - - - - - - pg_replication_origin_xact_setup - - pg_replication_origin_xact_setup ( origin_lsn pg_lsn, origin_timestamp timestamp with time zone ) - void - - - Marks the current transaction as replaying a transaction that has - committed at the given LSN and timestamp. Can - only be called when a replication origin has been selected - using pg_replication_origin_session_setup. - - - - - - - pg_replication_origin_xact_reset - - pg_replication_origin_xact_reset () - void - - - Cancels the effects of - pg_replication_origin_xact_setup(). - - - - - - - pg_replication_origin_advance - - pg_replication_origin_advance ( node_name text, lsn pg_lsn ) - void - - - Sets replication progress for the given node to the given - location. This is primarily useful for setting up the initial - location, or setting a new location after configuration changes and - similar. Be aware that careless use of this function can lead to - inconsistently replicated data. - - - - - - - pg_replication_origin_progress - - pg_replication_origin_progress ( node_name text, flush boolean ) - pg_lsn - - - Returns the replay location for the given replication origin. The - parameter flush determines whether the - corresponding local transaction will be guaranteed to have been - flushed to disk or not. 
- - - - - - - pg_logical_emit_message - - pg_logical_emit_message ( transactional boolean, prefix text, content text , flush boolean DEFAULT false ) - pg_lsn - - - pg_logical_emit_message ( transactional boolean, prefix text, content bytea , flush boolean DEFAULT false ) - pg_lsn - - - Emits a logical decoding message. This can be used to pass generic - messages to logical decoding plugins through - WAL. The transactional parameter specifies if - the message should be part of the current transaction, or if it should - be written immediately and decoded as soon as the logical decoder - reads the record. The prefix parameter is a - textual prefix that can be used by logical decoding plugins to easily - recognize messages that are interesting for them. - The content parameter is the content of the - message, given either in text or binary form. - The flush parameter (default set to - false) controls if the message is immediately - flushed to WAL or not. flush has no effect - with transactional, as the message's WAL - record is flushed along with its transaction. - - - - - - - pg_sync_replication_slots - - pg_sync_replication_slots () - void - - - Synchronize the logical failover replication slots from the primary - server to the standby server. This function can only be executed on the - standby server. Temporary synced slots, if any, cannot be used for - logical decoding and must be dropped after promotion. See - for details. - Note that this function cannot be executed if - - sync_replication_slots is enabled and the slotsync - worker is already running to perform the synchronization of slots. - - - - - If, after executing the function, - - hot_standby_feedback is disabled on - the standby or the physical slot configured in - - primary_slot_name is - removed, then it is possible that the necessary rows of the - synchronized slot will be removed by the VACUUM process on the primary - server, resulting in the synchronized slot becoming invalidated. - - - - - - - -
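As an illustrative sketch of how these functions combine (this assumes wal_level is set to logical and that the test_decoding example plugin is available; the slot name and message prefix are hypothetical):

SELECT pg_create_logical_replication_slot('demo_slot', 'test_decoding');
SELECT pg_logical_emit_message(false, 'demo_prefix', 'hello, decoding');
SELECT * FROM pg_logical_slot_peek_changes('demo_slot', NULL, NULL);  -- not consumed
SELECT * FROM pg_logical_slot_get_changes('demo_slot', NULL, NULL);   -- consumed now
SELECT pg_drop_replication_slot('demo_slot');

The peek call would return the same rows again on a later call; the get call advances the slot's consumed position so they are not returned again.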
- - - Database Object Management Functions - - - The functions shown in calculate - the disk space usage of database objects, or assist in presentation - or understanding of usage results. bigint results - are measured in bytes. If an OID that does - not represent an existing object is passed to one of these - functions, NULL is returned. - - - - Database Object Size Functions - - - - - Function - - - Description - - - - - - - - - pg_column_size - - pg_column_size ( "any" ) - integer - - - Shows the number of bytes used to store any individual data value. If - applied directly to a table column value, this reflects any - compression that was done. - - - - - - - pg_column_compression - - pg_column_compression ( "any" ) - text - - - Shows the compression algorithm that was used to compress - an individual variable-length value. Returns NULL - if the value is not compressed. - - - - - - - pg_column_toast_chunk_id - - pg_column_toast_chunk_id ( "any" ) - oid - - - Shows the chunk_id of an on-disk - TOASTed value. Returns NULL - if the value is un-TOASTed or not on-disk. See - for more information about - TOAST. - - - - - - - pg_database_size - - pg_database_size ( name ) - bigint - - - pg_database_size ( oid ) - bigint - - - Computes the total disk space used by the database with the specified - name or OID. To use this function, you must - have CONNECT privilege on the specified database - (which is granted by default) or have privileges of - the pg_read_all_stats role. - - - - - - - pg_indexes_size - - pg_indexes_size ( regclass ) - bigint - - - Computes the total disk space used by indexes attached to the - specified table. - - - - - - - pg_relation_size - - pg_relation_size ( relation regclass , fork text ) - bigint - - - Computes the disk space used by one fork of the - specified relation. (Note that for most purposes it is more - convenient to use the higher-level - functions pg_total_relation_size - or pg_table_size, which sum the sizes of all - forks.) With one argument, this returns the size of the main data - fork of the relation. The second argument can be provided to specify - which fork to examine: - - - - main returns the size of the main - data fork of the relation. - - - - - fsm returns the size of the Free Space Map - (see ) associated with the relation. - - - - - vm returns the size of the Visibility Map - (see ) associated with the relation. - - - - - init returns the size of the initialization - fork, if any, associated with the relation. - - - - - - - - - - pg_size_bytes - - pg_size_bytes ( text ) - bigint - - - Converts a size in human-readable format (as returned - by pg_size_pretty) into bytes. Valid units are - bytes, B, kB, - MB, GB, TB, - and PB. - - - - - - - pg_size_pretty - - pg_size_pretty ( bigint ) - text - - - pg_size_pretty ( numeric ) - text - - - Converts a size in bytes into a more easily human-readable format with - size units (bytes, kB, MB, GB, TB, or PB as appropriate). Note that the - units are powers of 2 rather than powers of 10, so 1kB is 1024 bytes, - 1MB is 10242 = 1048576 bytes, and so on. - - - - - - - pg_table_size - - pg_table_size ( regclass ) - bigint - - - Computes the disk space used by the specified table, excluding indexes - (but including its TOAST table if any, free space map, and visibility - map). - - - - - - - pg_tablespace_size - - pg_tablespace_size ( name ) - bigint - - - pg_tablespace_size ( oid ) - bigint - - - Computes the total disk space used in the tablespace with the - specified name or OID. 
To use this function, you must - have CREATE privilege on the specified tablespace - or have privileges of the pg_read_all_stats role, - unless it is the default tablespace for the current database. - - - - - - - pg_total_relation_size - - pg_total_relation_size ( regclass ) - bigint - - - Computes the total disk space used by the specified table, including - all indexes and TOAST data. The result is - equivalent to pg_table_size - + pg_indexes_size. - - - - -
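As a quick sketch (the table name mytable is hypothetical), the size functions can be combined to compare a relation's main fork with its total footprint, and pg_size_bytes can parse a human-readable size for use in comparisons:

SELECT pg_size_pretty(pg_relation_size('mytable'))       AS heap_size,
       pg_size_pretty(pg_table_size('mytable'))          AS table_size,
       pg_size_pretty(pg_total_relation_size('mytable')) AS total_size;

SELECT relname
  FROM pg_class
 WHERE relkind = 'r'
   AND pg_total_relation_size(oid) > pg_size_bytes('1 GB');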
- - - The functions above that operate on tables or indexes accept a - regclass argument, which is simply the OID of the table or index - in the pg_class system catalog. You do not have to look up - the OID by hand, however, since the regclass data type's input - converter will do the work for you. See - for details. - - - - The functions shown in assist - in identifying the specific disk files associated with database objects. - - - - Database Object Location Functions - - - - - Function - - - Description - - - - - - - - - pg_relation_filenode - - pg_relation_filenode ( relation regclass ) - oid - - - Returns the filenode number currently assigned to the - specified relation. The filenode is the base component of the file - name(s) used for the relation (see - for more information). - For most relations the result is the same as - pg_class.relfilenode, - but for certain system catalogs relfilenode - is zero and this function must be used to get the correct value. The - function returns NULL if passed a relation that does not have storage, - such as a view. - - - - - - - pg_relation_filepath - - pg_relation_filepath ( relation regclass ) - text - - - Returns the entire file path name (relative to the database cluster's - data directory, PGDATA) of the relation. - - - - - - - pg_filenode_relation - - pg_filenode_relation ( tablespace oid, filenode oid ) - regclass - - - Returns a relation's OID given the tablespace OID and filenode it is - stored under. This is essentially the inverse mapping of - pg_relation_filepath. For a relation in the - database's default tablespace, the tablespace can be specified as zero. - Returns NULL if no relation in the current database - is associated with the given values. - - - - -
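For example (mytable again being a hypothetical table), one might map a relation to its on-disk file and back:

SELECT pg_relation_filenode('mytable') AS filenode,
       pg_relation_filepath('mytable') AS filepath;

SELECT pg_filenode_relation(0, pg_relation_filenode('mytable'));  -- 0 = database's default tablespace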
- - - lists functions used to manage - collations. - - - - Collation Management Functions - - - - - Function - - - Description - - - - - - - - - pg_collation_actual_version - - pg_collation_actual_version ( oid ) - text - - - Returns the actual version of the collation object as it is currently - installed in the operating system. If this is different from the - value in - pg_collation.collversion, - then objects depending on the collation might need to be rebuilt. See - also . - - - - - - - pg_database_collation_actual_version - - pg_database_collation_actual_version ( oid ) - text - - - Returns the actual version of the database's collation as it is currently - installed in the operating system. If this is different from the - value in - pg_database.datcollversion, - then objects depending on the collation might need to be rebuilt. See - also . - - - - - - - pg_import_system_collations - - pg_import_system_collations ( schema regnamespace ) - integer - - - Adds collations to the system - catalog pg_collation based on all the locales - it finds in the operating system. This is - what initdb uses; see - for more details. If additional - locales are installed into the operating system later on, this - function can be run again to add collations for the new locales. - Locales that match existing entries - in pg_collation will be skipped. (But - collation objects based on locales that are no longer present in the - operating system are not removed by this function.) - The schema parameter would typically - be pg_catalog, but that is not a requirement; the - collations could be installed into some other schema as well. The - function returns the number of new collation objects it created. - Use of this function is restricted to superusers. - - - - -
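One plausible use of these functions is to list collations whose operating-system definition no longer matches the version recorded when the collation was created, suggesting that dependent objects may need to be rebuilt:

SELECT collname,
       collversion,
       pg_collation_actual_version(oid) AS os_version
  FROM pg_collation
 WHERE collversion IS NOT NULL
   AND collversion IS DISTINCT FROM pg_collation_actual_version(oid);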
- - - lists functions used to - manipulate statistics. - These functions cannot be executed during recovery. - - - Changes made by these statistics manipulation functions are likely to be - overwritten by autovacuum (or manual - VACUUM or ANALYZE) and should be - considered temporary. - - - - - - Database Object Statistics Manipulation Functions - - - - - Function - - - Description - - - - - - - - - pg_restore_relation_stats - - pg_restore_relation_stats ( - VARIADIC kwargs "any" ) - boolean - - - Updates table-level statistics. Ordinarily, these statistics are - collected automatically or updated as a part of or , so it's not - necessary to call this function. However, it is useful after a - restore to enable the optimizer to choose better plans if - ANALYZE has not been run yet. - - - The tracked statistics may change from version to version, so - arguments are passed as pairs of argname - and argvalue in the form: - -SELECT pg_restore_relation_stats( - 'arg1name', 'arg1value'::arg1type, - 'arg2name', 'arg2value'::arg2type, - 'arg3name', 'arg3value'::arg3type); - - - - For example, to set the relpages and - reltuples values for the table - mytable: - -SELECT pg_restore_relation_stats( - 'schemaname', 'myschema', - 'relname', 'mytable', - 'relpages', 173::integer, - 'reltuples', 10000::real); - - - - The arguments schemaname and - relname are required, and specify the table. Other - arguments are the names and values of statistics corresponding to - certain columns in pg_class. - The currently-supported relation statistics are - relpages with a value of type - integer, reltuples with a value of - type real, relallvisible with a value - of type integer, and relallfrozen - with a value of type integer. - - - Additionally, this function accepts argument name - version of type integer, which - specifies the server version from which the statistics originated. - This is anticipated to be helpful in porting statistics from older - versions of PostgreSQL. - - - Minor errors are reported as a WARNING and - ignored, and remaining statistics will still be restored. If all - specified statistics are successfully restored, returns - true, otherwise false. - - - The caller must have the MAINTAIN privilege on the - table or be the owner of the database. - - - - - - - - - pg_clear_relation_stats - - pg_clear_relation_stats ( schemaname text, relname text ) - void - - - Clears table-level statistics for the given relation, as though the - table was newly created. - - - The caller must have the MAINTAIN privilege on the - table or be the owner of the database. - - - - - - - - pg_restore_attribute_stats - - pg_restore_attribute_stats ( - VARIADIC kwargs "any" ) - boolean - - - Creates or updates column-level statistics. Ordinarily, these - statistics are collected automatically or updated as a part of or , so it's not - necessary to call this function. However, it is useful after a - restore to enable the optimizer to choose better plans if - ANALYZE has not been run yet. 
- - - The tracked statistics may change from version to version, so - arguments are passed as pairs of argname - and argvalue in the form: - -SELECT pg_restore_attribute_stats( - 'arg1name', 'arg1value'::arg1type, - 'arg2name', 'arg2value'::arg2type, - 'arg3name', 'arg3value'::arg3type); - - - - For example, to set the avg_width and - null_frac values for the attribute - col1 of the table - mytable: - -SELECT pg_restore_attribute_stats( - 'schemaname', 'myschema', - 'relname', 'mytable', - 'attname', 'col1', - 'inherited', false, - 'avg_width', 125::integer, - 'null_frac', 0.5::real); - - - - The required arguments are schemaname and - relname with a value of type text - which specify the table; either attname with a - value of type text or attnum with a - value of type smallint, which specifies the column; and - inherited, which specifies whether the statistics - include values from child tables. Other arguments are the names and - values of statistics corresponding to columns in pg_stats. - - - Additionally, this function accepts argument name - version of type integer, which - specifies the server version from which the statistics originated. - This is anticipated to be helpful in porting statistics from older - versions of PostgreSQL. - - - Minor errors are reported as a WARNING and - ignored, and remaining statistics will still be restored. If all - specified statistics are successfully restored, returns - true, otherwise false. - - - The caller must have the MAINTAIN privilege on the - table or be the owner of the database. - - - - - - - - - pg_clear_attribute_stats - - pg_clear_attribute_stats ( - schemaname text, - relname text, - attname text, - inherited boolean ) - void - - - Clears column-level statistics for the given relation and - attribute, as though the table was newly created. - - - The caller must have the MAINTAIN privilege on - the table or be the owner of the database. - - - - - -
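For completeness, a brief sketch of the corresponding clear functions, using the same hypothetical schema, table, and column names as the examples above:

SELECT pg_clear_relation_stats('myschema', 'mytable');
SELECT pg_clear_attribute_stats('myschema', 'mytable', 'col1', false);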
- - - lists functions that provide - information about the structure of partitioned tables. - - - - Partitioning Information Functions - - - - - Function - - - Description - - - - - - - - - pg_partition_tree - - pg_partition_tree ( regclass ) - setof record - ( relid regclass, - parentrelid regclass, - isleaf boolean, - level integer ) - - - Lists the tables or indexes in the partition tree of the - given partitioned table or partitioned index, with one row for each - partition. Information provided includes the OID of the partition, - the OID of its immediate parent, a boolean value telling if the - partition is a leaf, and an integer telling its level in the hierarchy. - The level value is 0 for the input table or index, 1 for its - immediate child partitions, 2 for their partitions, and so on. - Returns no rows if the relation does not exist or is not a partition - or partitioned table. - - - - - - - pg_partition_ancestors - - pg_partition_ancestors ( regclass ) - setof regclass - - - Lists the ancestor relations of the given partition, - including the relation itself. Returns no rows if the relation - does not exist or is not a partition or partitioned table. - - - - - - - pg_partition_root - - pg_partition_root ( regclass ) - regclass - - - Returns the top-most parent of the partition tree to which the given - relation belongs. Returns NULL if the relation - does not exist or is not a partition or partitioned table. - - - - -
- - - For example, to check the total size of the data contained in a - partitioned table measurement, one could use the - following query: - -SELECT pg_size_pretty(sum(pg_relation_size(relid))) AS total_size - FROM pg_partition_tree('measurement'); - - - -
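Similarly, the other two functions can be used to navigate the tree upward from a leaf; the partition name measurement_y2006m02 is hypothetical:

SELECT * FROM pg_partition_ancestors('measurement_y2006m02');
SELECT pg_partition_root('measurement_y2006m02');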
- - - Index Maintenance Functions - - - shows the functions - available for index maintenance tasks. (Note that these maintenance - tasks are normally done automatically by autovacuum; use of these - functions is only required in special cases.) - These functions cannot be executed during recovery. - Use of these functions is restricted to superusers and the owner - of the given index. - - - - Index Maintenance Functions - - - - - Function - - - Description - - - - - - - - - brin_summarize_new_values - - brin_summarize_new_values ( index regclass ) - integer - - - Scans the specified BRIN index to find page ranges in the base table - that are not currently summarized by the index; for any such range it - creates a new summary index tuple by scanning those table pages. - Returns the number of new page range summaries that were inserted - into the index. - - - - - - - brin_summarize_range - - brin_summarize_range ( index regclass, blockNumber bigint ) - integer - - - Summarizes the page range covering the given block, if not already - summarized. This is - like brin_summarize_new_values except that it - only processes the page range that covers the given table block number. - - - - - - - brin_desummarize_range - - brin_desummarize_range ( index regclass, blockNumber bigint ) - void - - - Removes the BRIN index tuple that summarizes the page range covering - the given table block, if there is one. - - - - - - - gin_clean_pending_list - - gin_clean_pending_list ( index regclass ) - bigint - - - Cleans up the pending list of the specified GIN index - by moving entries in it, in bulk, to the main GIN data structure. - Returns the number of pages removed from the pending list. - If the argument is a GIN index built with - the fastupdate option disabled, no cleanup happens - and the result is zero, because the index doesn't have a pending list. - See and - for details about the pending list and fastupdate - option. - - - - -
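A short sketch of manual index maintenance, assuming hypothetical BRIN and GIN indexes named brin_idx and gin_idx:

SELECT brin_summarize_new_values('brin_idx');   -- summarize any unsummarized page ranges
SELECT brin_desummarize_range('brin_idx', 0);   -- drop the summary covering block 0
SELECT gin_clean_pending_list('gin_idx');       -- flush the GIN pending list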
- - - Generic File Access Functions - - - The functions shown in provide native access to - files on the machine hosting the server. Only files within the - database cluster directory and the log_directory can be - accessed, unless the user is a superuser or is granted the role - pg_read_server_files. Use a relative path for files in - the cluster directory, and a path matching the log_directory - configuration setting for log files. - - - - Note that granting users the EXECUTE privilege on - pg_read_file(), or related functions, allows them the - ability to read any file on the server that the database server process can - read; these functions bypass all in-database privilege checks. This means - that, for example, a user with such access is able to read the contents of - the pg_authid table where authentication - information is stored, as well as read any table data in the database. - Therefore, granting access to these functions should be carefully - considered. - - - - When granting privilege on these functions, note that the table entries - showing optional parameters are mostly implemented as several physical - functions with different parameter lists. Privilege must be granted - separately on each such function, if it is to be - used. psql's \df command - can be useful to check what the actual function signatures are. - - - - Some of these functions take an optional missing_ok - parameter, which specifies the behavior when the file or directory does - not exist. If true, the function - returns NULL or an empty result set, as appropriate. - If false, an error is raised. (Failure conditions - other than file not found are reported as errors in any - case.) The default is false. - - - - Generic File Access Functions - - - - - Function - - - Description - - - - - - - - - pg_ls_dir - - pg_ls_dir ( dirname text , missing_ok boolean, include_dot_dirs boolean ) - setof text - - - Returns the names of all files (and directories and other special - files) in the specified - directory. The include_dot_dirs parameter - indicates whether . and .. are to be - included in the result set; the default is to exclude them. Including - them can be useful when missing_ok - is true, to distinguish an empty directory from a - non-existent directory. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - - - - pg_ls_logdir - - pg_ls_logdir () - setof record - ( name text, - size bigint, - modification timestamp with time zone ) - - - Returns the name, size, and last modification time (mtime) of each - ordinary file in the server's log directory. Filenames beginning with - a dot, directories, and other special files are excluded. - - - This function is restricted to superusers and roles with privileges of - the pg_monitor role by default, but other users can - be granted EXECUTE to run the function. - - - - - - - pg_ls_waldir - - pg_ls_waldir () - setof record - ( name text, - size bigint, - modification timestamp with time zone ) - - - Returns the name, size, and last modification time (mtime) of each - ordinary file in the server's write-ahead log (WAL) directory. - Filenames beginning with a dot, directories, and other special files - are excluded. - - - This function is restricted to superusers and roles with privileges of - the pg_monitor role by default, but other users can - be granted EXECUTE to run the function. 
- - - - - - - pg_ls_logicalmapdir - - pg_ls_logicalmapdir () - setof record - ( name text, - size bigint, - modification timestamp with time zone ) - - - Returns the name, size, and last modification time (mtime) of each - ordinary file in the server's pg_logical/mappings - directory. Filenames beginning with a dot, directories, and other - special files are excluded. - - - This function is restricted to superusers and members of - the pg_monitor role by default, but other users can - be granted EXECUTE to run the function. - - - - - - - pg_ls_logicalsnapdir - - pg_ls_logicalsnapdir () - setof record - ( name text, - size bigint, - modification timestamp with time zone ) - - - Returns the name, size, and last modification time (mtime) of each - ordinary file in the server's pg_logical/snapshots - directory. Filenames beginning with a dot, directories, and other - special files are excluded. - - - This function is restricted to superusers and members of - the pg_monitor role by default, but other users can - be granted EXECUTE to run the function. - - - - - - - pg_ls_replslotdir - - pg_ls_replslotdir ( slot_name text ) - setof record - ( name text, - size bigint, - modification timestamp with time zone ) - - - Returns the name, size, and last modification time (mtime) of each - ordinary file in the server's pg_replslot/slot_name - directory, where slot_name is the name of the - replication slot provided as input of the function. Filenames beginning - with a dot, directories, and other special files are excluded. - - - This function is restricted to superusers and members of - the pg_monitor role by default, but other users can - be granted EXECUTE to run the function. - - - - - - - pg_ls_summariesdir - - pg_ls_summariesdir () - setof record - ( name text, - size bigint, - modification timestamp with time zone ) - - - Returns the name, size, and last modification time (mtime) of each - ordinary file in the server's WAL summaries directory - (pg_wal/summaries). Filenames beginning - with a dot, directories, and other special files are excluded. - - - This function is restricted to superusers and members of - the pg_monitor role by default, but other users can - be granted EXECUTE to run the function. - - - - - - - pg_ls_archive_statusdir - - pg_ls_archive_statusdir () - setof record - ( name text, - size bigint, - modification timestamp with time zone ) - - - Returns the name, size, and last modification time (mtime) of each - ordinary file in the server's WAL archive status directory - (pg_wal/archive_status). Filenames beginning - with a dot, directories, and other special files are excluded. - - - This function is restricted to superusers and members of - the pg_monitor role by default, but other users can - be granted EXECUTE to run the function. - - - - - - - - pg_ls_tmpdir - - pg_ls_tmpdir ( tablespace oid ) - setof record - ( name text, - size bigint, - modification timestamp with time zone ) - - - Returns the name, size, and last modification time (mtime) of each - ordinary file in the temporary file directory for the - specified tablespace. - If tablespace is not provided, - the pg_default tablespace is examined. Filenames - beginning with a dot, directories, and other special files are - excluded. - - - This function is restricted to superusers and members of - the pg_monitor role by default, but other users can - be granted EXECUTE to run the function. 
- - - - - - - pg_read_file - - pg_read_file ( filename text , offset bigint, length bigint , missing_ok boolean ) - text - - - Returns all or part of a text file, starting at the - given byte offset, returning at - most length bytes (less if the end of file is - reached first). If offset is negative, it is - relative to the end of the file. If offset - and length are omitted, the entire file is - returned. The bytes read from the file are interpreted as a string in - the database's encoding; an error is thrown if they are not valid in - that encoding. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - - - - pg_read_binary_file - - pg_read_binary_file ( filename text , offset bigint, length bigint , missing_ok boolean ) - bytea - - - Returns all or part of a file. This function is identical to - pg_read_file except that it can read arbitrary - binary data, returning the result as bytea - not text; accordingly, no encoding checks are performed. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - In combination with the convert_from function, - this function can be used to read a text file in a specified encoding - and convert to the database's encoding: - -SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8'); - - - - - - - - pg_stat_file - - pg_stat_file ( filename text , missing_ok boolean ) - record - ( size bigint, - access timestamp with time zone, - modification timestamp with time zone, - change timestamp with time zone, - creation timestamp with time zone, - isdir boolean ) - - - Returns a record containing the file's size, last access time stamp, - last modification time stamp, last file status change time stamp (Unix - platforms only), file creation time stamp (Windows only), and a flag - indicating if it is a directory. - - - This function is restricted to superusers by default, but other users - can be granted EXECUTE to run the function. - - - - - -
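A few illustrative calls (postmaster.pid is a file that normally exists in the data directory; passing missing_ok avoids an error if it does not):

SELECT name, size, modification
  FROM pg_ls_waldir()
 ORDER BY modification DESC
 LIMIT 3;

SELECT size, isdir FROM pg_stat_file('postmaster.pid');

SELECT pg_read_file('postmaster.pid', 0, 64, true);  -- missing_ok: NULL if absent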
- - - Advisory Lock Functions - - - The functions shown in - manage advisory locks. For details about proper use of these functions, - see . - - - - All these functions are intended to be used to lock application-defined - resources, which can be identified either by a single 64-bit key value or - two 32-bit key values (note that these two key spaces do not overlap). - If another session already holds a conflicting lock on the same resource - identifier, the functions will either wait until the resource becomes - available, or return a false result, as appropriate for - the function. - Locks can be either shared or exclusive: a shared lock does not conflict - with other shared locks on the same resource, only with exclusive locks. - Locks can be taken at session level (so that they are held until released - or the session ends) or at transaction level (so that they are held until - the current transaction ends; there is no provision for manual release). - Multiple session-level lock requests stack, so that if the same resource - identifier is locked three times there must then be three unlock requests - to release the resource in advance of session end. - - - - Advisory Lock Functions - - - - - Function - - - Description - - - - - - - - - pg_advisory_lock - - pg_advisory_lock ( key bigint ) - void - - - pg_advisory_lock ( key1 integer, key2 integer ) - void - - - Obtains an exclusive session-level advisory lock, waiting if necessary. - - - - - - - pg_advisory_lock_shared - - pg_advisory_lock_shared ( key bigint ) - void - - - pg_advisory_lock_shared ( key1 integer, key2 integer ) - void - - - Obtains a shared session-level advisory lock, waiting if necessary. - - - - - - - pg_advisory_unlock - - pg_advisory_unlock ( key bigint ) - boolean - - - pg_advisory_unlock ( key1 integer, key2 integer ) - boolean - - - Releases a previously-acquired exclusive session-level advisory lock. - Returns true if the lock is successfully released. - If the lock was not held, false is returned, and in - addition, an SQL warning will be reported by the server. - - - - - - - pg_advisory_unlock_all - - pg_advisory_unlock_all () - void - - - Releases all session-level advisory locks held by the current session. - (This function is implicitly invoked at session end, even if the - client disconnects ungracefully.) - - - - - - - pg_advisory_unlock_shared - - pg_advisory_unlock_shared ( key bigint ) - boolean - - - pg_advisory_unlock_shared ( key1 integer, key2 integer ) - boolean - - - Releases a previously-acquired shared session-level advisory lock. - Returns true if the lock is successfully released. - If the lock was not held, false is returned, and in - addition, an SQL warning will be reported by the server. - - - - - - - pg_advisory_xact_lock - - pg_advisory_xact_lock ( key bigint ) - void - - - pg_advisory_xact_lock ( key1 integer, key2 integer ) - void - - - Obtains an exclusive transaction-level advisory lock, waiting if - necessary. - - - - - - - pg_advisory_xact_lock_shared - - pg_advisory_xact_lock_shared ( key bigint ) - void - - - pg_advisory_xact_lock_shared ( key1 integer, key2 integer ) - void - - - Obtains a shared transaction-level advisory lock, waiting if - necessary. - - - - - - - pg_try_advisory_lock - - pg_try_advisory_lock ( key bigint ) - boolean - - - pg_try_advisory_lock ( key1 integer, key2 integer ) - boolean - - - Obtains an exclusive session-level advisory lock if available. 
- This will either obtain the lock immediately and - return true, or return false - without waiting if the lock cannot be acquired immediately. - - - - - - - pg_try_advisory_lock_shared - - pg_try_advisory_lock_shared ( key bigint ) - boolean - - - pg_try_advisory_lock_shared ( key1 integer, key2 integer ) - boolean - - - Obtains a shared session-level advisory lock if available. - This will either obtain the lock immediately and - return true, or return false - without waiting if the lock cannot be acquired immediately. - - - - - - - pg_try_advisory_xact_lock - - pg_try_advisory_xact_lock ( key bigint ) - boolean - - - pg_try_advisory_xact_lock ( key1 integer, key2 integer ) - boolean - - - Obtains an exclusive transaction-level advisory lock if available. - This will either obtain the lock immediately and - return true, or return false - without waiting if the lock cannot be acquired immediately. - - - - - - - pg_try_advisory_xact_lock_shared - - pg_try_advisory_xact_lock_shared ( key bigint ) - boolean - - - pg_try_advisory_xact_lock_shared ( key1 integer, key2 integer ) - boolean - - - Obtains a shared transaction-level advisory lock if available. - This will either obtain the lock immediately and - return true, or return false - without waiting if the lock cannot be acquired immediately. - - - - -
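The following sketch (the key 12345 is arbitrary) illustrates the stacking behavior of session-level locks described above, including the warning raised when unlocking a lock that is not held:

SELECT pg_advisory_lock(12345);      -- acquired
SELECT pg_advisory_lock(12345);      -- stacked: now held twice
SELECT pg_advisory_unlock(12345);    -- true; still held once
SELECT pg_advisory_unlock(12345);    -- true; fully released
SELECT pg_advisory_unlock(12345);    -- false, with a warning: not held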
- - - Trigger Functions - - - While many uses of triggers involve user-written trigger functions, - PostgreSQL provides a few built-in trigger - functions that can be used directly in user-defined triggers. These - are summarized in . - (Additional built-in trigger functions exist, which implement foreign - key constraints and deferred index constraints. Those are not documented - here since users need not use them directly.) - - - - For more information about creating triggers, see - . - - - - Built-In Trigger Functions - - - - - Function - - - Description - - - Example Usage - - - - - - - - - suppress_redundant_updates_trigger - - suppress_redundant_updates_trigger ( ) - trigger - - - Suppresses do-nothing update operations. See below for details. - - - CREATE TRIGGER ... suppress_redundant_updates_trigger() - - - - - - - tsvector_update_trigger - - tsvector_update_trigger ( ) - trigger - - - Automatically updates a tsvector column from associated - plain-text document column(s). The text search configuration to use - is specified by name as a trigger argument. See - for details. - - - CREATE TRIGGER ... tsvector_update_trigger(tsvcol, 'pg_catalog.swedish', title, body) - - - - - - - tsvector_update_trigger_column - - tsvector_update_trigger_column ( ) - trigger - - - Automatically updates a tsvector column from associated - plain-text document column(s). The text search configuration to use - is taken from a regconfig column of the table. See - for details. - - - CREATE TRIGGER ... tsvector_update_trigger_column(tsvcol, tsconfigcol, title, body) - - - - -
- - - The suppress_redundant_updates_trigger function, - when applied as a row-level BEFORE UPDATE trigger, - will prevent any update that does not actually change the data in the - row from taking place. This overrides the normal behavior which always - performs a physical row update - regardless of whether or not the data has changed. (This normal behavior - makes updates run faster, since no checking is required, and is also - useful in certain cases.) - - - - Ideally, you should avoid running updates that don't actually - change the data in the record. Redundant updates can cost considerable - unnecessary time, especially if there are lots of indexes to alter, - and space in dead rows that will eventually have to be vacuumed. - However, detecting such situations in client code is not - always easy, or even possible, and writing expressions to detect - them can be error-prone. An alternative is to use - suppress_redundant_updates_trigger, which will skip - updates that don't change the data. You should use this with care, - however. The trigger takes a small but non-trivial time for each record, - so if most of the records affected by updates do actually change, - use of this trigger will make updates run slower on average. - - - - The suppress_redundant_updates_trigger function can be - added to a table like this: - -CREATE TRIGGER z_min_update -BEFORE UPDATE ON tablename -FOR EACH ROW EXECUTE FUNCTION suppress_redundant_updates_trigger(); - - In most cases, you need to fire this trigger last for each row, so that - it does not override other triggers that might wish to alter the row. - Bearing in mind that triggers fire in name order, you would therefore - choose a trigger name that comes after the name of any other trigger - you might have on the table. (Hence the z prefix in the - example.) - -
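A minimal demonstration on a hypothetical table shows the suppression in action:

CREATE TABLE demo (id int PRIMARY KEY, val text);
CREATE TRIGGER z_min_update
  BEFORE UPDATE ON demo
  FOR EACH ROW EXECUTE FUNCTION suppress_redundant_updates_trigger();
INSERT INTO demo VALUES (1, 'a');
UPDATE demo SET val = 'a' WHERE id = 1;  -- UPDATE 0: suppressed, no data change
UPDATE demo SET val = 'b' WHERE id = 1;  -- UPDATE 1: proceeds normally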
- - - Event Trigger Functions - - - PostgreSQL provides these helper functions - to retrieve information from event triggers. - - - - For more information about event triggers, - see . - - - - Capturing Changes at Command End - - - pg_event_trigger_ddl_commands - - - -pg_event_trigger_ddl_commands () setof record - - - - pg_event_trigger_ddl_commands returns a list of - DDL commands executed by each user action, - when invoked in a function attached to a - ddl_command_end event trigger. If called in any other - context, an error is raised. - pg_event_trigger_ddl_commands returns one row for each - base command executed; some commands that are a single SQL sentence - may return more than one row. This function returns the following - columns: - - - - - - Name - Type - Description - - - - - - classid - oid - OID of catalog the object belongs in - - - objid - oid - OID of the object itself - - - objsubid - integer - Sub-object ID (e.g., attribute number for a column) - - - command_tag - text - Command tag - - - object_type - text - Type of the object - - - schema_name - text - - Name of the schema the object belongs in, if any; otherwise NULL. - No quoting is applied. - - - - object_identity - text - - Text rendering of the object identity, schema-qualified. Each - identifier included in the identity is quoted if necessary. - - - - in_extension - boolean - True if the command is part of an extension script - - - command - pg_ddl_command - - A complete representation of the command, in internal format. - This cannot be output directly, but it can be passed to other - functions to obtain different pieces of information about the - command. - - - - - - - - - - Processing Objects Dropped by a DDL Command - - - pg_event_trigger_dropped_objects - - - -pg_event_trigger_dropped_objects () setof record - - - - pg_event_trigger_dropped_objects returns a list of all objects - dropped by the command in whose sql_drop event it is called. - If called in any other context, an error is raised. - This function returns the following columns: - - - - - - Name - Type - Description - - - - - - classid - oid - OID of catalog the object belonged in - - - objid - oid - OID of the object itself - - - objsubid - integer - Sub-object ID (e.g., attribute number for a column) - - - original - boolean - True if this was one of the root object(s) of the deletion - - - normal - boolean - - True if there was a normal dependency relationship - in the dependency graph leading to this object - - - - is_temporary - boolean - - True if this was a temporary object - - - - object_type - text - Type of the object - - - schema_name - text - - Name of the schema the object belonged in, if any; otherwise NULL. - No quoting is applied. - - - - object_name - text - - Name of the object, if the combination of schema and name can be - used as a unique identifier for the object; otherwise NULL. - No quoting is applied, and name is never schema-qualified. - - - - object_identity - text - - Text rendering of the object identity, schema-qualified. Each - identifier included in the identity is quoted if necessary. - - - - address_names - text[] - - An array that, together with object_type and - address_args, can be used by - the pg_get_object_address function to - recreate the object address in a remote server containing an - identically named object of the same kind. 
- - - - address_args - text[] - - Complement for address_names - - - - - - - - - The pg_event_trigger_dropped_objects function can be used - in an event trigger like this: - -CREATE FUNCTION test_event_trigger_for_drops() - RETURNS event_trigger LANGUAGE plpgsql AS $$ -DECLARE - obj record; -BEGIN - FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects() - LOOP - RAISE NOTICE '% dropped object: % %.% %', - tg_tag, - obj.object_type, - obj.schema_name, - obj.object_name, - obj.object_identity; - END LOOP; -END; -$$; -CREATE EVENT TRIGGER test_event_trigger_for_drops - ON sql_drop - EXECUTE FUNCTION test_event_trigger_for_drops(); - - - - - - Handling a Table Rewrite Event - - - The functions shown in - - provide information about a table for which a - table_rewrite event has just been called. - If called in any other context, an error is raised. - - - - Table Rewrite Information Functions - - - - - Function - - - Description - - - - - - - - - pg_event_trigger_table_rewrite_oid - - pg_event_trigger_table_rewrite_oid () - oid - - - Returns the OID of the table about to be rewritten. - - - - - - - pg_event_trigger_table_rewrite_reason - - pg_event_trigger_table_rewrite_reason () - integer - - - Returns a code explaining the reason(s) for rewriting. The value is - a bitmap built from the following values: 1 - (the table has changed its persistence), 2 - (default value of a column has changed), 4 - (a column has a new data type) and 8 - (the table access method has changed). - - - - -
- - - These functions can be used in an event trigger like this: - -CREATE FUNCTION test_event_trigger_table_rewrite_oid() - RETURNS event_trigger - LANGUAGE plpgsql AS -$$ -BEGIN - RAISE NOTICE 'rewriting table % for reason %', - pg_event_trigger_table_rewrite_oid()::regclass, - pg_event_trigger_table_rewrite_reason(); -END; -$$; - -CREATE EVENT TRIGGER test_table_rewrite_oid - ON table_rewrite - EXECUTE FUNCTION test_event_trigger_table_rewrite_oid(); - - -
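A comparable sketch for the ddl_command_end event described earlier, logging each command reported by pg_event_trigger_ddl_commands (the function and trigger names here are hypothetical):

CREATE FUNCTION log_ddl_commands()
  RETURNS event_trigger LANGUAGE plpgsql AS $$
DECLARE
    cmd record;
BEGIN
    FOR cmd IN SELECT * FROM pg_event_trigger_ddl_commands()
    LOOP
        RAISE NOTICE 'command % on % %',
            cmd.command_tag, cmd.object_type, cmd.object_identity;
    END LOOP;
END;
$$;

CREATE EVENT TRIGGER log_ddl_commands
    ON ddl_command_end
    EXECUTE FUNCTION log_ddl_commands();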
- - - Statistics Information Functions - - - function - statistics - - - - PostgreSQL provides a function to inspect complex - statistics defined using the CREATE STATISTICS command. - - - - Inspecting MCV Lists - - - pg_mcv_list_items - - - -pg_mcv_list_items ( pg_mcv_list ) setof record - - - - pg_mcv_list_items returns a set of records describing - all items stored in a multi-column MCV list. It - returns the following columns: - - - - - - Name - Type - Description - - - - - - index - integer - index of the item in the MCV list - - - values - text[] - values stored in the MCV item - - - nulls - boolean[] - flags identifying NULL values - - - frequency - double precision - frequency of this MCV item - - - base_frequency - double precision - base frequency of this MCV item - - - - - - - - The pg_mcv_list_items function can be used like this: - - -SELECT m.* FROM pg_statistic_ext join pg_statistic_ext_data on (oid = stxoid), - pg_mcv_list_items(stxdmcv) m WHERE stxname = 'stts'; - - - Values of the pg_mcv_list type can be obtained only from the - pg_statistic_ext_data.stxdmcv - column. - - - - - - diff --git a/doc/src/sgml/func/allfiles.sgml b/doc/src/sgml/func/allfiles.sgml new file mode 100644 index 0000000000000..ce11ef1d5d8ed --- /dev/null +++ b/doc/src/sgml/func/allfiles.sgml @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml new file mode 100644 index 0000000000000..2896cd9e42909 --- /dev/null +++ b/doc/src/sgml/func/func-admin.sgml @@ -0,0 +1,2980 @@ + + System Administration Functions + + + The functions described in this section are used to control and + monitor a PostgreSQL installation. + + + + Configuration Settings Functions + + + SET + + + + SHOW + + + + configuration + of the server + functions + + + + shows the functions + available to query and alter run-time configuration parameters. + + + + Configuration Settings Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + current_setting + + current_setting ( setting_name text , missing_ok boolean ) + text + + + Returns the current value of the + setting setting_name. If there is no such + setting, current_setting throws an error + unless missing_ok is supplied and + is true (in which case NULL is returned). + This function corresponds to + the SQL command . + + + current_setting('datestyle') + ISO, MDY + + + + + + + set_config + + set_config ( + setting_name text, + new_value text, + is_local boolean ) + text + + + Sets the parameter setting_name + to new_value, and returns that value. + If is_local is true, the new + value will only apply during the current transaction. If you want the + new value to apply for the rest of the current session, + use false instead. This function corresponds to + the SQL command . + + + set_config accepts the NULL value for + new_value, but as settings cannot be null, it + is interpreted as a request to reset the setting to its default value. + + + set_config('log_statement_stats', 'off', false) + off + + + + +
+ +
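A short sketch of the transaction-local behavior (the choice of parameter is arbitrary):

BEGIN;
SELECT set_config('maintenance_work_mem', '1GB', true);  -- is_local: reverts at end of transaction
SELECT current_setting('maintenance_work_mem');          -- 1GB inside the transaction
COMMIT;
SELECT current_setting('maintenance_work_mem');          -- previous value again
SELECT current_setting('no_such.setting', true);         -- missing_ok: NULL instead of an error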
+ + + Server Signaling Functions + + + signal + backend processes + + + + The functions shown in send control signals to + other server processes. Use of these functions is restricted to + superusers by default but access may be granted to others using + GRANT, with noted exceptions. + + + + Each of these functions returns true if + the signal was successfully sent and false + if sending the signal failed. + + + + Server Signaling Functions + + + + + Function + + + Description + + + + + + + + + pg_cancel_backend + + pg_cancel_backend ( pid integer ) + boolean + + + Cancels the current query of the session whose backend process has the + specified process ID. This is also allowed if the + calling role is a member of the role whose backend is being canceled or + the calling role has privileges of pg_signal_backend, + however only superusers can cancel superuser backends. + As an exception, roles with privileges of + pg_signal_autovacuum_worker are permitted to + cancel autovacuum worker processes, which are otherwise considered + superuser backends. + + + + + + + pg_log_backend_memory_contexts + + pg_log_backend_memory_contexts ( pid integer ) + boolean + + + Requests to log the memory contexts of the backend with the + specified process ID. This function can send the request to + backends and auxiliary processes except logger. These memory contexts + will be logged at + LOG message level. They will appear in + the server log based on the log configuration set + (see for more information), + but will not be sent to the client regardless of + . + + + + + + + pg_reload_conf + + pg_reload_conf () + boolean + + + Causes all processes of the PostgreSQL + server to reload their configuration files. (This is initiated by + sending a SIGHUP signal to the postmaster + process, which in turn sends SIGHUP to each + of its children.) You can use the + pg_file_settings, + pg_hba_file_rules and + pg_ident_file_mappings views + to check the configuration files for possible errors, before reloading. + + + + + + + pg_rotate_logfile + + pg_rotate_logfile () + boolean + + + Signals the log-file manager to switch to a new output file + immediately. This works only when the built-in log collector is + running, since otherwise there is no log-file manager subprocess. + + + + + + + pg_terminate_backend + + pg_terminate_backend ( pid integer, timeout bigint DEFAULT 0 ) + boolean + + + Terminates the session whose backend process has the + specified process ID. This is also allowed if the calling role + is a member of the role whose backend is being terminated or the + calling role has privileges of pg_signal_backend, + however only superusers can terminate superuser backends. + As an exception, roles with privileges of + pg_signal_autovacuum_worker are permitted to + terminate autovacuum worker processes, which are otherwise considered + superuser backends. + + + If timeout is not specified or zero, this + function returns true whether the process actually + terminates or not, indicating only that the sending of the signal was + successful. If the timeout is specified (in + milliseconds) and greater than zero, the function waits until the + process is actually terminated or until the given time has passed. If + the process is terminated, the function + returns true. On timeout, a warning is emitted and + false is returned. + + + + +
+ + + pg_cancel_backend and pg_terminate_backend + send signals (SIGINT or SIGTERM + respectively) to backend processes identified by process ID. + The process ID of an active backend can be found from + the pid column of the + pg_stat_activity view, or by listing the + postgres processes on the server (using + ps on Unix or the Task + Manager on Windows). + The role of an active backend can be found from the + usename column of the + pg_stat_activity view. + + + + pg_log_backend_memory_contexts can be used + to log the memory contexts of a backend process. For example: + +postgres=# SELECT pg_log_backend_memory_contexts(pg_backend_pid()); + pg_log_backend_memory_contexts +-------------------------------- + t +(1 row) + +One message for each memory context will be logged. For example: + +LOG: logging memory contexts of PID 10377 +STATEMENT: SELECT pg_log_backend_memory_contexts(pg_backend_pid()); +LOG: level: 1; TopMemoryContext: 80800 total in 6 blocks; 14432 free (5 chunks); 66368 used +LOG: level: 2; pgstat TabStatusArray lookup hash table: 8192 total in 1 blocks; 1408 free (0 chunks); 6784 used +LOG: level: 2; TopTransactionContext: 8192 total in 1 blocks; 7720 free (1 chunks); 472 used +LOG: level: 2; RowDescriptionContext: 8192 total in 1 blocks; 6880 free (0 chunks); 1312 used +LOG: level: 2; MessageContext: 16384 total in 2 blocks; 5152 free (0 chunks); 11232 used +LOG: level: 2; Operator class cache: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used +LOG: level: 2; smgr relation table: 16384 total in 2 blocks; 4544 free (3 chunks); 11840 used +LOG: level: 2; TransactionAbortContext: 32768 total in 1 blocks; 32504 free (0 chunks); 264 used +... +LOG: level: 2; ErrorContext: 8192 total in 1 blocks; 7928 free (3 chunks); 264 used +LOG: Grand total: 1651920 bytes in 201 blocks; 622360 free (88 chunks); 1029560 used + + If there are more than 100 child contexts under the same parent, the first + 100 child contexts are logged, along with a summary of the remaining contexts. + Note that frequent calls to this function could incur significant overhead, + because it may generate a large number of log messages. + + +
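For example (the five-minute threshold is arbitrary), one might reload the configuration and then cancel long-running queries other than one's own:

SELECT pg_reload_conf();

SELECT pid, pg_cancel_backend(pid)
  FROM pg_stat_activity
 WHERE state = 'active'
   AND query_start < now() - interval '5 minutes'
   AND pid <> pg_backend_pid();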
+ + + Backup Control Functions + + + backup + + + + The functions shown in assist in making on-line backups. + These functions cannot be executed during recovery (except + pg_backup_start, + pg_backup_stop, + and pg_wal_lsn_diff). + + + + For details about proper usage of these functions, see + . + + + + Backup Control Functions + + + + + Function + + + Description + + + + + + + + + pg_create_restore_point + + pg_create_restore_point ( name text ) + pg_lsn + + + Creates a named marker record in the write-ahead log that can later be + used as a recovery target, and returns the corresponding write-ahead + log location. The given name can then be used with + to specify the point up to + which recovery will proceed. Avoid creating multiple restore points + with the same name, since recovery will stop at the first one whose + name matches the recovery target. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + + + + pg_current_wal_flush_lsn + + pg_current_wal_flush_lsn () + pg_lsn + + + Returns the current write-ahead log flush location (see notes below). + + + + + + + pg_current_wal_insert_lsn + + pg_current_wal_insert_lsn () + pg_lsn + + + Returns the current write-ahead log insert location (see notes below). + + + + + + + pg_current_wal_lsn + + pg_current_wal_lsn () + pg_lsn + + + Returns the current write-ahead log write location (see notes below). + + + + + + + pg_backup_start + + pg_backup_start ( + label text + , fast boolean + ) + pg_lsn + + + Prepares the server to begin an on-line backup. The only required + parameter is an arbitrary user-defined label for the backup. + (Typically this would be the name under which the backup dump file + will be stored.) + If the optional second parameter is given as true, + it specifies executing pg_backup_start as quickly + as possible. This forces a fast checkpoint which will cause a + spike in I/O operations, slowing any concurrently executing queries. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + + + + pg_backup_stop + + pg_backup_stop ( + wait_for_archive boolean + ) + record + ( lsn pg_lsn, + labelfile text, + spcmapfile text ) + + + Finishes performing an on-line backup. The desired contents of the + backup label file and the tablespace map file are returned as part of + the result of the function and must be written to files in the + backup area. These files must not be written to the live data directory + (doing so will cause PostgreSQL to fail to restart in the event of a + crash). + + + There is an optional parameter of type boolean. + If false, the function will return immediately after the backup is + completed, without waiting for WAL to be archived. This behavior is + only useful with backup software that independently monitors WAL + archiving. Otherwise, WAL required to make the backup consistent might + be missing and make the backup useless. By default or when this + parameter is true, pg_backup_stop will wait for + WAL to be archived when archiving is enabled. (On a standby, this + means that it will wait only when archive_mode = + always. If write activity on the primary is low, + it may be useful to run pg_switch_wal on the + primary in order to trigger an immediate segment switch.) + + + When executed on a primary, this function also creates a backup + history file in the write-ahead log archive area. 
The history file + includes the label given to pg_backup_start, the + starting and ending write-ahead log locations for the backup, and the + starting and ending times of the backup. After recording the ending + location, the current write-ahead log insertion point is automatically + advanced to the next write-ahead log file, so that the ending + write-ahead log file can be archived immediately to complete the + backup. + + + The result of the function is a single record. + The lsn column holds the backup's ending + write-ahead log location (which again can be ignored). The second + column returns the contents of the backup label file, and the third + column returns the contents of the tablespace map file. These must be + stored as part of the backup and are required as part of the restore + process. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + + + + pg_switch_wal + + pg_switch_wal () + pg_lsn + + + Forces the server to switch to a new write-ahead log file, which + allows the current file to be archived (assuming you are using + continuous archiving). The result is the ending write-ahead log + location plus 1 within the just-completed write-ahead log file. If + there has been no write-ahead log activity since the last write-ahead + log switch, pg_switch_wal does nothing and + returns the start location of the write-ahead log file currently in + use. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + + + + pg_walfile_name + + pg_walfile_name ( lsn pg_lsn ) + text + + + Converts a write-ahead log location to the name of the WAL file + holding that location. + + + + + + + pg_walfile_name_offset + + pg_walfile_name_offset ( lsn pg_lsn ) + record + ( file_name text, + file_offset integer ) + + + Converts a write-ahead log location to a WAL file name and byte offset + within that file. + + + + + + + pg_split_walfile_name + + pg_split_walfile_name ( file_name text ) + record + ( segment_number numeric, + timeline_id bigint ) + + + Extracts the sequence number and timeline ID from a WAL file + name. + + + + + + + pg_wal_lsn_diff + + pg_wal_lsn_diff ( lsn1 pg_lsn, lsn2 pg_lsn ) + numeric + + + Calculates the difference in bytes (lsn1 - lsn2) between two write-ahead log + locations. This can be used + with pg_stat_replication or some of the + functions shown in to + get the replication lag. + + + + +
+ + + pg_current_wal_lsn displays the current write-ahead + log write location in the same format used by the above functions. + Similarly, pg_current_wal_insert_lsn displays the + current write-ahead log insertion location + and pg_current_wal_flush_lsn displays the current + write-ahead log flush location. The insertion location is + the logical end of the write-ahead log at any instant, + while the write location is the end of what has actually been written out + from the server's internal buffers, and the flush location is the last + location known to be written to durable storage. The write location is the + end of what can be examined from outside the server, and is usually what + you want if you are interested in archiving partially-complete write-ahead + log files. The insertion and flush locations are made available primarily + for server debugging purposes. These are all read-only operations and do + not require superuser permissions. + + + + You can use pg_walfile_name_offset to extract the + corresponding write-ahead log file name and byte offset from + a pg_lsn value. For example: + +postgres=# SELECT * FROM pg_walfile_name_offset((pg_backup_stop()).lsn); + file_name | file_offset +--------------------------+------------- + 00000001000000000000000D | 4039624 +(1 row) + + Similarly, pg_walfile_name extracts just the write-ahead log file name. + + + + pg_split_walfile_name is useful to compute a + LSN from a file offset and WAL file name, for example: + +postgres=# \set file_name '000000010000000100C000AB' +postgres=# \set offset 256 +postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset AS lsn + FROM pg_split_walfile_name(:'file_name') pd, + pg_show_all_settings() ps + WHERE ps.name = 'wal_segment_size'; + lsn +--------------- + C001/AB000100 +(1 row) + + + +
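Two further illustrations: creating a named restore point before a risky operation, and measuring replication lag in bytes with pg_wal_lsn_diff (the restore point name is arbitrary):

SELECT pg_create_restore_point('before_bulk_load');

SELECT client_addr,
       pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS replay_lag_bytes
  FROM pg_stat_replication;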
+ + + Recovery Control Functions + + + The functions shown in provide information + about the current status of a standby server. + These functions may be executed both during recovery and in normal running. + + + + Recovery Information Functions + + + + + Function + + + Description + + + + + + + + + pg_is_in_recovery + + pg_is_in_recovery () + boolean + + + Returns true if recovery is still in progress. + + + + + + + pg_last_wal_receive_lsn + + pg_last_wal_receive_lsn () + pg_lsn + + + Returns the last write-ahead log location that has been received and + synced to disk by streaming replication. While streaming replication + is in progress this will increase monotonically. If recovery has + completed then this will remain static at the location of the last WAL + record received and synced to disk during recovery. If streaming + replication is disabled, or if it has not yet started, the function + returns NULL. + + + + + + + pg_last_wal_replay_lsn + + pg_last_wal_replay_lsn () + pg_lsn + + + Returns the last write-ahead log location that has been replayed + during recovery. If recovery is still in progress this will increase + monotonically. If recovery has completed then this will remain + static at the location of the last WAL record applied during recovery. + When the server has been started normally without recovery, the + function returns NULL. + + + + + + + pg_last_xact_replay_timestamp + + pg_last_xact_replay_timestamp () + timestamp with time zone + + + Returns the time stamp of the last transaction replayed during + recovery. This is the time at which the commit or abort WAL record + for that transaction was generated on the primary. If no transactions + have been replayed during recovery, the function + returns NULL. Otherwise, if recovery is still in + progress this will increase monotonically. If recovery has completed + then this will remain static at the time of the last transaction + applied during recovery. When the server has been started normally + without recovery, the function returns NULL. + + + + + + + pg_get_wal_resource_managers + + pg_get_wal_resource_managers () + setof record + ( rm_id integer, + rm_name text, + rm_builtin boolean ) + + + Returns the currently-loaded WAL resource managers in the system. The + column rm_builtin indicates whether it's a + built-in resource manager, or a custom resource manager loaded by an + extension. + + + + +
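+
+   As an illustration, a rough estimate of replay delay on a standby can be
+   computed from these functions (a sketch; the result is NULL if streaming
+   replication is not active). The CASE treats a fully caught-up standby as
+   having zero delay, since pg_last_xact_replay_timestamp
+   continues to return the time of the last replayed transaction even when
+   nothing remains to replay:
+
+SELECT CASE
+         WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
+         ELSE EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp())
+       END AS replay_delay_seconds;
+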
+ + + The functions shown in control the progress of recovery. + These functions may be executed only during recovery. + + + + Recovery Control Functions + + + + + Function + + + Description + + + + + + + + + pg_is_wal_replay_paused + + pg_is_wal_replay_paused () + boolean + + + Returns true if recovery pause is requested. + + + + + + + pg_get_wal_replay_pause_state + + pg_get_wal_replay_pause_state () + text + + + Returns recovery pause state. The return values are + not paused if pause is not requested, + pause requested if pause is requested but recovery is + not yet paused, and paused if the recovery is + actually paused. + + + + + + + pg_promote + + pg_promote ( wait boolean DEFAULT true, wait_seconds integer DEFAULT 60 ) + boolean + + + Promotes a standby server to primary status. + With wait set to true (the + default), the function waits until promotion is completed + or wait_seconds seconds have passed, and + returns true if promotion is successful + and false otherwise. + If wait is set to false, the + function returns true immediately after sending a + SIGUSR1 signal to the postmaster to trigger + promotion. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + + + + pg_wal_replay_pause + + pg_wal_replay_pause () + void + + + Request to pause recovery. A request doesn't mean that recovery stops + right away. If you want a guarantee that recovery is actually paused, + you need to check for the recovery pause state returned by + pg_get_wal_replay_pause_state(). Note that + pg_is_wal_replay_paused() returns whether a request + is made. While recovery is paused, no further database changes are applied. + If hot standby is active, all new queries will see the same consistent + snapshot of the database, and no further query conflicts will be generated + until recovery is resumed. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + + + + pg_wal_replay_resume + + pg_wal_replay_resume () + void + + + Restarts recovery if it was paused. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + +
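+
+   A typical pause-and-inspect sequence on a standby might look as follows
+   (a sketch; remember that pausing is only a request until the reported
+   state is paused):
+
+SELECT pg_wal_replay_pause();
+SELECT pg_get_wal_replay_pause_state();  -- poll until this returns 'paused'
+-- ... inspect the standby's now-frozen snapshot ...
+SELECT pg_wal_replay_resume();
+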
+
+
+   pg_wal_replay_pause and
+   pg_wal_replay_resume cannot be executed while
+   a promotion is ongoing. If a promotion is triggered while recovery
+   is paused, the paused state ends and promotion continues.
+
+
+
+   If streaming replication is disabled, the paused state may continue
+   indefinitely without a problem. If streaming replication is in
+   progress then WAL records will continue to be received and will
+   eventually fill the available disk space; how quickly depends upon
+   the duration of the pause and the rate of WAL generation.
+
+
+ + + Snapshot Synchronization Functions + + + PostgreSQL allows database sessions to synchronize their + snapshots. A snapshot determines which data is visible to the + transaction that is using the snapshot. Synchronized snapshots are + necessary when two or more sessions need to see identical content in the + database. If two sessions just start their transactions independently, + there is always a possibility that some third transaction commits + between the executions of the two START TRANSACTION commands, + so that one session sees the effects of that transaction and the other + does not. + + + + To solve this problem, PostgreSQL allows a transaction to + export the snapshot it is using. As long as the exporting + transaction remains open, other transactions can import its + snapshot, and thereby be guaranteed that they see exactly the same view + of the database that the first transaction sees. But note that any + database changes made by any one of these transactions remain invisible + to the other transactions, as is usual for changes made by uncommitted + transactions. So the transactions are synchronized with respect to + pre-existing data, but act normally for changes they make themselves. + + + + Snapshots are exported with the pg_export_snapshot function, + shown in , and + imported with the command. + + + + Snapshot Synchronization Functions + + + + + Function + + + Description + + + + + + + + + pg_export_snapshot + + pg_export_snapshot () + text + + + Saves the transaction's current snapshot and returns + a text string identifying the snapshot. This string must + be passed (outside the database) to clients that want to import the + snapshot. The snapshot is available for import only until the end of + the transaction that exported it. + + + A transaction can export more than one snapshot, if needed. Note that + doing so is only useful in READ COMMITTED + transactions, since in REPEATABLE READ and higher + isolation levels, transactions use the same snapshot throughout their + lifetime. Once a transaction has exported any snapshots, it cannot be + prepared with . + + + + + + pg_log_standby_snapshot + + pg_log_standby_snapshot () + pg_lsn + + + Take a snapshot of running transactions and write it to WAL, without + having to wait for bgwriter or checkpointer to log one. This is useful + for logical decoding on standby, as logical slot creation has to wait + until such a record is replayed on the standby. + + + + +
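+
+   For example, to share one view of the database across two sessions
+   (a sketch; the snapshot identifier shown is illustrative and will differ
+   in practice):
+
+-- session 1
+BEGIN ISOLATION LEVEL REPEATABLE READ;
+SELECT pg_export_snapshot();   -- suppose it returns '00000004-0000006E-1'
+
+-- session 2, while session 1's transaction remains open
+BEGIN ISOLATION LEVEL REPEATABLE READ;
+SET TRANSACTION SNAPSHOT '00000004-0000006E-1';
+-- both sessions now see exactly the same data
+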
+ +
+ + + Replication Management Functions + + + The functions shown + in are for + controlling and interacting with replication features. + See , + , and + + for information about the underlying features. + Use of functions for replication origin is only allowed to the + superuser by default, but may be allowed to other users by using the + GRANT command. + Use of functions for replication slots is restricted to superusers + and users having REPLICATION privilege. + + + + Many of these functions have equivalent commands in the replication + protocol; see . + + + + The functions described in + , + , and + + are also relevant for replication. + + + + Replication Management Functions + + + + + Function + + + Description + + + + + + + + + pg_create_physical_replication_slot + + pg_create_physical_replication_slot ( slot_name name , immediately_reserve boolean, temporary boolean ) + record + ( slot_name name, + lsn pg_lsn ) + + + Creates a new physical replication slot named + slot_name. The name cannot be + pg_conflict_detection as it is reserved for the + conflict detection slot. The optional second parameter, + when true, specifies that the LSN for this + replication slot be reserved immediately; otherwise + the LSN is reserved on first connection from a streaming + replication client. Streaming changes from a physical slot is only + possible with the streaming-replication protocol — + see . The optional third + parameter, temporary, when set to true, specifies that + the slot should not be permanently stored to disk and is only meant + for use by the current session. Temporary slots are also + released upon any error. This function corresponds + to the replication protocol command CREATE_REPLICATION_SLOT + ... PHYSICAL. + + + + + + + pg_drop_replication_slot + + pg_drop_replication_slot ( slot_name name ) + void + + + Drops the physical or logical replication slot + named slot_name. Same as replication protocol + command DROP_REPLICATION_SLOT. + + + + + + + pg_create_logical_replication_slot + + pg_create_logical_replication_slot ( slot_name name, plugin name , temporary boolean, twophase boolean, failover boolean ) + record + ( slot_name name, + lsn pg_lsn ) + + + Creates a new logical (decoding) replication slot named + slot_name using the output plugin + plugin. The name cannot be + pg_conflict_detection as it is reserved for + the conflict detection slot. The optional third + parameter, temporary, when set to true, specifies that + the slot should not be permanently stored to disk and is only meant + for use by the current session. Temporary slots are also + released upon any error. The optional fourth parameter, + twophase, when set to true, specifies + that the decoding of prepared transactions is enabled for this + slot. The optional fifth parameter, + failover, when set to true, + specifies that this slot is enabled to be synced to the + standbys so that logical replication can be resumed after + failover. A call to this function has the same effect as + the replication protocol command + CREATE_REPLICATION_SLOT ... LOGICAL. + + + + + + + pg_copy_physical_replication_slot + + pg_copy_physical_replication_slot ( src_slot_name name, dst_slot_name name , temporary boolean ) + record + ( slot_name name, + lsn pg_lsn ) + + + Copies an existing physical replication slot named src_slot_name + to a physical replication slot named dst_slot_name. + The new slot name cannot be pg_conflict_detection, + as it is reserved for the conflict detection. 
+ The copied physical slot starts to reserve WAL from the same LSN as the + source slot. + temporary is optional. If temporary + is omitted, the same value as the source slot is used. Copy of an + invalidated slot is not allowed. + + + + + + + pg_copy_logical_replication_slot + + pg_copy_logical_replication_slot ( src_slot_name name, dst_slot_name name , temporary boolean , plugin name ) + record + ( slot_name name, + lsn pg_lsn ) + + + Copies an existing logical replication slot + named src_slot_name to a logical replication + slot named dst_slot_name, optionally changing + the output plugin and persistence. The new slot name cannot be + pg_conflict_detection as it is reserved for + the conflict detection. The copied logical slot starts from the same + LSN as the source logical slot. Both + temporary and plugin are + optional; if they are omitted, the values of the source slot are used. + The failover option of the source logical slot + is not copied and is set to false by default. This + is to avoid the risk of being unable to continue logical replication + after failover to standby where the slot is being synchronized. Copy of + an invalidated slot is not allowed. + + + + + + + pg_logical_slot_get_changes + + pg_logical_slot_get_changes ( slot_name name, upto_lsn pg_lsn, upto_nchanges integer, VARIADIC options text[] ) + setof record + ( lsn pg_lsn, + xid xid, + data text ) + + + Returns changes in the slot slot_name, starting + from the point from which changes have been consumed last. If + upto_lsn + and upto_nchanges are NULL, + logical decoding will continue until end of WAL. If + upto_lsn is non-NULL, decoding will include only + those transactions which commit prior to the specified LSN. If + upto_nchanges is non-NULL, decoding will + stop when the number of rows produced by decoding exceeds + the specified value. Note, however, that the actual number of + rows returned may be larger, since this limit is only checked after + adding the rows produced when decoding each new transaction commit. + If the specified slot is a logical failover slot then the function will + not return until all physical slots specified in + synchronized_standby_slots + have confirmed WAL receipt. + + + + + + + pg_logical_slot_peek_changes + + pg_logical_slot_peek_changes ( slot_name name, upto_lsn pg_lsn, upto_nchanges integer, VARIADIC options text[] ) + setof record + ( lsn pg_lsn, + xid xid, + data text ) + + + Behaves just like + the pg_logical_slot_get_changes() function, + except that changes are not consumed; that is, they will be returned + again on future calls. + + + + + + + pg_logical_slot_get_binary_changes + + pg_logical_slot_get_binary_changes ( slot_name name, upto_lsn pg_lsn, upto_nchanges integer, VARIADIC options text[] ) + setof record + ( lsn pg_lsn, + xid xid, + data bytea ) + + + Behaves just like + the pg_logical_slot_get_changes() function, + except that changes are returned as bytea. + + + + + + + pg_logical_slot_peek_binary_changes + + pg_logical_slot_peek_binary_changes ( slot_name name, upto_lsn pg_lsn, upto_nchanges integer, VARIADIC options text[] ) + setof record + ( lsn pg_lsn, + xid xid, + data bytea ) + + + Behaves just like + the pg_logical_slot_peek_changes() function, + except that changes are returned as bytea. + + + + + + + pg_replication_slot_advance + + pg_replication_slot_advance ( slot_name name, upto_lsn pg_lsn ) + record + ( slot_name name, + end_lsn pg_lsn ) + + + Advances the current confirmed position of a replication slot named + slot_name. 
The slot will not be moved backwards, + and it will not be moved beyond the current insert location. Returns + the name of the slot and the actual position that it was advanced to. + The updated slot position information is written out at the next + checkpoint if any advancing is done. So in the event of a crash, the + slot may return to an earlier position. If the specified slot is a + logical failover slot then the function will not return until all + physical slots specified in + synchronized_standby_slots + have confirmed WAL receipt. + + + + + + + pg_replication_origin_create + + pg_replication_origin_create ( node_name text ) + oid + + + Creates a replication origin with the given external + name, and returns the internal ID assigned to it. + The name must be no longer than 512 bytes. + + + + + + + pg_replication_origin_drop + + pg_replication_origin_drop ( node_name text ) + void + + + Deletes a previously-created replication origin, including any + associated replay progress. + + + + + + + pg_replication_origin_oid + + pg_replication_origin_oid ( node_name text ) + oid + + + Looks up a replication origin by name and returns the internal ID. If + no such replication origin is found, NULL is + returned. + + + + + + + pg_replication_origin_session_setup + + pg_replication_origin_session_setup ( node_name text , pid integer DEFAULT 0 ) + void + + + Marks the current session as replaying from the given + origin, allowing replay progress to be tracked. + Can only be used if no origin is currently selected. + Use pg_replication_origin_session_reset to undo. + If multiple processes can safely use the same replication origin (for + example, parallel apply processes), the optional pid + parameter can be used to specify the process ID of the first process. + The first process must provide pid equals to + 0 and the other processes that share the same + replication origin should provide the process ID of the first process. + + + + When multiple processes share the same replication origin, it is critical + to maintain commit order to prevent data inconsistency. While processes + may send operations out of order, they must commit transactions in the + correct sequence to ensure proper replication consistency. The recommended workflow + for each worker is: set up the replication origin session with the first process's PID, + apply changes within transactions, call pg_replication_origin_xact_setup + with the LSN and commit timestamp before committing, then commit the + transaction only if everything succeeded. + + + + + + + + + pg_replication_origin_session_reset + + pg_replication_origin_session_reset () + void + + + Cancels the effects + of pg_replication_origin_session_setup(). + + + + + + + pg_replication_origin_session_is_setup + + pg_replication_origin_session_is_setup () + boolean + + + Returns true if a replication origin has been selected in the + current session. + + + + + + + pg_replication_origin_session_progress + + pg_replication_origin_session_progress ( flush boolean ) + pg_lsn + + + Returns the replay location for the replication origin selected in + the current session. The parameter flush + determines whether the corresponding local transaction will be + guaranteed to have been flushed to disk or not. + + + + + + + pg_replication_origin_xact_setup + + pg_replication_origin_xact_setup ( origin_lsn pg_lsn, origin_timestamp timestamp with time zone ) + void + + + Marks the current transaction as replaying a transaction that has + committed at the given LSN and timestamp. 
Can + only be called when a replication origin has been selected + using pg_replication_origin_session_setup. + + + + + + + pg_replication_origin_xact_reset + + pg_replication_origin_xact_reset () + void + + + Cancels the effects of + pg_replication_origin_xact_setup(). + + + + + + + pg_replication_origin_advance + + pg_replication_origin_advance ( node_name text, lsn pg_lsn ) + void + + + Sets replication progress for the given node to the given + location. This is primarily useful for setting up the initial + location, or setting a new location after configuration changes and + similar. Be aware that careless use of this function can lead to + inconsistently replicated data. + + + + + + + pg_replication_origin_progress + + pg_replication_origin_progress ( node_name text, flush boolean ) + pg_lsn + + + Returns the replay location for the given replication origin. The + parameter flush determines whether the + corresponding local transaction will be guaranteed to have been + flushed to disk or not. + + + + + + + pg_logical_emit_message + + pg_logical_emit_message ( transactional boolean, prefix text, content text , flush boolean DEFAULT false ) + pg_lsn + + + pg_logical_emit_message ( transactional boolean, prefix text, content bytea , flush boolean DEFAULT false ) + pg_lsn + + + Emits a logical decoding message. This can be used to pass generic + messages to logical decoding plugins through + WAL. The transactional parameter specifies if + the message should be part of the current transaction, or if it should + be written immediately and decoded as soon as the logical decoder + reads the record. The prefix parameter is a + textual prefix that can be used by logical decoding plugins to easily + recognize messages that are interesting for them. + The content parameter is the content of the + message, given either in text or binary form. + The flush parameter (default set to + false) controls if the message is immediately + flushed to WAL or not. flush has no effect + with transactional, as the message's WAL + record is flushed along with its transaction. + + + + + + + pg_sync_replication_slots + + pg_sync_replication_slots () + void + + + Synchronize the logical failover replication slots from the primary + server to the standby server. This function can only be executed on the + standby server. Temporary synced slots, if any, cannot be used for + logical decoding and must be dropped after promotion. See + for details. + Note that this function cannot be executed if + + sync_replication_slots is enabled and the slotsync + worker is already running to perform the synchronization of slots. + + + + + If, after executing the function, + + hot_standby_feedback is disabled on + the standby or the physical slot configured in + + primary_slot_name is + removed, then it is possible that the necessary rows of the + synchronized slot will be removed by the VACUUM process on the primary + server, resulting in the synchronized slot becoming invalidated. + + + + + + + +
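+
+   As a brief illustration of the logical-slot functions (a sketch; it
+   assumes wal_level is set to logical and
+   uses the test_decoding example plugin shipped in
+   contrib, plus a hypothetical table t):
+
+SELECT pg_create_logical_replication_slot('demo_slot', 'test_decoding');
+INSERT INTO t VALUES (1);                              -- generate some WAL
+SELECT * FROM pg_logical_slot_peek_changes('demo_slot', NULL, NULL);  -- inspect without consuming
+SELECT * FROM pg_logical_slot_get_changes('demo_slot', NULL, NULL);   -- consume
+SELECT pg_drop_replication_slot('demo_slot');
+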
+ +
+ + + Database Object Management Functions + + + The functions shown in calculate + the disk space usage of database objects, or assist in presentation + or understanding of usage results. bigint results + are measured in bytes. If an OID that does + not represent an existing object is passed to one of these + functions, NULL is returned. + + + + Database Object Size Functions + + + + + Function + + + Description + + + + + + + + + pg_column_size + + pg_column_size ( "any" ) + integer + + + Shows the number of bytes used to store any individual data value. If + applied directly to a table column value, this reflects any + compression that was done. + + + + + + + pg_column_compression + + pg_column_compression ( "any" ) + text + + + Shows the compression algorithm that was used to compress + an individual variable-length value. Returns NULL + if the value is not compressed. + + + + + + + pg_column_toast_chunk_id + + pg_column_toast_chunk_id ( "any" ) + oid + + + Shows the chunk_id of an on-disk + TOASTed value. Returns NULL + if the value is un-TOASTed or not on-disk. See + for more information about + TOAST. + + + + + + + pg_database_size + + pg_database_size ( name ) + bigint + + + pg_database_size ( oid ) + bigint + + + Computes the total disk space used by the database with the specified + name or OID. To use this function, you must + have CONNECT privilege on the specified database + (which is granted by default) or have privileges of + the pg_read_all_stats role. + + + + + + + pg_indexes_size + + pg_indexes_size ( regclass ) + bigint + + + Computes the total disk space used by indexes attached to the + specified table. + + + + + + + pg_relation_size + + pg_relation_size ( relation regclass , fork text ) + bigint + + + Computes the disk space used by one fork of the + specified relation. (Note that for most purposes it is more + convenient to use the higher-level + functions pg_total_relation_size + or pg_table_size, which sum the sizes of all + forks.) With one argument, this returns the size of the main data + fork of the relation. The second argument can be provided to specify + which fork to examine: + + + + main returns the size of the main + data fork of the relation. + + + + + fsm returns the size of the Free Space Map + (see ) associated with the relation. + + + + + vm returns the size of the Visibility Map + (see ) associated with the relation. + + + + + init returns the size of the initialization + fork, if any, associated with the relation. + + + + + + + + + + pg_size_bytes + + pg_size_bytes ( text ) + bigint + + + Converts a size in human-readable format (as returned + by pg_size_pretty) into bytes. Valid units are + bytes, B, kB, + MB, GB, TB, + and PB. + + + + + + + pg_size_pretty + + pg_size_pretty ( bigint ) + text + + + pg_size_pretty ( numeric ) + text + + + Converts a size in bytes into a more easily human-readable format with + size units (bytes, kB, MB, GB, TB, or PB as appropriate). Note that the + units are powers of 2 rather than powers of 10, so 1kB is 1024 bytes, + 1MB is 10242 = 1048576 bytes, and so on. + + + + + + + pg_table_size + + pg_table_size ( regclass ) + bigint + + + Computes the disk space used by the specified table, excluding indexes + (but including its TOAST table if any, free space map, and visibility + map). + + + + + + + pg_tablespace_size + + pg_tablespace_size ( name ) + bigint + + + pg_tablespace_size ( oid ) + bigint + + + Computes the total disk space used in the tablespace with the + specified name or OID. 
To use this function, you must + have CREATE privilege on the specified tablespace + or have privileges of the pg_read_all_stats role, + unless it is the default tablespace for the current database. + + + + + + + pg_total_relation_size + + pg_total_relation_size ( regclass ) + bigint + + + Computes the total disk space used by the specified table, including + all indexes and TOAST data. The result is + equivalent to pg_table_size + + pg_indexes_size. + + + + +
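+
+   For example, the five largest tables in the current database, by total
+   size including indexes and TOAST data, can be listed with a query such as
+   the following (a sketch; system catalogs are included in the result):
+
+SELECT c.relname,
+       pg_size_pretty(pg_total_relation_size(c.oid)) AS total_size
+  FROM pg_class c
+ WHERE c.relkind = 'r'
+ ORDER BY pg_total_relation_size(c.oid) DESC
+ LIMIT 5;
+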
+ + + The functions above that operate on tables or indexes accept a + regclass argument, which is simply the OID of the table or index + in the pg_class system catalog. You do not have to look up + the OID by hand, however, since the regclass data type's input + converter will do the work for you. See + for details. + + + + The functions shown in assist + in identifying the specific disk files associated with database objects. + + + + Database Object Location Functions + + + + + Function + + + Description + + + + + + + + + pg_relation_filenode + + pg_relation_filenode ( relation regclass ) + oid + + + Returns the filenode number currently assigned to the + specified relation. The filenode is the base component of the file + name(s) used for the relation (see + for more information). + For most relations the result is the same as + pg_class.relfilenode, + but for certain system catalogs relfilenode + is zero and this function must be used to get the correct value. The + function returns NULL if passed a relation that does not have storage, + such as a view. + + + + + + + pg_relation_filepath + + pg_relation_filepath ( relation regclass ) + text + + + Returns the entire file path name (relative to the database cluster's + data directory, PGDATA) of the relation. + + + + + + + pg_filenode_relation + + pg_filenode_relation ( tablespace oid, filenode oid ) + regclass + + + Returns a relation's OID given the tablespace OID and filenode it is + stored under. This is essentially the inverse mapping of + pg_relation_filepath. For a relation in the + database's default tablespace, the tablespace can be specified as zero. + Returns NULL if no relation in the current database + is associated with the given values, or if dealing with a temporary + relation. + + + + +
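+
+   For example, these two calls round-trip between a relation and its
+   on-disk identity (a sketch using the pg_class catalog,
+   which also illustrates the zero-relfilenode case
+   mentioned above; the first argument 0 denotes the
+   database's default tablespace):
+
+SELECT pg_relation_filepath('pg_class');
+SELECT pg_filenode_relation(0, pg_relation_filenode('pg_class'));  -- returns pg_class
+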
+ + + lists functions used to manage + collations. + + + + Collation Management Functions + + + + + Function + + + Description + + + + + + + + + pg_collation_actual_version + + pg_collation_actual_version ( oid ) + text + + + Returns the actual version of the collation object as it is currently + installed in the operating system. If this is different from the + value in + pg_collation.collversion, + then objects depending on the collation might need to be rebuilt. See + also . + + + + + + + pg_database_collation_actual_version + + pg_database_collation_actual_version ( oid ) + text + + + Returns the actual version of the database's collation as it is currently + installed in the operating system. If this is different from the + value in + pg_database.datcollversion, + then objects depending on the collation might need to be rebuilt. See + also . + + + + + + + pg_import_system_collations + + pg_import_system_collations ( schema regnamespace ) + integer + + + Adds collations to the system + catalog pg_collation based on all the locales + it finds in the operating system. This is + what initdb uses; see + for more details. If additional + locales are installed into the operating system later on, this + function can be run again to add collations for the new locales. + Locales that match existing entries + in pg_collation will be skipped. (But + collation objects based on locales that are no longer present in the + operating system are not removed by this function.) + The schema parameter would typically + be pg_catalog, but that is not a requirement; the + collations could be installed into some other schema as well. The + function returns the number of new collation objects it created. + Use of this function is restricted to superusers. + + + + +
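+
+   For example, collations whose recorded version no longer matches the
+   operating system can be listed as follows (a sketch; collations that
+   record no version are effectively skipped, because two NULL versions
+   compare as not distinct):
+
+SELECT collname, collversion,
+       pg_collation_actual_version(oid) AS actual_version
+  FROM pg_collation
+ WHERE collversion IS DISTINCT FROM pg_collation_actual_version(oid);
+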
+ + + lists functions used to + manipulate statistics. + These functions cannot be executed during recovery. + + + Changes made by these statistics manipulation functions are likely to be + overwritten by autovacuum (or manual + VACUUM or ANALYZE) and should be + considered temporary. + + + + + + Database Object Statistics Manipulation Functions + + + + + Function + + + Description + + + + + + + + + pg_restore_relation_stats + + pg_restore_relation_stats ( + VARIADIC kwargs "any" ) + boolean + + + Updates table-level statistics. Ordinarily, these statistics are + collected automatically or updated as a part of or , so it's not + necessary to call this function. However, it is useful after a + restore to enable the optimizer to choose better plans if + ANALYZE has not been run yet. + + + The tracked statistics may change from version to version, so + arguments are passed as pairs of argname + and argvalue in the form: + +SELECT pg_restore_relation_stats( + 'arg1name', 'arg1value'::arg1type, + 'arg2name', 'arg2value'::arg2type, + 'arg3name', 'arg3value'::arg3type); + + + + For example, to set the relpages and + reltuples values for the table + mytable: + +SELECT pg_restore_relation_stats( + 'schemaname', 'myschema', + 'relname', 'mytable', + 'relpages', 173::integer, + 'reltuples', 10000::real); + + + + The arguments schemaname and + relname are required, and specify the table. Other + arguments are the names and values of statistics corresponding to + certain columns in pg_class. + The currently-supported relation statistics are + relpages with a value of type + integer, reltuples with a value of + type real, relallvisible with a value + of type integer, and relallfrozen + with a value of type integer. + + + Additionally, this function accepts argument name + version of type integer, which + specifies the server version from which the statistics originated. + This is anticipated to be helpful in porting statistics from older + versions of PostgreSQL. + + + Minor errors are reported as a WARNING and + ignored, and remaining statistics will still be restored. If all + specified statistics are successfully restored, returns + true, otherwise false. + + + The caller must have the MAINTAIN privilege on the + table or be the owner of the database. + + + + + + + + + pg_clear_relation_stats + + pg_clear_relation_stats ( schemaname text, relname text ) + void + + + Clears table-level statistics for the given relation, as though the + table was newly created. + + + The caller must have the MAINTAIN privilege on the + table or be the owner of the database. + + + + + + + + pg_restore_attribute_stats + + pg_restore_attribute_stats ( + VARIADIC kwargs "any" ) + boolean + + + Creates or updates column-level statistics. Ordinarily, these + statistics are collected automatically or updated as a part of or , so it's not + necessary to call this function. However, it is useful after a + restore to enable the optimizer to choose better plans if + ANALYZE has not been run yet. 
+ + + The tracked statistics may change from version to version, so + arguments are passed as pairs of argname + and argvalue in the form: + +SELECT pg_restore_attribute_stats( + 'arg1name', 'arg1value'::arg1type, + 'arg2name', 'arg2value'::arg2type, + 'arg3name', 'arg3value'::arg3type); + + + + For example, to set the avg_width and + null_frac values for the attribute + col1 of the table + mytable: + +SELECT pg_restore_attribute_stats( + 'schemaname', 'myschema', + 'relname', 'mytable', + 'attname', 'col1', + 'inherited', false, + 'avg_width', 125::integer, + 'null_frac', 0.5::real); + + + + The required arguments are schemaname and + relname with a value of type text + which specify the table; either attname with a + value of type text or attnum with a + value of type smallint, which specifies the column; and + inherited, which specifies whether the statistics + include values from child tables. Other arguments are the names and + values of statistics corresponding to columns in pg_stats. + + + Additionally, this function accepts argument name + version of type integer, which + specifies the server version from which the statistics originated. + This is anticipated to be helpful in porting statistics from older + versions of PostgreSQL. + + + Minor errors are reported as a WARNING and + ignored, and remaining statistics will still be restored. If all + specified statistics are successfully restored, returns + true, otherwise false. + + + The caller must have the MAINTAIN privilege on the + table or be the owner of the database. + + + + + + + + + pg_clear_attribute_stats + + pg_clear_attribute_stats ( + schemaname text, + relname text, + attname text, + inherited boolean ) + void + + + Clears column-level statistics for the given relation and + attribute, as though the table was newly created. + + + The caller must have the MAINTAIN privilege on + the table or be the owner of the database. + + + + + +
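+
+   To discard restored (or any existing) statistics again, the clear
+   functions can be called directly, for example (reusing the hypothetical
+   table and column from the examples above):
+
+SELECT pg_clear_relation_stats('myschema', 'mytable');
+SELECT pg_clear_attribute_stats('myschema', 'mytable', 'col1', false);
+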
+ + + lists functions that provide + information about the structure of partitioned tables. + + + + Partitioning Information Functions + + + + + Function + + + Description + + + + + + + + + pg_partition_tree + + pg_partition_tree ( regclass ) + setof record + ( relid regclass, + parentrelid regclass, + isleaf boolean, + level integer ) + + + Lists the tables or indexes in the partition tree of the + given partitioned table or partitioned index, with one row for each + partition. Information provided includes the OID of the partition, + the OID of its immediate parent, a boolean value telling if the + partition is a leaf, and an integer telling its level in the hierarchy. + The level value is 0 for the input table or index, 1 for its + immediate child partitions, 2 for their partitions, and so on. + Returns no rows if the relation does not exist or is not a partition + or partitioned table. + + + + + + + pg_partition_ancestors + + pg_partition_ancestors ( regclass ) + setof regclass + + + Lists the ancestor relations of the given partition, + including the relation itself. Returns no rows if the relation + does not exist or is not a partition or partitioned table. + + + + + + + pg_partition_root + + pg_partition_root ( regclass ) + regclass + + + Returns the top-most parent of the partition tree to which the given + relation belongs. Returns NULL if the relation + does not exist or is not a partition or partitioned table. + + + + +
+ + + For example, to check the total size of the data contained in a + partitioned table measurement, one could use the + following query: + +SELECT pg_size_pretty(sum(pg_relation_size(relid))) AS total_size + FROM pg_partition_tree('measurement'); + + + +
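+
+   Similarly, the other two functions can be used to navigate upward from a
+   leaf partition (a sketch; measurement_y2006m02 stands
+   for any partition of measurement):
+
+SELECT * FROM pg_partition_ancestors('measurement_y2006m02');
+SELECT pg_partition_root('measurement_y2006m02');
+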
+ + + Index Maintenance Functions + + + shows the functions + available for index maintenance tasks. (Note that these maintenance + tasks are normally done automatically by autovacuum; use of these + functions is only required in special cases.) + These functions cannot be executed during recovery. + Use of these functions is restricted to superusers and the owner + of the given index. + + + + Index Maintenance Functions + + + + + Function + + + Description + + + + + + + + + brin_summarize_new_values + + brin_summarize_new_values ( index regclass ) + integer + + + Scans the specified BRIN index to find page ranges in the base table + that are not currently summarized by the index; for any such range it + creates a new summary index tuple by scanning those table pages. + Returns the number of new page range summaries that were inserted + into the index. + + + + + + + brin_summarize_range + + brin_summarize_range ( index regclass, blockNumber bigint ) + integer + + + Summarizes the page range covering the given block, if not already + summarized. This is + like brin_summarize_new_values except that it + only processes the page range that covers the given table block number. + + + + + + + brin_desummarize_range + + brin_desummarize_range ( index regclass, blockNumber bigint ) + void + + + Removes the BRIN index tuple that summarizes the page range covering + the given table block, if there is one. + + + + + + + gin_clean_pending_list + + gin_clean_pending_list ( index regclass ) + bigint + + + Cleans up the pending list of the specified GIN index + by moving entries in it, in bulk, to the main GIN data structure. + Returns the number of pages removed from the pending list. + If the argument is a GIN index built with + the fastupdate option disabled, no cleanup happens + and the result is zero, because the index doesn't have a pending list. + See and + for details about the pending list and fastupdate + option. + + + + +
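+
+   For example, assuming a BRIN index brin_idx and a GIN
+   index gin_idx (hypothetical names), one might run:
+
+SELECT brin_summarize_new_values('brin_idx');  -- summarize any new page ranges
+SELECT brin_desummarize_range('brin_idx', 0); -- drop the summary covering block 0
+SELECT gin_clean_pending_list('gin_idx');     -- flush the GIN pending list
+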
+ +
+ + + Generic File Access Functions + + + The functions shown in provide native access to + files on the machine hosting the server. Only files within the + database cluster directory and the log_directory can be + accessed, unless the user is a superuser or is granted the role + pg_read_server_files. Use a relative path for files in + the cluster directory, and a path matching the log_directory + configuration setting for log files. + + + + Note that granting users the EXECUTE privilege on + pg_read_file(), or related functions, allows them the + ability to read any file on the server that the database server process can + read; these functions bypass all in-database privilege checks. This means + that, for example, a user with such access is able to read the contents of + the pg_authid table where authentication + information is stored, as well as read any table data in the database. + Therefore, granting access to these functions should be carefully + considered. + + + + When granting privilege on these functions, note that the table entries + showing optional parameters are mostly implemented as several physical + functions with different parameter lists. Privilege must be granted + separately on each such function, if it is to be + used. psql's \df command + can be useful to check what the actual function signatures are. + + + + Some of these functions take an optional missing_ok + parameter, which specifies the behavior when the file or directory does + not exist. If true, the function + returns NULL or an empty result set, as appropriate. + If false, an error is raised. (Failure conditions + other than file not found are reported as errors in any + case.) The default is false. + + + + Generic File Access Functions + + + + + Function + + + Description + + + + + + + + + pg_ls_dir + + pg_ls_dir ( dirname text , missing_ok boolean, include_dot_dirs boolean ) + setof text + + + Returns the names of all files (and directories and other special + files) in the specified + directory. The include_dot_dirs parameter + indicates whether . and .. are to be + included in the result set; the default is to exclude them. Including + them can be useful when missing_ok + is true, to distinguish an empty directory from a + non-existent directory. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + + + + pg_ls_logdir + + pg_ls_logdir () + setof record + ( name text, + size bigint, + modification timestamp with time zone ) + + + Returns the name, size, and last modification time (mtime) of each + ordinary file in the server's log directory. Filenames beginning with + a dot, directories, and other special files are excluded. + + + This function is restricted to superusers and roles with privileges of + the pg_monitor role by default, but other users can + be granted EXECUTE to run the function. + + + + + + + pg_ls_waldir + + pg_ls_waldir () + setof record + ( name text, + size bigint, + modification timestamp with time zone ) + + + Returns the name, size, and last modification time (mtime) of each + ordinary file in the server's write-ahead log (WAL) directory. + Filenames beginning with a dot, directories, and other special files + are excluded. + + + This function is restricted to superusers and roles with privileges of + the pg_monitor role by default, but other users can + be granted EXECUTE to run the function. 
+ + + + + + + pg_ls_logicalmapdir + + pg_ls_logicalmapdir () + setof record + ( name text, + size bigint, + modification timestamp with time zone ) + + + Returns the name, size, and last modification time (mtime) of each + ordinary file in the server's pg_logical/mappings + directory. Filenames beginning with a dot, directories, and other + special files are excluded. + + + This function is restricted to superusers and members of + the pg_monitor role by default, but other users can + be granted EXECUTE to run the function. + + + + + + + pg_ls_logicalsnapdir + + pg_ls_logicalsnapdir () + setof record + ( name text, + size bigint, + modification timestamp with time zone ) + + + Returns the name, size, and last modification time (mtime) of each + ordinary file in the server's pg_logical/snapshots + directory. Filenames beginning with a dot, directories, and other + special files are excluded. + + + This function is restricted to superusers and members of + the pg_monitor role by default, but other users can + be granted EXECUTE to run the function. + + + + + + + pg_ls_replslotdir + + pg_ls_replslotdir ( slot_name text ) + setof record + ( name text, + size bigint, + modification timestamp with time zone ) + + + Returns the name, size, and last modification time (mtime) of each + ordinary file in the server's pg_replslot/slot_name + directory, where slot_name is the name of the + replication slot provided as input of the function. Filenames beginning + with a dot, directories, and other special files are excluded. + + + This function is restricted to superusers and members of + the pg_monitor role by default, but other users can + be granted EXECUTE to run the function. + + + + + + + pg_ls_summariesdir + + pg_ls_summariesdir () + setof record + ( name text, + size bigint, + modification timestamp with time zone ) + + + Returns the name, size, and last modification time (mtime) of each + ordinary file in the server's WAL summaries directory + (pg_wal/summaries). Filenames beginning + with a dot, directories, and other special files are excluded. + + + This function is restricted to superusers and members of + the pg_monitor role by default, but other users can + be granted EXECUTE to run the function. + + + + + + + pg_ls_archive_statusdir + + pg_ls_archive_statusdir () + setof record + ( name text, + size bigint, + modification timestamp with time zone ) + + + Returns the name, size, and last modification time (mtime) of each + ordinary file in the server's WAL archive status directory + (pg_wal/archive_status). Filenames beginning + with a dot, directories, and other special files are excluded. + + + This function is restricted to superusers and members of + the pg_monitor role by default, but other users can + be granted EXECUTE to run the function. + + + + + + + + pg_ls_tmpdir + + pg_ls_tmpdir ( tablespace oid ) + setof record + ( name text, + size bigint, + modification timestamp with time zone ) + + + Returns the name, size, and last modification time (mtime) of each + ordinary file in the temporary file directory for the + specified tablespace. + If tablespace is not provided, + the pg_default tablespace is examined. Filenames + beginning with a dot, directories, and other special files are + excluded. + + + This function is restricted to superusers and members of + the pg_monitor role by default, but other users can + be granted EXECUTE to run the function. 
+ + + + + + + pg_read_file + + pg_read_file ( filename text , offset bigint, length bigint , missing_ok boolean ) + text + + + Returns all or part of a text file, starting at the + given byte offset, returning at + most length bytes (less if the end of file is + reached first). If offset is negative, it is + relative to the end of the file. If offset + and length are omitted, the entire file is + returned. The bytes read from the file are interpreted as a string in + the database's encoding; an error is thrown if they are not valid in + that encoding. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + + + + pg_read_binary_file + + pg_read_binary_file ( filename text , offset bigint, length bigint , missing_ok boolean ) + bytea + + + Returns all or part of a file. This function is identical to + pg_read_file except that it can read arbitrary + binary data, returning the result as bytea + not text; accordingly, no encoding checks are performed. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + In combination with the convert_from function, + this function can be used to read a text file in a specified encoding + and convert to the database's encoding: + +SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8'); + + + + + + + + pg_stat_file + + pg_stat_file ( filename text , missing_ok boolean ) + record + ( size bigint, + access timestamp with time zone, + modification timestamp with time zone, + change timestamp with time zone, + creation timestamp with time zone, + isdir boolean ) + + + Returns a record containing the file's size, last access time stamp, + last modification time stamp, last file status change time stamp (Unix + platforms only), file creation time stamp (Windows only), and a flag + indicating if it is a directory. + + + This function is restricted to superusers by default, but other users + can be granted EXECUTE to run the function. + + + + + +
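+
+   For example, the number and total size of WAL segment files can be
+   summarized with pg_ls_waldir, and a configuration file
+   can be inspected with pg_stat_file and
+   pg_read_file (a sketch; it assumes sufficient
+   privileges as described above and that postgresql.conf
+   resides in the data directory):
+
+SELECT count(*) AS wal_files, pg_size_pretty(sum(size)) AS total_size
+  FROM pg_ls_waldir();
+
+SELECT size, modification FROM pg_stat_file('postgresql.conf');
+SELECT pg_read_file('postgresql.conf', 0, 100);
+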
+ +
+ + + Advisory Lock Functions + + + The functions shown in + manage advisory locks. For details about proper use of these functions, + see . + + + + All these functions are intended to be used to lock application-defined + resources, which can be identified either by a single 64-bit key value or + two 32-bit key values (note that these two key spaces do not overlap). + If another session already holds a conflicting lock on the same resource + identifier, the functions will either wait until the resource becomes + available, or return a false result, as appropriate for + the function. + Locks can be either shared or exclusive: a shared lock does not conflict + with other shared locks on the same resource, only with exclusive locks. + Locks can be taken at session level (so that they are held until released + or the session ends) or at transaction level (so that they are held until + the current transaction ends; there is no provision for manual release). + Multiple session-level lock requests stack, so that if the same resource + identifier is locked three times there must then be three unlock requests + to release the resource in advance of session end. + + + + Advisory Lock Functions + + + + + Function + + + Description + + + + + + + + + pg_advisory_lock + + pg_advisory_lock ( key bigint ) + void + + + pg_advisory_lock ( key1 integer, key2 integer ) + void + + + Obtains an exclusive session-level advisory lock, waiting if necessary. + + + + + + + pg_advisory_lock_shared + + pg_advisory_lock_shared ( key bigint ) + void + + + pg_advisory_lock_shared ( key1 integer, key2 integer ) + void + + + Obtains a shared session-level advisory lock, waiting if necessary. + + + + + + + pg_advisory_unlock + + pg_advisory_unlock ( key bigint ) + boolean + + + pg_advisory_unlock ( key1 integer, key2 integer ) + boolean + + + Releases a previously-acquired exclusive session-level advisory lock. + Returns true if the lock is successfully released. + If the lock was not held, false is returned, and in + addition, an SQL warning will be reported by the server. + + + + + + + pg_advisory_unlock_all + + pg_advisory_unlock_all () + void + + + Releases all session-level advisory locks held by the current session. + (This function is implicitly invoked at session end, even if the + client disconnects ungracefully.) + + + + + + + pg_advisory_unlock_shared + + pg_advisory_unlock_shared ( key bigint ) + boolean + + + pg_advisory_unlock_shared ( key1 integer, key2 integer ) + boolean + + + Releases a previously-acquired shared session-level advisory lock. + Returns true if the lock is successfully released. + If the lock was not held, false is returned, and in + addition, an SQL warning will be reported by the server. + + + + + + + pg_advisory_xact_lock + + pg_advisory_xact_lock ( key bigint ) + void + + + pg_advisory_xact_lock ( key1 integer, key2 integer ) + void + + + Obtains an exclusive transaction-level advisory lock, waiting if + necessary. + + + + + + + pg_advisory_xact_lock_shared + + pg_advisory_xact_lock_shared ( key bigint ) + void + + + pg_advisory_xact_lock_shared ( key1 integer, key2 integer ) + void + + + Obtains a shared transaction-level advisory lock, waiting if + necessary. + + + + + + + pg_try_advisory_lock + + pg_try_advisory_lock ( key bigint ) + boolean + + + pg_try_advisory_lock ( key1 integer, key2 integer ) + boolean + + + Obtains an exclusive session-level advisory lock if available. 
+ This will either obtain the lock immediately and + return true, or return false + without waiting if the lock cannot be acquired immediately. + + + + + + + pg_try_advisory_lock_shared + + pg_try_advisory_lock_shared ( key bigint ) + boolean + + + pg_try_advisory_lock_shared ( key1 integer, key2 integer ) + boolean + + + Obtains a shared session-level advisory lock if available. + This will either obtain the lock immediately and + return true, or return false + without waiting if the lock cannot be acquired immediately. + + + + + + + pg_try_advisory_xact_lock + + pg_try_advisory_xact_lock ( key bigint ) + boolean + + + pg_try_advisory_xact_lock ( key1 integer, key2 integer ) + boolean + + + Obtains an exclusive transaction-level advisory lock if available. + This will either obtain the lock immediately and + return true, or return false + without waiting if the lock cannot be acquired immediately. + + + + + + + pg_try_advisory_xact_lock_shared + + pg_try_advisory_xact_lock_shared ( key bigint ) + boolean + + + pg_try_advisory_xact_lock_shared ( key1 integer, key2 integer ) + boolean + + + Obtains a shared transaction-level advisory lock if available. + This will either obtain the lock immediately and + return true, or return false + without waiting if the lock cannot be acquired immediately. + + + + +
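+
+   For example, a non-blocking critical section might be structured as
+   follows (a sketch; the key 12345 is arbitrary and
+   application-defined):
+
+SELECT pg_try_advisory_lock(12345) AS acquired;
+-- proceed only if acquired is true
+-- ... perform the protected work ...
+SELECT pg_advisory_unlock(12345);
+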
+ +
+ +
diff --git a/doc/src/sgml/func/func-aggregate.sgml b/doc/src/sgml/func/func-aggregate.sgml new file mode 100644 index 0000000000000..8031cde2c96c8 --- /dev/null +++ b/doc/src/sgml/func/func-aggregate.sgml @@ -0,0 +1,1418 @@ + + Aggregate Functions + + + aggregate function + built-in + + + + Aggregate functions compute a single result + from a set of input values. The built-in general-purpose aggregate + functions are listed in + while statistical aggregates are in . + The built-in within-group ordered-set aggregate functions + are listed in + while the built-in within-group hypothetical-set ones are in . Grouping operations, + which are closely related to aggregate functions, are listed in + . + The special syntax considerations for aggregate + functions are explained in . + Consult for additional introductory + information. + + + + Aggregate functions that support Partial Mode + are eligible to participate in various optimizations, such as parallel + aggregation. + + + + While all aggregates below accept an optional + ORDER BY clause (as outlined in ), the clause has only been added to + aggregates whose output is affected by ordering. + + + + General-Purpose Aggregate Functions + + + + + + + Function + + + Description + + Partial Mode + + + + + + + + any_value + + any_value ( anyelement ) + same as input type + + + Returns an arbitrary value from the non-null input values. + + Yes + + + + + + array_agg + + array_agg ( anynonarray ORDER BY input_sort_columns ) + anyarray + + + Collects all the input values, including nulls, into an array. + + Yes + + + + + array_agg ( anyarray ORDER BY input_sort_columns ) + anyarray + + + Concatenates all the input arrays into an array of one higher + dimension. (The inputs must all have the same dimensionality, and + cannot be empty or null.) + + Yes + + + + + + average + + + avg + + avg ( smallint ) + numeric + + + avg ( integer ) + numeric + + + avg ( bigint ) + numeric + + + avg ( numeric ) + numeric + + + avg ( real ) + double precision + + + avg ( double precision ) + double precision + + + avg ( interval ) + interval + + + Computes the average (arithmetic mean) of all the non-null input + values. + + Yes + + + + + + bit_and + + bit_and ( smallint ) + smallint + + + bit_and ( integer ) + integer + + + bit_and ( bigint ) + bigint + + + bit_and ( bit ) + bit + + + Computes the bitwise AND of all non-null input values. + + Yes + + + + + + bit_or + + bit_or ( smallint ) + smallint + + + bit_or ( integer ) + integer + + + bit_or ( bigint ) + bigint + + + bit_or ( bit ) + bit + + + Computes the bitwise OR of all non-null input values. + + Yes + + + + + + bit_xor + + bit_xor ( smallint ) + smallint + + + bit_xor ( integer ) + integer + + + bit_xor ( bigint ) + bigint + + + bit_xor ( bit ) + bit + + + Computes the bitwise exclusive OR of all non-null input values. + Can be useful as a checksum for an unordered set of values. + + Yes + + + + + + bool_and + + bool_and ( boolean ) + boolean + + + Returns true if all non-null input values are true, otherwise false. + + Yes + + + + + + bool_or + + bool_or ( boolean ) + boolean + + + Returns true if any non-null input value is true, otherwise false. + + Yes + + + + + + count + + count ( * ) + bigint + + + Computes the number of input rows. + + Yes + + + + + count ( "any" ) + bigint + + + Computes the number of input rows in which the input value is not + null. + + Yes + + + + + + every + + every ( boolean ) + boolean + + + This is the SQL standard's equivalent to bool_and. 
+ + Yes + + + + + + json_agg + + json_agg ( anyelement ORDER BY input_sort_columns ) + json + + + + jsonb_agg + + jsonb_agg ( anyelement ORDER BY input_sort_columns ) + jsonb + + + Collects all the input values, including nulls, into a JSON array. + Values are converted to JSON as per to_json + or to_jsonb. + + No + + + + + + json_agg_strict + + json_agg_strict ( anyelement ) + json + + + + jsonb_agg_strict + + jsonb_agg_strict ( anyelement ) + jsonb + + + Collects all the input values, skipping nulls, into a JSON array. + Values are converted to JSON as per to_json + or to_jsonb. + + No + + + + + json_arrayagg + json_arrayagg ( + value_expression + ORDER BY sort_expression + { NULL | ABSENT } ON NULL + RETURNING data_type FORMAT JSON ENCODING UTF8 ) + + + Behaves in the same way as json_array + but as an aggregate function so it only takes one + value_expression parameter. + If ABSENT ON NULL is specified, any NULL + values are omitted. + If ORDER BY is specified, the elements will + appear in the array in that order rather than in the input order. + + + SELECT json_arrayagg(v) FROM (VALUES(2),(1)) t(v) + [2, 1] + + No + + + + + json_objectagg + json_objectagg ( + { key_expression { VALUE | ':' } value_expression } + { NULL | ABSENT } ON NULL + { WITH | WITHOUT } UNIQUE KEYS + RETURNING data_type FORMAT JSON ENCODING UTF8 ) + + + Behaves like json_object, but as an + aggregate function, so it only takes one + key_expression and one + value_expression parameter. + + + SELECT json_objectagg(k:v) FROM (VALUES ('a'::text,current_date),('b',current_date + 1)) AS t(k,v) + { "a" : "2022-05-10", "b" : "2022-05-11" } + + No + + + + + + json_object_agg + + json_object_agg ( key + "any", value + "any" + ORDER BY input_sort_columns ) + json + + + + jsonb_object_agg + + jsonb_object_agg ( key + "any", value + "any" + ORDER BY input_sort_columns ) + jsonb + + + Collects all the key/value pairs into a JSON object. Key arguments + are coerced to text; value arguments are converted as per + to_json or to_jsonb. + Values can be null, but keys cannot. + + No + + + + + + json_object_agg_strict + + json_object_agg_strict ( + key "any", + value "any" ) + json + + + + jsonb_object_agg_strict + + jsonb_object_agg_strict ( + key "any", + value "any" ) + jsonb + + + Collects all the key/value pairs into a JSON object. Key arguments + are coerced to text; value arguments are converted as per + to_json or to_jsonb. + The key cannot be null. If the + value is null then the entry is skipped, + + No + + + + + + json_object_agg_unique + + json_object_agg_unique ( + key "any", + value "any" ) + json + + + + jsonb_object_agg_unique + + jsonb_object_agg_unique ( + key "any", + value "any" ) + jsonb + + + Collects all the key/value pairs into a JSON object. Key arguments + are coerced to text; value arguments are converted as per + to_json or to_jsonb. + Values can be null, but keys cannot. + If there is a duplicate key an error is thrown. + + No + + + + + + json_object_agg_unique_strict + + json_object_agg_unique_strict ( + key "any", + value "any" ) + json + + + + jsonb_object_agg_unique_strict + + jsonb_object_agg_unique_strict ( + key "any", + value "any" ) + jsonb + + + Collects all the key/value pairs into a JSON object. Key arguments + are coerced to text; value arguments are converted as per + to_json or to_jsonb. + The key cannot be null. If the + value is null then the entry is skipped. + If there is a duplicate key an error is thrown. 
+ + No + + + + + + max + + max ( see text ) + same as input type + + + Computes the maximum of the non-null input + values. Available for any numeric, string, date/time, or enum type, + as well as bytea, inet, interval, + money, oid, pg_lsn, + tid, xid8, + and also arrays and composite types containing sortable data types. + + Yes + + + + + + min + + min ( see text ) + same as input type + + + Computes the minimum of the non-null input + values. Available for any numeric, string, date/time, or enum type, + as well as bytea, inet, interval, + money, oid, pg_lsn, + tid, xid8, + and also arrays and composite types containing sortable data types. + + Yes + + + + + + range_agg + + range_agg ( value + anyrange ) + anymultirange + + + range_agg ( value + anymultirange ) + anymultirange + + + Computes the union of the non-null input values. + + No + + + + + + range_intersect_agg + + range_intersect_agg ( value + anyrange ) + anyrange + + + range_intersect_agg ( value + anymultirange ) + anymultirange + + + Computes the intersection of the non-null input values. + + No + + + + + + string_agg + + string_agg ( value + text, delimiter text ) + text + + + string_agg ( value + bytea, delimiter bytea + ORDER BY input_sort_columns ) + bytea + + + Concatenates the non-null input values into a string. Each value + after the first is preceded by the + corresponding delimiter (if it's not null). + + Yes + + + + + + sum + + sum ( smallint ) + bigint + + + sum ( integer ) + bigint + + + sum ( bigint ) + numeric + + + sum ( numeric ) + numeric + + + sum ( real ) + real + + + sum ( double precision ) + double precision + + + sum ( interval ) + interval + + + sum ( money ) + money + + + Computes the sum of the non-null input values. + + Yes + + + + + + xmlagg + + xmlagg ( xml ORDER BY input_sort_columns ) + xml + + + Concatenates the non-null XML input values (see + ). + + No + + + +
+ + + It should be noted that except for count, + these functions return a null value when no rows are selected. In + particular, sum of no rows returns null, not + zero as one might expect, and array_agg + returns null rather than an empty array when there are no input + rows. The coalesce function can be used to + substitute zero or an empty array for null when necessary. + + + + The aggregate functions array_agg, + json_agg, jsonb_agg, + json_agg_strict, jsonb_agg_strict, + json_object_agg, jsonb_object_agg, + json_object_agg_strict, jsonb_object_agg_strict, + json_object_agg_unique, jsonb_object_agg_unique, + json_object_agg_unique_strict, + jsonb_object_agg_unique_strict, + string_agg, + and xmlagg, as well as similar user-defined + aggregate functions, produce meaningfully different result values + depending on the order of the input values. This ordering is + unspecified by default, but can be controlled by writing an + ORDER BY clause within the aggregate call, as shown in + . + Alternatively, supplying the input values from a sorted subquery + will usually work. For example: + + + + Beware that this approach can fail if the outer query level contains + additional processing, such as a join, because that might cause the + subquery's output to be reordered before the aggregate is computed. + + + + + ANY + + + SOME + + + The boolean aggregates bool_and and + bool_or correspond to the standard SQL aggregates + every and any or + some. + PostgreSQL + supports every, but not any + or some, because there is an ambiguity built into + the standard syntax: + +SELECT b1 = ANY((SELECT b2 FROM t2 ...)) FROM t1 ...; + + Here ANY can be considered either as introducing + a subquery, or as being an aggregate function, if the subquery + returns one row with a Boolean value. + Thus the standard name cannot be given to these aggregates. + + + + + + Users accustomed to working with other SQL database management + systems might be disappointed by the performance of the + count aggregate when it is applied to the + entire table. A query like: + +SELECT count(*) FROM sometable; + + will require effort proportional to the size of the table: + PostgreSQL will need to scan either the + entire table or the entirety of an index that includes all rows in + the table. + + + + + shows + aggregate functions typically used in statistical analysis. + (These are separated out merely to avoid cluttering the listing + of more-commonly-used aggregates.) Functions shown as + accepting numeric_type are available for all + the types smallint, integer, + bigint, numeric, real, + and double precision. + Where the description mentions + N, it means the + number of input rows for which all the input expressions are non-null. + In all cases, null is returned if the computation is meaningless, + for example when N is zero. + + + + statistics + + + linear regression + + + + Aggregate Functions for Statistics + + + + + + + Function + + + Description + + Partial Mode + + + + + + + + correlation + + + corr + + corr ( Y double precision, X double precision ) + double precision + + + Computes the correlation coefficient. + + Yes + + + + + + covariance + population + + + covar_pop + + covar_pop ( Y double precision, X double precision ) + double precision + + + Computes the population covariance. + + Yes + + + + + + covariance + sample + + + covar_samp + + covar_samp ( Y double precision, X double precision ) + double precision + + + Computes the sample covariance. 
+ + Yes + + + + + + regr_avgx + + regr_avgx ( Y double precision, X double precision ) + double precision + + + Computes the average of the independent variable, + sum(X)/N. + + Yes + + + + + + regr_avgy + + regr_avgy ( Y double precision, X double precision ) + double precision + + + Computes the average of the dependent variable, + sum(Y)/N. + + Yes + + + + + + regr_count + + regr_count ( Y double precision, X double precision ) + bigint + + + Computes the number of rows in which both inputs are non-null. + + Yes + + + + + + regression intercept + + + regr_intercept + + regr_intercept ( Y double precision, X double precision ) + double precision + + + Computes the y-intercept of the least-squares-fit linear equation + determined by the + (X, Y) pairs. + + Yes + + + + + + regr_r2 + + regr_r2 ( Y double precision, X double precision ) + double precision + + + Computes the square of the correlation coefficient. + + Yes + + + + + + regression slope + + + regr_slope + + regr_slope ( Y double precision, X double precision ) + double precision + + + Computes the slope of the least-squares-fit linear equation determined + by the (X, Y) + pairs. + + Yes + + + + + + regr_sxx + + regr_sxx ( Y double precision, X double precision ) + double precision + + + Computes the sum of squares of the independent + variable, + sum(X^2) - sum(X)^2/N. + + Yes + + + + + + regr_sxy + + regr_sxy ( Y double precision, X double precision ) + double precision + + + Computes the sum of products of independent times + dependent variables, + sum(X*Y) - sum(X) * sum(Y)/N. + + Yes + + + + + + regr_syy + + regr_syy ( Y double precision, X double precision ) + double precision + + + Computes the sum of squares of the dependent + variable, + sum(Y^2) - sum(Y)^2/N. + + Yes + + + + + + standard deviation + + + stddev + + stddev ( numeric_type ) + double precision + for real or double precision, + otherwise numeric + + + This is a historical alias for stddev_samp. + + Yes + + + + + + standard deviation + population + + + stddev_pop + + stddev_pop ( numeric_type ) + double precision + for real or double precision, + otherwise numeric + + + Computes the population standard deviation of the input values. + + Yes + + + + + + standard deviation + sample + + + stddev_samp + + stddev_samp ( numeric_type ) + double precision + for real or double precision, + otherwise numeric + + + Computes the sample standard deviation of the input values. + + Yes + + + + + + variance + + variance ( numeric_type ) + double precision + for real or double precision, + otherwise numeric + + + This is a historical alias for var_samp. + + Yes + + + + + + variance + population + + + var_pop + + var_pop ( numeric_type ) + double precision + for real or double precision, + otherwise numeric + + + Computes the population variance of the input values (square of the + population standard deviation). + + Yes + + + + + + variance + sample + + + var_samp + + var_samp ( numeric_type ) + double precision + for real or double precision, + otherwise numeric + + + Computes the sample variance of the input values (square of the sample + standard deviation). + + Yes + + + +
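+
+ For example, the regression aggregates can be combined in a single
+ query. This is an illustrative sketch; the inline VALUES list stands
+ in for a real table of (x, y) observations:
+
+SELECT regr_slope(y, x)     AS slope,      -- 1.95
+       regr_intercept(y, x) AS intercept,  -- approximately 0.1
+       regr_count(y, x)     AS n           -- 3
+FROM (VALUES (1, 2.0), (2, 4.1), (3, 5.9)) AS obs(x, y);
+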
+ + + shows some + aggregate functions that use the ordered-set aggregate + syntax. These functions are sometimes referred to as inverse + distribution functions. Their aggregated input is introduced by + ORDER BY, and they may also take a direct + argument that is not aggregated, but is computed only once. + All these functions ignore null values in their aggregated input. + For those that take a fraction parameter, the + fraction value must be between 0 and 1; an error is thrown if not. + However, a null fraction value simply produces a + null result. + + + + ordered-set aggregate + built-in + + + inverse distribution + + + + Ordered-Set Aggregate Functions + + + + + + + Function + + + Description + + Partial Mode + + + + + + + + mode + statistical + + mode () WITHIN GROUP ( ORDER BY anyelement ) + anyelement + + + Computes the mode, the most frequent + value of the aggregated argument (arbitrarily choosing the first one + if there are multiple equally-frequent values). The aggregated + argument must be of a sortable type. + + No + + + + + + percentile + continuous + + percentile_cont ( fraction double precision ) WITHIN GROUP ( ORDER BY double precision ) + double precision + + + percentile_cont ( fraction double precision ) WITHIN GROUP ( ORDER BY interval ) + interval + + + Computes the continuous percentile, a value + corresponding to the specified fraction + within the ordered set of aggregated argument values. This will + interpolate between adjacent input items if needed. + + No + + + + + percentile_cont ( fractions double precision[] ) WITHIN GROUP ( ORDER BY double precision ) + double precision[] + + + percentile_cont ( fractions double precision[] ) WITHIN GROUP ( ORDER BY interval ) + interval[] + + + Computes multiple continuous percentiles. The result is an array of + the same dimensions as the fractions + parameter, with each non-null element replaced by the (possibly + interpolated) value corresponding to that percentile. + + No + + + + + + percentile + discrete + + percentile_disc ( fraction double precision ) WITHIN GROUP ( ORDER BY anyelement ) + anyelement + + + Computes the discrete percentile, the first + value within the ordered set of aggregated argument values whose + position in the ordering equals or exceeds the + specified fraction. The aggregated + argument must be of a sortable type. + + No + + + + + percentile_disc ( fractions double precision[] ) WITHIN GROUP ( ORDER BY anyelement ) + anyarray + + + Computes multiple discrete percentiles. The result is an array of the + same dimensions as the fractions parameter, + with each non-null element replaced by the input value corresponding + to that percentile. + The aggregated argument must be of a sortable type. + + No + + + +
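+
+ For example, a median (continuous percentile), a discrete percentile,
+ and a mode can be computed together; the VALUES list here is purely
+ illustrative:
+
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) AS median,      -- 2
+       percentile_disc(0.5) WITHIN GROUP (ORDER BY v) AS p50,         -- 2
+       mode() WITHIN GROUP (ORDER BY v)               AS most_common  -- 2
+FROM (VALUES (1), (2), (2), (3), (100)) AS t(v);
+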
+ + + hypothetical-set aggregate + built-in + + + + Each of the hypothetical-set aggregates listed in + is associated with a + window function of the same name defined in + . In each case, the aggregate's result + is the value that the associated window function would have + returned for the hypothetical row constructed from + args, if such a row had been added to the sorted + group of rows represented by the sorted_args. + For each of these functions, the list of direct arguments + given in args must match the number and types of + the aggregated arguments given in sorted_args. + Unlike most built-in aggregates, these aggregates are not strict, that is + they do not drop input rows containing nulls. Null values sort according + to the rule specified in the ORDER BY clause. + + + + Hypothetical-Set Aggregate Functions + + + + + + + Function + + + Description + + Partial Mode + + + + + + + + rank + hypothetical + + rank ( args ) WITHIN GROUP ( ORDER BY sorted_args ) + bigint + + + Computes the rank of the hypothetical row, with gaps; that is, the row + number of the first row in its peer group. + + No + + + + + + dense_rank + hypothetical + + dense_rank ( args ) WITHIN GROUP ( ORDER BY sorted_args ) + bigint + + + Computes the rank of the hypothetical row, without gaps; this function + effectively counts peer groups. + + No + + + + + + percent_rank + hypothetical + + percent_rank ( args ) WITHIN GROUP ( ORDER BY sorted_args ) + double precision + + + Computes the relative rank of the hypothetical row, that is + (rank - 1) / (total rows - 1). + The value thus ranges from 0 to 1 inclusive. + + No + + + + + + cume_dist + hypothetical + + cume_dist ( args ) WITHIN GROUP ( ORDER BY sorted_args ) + double precision + + + Computes the cumulative distribution, that is (number of rows + preceding or peers with hypothetical row) / (total rows). The value + thus ranges from 1/N to 1. + + No + + + +
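+
+ For example, to ask what rank a hypothetical value of 3000 would have
+ among a set of existing values (the VALUES list is illustrative):
+
+SELECT rank(3000) WITHIN GROUP (ORDER BY salary)         AS hypothetical_rank,  -- 3
+       percent_rank(3000) WITHIN GROUP (ORDER BY salary) AS relative_rank       -- 0.5
+FROM (VALUES (1000), (2000), (4000), (5000)) AS t(salary);
+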
+ + + Grouping Operations + + + + + Function + + + Description + + + + + + + + + GROUPING + + GROUPING ( group_by_expression(s) ) + integer + + + Returns a bit mask indicating which GROUP BY + expressions are not included in the current grouping set. + Bits are assigned with the rightmost argument corresponding to the + least-significant bit; each bit is 0 if the corresponding expression + is included in the grouping criteria of the grouping set generating + the current result row, and 1 if it is not included. + + + + +
+ + + The grouping operations shown in + are used in conjunction with + grouping sets (see ) to distinguish + result rows. The arguments to the GROUPING function + are not actually evaluated, but they must exactly match expressions given + in the GROUP BY clause of the associated query level. + For example: + +=> SELECT * FROM items_sold; + make | model | sales +-------+-------+------- + Foo | GT | 10 + Foo | Tour | 20 + Bar | City | 15 + Bar | Sport | 5 +(4 rows) + +=> SELECT make, model, GROUPING(make,model), sum(sales) FROM items_sold GROUP BY ROLLUP(make,model); + make | model | grouping | sum +-------+-------+----------+----- + Foo | GT | 0 | 10 + Foo | Tour | 0 | 20 + Bar | City | 0 | 15 + Bar | Sport | 0 | 5 + Foo | | 1 | 30 + Bar | | 1 | 20 + | | 3 | 50 +(7 rows) + + Here, the grouping value 0 in the + first four rows shows that those have been grouped normally, over both the + grouping columns. The value 1 indicates + that model was not grouped by in the next-to-last two + rows, and the value 3 indicates that + neither make nor model was grouped + by in the last row (which therefore is an aggregate over all the input + rows). + + +
diff --git a/doc/src/sgml/func/func-array.sgml b/doc/src/sgml/func/func-array.sgml new file mode 100644 index 0000000000000..7f162bd767023 --- /dev/null +++ b/doc/src/sgml/func/func-array.sgml @@ -0,0 +1,646 @@ + + Array Functions and Operators + + + shows the specialized operators + available for array types. + In addition to those, the usual comparison operators shown in are available for + arrays. The comparison operators compare the array contents + element-by-element, using the default B-tree comparison function for + the element data type, and sort based on the first difference. + In multidimensional arrays the elements are visited in row-major order + (last subscript varies most rapidly). + If the contents of two arrays are equal but the dimensionality is + different, the first difference in the dimensionality information + determines the sort order. + + + + Array Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + anyarray @> anyarray + boolean + + + Does the first array contain the second, that is, does each element + appearing in the second array equal some element of the first array? + (Duplicates are not treated specially, + thus ARRAY[1] and ARRAY[1,1] are + each considered to contain the other.) + + + ARRAY[1,4,3] @> ARRAY[3,1,3] + t + + + + + + anyarray <@ anyarray + boolean + + + Is the first array contained by the second? + + + ARRAY[2,2,7] <@ ARRAY[1,7,4,2,6] + t + + + + + + anyarray && anyarray + boolean + + + Do the arrays overlap, that is, have any elements in common? + + + ARRAY[1,4,3] && ARRAY[2,1] + t + + + + + + anycompatiblearray || anycompatiblearray + anycompatiblearray + + + Concatenates the two arrays. Concatenating a null or empty array is a + no-op; otherwise the arrays must have the same number of dimensions + (as illustrated by the first example) or differ in number of + dimensions by one (as illustrated by the second). + If the arrays are not of identical element types, they will be coerced + to a common type (see ). + + + ARRAY[1,2,3] || ARRAY[4,5,6,7] + {1,2,3,4,5,6,7} + + + ARRAY[1,2,3] || ARRAY[[4,5,6],[7,8,9.9]] + {{1,2,3},{4,5,6},{7,8,9.9}} + + + + + + anycompatible || anycompatiblearray + anycompatiblearray + + + Concatenates an element onto the front of an array (which must be + empty or one-dimensional). + + + 3 || ARRAY[4,5,6] + {3,4,5,6} + + + + + + anycompatiblearray || anycompatible + anycompatiblearray + + + Concatenates an element onto the end of an array (which must be + empty or one-dimensional). + + + ARRAY[4,5,6] || 7 + {4,5,6,7} + + + + +
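+
+ For example, containment, overlap, and concatenation can be combined
+ freely (the values are illustrative):
+
+SELECT ARRAY[1,4,3] @> ARRAY[3,1]  AS contains_all,  -- t
+       ARRAY[1,4,3] && ARRAY[2,1]  AS has_overlap,   -- t
+       ARRAY[1,2]   || ARRAY[3,4]  AS joined;        -- {1,2,3,4}
+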
+ + + See for more details about array operator + behavior. See for more details about + which operators support indexed operations. + + + + shows the functions + available for use with array types. See + for more information and examples of the use of these functions. + + + + Array Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + array_append + + array_append ( anycompatiblearray, anycompatible ) + anycompatiblearray + + + Appends an element to the end of an array (same as + the anycompatiblearray || anycompatible + operator). + + + array_append(ARRAY[1,2], 3) + {1,2,3} + + + + + + + array_cat + + array_cat ( anycompatiblearray, anycompatiblearray ) + anycompatiblearray + + + Concatenates two arrays (same as + the anycompatiblearray || anycompatiblearray + operator). + + + array_cat(ARRAY[1,2,3], ARRAY[4,5]) + {1,2,3,4,5} + + + + + + + array_dims + + array_dims ( anyarray ) + text + + + Returns a text representation of the array's dimensions. + + + array_dims(ARRAY[[1,2,3], [4,5,6]]) + [1:2][1:3] + + + + + + + array_fill + + array_fill ( anyelement, integer[] + , integer[] ) + anyarray + + + Returns an array filled with copies of the given value, having + dimensions of the lengths specified by the second argument. + The optional third argument supplies lower-bound values for each + dimension (which default to all 1). + + + array_fill(11, ARRAY[2,3]) + {{11,11,11},{11,11,11}} + + + array_fill(7, ARRAY[3], ARRAY[2]) + [2:4]={7,7,7} + + + + + + + array_length + + array_length ( anyarray, integer ) + integer + + + Returns the length of the requested array dimension. + (Produces NULL instead of 0 for empty or missing array dimensions.) + + + array_length(array[1,2,3], 1) + 3 + + + array_length(array[]::int[], 1) + NULL + + + array_length(array['text'], 2) + NULL + + + + + + + array_lower + + array_lower ( anyarray, integer ) + integer + + + Returns the lower bound of the requested array dimension. + + + array_lower('[0:2]={1,2,3}'::integer[], 1) + 0 + + + + + + + array_ndims + + array_ndims ( anyarray ) + integer + + + Returns the number of dimensions of the array. + + + array_ndims(ARRAY[[1,2,3], [4,5,6]]) + 2 + + + + + + + array_position + + array_position ( anycompatiblearray, anycompatible , integer ) + integer + + + Returns the subscript of the first occurrence of the second argument + in the array, or NULL if it's not present. + If the third argument is given, the search begins at that subscript. + The array must be one-dimensional. + Comparisons are done using IS NOT DISTINCT FROM + semantics, so it is possible to search for NULL. + + + array_position(ARRAY['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat'], 'mon') + 2 + + + + + + + array_positions + + array_positions ( anycompatiblearray, anycompatible ) + integer[] + + + Returns an array of the subscripts of all occurrences of the second + argument in the array given as first argument. + The array must be one-dimensional. + Comparisons are done using IS NOT DISTINCT FROM + semantics, so it is possible to search for NULL. + NULL is returned only if the array + is NULL; if the value is not found in the array, an + empty array is returned. + + + array_positions(ARRAY['A','A','B','A'], 'A') + {1,2,4} + + + + + + + array_prepend + + array_prepend ( anycompatible, anycompatiblearray ) + anycompatiblearray + + + Prepends an element to the beginning of an array (same as + the anycompatible || anycompatiblearray + operator). 
+ + + array_prepend(1, ARRAY[2,3]) + {1,2,3} + + + + + + + array_remove + + array_remove ( anycompatiblearray, anycompatible ) + anycompatiblearray + + + Removes all elements equal to the given value from the array. + The array must be one-dimensional. + Comparisons are done using IS NOT DISTINCT FROM + semantics, so it is possible to remove NULLs. + + + array_remove(ARRAY[1,2,3,2], 2) + {1,3} + + + + + + + array_replace + + array_replace ( anycompatiblearray, anycompatible, anycompatible ) + anycompatiblearray + + + Replaces each array element equal to the second argument with the + third argument. + + + array_replace(ARRAY[1,2,5,4], 5, 3) + {1,2,3,4} + + + + + + + array_reverse + + array_reverse ( anyarray ) + anyarray + + + Reverses the first dimension of the array. + + + array_reverse(ARRAY[[1,2],[3,4],[5,6]]) + {{5,6},{3,4},{1,2}} + + + + + + + array_sample + + array_sample ( array anyarray, n integer ) + anyarray + + + Returns an array of n items randomly selected + from array. n may not + exceed the length of array's first dimension. + If array is multi-dimensional, + an item is a slice having a given first subscript. + + + array_sample(ARRAY[1,2,3,4,5,6], 3) + {2,6,1} + + + array_sample(ARRAY[[1,2],[3,4],[5,6]], 2) + {{5,6},{1,2}} + + + + + + + array_shuffle + + array_shuffle ( anyarray ) + anyarray + + + Randomly shuffles the first dimension of the array. + + + array_shuffle(ARRAY[[1,2],[3,4],[5,6]]) + {{5,6},{1,2},{3,4}} + + + + + + + array_sort + + array_sort ( + array anyarray + , descending boolean + , nulls_first boolean + ) + anyarray + + + Sorts the first dimension of the array. + The sort order is determined by the default sort ordering of the + array's element type; however, if the element type is collatable, + the collation to use can be specified by adding + a COLLATE clause to + the array argument. + + + If descending is true then sort in + descending order, otherwise ascending order. If omitted, the + default is ascending order. + If nulls_first is true then nulls appear + before non-null values, otherwise nulls appear after non-null + values. + If omitted, nulls_first is taken to have + the same value as descending. + + + array_sort(ARRAY[[2,4],[2,1],[6,5]]) + {{2,1},{2,4},{6,5}} + + + + + + + array_to_string + + array_to_string ( array anyarray, delimiter text , null_string text ) + text + + + Converts each array element to its text representation, and + concatenates those separated by + the delimiter string. + If null_string is given and is + not NULL, then NULL array + entries are represented by that string; otherwise, they are omitted. + See also string_to_array. + + + array_to_string(ARRAY[1, 2, 3, NULL, 5], ',', '*') + 1,2,3,*,5 + + + + + + + array_upper + + array_upper ( anyarray, integer ) + integer + + + Returns the upper bound of the requested array dimension. + + + array_upper(ARRAY[1,8,3,7], 1) + 4 + + + + + + + cardinality + + cardinality ( anyarray ) + integer + + + Returns the total number of elements in the array, or 0 if the array + is empty. + + + cardinality(ARRAY[[1,2],[3,4]]) + 4 + + + + + + + trim_array + + trim_array ( array anyarray, n integer ) + anyarray + + + Trims an array by removing the last n elements. + If the array is multidimensional, only the first dimension is trimmed. + + + trim_array(ARRAY[1,2,3,4,5,6], 2) + {1,2,3,4} + + + + + + + unnest + + unnest ( anyarray ) + setof anyelement + + + Expands an array into a set of rows. + The array's elements are read out in storage order. 
+ + + unnest(ARRAY[1,2]) + + + 1 + 2 + + + + unnest(ARRAY[['foo','bar'],['baz','quux']]) + + + foo + bar + baz + quux + + + + + + + unnest ( anyarray, anyarray , ... ) + setof anyelement, anyelement [, ... ] + + + Expands multiple arrays (possibly of different data types) into a set of + rows. If the arrays are not all the same length then the shorter ones + are padded with NULLs. This form is only allowed + in a query's FROM clause; see . + + + SELECT * FROM unnest(ARRAY[1, 2], ARRAY['foo', 'bar', 'baz']) AS x(a, b) + + + a | b +---+----- + 1 | foo + 2 | bar + | baz + + + + + +
+ + + See also the section on aggregate functions about the aggregate + function array_agg for use with arrays. +
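+
+ As a small round-trip sketch, unnest expands an array into rows and
+ array_agg collects them again; the ORDER BY makes the result order
+ explicit, since it is otherwise unspecified:
+
+SELECT array_agg(x ORDER BY x) FROM unnest(ARRAY[3,1,2]) AS x;  -- {1,2,3}
+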
diff --git a/doc/src/sgml/func/func-binarystring.sgml b/doc/src/sgml/func/func-binarystring.sgml new file mode 100644 index 0000000000000..b256381e01f06 --- /dev/null +++ b/doc/src/sgml/func/func-binarystring.sgml @@ -0,0 +1,873 @@ + + Binary String Functions and Operators + + + binary data + functions + + + + This section describes functions and operators for examining and + manipulating binary strings, that is values of type bytea. + Many of these are equivalent, in purpose and syntax, to the + text-string functions described in the previous section. + + + + SQL defines some string functions that use + key words, rather than commas, to separate + arguments. Details are in + . + PostgreSQL also provides versions of these functions + that use the regular function invocation syntax + (see ). + + + + <acronym>SQL</acronym> Binary String Functions and Operators + + + + + Function/Operator + + + Description + + + Example(s) + + + + + + + + + binary string + concatenation + + bytea || bytea + bytea + + + Concatenates the two binary strings. + + + '\x123456'::bytea || '\x789a00bcde'::bytea + \x123456789a00bcde + + + + + + + bit_length + + bit_length ( bytea ) + integer + + + Returns number of bits in the binary string (8 + times the octet_length). + + + bit_length('\x123456'::bytea) + 24 + + + + + + + btrim + + btrim ( bytes bytea, + bytesremoved bytea ) + bytea + + + Removes the longest string containing only bytes appearing in + bytesremoved from the start and end of + bytes. + + + btrim('\x1234567890'::bytea, '\x9012'::bytea) + \x345678 + + + + + + + ltrim + + ltrim ( bytes bytea, + bytesremoved bytea ) + bytea + + + Removes the longest string containing only bytes appearing in + bytesremoved from the start of + bytes. + + + ltrim('\x1234567890'::bytea, '\x9012'::bytea) + \x34567890 + + + + + + + octet_length + + octet_length ( bytea ) + integer + + + Returns number of bytes in the binary string. + + + octet_length('\x123456'::bytea) + 3 + + + + + + + overlay + + overlay ( bytes bytea PLACING newsubstring bytea FROM start integer FOR count integer ) + bytea + + + Replaces the substring of bytes that starts at + the start'th byte and extends + for count bytes + with newsubstring. + If count is omitted, it defaults to the length + of newsubstring. + + + overlay('\x1234567890'::bytea PLACING '\002\003'::bytea FROM 2 FOR 3) + \x12020390 + + + + + + + position + + position ( substring bytea IN bytes bytea ) + integer + + + Returns first starting index of the specified + substring within + bytes, or zero if it's not present. + + + position('\x5678'::bytea IN '\x1234567890'::bytea) + 3 + + + + + + + rtrim + + rtrim ( bytes bytea, + bytesremoved bytea ) + bytea + + + Removes the longest string containing only bytes appearing in + bytesremoved from the end of + bytes. + + + rtrim('\x1234567890'::bytea, '\x9012'::bytea) + \x12345678 + + + + + + + substring + + substring ( bytes bytea FROM start integer FOR count integer ) + bytea + + + Extracts the substring of bytes starting at + the start'th byte if that is specified, + and stopping after count bytes if that is + specified. Provide at least one of start + and count. + + + substring('\x1234567890'::bytea FROM 3 FOR 2) + \x5678 + + + + + + + trim + + trim ( LEADING | TRAILING | BOTH + bytesremoved bytea FROM + bytes bytea ) + bytea + + + Removes the longest string containing only bytes appearing in + bytesremoved from the start, + end, or both ends (BOTH is the default) + of bytes. 
+ + + trim('\x9012'::bytea from '\x1234567890'::bytea) + \x345678 + + + + + + trim ( LEADING | TRAILING | BOTH FROM + bytes bytea, + bytesremoved bytea ) + bytea + + + This is a non-standard syntax for trim(). + + + trim(both from '\x1234567890'::bytea, '\x9012'::bytea) + \x345678 + + + + +
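+
+ The keyword forms above are interchangeable with the regular
+ function-call forms described below; for example, both of the
+ following return \x5678:
+
+SELECT substring('\x1234567890'::bytea FROM 3 FOR 2) AS keyword_form,
+       substr('\x1234567890'::bytea, 3, 2)           AS function_form;
+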
+ + + Additional binary string manipulation functions are available and + are listed in . Some + of them are used internally to implement the + SQL-standard string functions listed in . + + + + Other Binary String Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + bit_count + + + popcount + bit_count + + bit_count ( bytes bytea ) + bigint + + + Returns the number of bits set in the binary string (also known as + popcount). + + + bit_count('\x1234567890'::bytea) + 15 + + + + + + + crc32 + + crc32 ( bytea ) + bigint + + + Computes the CRC-32 value of the binary string. + + + crc32('abc'::bytea) + 891568578 + + + + + + + crc32c + + crc32c ( bytea ) + bigint + + + Computes the CRC-32C value of the binary string. + + + crc32c('abc'::bytea) + 910901175 + + + + + + + get_bit + + get_bit ( bytes bytea, + n bigint ) + integer + + + Extracts n'th bit + from binary string. + + + get_bit('\x1234567890'::bytea, 30) + 1 + + + + + + + get_byte + + get_byte ( bytes bytea, + n integer ) + integer + + + Extracts n'th byte + from binary string. + + + get_byte('\x1234567890'::bytea, 4) + 144 + + + + + + + length + + + binary string + length + + + length + of a binary string + binary strings, length + + length ( bytea ) + integer + + + Returns the number of bytes in the binary string. + + + length('\x1234567890'::bytea) + 5 + + + + + + length ( bytes bytea, + encoding name ) + integer + + + Returns the number of characters in the binary string, assuming + that it is text in the given encoding. + + + length('jose'::bytea, 'UTF8') + 4 + + + + + + + md5 + + md5 ( bytea ) + text + + + Computes the MD5 hash of + the binary string, with the result written in hexadecimal. + + + md5('Th\000omas'::bytea) + 8ab2d3c9689aaf18&zwsp;b4958c334c82d8b1 + + + + + + + reverse + + reverse ( bytea ) + bytea + + + Reverses the order of the bytes in the binary string. + + + reverse('\xabcd'::bytea) + \xcdab + + + + + + + set_bit + + set_bit ( bytes bytea, + n bigint, + newvalue integer ) + bytea + + + Sets n'th bit in + binary string to newvalue. + + + set_bit('\x1234567890'::bytea, 30, 0) + \x1234563890 + + + + + + + set_byte + + set_byte ( bytes bytea, + n integer, + newvalue integer ) + bytea + + + Sets n'th byte in + binary string to newvalue. + + + set_byte('\x1234567890'::bytea, 4, 64) + \x1234567840 + + + + + + + sha224 + + sha224 ( bytea ) + bytea + + + Computes the SHA-224 hash + of the binary string. + + + sha224('abc'::bytea) + \x23097d223405d8228642a477bda2&zwsp;55b32aadbce4bda0b3f7e36c9da7 + + + + + + + sha256 + + sha256 ( bytea ) + bytea + + + Computes the SHA-256 hash + of the binary string. + + + sha256('abc'::bytea) + \xba7816bf8f01cfea414140de5dae2223&zwsp;b00361a396177a9cb410ff61f20015ad + + + + + + + sha384 + + sha384 ( bytea ) + bytea + + + Computes the SHA-384 hash + of the binary string. + + + sha384('abc'::bytea) + \xcb00753f45a35e8bb5a03d699ac65007&zwsp;272c32ab0eded1631a8b605a43ff5bed&zwsp;8086072ba1e7cc2358baeca134c825a7 + + + + + + + sha512 + + sha512 ( bytea ) + bytea + + + Computes the SHA-512 hash + of the binary string. + + + sha512('abc'::bytea) + \xddaf35a193617abacc417349ae204131&zwsp;12e6fa4e89a97ea20a9eeee64b55d39a&zwsp;2192992a274fc1a836ba3c23a3feebbd&zwsp;454d4423643ce80e2a9ac94fa54ca49f + + + + + + + substr + + substr ( bytes bytea, start integer , count integer ) + bytea + + + Extracts the substring of bytes starting at + the start'th byte, + and extending for count bytes if that is + specified. (Same + as substring(bytes + from start + for count).) 
+ + + substr('\x1234567890'::bytea, 3, 2) + \x5678 + + + + +
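+
+ For example, the inspection and hashing functions combine naturally
+ (the input values are illustrative):
+
+SELECT bit_count('\xff00'::bytea)          AS ones_set,    -- 8
+       get_byte('\x1234567890'::bytea, 0)  AS first_byte,  -- 18
+       sha256('abc'::bytea)                AS digest;
+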
+ + + Functions get_byte and set_byte + number the first byte of a binary string as byte 0. + Functions get_bit and set_bit + number bits from the right within each byte; for example bit 0 is the least + significant bit of the first byte, and bit 15 is the most significant bit + of the second byte. + + + + For historical reasons, the function md5 + returns a hex-encoded value of type text whereas the SHA-2 + functions return type bytea. Use the functions + encode + and decode to + convert between the two. For example write encode(sha256('abc'), + 'hex') to get a hex-encoded text representation, + or decode(md5('abc'), 'hex') to get + a bytea value. + + + + + character string + converting to binary string + + + binary string + converting to character string + + Functions for converting strings between different character sets + (encodings), and for representing arbitrary binary data in textual + form, are shown in + . For these + functions, an argument or result of type text is expressed + in the database's default encoding, while arguments or results of + type bytea are in an encoding named by another argument. + + + + Text/Binary String Conversion Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + convert + + convert ( bytes bytea, + src_encoding name, + dest_encoding name ) + bytea + + + Converts a binary string representing text in + encoding src_encoding + to a binary string in encoding dest_encoding + (see for + available conversions). + + + convert('text_in_utf8', 'UTF8', 'LATIN1') + \x746578745f696e5f75746638 + + + + + + + convert_from + + convert_from ( bytes bytea, + src_encoding name ) + text + + + Converts a binary string representing text in + encoding src_encoding + to text in the database encoding + (see for + available conversions). + + + convert_from('text_in_utf8', 'UTF8') + text_in_utf8 + + + + + + + convert_to + + convert_to ( string text, + dest_encoding name ) + bytea + + + Converts a text string (in the database encoding) to a + binary string encoded in encoding dest_encoding + (see for + available conversions). + + + convert_to('some_text', 'UTF8') + \x736f6d655f74657874 + + + + + + + encode + + encode ( bytes bytea, + format text ) + text + + + Encodes binary data into a textual representation; supported + format values are: + base64, + base64url, + escape, + hex. + + + encode('123\000\001', 'base64') + MTIzAAE= + + + + + + + decode + + decode ( string text, + format text ) + bytea + + + Decodes binary data from a textual representation; supported + format values are the same as + for encode. + + + decode('MTIzAAE=', 'base64') + \x3132330001 + + + + +
+ + + The encode and decode + functions support the following textual formats: + + + + base64 + + base64 format + + + + The base64 format is that + of RFC + 2045 Section 6.8. As per the RFC, encoded lines are + broken at 76 characters. However instead of the MIME CRLF + end-of-line marker, only a newline is used for end-of-line. + The decode function ignores carriage-return, + newline, space, and tab characters. Otherwise, an error is + raised when decode is supplied invalid + base64 data — including when trailing padding is incorrect. + + + + + + base64url + + base64url format + + + + The base64url format is that of + + RFC 4648 Section 5, a base64 variant safe to + use in filenames and URLs. The base64url alphabet + uses '-' instead of '+' and + '_' instead of '/' and also omits + the '=' padding character. + + + + + + escape + + escape format + + + + The escape format converts zero bytes and + bytes with the high bit set into octal escape sequences + (\nnn), and it doubles + backslashes. Other byte values are represented literally. + The decode function will raise an error if a + backslash is not followed by either a second backslash or three + octal digits; it accepts other byte values unchanged. + + + + + + hex + + hex format + + + + The hex format represents each 4 bits of + data as one hexadecimal digit, 0 + through f, writing the higher-order digit of + each byte first. The encode function outputs + the a-f hex digits in lower + case. Because the smallest unit of data is 8 bits, there are + always an even number of characters returned + by encode. + The decode function + accepts the a-f characters in + either upper or lower case. An error is raised + when decode is given invalid hex data + — including when given an odd number of characters. + + + + + + + + In addition, it is possible to cast integral values to and from type + bytea. Casting an integer to bytea produces + 2, 4, or 8 bytes, depending on the width of the integer type. The result + is the two's complement representation of the integer, with the most + significant byte first. Some examples: + +1234::smallint::bytea \x04d2 +cast(1234 AS bytea) \x000004d2 +cast(-1234 AS bytea) \xfffffb2e +'\x8000'::bytea::smallint -32768 +'\x8000'::bytea::integer 32768 + + Casting a bytea to an integer will raise an error if the + length of the bytea exceeds the width of the integer type. + + + + See also the aggregate function string_agg in + and the large object functions + in . + +
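+
+ As a closing illustration of the textual formats described above, a
+ value survives an encode/decode round trip (the input is arbitrary):
+
+SELECT encode('\x010203'::bytea, 'base64')                    AS b64,   -- AQID
+       decode(encode('\x010203'::bytea, 'base64'), 'base64')  AS back;  -- \x010203
+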
diff --git a/doc/src/sgml/func/func-bitstring.sgml b/doc/src/sgml/func/func-bitstring.sgml new file mode 100644 index 0000000000000..3f59de464a44d --- /dev/null +++ b/doc/src/sgml/func/func-bitstring.sgml @@ -0,0 +1,358 @@ + + Bit String Functions and Operators + + + bit strings + functions + + + + This section describes functions and operators for examining and + manipulating bit strings, that is values of the types + bit and bit varying. (While only + type bit is mentioned in these tables, values of + type bit varying can be used interchangeably.) + Bit strings support the usual comparison operators shown in + , as well as the + operators shown in . + + + + Bit String Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + bit || bit + bit + + + Concatenation + + + B'10001' || B'011' + 10001011 + + + + + + bit & bit + bit + + + Bitwise AND (inputs must be of equal length) + + + B'10001' & B'01101' + 00001 + + + + + + bit | bit + bit + + + Bitwise OR (inputs must be of equal length) + + + B'10001' | B'01101' + 11101 + + + + + + bit # bit + bit + + + Bitwise exclusive OR (inputs must be of equal length) + + + B'10001' # B'01101' + 11100 + + + + + + ~ bit + bit + + + Bitwise NOT + + + ~ B'10001' + 01110 + + + + + + bit << integer + bit + + + Bitwise shift left + (string length is preserved) + + + B'10001' << 3 + 01000 + + + + + + bit >> integer + bit + + + Bitwise shift right + (string length is preserved) + + + B'10001' >> 2 + 00100 + + + + +
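+
+ For example (operand values are illustrative; remember that &,
+ | and # require equal-length inputs):
+
+SELECT B'1001' | B'0100'  AS ored,      -- 1101
+       ~ B'1001'          AS inverted,  -- 0110
+       B'1001' >> 2       AS shifted;   -- 0010
+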
+ + + Some of the functions available for binary strings are also available + for bit strings, as shown in . + + + + Bit String Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + bit_count + + bit_count ( bit ) + bigint + + + Returns the number of bits set in the bit string (also known as + popcount). + + + bit_count(B'10111') + 4 + + + + + + + bit_length + + bit_length ( bit ) + integer + + + Returns number of bits in the bit string. + + + bit_length(B'10111') + 5 + + + + + + + length + + + bit string + length + + length ( bit ) + integer + + + Returns number of bits in the bit string. + + + length(B'10111') + 5 + + + + + + + octet_length + + octet_length ( bit ) + integer + + + Returns number of bytes in the bit string. + + + octet_length(B'1011111011') + 2 + + + + + + + overlay + + overlay ( bits bit PLACING newsubstring bit FROM start integer FOR count integer ) + bit + + + Replaces the substring of bits that starts at + the start'th bit and extends + for count bits + with newsubstring. + If count is omitted, it defaults to the length + of newsubstring. + + + overlay(B'01010101010101010' PLACING B'11111' FROM 2 FOR 3) + 0111110101010101010 + + + + + + + position + + position ( substring bit IN bits bit ) + integer + + + Returns first starting index of the specified substring + within bits, or zero if it's not present. + + + position(B'010' IN B'000001101011') + 8 + + + + + + + substring + + substring ( bits bit FROM start integer FOR count integer ) + bit + + + Extracts the substring of bits starting at + the start'th bit if that is specified, + and stopping after count bits if that is + specified. Provide at least one of start + and count. + + + substring(B'110010111111' FROM 3 FOR 2) + 00 + + + + + + + get_bit + + get_bit ( bits bit, + n integer ) + integer + + + Extracts n'th bit + from bit string; the first (leftmost) bit is bit 0. + + + get_bit(B'101010101010101010', 6) + 1 + + + + + + + set_bit + + set_bit ( bits bit, + n integer, + newvalue integer ) + bit + + + Sets n'th bit in + bit string to newvalue; + the first (leftmost) bit is bit 0. + + + set_bit(B'101010101010101010', 6, 0) + 101010001010101010 + + + + +
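+
+ Note that the bit numbering here differs from the bytea variants
+ described earlier: for bit strings the first (leftmost) bit is bit 0.
+ For example:
+
+SELECT get_bit(B'10000000', 0) AS leftmost,   -- 1
+       get_bit(B'10000000', 7) AS rightmost;  -- 0
+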
+ + + In addition, it is possible to cast integral values to and from type + bit. + Casting an integer to bit(n) copies the rightmost + n bits. Casting an integer to a bit string width wider + than the integer itself will sign-extend on the left. + Some examples: + +44::bit(10) 0000101100 +44::bit(3) 100 +cast(-44 AS bit(12)) 111111010100 +'1110'::bit(4)::integer 14 + + Note that casting to just bit means casting to + bit(1), and so will deliver only the least significant + bit of the integer. + +
diff --git a/doc/src/sgml/func/func-comparison.sgml b/doc/src/sgml/func/func-comparison.sgml new file mode 100644 index 0000000000000..ecb1d89463a1e --- /dev/null +++ b/doc/src/sgml/func/func-comparison.sgml @@ -0,0 +1,660 @@ + + Comparison Functions and Operators + + + comparison + operators + + + + The usual comparison operators are available, as shown in . + + + + Comparison Operators + + + + Operator + Description + + + + + + + datatype < datatype + boolean + + Less than + + + + + datatype > datatype + boolean + + Greater than + + + + + datatype <= datatype + boolean + + Less than or equal to + + + + + datatype >= datatype + boolean + + Greater than or equal to + + + + + datatype = datatype + boolean + + Equal + + + + + datatype <> datatype + boolean + + Not equal + + + + + datatype != datatype + boolean + + Not equal + + + +
+ + + + <> is the standard SQL notation for not + equal. != is an alias, which is converted + to <> at a very early stage of parsing. + Hence, it is not possible to implement != + and <> operators that do different things. + + + + + These comparison operators are available for all built-in data types + that have a natural ordering, including numeric, string, and date/time + types. In addition, arrays, composite types, and ranges can be compared + if their component data types are comparable. + + + + It is usually possible to compare values of related data + types as well; for example integer > + bigint will work. Some cases of this sort are implemented + directly by cross-type comparison operators, but if no + such operator is available, the parser will coerce the less-general type + to the more-general type and apply the latter's comparison operator. + + + + As shown above, all comparison operators are binary operators that + return values of type boolean. Thus, expressions like + 1 < 2 < 3 are not valid (because there is + no < operator to compare a Boolean value with + 3). Use the BETWEEN predicates + shown below to perform range tests. + + + + There are also some comparison predicates, as shown in . These behave much like + operators, but have special syntax mandated by the SQL standard. + + + + Comparison Predicates + + + + + Predicate + + + Description + + + Example(s) + + + + + + + + datatype BETWEEN datatype AND datatype + boolean + + + Between (inclusive of the range endpoints). + + + 2 BETWEEN 1 AND 3 + t + + + 2 BETWEEN 3 AND 1 + f + + + + + + datatype NOT BETWEEN datatype AND datatype + boolean + + + Not between (the negation of BETWEEN). + + + 2 NOT BETWEEN 1 AND 3 + f + + + + + + datatype BETWEEN SYMMETRIC datatype AND datatype + boolean + + + Between, after sorting the two endpoint values. + + + 2 BETWEEN SYMMETRIC 3 AND 1 + t + + + + + + datatype NOT BETWEEN SYMMETRIC datatype AND datatype + boolean + + + Not between, after sorting the two endpoint values. + + + 2 NOT BETWEEN SYMMETRIC 3 AND 1 + f + + + + + + datatype IS DISTINCT FROM datatype + boolean + + + Not equal, treating null as a comparable value. + + + 1 IS DISTINCT FROM NULL + t (rather than NULL) + + + NULL IS DISTINCT FROM NULL + f (rather than NULL) + + + + + + datatype IS NOT DISTINCT FROM datatype + boolean + + + Equal, treating null as a comparable value. + + + 1 IS NOT DISTINCT FROM NULL + f (rather than NULL) + + + NULL IS NOT DISTINCT FROM NULL + t (rather than NULL) + + + + + + datatype IS NULL + boolean + + + Test whether value is null. + + + 1.5 IS NULL + f + + + + + + datatype IS NOT NULL + boolean + + + Test whether value is not null. + + + 'null' IS NOT NULL + t + + + + + + datatype ISNULL + boolean + + + Test whether value is null (nonstandard syntax). + + + + + + datatype NOTNULL + boolean + + + Test whether value is not null (nonstandard syntax). + + + + + + boolean IS TRUE + boolean + + + Test whether boolean expression yields true. + + + true IS TRUE + t + + + NULL::boolean IS TRUE + f (rather than NULL) + + + + + + boolean IS NOT TRUE + boolean + + + Test whether boolean expression yields false or unknown. + + + true IS NOT TRUE + f + + + NULL::boolean IS NOT TRUE + t (rather than NULL) + + + + + + boolean IS FALSE + boolean + + + Test whether boolean expression yields false. + + + true IS FALSE + f + + + NULL::boolean IS FALSE + f (rather than NULL) + + + + + + boolean IS NOT FALSE + boolean + + + Test whether boolean expression yields true or unknown. 
+ + + true IS NOT FALSE + t + + + NULL::boolean IS NOT FALSE + t (rather than NULL) + + + + + + boolean IS UNKNOWN + boolean + + + Test whether boolean expression yields unknown. + + + true IS UNKNOWN + f + + + NULL::boolean IS UNKNOWN + t (rather than NULL) + + + + + + boolean IS NOT UNKNOWN + boolean + + + Test whether boolean expression yields true or false. + + + true IS NOT UNKNOWN + t + + + NULL::boolean IS NOT UNKNOWN + f (rather than NULL) + + + + +
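+
+ For example, <> and IS DISTINCT FROM diverge as soon as nulls
+ are involved (the VALUES list is illustrative):
+
+SELECT v,
+       v <> 1               AS not_equal,    -- f, t, NULL
+       v IS DISTINCT FROM 1 AS is_distinct   -- f, t, t
+FROM (VALUES (1), (2), (NULL::integer)) AS t(v);
+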
+ + + + BETWEEN + + + BETWEEN SYMMETRIC + + The BETWEEN predicate simplifies range tests: + +a BETWEEN x AND y + + is equivalent to + +a >= x AND a <= y + + Notice that BETWEEN treats the endpoint values as included + in the range. + BETWEEN SYMMETRIC is like BETWEEN + except there is no requirement that the argument to the left of + AND be less than or equal to the argument on the right. + If it is not, those two arguments are automatically swapped, so that + a nonempty range is always implied. + + + + The various variants of BETWEEN are implemented in + terms of the ordinary comparison operators, and therefore will work for + any data type(s) that can be compared. + + + + + The use of AND in the BETWEEN + syntax creates an ambiguity with the use of AND as a + logical operator. To resolve this, only a limited set of expression + types are allowed as the second argument of a BETWEEN + clause. If you need to write a more complex sub-expression + in BETWEEN, write parentheses around the + sub-expression. + + + + + + IS DISTINCT FROM + + + IS NOT DISTINCT FROM + + Ordinary comparison operators yield null (signifying unknown), + not true or false, when either input is null. For example, + 7 = NULL yields null, as does 7 <> NULL. When + this behavior is not suitable, use the + IS NOT DISTINCT FROM predicates: + +a IS DISTINCT FROM b +a IS NOT DISTINCT FROM b + + For non-null inputs, IS DISTINCT FROM is + the same as the <> operator. However, if both + inputs are null it returns false, and if only one input is + null it returns true. Similarly, IS NOT DISTINCT + FROM is identical to = for non-null + inputs, but it returns true when both inputs are null, and false when only + one input is null. Thus, these predicates effectively act as though null + were a normal data value, rather than unknown. + + + + + IS NULL + + + IS NOT NULL + + + ISNULL + + + NOTNULL + + To check whether a value is or is not null, use the predicates: + +expression IS NULL +expression IS NOT NULL + + or the equivalent, but nonstandard, predicates: + +expression ISNULL +expression NOTNULL + + null valuecomparing + + + + Do not write + expression = NULL + because NULL is not equal to + NULL. (The null value represents an unknown value, + and it is not known whether two unknown values are equal.) + + + + + Some applications might expect that + expression = NULL + returns true if expression evaluates to + the null value. It is highly recommended that these applications + be modified to comply with the SQL standard. However, if that + cannot be done the + configuration variable is available. If it is enabled, + PostgreSQL will convert x = + NULL clauses to x IS NULL. + + + + + If the expression is row-valued, then + IS NULL is true when the row expression itself is null + or when all the row's fields are null, while + IS NOT NULL is true when the row expression itself is non-null + and all the row's fields are non-null. Because of this behavior, + IS NULL and IS NOT NULL do not always return + inverse results for row-valued expressions; in particular, a row-valued + expression that contains both null and non-null fields will return false + for both tests. 
For example: + + +SELECT ROW(1,2.5,'this is a test') = ROW(1, 3, 'not the same'); + +SELECT ROW(table.*) IS NULL FROM table; -- detect all-null rows + +SELECT ROW(table.*) IS NOT NULL FROM table; -- detect all-non-null rows + +SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in rows + + + In some cases, it may be preferable to + write row IS DISTINCT FROM NULL + or row IS NOT DISTINCT FROM NULL, + which will simply check whether the overall row value is null without any + additional tests on the row fields. + + + + + IS TRUE + + + IS NOT TRUE + + + IS FALSE + + + IS NOT FALSE + + + IS UNKNOWN + + + IS NOT UNKNOWN + + Boolean values can also be tested using the predicates + +boolean_expression IS TRUE +boolean_expression IS NOT TRUE +boolean_expression IS FALSE +boolean_expression IS NOT FALSE +boolean_expression IS UNKNOWN +boolean_expression IS NOT UNKNOWN + + These will always return true or false, never a null value, even when the + operand is null. + A null input is treated as the logical value unknown. + Notice that IS UNKNOWN and IS NOT UNKNOWN are + effectively the same as IS NULL and + IS NOT NULL, respectively, except that the input + expression must be of Boolean type. + + + + Some comparison-related functions are also available, as shown in . + + + + Comparison Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + error_on_null + + error_on_null ( anyelement ) + anyelement + + + Checks if the input is the null value, generating an error if so; + otherwise, returns the input. + + + error_on_null(42) + 42 + + + error_on_null(row(null,null)) + (,) + + + + + + + num_nonnulls + + num_nonnulls ( VARIADIC "any" ) + integer + + + Returns the number of non-null arguments. + + + num_nonnulls(1, NULL, 2) + 2 + + + + + + num_nulls + + num_nulls ( VARIADIC "any" ) + integer + + + Returns the number of null arguments. + + + num_nulls(1, NULL, 2) + 1 + + + + +
+ +
diff --git a/doc/src/sgml/func/func-comparisons.sgml b/doc/src/sgml/func/func-comparisons.sgml new file mode 100644 index 0000000000000..6a6e0bd401920 --- /dev/null +++ b/doc/src/sgml/func/func-comparisons.sgml @@ -0,0 +1,336 @@ + + Row and Array Comparisons + + + IN + + + + NOT IN + + + + ANY + + + + ALL + + + + SOME + + + + composite type + comparison + + + + row-wise comparison + + + + comparison + composite type + + + + comparison + row constructor + + + + IS DISTINCT FROM + + + + IS NOT DISTINCT FROM + + + + This section describes several specialized constructs for making + multiple comparisons between groups of values. These forms are + syntactically related to the subquery forms of the previous section, + but do not involve subqueries. + The forms involving array subexpressions are + PostgreSQL extensions; the rest are + SQL-compliant. + All of the expression forms documented in this section return + Boolean (true/false) results. + + + + <literal>IN</literal> + + +expression IN (value , ...) + + + + The right-hand side is a parenthesized list + of expressions. The result is true if the left-hand expression's + result is equal to any of the right-hand expressions. This is a shorthand + notation for + + +expression = value1 +OR +expression = value2 +OR +... + + + + + Note that if the left-hand expression yields null, or if there are + no equal right-hand values and at least one right-hand expression yields + null, the result of the IN construct will be null, not false. + This is in accordance with SQL's normal rules for Boolean combinations + of null values. + + + + + <literal>NOT IN</literal> + + +expression NOT IN (value , ...) + + + + The right-hand side is a parenthesized list + of expressions. The result is true if the left-hand expression's + result is unequal to all of the right-hand expressions. This is a shorthand + notation for + + +expression <> value1 +AND +expression <> value2 +AND +... + + + + + Note that if the left-hand expression yields null, or if there are + no equal right-hand values and at least one right-hand expression yields + null, the result of the NOT IN construct will be null, not true + as one might naively expect. + This is in accordance with SQL's normal rules for Boolean combinations + of null values. + + + + + x NOT IN y is equivalent to NOT (x IN y) in all + cases. However, null values are much more likely to trip up the novice when + working with NOT IN than when working with IN. + It is best to express your condition positively if possible. + + + + + + <literal>ANY</literal>/<literal>SOME</literal> (array) + + +expression operator ANY (array expression) +expression operator SOME (array expression) + + + + The right-hand side is a parenthesized expression, which must yield an + array value. + The left-hand expression + is evaluated and compared to each element of the array using the + given operator, which must yield a Boolean + result. + The result of ANY is true if any true result is obtained. + The result is false if no true result is found (including the + case where the array has zero elements). + + + + If the array expression yields a null array, the result of + ANY will be null. If the left-hand expression yields null, + the result of ANY is ordinarily null (though a non-strict + comparison operator could possibly yield a different result). + Also, if the right-hand array contains any null elements and no true + comparison result is obtained, the result of ANY + will be null, not false (again, assuming a strict comparison operator). 
+ This is in accordance with SQL's normal rules for Boolean combinations + of null values. + + + + SOME is a synonym for ANY. + + + + + <literal>ALL</literal> (array) + + +expression operator ALL (array expression) + + + + The right-hand side is a parenthesized expression, which must yield an + array value. + The left-hand expression + is evaluated and compared to each element of the array using the + given operator, which must yield a Boolean + result. + The result of ALL is true if all comparisons yield true + (including the case where the array has zero elements). + The result is false if any false result is found. + + + + If the array expression yields a null array, the result of + ALL will be null. If the left-hand expression yields null, + the result of ALL is ordinarily null (though a non-strict + comparison operator could possibly yield a different result). + Also, if the right-hand array contains any null elements and no false + comparison result is obtained, the result of ALL + will be null, not true (again, assuming a strict comparison operator). + This is in accordance with SQL's normal rules for Boolean combinations + of null values. + + + + + Row Constructor Comparison + + +row_constructor operator row_constructor + + + + Each side is a row constructor, + as described in . + The two row constructors must have the same number of fields. + The given operator is applied to each pair + of corresponding fields. (Since the fields could be of different + types, this means that a different specific operator could be selected + for each pair.) + All the selected operators must be members of some B-tree operator + class, or be the negator of an = member of a B-tree + operator class, meaning that row constructor comparison is only + possible when the operator is + =, + <>, + <, + <=, + >, or + >=, + or has semantics similar to one of these. + + + + The = and <> cases work slightly differently + from the others. Two rows are considered + equal if all their corresponding members are non-null and equal; the rows + are unequal if any corresponding members are non-null and unequal; + otherwise the result of the row comparison is unknown (null). + + + + For the <, <=, > and + >= cases, the row elements are compared left-to-right, + stopping as soon as an unequal or null pair of elements is found. + If either of this pair of elements is null, the result of the + row comparison is unknown (null); otherwise comparison of this pair + of elements determines the result. For example, + ROW(1,2,NULL) < ROW(1,3,0) + yields true, not null, because the third pair of elements are not + considered. + + + +row_constructor IS DISTINCT FROM row_constructor + + + + This construct is similar to a <> row comparison, + but it does not yield null for null inputs. Instead, any null value is + considered unequal to (distinct from) any non-null value, and any two + nulls are considered equal (not distinct). Thus the result will + either be true or false, never null. + + + +row_constructor IS NOT DISTINCT FROM row_constructor + + + + This construct is similar to a = row comparison, + but it does not yield null for null inputs. Instead, any null value is + considered unequal to (distinct from) any non-null value, and any two + nulls are considered equal (not distinct). Thus the result will always + be either true or false, never null. 
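+
+ For example (illustrative values):
+
+SELECT ROW(1,2,NULL) < ROW(1,3,0)                    AS lt,    -- t, third pair never compared
+       ROW(1,NULL) IS NOT DISTINCT FROM ROW(1,NULL)  AS same;  -- t, nulls treated as equal
+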
+ + + + + + Composite Type Comparison + + +record operator record + + + + The SQL specification requires row-wise comparison to return NULL if the + result depends on comparing two NULL values or a NULL and a non-NULL. + PostgreSQL does this only when comparing the + results of two row constructors (as in + ) or comparing a row constructor + to the output of a subquery (as in ). + In other contexts where two composite-type values are compared, two + NULL field values are considered equal, and a NULL is considered larger + than a non-NULL. This is necessary in order to have consistent sorting + and indexing behavior for composite types. + + + + Each side is evaluated and they are compared row-wise. Composite type + comparisons are allowed when the operator is + =, + <>, + <, + <=, + > or + >=, + or has semantics similar to one of these. (To be specific, an operator + can be a row comparison operator if it is a member of a B-tree operator + class, or is the negator of the = member of a B-tree operator + class.) The default behavior of the above operators is the same as for + IS [ NOT ] DISTINCT FROM for row constructors (see + ). + + + + To support matching of rows which include elements without a default + B-tree operator class, the following operators are defined for composite + type comparison: + *=, + *<>, + *<, + *<=, + *>, and + *>=. + These operators compare the internal binary representation of the two + rows. Two rows might have a different binary representation even + though comparisons of the two rows with the equality operator is true. + The ordering of rows under these comparison operators is deterministic + but not otherwise meaningful. These operators are used internally + for materialized views and might be useful for other specialized + purposes such as replication and B-Tree deduplication (see ). They are not intended to be + generally useful for writing queries, though. + + + diff --git a/doc/src/sgml/func/func-conditional.sgml b/doc/src/sgml/func/func-conditional.sgml new file mode 100644 index 0000000000000..7ca53dbf1ab03 --- /dev/null +++ b/doc/src/sgml/func/func-conditional.sgml @@ -0,0 +1,283 @@ + + Conditional Expressions + + + CASE + + + + conditional expression + + + + This section describes the SQL-compliant conditional expressions + available in PostgreSQL. + + + + + If your needs go beyond the capabilities of these conditional + expressions, you might want to consider writing a server-side function + in a more expressive programming language. + + + + + + Although COALESCE, GREATEST, and + LEAST are syntactically similar to functions, they are + not ordinary functions, and thus cannot be used with explicit + VARIADIC array arguments. + + + + + <literal>CASE</literal> + + + The SQL CASE expression is a + generic conditional expression, similar to if/else statements in + other programming languages: + + +CASE WHEN condition THEN result + WHEN ... + ELSE result +END + + + CASE clauses can be used wherever + an expression is valid. Each condition is an + expression that returns a boolean result. If the condition's + result is true, the value of the CASE expression is the + result that follows the condition, and the + remainder of the CASE expression is not processed. If the + condition's result is not true, any subsequent WHEN clauses + are examined in the same manner. If no WHEN + condition yields true, the value of the + CASE expression is the result of the + ELSE clause. If the ELSE clause is + omitted and no condition is true, the result is null. 
+ + + + An example: + +SELECT * FROM test; + + a +--- + 1 + 2 + 3 + + +SELECT a, + CASE WHEN a=1 THEN 'one' + WHEN a=2 THEN 'two' + ELSE 'other' + END + FROM test; + + a | case +---+------- + 1 | one + 2 | two + 3 | other + + + + + The data types of all the result + expressions must be convertible to a single output type. + See for more details. + + + + There is a simple form of CASE expression + that is a variant of the general form above: + + +CASE expression + WHEN value THEN result + WHEN ... + ELSE result +END + + + The first + expression is computed, then compared to + each of the value expressions in the + WHEN clauses until one is found that is equal to it. If + no match is found, the result of the + ELSE clause (or a null value) is returned. This is similar + to the switch statement in C. + + + + The example above can be written using the simple + CASE syntax: + +SELECT a, + CASE a WHEN 1 THEN 'one' + WHEN 2 THEN 'two' + ELSE 'other' + END + FROM test; + + a | case +---+------- + 1 | one + 2 | two + 3 | other + + + + + A CASE expression does not evaluate any subexpressions + that are not needed to determine the result. For example, this is a + possible way of avoiding a division-by-zero failure: + +SELECT ... WHERE CASE WHEN x <> 0 THEN y/x > 1.5 ELSE false END; + + + + + + As described in , there are various + situations in which subexpressions of an expression are evaluated at + different times, so that the principle that CASE + evaluates only necessary subexpressions is not ironclad. For + example a constant 1/0 subexpression will usually result in + a division-by-zero failure at planning time, even if it's within + a CASE arm that would never be entered at run time. + + + + + + <literal>COALESCE</literal> + + + COALESCE + + + + NVL + + + + IFNULL + + + +COALESCE(value , ...) + + + + The COALESCE function returns the first of its + arguments that is not null. Null is returned only if all arguments + are null. It is often used to substitute a default value for + null values when data is retrieved for display, for example: + +SELECT COALESCE(description, short_description, '(none)') ... + + This returns description if it is not null, otherwise + short_description if it is not null, otherwise (none). + + + + The arguments must all be convertible to a common data type, which + will be the type of the result (see + for details). + + + + Like a CASE expression, COALESCE only + evaluates the arguments that are needed to determine the result; + that is, arguments to the right of the first non-null argument are + not evaluated. This SQL-standard function provides capabilities similar + to NVL and IFNULL, which are used in some other + database systems. + + + + + <literal>NULLIF</literal> + + + NULLIF + + + +NULLIF(value1, value2) + + + + The NULLIF function returns a null value if + value1 equals value2; + otherwise it returns value1. + This can be used to perform the inverse operation of the + COALESCE example given above: + +SELECT NULLIF(value, '(none)') ... + + In this example, if value is (none), + null is returned, otherwise the value of value + is returned. + + + + The two arguments must be of comparable types. + To be specific, they are compared exactly as if you had + written value1 + = value2, so there must be a + suitable = operator available. + + + + The result has the same type as the first argument — but there is + a subtlety. 
What is actually returned is the first argument of the + implied = operator, and in some cases that will have + been promoted to match the second argument's type. For + example, NULLIF(1, 2.2) yields numeric, + because there is no integer = + numeric operator, + only numeric = numeric. + + + + + + <literal>GREATEST</literal> and <literal>LEAST</literal> + + + GREATEST + + + LEAST + + + +GREATEST(value , ...) + + +LEAST(value , ...) + + + + The GREATEST and LEAST functions select the + largest or smallest value from a list of any number of expressions. + The expressions must all be convertible to a common data type, which + will be the type of the result + (see for details). + + + + NULL values in the argument list are ignored. The result will be NULL + only if all the expressions evaluate to NULL. (This is a deviation from + the SQL standard. According to the standard, the return value is NULL if + any argument is NULL. Some other databases behave this way.) + + + diff --git a/doc/src/sgml/func/func-datetime.sgml b/doc/src/sgml/func/func-datetime.sgml new file mode 100644 index 0000000000000..39dddde4fe126 --- /dev/null +++ b/doc/src/sgml/func/func-datetime.sgml @@ -0,0 +1,2236 @@ + + Date/Time Functions and Operators + + + shows the available + functions for date/time value processing, with details appearing in + the following subsections. illustrates the behaviors of + the basic arithmetic operators (+, + *, etc.). For formatting functions, refer to + . You should be familiar with + the background information on date/time data types from . + + + + In addition, the usual comparison operators shown in + are available for the + date/time types. Dates and timestamps (with or without time zone) are + all comparable, while times (with or without time zone) and intervals + can only be compared to other values of the same data type. When + comparing a timestamp without time zone to a timestamp with time zone, + the former value is assumed to be given in the time zone specified by + the configuration parameter, and is + rotated to UTC for comparison to the latter value (which is already + in UTC internally). Similarly, a date value is assumed to represent + midnight in the TimeZone zone when comparing it + to a timestamp. + + + + All the functions and operators described below that take time or timestamp + inputs actually come in two variants: one that takes time with time zone or timestamp + with time zone, and one that takes time without time zone or timestamp without time zone. + For brevity, these variants are not shown separately. Also, the + + and * operators come in commutative pairs (for + example both date + integer + and integer + date); we show + only one of each such pair. 
+ + + + Date/Time Operators + + + + + + Operator + + + Description + + + Example(s) + + + + + + + + date + integer + date + + + Add a number of days to a date + + + date '2001-09-28' + 7 + 2001-10-05 + + + + + + date + interval + timestamp + + + Add an interval to a date + + + date '2001-09-28' + interval '1 hour' + 2001-09-28 01:00:00 + + + + + + date + time + timestamp + + + Add a time-of-day to a date + + + date '2001-09-28' + time '03:00' + 2001-09-28 03:00:00 + + + + + + interval + interval + interval + + + Add intervals + + + interval '1 day' + interval '1 hour' + 1 day 01:00:00 + + + + + + timestamp + interval + timestamp + + + Add an interval to a timestamp + + + timestamp '2001-09-28 01:00' + interval '23 hours' + 2001-09-29 00:00:00 + + + + + + time + interval + time + + + Add an interval to a time + + + time '01:00' + interval '3 hours' + 04:00:00 + + + + + + - interval + interval + + + Negate an interval + + + - interval '23 hours' + -23:00:00 + + + + + + date - date + integer + + + Subtract dates, producing the number of days elapsed + + + date '2001-10-01' - date '2001-09-28' + 3 + + + + + + date - integer + date + + + Subtract a number of days from a date + + + date '2001-10-01' - 7 + 2001-09-24 + + + + + + date - interval + timestamp + + + Subtract an interval from a date + + + date '2001-09-28' - interval '1 hour' + 2001-09-27 23:00:00 + + + + + + time - time + interval + + + Subtract times + + + time '05:00' - time '03:00' + 02:00:00 + + + + + + time - interval + time + + + Subtract an interval from a time + + + time '05:00' - interval '2 hours' + 03:00:00 + + + + + + timestamp - interval + timestamp + + + Subtract an interval from a timestamp + + + timestamp '2001-09-28 23:00' - interval '23 hours' + 2001-09-28 00:00:00 + + + + + + interval - interval + interval + + + Subtract intervals + + + interval '1 day' - interval '1 hour' + 1 day -01:00:00 + + + + + + timestamp - timestamp + interval + + + Subtract timestamps (converting 24-hour intervals into days, + similarly to justify_hours()) + + + timestamp '2001-09-29 03:00' - timestamp '2001-07-27 12:00' + 63 days 15:00:00 + + + + + + interval * double precision + interval + + + Multiply an interval by a scalar + + + interval '1 second' * 900 + 00:15:00 + + + interval '1 day' * 21 + 21 days + + + interval '1 hour' * 3.5 + 03:30:00 + + + + + + interval / double precision + interval + + + Divide an interval by a scalar + + + interval '1 hour' / 1.5 + 00:40:00 + + + + +
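+
+ 
+  As noted above, the + and * operators are also
+  available with their operands swapped, even though only one ordering of each
+  commutative pair is shown in the table. For example:
+
+SELECT 7 + date '2001-09-28';
+Result: 2001-10-05
+SELECT interval '1 hour' + date '2001-09-28';
+Result: 2001-09-28 01:00:00
+ 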
+ + + Date/Time Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + age + + age ( timestamp, timestamp ) + interval + + + Subtract arguments, producing a symbolic result that + uses years and months, rather than just days + + + age(timestamp '2001-04-10', timestamp '1957-06-13') + 43 years 9 mons 27 days + + + + + + age ( timestamp ) + interval + + + Subtract argument from current_date (at midnight) + + + age(timestamp '1957-06-13') + 62 years 6 mons 10 days + + + + + + + clock_timestamp + + clock_timestamp ( ) + timestamp with time zone + + + Current date and time (changes during statement execution); + see + + + clock_timestamp() + 2019-12-23 14:39:53.662522-05 + + + + + + + current_date + + current_date + date + + + Current date; see + + + current_date + 2019-12-23 + + + + + + + current_time + + current_time + time with time zone + + + Current time of day; see + + + current_time + 14:39:53.662522-05 + + + + + + current_time ( integer ) + time with time zone + + + Current time of day, with limited precision; + see + + + current_time(2) + 14:39:53.66-05 + + + + + + + current_timestamp + + current_timestamp + timestamp with time zone + + + Current date and time (start of current transaction); + see + + + current_timestamp + 2019-12-23 14:39:53.662522-05 + + + + + + current_timestamp ( integer ) + timestamp with time zone + + + Current date and time (start of current transaction), with limited precision; + see + + + current_timestamp(0) + 2019-12-23 14:39:53-05 + + + + + + + date_add + + date_add ( timestamp with time zone, interval , text ) + timestamp with time zone + + + Add an interval to a timestamp with time + zone, computing times of day and daylight-savings adjustments + according to the time zone named by the third argument, or the + current setting if that is omitted. + The form with two arguments is equivalent to the timestamp with + time zone + interval operator. + + + date_add('2021-10-31 00:00:00+02'::timestamptz, '1 day'::interval, 'Europe/Warsaw') + 2021-10-31 23:00:00+00 + + + + + + date_bin ( interval, timestamp, timestamp ) + timestamp + + + Bin input into specified interval aligned with specified origin; see + + + date_bin('15 minutes', timestamp '2001-02-16 20:38:40', timestamp '2001-02-16 20:05:00') + 2001-02-16 20:35:00 + + + + + + + date_part + + date_part ( text, timestamp ) + double precision + + + Get timestamp subfield (equivalent to extract); + see + + + date_part('hour', timestamp '2001-02-16 20:38:40') + 20 + + + + + + date_part ( text, interval ) + double precision + + + Get interval subfield (equivalent to extract); + see + + + date_part('month', interval '2 years 3 months') + 3 + + + + + + + date_subtract + + date_subtract ( timestamp with time zone, interval , text ) + timestamp with time zone + + + Subtract an interval from a timestamp with time + zone, computing times of day and daylight-savings adjustments + according to the time zone named by the third argument, or the + current setting if that is omitted. + The form with two arguments is equivalent to the timestamp with + time zone - interval operator. 
+ + + date_subtract('2021-11-01 00:00:00+01'::timestamptz, '1 day'::interval, 'Europe/Warsaw') + 2021-10-30 22:00:00+00 + + + + + + + date_trunc + + date_trunc ( text, timestamp ) + timestamp + + + Truncate to specified precision; see + + + date_trunc('hour', timestamp '2001-02-16 20:38:40') + 2001-02-16 20:00:00 + + + + + + date_trunc ( text, timestamp with time zone, text ) + timestamp with time zone + + + Truncate to specified precision in the specified time zone; see + + + + date_trunc('day', timestamptz '2001-02-16 20:38:40+00', 'Australia/Sydney') + 2001-02-16 13:00:00+00 + + + + + + date_trunc ( text, interval ) + interval + + + Truncate to specified precision; see + + + + date_trunc('hour', interval '2 days 3 hours 40 minutes') + 2 days 03:00:00 + + + + + + + extract + + extract ( field FROM timestamp ) + numeric + + + Get timestamp subfield; see + + + extract(hour FROM timestamp '2001-02-16 20:38:40') + 20 + + + + + + extract ( field FROM interval ) + numeric + + + Get interval subfield; see + + + extract(month FROM interval '2 years 3 months') + 3 + + + + + + + isfinite + + isfinite ( date ) + boolean + + + Test for finite date (not +/-infinity) + + + isfinite(date '2001-02-16') + true + + + + + + isfinite ( timestamp ) + boolean + + + Test for finite timestamp (not +/-infinity) + + + isfinite(timestamp 'infinity') + false + + + + + + isfinite ( interval ) + boolean + + + Test for finite interval (not +/-infinity) + + + isfinite(interval '4 hours') + true + + + + + + + justify_days + + justify_days ( interval ) + interval + + + Adjust interval, converting 30-day time periods to months + + + justify_days(interval '1 year 65 days') + 1 year 2 mons 5 days + + + + + + + justify_hours + + justify_hours ( interval ) + interval + + + Adjust interval, converting 24-hour time periods to days + + + justify_hours(interval '50 hours 10 minutes') + 2 days 02:10:00 + + + + + + + justify_interval + + justify_interval ( interval ) + interval + + + Adjust interval using justify_days + and justify_hours, with additional sign + adjustments + + + justify_interval(interval '1 mon -1 hour') + 29 days 23:00:00 + + + + + + + localtime + + localtime + time + + + Current time of day; + see + + + localtime + 14:39:53.662522 + + + + + + localtime ( integer ) + time + + + Current time of day, with limited precision; + see + + + localtime(0) + 14:39:53 + + + + + + + localtimestamp + + localtimestamp + timestamp + + + Current date and time (start of current transaction); + see + + + localtimestamp + 2019-12-23 14:39:53.662522 + + + + + + localtimestamp ( integer ) + timestamp + + + Current date and time (start of current + transaction), with limited precision; + see + + + localtimestamp(2) + 2019-12-23 14:39:53.66 + + + + + + + make_date + + make_date ( year int, + month int, + day int ) + date + + + Create date from year, month and day fields + (negative years signify BC) + + + make_date(2013, 7, 15) + 2013-07-15 + + + + + + make_interval + + make_interval ( years int + , months int + , weeks int + , days int + , hours int + , mins int + , secs double precision + ) + interval + + + Create interval from years, months, weeks, days, hours, minutes and + seconds fields, each of which can default to zero + + + make_interval(days => 10) + 10 days + + + + + + + make_time + + make_time ( hour int, + min int, + sec double precision ) + time + + + Create time from hour, minute and seconds fields + + + make_time(8, 15, 23.5) + 08:15:23.5 + + + + + + + make_timestamp + + make_timestamp ( year int, + month int, + day int, 
+ hour int, + min int, + sec double precision ) + timestamp + + + Create timestamp from year, month, day, hour, minute and seconds fields + (negative years signify BC) + + + make_timestamp(2013, 7, 15, 8, 15, 23.5) + 2013-07-15 08:15:23.5 + + + + + + + make_timestamptz + + make_timestamptz ( year int, + month int, + day int, + hour int, + min int, + sec double precision + , timezone text ) + timestamp with time zone + + + Create timestamp with time zone from year, month, day, hour, minute + and seconds fields (negative years signify BC). + If timezone is not + specified, the current time zone is used; the examples assume the + session time zone is Europe/London + + + make_timestamptz(2013, 7, 15, 8, 15, 23.5) + 2013-07-15 08:15:23.5+01 + + + make_timestamptz(2013, 7, 15, 8, 15, 23.5, 'America/New_York') + 2013-07-15 13:15:23.5+01 + + + + + + + now + + now ( ) + timestamp with time zone + + + Current date and time (start of current transaction); + see + + + now() + 2019-12-23 14:39:53.662522-05 + + + + + + + random + + random ( min date, max date ) + date + + + random ( min timestamp, max timestamp ) + timestamp + + + random ( min timestamptz, max timestamptz ) + timestamptz + + + Returns a random value in the range + min <= x <= max. + + + Note that these functions use the same pseudo-random number generator + as the functions listed in , + and respond in the same way to calling + setseed(). + + + random('1979-02-08'::date,'2025-07-03'::date) + 1983-04-21 + + + random('2000-01-01'::timestamptz, now()) + 2015-09-27 09:11:33.732707+00 + + + + + + + statement_timestamp + + statement_timestamp ( ) + timestamp with time zone + + + Current date and time (start of current statement); + see + + + statement_timestamp() + 2019-12-23 14:39:53.662522-05 + + + + + + + timeofday + + timeofday ( ) + text + + + Current date and time + (like clock_timestamp, but as a text string); + see + + + timeofday() + Mon Dec 23 14:39:53.662522 2019 EST + + + + + + + transaction_timestamp + + transaction_timestamp ( ) + timestamp with time zone + + + Current date and time (start of current transaction); + see + + + transaction_timestamp() + 2019-12-23 14:39:53.662522-05 + + + + + + + to_timestamp + + to_timestamp ( double precision ) + timestamp with time zone + + + Convert Unix epoch (seconds since 1970-01-01 00:00:00+00) to + timestamp with time zone + + + to_timestamp(1284352323) + 2010-09-13 04:32:03+00 + + + + +
+ + + + OVERLAPS + + In addition to these functions, the SQL OVERLAPS operator is + supported: + +(start1, end1) OVERLAPS (start2, end2) +(start1, length1) OVERLAPS (start2, length2) + + This expression yields true when two time periods (defined by their + endpoints) overlap, false when they do not overlap. The endpoints + can be specified as pairs of dates, times, or time stamps; or as + a date, time, or time stamp followed by an interval. When a pair + of values is provided, either the start or the end can be written + first; OVERLAPS automatically takes the earlier value + of the pair as the start. Each time period is considered to + represent the half-open interval start <= + time < end, unless + start and end are equal in which case it + represents that single time instant. This means for instance that two + time periods with only an endpoint in common do not overlap. + + + +SELECT (DATE '2001-02-16', DATE '2001-12-21') OVERLAPS + (DATE '2001-10-30', DATE '2002-10-30'); +Result: true +SELECT (DATE '2001-02-16', INTERVAL '100 days') OVERLAPS + (DATE '2001-10-30', DATE '2002-10-30'); +Result: false +SELECT (DATE '2001-10-29', DATE '2001-10-30') OVERLAPS + (DATE '2001-10-30', DATE '2001-10-31'); +Result: false +SELECT (DATE '2001-10-30', DATE '2001-10-30') OVERLAPS + (DATE '2001-10-30', DATE '2001-10-31'); +Result: true + + + + When adding an interval value to (or subtracting an + interval value from) a timestamp + or timestamp with time zone value, the months, days, and + microseconds fields of the interval value are handled in turn. + First, a nonzero months field advances or decrements the date of the + timestamp by the indicated number of months, keeping the day of month the + same unless it would be past the end of the new month, in which case the + last day of that month is used. (For example, March 31 plus 1 month + becomes April 30, but March 31 plus 2 months becomes May 31.) + Then the days field advances or decrements the date of the timestamp by + the indicated number of days. In both these steps the local time of day + is kept the same. Finally, if there is a nonzero microseconds field, it + is added or subtracted literally. + When doing arithmetic on a timestamp with time zone value in + a time zone that recognizes DST, this means that adding or subtracting + (say) interval '1 day' does not necessarily have the + same result as adding or subtracting interval '24 + hours'. + For example, with the session time zone set + to America/Denver: + +SELECT timestamp with time zone '2005-04-02 12:00:00-07' + interval '1 day'; +Result: 2005-04-03 12:00:00-06 +SELECT timestamp with time zone '2005-04-02 12:00:00-07' + interval '24 hours'; +Result: 2005-04-03 13:00:00-06 + + This happens because an hour was skipped due to a change in daylight saving + time at 2005-04-03 02:00:00 in time zone + America/Denver. + + + + Note there can be ambiguity in the months field returned by + age because different months have different numbers of + days. PostgreSQL's approach uses the month from the + earlier of the two dates when calculating partial months. For example, + age('2004-06-01', '2004-04-30') uses April to yield + 1 mon 1 day, while using May would yield 1 mon 2 + days because May has 31 days, while April has only 30. + + + + Subtraction of dates and timestamps can also be complex. 
One conceptually + simple way to perform subtraction is to convert each value to a number + of seconds using EXTRACT(EPOCH FROM ...), then subtract the + results; this produces the + number of seconds between the two values. This will adjust + for the number of days in each month, timezone changes, and daylight + saving time adjustments. Subtraction of date or timestamp + values with the - operator + returns the number of days (24-hours) and hours/minutes/seconds + between the values, making the same adjustments. The age + function returns years, months, days, and hours/minutes/seconds, + performing field-by-field subtraction and then adjusting for negative + field values. The following queries illustrate the differences in these + approaches. The sample results were produced with timezone + = 'US/Eastern'; there is a daylight saving time change between the + two dates used: + + + +SELECT EXTRACT(EPOCH FROM timestamptz '2013-07-01 12:00:00') - + EXTRACT(EPOCH FROM timestamptz '2013-03-01 12:00:00'); +Result: 10537200.000000 +SELECT (EXTRACT(EPOCH FROM timestamptz '2013-07-01 12:00:00') - + EXTRACT(EPOCH FROM timestamptz '2013-03-01 12:00:00')) + / 60 / 60 / 24; +Result: 121.9583333333333333 +SELECT timestamptz '2013-07-01 12:00:00' - timestamptz '2013-03-01 12:00:00'; +Result: 121 days 23:00:00 +SELECT age(timestamptz '2013-07-01 12:00:00', timestamptz '2013-03-01 12:00:00'); +Result: 4 mons + + + + <function>EXTRACT</function>, <function>date_part</function> + + + date_part + + + extract + + + +EXTRACT(field FROM source) + + + + The extract function retrieves subfields + such as year or hour from date/time values. + source must be a value expression of + type timestamp, date, time, + or interval. (Timestamps and times can be with or + without time zone.) + field is an identifier or + string that selects what field to extract from the source value. + Not all fields are valid for every input data type; for example, fields + smaller than a day cannot be extracted from a date, while + fields of a day or more cannot be extracted from a time. + The extract function returns values of type + numeric. + + + + The following are valid field names: + + + + + century + + + The century; for interval values, the year field + divided by 100 + + + +SELECT EXTRACT(CENTURY FROM TIMESTAMP '2000-12-16 12:21:13'); +Result: 20 +SELECT EXTRACT(CENTURY FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 21 +SELECT EXTRACT(CENTURY FROM DATE '0001-01-01 AD'); +Result: 1 +SELECT EXTRACT(CENTURY FROM DATE '0001-12-31 BC'); +Result: -1 +SELECT EXTRACT(CENTURY FROM INTERVAL '2001 years'); +Result: 20 + + + + + + day + + + The day of the month (1–31); for interval + values, the number of days + + + +SELECT EXTRACT(DAY FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 16 +SELECT EXTRACT(DAY FROM INTERVAL '40 days 1 minute'); +Result: 40 + + + + + + + decade + + + The year field divided by 10 + + + +SELECT EXTRACT(DECADE FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 200 + + + + + + dow + + + The day of the week as Sunday (0) to + Saturday (6) + + + +SELECT EXTRACT(DOW FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 5 + + + Note that extract's day of the week numbering + differs from that of the to_char(..., + 'D') function. 
+ + + + + + + doy + + + The day of the year (1–365/366) + + + +SELECT EXTRACT(DOY FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 47 + + + + + + epoch + + + For timestamp with time zone values, the + number of seconds since 1970-01-01 00:00:00 UTC (negative for + timestamps before that); + for date and timestamp values, the + nominal number of seconds since 1970-01-01 00:00:00, + without regard to timezone or daylight-savings rules; + for interval values, the total number + of seconds in the interval + + + +SELECT EXTRACT(EPOCH FROM TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40.12-08'); +Result: 982384720.120000 +SELECT EXTRACT(EPOCH FROM TIMESTAMP '2001-02-16 20:38:40.12'); +Result: 982355920.120000 +SELECT EXTRACT(EPOCH FROM INTERVAL '5 days 3 hours'); +Result: 442800.000000 + + + + You can convert an epoch value back to a timestamp with time zone + with to_timestamp: + + +SELECT to_timestamp(982384720.12); +Result: 2001-02-17 04:38:40.12+00 + + + + Beware that applying to_timestamp to an epoch + extracted from a date or timestamp value + could produce a misleading result: the result will effectively + assume that the original value had been given in UTC, which might + not be the case. + + + + + + hour + + + The hour field (0–23 in timestamps, unrestricted in + intervals) + + + +SELECT EXTRACT(HOUR FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 20 + + + + + + isodow + + + The day of the week as Monday (1) to + Sunday (7) + + + +SELECT EXTRACT(ISODOW FROM TIMESTAMP '2001-02-18 20:38:40'); +Result: 7 + + + This is identical to dow except for Sunday. This + matches the ISO 8601 day of the week numbering. + + + + + + + isoyear + + + The ISO 8601 week-numbering year that the date + falls in + + + +SELECT EXTRACT(ISOYEAR FROM DATE '2006-01-01'); +Result: 2005 +SELECT EXTRACT(ISOYEAR FROM DATE '2006-01-02'); +Result: 2006 + + + + Each ISO 8601 week-numbering year begins with the + Monday of the week containing the 4th of January, so in early + January or late December the ISO year may be + different from the Gregorian year. See the week + field for more information. + + + + + + julian + + + The Julian Date corresponding to the + date or timestamp. Timestamps + that are not local midnight result in a fractional value. See + for more information. + + + +SELECT EXTRACT(JULIAN FROM DATE '2006-01-01'); +Result: 2453737 +SELECT EXTRACT(JULIAN FROM TIMESTAMP '2006-01-01 12:00'); +Result: 2453737.50000000000000000000 + + + + + + microseconds + + + The seconds field, including fractional parts, multiplied by 1 + 000 000; note that this includes full seconds + + + +SELECT EXTRACT(MICROSECONDS FROM TIME '17:12:28.5'); +Result: 28500000 + + + + + + millennium + + + The millennium; for interval values, the year field + divided by 1000 + + + +SELECT EXTRACT(MILLENNIUM FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 3 +SELECT EXTRACT(MILLENNIUM FROM INTERVAL '2001 years'); +Result: 2 + + + + Years in the 1900s are in the second millennium. + The third millennium started January 1, 2001. + + + + + + milliseconds + + + The seconds field, including fractional parts, multiplied by + 1000. Note that this includes full seconds. 
+ + + +SELECT EXTRACT(MILLISECONDS FROM TIME '17:12:28.5'); +Result: 28500.000 + + + + + + minute + + + The minutes field (0–59) + + + +SELECT EXTRACT(MINUTE FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 38 + + + + + + month + + + The number of the month within the year (1–12); + for interval values, the number of months modulo 12 + (0–11) + + + +SELECT EXTRACT(MONTH FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 2 +SELECT EXTRACT(MONTH FROM INTERVAL '2 years 3 months'); +Result: 3 +SELECT EXTRACT(MONTH FROM INTERVAL '2 years 13 months'); +Result: 1 + + + + + + quarter + + + The quarter of the year (1–4) that the date is in; + for interval values, the month field divided by 3 + plus 1 + + + +SELECT EXTRACT(QUARTER FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 1 +SELECT EXTRACT(QUARTER FROM INTERVAL '1 year 6 months'); +Result: 3 + + + + + + second + + + The seconds field, including any fractional seconds + + + +SELECT EXTRACT(SECOND FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 40.000000 +SELECT EXTRACT(SECOND FROM TIME '17:12:28.5'); +Result: 28.500000 + + + + + timezone + + + The time zone offset from UTC, measured in seconds. Positive values + correspond to time zones east of UTC, negative values to + zones west of UTC. (Technically, + PostgreSQL does not use UTC because + leap seconds are not handled.) + + + + + + timezone_hour + + + The hour component of the time zone offset + + + + + + timezone_minute + + + The minute component of the time zone offset + + + + + + week + + + The number of the ISO 8601 week-numbering week of + the year. By definition, ISO weeks start on Mondays and the first + week of a year contains January 4 of that year. In other words, the + first Thursday of a year is in week 1 of that year. + + + In the ISO week-numbering system, it is possible for early-January + dates to be part of the 52nd or 53rd week of the previous year, and for + late-December dates to be part of the first week of the next year. + For example, 2005-01-01 is part of the 53rd week of year + 2004, and 2006-01-01 is part of the 52nd week of year + 2005, while 2012-12-31 is part of the first week of 2013. + It's recommended to use the isoyear field together with + week to get consistent results. + + + + For interval values, the week field is simply the number + of integral days divided by 7. + + + +SELECT EXTRACT(WEEK FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 7 +SELECT EXTRACT(WEEK FROM INTERVAL '13 days 24 hours'); +Result: 1 + + + + + + year + + + The year field. Keep in mind there is no 0 AD, so subtracting + BC years from AD years should be done with care. + + + +SELECT EXTRACT(YEAR FROM TIMESTAMP '2001-02-16 20:38:40'); +Result: 2001 + + + + + + + + + When processing an interval value, + the extract function produces field values that + match the interpretation used by the interval output function. This + can produce surprising results if one starts with a non-normalized + interval representation, for example: + +SELECT INTERVAL '80 minutes'; +Result: 01:20:00 +SELECT EXTRACT(MINUTES FROM INTERVAL '80 minutes'); +Result: 20 + + + + + + When the input value is +/-Infinity, extract returns + +/-Infinity for monotonically-increasing fields (epoch, + julian, year, isoyear, + decade, century, and millennium + for timestamp inputs; epoch, hour, + day, year, decade, + century, and millennium for + interval inputs). + For other fields, NULL is returned. PostgreSQL + versions before 9.6 returned zero for all cases of infinite input. 
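+
+ 
+  For example, applying the infinite-input rule described above to a
+  timestamp (year is monotonically increasing,
+  month is not):
+
+SELECT EXTRACT(YEAR FROM TIMESTAMP 'infinity');
+Result: Infinity
+SELECT EXTRACT(MONTH FROM TIMESTAMP 'infinity');
+Result: NULL
+ 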
+ + + + + The extract function is primarily intended + for computational processing. For formatting date/time values for + display, see . + + + + The date_part function is modeled on the traditional + Ingres equivalent to the + SQL-standard function extract: + +date_part('field', source) + + Note that here the field parameter needs to + be a string value, not a name. The valid field names for + date_part are the same as for + extract. + For historical reasons, the date_part function + returns values of type double precision. This can result in + a loss of precision in certain uses. Using extract + is recommended instead. + + + +SELECT date_part('day', TIMESTAMP '2001-02-16 20:38:40'); +Result: 16 +SELECT date_part('hour', INTERVAL '4 hours 3 minutes'); +Result: 4 + + + + + + <function>date_trunc</function> + + + date_trunc + + + + The function date_trunc is conceptually + similar to the trunc function for numbers. + + + + +date_trunc(field, source , time_zone ) + + source is a value expression of type + timestamp, timestamp with time zone, + or interval. + (Values of type date and + time are cast automatically to timestamp or + interval, respectively.) + field selects to which precision to + truncate the input value. The return value is likewise of type + timestamp, timestamp with time zone, + or interval, + and it has all fields that are less significant than the + selected one set to zero (or one, for day and month). + + + + Valid values for field are: + + microseconds + milliseconds + second + minute + hour + day + week + month + quarter + year + decade + century + millennium + + + + + When the input value is of type timestamp with time zone, + the truncation is performed with respect to a particular time zone; + for example, truncation to day produces a value that + is midnight in that zone. By default, truncation is done with respect + to the current setting, but the + optional time_zone argument can be provided + to specify a different time zone. The time zone name can be specified + in any of the ways described in . + + + + A time zone cannot be specified when processing timestamp without + time zone or interval inputs. These are always + taken at face value. + + + + Examples (assuming the local time zone is America/New_York): + +SELECT date_trunc('hour', TIMESTAMP '2001-02-16 20:38:40'); +Result: 2001-02-16 20:00:00 +SELECT date_trunc('year', TIMESTAMP '2001-02-16 20:38:40'); +Result: 2001-01-01 00:00:00 +SELECT date_trunc('day', TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40+00'); +Result: 2001-02-16 00:00:00-05 +SELECT date_trunc('day', TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40+00', 'Australia/Sydney'); +Result: 2001-02-16 08:00:00-05 +SELECT date_trunc('hour', INTERVAL '3 days 02:47:33'); +Result: 3 days 02:00:00 + + + + + + <function>date_bin</function> + + + date_bin + + + + The function date_bin bins the input + timestamp into the specified interval (the stride) + aligned with a specified origin. + + + + +date_bin(stride, source, origin) + + source is a value expression of type + timestamp or timestamp with time zone. (Values + of type date are cast automatically to + timestamp.) stride is a value + expression of type interval. The return value is likewise + of type timestamp or timestamp with time zone, + and it marks the beginning of the bin into which the + source is placed. 
+ + + + Examples: + +SELECT date_bin('15 minutes', TIMESTAMP '2020-02-11 15:44:17', TIMESTAMP '2001-01-01'); +Result: 2020-02-11 15:30:00 +SELECT date_bin('15 minutes', TIMESTAMP '2020-02-11 15:44:17', TIMESTAMP '2001-01-01 00:02:30'); +Result: 2020-02-11 15:32:30 + + + + + In the case of full units (1 minute, 1 hour, etc.), it gives the same result as + the analogous date_trunc call, but the difference is + that date_bin can truncate to an arbitrary interval. + + + + The stride interval must be greater than zero and + cannot contain units of month or larger. + + + + + <literal>AT TIME ZONE</literal> and <literal>AT LOCAL</literal> + + + time zone + conversion + + + + AT TIME ZONE + + + + AT LOCAL + + + + The AT TIME ZONE operator converts time + stamp without time zone to/from + time stamp with time zone, and + time with time zone values to different time + zones. shows its + variants. + + + + <literal>AT TIME ZONE</literal> and <literal>AT LOCAL</literal> Variants + + + + + Operator + + + Description + + + Example(s) + + + + + + + + timestamp without time zone AT TIME ZONE zone + timestamp with time zone + + + Converts given time stamp without time zone to + time stamp with time zone, assuming the given + value is in the named time zone. + + + timestamp '2001-02-16 20:38:40' AT TIME ZONE 'America/Denver' + 2001-02-17 03:38:40+00 + + + + + + timestamp without time zone AT LOCAL + timestamp with time zone + + + Converts given time stamp without time zone to + time stamp with the session's + TimeZone value as time zone. + + + timestamp '2001-02-16 20:38:40' at local + 2001-02-17 03:38:40+00 + + + + + + timestamp with time zone AT TIME ZONE zone + timestamp without time zone + + + Converts given time stamp with time zone to + time stamp without time zone, as the time would + appear in that zone. + + + timestamp with time zone '2001-02-16 20:38:40-05' AT TIME ZONE 'America/Denver' + 2001-02-16 18:38:40 + + + + + + timestamp with time zone AT LOCAL + timestamp without time zone + + + Converts given time stamp with time zone to + time stamp without time zone, as the time would + appear with the session's TimeZone value as time zone. + + + timestamp with time zone '2001-02-16 20:38:40-05' at local + 2001-02-16 18:38:40 + + + + + + time with time zone AT TIME ZONE zone + time with time zone + + + Converts given time with time zone to a new time + zone. Since no date is supplied, this uses the currently active UTC + offset for the named destination zone. + + + time with time zone '05:34:17-05' AT TIME ZONE 'UTC' + 10:34:17+00 + + + + + + time with time zone AT LOCAL + time with time zone + + + Converts given time with time zone to a new time + zone. Since no date is supplied, this uses the currently active UTC + offset for the session's TimeZone value. + + + Assuming the session's TimeZone is set to UTC: + + + time with time zone '05:34:17-05' at local + 10:34:17+00 + + + + +
+ + + In these expressions, the desired time zone zone can be + specified either as a text value (e.g., 'America/Los_Angeles') + or as an interval (e.g., INTERVAL '-08:00'). + In the text case, a time zone name can be specified in any of the ways + described in . + The interval case is only useful for zones that have fixed offsets from + UTC, so it is not very common in practice. + + + + The syntax AT LOCAL may be used as shorthand for + AT TIME ZONE local, where + local is the session's + TimeZone value. + + + + Examples (assuming the current setting + is America/Los_Angeles): + +SELECT TIMESTAMP '2001-02-16 20:38:40' AT TIME ZONE 'America/Denver'; +Result: 2001-02-16 19:38:40-08 +SELECT TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40-05' AT TIME ZONE 'America/Denver'; +Result: 2001-02-16 18:38:40 +SELECT TIMESTAMP '2001-02-16 20:38:40' AT TIME ZONE 'Asia/Tokyo' AT TIME ZONE 'America/Chicago'; +Result: 2001-02-16 05:38:40 +SELECT TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40-05' AT LOCAL; +Result: 2001-02-16 17:38:40 +SELECT TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40-05' AT TIME ZONE '+05'; +Result: 2001-02-16 20:38:40 +SELECT TIME WITH TIME ZONE '20:38:40-05' AT LOCAL; +Result: 17:38:40 + + The first example adds a time zone to a value that lacks it, and + displays the value using the current TimeZone + setting. The second example shifts the time stamp with time zone value + to the specified time zone, and returns the value without a time zone. + This allows storage and display of values different from the current + TimeZone setting. The third example converts + Tokyo time to Chicago time. The fourth example shifts the time stamp + with time zone value to the time zone currently specified by the + TimeZone setting and returns the value without a + time zone. The fifth example demonstrates that the sign in a POSIX-style + time zone specification has the opposite meaning of the sign in an + ISO-8601 datetime literal, as described in + and . + + + + The sixth example is a cautionary tale. Due to the fact that there is no + date associated with the input value, the conversion is made using the + current date of the session. Therefore, this static example may show a wrong + result depending on the time of the year it is viewed because + 'America/Los_Angeles' observes Daylight Savings Time. + + + + The function timezone(zone, + timestamp) is equivalent to the SQL-conforming construct + timestamp AT TIME ZONE + zone. + + + + The function timezone(zone, + time) is equivalent to the SQL-conforming construct + time AT TIME ZONE + zone. + + + + The function timezone(timestamp) + is equivalent to the SQL-conforming construct timestamp + AT LOCAL. + + + + The function timezone(time) + is equivalent to the SQL-conforming construct time + AT LOCAL. + +
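+
+ 
+  For example, the timezone function form and the
+  AT TIME ZONE operator form give identical results
+  (shown here, as in the table above, with the session
+  TimeZone set to UTC):
+
+SELECT timezone('America/Denver', TIMESTAMP '2001-02-16 20:38:40');
+Result: 2001-02-17 03:38:40+00
+SELECT TIMESTAMP '2001-02-16 20:38:40' AT TIME ZONE 'America/Denver';
+Result: 2001-02-17 03:38:40+00
+ 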
+ + + Current Date/Time + + + date + current + + + + time + current + + + + PostgreSQL provides a number of functions + that return values related to the current date and time. These + SQL-standard functions all return values based on the start time of + the current transaction: + +CURRENT_DATE +CURRENT_TIME +CURRENT_TIMESTAMP +CURRENT_TIME(precision) +CURRENT_TIMESTAMP(precision) +LOCALTIME +LOCALTIMESTAMP +LOCALTIME(precision) +LOCALTIMESTAMP(precision) + + + + + CURRENT_TIME and + CURRENT_TIMESTAMP deliver values with time zone; + LOCALTIME and + LOCALTIMESTAMP deliver values without time zone. + + + + CURRENT_TIME, + CURRENT_TIMESTAMP, + LOCALTIME, and + LOCALTIMESTAMP + can optionally take + a precision parameter, which causes the result to be rounded + to that many fractional digits in the seconds field. Without a precision parameter, + the result is given to the full available precision. + + + + Some examples: + +SELECT CURRENT_TIME; +Result: 14:39:53.662522-05 +SELECT CURRENT_DATE; +Result: 2019-12-23 +SELECT CURRENT_TIMESTAMP; +Result: 2019-12-23 14:39:53.662522-05 +SELECT CURRENT_TIMESTAMP(2); +Result: 2019-12-23 14:39:53.66-05 +SELECT LOCALTIMESTAMP; +Result: 2019-12-23 14:39:53.662522 + + + + + Since these functions return + the start time of the current transaction, their values do not + change during the transaction. This is considered a feature: + the intent is to allow a single transaction to have a consistent + notion of the current time, so that multiple + modifications within the same transaction bear the same + time stamp. + + + + + Other database systems might advance these values more + frequently. + + + + + PostgreSQL also provides functions that + return the start time of the current statement, as well as the actual + current time at the instant the function is called. The complete list + of non-SQL-standard time functions is: + +transaction_timestamp() +statement_timestamp() +clock_timestamp() +timeofday() +now() + + + + + transaction_timestamp() is equivalent to + CURRENT_TIMESTAMP, but is named to clearly reflect + what it returns. + statement_timestamp() returns the start time of the current + statement (more specifically, the time of receipt of the latest command + message from the client). + statement_timestamp() and transaction_timestamp() + return the same value during the first statement of a transaction, but might + differ during subsequent statements. + clock_timestamp() returns the actual current time, and + therefore its value changes even within a single SQL statement. + timeofday() is a historical + PostgreSQL function. Like + clock_timestamp(), it returns the actual current time, + but as a formatted text string rather than a timestamp + with time zone value. + now() is a traditional PostgreSQL + equivalent to transaction_timestamp(). + + + + All the date/time data types also accept the special literal value + now to specify the current date and time (again, + interpreted as the transaction start time). Thus, + the following three all return the same result: + +SELECT CURRENT_TIMESTAMP; +SELECT now(); +SELECT TIMESTAMP 'now'; -- but see tip below + + + + + + Do not use the third form when specifying a value to be evaluated later, + for example in a DEFAULT clause for a table column. + The system will convert now + to a timestamp as soon as the constant is parsed, so that when + the default value is needed, + the time of the table creation would be used! 
The first two + forms will not be evaluated until the default value is used, + because they are function calls. Thus they will give the desired + behavior of defaulting to the time of row insertion. + (See also .) + + + + + + Delaying Execution + + + pg_sleep + + + pg_sleep_for + + + pg_sleep_until + + + sleep + + + delay + + + + The following functions are available to delay execution of the server + process: + +pg_sleep ( double precision ) +pg_sleep_for ( interval ) +pg_sleep_until ( timestamp with time zone ) + + + pg_sleep makes the current session's process + sleep until the given number of seconds have + elapsed. Fractional-second delays can be specified. + pg_sleep_for is a convenience function to + allow the sleep time to be specified as an interval. + pg_sleep_until is a convenience function for when + a specific wake-up time is desired. + For example: + + +SELECT pg_sleep(1.5); +SELECT pg_sleep_for('5 minutes'); +SELECT pg_sleep_until('tomorrow 03:00'); + + + + + + The effective resolution of the sleep interval is platform-specific; + 0.01 seconds is a common value. The sleep delay will be at least as long + as specified. It might be longer depending on factors such as server load. + In particular, pg_sleep_until is not guaranteed to + wake up exactly at the specified time, but it will not wake up any earlier. + + + + + + Make sure that your session does not hold more locks than necessary + when calling pg_sleep or its variants. Otherwise + other sessions might have to wait for your sleeping process, slowing down + the entire system. + + + + +
diff --git a/doc/src/sgml/func/func-enum.sgml b/doc/src/sgml/func/func-enum.sgml new file mode 100644 index 0000000000000..6227afe4057ba --- /dev/null +++ b/doc/src/sgml/func/func-enum.sgml @@ -0,0 +1,121 @@ + + Enum Support Functions + + + For enum types (described in ), + there are several functions that allow cleaner programming without + hard-coding particular values of an enum type. + These are listed in . The examples + assume an enum type created as: + + +CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple'); + + + + + + Enum Support Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + enum_first + + enum_first ( anyenum ) + anyenum + + + Returns the first value of the input enum type. + + + enum_first(null::rainbow) + red + + + + + + enum_last + + enum_last ( anyenum ) + anyenum + + + Returns the last value of the input enum type. + + + enum_last(null::rainbow) + purple + + + + + + enum_range + + enum_range ( anyenum ) + anyarray + + + Returns all values of the input enum type in an ordered array. + + + enum_range(null::rainbow) + {red,orange,yellow,&zwsp;green,blue,purple} + + + + + enum_range ( anyenum, anyenum ) + anyarray + + + Returns the range between the two given enum values, as an ordered + array. The values must be from the same enum type. If the first + parameter is null, the result will start with the first value of + the enum type. + If the second parameter is null, the result will end with the last + value of the enum type. + + + enum_range('orange'::rainbow, 'green'::rainbow) + {orange,yellow,green} + + + enum_range(NULL, 'green'::rainbow) + {red,orange,&zwsp;yellow,green} + + + enum_range('orange'::rainbow, NULL) + {orange,yellow,green,&zwsp;blue,purple} + + + + +
+ + + Notice that except for the two-argument form of enum_range, + these functions disregard the specific value passed to them; they care + only about its declared data type. Either null or a specific value of + the type can be passed, with the same result. It is more common to + apply these functions to a table column or function argument than to + a hardwired type name as used in the examples. + +
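+
+ 
+  For example, with a hypothetical table holding a column of the
+  rainbow type created above, the result depends only on the
+  column's declared type, not on the values stored in it:
+
+-- illustrative table and data, not part of the examples above
+CREATE TABLE paint (name text, hue rainbow);
+INSERT INTO paint VALUES ('sky', 'blue'), ('grass', 'green');
+
+SELECT name, enum_last(hue) FROM paint;
+
+ name  | enum_last
+-------+-----------
+ sky   | purple
+ grass | purple
+ 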
diff --git a/doc/src/sgml/func/func-event-triggers.sgml b/doc/src/sgml/func/func-event-triggers.sgml new file mode 100644 index 0000000000000..9f3f51e9f5133 --- /dev/null +++ b/doc/src/sgml/func/func-event-triggers.sgml @@ -0,0 +1,332 @@ + + Event Trigger Functions + + + PostgreSQL provides these helper functions + to retrieve information from event triggers. + + + + For more information about event triggers, + see . + + + + Capturing Changes at Command End + + + pg_event_trigger_ddl_commands + + + +pg_event_trigger_ddl_commands () setof record + + + + pg_event_trigger_ddl_commands returns a list of + DDL commands executed by each user action, + when invoked in a function attached to a + ddl_command_end event trigger. If called in any other + context, an error is raised. + pg_event_trigger_ddl_commands returns one row for each + base command executed; some commands that are a single SQL sentence + may return more than one row. This function returns the following + columns: + + + + + + Name + Type + Description + + + + + + classid + oid + OID of catalog the object belongs in + + + objid + oid + OID of the object itself + + + objsubid + integer + Sub-object ID (e.g., attribute number for a column) + + + command_tag + text + Command tag + + + object_type + text + Type of the object + + + schema_name + text + + Name of the schema the object belongs in, if any; otherwise NULL. + No quoting is applied. + + + + object_identity + text + + Text rendering of the object identity, schema-qualified. Each + identifier included in the identity is quoted if necessary. + + + + in_extension + boolean + True if the command is part of an extension script + + + command + pg_ddl_command + + A complete representation of the command, in internal format. + This cannot be output directly, but it can be passed to other + functions to obtain different pieces of information about the + command. + + + + + + + + + + Processing Objects Dropped by a DDL Command + + + pg_event_trigger_dropped_objects + + + +pg_event_trigger_dropped_objects () setof record + + + + pg_event_trigger_dropped_objects returns a list of all objects + dropped by the command in whose sql_drop event it is called. + If called in any other context, an error is raised. + This function returns the following columns: + + + + + + Name + Type + Description + + + + + + classid + oid + OID of catalog the object belonged in + + + objid + oid + OID of the object itself + + + objsubid + integer + Sub-object ID (e.g., attribute number for a column) + + + original + boolean + True if this was one of the root object(s) of the deletion + + + normal + boolean + + True if there was a normal dependency relationship + in the dependency graph leading to this object + + + + is_temporary + boolean + + True if this was a temporary object + + + + object_type + text + Type of the object + + + schema_name + text + + Name of the schema the object belonged in, if any; otherwise NULL. + No quoting is applied. + + + + object_name + text + + Name of the object, if the combination of schema and name can be + used as a unique identifier for the object; otherwise NULL. + No quoting is applied, and name is never schema-qualified. + + + + object_identity + text + + Text rendering of the object identity, schema-qualified. Each + identifier included in the identity is quoted if necessary. 
+ + + + address_names + text[] + + An array that, together with object_type and + address_args, can be used by + the pg_get_object_address function to + recreate the object address in a remote server containing an + identically named object of the same kind. + + + + address_args + text[] + + Complement for address_names + + + + + + + + + The pg_event_trigger_dropped_objects function can be used + in an event trigger like this: + +CREATE FUNCTION test_event_trigger_for_drops() + RETURNS event_trigger LANGUAGE plpgsql AS $$ +DECLARE + obj record; +BEGIN + FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects() + LOOP + RAISE NOTICE '% dropped object: % %.% %', + tg_tag, + obj.object_type, + obj.schema_name, + obj.object_name, + obj.object_identity; + END LOOP; +END; +$$; +CREATE EVENT TRIGGER test_event_trigger_for_drops + ON sql_drop + EXECUTE FUNCTION test_event_trigger_for_drops(); + + + + + + Handling a Table Rewrite Event + + + The functions shown in + + provide information about a table for which a + table_rewrite event has just been called. + If called in any other context, an error is raised. + + + + Table Rewrite Information Functions + + + + + Function + + + Description + + + + + + + + + pg_event_trigger_table_rewrite_oid + + pg_event_trigger_table_rewrite_oid () + oid + + + Returns the OID of the table about to be rewritten. + + + + + + + pg_event_trigger_table_rewrite_reason + + pg_event_trigger_table_rewrite_reason () + integer + + + Returns a code explaining the reason(s) for rewriting. The value is + a bitmap built from the following values: 1 + (the table has changed its persistence), 2 + (default value of a column has changed), 4 + (a column has a new data type) and 8 + (the table access method has changed). + + + + +
+ + + These functions can be used in an event trigger like this: + +CREATE FUNCTION test_event_trigger_table_rewrite_oid() + RETURNS event_trigger + LANGUAGE plpgsql AS +$$ +BEGIN + RAISE NOTICE 'rewriting table % for reason %', + pg_event_trigger_table_rewrite_oid()::regclass, + pg_event_trigger_table_rewrite_reason(); +END; +$$; + +CREATE EVENT TRIGGER test_table_rewrite_oid + ON table_rewrite + EXECUTE FUNCTION test_event_trigger_table_rewrite_oid(); + + +
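+
+ 
+  By analogy, a minimal sketch of a ddl_command_end trigger
+  that reports executed commands with
+  pg_event_trigger_ddl_commands (the function and trigger
+  names are illustrative):
+
+-- illustrative names; adjust to taste
+CREATE FUNCTION test_event_trigger_for_ddl()
+  RETURNS event_trigger LANGUAGE plpgsql AS $$
+DECLARE
+    obj record;
+BEGIN
+    FOR obj IN SELECT * FROM pg_event_trigger_ddl_commands()
+    LOOP
+        RAISE NOTICE '% ran: % %',
+                     tg_tag,
+                     obj.command_tag,
+                     obj.object_identity;
+    END LOOP;
+END;
+$$;
+CREATE EVENT TRIGGER test_event_trigger_for_ddl
+   ON ddl_command_end
+   EXECUTE FUNCTION test_event_trigger_for_ddl();
+ 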
+
diff --git a/doc/src/sgml/func/func-formatting.sgml b/doc/src/sgml/func/func-formatting.sgml new file mode 100644 index 0000000000000..af9e2223998ad --- /dev/null +++ b/doc/src/sgml/func/func-formatting.sgml @@ -0,0 +1,1197 @@ + + Data Type Formatting Functions + + + formatting + + + + The PostgreSQL formatting functions + provide a powerful set of tools for converting various data types + (date/time, integer, floating point, numeric) to formatted strings + and for converting from formatted strings to specific data types. + lists them. + These functions all follow a common calling convention: the first + argument is the value to be formatted and the second argument is a + template that defines the output or input format. + + + + Formatting Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + to_char + + to_char ( timestamp, text ) + text + + + to_char ( timestamp with time zone, text ) + text + + + Converts time stamp to string according to the given format. + + + to_char(timestamp '2002-04-20 17:31:12.66', 'HH12:MI:SS') + 05:31:12 + + + + + + to_char ( interval, text ) + text + + + Converts interval to string according to the given format. + + + to_char(interval '15h 2m 12s', 'HH24:MI:SS') + 15:02:12 + + + + + + to_char ( numeric_type, text ) + text + + + Converts number to string according to the given format; available + for integer, bigint, numeric, + real, double precision. + + + to_char(125, '999') + 125 + + + to_char(125.8::real, '999D9') + 125.8 + + + to_char(-125.8, '999D99S') + 125.80- + + + + + + + to_date + + to_date ( text, text ) + date + + + Converts string to date according to the given format. + + + to_date('05 Dec 2000', 'DD Mon YYYY') + 2000-12-05 + + + + + + + to_number + + to_number ( text, text ) + numeric + + + Converts string to numeric according to the given format. + + + to_number('12,454.8-', '99G999D9S') + -12454.8 + + + + + + + to_timestamp + + to_timestamp ( text, text ) + timestamp with time zone + + + Converts string to time stamp according to the given format. + (See also to_timestamp(double precision) in + .) + + + to_timestamp('05 Dec 2000', 'DD Mon YYYY') + 2000-12-05 00:00:00-05 + + + + +
+ + + + to_timestamp and to_date + exist to handle input formats that cannot be converted by + simple casting. For most standard date/time formats, simply casting the + source string to the required data type works, and is much easier. + Similarly, to_number is unnecessary for standard numeric + representations. + + + + + In a to_char output template string, there are certain + patterns that are recognized and replaced with appropriately-formatted + data based on the given value. Any text that is not a template pattern is + simply copied verbatim. Similarly, in an input template string (for the + other functions), template patterns identify the values to be supplied by + the input data string. If there are characters in the template string + that are not template patterns, the corresponding characters in the input + data string are simply skipped over (whether or not they are equal to the + template string characters). + + + + shows the + template patterns available for formatting date and time values. + + + + Template Patterns for Date/Time Formatting + + + + + + Pattern + Description + + + + + HH + hour of day (01–12) + + + HH12 + hour of day (01–12) + + + HH24 + hour of day (00–23) + + + MI + minute (00–59) + + + SS + second (00–59) + + + MS + millisecond (000–999) + + + US + microsecond (000000–999999) + + + FF1 + tenth of second (0–9) + + + FF2 + hundredth of second (00–99) + + + FF3 + millisecond (000–999) + + + FF4 + tenth of a millisecond (0000–9999) + + + FF5 + hundredth of a millisecond (00000–99999) + + + FF6 + microsecond (000000–999999) + + + SSSS, SSSSS + seconds past midnight (0–86399) + + + AM, am, + PM or pm + meridiem indicator (without periods) + + + A.M., a.m., + P.M. or p.m. + meridiem indicator (with periods) + + + Y,YYY + year (4 or more digits) with comma + + + YYYY + year (4 or more digits) + + + YYY + last 3 digits of year + + + YY + last 2 digits of year + + + Y + last digit of year + + + IYYY + ISO 8601 week-numbering year (4 or more digits) + + + IYY + last 3 digits of ISO 8601 week-numbering year + + + IY + last 2 digits of ISO 8601 week-numbering year + + + I + last digit of ISO 8601 week-numbering year + + + BC, bc, + AD or ad + era indicator (without periods) + + + B.C., b.c., + A.D. or a.d. 
+ era indicator (with periods) + + + MONTH + full upper case month name (blank-padded to 9 chars) + + + Month + full capitalized month name (blank-padded to 9 chars) + + + month + full lower case month name (blank-padded to 9 chars) + + + MON + abbreviated upper case month name (3 chars in English, localized lengths vary) + + + Mon + abbreviated capitalized month name (3 chars in English, localized lengths vary) + + + mon + abbreviated lower case month name (3 chars in English, localized lengths vary) + + + MM + month number (01–12) + + + DAY + full upper case day name (blank-padded to 9 chars) + + + Day + full capitalized day name (blank-padded to 9 chars) + + + day + full lower case day name (blank-padded to 9 chars) + + + DY + abbreviated upper case day name (3 chars in English, localized lengths vary) + + + Dy + abbreviated capitalized day name (3 chars in English, localized lengths vary) + + + dy + abbreviated lower case day name (3 chars in English, localized lengths vary) + + + DDD + day of year (001–366) + + + IDDD + day of ISO 8601 week-numbering year (001–371; day 1 of the year is Monday of the first ISO week) + + + DD + day of month (01–31) + + + D + day of the week, Sunday (1) to Saturday (7) + + + ID + ISO 8601 day of the week, Monday (1) to Sunday (7) + + + W + week of month (1–5) (the first week starts on the first day of the month) + + + WW + week number of year (1–53) (the first week starts on the first day of the year) + + + IW + week number of ISO 8601 week-numbering year (01–53; the first Thursday of the year is in week 1) + + + CC + century (2 digits) (the twenty-first century starts on 2001-01-01) + + + J + Julian Date (integer days since November 24, 4714 BC at local + midnight; see ) + + + Q + quarter + + + RM + month in upper case Roman numerals (I–XII; I=January) + + + rm + month in lower case Roman numerals (i–xii; i=January) + + + TZ + upper case time-zone abbreviation + + + tz + lower case time-zone abbreviation + + + TZH + time-zone hours + + + TZM + time-zone minutes + + + OF + time-zone offset from UTC (HH + or HH:MM) + + + +
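
The Gregorian and ISO 8601 week-numbering fields can render the same date differently, which the following sketch makes visible (the date is arbitrary; 2025-12-29 is a Monday that falls in ISO week 1 of 2026):

SELECT to_char(date '2025-12-29', 'YYYY-MM-DD'),
       to_char(date '2025-12-29', 'IYYY-IW-ID');
-- '2025-12-29' versus '2026-01-01'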
+ + + Modifiers can be applied to any template pattern to alter its + behavior. For example, FMMonth + is the Month pattern with the + FM modifier. + shows the + modifier patterns for date/time formatting. + + + + Template Pattern Modifiers for Date/Time Formatting + + + + Modifier + Description + Example + + + + + FM prefix + fill mode (suppress leading zeroes and padding blanks) + FMMonth + + + TH suffix + upper case ordinal number suffix + DDTH, e.g., 12TH + + + th suffix + lower case ordinal number suffix + DDth, e.g., 12th + + + FX prefix + fixed format global option (see usage notes) + FX Month DD Day + + + TM prefix + translation mode (use localized day and month names based on + ) + TMMonth + + + SP suffix + spell mode (not implemented) + DDSP + + + +
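
As a quick illustration of how these modifiers combine with the patterns above (output shown for an English locale):

SELECT to_char(date '2024-03-07', 'FMDay, FMDDth "of" FMMonth YYYY');
-- 'Thursday, 7th of March 2024'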
+ + + Usage notes for date/time formatting: + + + + + FM suppresses leading zeroes and trailing blanks + that would otherwise be added to make the output of a pattern be + fixed-width. In PostgreSQL, + FM modifies only the next specification, while in + Oracle FM affects all subsequent + specifications, and repeated FM modifiers + toggle fill mode on and off. + + + + + + TM suppresses trailing blanks whether or + not FM is specified. + + + + + + to_timestamp and to_date + ignore letter case in the input; so for + example MON, Mon, + and mon all accept the same strings. When using + the TM modifier, case-folding is done according to + the rules of the function's input collation (see + ). + + + + + + to_timestamp and to_date + skip multiple blank spaces at the beginning of the input string and + around date and time values unless the FX option is used. For example, + to_timestamp(' 2000    JUN', 'YYYY MON') and + to_timestamp('2000 - JUN', 'YYYY-MON') work, but + to_timestamp('2000    JUN', 'FXYYYY MON') returns an error + because to_timestamp expects only a single space. + FX must be specified as the first item in + the template. + + + + + + A separator (a space or non-letter/non-digit character) in the template string of + to_timestamp and to_date + matches any single separator in the input string or is skipped, + unless the FX option is used. + For example, to_timestamp('2000JUN', 'YYYY///MON') and + to_timestamp('2000/JUN', 'YYYY MON') work, but + to_timestamp('2000//JUN', 'YYYY/MON') + returns an error because the number of separators in the input string + exceeds the number of separators in the template. + + + If FX is specified, a separator in the template string + matches exactly one character in the input string. But note that the + input string character is not required to be the same as the separator from the template string. + For example, to_timestamp('2000/JUN', 'FXYYYY MON') + works, but to_timestamp('2000/JUN', 'FXYYYY  MON') + returns an error because the second space in the template string consumes + the letter J from the input string. + + + + + + A TZH template pattern can match a signed number. + Without the FX option, minus signs may be ambiguous, + and could be interpreted as a separator. + This ambiguity is resolved as follows: If the number of separators before + TZH in the template string is less than the number of + separators before the minus sign in the input string, the minus sign + is interpreted as part of TZH. + Otherwise, the minus sign is considered to be a separator between values. + For example, to_timestamp('2000 -10', 'YYYY TZH') matches + -10 to TZH, but + to_timestamp('2000 -10', 'YYYY  TZH') + matches 10 to TZH. + + + + + + Ordinary text is allowed in to_char + templates and will be output literally. You can put a substring + in double quotes to force it to be interpreted as literal text + even if it contains template patterns. For example, in + '"Hello Year "YYYY', the YYYY + will be replaced by the year data, but the single Y in Year + will not be. + In to_date, to_number, + and to_timestamp, literal text and double-quoted + strings result in skipping the number of characters contained in the + string; for example "XX" skips two input characters + (whether or not they are XX). + + + + Prior to PostgreSQL 12, it was possible to + skip arbitrary text in the input string using non-letter or non-digit + characters. For example, + to_timestamp('2000y6m1d', 'yyyy-MM-DD') used to + work. Now you can only use letter characters for this purpose. 
For example, + to_timestamp('2000y6m1d', 'yyyytMMtDDt') and + to_timestamp('2000y6m1d', 'yyyy"y"MM"m"DD"d"') + skip y, m, and + d. + + + + + + + If you want to have a double quote in the output you must + precede it with a backslash, for example '\"YYYY + Month\"'. + Backslashes are not otherwise special outside of double-quoted + strings. Within a double-quoted string, a backslash causes the + next character to be taken literally, whatever it is (but this + has no special effect unless the next character is a double quote + or another backslash). + + + + + + In to_timestamp and to_date, + if the year format specification is less than four digits, e.g., + YYY, and the supplied year is less than four digits, + the year will be adjusted to be nearest to the year 2020, e.g., + 95 becomes 1995. + + + + + + In to_timestamp and to_date, + negative years are treated as signifying BC. If you write both a + negative year and an explicit BC field, you get AD + again. An input of year zero is treated as 1 BC. + + + + + + In to_timestamp and to_date, + the YYYY conversion has a restriction when + processing years with more than 4 digits. You must + use some non-digit character or template after YYYY, + otherwise the year is always interpreted as 4 digits. For example + (with the year 20000): + to_date('200001130', 'YYYYMMDD') will be + interpreted as a 4-digit year; instead use a non-digit + separator after the year, like + to_date('20000-1130', 'YYYY-MMDD') or + to_date('20000Nov30', 'YYYYMonDD'). + + + + + + In to_timestamp and to_date, + the CC (century) field is accepted but ignored + if there is a YYY, YYYY or + Y,YYY field. If CC is used with + YY or Y then the result is + computed as that year in the specified century. If the century is + specified but the year is not, the first year of the century + is assumed. + + + + + + In to_timestamp and to_date, + weekday names or numbers (DAY, D, + and related field types) are accepted but are ignored for purposes of + computing the result. The same is true for quarter + (Q) fields. + + + + + + In to_timestamp and to_date, + an ISO 8601 week-numbering date (as distinct from a Gregorian date) + can be specified in one of two ways: + + + + Year, week number, and weekday: for + example to_date('2006-42-4', 'IYYY-IW-ID') + returns the date 2006-10-19. + If you omit the weekday it is assumed to be 1 (Monday). + + + + + Year and day of year: for example to_date('2006-291', + 'IYYY-IDDD') also returns 2006-10-19. + + + + + + Attempting to enter a date using a mixture of ISO 8601 week-numbering + fields and Gregorian date fields is nonsensical, and will cause an + error. In the context of an ISO 8601 week-numbering year, the + concept of a month or day of month has no + meaning. In the context of a Gregorian year, the ISO week has no + meaning. + + + + While to_date will reject a mixture of + Gregorian and ISO week-numbering date + fields, to_char will not, since output format + specifications like YYYY-MM-DD (IYYY-IDDD) can be + useful. But avoid writing something like IYYY-MM-DD; + that would yield surprising results near the start of the year. + (See for more + information.) + + + + + + + In to_timestamp, millisecond + (MS) or microsecond (US) + fields are used as the + seconds digits after the decimal point. For example + to_timestamp('12.3', 'SS.MS') is not 3 milliseconds, + but 300, because the conversion treats it as 12 + 0.3 seconds. + So, for the format SS.MS, the input values + 12.3, 12.30, + and 12.300 specify the + same number of milliseconds. 
To get three milliseconds, one must write + 12.003, which the conversion treats as + 12 + 0.003 = 12.003 seconds. + + + + Here is a more + complex example: + to_timestamp('15:12:02.020.001230', 'HH24:MI:SS.MS.US') + is 15 hours, 12 minutes, and 2 seconds + 20 milliseconds + + 1230 microseconds = 2.021230 seconds. + + + + + + to_char(..., 'ID')'s day of the week numbering + matches the extract(isodow FROM ...) function, but + to_char(..., 'D')'s does not match + extract(dow FROM ...)'s day numbering. + + + + + + to_char(interval) formats HH and + HH12 as shown on a 12-hour clock, for example zero hours + and 36 hours both output as 12, while HH24 + outputs the full hour value, which can exceed 23 in + an interval value. + + + + + + + + shows the + template patterns available for formatting numeric values. + + + + Template Patterns for Numeric Formatting + + + + + + Pattern + Description + + + + + 9 + digit position (can be dropped if insignificant) + + + 0 + digit position (will not be dropped, even if insignificant) + + + . (period) + decimal point + + + , (comma) + group (thousands) separator + + + PR + negative value in angle brackets + + + S + sign anchored to number (uses locale) + + + L + currency symbol (uses locale) + + + D + decimal point (uses locale) + + + G + group separator (uses locale) + + + MI + minus sign in specified position (if number < 0) + + + PL + plus sign in specified position (if number > 0) + + + SG + plus/minus sign in specified position + + + RN or rn + Roman numeral (values between 1 and 3999) + + + TH or th + ordinal number suffix + + + V + shift specified number of digits (see notes) + + + EEEE + exponent for scientific notation + + + +
+ + + Usage notes for numeric formatting: + + + + + 0 specifies a digit position that will always be printed, + even if it contains a leading/trailing zero. 9 also + specifies a digit position, but if it is a leading zero then it will + be replaced by a space, while if it is a trailing zero and fill mode + is specified then it will be deleted. (For to_number(), + these two pattern characters are equivalent.) + + + + + + If the format provides fewer fractional digits than the number being + formatted, to_char() will round the number to + the specified number of fractional digits. + + + + + + The pattern characters S, L, D, + and G represent the sign, currency symbol, decimal point, + and thousands separator characters defined by the current locale + (see + and ). The pattern characters period + and comma represent those exact characters, with the meanings of + decimal point and thousands separator, regardless of locale. + + + + + + If no explicit provision is made for a sign + in to_char()'s pattern, one column will be reserved for + the sign, and it will be anchored to (appear just left of) the + number. If S appears just left of some 9's, + it will likewise be anchored to the number. + + + + + + A sign formatted using SG, PL, or + MI is not anchored to + the number; for example, + to_char(-12, 'MI9999') produces '-  12' + but to_char(-12, 'S9999') produces '  -12'. + (The Oracle implementation does not allow the use of + MI before 9, but rather + requires that 9 precede + MI.) + + + + + + TH does not convert values less than zero + and does not convert fractional numbers. + + + + + + PL, SG, and + TH are PostgreSQL + extensions. + + + + + + In to_number, if non-data template patterns such + as L or TH are used, the + corresponding number of input characters are skipped, whether or not + they match the template pattern, unless they are data characters + (that is, digits, sign, decimal point, or comma). For + example, TH would skip two non-data characters. + + + + + + V with to_char + multiplies the input values by + 10^n, where + n is the number of digits following + V. V with + to_number divides in a similar manner. + The V can be thought of as marking the position + of an implicit decimal point in the input or output string. + to_char and to_number + do not support the use of + V combined with a decimal point + (e.g., 99.9V99 is not allowed). + + + + + + EEEE (scientific notation) cannot be used in + combination with any of the other formatting patterns or + modifiers other than digit and decimal point patterns, and must be at the end of the format string + (e.g., 9.99EEEE is a valid pattern). + + + + + + In to_number(), the RN + pattern converts Roman numerals (in standard form) to numbers. + Input is case-insensitive, so RN + and rn are equivalent. RN + cannot be used in combination with any other formatting patterns or + modifiers except FM, which is applicable only + in to_char() and is ignored + in to_number(). + + + + + + + Certain modifiers can be applied to any template pattern to alter its + behavior. For example, FM99.99 + is the 99.99 pattern with the + FM modifier. + shows the + modifier patterns for numeric formatting. + + + + Template Pattern Modifiers for Numeric Formatting + + + + Modifier + Description + Example + + + + + FM prefix + fill mode (suppress trailing zeroes and padding blanks) + FM99.99 + + + TH suffix + upper case ordinal number suffix + 999TH + + + th suffix + lower case ordinal number suffix + 999th + + + +
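
A brief sketch of two of these notes in action, showing the V pattern's implicit decimal point and Roman-numeral input (results follow from the rules above):

SELECT to_number('12400', '99V999');   -- 12.400: V divides by 10^3 here
SELECT to_number('cdlxxxv', 'RN');     -- 485: Roman-numeral input is case-insensitive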
+ + + shows some + examples of the use of the to_char function. + + + + <function>to_char</function> Examples + + + + Expression + Result + + + + + to_char(current_timestamp, 'Day, DD  HH12:MI:SS') + 'Tuesday  , 06  05:39:18' + + + to_char(current_timestamp, 'FMDay, FMDD  HH12:MI:SS') + 'Tuesday, 6  05:39:18' + + + to_char(current_timestamp AT TIME ZONE + 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS"Z"') + '2022-12-06T05:39:18Z', + ISO 8601 extended format + + + to_char(-0.1, '99.99') + '  -.10' + + + to_char(-0.1, 'FM9.99') + '-.1' + + + to_char(-0.1, 'FM90.99') + '-0.1' + + + to_char(0.1, '0.9') + ' 0.1' + + + to_char(12, '9990999.9') + '    0012.0' + + + to_char(12, 'FM9990999.9') + '0012.' + + + to_char(485, '999') + ' 485' + + + to_char(-485, '999') + '-485' + + + to_char(485, '9 9 9') + ' 4 8 5' + + + to_char(1485, '9,999') + ' 1,485' + + + to_char(1485, '9G999') + ' 1 485' + + + to_char(148.5, '999.999') + ' 148.500' + + + to_char(148.5, 'FM999.999') + '148.5' + + + to_char(148.5, 'FM999.990') + '148.500' + + + to_char(148.5, '999D999') + ' 148,500' + + + to_char(3148.5, '9G999D999') + ' 3 148,500' + + + to_char(-485, '999S') + '485-' + + + to_char(-485, '999MI') + '485-' + + + to_char(485, '999MI') + '485 ' + + + to_char(485, 'FM999MI') + '485' + + + to_char(485, 'PL999') + '+485' + + + to_char(485, 'SG999') + '+485' + + + to_char(-485, 'SG999') + '-485' + + + to_char(-485, '9SG99') + '4-85' + + + to_char(-485, '999PR') + '<485>' + + + to_char(485, 'L999') + 'DM 485' + + + to_char(485, 'RN') + '        CDLXXXV' + + + to_char(485, 'FMRN') + 'CDLXXXV' + + + to_char(5.2, 'FMRN') + 'V' + + + to_char(482, '999th') + ' 482nd' + + + to_char(485, '"Good number:"999') + 'Good number: 485' + + + to_char(485.8, '"Pre:"999" Post:" .999') + 'Pre: 485 Post: .800' + + + to_char(12, '99V999') + ' 12000' + + + to_char(12.4, '99V999') + ' 12400' + + + to_char(12.45, '99V9') + ' 125' + + + to_char(0.0004859, '9.99EEEE') + ' 4.86e-04' + + + +
+ +
diff --git a/doc/src/sgml/func/func-geometry.sgml b/doc/src/sgml/func/func-geometry.sgml new file mode 100644 index 0000000000000..ba203af3bd289 --- /dev/null +++ b/doc/src/sgml/func/func-geometry.sgml @@ -0,0 +1,1261 @@ + + Geometric Functions and Operators + + + The geometric types point, box, + lseg, line, path, + polygon, and circle have a large set of + native support functions and operators, shown in , , and . + + + + Geometric Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + geometric_type + point + geometric_type + + + Adds the coordinates of the second point to those of each + point of the first argument, thus performing translation. + Available for point, box, path, + circle. + + + box '(1,1),(0,0)' + point '(2,0)' + (3,1),(2,0) + + + + + + path + path + path + + + Concatenates two open paths (returns NULL if either path is closed). + + + path '[(0,0),(1,1)]' + path '[(2,2),(3,3),(4,4)]' + [(0,0),(1,1),(2,2),(3,3),(4,4)] + + + + + + geometric_type - point + geometric_type + + + Subtracts the coordinates of the second point from those + of each point of the first argument, thus performing translation. + Available for point, box, path, + circle. + + + box '(1,1),(0,0)' - point '(2,0)' + (-1,1),(-2,0) + + + + + + geometric_type * point + geometric_type + + + Multiplies each point of the first argument by the second + point (treating a point as being a complex number + represented by real and imaginary parts, and performing standard + complex multiplication). If one interprets + the second point as a vector, this is equivalent to + scaling the object's size and distance from the origin by the length + of the vector, and rotating it counterclockwise around the origin by + the vector's angle from the x axis. + Available for point, box,Rotating a + box with these operators only moves its corner points: the box is + still considered to have sides parallel to the axes. Hence the box's + size is not preserved, as a true rotation would do. + path, circle. + + + path '((0,0),(1,0),(1,1))' * point '(3.0,0)' + ((0,0),(3,0),(3,3)) + + + path '((0,0),(1,0),(1,1))' * point(cosd(45), sind(45)) + ((0,0),&zwsp;(0.7071067811865475,0.7071067811865475),&zwsp;(0,1.414213562373095)) + + + + + + geometric_type / point + geometric_type + + + Divides each point of the first argument by the second + point (treating a point as being a complex number + represented by real and imaginary parts, and performing standard + complex division). If one interprets + the second point as a vector, this is equivalent to + scaling the object's size and distance from the origin down by the + length of the vector, and rotating it clockwise around the origin by + the vector's angle from the x axis. + Available for point, box, path, + circle. + + + path '((0,0),(1,0),(1,1))' / point '(2.0,0)' + ((0,0),(0.5,0),(0.5,0.5)) + + + path '((0,0),(1,0),(1,1))' / point(cosd(45), sind(45)) + ((0,0),&zwsp;(0.7071067811865476,-0.7071067811865476),&zwsp;(1.4142135623730951,0)) + + + + + + @-@ geometric_type + double precision + + + Computes the total length. + Available for lseg, path. + + + @-@ path '[(0,0),(1,0),(1,1)]' + 2 + + + + + + @@ geometric_type + point + + + Computes the center point. + Available for box, lseg, + polygon, circle. + + + @@ box '(2,2),(0,0)' + (1,1) + + + + + + # geometric_type + integer + + + Returns the number of points. + Available for path, polygon. 
+ + + # path '((1,0),(0,1),(-1,0))' + 3 + + + + + + geometric_type # geometric_type + point + + + Computes the point of intersection, or NULL if there is none. + Available for lseg, line. + + + lseg '[(0,0),(1,1)]' # lseg '[(1,0),(0,1)]' + (0.5,0.5) + + + + + + box # box + box + + + Computes the intersection of two boxes, or NULL if there is none. + + + box '(2,2),(-1,-1)' # box '(1,1),(-2,-2)' + (1,1),(-1,-1) + + + + + + geometric_type ## geometric_type + point + + + Computes the closest point to the first object on the second object. + Available for these pairs of types: + (point, box), + (point, lseg), + (point, line), + (lseg, box), + (lseg, lseg), + (line, lseg). + + + point '(0,0)' ## lseg '[(2,0),(0,2)]' + (1,1) + + + + + + geometric_type <-> geometric_type + double precision + + + Computes the distance between the objects. + Available for all seven geometric types, for all combinations + of point with another geometric type, and for + these additional pairs of types: + (box, lseg), + (lseg, line), + (polygon, circle) + (and the commutator cases). + + + circle '<(0,0),1>' <-> circle '<(5,0),1>' + 3 + + + + + + geometric_type @> geometric_type + boolean + + + Does first object contain second? + Available for these pairs of types: + (box, point), + (box, box), + (path, point), + (polygon, point), + (polygon, polygon), + (circle, point), + (circle, circle). + + + circle '<(0,0),2>' @> point '(1,1)' + t + + + + + + geometric_type <@ geometric_type + boolean + + + Is first object contained in or on second? + Available for these pairs of types: + (point, box), + (point, lseg), + (point, line), + (point, path), + (point, polygon), + (point, circle), + (box, box), + (lseg, box), + (lseg, line), + (polygon, polygon), + (circle, circle). + + + point '(1,1)' <@ circle '<(0,0),2>' + t + + + + + + geometric_type && geometric_type + boolean + + + Do these objects overlap? (One point in common makes this true.) + Available for box, polygon, + circle. + + + box '(1,1),(0,0)' && box '(2,2),(0,0)' + t + + + + + + geometric_type << geometric_type + boolean + + + Is first object strictly left of second? + Available for point, box, + polygon, circle. + + + circle '<(0,0),1>' << circle '<(5,0),1>' + t + + + + + + geometric_type >> geometric_type + boolean + + + Is first object strictly right of second? + Available for point, box, + polygon, circle. + + + circle '<(5,0),1>' >> circle '<(0,0),1>' + t + + + + + + geometric_type &< geometric_type + boolean + + + Does first object not extend to the right of second? + Available for box, polygon, + circle. + + + box '(1,1),(0,0)' &< box '(2,2),(0,0)' + t + + + + + + geometric_type &> geometric_type + boolean + + + Does first object not extend to the left of second? + Available for box, polygon, + circle. + + + box '(3,3),(0,0)' &> box '(2,2),(0,0)' + t + + + + + + geometric_type <<| geometric_type + boolean + + + Is first object strictly below second? + Available for point, box, polygon, + circle. + + + box '(3,3),(0,0)' <<| box '(5,5),(3,4)' + t + + + + + + geometric_type |>> geometric_type + boolean + + + Is first object strictly above second? + Available for point, box, polygon, + circle. + + + box '(5,5),(3,4)' |>> box '(3,3),(0,0)' + t + + + + + + geometric_type &<| geometric_type + boolean + + + Does first object not extend above second? + Available for box, polygon, + circle. + + + box '(1,1),(0,0)' &<| box '(2,2),(0,0)' + t + + + + + + geometric_type |&> geometric_type + boolean + + + Does first object not extend below second? 
+ Available for box, polygon, + circle. + + + box '(3,3),(0,0)' |&> box '(2,2),(0,0)' + t + + + + + + box <^ box + boolean + + + Is first object below second (allows edges to touch)? + + + box '((1,1),(0,0))' <^ box '((2,2),(1,1))' + t + + + + + + box >^ box + boolean + + + Is first object above second (allows edges to touch)? + + + box '((2,2),(1,1))' >^ box '((1,1),(0,0))' + t + + + + + + geometric_type ?# geometric_type + boolean + + + Do these objects intersect? + Available for these pairs of types: + (box, box), + (lseg, box), + (lseg, lseg), + (lseg, line), + (line, box), + (line, line), + (path, path). + + + lseg '[(-1,0),(1,0)]' ?# box '(2,2),(-2,-2)' + t + + + + + + ?- line + boolean + + + ?- lseg + boolean + + + Is line horizontal? + + + ?- lseg '[(-1,0),(1,0)]' + t + + + + + + point ?- point + boolean + + + Are points horizontally aligned (that is, have same y coordinate)? + + + point '(1,0)' ?- point '(0,0)' + t + + + + + + ?| line + boolean + + + ?| lseg + boolean + + + Is line vertical? + + + ?| lseg '[(-1,0),(1,0)]' + f + + + + + + point ?| point + boolean + + + Are points vertically aligned (that is, have same x coordinate)? + + + point '(0,1)' ?| point '(0,0)' + t + + + + + + line ?-| line + boolean + + + lseg ?-| lseg + boolean + + + Are lines perpendicular? + + + lseg '[(0,0),(0,1)]' ?-| lseg '[(0,0),(1,0)]' + t + + + + + + line ?|| line + boolean + + + lseg ?|| lseg + boolean + + + Are lines parallel? + + + lseg '[(-1,0),(1,0)]' ?|| lseg '[(-1,2),(1,2)]' + t + + + + + + geometric_type ~= geometric_type + boolean + + + Are these objects the same? + Available for point, box, + polygon, circle. + + + polygon '((0,0),(1,1))' ~= polygon '((1,1),(0,0))' + t + + + + +
+ + + + Note that the same as operator, ~=, + represents the usual notion of equality for the point, + box, polygon, and circle types. + Some of the geometric types also have an = operator, but + = compares for equal areas only. + The other scalar comparison operators (<= and so + on), where available for these types, likewise compare areas. + + + + + + Before PostgreSQL 14, the point + is strictly below/above comparison operators point + <<| point and point + |>> point were respectively + called <^ and >^. These + names are still available, but are deprecated and will eventually be + removed. + + + + + Geometric Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + area + + area ( geometric_type ) + double precision + + + Computes area. + Available for box, path, circle. + A path input must be closed, else NULL is returned. + Also, if the path is self-intersecting, the result may be + meaningless. + + + area(box '(2,2),(0,0)') + 4 + + + + + + + center + + center ( geometric_type ) + point + + + Computes center point. + Available for box, circle. + + + center(box '(1,2),(0,0)') + (0.5,1) + + + + + + + diagonal + + diagonal ( box ) + lseg + + + Extracts box's diagonal as a line segment + (same as lseg(box)). + + + diagonal(box '(1,2),(0,0)') + [(1,2),(0,0)] + + + + + + + diameter + + diameter ( circle ) + double precision + + + Computes diameter of circle. + + + diameter(circle '<(0,0),2>') + 4 + + + + + + + height + + height ( box ) + double precision + + + Computes vertical size of box. + + + height(box '(1,2),(0,0)') + 2 + + + + + + + isclosed + + isclosed ( path ) + boolean + + + Is path closed? + + + isclosed(path '((0,0),(1,1),(2,0))') + t + + + + + + + isopen + + isopen ( path ) + boolean + + + Is path open? + + + isopen(path '[(0,0),(1,1),(2,0)]') + t + + + + + + + length + + length ( geometric_type ) + double precision + + + Computes the total length. + Available for lseg, path. + + + length(path '((-1,0),(1,0))') + 4 + + + + + + + npoints + + npoints ( geometric_type ) + integer + + + Returns the number of points. + Available for path, polygon. + + + npoints(path '[(0,0),(1,1),(2,0)]') + 3 + + + + + + + pclose + + pclose ( path ) + path + + + Converts path to closed form. + + + pclose(path '[(0,0),(1,1),(2,0)]') + ((0,0),(1,1),(2,0)) + + + + + + + popen + + popen ( path ) + path + + + Converts path to open form. + + + popen(path '((0,0),(1,1),(2,0))') + [(0,0),(1,1),(2,0)] + + + + + + + radius + + radius ( circle ) + double precision + + + Computes radius of circle. + + + radius(circle '<(0,0),2>') + 2 + + + + + + + slope + + slope ( point, point ) + double precision + + + Computes slope of a line drawn through the two points. + + + slope(point '(0,0)', point '(2,1)') + 0.5 + + + + + + + width + + width ( box ) + double precision + + + Computes horizontal size of box. + + + width(box '(1,2),(0,0)') + 1 + + + + +
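
To make the equality note above concrete: for boxes, = compares only areas, while ~= tests whether the objects are really the same (a sketch; the boxes are arbitrary):

SELECT box '(2,2),(0,0)' =  box '(3,3),(1,1)';   -- t: both have area 4
SELECT box '(2,2),(0,0)' ~= box '(3,3),(1,1)';   -- f: not the same box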
+ + + Geometric Type Conversion Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + box + + box ( circle ) + box + + + Computes box inscribed within the circle. + + + box(circle '<(0,0),2>') + (1.414213562373095,1.414213562373095),&zwsp;(-1.414213562373095,-1.414213562373095) + + + + + + box ( point ) + box + + + Converts point to empty box. + + + box(point '(1,0)') + (1,0),(1,0) + + + + + + box ( point, point ) + box + + + Converts any two corner points to box. + + + box(point '(0,1)', point '(1,0)') + (1,1),(0,0) + + + + + + box ( polygon ) + box + + + Computes bounding box of polygon. + + + box(polygon '((0,0),(1,1),(2,0))') + (2,1),(0,0) + + + + + + + bound_box + + bound_box ( box, box ) + box + + + Computes bounding box of two boxes. + + + bound_box(box '(1,1),(0,0)', box '(4,4),(3,3)') + (4,4),(0,0) + + + + + + + circle + + circle ( box ) + circle + + + Computes smallest circle enclosing box. + + + circle(box '(1,1),(0,0)') + <(0.5,0.5),0.7071067811865476> + + + + + + circle ( point, double precision ) + circle + + + Constructs circle from center and radius. + + + circle(point '(0,0)', 2.0) + <(0,0),2> + + + + + + circle ( polygon ) + circle + + + Converts polygon to circle. The circle's center is the mean of the + positions of the polygon's points, and the radius is the average + distance of the polygon's points from that center. + + + circle(polygon '((0,0),(1,3),(2,0))') + <(1,1),1.6094757082487299> + + + + + + + line + + line ( point, point ) + line + + + Converts two points to the line through them. + + + line(point '(-1,0)', point '(1,0)') + {0,-1,0} + + + + + + + lseg + + lseg ( box ) + lseg + + + Extracts box's diagonal as a line segment. + + + lseg(box '(1,0),(-1,0)') + [(1,0),(-1,0)] + + + + + + lseg ( point, point ) + lseg + + + Constructs line segment from two endpoints. + + + lseg(point '(-1,0)', point '(1,0)') + [(-1,0),(1,0)] + + + + + + + path + + path ( polygon ) + path + + + Converts polygon to a closed path with the same list of points. + + + path(polygon '((0,0),(1,1),(2,0))') + ((0,0),(1,1),(2,0)) + + + + + + + point + + point ( double precision, double precision ) + point + + + Constructs point from its coordinates. + + + point(23.4, -44.5) + (23.4,-44.5) + + + + + + point ( box ) + point + + + Computes center of box. + + + point(box '(1,0),(-1,0)') + (0,0) + + + + + + point ( circle ) + point + + + Computes center of circle. + + + point(circle '<(0,0),2>') + (0,0) + + + + + + point ( lseg ) + point + + + Computes center of line segment. + + + point(lseg '[(-1,0),(1,0)]') + (0,0) + + + + + + point ( polygon ) + point + + + Computes center of polygon (the mean of the + positions of the polygon's points). + + + point(polygon '((0,0),(1,1),(2,0))') + (1,0.3333333333333333) + + + + + + + polygon + + polygon ( box ) + polygon + + + Converts box to a 4-point polygon. + + + polygon(box '(1,1),(0,0)') + ((0,0),(0,1),(1,1),(1,0)) + + + + + + polygon ( circle ) + polygon + + + Converts circle to a 12-point polygon. 
+ + + polygon(circle '<(0,0),2>') + ((-2,0),&zwsp;(-1.7320508075688774,0.9999999999999999),&zwsp;(-1.0000000000000002,1.7320508075688772),&zwsp;(-1.2246063538223773e-16,2),&zwsp;(0.9999999999999996,1.7320508075688774),&zwsp;(1.732050807568877,1.0000000000000007),&zwsp;(2,2.4492127076447545e-16),&zwsp;(1.7320508075688776,-0.9999999999999994),&zwsp;(1.0000000000000009,-1.7320508075688767),&zwsp;(3.673819061467132e-16,-2),&zwsp;(-0.9999999999999987,-1.732050807568878),&zwsp;(-1.7320508075688767,-1.0000000000000009)) + + + + + + polygon ( integer, circle ) + polygon + + + Converts circle to an n-point polygon. + + + polygon(4, circle '<(3,0),1>') + ((2,0),&zwsp;(3,1),&zwsp;(4,1.2246063538223773e-16),&zwsp;(3,-1)) + + + + + + polygon ( path ) + polygon + + + Converts closed path to a polygon with the same list of points. + + + polygon(path '((0,0),(1,1),(2,0))') + ((0,0),(1,1),(2,0)) + + + + + +
+ + + It is possible to access the two component numbers of a point + as though the point were an array with indexes 0 and 1. For example, if + t.p is a point column then + SELECT p[0] FROM t retrieves the X coordinate and + UPDATE t SET p[1] = ... changes the Y coordinate. + In the same way, a value of type box or lseg can be treated + as an array of two point values. + + +
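
A small self-contained illustration of this subscripting (the table and its contents are hypothetical):

CREATE TEMP TABLE pts (p point);
INSERT INTO pts VALUES (point '(1,2)');
SELECT p[0] AS x, p[1] AS y FROM pts;   -- 1 and 2
UPDATE pts SET p[1] = 5;                -- p is now (1,5)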
diff --git a/doc/src/sgml/func/func-info.sgml b/doc/src/sgml/func/func-info.sgml new file mode 100644 index 0000000000000..d4508114a48ea --- /dev/null +++ b/doc/src/sgml/func/func-info.sgml @@ -0,0 +1,3800 @@ + + System Information Functions and Operators + + + The functions described in this section are used to obtain various + information about a PostgreSQL installation. + + + + Session Information Functions + + + shows several + functions that extract session and system information. + + + + In addition to the functions listed in this section, there are a number of + functions related to the statistics system that also provide system + information. See for more + information. + + + + Session Information Functions + + + + + Function + + + Description + + + + + + + + + current_catalog + + current_catalog + name + + + + current_database + + current_database () + name + + + Returns the name of the current database. (Databases are + called catalogs in the SQL standard, + so current_catalog is the standard's + spelling.) + + + + + + + current_query + + current_query () + text + + + Returns the text of the currently executing query, as submitted + by the client (which might contain more than one statement). + + + + + + + current_role + + current_role + name + + + This is equivalent to current_user. + + + + + + + current_schema + + + schema + current + + current_schema + name + + + current_schema () + name + + + Returns the name of the schema that is first in the search path (or a + null value if the search path is empty). This is the schema that will + be used for any tables or other named objects that are created without + specifying a target schema. + + + + + + + current_schemas + + + search path + current + + current_schemas ( include_implicit boolean ) + name[] + + + Returns an array of the names of all schemas presently in the + effective search path, in their priority order. (Items in the current + setting that do not correspond to + existing, searchable schemas are omitted.) If the Boolean argument + is true, then implicitly-searched system schemas + such as pg_catalog are included in the result. + + + + + + + current_user + + + user + current + + current_user + name + + + Returns the user name of the current execution context. + + + + + + + inet_client_addr + + inet_client_addr () + inet + + + Returns the IP address of the current client, + or NULL if the current connection is via a + Unix-domain socket. + + + + + + + inet_client_port + + inet_client_port () + integer + + + Returns the IP port number of the current client, + or NULL if the current connection is via a + Unix-domain socket. + + + + + + + inet_server_addr + + inet_server_addr () + inet + + + Returns the IP address on which the server accepted the current + connection, + or NULL if the current connection is via a + Unix-domain socket. + + + + + + + inet_server_port + + inet_server_port () + integer + + + Returns the IP port number on which the server accepted the current + connection, + or NULL if the current connection is via a + Unix-domain socket. + + + + + + + pg_backend_pid + + pg_backend_pid () + integer + + + Returns the process ID of the server process attached to the current + session. + + + + + + + pg_blocking_pids + + pg_blocking_pids ( integer ) + integer[] + + + Returns an array of the process ID(s) of the sessions that are + blocking the server process with the specified process ID from + acquiring a lock, or an empty array if there is no such server process + or it is not blocked. 
+ + + One server process blocks another if it either holds a lock that + conflicts with the blocked process's lock request (hard block), or is + waiting for a lock that would conflict with the blocked process's lock + request and is ahead of it in the wait queue (soft block). When using + parallel queries the result always lists client-visible process IDs + (that is, pg_backend_pid results) even if the + actual lock is held or awaited by a child worker process. As a result + of that, there may be duplicated PIDs in the result. Also note that + when a prepared transaction holds a conflicting lock, it will be + represented by a zero process ID. + + + Frequent calls to this function could have some impact on database + performance, because it needs exclusive access to the lock manager's + shared state for a short time. + + + + + + + pg_conf_load_time + + pg_conf_load_time () + timestamp with time zone + + + Returns the time when the server configuration files were last loaded. + If the current session was alive at the time, this will be the time + when the session itself re-read the configuration files (so the + reading will vary a little in different sessions). Otherwise it is + the time when the postmaster process re-read the configuration files. + + + + + + + pg_current_logfile + + + Logging + pg_current_logfile function + + + current_logfiles + and the pg_current_logfile function + + + Logging + current_logfiles file and the pg_current_logfile + function + + pg_current_logfile ( text ) + text + + + Returns the path name of the log file currently in use by the logging + collector. The path includes the + directory and the individual log file name. The result + is NULL if the logging collector is disabled. + When multiple log files exist, each in a different + format, pg_current_logfile without an argument + returns the path of the file having the first format found in the + ordered list: stderr, + csvlog, jsonlog. + NULL is returned if no log file has any of these + formats. + To request information about a specific log file format, supply + either csvlog, jsonlog or + stderr as the + value of the optional parameter. The result is NULL + if the log format requested is not configured in + . + The result reflects the contents of + the current_logfiles file. + + + This function is restricted to superusers and roles with privileges of + the pg_monitor role by default, but other users can + be granted EXECUTE to run the function. + + + + + + + pg_get_loaded_modules + + pg_get_loaded_modules () + setof record + ( module_name text, + version text, + file_name text ) + + + Returns a list of the loadable modules that are loaded into the + current server session. The module_name + and version fields are NULL unless the + module author supplied values for them using + the PG_MODULE_MAGIC_EXT macro. + The file_name field gives the file + name of the module (shared library). + + + + + + + pg_my_temp_schema + + pg_my_temp_schema () + oid + + + Returns the OID of the current session's temporary schema, or zero if + it has none (because it has not created any temporary tables). + + + + + + + pg_is_other_temp_schema + + pg_is_other_temp_schema ( oid ) + boolean + + + Returns true if the given OID is the OID of another session's + temporary schema. (This can be useful, for example, to exclude other + sessions' temporary tables from a catalog display.) 
+ + + + + + + pg_jit_available + + pg_jit_available () + boolean + + + Returns true if a JIT compiler extension is + available (see ) and the + configuration parameter is set to + on. + + + + + + + pg_numa_available + + pg_numa_available () + boolean + + + Returns true if the server has been compiled with NUMA support. + + + + + + + pg_listening_channels + + pg_listening_channels () + setof text + + + Returns the set of names of asynchronous notification channels that + the current session is listening to. + + + + + + + pg_notification_queue_usage + + pg_notification_queue_usage () + double precision + + + Returns the fraction (0–1) of the asynchronous notification + queue's maximum size that is currently occupied by notifications that + are waiting to be processed. + See and + for more information. + + + + + + + pg_postmaster_start_time + + pg_postmaster_start_time () + timestamp with time zone + + + Returns the time when the server started. + + + + + + + pg_safe_snapshot_blocking_pids + + pg_safe_snapshot_blocking_pids ( integer ) + integer[] + + + Returns an array of the process ID(s) of the sessions that are blocking + the server process with the specified process ID from acquiring a safe + snapshot, or an empty array if there is no such server process or it + is not blocked. + + + A session running a SERIALIZABLE transaction blocks + a SERIALIZABLE READ ONLY DEFERRABLE transaction + from acquiring a snapshot until the latter determines that it is safe + to avoid taking any predicate locks. See + for more information about + serializable and deferrable transactions. + + + Frequent calls to this function could have some impact on database + performance, because it needs access to the predicate lock manager's + shared state for a short time. + + + + + + + pg_trigger_depth + + pg_trigger_depth () + integer + + + Returns the current nesting level + of PostgreSQL triggers (0 if not called, + directly or indirectly, from inside a trigger). + + + + + + + session_user + + session_user + name + + + Returns the session user's name. + + + + + + + system_user + + system_user + text + + + Returns the authentication method and the identity (if any) that the + user presented during the authentication cycle before they were + assigned a database role. It is represented as + auth_method:identity or + NULL if the user has not been authenticated (for + example if Trust authentication has + been used). + + + + + + + user + + user + name + + + This is equivalent to current_user. + + + + +
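
For example, pg_blocking_pids can be combined with pg_stat_activity to list only the sessions that are currently blocked (a sketch, not the only possible formulation):

SELECT pid, query, pg_blocking_pids(pid) AS blocked_by
FROM pg_stat_activity
WHERE cardinality(pg_blocking_pids(pid)) > 0;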
+ + + + current_catalog, + current_role, + current_schema, + current_user, + session_user, + and user have special syntactic status + in SQL: they must be called without trailing + parentheses. In PostgreSQL, parentheses can optionally be used with + current_schema, but not with the others. + + + + + The session_user is normally the user who initiated + the current database connection; but superusers can change this setting + with . + The current_user is the user identifier + that is applicable for permission checking. Normally it is equal + to the session user, but it can be changed with + . + It also changes during the execution of + functions with the attribute SECURITY DEFINER. + In Unix parlance, the session user is the real user and + the current user is the effective user. + current_role and user are + synonyms for current_user. (The SQL standard draws + a distinction between current_role + and current_user, but PostgreSQL + does not, since it unifies users and roles into a single kind of entity.) + + +
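
The difference is easy to observe with SET ROLE (the role names are hypothetical, and assume alice has been granted membership in bob):

SELECT session_user, current_user;   -- alice | alice
SET ROLE bob;
SELECT session_user, current_user;   -- alice | bob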
+ + + Access Privilege Inquiry Functions + + + privilege + querying + + + + lists functions that + allow querying object access privileges programmatically. + (See for more information about + privileges.) + In these functions, the user whose privileges are being inquired about + can be specified by name or by OID + (pg_authid.oid), or if + the name is given as public then the privileges of the + PUBLIC pseudo-role are checked. Also, the user + argument can be omitted entirely, in which case + the current_user is assumed. + The object that is being inquired about can be specified either by name or + by OID, too. When specifying by name, a schema name can be included if + relevant. + The access privilege of interest is specified by a text string, which must + evaluate to one of the appropriate privilege keywords for the object's type + (e.g., SELECT). Optionally, WITH GRANT + OPTION can be added to a privilege type to test whether the + privilege is held with grant option. Also, multiple privilege types can be + listed separated by commas, in which case the result will be true if any of + the listed privileges is held. (Case of the privilege string is not + significant, and extra whitespace is allowed between but not within + privilege names.) + Some examples: + +SELECT has_table_privilege('myschema.mytable', 'select'); +SELECT has_table_privilege('joe', 'mytable', 'INSERT, SELECT WITH GRANT OPTION'); + + + + + Access Privilege Inquiry Functions + + + + + Function + + + Description + + + + + + + + + has_any_column_privilege + + has_any_column_privilege ( + user name or oid, + table text or oid, + privilege text ) + boolean + + + Does user have privilege for any column of table? + This succeeds either if the privilege is held for the whole table, or + if there is a column-level grant of the privilege for at least one + column. + Allowable privilege types are + SELECT, INSERT, + UPDATE, and REFERENCES. + + + + + + + has_column_privilege + + has_column_privilege ( + user name or oid, + table text or oid, + column text or smallint, + privilege text ) + boolean + + + Does user have privilege for the specified table column? + This succeeds either if the privilege is held for the whole table, or + if there is a column-level grant of the privilege for the column. + The column can be specified by name or by attribute number + (pg_attribute.attnum). + Allowable privilege types are + SELECT, INSERT, + UPDATE, and REFERENCES. + + + + + + + has_database_privilege + + has_database_privilege ( + user name or oid, + database text or oid, + privilege text ) + boolean + + + Does user have privilege for database? + Allowable privilege types are + CREATE, + CONNECT, + TEMPORARY, and + TEMP (which is equivalent to + TEMPORARY). + + + + + + + has_foreign_data_wrapper_privilege + + has_foreign_data_wrapper_privilege ( + user name or oid, + fdw text or oid, + privilege text ) + boolean + + + Does user have privilege for foreign-data wrapper? + The only allowable privilege type is USAGE. + + + + + + + has_function_privilege + + has_function_privilege ( + user name or oid, + function text or oid, + privilege text ) + boolean + + + Does user have privilege for function? + The only allowable privilege type is EXECUTE. + + + When specifying a function by name rather than by OID, the allowed + input is the same as for the regprocedure data type (see + ). 
+ An example is: + +SELECT has_function_privilege('joeuser', 'myfunc(int, text)', 'execute'); + + + + + + + + has_language_privilege + + has_language_privilege ( + user name or oid, + language text or oid, + privilege text ) + boolean + + + Does user have privilege for language? + The only allowable privilege type is USAGE. + + + + + + + has_largeobject_privilege + + has_largeobject_privilege ( + user name or oid, + largeobject oid, + privilege text ) + boolean + + + Does user have privilege for large object? + Allowable privilege types are + SELECT and UPDATE. + + + + + + + has_parameter_privilege + + has_parameter_privilege ( + user name or oid, + parameter text, + privilege text ) + boolean + + + Does user have privilege for configuration parameter? + The parameter name is case-insensitive. + Allowable privilege types are SET + and ALTER SYSTEM. + + + + + + + has_schema_privilege + + has_schema_privilege ( + user name or oid, + schema text or oid, + privilege text ) + boolean + + + Does user have privilege for schema? + Allowable privilege types are + CREATE and + USAGE. + + + + + + + has_sequence_privilege + + has_sequence_privilege ( + user name or oid, + sequence text or oid, + privilege text ) + boolean + + + Does user have privilege for sequence? + Allowable privilege types are + USAGE, + SELECT, and + UPDATE. + + + + + + + has_server_privilege + + has_server_privilege ( + user name or oid, + server text or oid, + privilege text ) + boolean + + + Does user have privilege for foreign server? + The only allowable privilege type is USAGE. + + + + + + + has_table_privilege + + has_table_privilege ( + user name or oid, + table text or oid, + privilege text ) + boolean + + + Does user have privilege for table? + Allowable privilege types + are SELECT, INSERT, + UPDATE, DELETE, + TRUNCATE, REFERENCES, + TRIGGER, and MAINTAIN. + + + + + + + has_tablespace_privilege + + has_tablespace_privilege ( + user name or oid, + tablespace text or oid, + privilege text ) + boolean + + + Does user have privilege for tablespace? + The only allowable privilege type is CREATE. + + + + + + + has_type_privilege + + has_type_privilege ( + user name or oid, + type text or oid, + privilege text ) + boolean + + + Does user have privilege for data type? + The only allowable privilege type is USAGE. + When specifying a type by name rather than by OID, the allowed input + is the same as for the regtype data type (see + ). + + + + + + + pg_has_role + + pg_has_role ( + user name or oid, + role text or oid, + privilege text ) + boolean + + + Does user have privilege for role? + Allowable privilege types are + MEMBER, USAGE, + and SET. + MEMBER denotes direct or indirect membership in + the role without regard to what specific privileges may be conferred. + USAGE denotes whether the privileges of the role + are immediately available without doing SET ROLE, + while SET denotes whether it is possible to change + to the role using the SET ROLE command. + WITH ADMIN OPTION or WITH GRANT + OPTION can be added to any of these privilege types to + test whether the ADMIN privilege is held (all + six spellings test the same thing). + This function does not allow the special case of + setting user to public, + because the PUBLIC pseudo-role can never be a member of real roles. + + + + + + + row_security_active + + row_security_active ( + table text or oid ) + boolean + + + Is row-level security active for the specified table in the context of + the current user and current environment? + + + + +
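
As a further example, the three privilege types of pg_has_role distinguish membership from usable privileges (role names hypothetical):

SELECT pg_has_role('alice', 'pg_monitor', 'MEMBER');   -- direct or indirect membership
SELECT pg_has_role('alice', 'pg_monitor', 'USAGE');    -- privileges usable without SET ROLE
SELECT pg_has_role('alice', 'pg_monitor', 'SET');      -- SET ROLE to it is possible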
+ + + shows the operators + available for the aclitem type, which is the catalog + representation of access privileges. See + for information about how to read access privilege values. + + + + <type>aclitem</type> Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + + aclitemeq + + aclitem = aclitem + boolean + + + Are aclitems equal? (Notice that + type aclitem lacks the usual set of comparison + operators; it has only equality. In turn, aclitem + arrays can only be compared for equality.) + + + 'calvin=r*w/hobbes'::aclitem = 'calvin=r*w*/hobbes'::aclitem + f + + + + + + + aclcontains + + aclitem[] @> aclitem + boolean + + + Does array contain the specified privileges? (This is true if there + is an array entry that matches the aclitem's grantee and + grantor, and has at least the specified set of privileges.) + + + '{calvin=r*w/hobbes,hobbes=r*w*/postgres}'::aclitem[] @> 'calvin=r*/hobbes'::aclitem + t + + + + + + aclitem[] ~ aclitem + boolean + + + This is a deprecated alias for @>. + + + '{calvin=r*w/hobbes,hobbes=r*w*/postgres}'::aclitem[] ~ 'calvin=r*/hobbes'::aclitem + t + + + + +
+ + + shows some additional + functions to manage the aclitem type. + + + + <type>aclitem</type> Functions + + + + + Function + + + Description + + + + + + + + + acldefault + + acldefault ( + type "char", + ownerId oid ) + aclitem[] + + + Constructs an aclitem array holding the default access + privileges for an object of type type belonging + to the role with OID ownerId. This represents + the access privileges that will be assumed when an object's + ACL entry is null. (The default access privileges + are described in .) + The type parameter must be one of + 'c' for COLUMN, + 'r' for TABLE and table-like objects, + 's' for SEQUENCE, + 'd' for DATABASE, + 'f' for FUNCTION or PROCEDURE, + 'l' for LANGUAGE, + 'L' for LARGE OBJECT, + 'n' for SCHEMA, + 'p' for PARAMETER, + 't' for TABLESPACE, + 'F' for FOREIGN DATA WRAPPER, + 'S' for FOREIGN SERVER, + or + 'T' for TYPE or DOMAIN. + + + + + + + aclexplode + + aclexplode ( aclitem[] ) + setof record + ( grantor oid, + grantee oid, + privilege_type text, + is_grantable boolean ) + + + Returns the aclitem array as a set of rows. + If the grantee is the pseudo-role PUBLIC, it is represented by zero in + the grantee column. Each granted privilege is + represented as SELECT, INSERT, + etc (see for a full list). + Note that each privilege is broken out as a separate row, so + only one keyword appears in the privilege_type + column. + + + + + + + makeaclitem + + makeaclitem ( + grantee oid, + grantor oid, + privileges text, + is_grantable boolean ) + aclitem + + + Constructs an aclitem with the given properties. + privileges is a comma-separated list of + privilege names such as SELECT, + INSERT, etc, all of which are set in the + result. (Case of the privilege string is not significant, and + extra whitespace is allowed between but not within privilege + names.) + + + + +
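
For instance, aclexplode can flatten a table's ACL into one row per granted privilege (the table name is hypothetical):

SELECT a.grantee, a.privilege_type, a.is_grantable
FROM pg_class AS c, aclexplode(c.relacl) AS a
WHERE c.relname = 'mytable';
-- grantee is a role OID; zero denotes the PUBLIC pseudo-role.
-- A null relacl produces no rows here: the default privileges
-- (see acldefault above) apply in that case.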
+ +
+ + + Schema Visibility Inquiry Functions + + + shows functions that + determine whether a certain object is visible in the + current schema search path. + For example, a table is said to be visible if its + containing schema is in the search path and no table of the same + name appears earlier in the search path. This is equivalent to the + statement that the table can be referenced by name without explicit + schema qualification. Thus, to list the names of all visible tables: + +SELECT relname FROM pg_class WHERE pg_table_is_visible(oid); + + For functions and operators, an object in the search path is said to be + visible if there is no object of the same name and argument data + type(s) earlier in the path. For operator classes and families, + both the name and the associated index access method are considered. + + + + search path + object visibility + + + + Schema Visibility Inquiry Functions + + + + + Function + + + Description + + + + + + + + + pg_collation_is_visible + + pg_collation_is_visible ( collation oid ) + boolean + + + Is collation visible in search path? + + + + + + + pg_conversion_is_visible + + pg_conversion_is_visible ( conversion oid ) + boolean + + + Is conversion visible in search path? + + + + + + + pg_function_is_visible + + pg_function_is_visible ( function oid ) + boolean + + + Is function visible in search path? + (This also works for procedures and aggregates.) + + + + + + + pg_opclass_is_visible + + pg_opclass_is_visible ( opclass oid ) + boolean + + + Is operator class visible in search path? + + + + + + + pg_operator_is_visible + + pg_operator_is_visible ( operator oid ) + boolean + + + Is operator visible in search path? + + + + + + + pg_opfamily_is_visible + + pg_opfamily_is_visible ( opclass oid ) + boolean + + + Is operator family visible in search path? + + + + + + + pg_statistics_obj_is_visible + + pg_statistics_obj_is_visible ( stat oid ) + boolean + + + Is statistics object visible in search path? + + + + + + + pg_table_is_visible + + pg_table_is_visible ( table oid ) + boolean + + + Is table visible in search path? + (This works for all types of relations, including views, materialized + views, indexes, sequences and foreign tables.) + + + + + + + pg_ts_config_is_visible + + pg_ts_config_is_visible ( config oid ) + boolean + + + Is text search configuration visible in search path? + + + + + + + pg_ts_dict_is_visible + + pg_ts_dict_is_visible ( dict oid ) + boolean + + + Is text search dictionary visible in search path? + + + + + + + pg_ts_parser_is_visible + + pg_ts_parser_is_visible ( parser oid ) + boolean + + + Is text search parser visible in search path? + + + + + + + pg_ts_template_is_visible + + pg_ts_template_is_visible ( template oid ) + boolean + + + Is text search template visible in search path? + + + + + + + pg_type_is_visible + + pg_type_is_visible ( type oid ) + boolean + + + Is type (or domain) visible in search path? + + + + +
+ + + All these functions require object OIDs to identify the object to be + checked. If you want to test an object by name, it is convenient to use + the OID alias types (regclass, regtype, + regprocedure, regoperator, regconfig, + or regdictionary), + for example: + +SELECT pg_type_is_visible('myschema.widget'::regtype); + + Note that it would not make much sense to test a non-schema-qualified + type name in this way — if the name can be recognized at all, it must be visible. + + +
+ + + System Catalog Information Functions + + + lists functions that + extract information from the system catalogs. + + + + System Catalog Information Functions + + + + + Function + + + Description + + + + + + + + + format_type + + format_type ( type oid, typemod integer ) + text + + + Returns the SQL name for a data type that is identified by its type + OID and possibly a type modifier. Pass NULL for the type modifier if + no specific modifier is known. + + + + + + + pg_basetype + + pg_basetype ( regtype ) + regtype + + + Returns the OID of the base type of a domain identified by its + type OID. If the argument is the OID of a non-domain type, + returns the argument as-is. Returns NULL if the argument is + not a valid type OID. If there's a chain of domain dependencies, + it will recurse until finding the base type. + + + Assuming CREATE DOMAIN mytext AS text: + + + pg_basetype('mytext'::regtype) + text + + + + + + + pg_char_to_encoding + + pg_char_to_encoding ( encoding name ) + integer + + + Converts the supplied encoding name into an integer representing the + internal identifier used in some system catalog tables. + Returns -1 if an unknown encoding name is provided. + + + + + + + pg_encoding_to_char + + pg_encoding_to_char ( encoding integer ) + name + + + Converts the integer used as the internal identifier of an encoding in some + system catalog tables into a human-readable string. + Returns an empty string if an invalid encoding number is provided. + + + + + + + pg_get_catalog_foreign_keys + + pg_get_catalog_foreign_keys () + setof record + ( fktable regclass, + fkcols text[], + pktable regclass, + pkcols text[], + is_array boolean, + is_opt boolean ) + + + Returns a set of records describing the foreign key relationships + that exist within the PostgreSQL system + catalogs. + The fktable column contains the name of the + referencing catalog, and the fkcols column + contains the name(s) of the referencing column(s). Similarly, + the pktable column contains the name of the + referenced catalog, and the pkcols column + contains the name(s) of the referenced column(s). + If is_array is true, the last referencing + column is an array, each of whose elements should match some entry + in the referenced catalog. + If is_opt is true, the referencing column(s) + are allowed to contain zeroes instead of a valid reference. + + + + + + + pg_get_constraintdef + + pg_get_constraintdef ( constraint oid , pretty boolean ) + text + + + Reconstructs the creating command for a constraint. + (This is a decompiled reconstruction, not the original text + of the command.) + + + + + + + pg_get_expr + + pg_get_expr ( expr pg_node_tree, relation oid , pretty boolean ) + text + + + Decompiles the internal form of an expression stored in the system + catalogs, such as the default value for a column. If the expression + might contain Vars, specify the OID of the relation they refer to as + the second parameter; if no Vars are expected, passing zero is + sufficient. + + + + + + + pg_get_functiondef + + pg_get_functiondef ( func oid ) + text + + + Reconstructs the creating command for a function or procedure. + (This is a decompiled reconstruction, not the original text + of the command.) + The result is a complete CREATE OR REPLACE FUNCTION + or CREATE OR REPLACE PROCEDURE statement. 
+ + + + + + + pg_get_function_arguments + + pg_get_function_arguments ( func oid ) + text + + + Reconstructs the argument list of a function or procedure, in the form + it would need to appear in within CREATE FUNCTION + (including default values). + + + + + + + pg_get_function_identity_arguments + + pg_get_function_identity_arguments ( func oid ) + text + + + Reconstructs the argument list necessary to identify a function or + procedure, in the form it would need to appear in within commands such + as ALTER FUNCTION. This form omits default values. + + + + + + + pg_get_function_result + + pg_get_function_result ( func oid ) + text + + + Reconstructs the RETURNS clause of a function, in + the form it would need to appear in within CREATE + FUNCTION. Returns NULL for a procedure. + + + + + + + pg_get_indexdef + + pg_get_indexdef ( index oid , column integer, pretty boolean ) + text + + + Reconstructs the creating command for an index. + (This is a decompiled reconstruction, not the original text + of the command.) If column is supplied and is + not zero, only the definition of that column is reconstructed. + + + + + + + pg_get_keywords + + pg_get_keywords () + setof record + ( word text, + catcode "char", + barelabel boolean, + catdesc text, + baredesc text ) + + + Returns a set of records describing the SQL keywords recognized by the + server. The word column contains the + keyword. The catcode column contains a + category code: U for an unreserved + keyword, C for a keyword that can be a column + name, T for a keyword that can be a type or + function name, or R for a fully reserved keyword. + The barelabel column + contains true if the keyword can be used as + a bare column label in SELECT lists, + or false if it can only be used + after AS. + The catdesc column contains a + possibly-localized string describing the keyword's category. + The baredesc column contains a + possibly-localized string describing the keyword's column label status. + + + + + + + pg_get_partkeydef + + pg_get_partkeydef ( table oid ) + text + + + Reconstructs the definition of a partitioned table's partition + key, in the form it would have in the PARTITION + BY clause of CREATE TABLE. + (This is a decompiled reconstruction, not the original text + of the command.) + + + + + + + pg_get_ruledef + + pg_get_ruledef ( rule oid , pretty boolean ) + text + + + Reconstructs the creating command for a rule. + (This is a decompiled reconstruction, not the original text + of the command.) + + + + + + + pg_get_serial_sequence + + pg_get_serial_sequence ( table text, column text ) + text + + + Returns the name of the sequence associated with a column, + or NULL if no sequence is associated with the column. + If the column is an identity column, the associated sequence is the + sequence internally created for that column. + For columns created using one of the serial types + (serial, smallserial, bigserial), + it is the sequence created for that serial column definition. + In the latter case, the association can be modified or removed + with ALTER SEQUENCE OWNED BY. + (This function probably should have been + called pg_get_owned_sequence; its current name + reflects the fact that it has historically been used with serial-type + columns.) The first parameter is a table name with optional + schema, and the second parameter is a column name. Because the first + parameter potentially contains both schema and table names, it is + parsed per usual SQL rules, meaning it is lower-cased by default. 
+ The second parameter, being just a column name, is treated literally + and so has its case preserved. The result is suitably formatted + for passing to the sequence functions (see + ). + + + A typical use is in reading the current value of the sequence for an + identity or serial column, for example: + +SELECT currval(pg_get_serial_sequence('sometable', 'id')); + + + + + + + + pg_get_statisticsobjdef + + pg_get_statisticsobjdef ( statobj oid ) + text + + + Reconstructs the creating command for an extended statistics object. + (This is a decompiled reconstruction, not the original text + of the command.) + + + + + + + pg_get_triggerdef + +pg_get_triggerdef ( trigger oid , pretty boolean ) + text + + + Reconstructs the creating command for a trigger. + (This is a decompiled reconstruction, not the original text + of the command.) + + + + + + + pg_get_userbyid + + pg_get_userbyid ( role oid ) + name + + + Returns a role's name given its OID. + + + + + + + pg_get_viewdef + + pg_get_viewdef ( view oid , pretty boolean ) + text + + + Reconstructs the underlying SELECT command for a + view or materialized view. (This is a decompiled reconstruction, not + the original text of the command.) + + + + + + pg_get_viewdef ( view oid, wrap_column integer ) + text + + + Reconstructs the underlying SELECT command for a + view or materialized view. (This is a decompiled reconstruction, not + the original text of the command.) In this form of the function, + pretty-printing is always enabled, and long lines are wrapped to try + to keep them shorter than the specified number of columns. + + + + + + pg_get_viewdef ( view text , pretty boolean ) + text + + + Reconstructs the underlying SELECT command for a + view or materialized view, working from a textual name for the view + rather than its OID. (This is deprecated; use the OID variant + instead.) + + + + + + + pg_index_column_has_property + + pg_index_column_has_property ( index regclass, column integer, property text ) + boolean + + + Tests whether an index column has the named property. + Common index column properties are listed in + . + (Note that extension access methods can define additional property + names for their indexes.) + NULL is returned if the property name is not known + or does not apply to the particular object, or if the OID or column + number does not identify a valid object. + + + + + + + pg_index_has_property + + pg_index_has_property ( index regclass, property text ) + boolean + + + Tests whether an index has the named property. + Common index properties are listed in + . + (Note that extension access methods can define additional property + names for their indexes.) + NULL is returned if the property name is not known + or does not apply to the particular object, or if the OID does not + identify a valid object. + + + + + + + pg_indexam_has_property + + pg_indexam_has_property ( am oid, property text ) + boolean + + + Tests whether an index access method has the named property. + Access method properties are listed in + . + NULL is returned if the property name is not known + or does not apply to the particular object, or if the OID does not + identify a valid object. + + + + + + + pg_options_to_table + + pg_options_to_table ( options_array text[] ) + setof record + ( option_name text, + option_value text ) + + + Returns the set of storage options represented by a value from + pg_class.reloptions or + pg_attribute.attoptions. 
+ + + + + + + pg_settings_get_flags + + pg_settings_get_flags ( guc text ) + text[] + + + Returns an array of the flags associated with the given GUC, or + NULL if it does not exist. The result is + an empty array if the GUC exists but there are no flags to show. + Only the most useful flags listed in + are exposed. + + + + + + + pg_tablespace_databases + + pg_tablespace_databases ( tablespace oid ) + setof oid + + + Returns the set of OIDs of databases that have objects stored in the + specified tablespace. If this function returns any rows, the + tablespace is not empty and cannot be dropped. To identify the specific + objects populating the tablespace, you will need to connect to the + database(s) identified by pg_tablespace_databases + and query their pg_class catalogs. + + + + + + + pg_tablespace_location + + pg_tablespace_location ( tablespace oid ) + text + + + Returns the file system path that this tablespace is located in. + + + + + + + pg_typeof + + pg_typeof ( "any" ) + regtype + + + Returns the OID of the data type of the value that is passed to it. + This can be helpful for troubleshooting or dynamically constructing + SQL queries. The function is declared as + returning regtype, which is an OID alias type (see + ); this means that it is the same as an + OID for comparison purposes but displays as a type name. + + + pg_typeof(33) + integer + + + + + + + COLLATION FOR + + COLLATION FOR ( "any" ) + text + + + Returns the name of the collation of the value that is passed to it. + The value is quoted and schema-qualified if necessary. If no + collation was derived for the argument expression, + then NULL is returned. If the argument is not of a + collatable data type, then an error is raised. + + + COLLATION FOR ('foo'::text) + "default" + + + COLLATION FOR ('foo' COLLATE "de_DE") + "de_DE" + + + + + + + to_regclass + + to_regclass ( text ) + regclass + + + Translates a textual relation name to its OID. A similar result is + obtained by casting the string to type regclass (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found. + + + + + + + to_regdatabase + + to_regdatabase ( text ) + regdatabase + + + Translates a textual database name to its OID. A similar result is + obtained by casting the string to type regdatabase (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found. + + + + + + + to_regcollation + + to_regcollation ( text ) + regcollation + + + Translates a textual collation name to its OID. A similar result is + obtained by casting the string to type regcollation (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found. + + + + + + + to_regnamespace + + to_regnamespace ( text ) + regnamespace + + + Translates a textual schema name to its OID. A similar result is + obtained by casting the string to type regnamespace (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found. + + + + + + + to_regoper + + to_regoper ( text ) + regoper + + + Translates a textual operator name to its OID. A similar result is + obtained by casting the string to type regoper (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found or is ambiguous. + + + + + + + to_regoperator + + to_regoperator ( text ) + regoperator + + + Translates a textual operator name (with parameter types) to its OID. 
A similar result is + obtained by casting the string to type regoperator (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found. + + + + + + + to_regproc + + to_regproc ( text ) + regproc + + + Translates a textual function or procedure name to its OID. A similar result is + obtained by casting the string to type regproc (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found or is ambiguous. + + + + + + + to_regprocedure + + to_regprocedure ( text ) + regprocedure + + + Translates a textual function or procedure name (with argument types) to its OID. A similar result is + obtained by casting the string to type regprocedure (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found. + + + + + + + to_regrole + + to_regrole ( text ) + regrole + + + Translates a textual role name to its OID. A similar result is + obtained by casting the string to type regrole (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found. + + + + + + + to_regtype + + to_regtype ( text ) + regtype + + + Parses a string of text, extracts a potential type name from it, + and translates that name into a type OID. A syntax error in the + string will result in an error; but if the string is a + syntactically valid type name that happens not to be found in the + catalogs, the result is NULL. A similar result + is obtained by casting the string to type regtype + (see ), except that that will throw + error for name not found. + + + + + + + to_regtypemod + + to_regtypemod ( text ) + integer + + + Parses a string of text, extracts a potential type name from it, + and translates its type modifier, if any. A syntax error in the + string will result in an error; but if the string is a + syntactically valid type name that happens not to be found in the + catalogs, the result is NULL. The result is + -1 if no type modifier is present. + + + to_regtypemod can be combined with + to produce appropriate inputs for + , allowing a string representing a + type name to be canonicalized. + + + format_type(to_regtype('varchar(32)'), to_regtypemod('varchar(32)')) + character varying(32) + + + + +
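Because the to_reg* functions return NULL instead of raising an error, they are convenient for existence checks in ordinary queries. A small sketch; the object names are hypothetical:

SELECT to_regclass('public.no_such_table') IS NULL AS table_missing;
SELECT to_regproc('no_such_function') IS NULL AS function_missing;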
+ + + Most of the functions that reconstruct (decompile) database objects + have an optional pretty flag, which + if true causes the result to + be pretty-printed. Pretty-printing suppresses unnecessary + parentheses and adds whitespace for legibility. + The pretty-printed format is more readable, but the default format + is more likely to be interpreted the same way by future versions of + PostgreSQL; so avoid using pretty-printed output + for dump purposes. Passing false for + the pretty parameter yields the same result as + omitting the parameter. + + + + Index Column Properties + + + + + NameDescription + + + + asc + Does the column sort in ascending order on a forward scan? + + + + desc + Does the column sort in descending order on a forward scan? + + + + nulls_first + Does the column sort with nulls first on a forward scan? + + + + nulls_last + Does the column sort with nulls last on a forward scan? + + + + orderable + Does the column possess any defined sort ordering? + + + + distance_orderable + Can the column be scanned in order by a distance + operator, for example ORDER BY col <-> constant ? + + + + returnable + Can the column value be returned by an index-only scan? + + + + search_array + Does the column natively support col = ANY(array) + searches? + + + + search_nulls + Does the column support IS NULL and + IS NOT NULL searches? + + + + +
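For example, all of the properties above can be probed for one index column in a single query. This is a sketch only; the index name mytable_pkey is hypothetical:

SELECT p.name,
       pg_index_column_has_property('mytable_pkey'::regclass, 1, p.name)
FROM unnest(ARRAY['asc', 'desc', 'nulls_first', 'nulls_last',
                  'orderable', 'distance_orderable', 'returnable',
                  'search_array', 'search_nulls']) AS p(name);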
+ + + Index Properties + + + + + NameDescription + + + + clusterable + Can the index be used in a CLUSTER command? + + + + index_scan + Does the index support plain (non-bitmap) scans? + + + + bitmap_scan + Does the index support bitmap scans? + + + + backward_scan + Can the scan direction be changed in mid-scan (to + support FETCH BACKWARD on a cursor without + needing materialization)? + + + + +
+ + + Index Access Method Properties + + + + + NameDescription + + + + can_order + Does the access method support ASC, + DESC and related keywords in + CREATE INDEX? + + + + can_unique + Does the access method support unique indexes? + + + + can_multi_col + Does the access method support indexes with multiple columns? + + + + can_exclude + Does the access method support exclusion constraints? + + + + can_include + Does the access method support the INCLUDE + clause of CREATE INDEX? + + + + +
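As a sketch, the index-level and access-method-level property functions can be called directly; the index name mytable_pkey is hypothetical, while btree is a built-in access method:

SELECT pg_index_has_property('mytable_pkey'::regclass, 'backward_scan');

SELECT pg_indexam_has_property(a.oid, 'can_unique')
FROM pg_am AS a
WHERE a.amname = 'btree';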
+ + + GUC Flags + + + + + FlagDescription + + + + EXPLAIN + Parameters with this flag are included in + EXPLAIN (SETTINGS) commands. + + + + NO_SHOW_ALL + Parameters with this flag are excluded from + SHOW ALL commands. + + + + NO_RESET + Parameters with this flag do not support + RESET commands. + + + + NO_RESET_ALL + Parameters with this flag are excluded from + RESET ALL commands. + + + + NOT_IN_SAMPLE + Parameters with this flag are not included in + postgresql.conf by default. + + + + RUNTIME_COMPUTED + Parameters with this flag are runtime-computed ones. + + + + +
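For instance, the returned flags can be tested with ordinary array operations; the parameter names below are only illustrative:

SELECT pg_settings_get_flags('wal_level');

-- NULL for an unknown parameter:
SELECT pg_settings_get_flags('no_such_parameter') IS NULL;

-- Test for one specific flag:
SELECT 'NOT_IN_SAMPLE' = ANY (pg_settings_get_flags('wal_level'));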
+ +
+ + + Object Information and Addressing Functions + + + lists functions related to + database object identification and addressing. + + + + Object Information and Addressing Functions + + + + + Function + + + Description + + + + + + + + + pg_get_acl + + pg_get_acl ( classid oid, objid oid, objsubid integer ) + aclitem[] + + + Returns the ACL for a database object, specified + by catalog OID, object OID and sub-object ID. This function returns + NULL values for undefined objects. + + + + + + + pg_describe_object + + pg_describe_object ( classid oid, objid oid, objsubid integer ) + text + + + Returns a textual description of a database object identified by + catalog OID, object OID, and sub-object ID (such as a column number + within a table; the sub-object ID is zero when referring to a whole + object). This description is intended to be human-readable, and might + be translated, depending on server configuration. This is especially + useful to determine the identity of an object referenced in the + pg_depend catalog. This function returns + NULL values for undefined objects. + + + + + + + pg_identify_object + + pg_identify_object ( classid oid, objid oid, objsubid integer ) + record + ( type text, + schema text, + name text, + identity text ) + + + Returns a row containing enough information to uniquely identify the + database object specified by catalog OID, object OID and sub-object + ID. + This information is intended to be machine-readable, and is never + translated. + type identifies the type of database object; + schema is the schema name that the object + belongs in, or NULL for object types that do not + belong to schemas; + name is the name of the object, quoted if + necessary, if the name (along with schema name, if pertinent) is + sufficient to uniquely identify the object, + otherwise NULL; + identity is the complete object identity, with + the precise format depending on object type, and each name within the + format being schema-qualified and quoted as necessary. Undefined + objects are identified with NULL values. + + + + + + + pg_identify_object_as_address + + pg_identify_object_as_address ( classid oid, objid oid, objsubid integer ) + record + ( type text, + object_names text[], + object_args text[] ) + + + Returns a row containing enough information to uniquely identify the + database object specified by catalog OID, object OID and sub-object + ID. + The returned information is independent of the current server, that + is, it could be used to identify an identically named object in + another server. + type identifies the type of database object; + object_names and + object_args + are text arrays that together form a reference to the object. + These three values can be passed + to pg_get_object_address to obtain the internal + address of the object. + + + + + + + pg_get_object_address + + pg_get_object_address ( type text, object_names text[], object_args text[] ) + record + ( classid oid, + objid oid, + objsubid integer ) + + + Returns a row containing enough information to uniquely identify the + database object specified by a type code and object name and argument + arrays. + The returned values are the ones that would be used in system catalogs + such as pg_depend; they can be passed to + other system functions such as pg_describe_object + or pg_identify_object. + classid is the OID of the system catalog + containing the object; + objid is the OID of the object itself, and + objsubid is the sub-object ID, or zero if none. 
+ This function is the inverse + of pg_identify_object_as_address. + Undefined objects are identified with NULL values. + + + + +
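The inverse relationship can be seen in a short round trip; the table name mytable in this sketch is hypothetical:

-- Name-and-arguments form to catalog address:
SELECT * FROM pg_get_object_address('table', '{public,mytable}', '{}');

-- Catalog address back to a human-readable description:
SELECT pg_describe_object(classid, objid, objsubid)
FROM pg_get_object_address('table', '{public,mytable}', '{}');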
+ + + pg_get_acl is useful for retrieving and inspecting + the privileges associated with database objects without looking at + specific catalogs. For example, to retrieve all the granted privileges + on objects in the current database: + +postgres=# SELECT + (pg_identify_object(s.classid,s.objid,s.objsubid)).*, + pg_catalog.pg_get_acl(s.classid,s.objid,s.objsubid) AS acl +FROM pg_catalog.pg_shdepend AS s +JOIN pg_catalog.pg_database AS d + ON d.datname = current_database() AND + d.oid = s.dbid +JOIN pg_catalog.pg_authid AS a + ON a.oid = s.refobjid AND + s.refclassid = 'pg_authid'::regclass +WHERE s.deptype = 'a'; +-[ RECORD 1 ]----------------------------------------- +type | table +schema | public +name | testtab +identity | public.testtab +acl | {postgres=arwdDxtm/postgres,foo=r/postgres} + + + +
+ + + Comment Information Functions + + + comment + about database objects + + + + The functions shown in + extract comments previously stored with the + command. A null value is returned if no + comment could be found for the specified parameters. + + + + Comment Information Functions + + + + + Function + + + Description + + + + + + + + + col_description + + col_description ( table oid, column integer ) + text + + + Returns the comment for a table column, which is specified by the OID + of its table and its column number. + (obj_description cannot be used for table + columns, since columns do not have OIDs of their own.) + + + + + + + obj_description + + obj_description ( object oid, catalog name ) + text + + + Returns the comment for a database object specified by its OID and the + name of the containing system catalog. For + example, obj_description(123456, 'pg_class') would + retrieve the comment for the table with OID 123456. + + + + + + obj_description ( object oid ) + text + + + Returns the comment for a database object specified by its OID alone. + This is deprecated since there is no guarantee + that OIDs are unique across different system catalogs; therefore, the + wrong comment might be returned. + + + + + + + shobj_description + + shobj_description ( object oid, catalog name ) + text + + + Returns the comment for a shared database object specified by its OID + and the name of the containing system catalog. This is just + like obj_description except that it is used for + retrieving comments on shared objects (that is, databases, roles, and + tablespaces). Some system catalogs are global to all databases within + each cluster, and the descriptions for objects in them are stored + globally as well. + + + + +
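A short sketch of storing and retrieving comments; the table mytable is hypothetical, and id is assumed to be its first column (column number 1):

COMMENT ON TABLE mytable IS 'customer master data';
COMMENT ON COLUMN mytable.id IS 'surrogate key';

SELECT obj_description('mytable'::regclass, 'pg_class');
SELECT col_description('mytable'::regclass, 1);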
+ +
+ + + Data Validity Checking Functions + + + The functions shown in + can be helpful for checking validity of proposed input data. + + + + Data Validity Checking Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + pg_input_is_valid + + pg_input_is_valid ( + string text, + type text + ) + boolean + + + Tests whether the given string is valid + input for the specified data type, returning true or false. + + + This function will only work as desired if the data type's input + function has been updated to report invalid input as + a soft error. Otherwise, invalid input will abort + the transaction, just as if the string had been cast to the type + directly. + + + pg_input_is_valid('42', 'integer') + t + + + pg_input_is_valid('42000000000', 'integer') + f + + + pg_input_is_valid('1234.567', 'numeric(7,4)') + f + + + + + + pg_input_error_info + + pg_input_error_info ( + string text, + type text + ) + record + ( message text, + detail text, + hint text, + sql_error_code text ) + + + Tests whether the given string is valid + input for the specified data type; if not, return the details of + the error that would have been thrown. If the input is valid, the + results are NULL. The inputs are the same as + for pg_input_is_valid. + + + This function will only work as desired if the data type's input + function has been updated to report invalid input as + a soft error. Otherwise, invalid input will abort + the transaction, just as if the string had been cast to the type + directly. + + + SELECT * FROM pg_input_error_info('42000000000', 'integer') + + + message | detail | hint | sql_error_code +------------------------------------------------------+--------+------+---------------- + value "42000000000" is out of range for type integer | | | 22003 + + + + + +
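A typical use is filtering untrusted text before casting it, so that bad values do not abort the transaction. The staging table raw_input(val text) in this sketch is hypothetical:

SELECT val, val::date
FROM raw_input
WHERE pg_input_is_valid(val, 'date');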
+ +
+ + + Transaction ID and Snapshot Information Functions + + + The functions shown in + provide server transaction information in an exportable form. The main + use of these functions is to determine which transactions were committed + between two snapshots. + + + + Transaction ID and Snapshot Information Functions + + + + + Function + + + Description + + + + + + + + + age + + age ( xid ) + integer + + + Returns the number of transactions between the supplied + transaction id and the current transaction counter. + + + + + + + mxid_age + + mxid_age ( xid ) + integer + + + Returns the number of multixacts IDs between the supplied + multixact ID and the current multixacts counter. + + + + + + + pg_current_xact_id + + pg_current_xact_id () + xid8 + + + Returns the current transaction's ID. It will assign a new one if the + current transaction does not have one already (because it has not + performed any database updates); see for details. If executed in a + subtransaction, this will return the top-level transaction ID; + see for details. + + + + + + + pg_current_xact_id_if_assigned + + pg_current_xact_id_if_assigned () + xid8 + + + Returns the current transaction's ID, or NULL if no + ID is assigned yet. (It's best to use this variant if the transaction + might otherwise be read-only, to avoid unnecessary consumption of an + XID.) + If executed in a subtransaction, this will return the top-level + transaction ID. + + + + + + + pg_xact_status + + pg_xact_status ( xid8 ) + text + + + Reports the commit status of a recent transaction. + The result is one of in progress, + committed, or aborted, + provided that the transaction is recent enough that the system retains + the commit status of that transaction. + If it is old enough that no references to the transaction survive in + the system and the commit status information has been discarded, the + result is NULL. + Applications might use this function, for example, to determine + whether their transaction committed or aborted after the application + and database server become disconnected while + a COMMIT is in progress. + Note that prepared transactions are reported as in + progress; applications must check pg_prepared_xacts + if they need to determine whether a transaction ID belongs to a + prepared transaction. + + + + + + + pg_current_snapshot + + pg_current_snapshot () + pg_snapshot + + + Returns a current snapshot, a data structure + showing which transaction IDs are now in-progress. + Only top-level transaction IDs are included in the snapshot; + subtransaction IDs are not shown; see + for details. + + + + + + + pg_snapshot_xip + + pg_snapshot_xip ( pg_snapshot ) + setof xid8 + + + Returns the set of in-progress transaction IDs contained in a snapshot. + + + + + + + pg_snapshot_xmax + + pg_snapshot_xmax ( pg_snapshot ) + xid8 + + + Returns the xmax of a snapshot. + + + + + + + pg_snapshot_xmin + + pg_snapshot_xmin ( pg_snapshot ) + xid8 + + + Returns the xmin of a snapshot. + + + + + + + pg_visible_in_snapshot + + pg_visible_in_snapshot ( xid8, pg_snapshot ) + boolean + + + Is the given transaction ID visible according + to this snapshot (that is, was it completed before the snapshot was + taken)? Note that this function will not give the correct answer for + a subtransaction ID (subxid); see for + details. + + + + + + + pg_get_multixact_members + + pg_get_multixact_members ( multixid xid ) + setof record + ( xid xid, + mode text ) + + + Returns the transaction ID and lock mode for each member of the + specified multixact ID. 
The lock modes forupd, + fornokeyupd, sh, and + keysh correspond to the row-level locks + FOR UPDATE, FOR NO KEY UPDATE, + FOR SHARE, and FOR KEY SHARE, + respectively, as described in . Two + additional modes are specific to multixacts: + nokeyupd, used by updates that do not modify key + columns, and upd, used by updates or deletes that + modify key columns. + + + + +
+ + + The internal transaction ID type xid is 32 bits wide and + wraps around every 4 billion transactions. However, + the functions shown in , except + age, mxid_age, and + pg_get_multixact_members, use a + 64-bit type xid8 that does not wrap around during the life + of an installation and can be converted to xid by casting if + required; see for details. + The data type pg_snapshot stores information about + transaction ID visibility at a particular moment in time. Its components + are described in . + pg_snapshot's textual representation is + xmin:xmax:xip_list. + For example 10:20:10,14,15 means + xmin=10, xmax=20, xip_list=10, 14, 15. + + + + Snapshot Components + + + + + + Name + Description + + + + + + xmin + + Lowest transaction ID that was still active. All transaction IDs + less than xmin are either committed and visible, + or rolled back and dead. + + + + + xmax + + One past the highest completed transaction ID. All transaction IDs + greater than or equal to xmax had not yet + completed as of the time of the snapshot, and thus are invisible. + + + + + xip_list + + Transactions in progress at the time of the snapshot. A transaction + ID that is xmin <= X < + xmax and not in this list was already completed at the time + of the snapshot, and thus is either visible or dead according to its + commit status. This list does not include the transaction IDs of + subtransactions (subxids). + + + + +
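As a sketch, the snapshot accessor functions can be applied to the current snapshot; the transaction ID 742 is an arbitrary illustration value:

SELECT pg_snapshot_xmin(pg_current_snapshot()) AS xmin,
       pg_snapshot_xmax(pg_current_snapshot()) AS xmax;

-- Was transaction 742 already completed when the snapshot was taken?
SELECT pg_visible_in_snapshot('742'::xid8, pg_current_snapshot());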
+ + + In releases of PostgreSQL before 13 there was + no xid8 type, so variants of these functions were provided + that used bigint to represent a 64-bit XID, with a + correspondingly distinct snapshot data type txid_snapshot. + These older functions have txid in their names. They + are still supported for backward compatibility, but may be removed from a + future release. See . + + + + Deprecated Transaction ID and Snapshot Information Functions + + + + + Function + + + Description + + + + + + + + + + txid_current + + txid_current () + bigint + + + See pg_current_xact_id(). + + + + + + + txid_current_if_assigned + + txid_current_if_assigned () + bigint + + + See pg_current_xact_id_if_assigned(). + + + + + + + txid_current_snapshot + + txid_current_snapshot () + txid_snapshot + + + See pg_current_snapshot(). + + + + + + + txid_snapshot_xip + + txid_snapshot_xip ( txid_snapshot ) + setof bigint + + + See pg_snapshot_xip(). + + + + + + + txid_snapshot_xmax + + txid_snapshot_xmax ( txid_snapshot ) + bigint + + + See pg_snapshot_xmax(). + + + + + + + txid_snapshot_xmin + + txid_snapshot_xmin ( txid_snapshot ) + bigint + + + See pg_snapshot_xmin(). + + + + + + + txid_visible_in_snapshot + + txid_visible_in_snapshot ( bigint, txid_snapshot ) + boolean + + + See pg_visible_in_snapshot(). + + + + + + + txid_status + + txid_status ( bigint ) + text + + + See pg_xact_status(). + + + + +
+ +
+ + + Committed Transaction Information Functions + + + The functions shown in + provide information about when past transactions were committed. + They only provide useful data when the + configuration option is + enabled, and only for transactions that were committed after it was + enabled. Commit timestamp information is routinely removed during + vacuum. + + + + Committed Transaction Information Functions + + + + + Function + + + Description + + + + + + + + + pg_xact_commit_timestamp + + pg_xact_commit_timestamp ( xid ) + timestamp with time zone + + + Returns the commit timestamp of a transaction. + + + + + + + pg_xact_commit_timestamp_origin + + pg_xact_commit_timestamp_origin ( xid ) + record + ( timestamp timestamp with time zone, + roident oid) + + + Returns the commit timestamp and replication origin of a transaction. + + + + + + + pg_last_committed_xact + + pg_last_committed_xact () + record + ( xid xid, + timestamp timestamp with time zone, + roident oid ) + + + Returns the transaction ID, commit timestamp and replication origin + of the latest committed transaction. + + + + +
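For example, commit timestamps can be read back through the xmin system column, assuming track_commit_timestamp was already enabled when the rows were written; the table mytable is hypothetical:

SELECT xmin, pg_xact_commit_timestamp(xmin)
FROM mytable
LIMIT 10;

SELECT * FROM pg_last_committed_xact();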
+ +
+ + + Control Data Functions + + + The functions shown in + print information initialized during initdb, such + as the catalog version. They also show information about write-ahead + logging and checkpoint processing. This information is cluster-wide, + not specific to any one database. These functions provide most of the same + information, from the same source, as the + application. + + + + Control Data Functions + + + + + Function + + + Description + + + + + + + + + pg_control_checkpoint + + pg_control_checkpoint () + record + + + Returns information about current checkpoint state, as shown in + . + + + + + + + pg_control_system + + pg_control_system () + record + + + Returns information about current control file state, as shown in + . + + + + + + + pg_control_init + + pg_control_init () + record + + + Returns information about cluster initialization state, as shown in + . + + + + + + + pg_control_recovery + + pg_control_recovery () + record + + + Returns information about recovery state, as shown in + . + + + + +
+ + + <function>pg_control_checkpoint</function> Output Columns + + + + Column Name + Data Type + + + + + + + checkpoint_lsn + pg_lsn + + + + redo_lsn + pg_lsn + + + + redo_wal_file + text + + + + timeline_id + integer + + + + prev_timeline_id + integer + + + + full_page_writes + boolean + + + + next_xid + text + + + + next_oid + oid + + + + next_multixact_id + xid + + + + next_multi_offset + xid + + + + oldest_xid + xid + + + + oldest_xid_dbid + oid + + + + oldest_active_xid + xid + + + + oldest_multi_xid + xid + + + + oldest_multi_dbid + oid + + + + oldest_commit_ts_xid + xid + + + + newest_commit_ts_xid + xid + + + + checkpoint_time + timestamp with time zone + + + + +
+ + + <function>pg_control_system</function> Output Columns + + + + Column Name + Data Type + + + + + + + pg_control_version + integer + + + + catalog_version_no + integer + + + + system_identifier + bigint + + + + pg_control_last_modified + timestamp with time zone + + + + +
+ + + <function>pg_control_init</function> Output Columns + + + + Column Name + Data Type + + + + + + + max_data_alignment + integer + + + + database_block_size + integer + + + + blocks_per_segment + integer + + + + wal_block_size + integer + + + + bytes_per_wal_segment + integer + + + + max_identifier_length + integer + + + + max_index_columns + integer + + + + max_toast_chunk_size + integer + + + + large_object_chunk_size + integer + + + + float8_pass_by_value + boolean + + + + data_page_checksum_version + integer + + + + default_char_signedness + boolean + + + + +
+ + + <function>pg_control_recovery</function> Output Columns + + + + Column Name + Data Type + + + + + + + min_recovery_end_lsn + pg_lsn + + + + min_recovery_end_timeline + integer + + + + backup_start_lsn + pg_lsn + + + + backup_end_lsn + pg_lsn + + + + end_of_backup_record_required + boolean + + + + +
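For a quick look at the control data, the record-returning functions can be expanded with SELECT *, or individual columns can be picked out:

SELECT * FROM pg_control_system();

SELECT checkpoint_lsn, redo_lsn, checkpoint_time
FROM pg_control_checkpoint();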
+ +
+ + + Version Information Functions + + + The functions shown in + print version information. + + + + Version Information Functions + + + + + Function + + + Description + + + + + + + + + version + + version () + text + + + Returns a string describing the PostgreSQL + server's version. You can also get this information from + , or for a machine-readable + version use . Software + developers should use server_version_num (available + since 8.2) or instead of + parsing the text version. + + + + + + + unicode_version + + unicode_version () + text + + + Returns a string representing the version of Unicode used by + PostgreSQL. + + + + + + icu_unicode_version + + icu_unicode_version () + text + + + Returns a string representing the version of Unicode used by ICU, if + the server was built with ICU support; otherwise returns + NULL + + + +
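For example (output will of course vary by installation):

SELECT version();
SELECT unicode_version();

-- Machine-readable version test:
SELECT current_setting('server_version_num')::int >= 170000 AS at_least_17;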
+ +
WAL Summarization Information Functions

The functions shown in print information about the status of WAL summarization. See .

WAL Summarization Information Functions

Function

Description

pg_available_wal_summaries

pg_available_wal_summaries ()
setof record
( tli bigint,
start_lsn pg_lsn,
end_lsn pg_lsn )

Returns information about the WAL summary files present in the data directory, under pg_wal/summaries. One row will be returned per WAL summary file. Each file summarizes WAL on the indicated TLI within the indicated LSN range. This function might be useful to determine whether enough WAL summaries are present on the server to take an incremental backup based on some prior backup whose start LSN is known.

pg_wal_summary_contents

pg_wal_summary_contents ( tli bigint, start_lsn pg_lsn, end_lsn pg_lsn )
setof record
( relfilenode oid,
reltablespace oid,
reldatabase oid,
relforknumber smallint,
relblocknumber bigint,
is_limit_block boolean )

Returns information about the contents of a single WAL summary file identified by TLI and starting and ending LSNs. Each row with is_limit_block false indicates that the block identified by the remaining output columns was modified by at least one WAL record within the range of records summarized by this file. Each row with is_limit_block true indicates either that (a) the relation fork was truncated to the length given by relblocknumber within the relevant range of WAL records or (b) that the relation fork was created or dropped within the relevant range of WAL records; in such cases, relblocknumber will be zero.

pg_get_wal_summarizer_state

pg_get_wal_summarizer_state ()
record
( summarized_tli bigint,
summarized_lsn pg_lsn,
pending_lsn pg_lsn,
summarizer_pid int )

Returns information about the progress of the WAL summarizer. If the WAL summarizer has never run since the instance was started, then summarized_tli and summarized_lsn will be 0 and 0/00000000 respectively; otherwise, they will be the TLI and ending LSN of the last WAL summary file written to disk. If the WAL summarizer is currently running, pending_lsn will be the ending LSN of the last record that it has consumed, which must always be greater than or equal to summarized_lsn; if the WAL summarizer is not running, it will be equal to summarized_lsn. summarizer_pid is the PID of the WAL summarizer process, if it is running, and otherwise NULL.

As a special exception, the WAL summarizer will refuse to generate WAL summary files if run on WAL generated under wal_level=minimal, since such summaries would be unsafe to use as the basis for an incremental backup. In this case, the fields above will continue to advance as if summaries were being generated, but nothing will be written to disk. Once the summarizer reaches WAL generated while wal_level was set to replica or higher, it will resume writing summaries to disk.
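A sketch of monitoring summarization (this assumes summarize_wal is enabled; otherwise the summarizer writes nothing):

SELECT * FROM pg_get_wal_summarizer_state();

-- The most recent summaries on disk:
SELECT tli, start_lsn, end_lsn
FROM pg_available_wal_summaries
ORDER BY end_lsn DESC
LIMIT 5;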
+ +
+ +
diff --git a/doc/src/sgml/func/func-json.sgml b/doc/src/sgml/func/func-json.sgml new file mode 100644 index 0000000000000..1ec73cff4645d --- /dev/null +++ b/doc/src/sgml/func/func-json.sgml @@ -0,0 +1,3945 @@ + + JSON Functions and Operators + + + JSON + functions and operators + + + SQL/JSON + functions and expressions + + + + This section describes: + + + + + functions and operators for processing and creating JSON data + + + + + the SQL/JSON path language + + + + + the SQL/JSON query functions + + + + + + + To provide native support for JSON data types within the SQL environment, + PostgreSQL implements the + SQL/JSON data model. + This model comprises sequences of items. Each item can hold SQL scalar + values, with an additional SQL/JSON null value, and composite data structures + that use JSON arrays and objects. The model is a formalization of the implied + data model in the JSON specification + RFC 7159. + + + + SQL/JSON allows you to handle JSON data alongside regular SQL data, + with transaction support, including: + + + + + Uploading JSON data into the database and storing it in + regular SQL columns as character or binary strings. + + + + + Generating JSON objects and arrays from relational data. + + + + + Querying JSON data using SQL/JSON query functions and + SQL/JSON path language expressions. + + + + + + + To learn more about the SQL/JSON standard, see + . For details on JSON types + supported in PostgreSQL, + see . + + + + Processing and Creating JSON Data + + + shows the operators that + are available for use with JSON data types (see ). + In addition, the usual comparison operators shown in are available for + jsonb, though not for json. The comparison + operators follow the ordering rules for B-tree operations outlined in + . + See also for the aggregate + function json_agg which aggregates record + values as JSON, the aggregate function + json_object_agg which aggregates pairs of values + into a JSON object, and their jsonb equivalents, + jsonb_agg and jsonb_object_agg. + + + + <type>json</type> and <type>jsonb</type> Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + json -> integer + json + + + jsonb -> integer + jsonb + + + Extracts n'th element of JSON array + (array elements are indexed from zero, but negative integers count + from the end). + + + '[{"a":"foo"},{"b":"bar"},{"c":"baz"}]'::json -> 2 + {"c":"baz"} + + + '[{"a":"foo"},{"b":"bar"},{"c":"baz"}]'::json -> -3 + {"a":"foo"} + + + + + + json -> text + json + + + jsonb -> text + jsonb + + + Extracts JSON object field with the given key. + + + '{"a": {"b":"foo"}}'::json -> 'a' + {"b":"foo"} + + + + + + json ->> integer + text + + + jsonb ->> integer + text + + + Extracts n'th element of JSON array, + as text. + + + '[1,2,3]'::json ->> 2 + 3 + + + + + + json ->> text + text + + + jsonb ->> text + text + + + Extracts JSON object field with the given key, as text. + + + '{"a":1,"b":2}'::json ->> 'b' + 2 + + + + + + json #> text[] + json + + + jsonb #> text[] + jsonb + + + Extracts JSON sub-object at the specified path, where path elements + can be either field keys or array indexes. + + + '{"a": {"b": ["foo","bar"]}}'::json #> '{a,b,1}' + "bar" + + + + + + json #>> text[] + text + + + jsonb #>> text[] + text + + + Extracts JSON sub-object at the specified path as text. + + + '{"a": {"b": ["foo","bar"]}}'::json #>> '{a,b,1}' + bar + + + + +
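In practice these operators are usually chained; a minimal sketch against a hypothetical table orders(info jsonb):

SELECT info -> 'customer' ->> 'name' AS customer_name
FROM orders
WHERE (info -> 'items' -> 0 ->> 'qty')::int > 10;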
+ + + + The field/element/path extraction operators return NULL, rather than + failing, if the JSON input does not have the right structure to match + the request; for example if no such key or array element exists. + + + + + Some further operators exist only for jsonb, as shown + in . + + describes how these operators can be used to effectively search indexed + jsonb data. + + + + Additional <type>jsonb</type> Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + jsonb @> jsonb + boolean + + + Does the first JSON value contain the second? + (See for details about containment.) + + + '{"a":1, "b":2}'::jsonb @> '{"b":2}'::jsonb + t + + + + + + jsonb <@ jsonb + boolean + + + Is the first JSON value contained in the second? + + + '{"b":2}'::jsonb <@ '{"a":1, "b":2}'::jsonb + t + + + + + + jsonb ? text + boolean + + + Does the text string exist as a top-level key or array element within + the JSON value? + + + '{"a":1, "b":2}'::jsonb ? 'b' + t + + + '["a", "b", "c"]'::jsonb ? 'b' + t + + + + + + jsonb ?| text[] + boolean + + + Do any of the strings in the text array exist as top-level keys or + array elements? + + + '{"a":1, "b":2, "c":3}'::jsonb ?| array['b', 'd'] + t + + + + + + jsonb ?& text[] + boolean + + + Do all of the strings in the text array exist as top-level keys or + array elements? + + + '["a", "b", "c"]'::jsonb ?& array['a', 'b'] + t + + + + + + jsonb || jsonb + jsonb + + + Concatenates two jsonb values. + Concatenating two arrays generates an array containing all the + elements of each input. Concatenating two objects generates an + object containing the union of their + keys, taking the second object's value when there are duplicate keys. + All other cases are treated by converting a non-array input into a + single-element array, and then proceeding as for two arrays. + Does not operate recursively: only the top-level array or object + structure is merged. + + + '["a", "b"]'::jsonb || '["a", "d"]'::jsonb + ["a", "b", "a", "d"] + + + '{"a": "b"}'::jsonb || '{"c": "d"}'::jsonb + {"a": "b", "c": "d"} + + + '[1, 2]'::jsonb || '3'::jsonb + [1, 2, 3] + + + '{"a": "b"}'::jsonb || '42'::jsonb + [{"a": "b"}, 42] + + + To append an array to another array as a single entry, wrap it + in an additional layer of array, for example: + + + '[1, 2]'::jsonb || jsonb_build_array('[3, 4]'::jsonb) + [1, 2, [3, 4]] + + + + + + jsonb - text + jsonb + + + Deletes a key (and its value) from a JSON object, or matching string + value(s) from a JSON array. + + + '{"a": "b", "c": "d"}'::jsonb - 'a' + {"c": "d"} + + + '["a", "b", "c", "b"]'::jsonb - 'b' + ["a", "c"] + + + + + + jsonb - text[] + jsonb + + + Deletes all matching keys or array elements from the left operand. + + + '{"a": "b", "c": "d"}'::jsonb - '{a,c}'::text[] + {} + + + + + + jsonb - integer + jsonb + + + Deletes the array element with specified index (negative + integers count from the end). Throws an error if JSON value + is not an array. + + + '["a", "b"]'::jsonb - 1 + ["a"] + + + + + + jsonb #- text[] + jsonb + + + Deletes the field or array element at the specified path, where path + elements can be either field keys or array indexes. + + + '["a", {"b":1}]'::jsonb #- '{1,b}' + ["a", {}] + + + + + + jsonb @? jsonpath + boolean + + + Does JSON path return any item for the specified JSON value? + (This is useful only with SQL-standard JSON path expressions, not + predicate check + expressions, since those always return a value.) + + + '{"a":[1,2,3,4,5]}'::jsonb @? '$.a[*] ? 
(@ > 2)' + t + + + + + + jsonb @@ jsonpath + boolean + + + Returns the result of a JSON path predicate check for the + specified JSON value. + (This is useful only + with predicate + check expressions, not SQL-standard JSON path expressions, + since it will return NULL if the path result is + not a single boolean value.) + + + '{"a":[1,2,3,4,5]}'::jsonb @@ '$.a[*] > 2' + t + + + + +
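The containment and path operators pair naturally with a GIN index; this is a minimal sketch with a hypothetical table docs(payload jsonb):

CREATE INDEX docs_payload_idx ON docs USING GIN (payload);

-- Both queries can make use of the index:
SELECT * FROM docs WHERE payload @> '{"status": "active"}';
SELECT * FROM docs WHERE payload @? '$.tags[*] ? (@ == "urgent")';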
+ + + + The jsonpath operators @? + and @@ suppress the following errors: missing object + field or array element, unexpected JSON item type, datetime and numeric + errors. The jsonpath-related functions described below can + also be told to suppress these types of errors. This behavior might be + helpful when searching JSON document collections of varying structure. + + + + + shows the functions that are + available for constructing json and jsonb values. + Some functions in this table have a RETURNING clause, + which specifies the data type returned. It must be one of json, + jsonb, bytea, a character string type (text, + char, or varchar), or a type + that can be cast to json. + By default, the json type is returned. + + + + JSON Creation Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + to_json + + to_json ( anyelement ) + json + + + + to_jsonb + + to_jsonb ( anyelement ) + jsonb + + + Converts any SQL value to json or jsonb. + Arrays and composites are converted recursively to arrays and + objects (multidimensional arrays become arrays of arrays in JSON). + Otherwise, if there is a cast from the SQL data type + to json, the cast function will be used to perform the + conversion; + + For example, the extension has a cast + from hstore to json, so that + hstore values converted via the JSON creation functions + will be represented as JSON objects, not as primitive string values. + + + otherwise, a scalar JSON value is produced. For any scalar other than + a number, a Boolean, or a null value, the text representation will be + used, with escaping as necessary to make it a valid JSON string value. + + + to_json('Fred said "Hi."'::text) + "Fred said \"Hi.\"" + + + to_jsonb(row(42, 'Fred said "Hi."'::text)) + {"f1": 42, "f2": "Fred said \"Hi.\""} + + + + + + + array_to_json + + array_to_json ( anyarray , boolean ) + json + + + Converts an SQL array to a JSON array. The behavior is the same + as to_json except that line feeds will be added + between top-level array elements if the optional boolean parameter is + true. + + + array_to_json('{{1,5},{99,100}}'::int[]) + [[1,5],[99,100]] + + + + + + + json_array + json_array ( + { value_expression FORMAT JSON } , ... + { NULL | ABSENT } ON NULL + RETURNING data_type FORMAT JSON ENCODING UTF8 ) + + + json_array ( + query_expression + RETURNING data_type FORMAT JSON ENCODING UTF8 ) + + + Constructs a JSON array from either a series of + value_expression parameters or from the results + of query_expression, + which must be a SELECT query returning a single column. If + ABSENT ON NULL is specified, NULL values are ignored. + This is always the case if a + query_expression is used. + + + json_array(1,true,json '{"a":null}') + [1, true, {"a":null}] + + + json_array(SELECT * FROM (VALUES(1),(2)) t) + [1, 2] + + + + + + + row_to_json + + row_to_json ( record , boolean ) + json + + + Converts an SQL composite value to a JSON object. The behavior is the + same as to_json except that line feeds will be + added between top-level elements if the optional boolean parameter is + true. + + + row_to_json(row(1,'foo')) + {"f1":1,"f2":"foo"} + + + + + + + json_build_array + + json_build_array ( VARIADIC "any" ) + json + + + + jsonb_build_array + + jsonb_build_array ( VARIADIC "any" ) + jsonb + + + Builds a possibly-heterogeneously-typed JSON array out of a variadic + argument list. Each argument is converted as + per to_json or to_jsonb. 
+ + + json_build_array(1, 2, 'foo', 4, 5) + [1, 2, "foo", 4, 5] + + + + + + + json_build_object + + json_build_object ( VARIADIC "any" ) + json + + + + jsonb_build_object + + jsonb_build_object ( VARIADIC "any" ) + jsonb + + + Builds a JSON object out of a variadic argument list. By convention, + the argument list consists of alternating keys and values. Key + arguments are coerced to text; value arguments are converted as + per to_json or to_jsonb. + + + json_build_object('foo', 1, 2, row(3,'bar')) + {"foo" : 1, "2" : {"f1":3,"f2":"bar"}} + + + + + + json_object + json_object ( + { key_expression { VALUE | ':' } + value_expression FORMAT JSON ENCODING UTF8 }, ... + { NULL | ABSENT } ON NULL + { WITH | WITHOUT } UNIQUE KEYS + RETURNING data_type FORMAT JSON ENCODING UTF8 ) + + + Constructs a JSON object of all the key/value pairs given, + or an empty object if none are given. + key_expression is a scalar expression + defining the JSON key, which is + converted to the text type. + It cannot be NULL nor can it + belong to a type that has a cast to the json type. + If WITH UNIQUE KEYS is specified, there must not + be any duplicate key_expression. + Any pair for which the value_expression + evaluates to NULL is omitted from the output + if ABSENT ON NULL is specified; + if NULL ON NULL is specified or the clause + omitted, the key is included with value NULL. + + + json_object('code' VALUE 'P123', 'title': 'Jaws') + {"code" : "P123", "title" : "Jaws"} + + + + + + + json_object + + json_object ( text[] ) + json + + + + jsonb_object + + jsonb_object ( text[] ) + jsonb + + + Builds a JSON object out of a text array. The array must have either + exactly one dimension with an even number of members, in which case + they are taken as alternating key/value pairs, or two dimensions + such that each inner array has exactly two elements, which + are taken as a key/value pair. All values are converted to JSON + strings. + + + json_object('{a, 1, b, "def", c, 3.5}') + {"a" : "1", "b" : "def", "c" : "3.5"} + + json_object('{{a, 1}, {b, "def"}, {c, 3.5}}') + {"a" : "1", "b" : "def", "c" : "3.5"} + + + + + + json_object ( keys text[], values text[] ) + json + + + jsonb_object ( keys text[], values text[] ) + jsonb + + + This form of json_object takes keys and values + pairwise from separate text arrays. Otherwise it is identical to + the one-argument form. + + + json_object('{a,b}', '{1,2}') + {"a": "1", "b": "2"} + + + + + + json constructor + json ( + expression + FORMAT JSON ENCODING UTF8 + { WITH | WITHOUT } UNIQUE KEYS ) + json + + + Converts a given expression specified as text or + bytea string (in UTF8 encoding) into a JSON + value. If expression is NULL, an + SQL null value is returned. + If WITH UNIQUE is specified, the + expression must not contain any duplicate + object keys. + + + json('{"a":123, "b":[true,"foo"], "a":"bar"}') + {"a":123, "b":[true,"foo"], "a":"bar"} + + + + + + + json_scalar + json_scalar ( expression ) + + + Converts a given SQL scalar value into a JSON scalar value. + If the input is NULL, an SQL null is returned. If + the input is number or a boolean value, a corresponding JSON number + or boolean value is returned. For any other value, a JSON string is + returned. + + + json_scalar(123.45) + 123.45 + + + json_scalar(CURRENT_TIMESTAMP) + "2022-05-10T10:51:04.62128-04:00" + + + + + + json_serialize ( + expression FORMAT JSON ENCODING UTF8 + RETURNING data_type FORMAT JSON ENCODING UTF8 ) + + + Converts an SQL/JSON expression into a character or binary string. 
The + expression can be of any JSON type, any + character string type, or bytea in UTF8 encoding. + The returned type used in RETURNING can be any + character string type or bytea. The default is + text. + + + json_serialize('{ "a" : 1 } ' RETURNING bytea) + \x7b20226122203a2031207d20 + + + + +
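These constructors nest, which is the usual way to turn relational rows into documents; the table employees in this sketch is hypothetical:

SELECT json_build_object(
         'id', e.id,
         'name', e.name,
         'contact', json_build_object('email', e.email),
         'tags', json_build_array('staff', e.department))
FROM employees AS e;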
+ + + details SQL/JSON + facilities for testing JSON. + + + + SQL/JSON Testing Functions + + + + + Function signature + + + Description + + + Example(s) + + + + + + + IS JSON + expression IS NOT JSON + { VALUE | SCALAR | ARRAY | OBJECT } + { WITH | WITHOUT } UNIQUE KEYS + + + This predicate tests whether expression can be + parsed as JSON, possibly of a specified type. + If SCALAR or ARRAY or + OBJECT is specified, the + test is whether or not the JSON is of that particular type. If + WITH UNIQUE KEYS is specified, then any object in the + expression is also tested to see if it + has duplicate keys. + + + +SELECT js, + js IS JSON "json?", + js IS JSON SCALAR "scalar?", + js IS JSON OBJECT "object?", + js IS JSON ARRAY "array?" +FROM (VALUES + ('123'), ('"abc"'), ('{"a": "b"}'), ('[1,2]'),('abc')) foo(js); + js | json? | scalar? | object? | array? +------------+-------+---------+---------+-------- + 123 | t | t | f | f + "abc" | t | t | f | f + {"a": "b"} | t | f | t | f + [1,2] | t | f | f | t + abc | f | f | f | f + + + + +SELECT js, + js IS JSON OBJECT "object?", + js IS JSON ARRAY "array?", + js IS JSON ARRAY WITH UNIQUE KEYS "array w. UK?", + js IS JSON ARRAY WITHOUT UNIQUE KEYS "array w/o UK?" +FROM (VALUES ('[{"a":"1"}, + {"b":"2","b":"3"}]')) foo(js); +-[ RECORD 1 ]-+-------------------- +js | [{"a":"1"}, + + | {"b":"2","b":"3"}] +object? | f +array? | t +array w. UK? | f +array w/o UK? | t + + + + + +
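Since IS JSON is an ordinary predicate, it can also guard a text column through a CHECK constraint. A sketch with a hypothetical table:

CREATE TABLE api_log (
    id   bigint GENERATED ALWAYS AS IDENTITY,
    body text CHECK (body IS JSON OBJECT WITH UNIQUE KEYS)
);

INSERT INTO api_log (body) VALUES ('{"a": 1}');          -- accepted
INSERT INTO api_log (body) VALUES ('{"a": 1, "a": 2}');  -- rejected: duplicate key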
+ + + shows the functions that + are available for processing json and jsonb values. + + + + JSON Processing Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + json_array_elements + + json_array_elements ( json ) + setof json + + + + jsonb_array_elements + + jsonb_array_elements ( jsonb ) + setof jsonb + + + Expands the top-level JSON array into a set of JSON values. + + + SELECT * FROM json_array_elements('[1,true, [2,false]]') + + + value +----------- + 1 + true + [2,false] + + + + + + + + json_array_elements_text + + json_array_elements_text ( json ) + setof text + + + + jsonb_array_elements_text + + jsonb_array_elements_text ( jsonb ) + setof text + + + Expands the top-level JSON array into a set of text values. + + + SELECT * FROM json_array_elements_text('["foo", "bar"]') + + + value +----------- + foo + bar + + + + + + + + json_array_length + + json_array_length ( json ) + integer + + + + jsonb_array_length + + jsonb_array_length ( jsonb ) + integer + + + Returns the number of elements in the top-level JSON array. + + + json_array_length('[1,2,3,{"f1":1,"f2":[5,6]},4]') + 5 + + + jsonb_array_length('[]') + 0 + + + + + + + json_each + + json_each ( json ) + setof record + ( key text, + value json ) + + + + jsonb_each + + jsonb_each ( jsonb ) + setof record + ( key text, + value jsonb ) + + + Expands the top-level JSON object into a set of key/value pairs. + + + SELECT * FROM json_each('{"a":"foo", "b":"bar"}') + + + key | value +-----+------- + a | "foo" + b | "bar" + + + + + + + + json_each_text + + json_each_text ( json ) + setof record + ( key text, + value text ) + + + + jsonb_each_text + + jsonb_each_text ( jsonb ) + setof record + ( key text, + value text ) + + + Expands the top-level JSON object into a set of key/value pairs. + The returned values will be of + type text. + + + SELECT * FROM json_each_text('{"a":"foo", "b":"bar"}') + + + key | value +-----+------- + a | foo + b | bar + + + + + + + + json_extract_path + + json_extract_path ( from_json json, VARIADIC path_elems text[] ) + json + + + + jsonb_extract_path + + jsonb_extract_path ( from_json jsonb, VARIADIC path_elems text[] ) + jsonb + + + Extracts JSON sub-object at the specified path. + (This is functionally equivalent to the #> + operator, but writing the path out as a variadic list can be more + convenient in some cases.) + + + json_extract_path('{"f2":{"f3":1},"f4":{"f5":99,"f6":"foo"}}', 'f4', 'f6') + "foo" + + + + + + + json_extract_path_text + + json_extract_path_text ( from_json json, VARIADIC path_elems text[] ) + text + + + + jsonb_extract_path_text + + jsonb_extract_path_text ( from_json jsonb, VARIADIC path_elems text[] ) + text + + + Extracts JSON sub-object at the specified path as text. + (This is functionally equivalent to the #>> + operator.) + + + json_extract_path_text('{"f2":{"f3":1},"f4":{"f5":99,"f6":"foo"}}', 'f4', 'f6') + foo + + + + + + + json_object_keys + + json_object_keys ( json ) + setof text + + + + jsonb_object_keys + + jsonb_object_keys ( jsonb ) + setof text + + + Returns the set of keys in the top-level JSON object. 
+ + + SELECT * FROM json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}') + + + json_object_keys +------------------ + f1 + f2 + + + + + + + + json_populate_record + + json_populate_record ( base anyelement, from_json json ) + anyelement + + + + jsonb_populate_record + + jsonb_populate_record ( base anyelement, from_json jsonb ) + anyelement + + + Expands the top-level JSON object to a row having the composite type + of the base argument. The JSON object + is scanned for fields whose names match column names of the output row + type, and their values are inserted into those columns of the output. + (Fields that do not correspond to any output column name are ignored.) + In typical use, the value of base is just + NULL, which means that any output columns that do + not match any object field will be filled with nulls. However, + if base isn't NULL, then + the values it contains will be used for unmatched columns. + + + To convert a JSON value to the SQL type of an output column, the + following rules are applied in sequence: + + + + A JSON null value is converted to an SQL null in all cases. + + + + + If the output column is of type json + or jsonb, the JSON value is just reproduced exactly. + + + + + If the output column is a composite (row) type, and the JSON value + is a JSON object, the fields of the object are converted to columns + of the output row type by recursive application of these rules. + + + + + Likewise, if the output column is an array type and the JSON value + is a JSON array, the elements of the JSON array are converted to + elements of the output array by recursive application of these + rules. + + + + + Otherwise, if the JSON value is a string, the contents of the + string are fed to the input conversion function for the column's + data type. + + + + + Otherwise, the ordinary text representation of the JSON value is + fed to the input conversion function for the column's data type. + + + + + + While the example below uses a constant JSON value, typical use would + be to reference a json or jsonb column + laterally from another table in the query's FROM + clause. Writing json_populate_record in + the FROM clause is good practice, since all of the + extracted columns are available for use without duplicate function + calls. + + + CREATE TYPE subrowtype AS (d int, e text); + CREATE TYPE myrowtype AS (a int, b text[], c subrowtype); + + + SELECT * FROM json_populate_record(NULL::myrowtype, + '{"a": 1, "b": ["2", "a b"], "c": {"d": 4, "e": "a b c"}, "x": "foo"}') + + + a | b | c +---+-----------+------------- + 1 | {2,"a b"} | (4,"a b c") + + + + + + + + jsonb_populate_record_valid + + jsonb_populate_record_valid ( base anyelement, from_json json ) + boolean + + + Function for testing jsonb_populate_record. Returns + true if a call to jsonb_populate_record + with the same arguments would finish without an error (that is, the + given JSON object is valid input); returns false otherwise.
+ + + CREATE TYPE jsb_char2 AS (a char(2)); + + + SELECT jsonb_populate_record_valid(NULL::jsb_char2, '{"a": "aaa"}'); + + + jsonb_populate_record_valid +----------------------------- + f +(1 row) + + + SELECT * FROM jsonb_populate_record(NULL::jsb_char2, '{"a": "aaa"}') q; + + +ERROR: value too long for type character(2) + + SELECT jsonb_populate_record_valid(NULL::jsb_char2, '{"a": "aa"}'); + + + jsonb_populate_record_valid +----------------------------- + t +(1 row) + + + SELECT * FROM jsonb_populate_record(NULL::jsb_char2, '{"a": "aa"}') q; + + + a +---- + aa +(1 row) + + + + + + + + json_populate_recordset + + json_populate_recordset ( base anyelement, from_json json ) + setof anyelement + + + + jsonb_populate_recordset + + jsonb_populate_recordset ( base anyelement, from_json jsonb ) + setof anyelement + + + Expands the top-level JSON array of objects to a set of rows having + the composite type of the base argument. + Each element of the JSON array is processed as described above + for json[b]_populate_record. + + + CREATE TYPE twoints AS (a int, b int); + + + SELECT * FROM json_populate_recordset(NULL::twoints, '[{"a":1,"b":2}, {"a":3,"b":4}]') + + + a | b +---+--- + 1 | 2 + 3 | 4 + + + + + + + + json_to_record + + json_to_record ( json ) + record + + + + jsonb_to_record + + jsonb_to_record ( jsonb ) + record + + + Expands the top-level JSON object to a row having the composite type + defined by an AS clause. (As with all functions + returning record, the calling query must explicitly + define the structure of the record with an AS + clause.) The output record is filled from fields of the JSON object, + in the same way as described above + for json[b]_populate_record. Since there is no + input record value, unmatched columns are always filled with nulls. + + + CREATE TYPE myrowtype AS (a int, b text); + + + SELECT * FROM json_to_record('{"a":1,"b":[1,2,3],"c":[1,2,3],"e":"bar","r": {"a": 123, "b": "a b c"}}') AS x(a int, b text, c int[], d text, r myrowtype) + + + a | b | c | d | r +---+---------+---------+---+--------------- + 1 | [1,2,3] | {1,2,3} | | (123,"a b c") + + + + + + + + json_to_recordset + + json_to_recordset ( json ) + setof record + + + + jsonb_to_recordset + + jsonb_to_recordset ( jsonb ) + setof record + + + Expands the top-level JSON array of objects to a set of rows having + the composite type defined by an AS clause. (As + with all functions returning record, the calling query + must explicitly define the structure of the record with + an AS clause.) Each element of the JSON array is + processed as described above + for json[b]_populate_record. + + + SELECT * FROM json_to_recordset('[{"a":1,"b":"foo"}, {"a":"2","c":"bar"}]') AS x(a int, b text) + + + a | b +---+----- + 1 | foo + 2 | + + + + + + + + jsonb_set + + jsonb_set ( target jsonb, path text[], new_value jsonb , create_if_missing boolean ) + jsonb + + + Returns target + with the item designated by path + replaced by new_value, or with + new_value added if + create_if_missing is true (which is the + default) and the item designated by path + does not exist. + All earlier steps in the path must exist, or + the target is returned unchanged. + As with the path oriented operators, negative integers that + appear in the path count from the end + of JSON arrays. + If the last path step is an array index that is out of range, + and create_if_missing is true, the new + value is added at the beginning of the array if the index is negative, + or at the end of the array if it is positive. 
+ + + jsonb_set('[{"f1":1,"f2":null},2,null,3]', '{0,f1}', '[2,3,4]', false) + [{"f1": [2, 3, 4], "f2": null}, 2, null, 3] + + + jsonb_set('[{"f1":1,"f2":null},2]', '{0,f3}', '[2,3,4]') + [{"f1": 1, "f2": null, "f3": [2, 3, 4]}, 2] + + + + + + + jsonb_set_lax + + jsonb_set_lax ( target jsonb, path text[], new_value jsonb , create_if_missing boolean , null_value_treatment text ) + jsonb + + + If new_value is not NULL, + behaves identically to jsonb_set. Otherwise behaves + according to the value + of null_value_treatment which must be one + of 'raise_exception', + 'use_json_null', 'delete_key', or + 'return_target'. The default is + 'use_json_null'. + + + jsonb_set_lax('[{"f1":1,"f2":null},2,null,3]', '{0,f1}', null) + [{"f1": null, "f2": null}, 2, null, 3] + + + jsonb_set_lax('[{"f1":99,"f2":null},2]', '{0,f3}', null, true, 'return_target') + [{"f1": 99, "f2": null}, 2] + + + + + + + jsonb_insert + + jsonb_insert ( target jsonb, path text[], new_value jsonb , insert_after boolean ) + jsonb + + + Returns target + with new_value inserted. If the item + designated by the path is an array + element, new_value will be inserted before + that item if insert_after is false (which + is the default), or after it + if insert_after is true. If the item + designated by the path is an object + field, new_value will be inserted only if + the object does not already contain that key. + All earlier steps in the path must exist, or + the target is returned unchanged. + As with the path oriented operators, negative integers that + appear in the path count from the end + of JSON arrays. + If the last path step is an array index that is out of range, the new + value is added at the beginning of the array if the index is negative, + or at the end of the array if it is positive. + + + jsonb_insert('{"a": [0,1,2]}', '{a, 1}', '"new_value"') + {"a": [0, "new_value", 1, 2]} + + + jsonb_insert('{"a": [0,1,2]}', '{a, 1}', '"new_value"', true) + {"a": [0, 1, "new_value", 2]} + + + + + + + json_strip_nulls + + json_strip_nulls ( target json ,strip_in_arrays boolean ) + json + + + + jsonb_strip_nulls + + jsonb_strip_nulls ( target jsonb ,strip_in_arrays boolean ) + jsonb + + + Deletes all object fields that have null values from the given JSON + value, recursively. + If strip_in_arrays is true (the default is false), + null array elements are also stripped. + Otherwise they are not stripped. Bare null values are never stripped. + + + json_strip_nulls('[{"f1":1, "f2":null}, 2, null, 3]') + [{"f1":1},2,null,3] + + + jsonb_strip_nulls('[1,2,null,3,4]', true) + [1,2,3,4] + + + + + + + + jsonb_path_exists + + jsonb_path_exists ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + boolean + + + Checks whether the JSON path returns any item for the specified JSON + value. + (This is useful only with SQL-standard JSON path expressions, not + predicate check + expressions, since those always return a value.) + If the vars argument is specified, it must + be a JSON object, and its fields provide named values to be + substituted into the jsonpath expression. + If the silent argument is specified and + is true, the function suppresses the same errors + as the @? and @@ operators do. + + + jsonb_path_exists('{"a":[1,2,3,4,5]}', '$.a[*] ? (@ >= $min && @ <= $max)', '{"min":2, "max":4}') + t + + + + + + + jsonb_path_match + + jsonb_path_match ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + boolean + + + Returns the SQL boolean result of a JSON path predicate check + for the specified JSON value. 
+ (This is useful only + with predicate + check expressions, not SQL-standard JSON path expressions, + since it will either fail or return NULL if the + path result is not a single boolean value.) + The optional vars + and silent arguments act the same as + for jsonb_path_exists. + + + jsonb_path_match('{"a":[1,2,3,4,5]}', 'exists($.a[*] ? (@ >= $min && @ <= $max))', '{"min":2, "max":4}') + t + + + + + + + jsonb_path_query + + jsonb_path_query ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + setof jsonb + + + Returns all JSON items returned by the JSON path for the specified + JSON value. + For SQL-standard JSON path expressions it returns the JSON + values selected from target. + For predicate + check expressions it returns the result of the predicate + check: true, false, + or null. + The optional vars + and silent arguments act the same as + for jsonb_path_exists. + + + SELECT * FROM jsonb_path_query('{"a":[1,2,3,4,5]}', '$.a[*] ? (@ >= $min && @ <= $max)', '{"min":2, "max":4}') + + + jsonb_path_query +------------------ + 2 + 3 + 4 + + + + + + + + jsonb_path_query_array + + jsonb_path_query_array ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + jsonb + + + Returns all JSON items returned by the JSON path for the specified + JSON value, as a JSON array. + The parameters are the same as + for jsonb_path_query. + + + jsonb_path_query_array('{"a":[1,2,3,4,5]}', '$.a[*] ? (@ >= $min && @ <= $max)', '{"min":2, "max":4}') + [2, 3, 4] + + + + + + + jsonb_path_query_first + + jsonb_path_query_first ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + jsonb + + + Returns the first JSON item returned by the JSON path for the + specified JSON value, or NULL if there are no + results. + The parameters are the same as + for jsonb_path_query. + + + jsonb_path_query_first('{"a":[1,2,3,4,5]}', '$.a[*] ? (@ >= $min && @ <= $max)', '{"min":2, "max":4}') + 2 + + + + + + + jsonb_path_exists_tz + + jsonb_path_exists_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + boolean + + + + jsonb_path_match_tz + + jsonb_path_match_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + boolean + + + + jsonb_path_query_tz + + jsonb_path_query_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + setof jsonb + + + + jsonb_path_query_array_tz + + jsonb_path_query_array_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + jsonb + + + + jsonb_path_query_first_tz + + jsonb_path_query_first_tz ( target jsonb, path jsonpath , vars jsonb , silent boolean ) + jsonb + + + These functions act like their counterparts described above without + the _tz suffix, except that these functions support + comparisons of date/time values that require timezone-aware + conversions. The example below requires interpretation of the + date-only value 2015-08-02 as a timestamp with time + zone, so the result depends on the current + setting. Due to this dependency, these + functions are marked as stable, which means these functions cannot be + used in indexes. Their counterparts are immutable, and so can be used + in indexes; but they will throw errors if asked to make such + comparisons. + + + jsonb_path_exists_tz('["2015-08-01 12:00:00-05"]', '$[*] ? (@.datetime() < "2015-08-02".datetime())') + t + + + + + + + jsonb_pretty + + jsonb_pretty ( jsonb ) + text + + + Converts the given JSON value to pretty-printed, indented text. 
+ + + jsonb_pretty('[{"f1":1,"f2":null}, 2]') + + +[ + { + "f1": 1, + "f2": null + }, + 2 +] + + + + + + + + json_typeof + + json_typeof ( json ) + text + + + + jsonb_typeof + + jsonb_typeof ( jsonb ) + text + + + Returns the type of the top-level JSON value as a text string. + Possible types are + object, array, + string, number, + boolean, and null. + (The null result should not be confused + with an SQL NULL; see the examples.) + + + json_typeof('-123.4') + number + + + json_typeof('null'::json) + null + + + json_typeof(NULL::json) IS NULL + t + + + + +
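+ + In practice, the set-returning functions in this table are usually applied to a JSON column rather than to a constant, relying on the implicit LATERAL behavior of function calls in FROM. A sketch (the table orders and its jsonb column info are assumed here purely for illustration): + +SELECT o.id, kv.key, kv.value +FROM orders AS o, + jsonb_each_text(o.info) AS kv; + + This produces one output row per key/value pair of each row's info document. +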
+
+ + + The SQL/JSON Path Language + + + SQL/JSON path language + + + + SQL/JSON path expressions specify item(s) to be retrieved + from a JSON value, similarly to XPath expressions used + for access to XML content. In PostgreSQL, + path expressions are implemented as the jsonpath + data type and can use any elements described in + . + + + + JSON query functions and operators + pass the provided path expression to the path engine + for evaluation. If the expression matches the queried JSON data, + the corresponding JSON item, or set of items, is returned. + If there is no match, the result will be NULL, + false, or an error, depending on the function. + Path expressions are written in the SQL/JSON path language + and can include arithmetic expressions and functions. + + + + A path expression consists of a sequence of elements allowed + by the jsonpath data type. + The path expression is normally evaluated from left to right, but + you can use parentheses to change the order of operations. + If the evaluation is successful, a sequence of JSON items is produced, + and the evaluation result is returned to the JSON query function + that completes the specified computation. + + + + To refer to the JSON value being queried (the + context item), use the $ variable + in the path expression. The first element of a path must always + be $. It can be followed by one or more + accessor operators, + which go down the JSON structure level by level to retrieve sub-items + of the context item. Each accessor operator acts on the + result(s) of the previous evaluation step, producing zero, one, or more + output items from each input item. + + + + For example, suppose you have some JSON data from a GPS tracker that you + would like to parse, such as: + +SELECT '{ + "track": { + "segments": [ + { + "location": [ 47.763, 13.4034 ], + "start time": "2018-10-14 10:05:14", + "HR": 73 + }, + { + "location": [ 47.706, 13.2635 ], + "start time": "2018-10-14 10:39:21", + "HR": 135 + } + ] + } +}' AS json \gset + + (The above example can be copied-and-pasted + into psql to set things up for the following + examples. Then psql will + expand :'json' into a suitably-quoted string + constant containing the JSON value.) + + + + To retrieve the available track segments, you need to use the + .key accessor + operator to descend through surrounding JSON objects, for example: + +=> SELECT jsonb_path_query(:'json', '$.track.segments'); + jsonb_path_query +-----------------------------------------------------------&zwsp;-----------------------------------------------------------&zwsp;--------------------------------------------- + [{"HR": 73, "location": [47.763, 13.4034], "start time": "2018-10-14 10:05:14"}, {"HR": 135, "location": [47.706, 13.2635], "start time": "2018-10-14 10:39:21"}] + + + + + To retrieve the contents of an array, you typically use the + [*] operator. 
+ The following example will return the location coordinates for all + the available track segments: + +=> SELECT jsonb_path_query(:'json', '$.track.segments[*].location'); + jsonb_path_query +------------------- + [47.763, 13.4034] + [47.706, 13.2635] + + Here we started with the whole JSON input value ($), + then the .track accessor selected the JSON object + associated with the "track" object key, then + the .segments accessor selected the JSON array + associated with the "segments" key within that + object, then the [*] accessor selected each element + of that array (producing a series of items), then + the .location accessor selected the JSON array + associated with the "location" key within each of + those objects. In this example, each of those objects had + a "location" key; but if any of them did not, + the .location accessor would have simply produced no + output for that input item. + + + + To return the coordinates of the first segment only, you can + specify the corresponding subscript in the [] + accessor operator. Recall that JSON array indexes are 0-relative: + +=> SELECT jsonb_path_query(:'json', '$.track.segments[0].location'); + jsonb_path_query +------------------- + [47.763, 13.4034] + + + + + The result of each path evaluation step can be processed + by one or more of the jsonpath operators and methods + listed in . + Each method name must be preceded by a dot. For example, + you can get the size of an array: + +=> SELECT jsonb_path_query(:'json', '$.track.segments.size()'); + jsonb_path_query +------------------ + 2 + + More examples of using jsonpath operators + and methods within path expressions appear below in + . + + + + A path can also contain + filter expressions that work similarly to the + WHERE clause in SQL. A filter expression begins with + a question mark and provides a condition in parentheses: + + +? (condition) + + + + + Filter expressions must be written just after the path evaluation step + to which they should apply. The result of that step is filtered to include + only those items that satisfy the provided condition. SQL/JSON defines + three-valued logic, so the condition can + produce true, false, + or unknown. The unknown value + plays the same role as SQL NULL and can be tested + for with the IS UNKNOWN predicate. Further path + evaluation steps use only those items for which the filter expression + returned true. + + + + The functions and operators that can be used in filter expressions are + listed in . Within a + filter expression, the @ variable denotes the value + being considered (i.e., one result of the preceding path step). You can + write accessor operators after @ to retrieve component + items. + + + + For example, suppose you would like to retrieve all heart rate values higher + than 130. You can achieve this as follows: + +=> SELECT jsonb_path_query(:'json', '$.track.segments[*].HR ? (@ > 130)'); + jsonb_path_query +------------------ + 135 + + + + + To get the start times of segments with such values, you have to + filter out irrelevant segments before selecting the start times, so the + filter expression is applied to the previous step, and the path used + in the condition is different: + +=> SELECT jsonb_path_query(:'json', '$.track.segments[*] ? (@.HR > 130)."start time"'); + jsonb_path_query +----------------------- + "2018-10-14 10:39:21" + + + + + You can use several filter expressions in sequence, if required. 
+ The following example selects start times of all segments that + contain locations with relevant coordinates and high heart rate values: + +=> SELECT jsonb_path_query(:'json', '$.track.segments[*] ? (@.location[1] < 13.4) ? (@.HR > 130)."start time"'); + jsonb_path_query +----------------------- + "2018-10-14 10:39:21" + + + + + Using filter expressions at different nesting levels is also allowed. + The following example first filters all segments by location, and then + returns high heart rate values for these segments, if available: + +=> SELECT jsonb_path_query(:'json', '$.track.segments[*] ? (@.location[1] < 13.4).HR ? (@ > 130)'); + jsonb_path_query +------------------ + 135 + + + + + You can also nest filter expressions within each other. + This example returns the size of the track if it contains any + segments with high heart rate values, or an empty sequence otherwise: + +=> SELECT jsonb_path_query(:'json', '$.track ? (exists(@.segments[*] ? (@.HR > 130))).segments.size()'); + jsonb_path_query +------------------ + 2 + + + + + Deviations from the SQL Standard + + PostgreSQL's implementation of the SQL/JSON path + language has the following deviations from the SQL/JSON standard. + + + + Boolean Predicate Check Expressions + + As an extension to the SQL standard, + a PostgreSQL path expression can be a + Boolean predicate, whereas the SQL standard allows predicates only within + filters. While SQL-standard path expressions return the relevant + element(s) of the queried JSON value, predicate check expressions + return the single three-valued jsonb result of the + predicate: true, + false, or null. + For example, we could write this SQL-standard filter expression: + +=> SELECT jsonb_path_query(:'json', '$.track.segments ?(@[*].HR > 130)'); + jsonb_path_query +-----------------------------------------------------------&zwsp;---------------------- + {"HR": 135, "location": [47.706, 13.2635], "start time": "2018-10-14 10:39:21"} + + The similar predicate check expression simply + returns true, indicating that a match exists: + +=> SELECT jsonb_path_query(:'json', '$.track.segments[*].HR > 130'); + jsonb_path_query +------------------ + true + + + + + + Predicate check expressions are required in the + @@ operator (and the + jsonb_path_match function), and should not be used + with the @? operator (or the + jsonb_path_exists function). + + + + + + Regular Expression Interpretation + + There are minor differences in the interpretation of regular + expression patterns used in like_regex filters, as + described in . + + + + + + Strict and Lax Modes + + When you query JSON data, the path expression may not match the + actual JSON data structure. An attempt to access a non-existent + member of an object or element of an array is defined as a + structural error. SQL/JSON path expressions have two modes + of handling structural errors: + + + + + + lax (default) — the path engine implicitly adapts + the queried data to the specified path. + Any structural errors that cannot be fixed as described below + are suppressed, producing no match. + + + + + strict — if a structural error occurs, an error is raised. + + + + + + Lax mode facilitates matching of a JSON document and path + expression when the JSON data does not conform to the expected schema. + If an operand does not match the requirements of a particular operation, + it can be automatically wrapped as an SQL/JSON array, or unwrapped by + converting its elements into an SQL/JSON sequence before performing + the operation. 
Also, comparison operators automatically unwrap their + operands in lax mode, so you can compare SQL/JSON arrays + out-of-the-box. An array of size 1 is considered equal to its sole element. + Automatic unwrapping is not performed when: + + + + The path expression contains type() or + size() methods that return the type + and the number of elements in the array, respectively. + + + + + The queried JSON data contain nested arrays. In this case, only + the outermost array is unwrapped, while all the inner arrays + remain unchanged. Thus, implicit unwrapping can only go one + level down within each path evaluation step. + + + + + + + For example, when querying the GPS data listed above, you can + ignore the fact that it stores an array of segments + when using lax mode: + +=> SELECT jsonb_path_query(:'json', 'lax $.track.segments.location'); + jsonb_path_query +------------------- + [47.763, 13.4034] + [47.706, 13.2635] + + + + + In strict mode, the specified path must exactly match the structure of + the queried JSON document, so using this path + expression will cause an error: + +=> SELECT jsonb_path_query(:'json', 'strict $.track.segments.location'); +ERROR: jsonpath member accessor can only be applied to an object + + To get the same result as in lax mode, you have to explicitly unwrap the + segments array: + +=> SELECT jsonb_path_query(:'json', 'strict $.track.segments[*].location'); + jsonb_path_query +------------------- + [47.763, 13.4034] + [47.706, 13.2635] + + + + + The unwrapping behavior of lax mode can lead to surprising results. For + instance, the following query using the .** accessor + selects every HR value twice: + +=> SELECT jsonb_path_query(:'json', 'lax $.**.HR'); + jsonb_path_query +------------------ + 73 + 135 + 73 + 135 + + This happens because the .** accessor selects both + the segments array and each of its elements, while + the .HR accessor automatically unwraps arrays when + using lax mode. To avoid surprising results, we recommend using + the .** accessor only in strict mode. The + following query selects each HR value just once: + +=> SELECT jsonb_path_query(:'json', 'strict $.**.HR'); + jsonb_path_query +------------------ + 73 + 135 + + + + + The unwrapping of arrays can also lead to unexpected results. Consider this + example, which selects all the location arrays: + +=> SELECT jsonb_path_query(:'json', 'lax $.track.segments[*].location'); + jsonb_path_query +------------------- + [47.763, 13.4034] + [47.706, 13.2635] +(2 rows) + + As expected, it returns the full arrays. But applying a filter expression + causes the arrays to be unwrapped to evaluate each item, returning only the + items that match the expression: + +=> SELECT jsonb_path_query(:'json', 'lax $.track.segments[*].location ?(@[*] > 15)'); + jsonb_path_query +------------------ + 47.763 + 47.706 +(2 rows) + + This happens even though the full arrays are selected by the path + expression. Use strict mode to restore selection of the arrays: + +=> SELECT jsonb_path_query(:'json', 'strict $.track.segments[*].location ?(@[*] > 15)'); + jsonb_path_query +------------------- + [47.763, 13.4034] + [47.706, 13.2635] +(2 rows) + + + + + + SQL/JSON Path Operators and Methods + + + The table below shows the operators and + methods available in jsonpath. Note that while the unary + operators and methods can be applied to multiple values resulting from a + preceding path step, the binary operators (addition etc.) can only be + applied to single values.
In lax mode, methods applied to an array will be + executed for each value in the array. The exceptions are + .type() and .size(), which apply to + the array itself. + + + + <type>jsonpath</type> Operators and Methods + + + + + Operator/Method + + + Description + + + Example(s) + + + + + + + + number + number + number + + + Addition + + + jsonb_path_query('[2]', '$[0] + 3') + 5 + + + + + + + number + number + + + Unary plus (no operation); unlike addition, this can iterate over + multiple values + + + jsonb_path_query_array('{"x": [2,3,4]}', '+ $.x') + [2, 3, 4] + + + + + + number - number + number + + + Subtraction + + + jsonb_path_query('[2]', '7 - $[0]') + 5 + + + + + + - number + number + + + Negation; unlike subtraction, this can iterate over + multiple values + + + jsonb_path_query_array('{"x": [2,3,4]}', '- $.x') + [-2, -3, -4] + + + + + + number * number + number + + + Multiplication + + + jsonb_path_query('[4]', '2 * $[0]') + 8 + + + + + + number / number + number + + + Division + + + jsonb_path_query('[8.5]', '$[0] / 2') + 4.2500000000000000 + + + + + + number % number + number + + + Modulo (remainder) + + + jsonb_path_query('[32]', '$[0] % 10') + 2 + + + + + + value . type() + string + + + Type of the JSON item (see json_typeof) + + + jsonb_path_query_array('[1, "2", {}]', '$[*].type()') + ["number", "string", "object"] + + + + + + value . size() + number + + + Size of the JSON item (number of array elements, or 1 if not an + array) + + + jsonb_path_query('{"m": [11, 15]}', '$.m.size()') + 2 + + + + + + value . boolean() + boolean + + + Boolean value converted from a JSON boolean, number, or string + + + jsonb_path_query_array('[1, "yes", false]', '$[*].boolean()') + [true, true, false] + + + + + + value . string() + string + + + String value converted from a JSON boolean, number, string, or + datetime + + + jsonb_path_query_array('[1.23, "xyz", false]', '$[*].string()') + ["1.23", "xyz", "false"] + + + jsonb_path_query('"2023-08-15 12:34:56"', '$.timestamp().string()') + "2023-08-15T12:34:56" + + + + + + value . double() + number + + + Approximate floating-point number converted from a JSON number or + string + + + jsonb_path_query('{"len": "1.9"}', '$.len.double() * 2') + 3.8 + + + + + + number . ceiling() + number + + + Nearest integer greater than or equal to the given number + + + jsonb_path_query('{"h": 1.3}', '$.h.ceiling()') + 2 + + + + + + number . floor() + number + + + Nearest integer less than or equal to the given number + + + jsonb_path_query('{"h": 1.7}', '$.h.floor()') + 1 + + + + + + number . abs() + number + + + Absolute value of the given number + + + jsonb_path_query('{"z": -0.3}', '$.z.abs()') + 0.3 + + + + + + value . bigint() + bigint + + + Big integer value converted from a JSON number or string + + + jsonb_path_query('{"len": "9876543219"}', '$.len.bigint()') + 9876543219 + + + + + + value . decimal( [ precision [ , scale ] ] ) + decimal + + + Rounded decimal value converted from a JSON number or string + (precision and scale must be + integer values) + + + jsonb_path_query('1234.5678', '$.decimal(6, 2)') + 1234.57 + + + + + + value . integer() + integer + + + Integer value converted from a JSON number or string + + + jsonb_path_query('{"len": "12345"}', '$.len.integer()') + 12345 + + + + + + value . number() + numeric + + + Numeric value converted from a JSON number or string + + + jsonb_path_query('{"len": "123.45"}', '$.len.number()') + 123.45 + + + + + + string . 
datetime() + datetime_type + (see note) + + + Date/time value converted from a string + + + jsonb_path_query('["2015-8-1", "2015-08-12"]', '$[*] ? (@.datetime() < "2015-08-2".datetime())') + "2015-8-1" + + + + + + string . datetime(template) + datetime_type + (see note) + + + Date/time value converted from a string using the + specified to_timestamp template + + + jsonb_path_query_array('["12:30", "18:40"]', '$[*].datetime("HH24:MI")') + ["12:30:00", "18:40:00"] + + + + + + string . date() + date + + + Date value converted from a string + + + jsonb_path_query('"2023-08-15"', '$.date()') + "2023-08-15" + + + + + + string . time() + time without time zone + + + Time without time zone value converted from a string + + + jsonb_path_query('"12:34:56"', '$.time()') + "12:34:56" + + + + + + string . time(precision) + time without time zone + + + Time without time zone value converted from a string, with fractional + seconds adjusted to the given precision + + + jsonb_path_query('"12:34:56.789"', '$.time(2)') + "12:34:56.79" + + + + + + string . time_tz() + time with time zone + + + Time with time zone value converted from a string + + + jsonb_path_query('"12:34:56 +05:30"', '$.time_tz()') + "12:34:56+05:30" + + + + + + string . time_tz(precision) + time with time zone + + + Time with time zone value converted from a string, with fractional + seconds adjusted to the given precision + + + jsonb_path_query('"12:34:56.789 +05:30"', '$.time_tz(2)') + "12:34:56.79+05:30" + + + + + + string . timestamp() + timestamp without time zone + + + Timestamp without time zone value converted from a string + + + jsonb_path_query('"2023-08-15 12:34:56"', '$.timestamp()') + "2023-08-15T12:34:56" + + + + + + string . timestamp(precision) + timestamp without time zone + + + Timestamp without time zone value converted from a string, with + fractional seconds adjusted to the given precision + + + jsonb_path_query('"2023-08-15 12:34:56.789"', '$.timestamp(2)') + "2023-08-15T12:34:56.79" + + + + + + string . timestamp_tz() + timestamp with time zone + + + Timestamp with time zone value converted from a string + + + jsonb_path_query('"2023-08-15 12:34:56 +05:30"', '$.timestamp_tz()') + "2023-08-15T12:34:56+05:30" + + + + + + string . timestamp_tz(precision) + timestamp with time zone + + + Timestamp with time zone value converted from a string, with fractional + seconds adjusted to the given precision + + + jsonb_path_query('"2023-08-15 12:34:56.789 +05:30"', '$.timestamp_tz(2)') + "2023-08-15T12:34:56.79+05:30" + + + + + + object . keyvalue() + array + + + The object's key-value pairs, represented as an array of objects + containing three fields: "key", + "value", and "id"; + "id" is a unique identifier of the object the + key-value pair belongs to + + + jsonb_path_query_array('{"x": "20", "y": 32}', '$.keyvalue()') + [{"id": 0, "key": "x", "value": "20"}, {"id": 0, "key": "y", "value": 32}] + + + + +
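+ + The methods above can be chained, each one applying to the result of the preceding path step. For instance, combining two of the methods from this table (the value is contrived, purely to show the chaining): + + jsonb_path_query('{"len": "1.9"}', '$.len.double().ceiling()') + 2 +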
+ + + + The result type of the datetime() and + datetime(template) + methods can be date, timetz, time, + timestamptz, or timestamp. + Both methods determine their result type dynamically. + + + The datetime() method sequentially tries to + match its input string to the ISO formats + for date, timetz, time, + timestamptz, and timestamp. It stops on + the first matching format and emits the corresponding data type. + + + The datetime(template) + method determines the result type according to the fields used in the + provided template string. + + + The datetime() and + datetime(template) methods + use the same parsing rules as the to_timestamp SQL + function does (see ), with three + exceptions. First, these methods don't allow unmatched template + patterns. Second, only the following separators are allowed in the + template string: minus sign, period, solidus (slash), comma, apostrophe, + semicolon, colon and space. Third, separators in the template string + must exactly match the input string. + + + If different date/time types need to be compared, an implicit cast is + applied. A date value can be cast to timestamp + or timestamptz, timestamp can be cast to + timestamptz, and time to timetz. + However, all but the first of these conversions depend on the current + setting, and thus can only be performed + within timezone-aware jsonpath functions. Similarly, other + date/time-related methods that convert strings to date/time types + also do this casting, which may involve the current + setting. Therefore, these conversions can + also only be performed within timezone-aware jsonpath + functions. + + + + + shows the available + filter expression elements. + + + + <type>jsonpath</type> Filter Expression Elements + + + + + Predicate/Value + + + Description + + + Example(s) + + + + + + + + value == value + boolean + + + Equality comparison (this, and the other comparison operators, work on + all JSON scalar values) + + + jsonb_path_query_array('[1, "a", 1, 3]', '$[*] ? (@ == 1)') + [1, 1] + + + jsonb_path_query_array('[1, "a", 1, 3]', '$[*] ? (@ == "a")') + ["a"] + + + + + + value != value + boolean + + + value <> value + boolean + + + Non-equality comparison + + + jsonb_path_query_array('[1, 2, 1, 3]', '$[*] ? (@ != 1)') + [2, 3] + + + jsonb_path_query_array('["a", "b", "c"]', '$[*] ? (@ <> "b")') + ["a", "c"] + + + + + + value < value + boolean + + + Less-than comparison + + + jsonb_path_query_array('[1, 2, 3]', '$[*] ? (@ < 2)') + [1] + + + + + + value <= value + boolean + + + Less-than-or-equal-to comparison + + + jsonb_path_query_array('["a", "b", "c"]', '$[*] ? (@ <= "b")') + ["a", "b"] + + + + + + value > value + boolean + + + Greater-than comparison + + + jsonb_path_query_array('[1, 2, 3]', '$[*] ? (@ > 2)') + [3] + + + + + + value >= value + boolean + + + Greater-than-or-equal-to comparison + + + jsonb_path_query_array('[1, 2, 3]', '$[*] ? (@ >= 2)') + [2, 3] + + + + + + true + boolean + + + JSON constant true + + + jsonb_path_query('[{"name": "John", "parent": false}, {"name": "Chris", "parent": true}]', '$[*] ? (@.parent == true)') + {"name": "Chris", "parent": true} + + + + + + false + boolean + + + JSON constant false + + + jsonb_path_query('[{"name": "John", "parent": false}, {"name": "Chris", "parent": true}]', '$[*] ? 
(@.parent == false)') + {"name": "John", "parent": false} + + + + + + null + value + + + JSON constant null (note that, unlike in SQL, + comparison to null works normally) + + + jsonb_path_query('[{"name": "Mary", "job": null}, {"name": "Michael", "job": "driver"}]', '$[*] ? (@.job == null) .name') + "Mary" + + + + + + boolean && boolean + boolean + + + Boolean AND + + + jsonb_path_query('[1, 3, 7]', '$[*] ? (@ > 1 && @ < 5)') + 3 + + + + + + boolean || boolean + boolean + + + Boolean OR + + + jsonb_path_query('[1, 3, 7]', '$[*] ? (@ < 1 || @ > 5)') + 7 + + + + + + ! boolean + boolean + + + Boolean NOT + + + jsonb_path_query('[1, 3, 7]', '$[*] ? (!(@ < 5))') + 7 + + + + + + boolean is unknown + boolean + + + Tests whether a Boolean condition is unknown. + + + jsonb_path_query('[-1, 2, 7, "foo"]', '$[*] ? ((@ > 0) is unknown)') + "foo" + + + + + + string like_regex string flag string + boolean + + + Tests whether the first operand matches the regular expression + given by the second operand, optionally with modifications + described by a string of flag characters (see + ). + + + jsonb_path_query_array('["abc", "abd", "aBdC", "abdacb", "babc"]', '$[*] ? (@ like_regex "^ab.*c")') + ["abc", "abdacb"] + + + jsonb_path_query_array('["abc", "abd", "aBdC", "abdacb", "babc"]', '$[*] ? (@ like_regex "^ab.*c" flag "i")') + ["abc", "aBdC", "abdacb"] + + + + + + string starts with string + boolean + + + Tests whether the second operand is an initial substring of the first + operand. + + + jsonb_path_query('["John Smith", "Mary Stone", "Bob Johnson"]', '$[*] ? (@ starts with "John")') + "John Smith" + + + + + + exists ( path_expression ) + boolean + + + Tests whether a path expression matches at least one SQL/JSON item. + Returns unknown if the path expression would result + in an error; the second example uses this to avoid a no-such-key error + in strict mode. + + + jsonb_path_query('{"x": [1, 2], "y": [2, 4]}', 'strict $.* ? (exists (@ ? (@[*] > 2)))') + [2, 4] + + + jsonb_path_query_array('{"value": 41}', 'strict $ ? (exists (@.name)) .name') + [] + + + + +
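+ + The filter elements above can be combined within a single filter expression. The following sketch (with invented data) mixes a comparison, the && operator, and like_regex to select adults whose names start with J: + + jsonb_path_query_array('[{"name": "John", "age": 20}, {"name": "Jane", "age": 17}]', '$[*] ? (@.age >= 18 && @.name like_regex "^J")') + [{"age": 20, "name": "John"}] +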
+ +
+ + + SQL/JSON Regular Expressions + + + LIKE_REGEX + in SQL/JSON + + + + SQL/JSON path expressions allow matching text to a regular expression + with the like_regex filter. For example, the + following SQL/JSON path query would case-insensitively match all + strings in an array that start with an English vowel: + +$[*] ? (@ like_regex "^[aeiou]" flag "i") + + + + + The optional flag string may include one or more of + the characters + i for case-insensitive match, + m to allow ^ + and $ to match at newlines, + s to allow . to match a newline, + and q to quote the whole pattern (reducing the + behavior to a simple substring match). + + + + The SQL/JSON standard borrows its definition for regular expressions + from the LIKE_REGEX operator, which in turn uses the + XQuery standard. PostgreSQL does not currently support the + LIKE_REGEX operator. Therefore, + the like_regex filter is implemented using the + POSIX regular expression engine described in + . This leads to various minor + discrepancies from standard SQL/JSON behavior, which are cataloged in + . + Note, however, that the flag-letter incompatibilities described there + do not apply to SQL/JSON, as it translates the XQuery flag letters to + match what the POSIX engine expects. + + + + Keep in mind that the pattern argument of like_regex + is a JSON path string literal, written according to the rules given in + . This means in particular that any + backslashes you want to use in the regular expression must be doubled. + For example, to match string values of the root document that contain + only digits: + +$.* ? (@ like_regex "^\\d+$") + + + +
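+ + Putting this together, the pattern above can be used in a complete query; note that with standard_conforming_strings on (the default), the doubled backslash passes through the SQL string literal unchanged to the jsonpath parser: + +SELECT jsonb_path_query_array('{"a": "123", "b": "12x"}', '$.* ? (@ like_regex "^\\d+$")'); + + jsonb_path_query_array +------------------------ + ["123"] +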
+ + + SQL/JSON Query Functions + + SQL/JSON functions JSON_EXISTS(), + JSON_QUERY(), and JSON_VALUE() + described below can be used + to query JSON documents. Each of these functions applies a + path_expression (an SQL/JSON path query) to a + context_item (the document). See + the description of the SQL/JSON path language above + for more details on what + the path_expression can contain. The + path_expression can also reference variables, + whose values are specified with their respective names in the + PASSING clause that is supported by each function. + context_item can be a jsonb value + or a character string that can be successfully cast to jsonb. + + + + SQL/JSON Query Functions + + + + + Function signature + + + Description + + + Example(s) + + + + + + + json_exists + +JSON_EXISTS ( +context_item, path_expression + PASSING { value AS varname } , ... +{ TRUE | FALSE | UNKNOWN | ERROR } ON ERROR ) boolean + + + + + + Returns true if the SQL/JSON path_expression + applied to the context_item yields any + items, false otherwise. + + + + + The ON ERROR clause specifies the behavior if + an error occurs during path_expression + evaluation. Specifying ERROR will cause an error to + be thrown with the appropriate message. Other options include + returning the boolean values FALSE or + TRUE, or the value UNKNOWN, which + is actually an SQL NULL. The default when no ON ERROR + clause is specified is to return the boolean value + FALSE. + + + + + Examples: + + + JSON_EXISTS(jsonb '{"key1": [1,2,3]}', 'strict $.key1[*] ? (@ > $x)' PASSING 2 AS x) + t + + + JSON_EXISTS(jsonb '{"a": [1,2,3]}', 'lax $.a[5]' ERROR ON ERROR) + f + + + JSON_EXISTS(jsonb '{"a": [1,2,3]}', 'strict $.a[5]' ERROR ON ERROR) + + +ERROR: jsonpath array subscript is out of bounds + + + + + + json_query + +JSON_QUERY ( +context_item, path_expression + PASSING { value AS varname } , ... + RETURNING data_type FORMAT JSON ENCODING UTF8 + { WITHOUT | WITH { CONDITIONAL | UNCONDITIONAL } } ARRAY WRAPPER + { KEEP | OMIT } QUOTES ON SCALAR STRING + { ERROR | NULL | EMPTY { ARRAY | OBJECT } | DEFAULT expression } ON EMPTY + { ERROR | NULL | EMPTY { ARRAY | OBJECT } | DEFAULT expression } ON ERROR ) jsonb + + + + + + Returns the result of applying the SQL/JSON + path_expression to the + context_item. + + + + + By default, the result is returned as a value of type jsonb, + though the RETURNING clause can be used to return + the result as some other type to which it can be successfully coerced. + + + + + If the path expression may return multiple values, it might be necessary + to wrap those values using the WITH WRAPPER clause to + make the result a valid JSON string, because the default behavior is to not wrap + them, as if WITHOUT WRAPPER were specified. The + WITH WRAPPER clause is by default taken to mean + WITH UNCONDITIONAL WRAPPER, which means that even a + single result value will be wrapped. To apply the wrapper only when + multiple values are present, specify WITH CONDITIONAL WRAPPER. + Getting multiple values as the result will be treated as an error if + WITHOUT WRAPPER is specified. + + + + + If the result is a scalar string, by default, the returned value will + be surrounded by quotes, making it a valid JSON value. This behavior can be made + explicit by specifying KEEP QUOTES. Conversely, + quotes can be omitted by specifying OMIT QUOTES. + To ensure that the result is a valid JSON value, OMIT QUOTES + cannot be specified when WITH WRAPPER is also + specified. + + + + + The ON EMPTY clause specifies the behavior if + evaluating path_expression yields an empty + set.
The ON ERROR clause specifies the behavior + if an error occurs when evaluating path_expression, + when coercing the result value to the RETURNING type, + or when evaluating the ON EMPTY expression if the + path_expression evaluation returns an empty + set. + + + + + For both ON EMPTY and ON ERROR, + specifying ERROR will cause an error to be thrown with + the appropriate message. Other options include returning an SQL NULL, an + empty array (EMPTY ARRAY), + an empty object (EMPTY OBJECT), or a user-specified + expression (DEFAULT expression) + that can be coerced to jsonb or the type specified in RETURNING. + The default when ON EMPTY or ON ERROR + is not specified is to return an SQL NULL value. + + + + + Examples: + + + JSON_QUERY(jsonb '[1,[2,3],null]', 'lax $[*][$off]' PASSING 1 AS off WITH CONDITIONAL WRAPPER) + 3 + + + JSON_QUERY(jsonb '{"a": "[1, 2]"}', 'lax $.a' OMIT QUOTES) + [1, 2] + + + JSON_QUERY(jsonb '{"a": "[1, 2]"}', 'lax $.a' RETURNING int[] OMIT QUOTES ERROR ON ERROR) + + +ERROR: malformed array literal: "[1, 2]" +DETAIL: Missing "]" after array dimensions. + + + + + + + json_value + +JSON_VALUE ( +context_item, path_expression + PASSING { value AS varname } , ... + RETURNING data_type + { ERROR | NULL | DEFAULT expression } ON EMPTY + { ERROR | NULL | DEFAULT expression } ON ERROR ) text + + + + + + Returns the result of applying the SQL/JSON + path_expression to the + context_item. + + + + + Only use JSON_VALUE() if the extracted value is + expected to be a single SQL/JSON scalar item; + getting multiple values will be treated as an error. If you expect that + the extracted value might be an object or an array, use the + JSON_QUERY function instead. + + + + + By default, the result, which must be a single scalar value, is + returned as a value of type text, though the + RETURNING clause can be used to return the result as some + other type to which it can be successfully coerced. + + + + + The ON ERROR and ON EMPTY + clauses have semantics similar to those described for + JSON_QUERY, except that the set of values returned in + lieu of throwing an error is different. + + + + + Note that scalar strings returned by JSON_VALUE + always have their quotes removed, equivalent to specifying + OMIT QUOTES in JSON_QUERY. + + + + + Examples: + + + JSON_VALUE(jsonb '"123.45"', '$' RETURNING float) + 123.45 + + + JSON_VALUE(jsonb '"03:04 2015-02-01"', '$.datetime("HH24:MI YYYY-MM-DD")' RETURNING date) + 2015-02-01 + + + JSON_VALUE(jsonb '[1,2]', 'strict $[$off]' PASSING 1 AS off) + 2 + + + JSON_VALUE(jsonb '[1,2]', 'strict $[*]' DEFAULT 9 ON ERROR) + 9 + + + + + +
+ + + The context_item expression is converted to + jsonb by an implicit cast if the expression is not already of + type jsonb. Note, however, that any parsing errors that occur + during that conversion are thrown unconditionally, that is, are not + handled according to the (specified or implicit) ON ERROR + clause. + + + + + JSON_VALUE() returns an SQL NULL if + path_expression returns a JSON + null, whereas JSON_QUERY() returns + the JSON null as is. + + +
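+ + The difference described in the second note can be observed directly: + +SELECT JSON_QUERY(jsonb '{"a": null}', '$.a') AS q, + JSON_VALUE(jsonb '{"a": null}', '$.a') AS v; + + q | v +------+--- + null | +(1 row) + + Here q holds the JSON null value, whereas v is an SQL NULL. +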
+ + + JSON_TABLE + + json_table + + + + JSON_TABLE is an SQL/JSON function which + queries JSON data + and presents the results as a relational view, which can be accessed as a + regular SQL table. You can use JSON_TABLE inside + the FROM clause of a SELECT, + UPDATE, or DELETE and as a data source + in a MERGE statement. + + + + Taking JSON data as input, JSON_TABLE uses a JSON path + expression to extract a part of the provided data to use as a + row pattern for the constructed view. Each SQL/JSON + value given by the row pattern serves as the source for a separate row in the + constructed view. + + + + To split the row pattern into columns, JSON_TABLE + provides the COLUMNS clause that defines the + schema of the created view. For each column, a separate JSON path expression + can be specified to be evaluated against the row pattern to get an SQL/JSON + value that will become the value for the specified column in a given output + row. + + + + JSON data stored at a nested level of the row pattern can be extracted using + the NESTED PATH clause. Each + NESTED PATH clause can be used to generate one or more + columns using the data from a nested level of the row pattern. Those + columns can be specified using a COLUMNS clause that + looks similar to the top-level COLUMNS clause. Rows constructed from + NESTED COLUMNS are called child rows and are joined + against the row constructed from the columns specified in the parent + COLUMNS clause to get the row in the final view. Child + columns themselves may contain a NESTED PATH + specification, thus making it possible to extract data located at arbitrary nesting + levels. Columns produced by multiple NESTED PATHs at the + same level are considered to be siblings of each + other, and their rows after joining with the parent row are combined using + UNION. + + + + The rows produced by JSON_TABLE are laterally + joined to the row that generated them, so you do not have to explicitly join + the constructed view with the original table holding JSON + data. + + + + The syntax is: + + + +JSON_TABLE ( + context_item, path_expression AS json_path_name PASSING { value AS varname } , ... + COLUMNS ( json_table_column , ... ) + { ERROR | EMPTY ARRAY} ON ERROR +) + + +where json_table_column is: + + name FOR ORDINALITY + | name type + FORMAT JSON ENCODING UTF8 + PATH path_expression + { WITHOUT | WITH { CONDITIONAL | UNCONDITIONAL } } ARRAY WRAPPER + { KEEP | OMIT } QUOTES ON SCALAR STRING + { ERROR | NULL | EMPTY { ARRAY | OBJECT } | DEFAULT expression } ON EMPTY + { ERROR | NULL | EMPTY { ARRAY | OBJECT } | DEFAULT expression } ON ERROR + | name type EXISTS PATH path_expression + { ERROR | TRUE | FALSE | UNKNOWN } ON ERROR + | NESTED PATH path_expression AS json_path_name COLUMNS ( json_table_column , ... ) + + + + Each syntax element is described below in more detail. + + + + + + context_item, path_expression AS json_path_name PASSING { value AS varname } , ... + + + + The context_item specifies the input document + to query, the path_expression is an SQL/JSON + path expression defining the query, and json_path_name + is an optional name for the path_expression. + The optional PASSING clause provides data values for + the variables mentioned in the path_expression. + The result of the input data evaluation using the aforementioned elements + is called the row pattern, which is used as the + source for row values in the constructed view. + + + + + + + COLUMNS ( json_table_column , ... ) + + + + + The COLUMNS clause defines the schema of the + constructed view.
In this clause, you can specify each column to be + filled with an SQL/JSON value obtained by applying a JSON path expression + against the row pattern. json_table_column has + the following variants: + + + + + + name FOR ORDINALITY + + + + Adds an ordinality column that provides sequential row numbering starting + from 1. Each NESTED PATH (see below) gets its own + counter for any nested ordinality columns. + + + + + + + name type + FORMAT JSON ENCODING UTF8 + PATH path_expression + + + + Inserts an SQL/JSON value obtained by applying + path_expression against the row pattern into + the view's output row after coercing it to the specified + type. + + + Specifying FORMAT JSON makes it explicit that you + expect the value to be a valid json object. It only + makes sense to specify FORMAT JSON if + type is one of bpchar, + bytea, character varying, name, + json, jsonb, text, or a domain over + these types. + + + Optionally, you can specify WRAPPER and + QUOTES clauses to format the output. Note that + specifying OMIT QUOTES overrides + FORMAT JSON if also specified, because unquoted + literals do not constitute valid json values. + + + Optionally, you can use ON EMPTY and + ON ERROR clauses to specify whether to throw an error + or return the specified value when the result of JSON path evaluation is + empty and when an error occurs during JSON path evaluation or when + coercing the SQL/JSON value to the specified type, respectively. The + default for both is to return a NULL value. + + + + This clause is internally turned into and has the same semantics as + JSON_VALUE or JSON_QUERY. + The latter applies if the specified type is not a scalar type or if any of the + FORMAT JSON, WRAPPER, or + QUOTES clauses is present. + + + + + + + + name type + EXISTS PATH path_expression + + + + Inserts a boolean value obtained by applying + path_expression against the row pattern + into the view's output row after coercing it to the specified + type. + + + The value corresponds to whether applying the PATH + expression to the row pattern yields any values. + + + The specified type should have a cast from the + boolean type. + + + Optionally, you can use ON ERROR to specify whether to + throw an error or return the specified value when an error occurs during + JSON path evaluation or when coercing the SQL/JSON value to the specified + type. The default is to return a boolean value + FALSE. + + + + This clause is internally turned into and has the same semantics as + JSON_EXISTS. + + + + + + + + NESTED PATH path_expression AS json_path_name + COLUMNS ( json_table_column , ... ) + + + + + Extracts SQL/JSON values from nested levels of the row pattern, + generates one or more columns as defined by the COLUMNS + subclause, and inserts the extracted SQL/JSON values into those + columns. The json_table_column + expression in the COLUMNS subclause uses the same + syntax as in the parent COLUMNS clause. + + + + The NESTED PATH syntax is recursive, + so you can go down multiple nested levels by specifying several + NESTED PATH subclauses within each other. + It allows you to unnest the hierarchy of JSON objects and arrays + in a single function invocation rather than chaining several + JSON_TABLE expressions in an SQL statement. + + + + + + + + In each variant of json_table_column described + above, if the PATH clause is omitted, path expression + $.name is used, where + name is the provided column name. + + + + + + + + + AS json_path_name + + + + + The optional json_path_name serves as an + identifier of the provided path_expression.
+ The name must be unique and distinct from the column names. + + + + + + + { ERROR | EMPTY } ON ERROR + + + + + The optional ON ERROR can be used to specify how to + handle errors when evaluating the top-level + path_expression. Use ERROR + if you want the errors to be thrown and EMPTY to + return an empty table, that is, a table containing 0 rows. Note that + this clause does not affect the errors that occur when evaluating + columns, for which the behavior depends on whether the + ON ERROR clause is specified against a given column. + + + + + + Examples + + + In the examples that follow, the following table containing JSON data + will be used: + + +CREATE TABLE my_films ( js jsonb ); + +INSERT INTO my_films VALUES ( +'{ "favorites" : [ + { "kind" : "comedy", "films" : [ + { "title" : "Bananas", + "director" : "Woody Allen"}, + { "title" : "The Dinner Game", + "director" : "Francis Veber" } ] }, + { "kind" : "horror", "films" : [ + { "title" : "Psycho", + "director" : "Alfred Hitchcock" } ] }, + { "kind" : "thriller", "films" : [ + { "title" : "Vertigo", + "director" : "Alfred Hitchcock" } ] }, + { "kind" : "drama", "films" : [ + { "title" : "Yojimbo", + "director" : "Akira Kurosawa" } ] } + ] }'); + + + + + The following query shows how to use JSON_TABLE to + turn the JSON objects in the my_films table + to a view containing columns for the keys kind, + title, and director contained in + the original JSON along with an ordinality column: + + +SELECT jt.* FROM + my_films, + JSON_TABLE (js, '$.favorites[*]' COLUMNS ( + id FOR ORDINALITY, + kind text PATH '$.kind', + title text PATH '$.films[*].title' WITH WRAPPER, + director text PATH '$.films[*].director' WITH WRAPPER)) AS jt; + + + + id | kind | title | director +----+----------+--------------------------------+---------------------------------- + 1 | comedy | ["Bananas", "The Dinner Game"] | ["Woody Allen", "Francis Veber"] + 2 | horror | ["Psycho"] | ["Alfred Hitchcock"] + 3 | thriller | ["Vertigo"] | ["Alfred Hitchcock"] + 4 | drama | ["Yojimbo"] | ["Akira Kurosawa"] +(4 rows) + + + + + The following is a modified version of the above query to show the + usage of PASSING arguments in the filter specified in + the top-level JSON path expression and the various options for the + individual columns: + + +SELECT jt.* FROM + my_films, + JSON_TABLE (js, '$.favorites[*] ? (@.films[*].director == $filter)' + PASSING 'Alfred Hitchcock' AS filter + COLUMNS ( + id FOR ORDINALITY, + kind text PATH '$.kind', + title text FORMAT JSON PATH '$.films[*].title' OMIT QUOTES, + director text PATH '$.films[*].director' KEEP QUOTES)) AS jt; + + + + id | kind | title | director +----+----------+---------+-------------------- + 1 | horror | Psycho | "Alfred Hitchcock" + 2 | thriller | Vertigo | "Alfred Hitchcock" +(2 rows) + + + + + The following is a modified version of the above query to show the usage + of NESTED PATH for populating title and director + columns, illustrating how they are joined to the parent columns id and + kind: + + +SELECT jt.* FROM + my_films, + JSON_TABLE ( js, '$.favorites[*] ? 
(@.films[*].director == $filter)' + PASSING 'Alfred Hitchcock' AS filter + COLUMNS ( + id FOR ORDINALITY, + kind text PATH '$.kind', + NESTED PATH '$.films[*]' COLUMNS ( + title text FORMAT JSON PATH '$.title' OMIT QUOTES, + director text PATH '$.director' KEEP QUOTES))) AS jt; + + + + id | kind | title | director +----+----------+---------+-------------------- + 1 | horror | Psycho | "Alfred Hitchcock" + 2 | thriller | Vertigo | "Alfred Hitchcock" +(2 rows) + + + + + + The following is the same query but without the filter in the root + path: + + +SELECT jt.* FROM + my_films, + JSON_TABLE ( js, '$.favorites[*]' + COLUMNS ( + id FOR ORDINALITY, + kind text PATH '$.kind', + NESTED PATH '$.films[*]' COLUMNS ( + title text FORMAT JSON PATH '$.title' OMIT QUOTES, + director text PATH '$.director' KEEP QUOTES))) AS jt; + + + + id | kind | title | director +----+----------+-----------------+-------------------- + 1 | comedy | Bananas | "Woody Allen" + 1 | comedy | The Dinner Game | "Francis Veber" + 2 | horror | Psycho | "Alfred Hitchcock" + 3 | thriller | Vertigo | "Alfred Hitchcock" + 4 | drama | Yojimbo | "Akira Kurosawa" +(5 rows) + + + + + + The following shows another query using a different JSON + object as input. It shows the UNION "sibling join" between + NESTED paths $.movies[*] and + $.books[*] and also the usage of + FOR ORDINALITY column at NESTED + levels (columns movie_id, book_id, + and author_id): + + +SELECT * FROM JSON_TABLE ( +'{"favorites": + [{"movies": + [{"name": "One", "director": "John Doe"}, + {"name": "Two", "director": "Don Joe"}], + "books": + [{"name": "Mystery", "authors": [{"name": "Brown Dan"}]}, + {"name": "Wonder", "authors": [{"name": "Jun Murakami"}, {"name":"Craig Doe"}]}] +}]}'::json, '$.favorites[*]' +COLUMNS ( + user_id FOR ORDINALITY, + NESTED '$.movies[*]' + COLUMNS ( + movie_id FOR ORDINALITY, + mname text PATH '$.name', + director text), + NESTED '$.books[*]' + COLUMNS ( + book_id FOR ORDINALITY, + bname text PATH '$.name', + NESTED '$.authors[*]' + COLUMNS ( + author_id FOR ORDINALITY, + author_name text PATH '$.name')))); + + + + user_id | movie_id | mname | director | book_id | bname | author_id | author_name +---------+----------+-------+----------+---------+---------+-----------+-------------- + 1 | 1 | One | John Doe | | | | + 1 | 2 | Two | Don Joe | | | | + 1 | | | | 1 | Mystery | 1 | Brown Dan + 1 | | | | 2 | Wonder | 1 | Jun Murakami + 1 | | | | 2 | Wonder | 2 | Craig Doe +(5 rows) + + + + +
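As a final example, here is a small sketch of the top-level ON ERROR clause described above (the path uses strict mode so that the missing key raises an error; the empty result shown is illustrative):

SELECT * FROM JSON_TABLE(
  jsonb '{"a": 1}',
  'strict $.b'               -- strict mode: referencing a missing key is an error
  COLUMNS (b int PATH '$')
  EMPTY ON ERROR);           -- the error yields an empty table instead of being thrown

 b
---
(0 rows)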
diff --git a/doc/src/sgml/func/func-logical.sgml b/doc/src/sgml/func/func-logical.sgml new file mode 100644 index 0000000000000..65e50e65a8117 --- /dev/null +++ b/doc/src/sgml/func/func-logical.sgml @@ -0,0 +1,146 @@ + + Logical Operators + + + operator + logical + + + + Boolean + operators + operators, logical + + + + The usual logical operators are available: + + + AND (operator) + + + + OR (operator) + + + + NOT (operator) + + + + conjunction + + + + disjunction + + + + negation + + + +boolean AND boolean boolean +boolean OR boolean boolean +NOT boolean boolean + + + SQL uses a three-valued logic system with true, + false, and null, which represents unknown. + Observe the following truth tables: + + + + + + a + b + a AND b + a OR b + + + + + + TRUE + TRUE + TRUE + TRUE + + + + TRUE + FALSE + FALSE + TRUE + + + + TRUE + NULL + NULL + TRUE + + + + FALSE + FALSE + FALSE + FALSE + + + + FALSE + NULL + FALSE + NULL + + + + NULL + NULL + NULL + NULL + + + + + + + + + + a + NOT a + + + + + + TRUE + FALSE + + + + FALSE + TRUE + + + + NULL + NULL + + + + + + + + The operators AND and OR are + commutative, that is, you can switch the left and right operands + without affecting the result. (However, it is not guaranteed that + the left operand is evaluated before the right operand. See for more information about the + order of evaluation of subexpressions.) + + diff --git a/doc/src/sgml/func/func-matching.sgml b/doc/src/sgml/func/func-matching.sgml new file mode 100644 index 0000000000000..f466860ddb002 --- /dev/null +++ b/doc/src/sgml/func/func-matching.sgml @@ -0,0 +1,2487 @@ + + Pattern Matching + + + pattern matching + + + + There are three separate approaches to pattern matching provided + by PostgreSQL: the traditional + SQL LIKE operator, the + more recent SIMILAR TO operator (added in + SQL:1999), and POSIX-style regular + expressions. Aside from the basic does this string match + this pattern? operators, functions are available to extract + or replace matching substrings and to split a string at matching + locations. + + + + + If you have pattern matching needs that go beyond this, + consider writing a user-defined function in Perl or Tcl. + + + + + + While most regular-expression searches can be executed very quickly, + regular expressions can be contrived that take arbitrary amounts of + time and memory to process. Be wary of accepting regular-expression + search patterns from hostile sources. If you must do so, it is + advisable to impose a statement timeout. + + + + Searches using SIMILAR TO patterns have the same + security hazards, since SIMILAR TO provides many + of the same capabilities as POSIX-style regular + expressions. + + + + LIKE searches, being much simpler than the other + two options, are safer to use with possibly-hostile pattern sources. + + + + + SIMILAR TO and POSIX-style regular + expressions do not support nondeterministic collations. If required, use + LIKE or apply a different collation to the expression + to work around this limitation. + + + + <function>LIKE</function> + + + LIKE + + + +string LIKE pattern ESCAPE escape-character +string NOT LIKE pattern ESCAPE escape-character + + + + The LIKE expression returns true if the + string matches the supplied + pattern. (As + expected, the NOT LIKE expression returns + false if LIKE returns true, and vice versa. + An equivalent expression is + NOT (string LIKE + pattern).) 
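Before looking at the wildcard rules below, note that, like most SQL predicates, LIKE yields null rather than true or false when either operand is null:

'abc' LIKE NULL      NULL
'abc' NOT LIKE NULL  NULL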
+ + + + If pattern does not contain percent + signs or underscores, then the pattern only represents the string + itself; in that case LIKE acts like the + equals operator. An underscore (_) in + pattern stands for (matches) any single + character; a percent sign (%) matches any sequence + of zero or more characters. + + + + Some examples: + +'abc' LIKE 'abc' true +'abc' LIKE 'a%' true +'abc' LIKE '_b_' true +'abc' LIKE 'c' false + + + + + LIKE pattern matching supports nondeterministic + collations (see ), such as + case-insensitive collations or collations that, say, ignore punctuation. + So with a case-insensitive collation, one could have: + +'AbC' LIKE 'abc' COLLATE case_insensitive true +'AbC' LIKE 'a%' COLLATE case_insensitive true + + With collations that ignore certain characters or in general that consider + strings of different lengths equal, the semantics can become a bit more + complicated. Consider these examples: + +'.foo.' LIKE 'foo' COLLATE ign_punct true +'.foo.' LIKE 'f_o' COLLATE ign_punct true +'.foo.' LIKE '_oo' COLLATE ign_punct false + + The way the matching works is that the pattern is partitioned into + sequences of wildcards and non-wildcard strings (wildcards being + _ and %). For example, the pattern + f_o is partitioned into f, _, o, the + pattern _oo is partitioned into _, + oo. The input string matches the pattern if it can be + partitioned in such a way that the wildcards match one character or any + number of characters respectively and the non-wildcard partitions are + equal under the applicable collation. So for example, '.foo.' + LIKE 'f_o' COLLATE ign_punct is true because one can partition + .foo. into .f, o, o., and then + '.f' = 'f' COLLATE ign_punct, 'o' + matches the _ wildcard, and 'o.' = 'o' COLLATE + ign_punct. But '.foo.' LIKE '_oo' COLLATE + ign_punct is false because .foo. cannot be + partitioned in a way that the first character is any character and the + rest of the string compares equal to oo. (Note that + the single-character wildcard always matches exactly one character, + independent of the collation. So in this example, the + _ would match ., but then the rest + of the input string won't match the rest of the pattern.) + + + + LIKE pattern matching always covers the entire + string. Therefore, if it's desired to match a sequence anywhere within + a string, the pattern must start and end with a percent sign. + + + + To match a literal underscore or percent sign without matching + other characters, the respective character in + pattern must be + preceded by the escape character. The default escape + character is the backslash but a different one can be selected by + using the ESCAPE clause. To match the escape + character itself, write two escape characters. + + + + + If you have turned off, + any backslashes you write in literal string constants will need to be + doubled. See for more information. + + + + + It's also possible to select no escape character by writing + ESCAPE ''. This effectively disables the + escape mechanism, which makes it impossible to turn off the + special meaning of underscore and percent signs in the pattern. + + + + According to the SQL standard, omitting ESCAPE + means there is no escape character (rather than defaulting to a + backslash), and a zero-length ESCAPE value is + disallowed. PostgreSQL's behavior in + this regard is therefore slightly nonstandard. + + + + The key word ILIKE can be used instead of + LIKE to make the match case-insensitive according to the + active locale. 
(But this does not support nondeterministic collations.) + This is not in the SQL standard but is a + PostgreSQL extension. + + + + The operator ~~ is equivalent to + LIKE, and ~~* corresponds to + ILIKE. There are also + !~~ and !~~* operators that + represent NOT LIKE and NOT + ILIKE, respectively. All of these operators are + PostgreSQL-specific. You may see these + operator names in EXPLAIN output and similar + places, since the parser actually translates LIKE + et al. to these operators. + + + + The phrases LIKE, ILIKE, + NOT LIKE, and NOT ILIKE are + generally treated as operators + in PostgreSQL syntax; for example they can + be used in expression + operator ANY + (subquery) constructs, although + an ESCAPE clause cannot be included there. In some + obscure cases it may be necessary to use the underlying operator names + instead. + + + + Also see the starts-with operator ^@ and the + corresponding starts_with() function, which are + useful in cases where simply matching the beginning of a string is + needed. + + + + + + <function>SIMILAR TO</function> Regular Expressions + + + regular expression + + + + + SIMILAR TO + + + substring + + + +string SIMILAR TO pattern ESCAPE escape-character +string NOT SIMILAR TO pattern ESCAPE escape-character + + + + The SIMILAR TO operator returns true or + false depending on whether its pattern matches the given string. + It is similar to LIKE, except that it + interprets the pattern using the SQL standard's definition of a + regular expression. SQL regular expressions are a curious cross + between LIKE notation and common (POSIX) regular + expression notation. + + + + Like LIKE, the SIMILAR TO + operator succeeds only if its pattern matches the entire string; + this is unlike common regular expression behavior where the pattern + can match any part of the string. + Also like + LIKE, SIMILAR TO uses + _ and % as wildcard characters denoting + any single character and any string, respectively (these are + comparable to . and .* in POSIX regular + expressions). + + + + In addition to these facilities borrowed from LIKE, + SIMILAR TO supports these pattern-matching + metacharacters borrowed from POSIX regular expressions: + + + + + | denotes alternation (either of two alternatives). + + + + + * denotes repetition of the previous item zero + or more times. + + + + + + denotes repetition of the previous item one + or more times. + + + + + ? denotes repetition of the previous item zero + or one time. + + + + + {m} denotes repetition + of the previous item exactly m times. + + + + + {m,} denotes repetition + of the previous item m or more times. + + + + + {m,n} + denotes repetition of the previous item at least m and + not more than n times. + + + + + Parentheses () can be used to group items into + a single logical item. + + + + + A bracket expression [...] specifies a character + class, just as in POSIX regular expressions. + + + + + Notice that the period (.) is not a metacharacter + for SIMILAR TO. + + + + As with LIKE, a backslash disables the special + meaning of any of these metacharacters. A different escape character + can be specified with ESCAPE, or the escape + capability can be disabled by writing ESCAPE ''. + + + + According to the SQL standard, omitting ESCAPE + means there is no escape character (rather than defaulting to a + backslash), and a zero-length ESCAPE value is + disallowed. PostgreSQL's behavior in + this regard is therefore slightly nonstandard. 
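A brief illustration of the escape rules just described (the results follow from the rules above):

'10%' SIMILAR TO '10\%'             true    -- backslash escapes the percent sign
'10%' SIMILAR TO '10#%' ESCAPE '#'  true    -- '#' is the escape character here
'10x' SIMILAR TO '10#%' ESCAPE '#'  false   -- the escaped '%' is literal, so no match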
+ + + + Another nonstandard extension is that following the escape character + with a letter or digit provides access to the escape sequences + defined for POSIX regular expressions; see + , + , and + below. + + + + Some examples: + +'abc' SIMILAR TO 'abc' true +'abc' SIMILAR TO 'a' false +'abc' SIMILAR TO '%(b|d)%' true +'abc' SIMILAR TO '(b|c)%' false +'-abc-' SIMILAR TO '%\mabc\M%' true +'xabcy' SIMILAR TO '%\mabc\M%' false + + + + + The substring function with three parameters + provides extraction of a substring that matches an SQL + regular expression pattern. The function can be written according + to standard SQL syntax: + +substring(string SIMILAR pattern ESCAPE escape-character) + + or using the now obsolete SQL:1999 syntax: + +substring(string FROM pattern FOR escape-character) + + or as a plain three-argument function: + +substring(string, pattern, escape-character) + + As with SIMILAR TO, the + specified pattern must match the entire data string, or else the + function fails and returns null. To indicate the part of the + pattern for which the matching data sub-string is of interest, + the pattern should contain + two occurrences of the escape character followed by a double quote + ("). + The text matching the portion of the pattern + between these separators is returned when the match is successful. + + + + The escape-double-quote separators actually + divide substring's pattern into three independent + regular expressions; for example, a vertical bar (|) + in any of the three sections affects only that section. Also, the first + and third of these regular expressions are defined to match the smallest + possible amount of text, not the largest, when there is any ambiguity + about how much of the data string matches which pattern. (In POSIX + parlance, the first and third regular expressions are forced to be + non-greedy.) + + + + As an extension to the SQL standard, PostgreSQL + allows there to be just one escape-double-quote separator, in which case + the third regular expression is taken as empty; or no separators, in which + case the first and third regular expressions are taken as empty. + + + + Some examples, with #" delimiting the return string: + +substring('foobar' SIMILAR '%#"o_b#"%' ESCAPE '#') oob +substring('foobar' SIMILAR '#"o_b#"%' ESCAPE '#') NULL + + + + + + <acronym>POSIX</acronym> Regular Expressions + + + regular expression + pattern matching + + + substring + + + regexp_count + + + regexp_instr + + + regexp_like + + + regexp_match + + + regexp_matches + + + regexp_replace + + + regexp_split_to_table + + + regexp_split_to_array + + + regexp_substr + + + + lists the available + operators for pattern matching using POSIX regular expressions. + + + + Regular Expression Match Operators + + + + + + Operator + + + Description + + + Example(s) + + + + + + + + text ~ text + boolean + + + String matches regular expression, case sensitively + + + 'thomas' ~ 't.*ma' + t + + + + + + text ~* text + boolean + + + String matches regular expression, case-insensitively + + + 'thomas' ~* 'T.*ma' + t + + + + + + text !~ text + boolean + + + String does not match regular expression, case sensitively + + + 'thomas' !~ 't.*max' + t + + + + + + text !~* text + boolean + + + String does not match regular expression, case-insensitively + + + 'thomas' !~* 'T.*ma' + f + + + + +
+ + + POSIX regular expressions provide a more + powerful means for pattern matching than the LIKE and + SIMILAR TO operators. + Many Unix tools such as egrep, + sed, or awk use a pattern + matching language that is similar to the one described here. + + + + A regular expression is a character sequence that is an + abbreviated definition of a set of strings (a regular + set). A string is said to match a regular expression + if it is a member of the regular set described by the regular + expression. As with LIKE, pattern characters + match string characters exactly unless they are special characters + in the regular expression language — but regular expressions use + different special characters than LIKE does. + Unlike LIKE patterns, a + regular expression is allowed to match anywhere within a string, unless + the regular expression is explicitly anchored to the beginning or + end of the string. + + + + Some examples: + +'abcd' ~ 'bc' true +'abcd' ~ 'a.c' true — dot matches any character +'abcd' ~ 'a.*d' true — * repeats the preceding pattern item +'abcd' ~ '(b|x)' true — | means OR, parentheses group +'abcd' ~ '^a' true — ^ anchors to start of string +'abcd' ~ '^(b|c)' false — would match except for anchoring + + + + + The POSIX pattern language is described in much + greater detail below. + + + + The substring function with two parameters, + substring(string from + pattern), provides extraction of a + substring + that matches a POSIX regular expression pattern. It returns null if + there is no match, otherwise the first portion of the text that matched the + pattern. But if the pattern contains any parentheses, the portion + of the text that matched the first parenthesized subexpression (the + one whose left parenthesis comes first) is + returned. You can put parentheses around the whole expression + if you want to use parentheses within it without triggering this + exception. If you need parentheses in the pattern before the + subexpression you want to extract, see the non-capturing parentheses + described below. + + + + Some examples: + +substring('foobar' FROM 'o.b') oob +substring('foobar' FROM 'o(.)b') o + + + + + The regexp_count function counts the number of + places where a POSIX regular expression pattern matches a string. + It has the syntax + regexp_count(string, + pattern + , start + , flags + ). + pattern is searched for + in string, normally from the beginning of + the string, but if the start parameter is + provided then beginning from that character index. + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. For example, including i in + flags specifies case-insensitive matching. + Supported flags are described in + . + + + + Some examples: + +regexp_count('ABCABCAXYaxy', 'A.') 3 +regexp_count('ABCABCAXYaxy', 'A.', 1, 'i') 4 + + + + + The regexp_instr function returns the starting or + ending position of the N'th match of a + POSIX regular expression pattern to a string, or zero if there is no + such match. It has the syntax + regexp_instr(string, + pattern + , start + , N + , endoption + , flags + , subexpr + ). + pattern is searched for + in string, normally from the beginning of + the string, but if the start parameter is + provided then beginning from that character index. + If N is specified + then the N'th match of the pattern + is located, otherwise the first match is located. 
+ If the endoption parameter is omitted or + specified as zero, the function returns the position of the first + character of the match. Otherwise, endoption + must be one, and the function returns the position of the character + following the match. + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. Supported flags are described + in . + For a pattern containing parenthesized + subexpressions, subexpr is an integer + indicating which subexpression is of interest: the result identifies + the position of the substring matching that subexpression. + Subexpressions are numbered in the order of their leading parentheses. + When subexpr is omitted or zero, the result + identifies the position of the whole match regardless of + parenthesized subexpressions. + + + + Some examples: + +regexp_instr('number of your street, town zip, FR', '[^,]+', 1, 2) + 23 +regexp_instr(string=>'ABCDEFGHI', pattern=>'(c..)(...)', start=>1, "N"=>1, endoption=>0, flags=>'i', subexpr=>2) + 6 + + + + + The regexp_like function checks whether a match + of a POSIX regular expression pattern occurs within a string, + returning boolean true or false. It has the syntax + regexp_like(string, + pattern + , flags ). + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. Supported flags are described + in . + This function has the same results as the ~ + operator if no flags are specified. If only the i + flag is specified, it has the same results as + the ~* operator. + + + + Some examples: + +regexp_like('Hello World', 'world') false +regexp_like('Hello World', 'world', 'i') true + + + + + The regexp_match function returns a text array of + matching substring(s) within the first match of a POSIX + regular expression pattern to a string. It has the syntax + regexp_match(string, + pattern , flags ). + If there is no match, the result is NULL. + If a match is found, and the pattern contains no + parenthesized subexpressions, then the result is a single-element text + array containing the substring matching the whole pattern. + If a match is found, and the pattern contains + parenthesized subexpressions, then the result is a text array + whose n'th element is the substring matching + the n'th parenthesized subexpression of + the pattern (not counting non-capturing + parentheses; see below for details). + The flags parameter is an optional text string + containing zero or more single-letter flags that change the function's + behavior. Supported flags are described + in . + + + + Some examples: + +SELECT regexp_match('foobarbequebaz', 'bar.*que'); + regexp_match +-------------- + {barbeque} +(1 row) + +SELECT regexp_match('foobarbequebaz', '(bar)(beque)'); + regexp_match +-------------- + {bar,beque} +(1 row) + + + + + + In the common case where you just want the whole matching substring + or NULL for no match, the best solution is to + use regexp_substr(). + However, regexp_substr() only exists + in PostgreSQL version 15 and up. When + working in older versions, you can extract the first element + of regexp_match()'s result, for example: + +SELECT (regexp_match('foobarbequebaz', 'bar.*que'))[1]; + regexp_match +-------------- + barbeque +(1 row) + + + + + + The regexp_matches function returns a set of text arrays + of matching substring(s) within matches of a POSIX regular + expression pattern to a string. It has the same syntax as + regexp_match. 
+ This function returns no rows if there is no match, one row if there is + a match and the g flag is not given, or N + rows if there are N matches and the g flag + is given. Each returned row is a text array containing the whole + matched substring or the substrings matching parenthesized + subexpressions of the pattern, just as described above + for regexp_match. + regexp_matches accepts all the flags shown + in , plus + the g flag which commands it to return all matches, not + just the first one. + + + + Some examples: + +SELECT regexp_matches('foo', 'not there'); + regexp_matches +---------------- +(0 rows) + +SELECT regexp_matches('foobarbequebazilbarfbonk', '(b[^b]+)(b[^b]+)', 'g'); + regexp_matches +---------------- + {bar,beque} + {bazil,barf} +(2 rows) + + + + + + In most cases regexp_matches() should be used with + the g flag, since if you only want the first match, it's + easier and more efficient to use regexp_match(). + However, regexp_match() only exists + in PostgreSQL version 10 and up. When working in older + versions, a common trick is to place a regexp_matches() + call in a sub-select, for example: + +SELECT col1, (SELECT regexp_matches(col2, '(bar)(beque)')) FROM tab; + + This produces a text array if there's a match, or NULL if + not, the same as regexp_match() would do. Without the + sub-select, this query would produce no output at all for table rows + without a match, which is typically not the desired behavior. + + + + + The regexp_replace function provides substitution of + new text for substrings that match POSIX regular expression patterns. + It has the syntax + regexp_replace(string, + pattern, replacement + , flags ) + or + regexp_replace(string, + pattern, replacement, + start + , N + , flags ). + The source string is returned unchanged if + there is no match to the pattern. If there is a + match, the string is returned with the + replacement string substituted for the matching + substring. The replacement string can contain + \n, where n is 1 + through 9, to indicate that the source substring matching the + n'th parenthesized subexpression of the pattern should be + inserted, and it can contain \& to indicate that the + substring matching the entire pattern should be inserted. Write + \\ if you need to put a literal backslash in the replacement + text. + pattern is searched for + in string, normally from the beginning of + the string, but if the start parameter is + provided then beginning from that character index. + By default, only the first match of the pattern is replaced. + If N is specified and is greater than zero, + then the N'th match of the pattern + is replaced. + If the g flag is given, or + if N is specified and is zero, then all + matches at or after the start position are + replaced. (The g flag is ignored + when N is specified.) + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. Supported flags (though + not g) are + described in . 
+ + + + Some examples: + +regexp_replace('foobarbaz', 'b..', 'X') + fooXbaz +regexp_replace('foobarbaz', 'b..', 'X', 'g') + fooXX +regexp_replace('foobarbaz', 'b(..)', 'X\1Y', 'g') + fooXarYXazY +regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 0, 'i') + X PXstgrXSQL fXnctXXn +regexp_replace(string=>'A PostgreSQL function', pattern=>'a|e|i|o|u', replacement=>'X', start=>1, "N"=>3, flags=>'i') + A PostgrXSQL function + + + + + The regexp_split_to_table function splits a string using a POSIX + regular expression pattern as a delimiter. It has the syntax + regexp_split_to_table(string, pattern + , flags ). + If there is no match to the pattern, the function returns the + string. If there is at least one match, for each match it returns + the text from the end of the last match (or the beginning of the string) + to the beginning of the match. When there are no more matches, it + returns the text from the end of the last match to the end of the string. + The flags parameter is an optional text string containing + zero or more single-letter flags that change the function's behavior. + regexp_split_to_table supports the flags described in + . + + + + The regexp_split_to_array function behaves the same as + regexp_split_to_table, except that regexp_split_to_array + returns its result as an array of text. It has the syntax + regexp_split_to_array(string, pattern + , flags ). + The parameters are the same as for regexp_split_to_table. + + + + Some examples: + +SELECT foo FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', '\s+') AS foo; + foo +------- + the + quick + brown + fox + jumps + over + the + lazy + dog +(9 rows) + +SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', '\s+'); + regexp_split_to_array +----------------------------------------------- + {the,quick,brown,fox,jumps,over,the,lazy,dog} +(1 row) + +SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; + foo +----- + t + h + e + q + u + i + c + k + b + r + o + w + n + f + o + x +(16 rows) + + + + + As the last example demonstrates, the regexp split functions ignore + zero-length matches that occur at the start or end of the string + or immediately after a previous match. This is contrary to the strict + definition of regexp matching that is implemented by + the other regexp functions, but is usually the most convenient behavior + in practice. Other software systems such as Perl use similar definitions. + + + + The regexp_substr function returns the substring + that matches a POSIX regular expression pattern, + or NULL if there is no match. It has the syntax + regexp_substr(string, + pattern + , start + , N + , flags + , subexpr + ). + pattern is searched for + in string, normally from the beginning of + the string, but if the start parameter is + provided then beginning from that character index. + If N is specified + then the N'th match of the pattern + is returned, otherwise the first match is returned. + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. Supported flags are described + in . + For a pattern containing parenthesized + subexpressions, subexpr is an integer + indicating which subexpression is of interest: the result is the + substring matching that subexpression. + Subexpressions are numbered in the order of their leading parentheses. + When subexpr is omitted or zero, the result + is the whole match regardless of parenthesized subexpressions. 
+ + + + Some examples: + +regexp_substr('number of your street, town zip, FR', '[^,]+', 1, 2) + town zip +regexp_substr('ABCDEFGHI', '(c..)(...)', 1, 1, 'i', 2) + FGH + + + + + + + Regular Expression Details + + + PostgreSQL's regular expressions are implemented + using a software package written by Henry Spencer. Much of + the description of regular expressions below is copied verbatim from his + manual. + + + + Regular expressions (REs), as defined in + POSIX 1003.2, come in two forms: + extended REs or EREs + (roughly those of egrep), and + basic REs or BREs + (roughly those of ed). + PostgreSQL supports both forms, and + also implements some extensions + that are not in the POSIX standard, but have become widely used + due to their availability in programming languages such as Perl and Tcl. + REs using these non-POSIX extensions are called + advanced REs or AREs + in this documentation. AREs are almost an exact superset of EREs, + but BREs have several notational incompatibilities (as well as being + much more limited). + We first describe the ARE and ERE forms, noting features that apply + only to AREs, and then describe how BREs differ. + + + + + PostgreSQL always initially presumes that a regular + expression follows the ARE rules. However, the more limited ERE or + BRE rules can be chosen by prepending an embedded option + to the RE pattern, as described in . + This can be useful for compatibility with applications that expect + exactly the POSIX 1003.2 rules. + + + + + A regular expression is defined as one or more + branches, separated by + |. It matches anything that matches one of the + branches. + + + + A branch is zero or more quantified atoms or + constraints, concatenated. + It matches a match for the first, followed by a match for the second, etc.; + an empty branch matches the empty string. + + + + A quantified atom is an atom possibly followed + by a single quantifier. + Without a quantifier, it matches a match for the atom. + With a quantifier, it can match some number of matches of the atom. + An atom can be any of the possibilities + shown in . + The possible quantifiers and their meanings are shown in + . + + + + A constraint matches an empty string, but matches only when + specific conditions are met. A constraint can be used where an atom + could be used, except it cannot be followed by a quantifier. + The simple constraints are shown in + ; + some more constraints are described later. + + + + + Regular Expression Atoms + + + + + + + Atom + Description + + + + + + (re) + (where re is any regular expression) + matches a match for + re, with the match noted for possible reporting + + + + (?:re) + as above, but the match is not noted for reporting + (a non-capturing set of parentheses) + (AREs only) + + + + . + matches any single character + + + + [chars] + a bracket expression, + matching any one of the chars (see + for more detail) + + + + \k + (where k is a non-alphanumeric character) + matches that character taken as an ordinary character, + e.g., \\ matches a backslash character + + + + \c + where c is alphanumeric + (possibly followed by other characters) + is an escape, see + (AREs only; in EREs and BREs, this matches c) + + + + { + when followed by a character other than a digit, + matches the left-brace character {; + when followed by a digit, it is the beginning of a + bound (see below) + + + + x + where x is a single character with no other + significance, matches that character + + + +
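A few examples of these atoms in use (results follow from the table above):

'abc' ~ 'a[bx]c'     true    -- bracket expression
'abc' ~ '(?:a|x)bc'  true    -- non-capturing group
'a.c' ~ 'a\.c'       true    -- escaped dot matches a literal period only
'abc' ~ 'a\.c'       false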
+ + + An RE cannot end with a backslash (\). + + + + + If you have turned off, + any backslashes you write in literal string constants will need to be + doubled. See for more information. + + + + + Regular Expression Quantifiers + + + + + + + Quantifier + Matches + + + + + + * + a sequence of 0 or more matches of the atom + + + + + + a sequence of 1 or more matches of the atom + + + + ? + a sequence of 0 or 1 matches of the atom + + + + {m} + a sequence of exactly m matches of the atom + + + + {m,} + a sequence of m or more matches of the atom + + + + + {m,n} + a sequence of m through n + (inclusive) matches of the atom; m cannot exceed + n + + + + *? + non-greedy version of * + + + + +? + non-greedy version of + + + + + ?? + non-greedy version of ? + + + + {m}? + non-greedy version of {m} + + + + {m,}? + non-greedy version of {m,} + + + + + {m,n}? + non-greedy version of {m,n} + + + +
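For example, a greedy quantifier and its non-greedy counterpart differ only in how much text they prefer to consume (see the matching rules section below for details):

substring('xyyyz' from 'y+')   yyy   -- greedy: longest possible match
substring('xyyyz' from 'y+?')  y     -- non-greedy: shortest possible match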
+ + + The forms using {...} + are known as bounds. + The numbers m and n within a bound are + unsigned decimal integers with permissible values from 0 to 255 inclusive. + + + + Non-greedy quantifiers (available in AREs only) match the + same possibilities as their corresponding normal (greedy) + counterparts, but prefer the smallest number rather than the largest + number of matches. + See for more detail. + + + + + A quantifier cannot immediately follow another quantifier, e.g., + ** is invalid. + A quantifier cannot + begin an expression or subexpression or follow + ^ or |. + + + + + Regular Expression Constraints + + + + + + + Constraint + Description + + + + + + ^ + matches at the beginning of the string + + + + $ + matches at the end of the string + + + + (?=re) + positive lookahead matches at any point + where a substring matching re begins + (AREs only) + + + + (?!re) + negative lookahead matches at any point + where no substring matching re begins + (AREs only) + + + + (?<=re) + positive lookbehind matches at any point + where a substring matching re ends + (AREs only) + + + + (?<!re) + negative lookbehind matches at any point + where no substring matching re ends + (AREs only) + + + +
+ Lookahead and lookbehind constraints cannot contain back references (see the back references described under Regular Expression Escapes below), and all parentheses within them are considered non-capturing.
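Some examples of constraints (the lookahead and lookbehind forms are AREs only):

'abc'    ~ '^ab'       true
'abc'    ~ 'bc$'       true
'abcdef' ~ 'bc(?=d)'   true    -- 'bc' only where 'd' follows
'abcdef' ~ 'bc(?=x)'   false
'abcdef' ~ '(?<=a)bc'  true    -- 'bc' only where 'a' precedes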
+ + + Bracket Expressions + + + A bracket expression is a list of + characters enclosed in []. It normally matches + any single character from the list (but see below). If the list + begins with ^, it matches any single character + not from the rest of the list. + If two characters + in the list are separated by -, this is + shorthand for the full range of characters between those two + (inclusive) in the collating sequence, + e.g., [0-9] in ASCII matches + any decimal digit. It is illegal for two ranges to share an + endpoint, e.g., a-c-e. Ranges are very + collating-sequence-dependent, so portable programs should avoid + relying on them. + + + + To include a literal ] in the list, make it the + first character (after ^, if that is used). To + include a literal -, make it the first or last + character, or the second endpoint of a range. To use a literal + - as the first endpoint of a range, enclose it + in [. and .] to make it a + collating element (see below). With the exception of these characters, + some combinations using [ + (see next paragraphs), and escapes (AREs only), all other special + characters lose their special significance within a bracket expression. + In particular, \ is not special when following + ERE or BRE rules, though it is special (as introducing an escape) + in AREs. + + + + Within a bracket expression, a collating element (a character, a + multiple-character sequence that collates as if it were a single + character, or a collating-sequence name for either) enclosed in + [. and .] stands for the + sequence of characters of that collating element. The sequence is + treated as a single element of the bracket expression's list. This + allows a bracket + expression containing a multiple-character collating element to + match more than one character, e.g., if the collating sequence + includes a ch collating element, then the RE + [[.ch.]]*c matches the first five characters of + chchcc. + + + + + PostgreSQL currently does not support multi-character collating + elements. This information describes possible future behavior. + + + + + Within a bracket expression, a collating element enclosed in + [= and =] is an equivalence + class, standing for the sequences of characters of all collating + elements equivalent to that one, including itself. (If there are + no other equivalent collating elements, the treatment is as if the + enclosing delimiters were [. and + .].) For example, if o and + ^ are the members of an equivalence class, then + [[=o=]], [[=^=]], and + [o^] are all synonymous. An equivalence class + cannot be an endpoint of a range. + + + + Within a bracket expression, the name of a character class + enclosed in [: and :] stands + for the list of all characters belonging to that class. A character + class cannot be used as an endpoint of a range. + The POSIX standard defines these character class + names: + alnum (letters and numeric digits), + alpha (letters), + blank (space and tab), + cntrl (control characters), + digit (numeric digits), + graph (printable characters except space), + lower (lower-case letters), + print (printable characters including space), + punct (punctuation), + space (any white space), + upper (upper-case letters), + and xdigit (hexadecimal digits). + The behavior of these standard character classes is generally + consistent across platforms for characters in the 7-bit ASCII set. 
+ Whether a given non-ASCII character is considered to belong to one + of these classes depends on the collation + that is used for the regular-expression function or operator + (see ), or by default on the + database's LC_CTYPE locale setting (see + ). The classification of non-ASCII + characters can vary across platforms even in similarly-named + locales. (But the C locale never considers any + non-ASCII characters to belong to any of these classes.) + In addition to these standard character + classes, PostgreSQL defines + the word character class, which is the same as + alnum plus the underscore (_) + character, and + the ascii character class, which contains exactly + the 7-bit ASCII set. + + + + There are two special cases of bracket expressions: the bracket + expressions [[:<:]] and + [[:>:]] are constraints, + matching empty strings at the beginning + and end of a word respectively. A word is defined as a sequence + of word characters that is neither preceded nor followed by word + characters. A word character is any character belonging to the + word character class, that is, any letter, digit, + or underscore. This is an extension, compatible with but not + specified by POSIX 1003.2, and should be used with + caution in software intended to be portable to other systems. + The constraint escapes described below are usually preferable; they + are no more standard, but are easier to type. + + + + + Regular Expression Escapes + + + Escapes are special sequences beginning with \ + followed by an alphanumeric character. Escapes come in several varieties: + character entry, class shorthands, constraint escapes, and back references. + A \ followed by an alphanumeric character but not constituting + a valid escape is illegal in AREs. + In EREs, there are no escapes: outside a bracket expression, + a \ followed by an alphanumeric character merely stands for + that character as an ordinary character, and inside a bracket expression, + \ is an ordinary character. + (The latter is the one actual incompatibility between EREs and AREs.) + + + + Character-entry escapes exist to make it easier to specify + non-printing and other inconvenient characters in REs. They are + shown in . + + + + Class-shorthand escapes provide shorthands for certain + commonly-used character classes. They are + shown in . + + + + A constraint escape is a constraint, + matching the empty string if specific conditions are met, + written as an escape. They are + shown in . + + + + A back reference (\n) matches the + same string matched by the previous parenthesized subexpression specified + by the number n + (see ). For example, + ([bc])\1 matches bb or cc + but not bc or cb. + The subexpression must entirely precede the back reference in the RE. + Subexpressions are numbered in the order of their leading parentheses. + Non-capturing parentheses do not define subexpressions. + The back reference considers only the string characters matched by the + referenced subexpression, not any constraints contained in it. For + example, (^\d)\1 will match 22. 
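The ([bc])\1 example above can be checked directly:

'bb' ~ '([bc])\1'  true
'cc' ~ '([bc])\1'  true
'bc' ~ '([bc])\1'  false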
+ + + + Regular Expression Character-Entry Escapes + + + + + + + Escape + Description + + + + + + \a + alert (bell) character, as in C + + + + \b + backspace, as in C + + + + \B + synonym for backslash (\) to help reduce the need for backslash + doubling + + + + \cX + (where X is any character) the character whose + low-order 5 bits are the same as those of + X, and whose other bits are all zero + + + + \e + the character whose collating-sequence name + is ESC, + or failing that, the character with octal value 033 + + + + \f + form feed, as in C + + + + \n + newline, as in C + + + + \r + carriage return, as in C + + + + \t + horizontal tab, as in C + + + + \uwxyz + (where wxyz is exactly four hexadecimal digits) + the character whose hexadecimal value is + 0xwxyz + + + + + \Ustuvwxyz + (where stuvwxyz is exactly eight hexadecimal + digits) + the character whose hexadecimal value is + 0xstuvwxyz + + + + + \v + vertical tab, as in C + + + + \xhhh + (where hhh is any sequence of hexadecimal + digits) + the character whose hexadecimal value is + 0xhhh + (a single character no matter how many hexadecimal digits are used) + + + + + \0 + the character whose value is 0 (the null byte) + + + + \xy + (where xy is exactly two octal digits, + and is not a back reference) + the character whose octal value is + 0xy + + + + \xyz + (where xyz is exactly three octal digits, + and is not a back reference) + the character whose octal value is + 0xyz + + + +
+ + + Hexadecimal digits are 0-9, + a-f, and A-F. + Octal digits are 0-7. + + + + Numeric character-entry escapes specifying values outside the ASCII range + (0–127) have meanings dependent on the database encoding. When the + encoding is UTF-8, escape values are equivalent to Unicode code points, + for example \u1234 means the character U+1234. + For other multibyte encodings, character-entry escapes usually just + specify the concatenation of the byte values for the character. If the + escape value does not correspond to any legal character in the database + encoding, no error will be raised, but it will never match any data. + + + + The character-entry escapes are always taken as ordinary characters. + For example, \135 is ] in ASCII, but + \135 does not terminate a bracket expression. + + + + Regular Expression Class-Shorthand Escapes + + + + + + + Escape + Description + + + + + + \d + matches any digit, like + [[:digit:]] + + + + \s + matches any whitespace character, like + [[:space:]] + + + + \w + matches any word character, like + [[:word:]] + + + + \D + matches any non-digit, like + [^[:digit:]] + + + + \S + matches any non-whitespace character, like + [^[:space:]] + + + + \W + matches any non-word character, like + [^[:word:]] + + + +
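For example (results follow from the table above):

'year 2024' ~ '\d{4}'   true    -- four consecutive digits
'year 2024' ~ '^\w+\s'  true    -- a word, then whitespace
'2024'      ~ '\D'      false   -- no non-digit present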
+ + + The class-shorthand escapes also work within bracket expressions, + although the definitions shown above are not quite syntactically + valid in that context. + For example, [a-c\d] is equivalent to + [a-c[:digit:]]. + + + + Regular Expression Constraint Escapes + + + + + + + Escape + Description + + + + + + \A + matches only at the beginning of the string + (see for how this differs from + ^) + + + + \m + matches only at the beginning of a word + + + + \M + matches only at the end of a word + + + + \y + matches only at the beginning or end of a word + + + + \Y + matches only at a point that is not the beginning or end of a + word + + + + \Z + matches only at the end of the string + (see for how this differs from + $) + + + +
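For example, the word-boundary escapes match whole words only:

'postgres rules' ~ '\mrules\M'  true    -- 'rules' appears as a whole word
'ruleset'        ~ '\mrules\M'  false   -- the word continues past 'rules'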
+ + + A word is defined as in the specification of + [[:<:]] and [[:>:]] above. + Constraint escapes are illegal within bracket expressions. + + + + Regular Expression Back References + + + + + + + Escape + Description + + + + + + \m + (where m is a nonzero digit) + a back reference to the m'th subexpression + + + + \mnn + (where m is a nonzero digit, and + nn is some more digits, and the decimal value + mnn is not greater than the number of closing capturing + parentheses seen so far) + a back reference to the mnn'th subexpression + + + +
+ + + + There is an inherent ambiguity between octal character-entry + escapes and back references, which is resolved by the following heuristics, + as hinted at above. + A leading zero always indicates an octal escape. + A single non-zero digit, not followed by another digit, + is always taken as a back reference. + A multi-digit sequence not starting with a zero is taken as a back + reference if it comes after a suitable subexpression + (i.e., the number is in the legal range for a back reference), + and otherwise is taken as octal. + + +
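A couple of cases showing these heuristics in action:

'ab]' ~ 'ab\135'  true    -- no preceding subexpression, so \135 is octal for ']'
'aXa' ~ '(a)X\1'  true    -- a single non-zero digit after a group is a back reference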
+ + + Regular Expression Metasyntax + + + In addition to the main syntax described above, there are some special + forms and miscellaneous syntactic facilities available. + + + + An RE can begin with one of two special director prefixes. + If an RE begins with ***:, + the rest of the RE is taken as an ARE. (This normally has no effect in + PostgreSQL, since REs are assumed to be AREs; + but it does have an effect if ERE or BRE mode had been specified by + the flags parameter to a regex function.) + If an RE begins with ***=, + the rest of the RE is taken to be a literal string, + with all characters considered ordinary characters. + + + + An ARE can begin with embedded options: + a sequence (?xyz) + (where xyz is one or more alphabetic characters) + specifies options affecting the rest of the RE. + These options override any previously determined options — + in particular, they can override the case-sensitivity behavior implied by + a regex operator, or the flags parameter to a regex + function. + The available option letters are + shown in . + Note that these same option letters are used in the flags + parameters of regex functions. + + + + ARE Embedded-Option Letters + + + + + + + Option + Description + + + + + + b + rest of RE is a BRE + + + + c + case-sensitive matching (overrides operator type) + + + + e + rest of RE is an ERE + + + + i + case-insensitive matching (see + ) (overrides operator type) + + + + m + historical synonym for n + + + + n + newline-sensitive matching (see + ) + + + + p + partial newline-sensitive matching (see + ) + + + + q + rest of RE is a literal (quoted) string, all ordinary + characters + + + + s + non-newline-sensitive matching (default) + + + + t + tight syntax (default; see below) + + + + w + inverse partial newline-sensitive (weird) matching + (see ) + + + + x + expanded syntax (see below) + + + +
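For example, an embedded option can override the case-sensitivity implied by the choice of operator:

'Thomas' ~  '(?i)thomas'  true    -- 'i' overrides the case-sensitive operator
'Thomas' ~* '(?c)thomas'  false   -- 'c' overrides the case-insensitive operator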
+ + + Embedded options take effect at the ) terminating the sequence. + They can appear only at the start of an ARE (after the + ***: director if any). + + + + In addition to the usual (tight) RE syntax, in which all + characters are significant, there is an expanded syntax, + available by specifying the embedded x option. + In the expanded syntax, + white-space characters in the RE are ignored, as are + all characters between a # + and the following newline (or the end of the RE). This + permits paragraphing and commenting a complex RE. + There are three exceptions to that basic rule: + + + + + a white-space character or # preceded by \ is + retained + + + + + white space or # within a bracket expression is retained + + + + + white space and comments cannot appear within multi-character symbols, + such as (?: + + + + + For this purpose, white-space characters are blank, tab, newline, and + any character that belongs to the space character class. + + + + Finally, in an ARE, outside bracket expressions, the sequence + (?#ttt) + (where ttt is any text not containing a )) + is a comment, completely ignored. + Again, this is not allowed between the characters of + multi-character symbols, like (?:. + Such comments are more a historical artifact than a useful facility, + and their use is deprecated; use the expanded syntax instead. + + + + None of these metasyntax extensions is available if + an initial ***= director + has specified that the user's input be treated as a literal string + rather than as an RE. + +
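As a small sketch of the expanded syntax, the following pattern spans several lines, with its whitespace and comments ignored:

'abc123' ~ '(?x) [a-z]+   # some letters
                 \d+      # followed by digits'   true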
+ + + Regular Expression Matching Rules + + + In the event that an RE could match more than one substring of a given + string, the RE matches the one starting earliest in the string. + If the RE could match more than one substring starting at that point, + either the longest possible match or the shortest possible match will + be taken, depending on whether the RE is greedy or + non-greedy. + + + + Whether an RE is greedy or not is determined by the following rules: + + + + Most atoms, and all constraints, have no greediness attribute (because + they cannot match variable amounts of text anyway). + + + + + Adding parentheses around an RE does not change its greediness. + + + + + A quantified atom with a fixed-repetition quantifier + ({m} + or + {m}?) + has the same greediness (possibly none) as the atom itself. + + + + + A quantified atom with other normal quantifiers (including + {m,n} + with m equal to n) + is greedy (prefers longest match). + + + + + A quantified atom with a non-greedy quantifier (including + {m,n}? + with m equal to n) + is non-greedy (prefers shortest match). + + + + + A branch — that is, an RE that has no top-level + | operator — has the same greediness as the first + quantified atom in it that has a greediness attribute. + + + + + An RE consisting of two or more branches connected by the + | operator is always greedy. + + + + + + + The above rules associate greediness attributes not only with individual + quantified atoms, but with branches and entire REs that contain quantified + atoms. What that means is that the matching is done in such a way that + the branch, or whole RE, matches the longest or shortest possible + substring as a whole. Once the length of the entire match + is determined, the part of it that matches any particular subexpression + is determined on the basis of the greediness attribute of that + subexpression, with subexpressions starting earlier in the RE taking + priority over ones starting later. + + + + An example of what this means: + +SELECT SUBSTRING('XY1234Z', 'Y*([0-9]{1,3})'); +Result: 123 +SELECT SUBSTRING('XY1234Z', 'Y*?([0-9]{1,3})'); +Result: 1 + + In the first case, the RE as a whole is greedy because Y* + is greedy. It can match beginning at the Y, and it matches + the longest possible string starting there, i.e., Y123. + The output is the parenthesized part of that, or 123. + In the second case, the RE as a whole is non-greedy because Y*? + is non-greedy. It can match beginning at the Y, and it matches + the shortest possible string starting there, i.e., Y1. + The subexpression [0-9]{1,3} is greedy but it cannot change + the decision as to the overall match length; so it is forced to match + just 1. + + + + In short, when an RE contains both greedy and non-greedy subexpressions, + the total match length is either as long as possible or as short as + possible, according to the attribute assigned to the whole RE. The + attributes assigned to the subexpressions only affect how much of that + match they are allowed to eat relative to each other. + + + + The quantifiers {1,1} and {1,1}? + can be used to force greediness or non-greediness, respectively, + on a subexpression or a whole RE. + This is useful when you need the whole RE to have a greediness attribute + different from what's deduced from its elements. As an example, + suppose that we are trying to separate a string containing some digits + into the digits and the parts before and after them. 
We might try to + do that like this: + +SELECT regexp_match('abc01234xyz', '(.*)(\d+)(.*)'); +Result: {abc0123,4,xyz} + + That didn't work: the first .* is greedy so + it eats as much as it can, leaving the \d+ to + match at the last possible place, the last digit. We might try to fix + that by making it non-greedy: + +SELECT regexp_match('abc01234xyz', '(.*?)(\d+)(.*)'); +Result: {abc,0,""} + + That didn't work either, because now the RE as a whole is non-greedy + and so it ends the overall match as soon as possible. We can get what + we want by forcing the RE as a whole to be greedy: + +SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); +Result: {abc,01234,xyz} + + Controlling the RE's overall greediness separately from its components' + greediness allows great flexibility in handling variable-length patterns. + + + + When deciding what is a longer or shorter match, + match lengths are measured in characters, not collating elements. + An empty string is considered longer than no match at all. + For example: + bb* + matches the three middle characters of abbbc; + (week|wee)(night|knights) + matches all ten characters of weeknights; + when (.*).* + is matched against abc the parenthesized subexpression + matches all three characters; and when + (a*)* is matched against bc + both the whole RE and the parenthesized + subexpression match an empty string. + + + + If case-independent matching is specified, + the effect is much as if all case distinctions had vanished from the + alphabet. + When an alphabetic that exists in multiple cases appears as an + ordinary character outside a bracket expression, it is effectively + transformed into a bracket expression containing both cases, + e.g., x becomes [xX]. + When it appears inside a bracket expression, all case counterparts + of it are added to the bracket expression, e.g., + [x] becomes [xX] + and [^x] becomes [^xX]. + + + + If newline-sensitive matching is specified, . + and bracket expressions using ^ + will never match the newline character + (so that matches will not cross lines unless the RE + explicitly includes a newline) + and ^ and $ + will match the empty string after and before a newline + respectively, in addition to matching at beginning and end of string + respectively. + But the ARE escapes \A and \Z + continue to match beginning or end of string only. + Also, the character class shorthands \D + and \W will match a newline regardless of this mode. + (Before PostgreSQL 14, they did not match + newlines when in newline-sensitive mode. + Write [^[:digit:]] + or [^[:word:]] to get the old behavior.) + + + + If partial newline-sensitive matching is specified, + this affects . and bracket expressions + as with newline-sensitive matching, but not ^ + and $. + + + + If inverse partial newline-sensitive matching is specified, + this affects ^ and $ + as with newline-sensitive matching, but not . + and bracket expressions. + This isn't very useful but is provided for symmetry. + + + + + Limits and Compatibility + + + No particular limit is imposed on the length of REs in this + implementation. However, + programs intended to be highly portable should not employ REs longer + than 256 bytes, + as a POSIX-compliant implementation can refuse to accept such REs. + + + + The only feature of AREs that is actually incompatible with + POSIX EREs is that \ does not lose its special + significance inside bracket expressions. 
+ All other ARE features use syntax which is illegal or has + undefined or unspecified effects in POSIX EREs; + the *** syntax of directors likewise is outside the POSIX + syntax for both BREs and EREs. + + + + Many of the ARE extensions are borrowed from Perl, but some have + been changed to clean them up, and a few Perl extensions are not present. + Incompatibilities of note include \b, \B, + the lack of special treatment for a trailing newline, + the addition of complemented bracket expressions to the things + affected by newline-sensitive matching, + the restrictions on parentheses and back references in lookahead/lookbehind + constraints, and the longest/shortest-match (rather than first-match) + matching semantics. + + + + + Basic Regular Expressions + + + BREs differ from EREs in several respects. + In BREs, |, +, and ? + are ordinary characters and there is no equivalent + for their functionality. + The delimiters for bounds are + \{ and \}, + with { and } + by themselves ordinary characters. + The parentheses for nested subexpressions are + \( and \), + with ( and ) by themselves ordinary characters. + ^ is an ordinary character except at the beginning of the + RE or the beginning of a parenthesized subexpression, + $ is an ordinary character except at the end of the + RE or the end of a parenthesized subexpression, + and * is an ordinary character if it appears at the beginning + of the RE or the beginning of a parenthesized subexpression + (after a possible leading ^). + Finally, single-digit back references are available, and + \< and \> + are synonyms for + [[:<:]] and [[:>:]] + respectively; no other escapes are available in BREs. + + + + + + + Differences from SQL Standard and XQuery + + + LIKE_REGEX + + + + OCCURRENCES_REGEX + + + + POSITION_REGEX + + + + SUBSTRING_REGEX + + + + TRANSLATE_REGEX + + + + XQuery regular expressions + + + + Since SQL:2008, the SQL standard includes regular expression operators + and functions that performs pattern + matching according to the XQuery regular expression + standard: + + LIKE_REGEX + OCCURRENCES_REGEX + POSITION_REGEX + SUBSTRING_REGEX + TRANSLATE_REGEX + + PostgreSQL does not currently implement these + operators and functions. You can get approximately equivalent + functionality in each case as shown in . (Various optional clauses on + both sides have been omitted in this table.) + + + + Regular Expression Functions Equivalencies + + + + + SQL standard + PostgreSQL + + + + + + string LIKE_REGEX pattern + regexp_like(string, pattern) or string ~ pattern + + + + OCCURRENCES_REGEX(pattern IN string) + regexp_count(string, pattern) + + + + POSITION_REGEX(pattern IN string) + regexp_instr(string, pattern) + + + + SUBSTRING_REGEX(pattern IN string) + regexp_substr(string, pattern) + + + + TRANSLATE_REGEX(pattern IN string WITH replacement) + regexp_replace(string, pattern, replacement) + + + +
+ + + Regular expression functions similar to those provided by PostgreSQL are + also available in a number of other SQL implementations, whereas the + SQL-standard functions are not as widely implemented. Some of the + details of the regular expression syntax will likely differ in each + implementation. + + + + The SQL-standard operators and functions use XQuery regular expressions, + which are quite close to the ARE syntax described above. + Notable differences between the existing POSIX-based + regular-expression feature and XQuery regular expressions include: + + + + + XQuery character class subtraction is not supported. An example of + this feature is using the following to match only English + consonants: [a-z-[aeiou]]. + + + + + XQuery character class shorthands \c, + \C, \i, + and \I are not supported. + + + + + XQuery character class elements + using \p{UnicodeProperty} or the + inverse \P{UnicodeProperty} are not supported. + + + + + POSIX interprets character classes such as \w + (see ) + according to the prevailing locale (which you can control by + attaching a COLLATE clause to the operator or + function). XQuery specifies these classes by reference to Unicode + character properties, so equivalent behavior is obtained only with + a locale that follows the Unicode rules. + + + + + The SQL standard (not XQuery itself) attempts to cater for more + variants of newline than POSIX does. The + newline-sensitive matching options described above consider only + ASCII NL (\n) to be a newline, but SQL would have + us treat CR (\r), CRLF (\r\n) + (a Windows-style newline), and some Unicode-only characters like + LINE SEPARATOR (U+2028) as newlines as well. + Notably, . and \s should + count \r\n as one character not two according to + SQL. + + + + + Of the character-entry escapes described in + , + XQuery supports only \n, \r, + and \t. + + + + + XQuery does not support + the [:name:] syntax + for character classes within bracket expressions. + + + + + XQuery does not have lookahead or lookbehind constraints, + nor any of the constraint escapes described in + . + + + + + The metasyntax forms described in + do not exist in XQuery. + + + + + The regular expression flag letters defined by XQuery are + related to but not the same as the option letters for POSIX + (). While the + i and q options behave the + same, others do not: + + + + XQuery's s (allow dot to match newline) + and m (allow ^ + and $ to match at newlines) flags provide + access to the same behaviors as + POSIX's n, p + and w flags, but they + do not match the behavior of + POSIX's s and m flags. + Note in particular that dot-matches-newline is the default + behavior in POSIX but not XQuery. + + + + + XQuery's x (ignore whitespace in pattern) flag + is noticeably different from POSIX's expanded-mode flag. + POSIX's x flag also + allows # to begin a comment in the pattern, + and POSIX will not ignore a whitespace character after a + backslash. + + + + + + + + +
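+
+ As a small illustration of the newline-sensitive matching mode
+ described above (not one of the original examples), the n
+ flag of the POSIX regexp functions makes ^ and
+ $ match at newlines:
+
+SELECT regexp_match(E'foo\nbar', '^bar$');
+Result: NULL (no match)
+SELECT regexp_match(E'foo\nbar', '^bar$', 'n');
+Result: {bar}
+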
+
+
diff --git a/doc/src/sgml/func/func-math.sgml b/doc/src/sgml/func/func-math.sgml new file mode 100644 index 0000000000000..9dcf97e7c9e06 --- /dev/null +++ b/doc/src/sgml/func/func-math.sgml @@ -0,0 +1,1616 @@ + + Mathematical Functions and Operators + + + Mathematical operators are provided for many + PostgreSQL types. For types without + standard mathematical conventions + (e.g., date/time types) we + describe the actual behavior in subsequent sections. + + + + shows the mathematical + operators that are available for the standard numeric types. + Unless otherwise noted, operators shown as + accepting numeric_type are available for all + the types smallint, integer, + bigint, numeric, real, + and double precision. + Operators shown as accepting integral_type + are available for the types smallint, integer, + and bigint. + Except where noted, each form of an operator returns the same data type + as its argument(s). Calls involving multiple argument data types, such + as integer + numeric, + are resolved by using the type appearing later in these lists. + + + + Mathematical Operators + + + + + + Operator + + + Description + + + Example(s) + + + + + + + + numeric_type + numeric_type + numeric_type + + + Addition + + + 2 + 3 + 5 + + + + + + + numeric_type + numeric_type + + + Unary plus (no operation) + + + + 3.5 + 3.5 + + + + + + numeric_type - numeric_type + numeric_type + + + Subtraction + + + 2 - 3 + -1 + + + + + + - numeric_type + numeric_type + + + Negation + + + - (-4) + 4 + + + + + + numeric_type * numeric_type + numeric_type + + + Multiplication + + + 2 * 3 + 6 + + + + + + numeric_type / numeric_type + numeric_type + + + Division (for integral types, division truncates the result towards + zero) + + + 5.0 / 2 + 2.5000000000000000 + + + 5 / 2 + 2 + + + (-5) / 2 + -2 + + + + + + numeric_type % numeric_type + numeric_type + + + Modulo (remainder); available for smallint, + integer, bigint, and numeric + + + 5 % 4 + 1 + + + + + + numeric ^ numeric + numeric + + + double precision ^ double precision + double precision + + + Exponentiation + + + 2 ^ 3 + 8 + + + Unlike typical mathematical practice, multiple uses of + ^ will associate left to right by default: + + + 2 ^ 3 ^ 3 + 512 + + + 2 ^ (3 ^ 3) + 134217728 + + + + + + |/ double precision + double precision + + + Square root + + + |/ 25.0 + 5 + + + + + + ||/ double precision + double precision + + + Cube root + + + ||/ 64.0 + 4 + + + + + + @ numeric_type + numeric_type + + + Absolute value + + + @ -5.0 + 5.0 + + + + + + integral_type & integral_type + integral_type + + + Bitwise AND + + + 91 & 15 + 11 + + + + + + integral_type | integral_type + integral_type + + + Bitwise OR + + + 32 | 3 + 35 + + + + + + integral_type # integral_type + integral_type + + + Bitwise exclusive OR + + + 17 # 5 + 20 + + + + + + ~ integral_type + integral_type + + + Bitwise NOT + + + ~1 + -2 + + + + + + integral_type << integer + integral_type + + + Bitwise shift left + + + 1 << 4 + 16 + + + + + + integral_type >> integer + integral_type + + + Bitwise shift right + + + 8 >> 2 + 2 + + + + + +
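+
+ To make the integer division and modulo rules above concrete (an
+ illustrative sketch, not one of the official examples): for integral
+ types the quotient truncates towards zero and the remainder takes the
+ sign of the dividend, so that (x / y) * y + x % y = x:
+
+SELECT (-7) / 2;
+Result: -3
+SELECT (-7) % 2;
+Result: -1
+SELECT ((-7) / 2) * 2 + (-7) % 2;
+Result: -7
+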
+ + + shows the available + mathematical functions. + Many of these functions are provided in multiple forms with different + argument types. + Except where noted, any given form of a function returns the same + data type as its argument(s); cross-type cases are resolved in the + same way as explained above for operators. + The functions working with double precision data are mostly + implemented on top of the host system's C library; accuracy and behavior in + boundary cases can therefore vary depending on the host system. + + + + Mathematical Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + abs + + abs ( numeric_type ) + numeric_type + + + Absolute value + + + abs(-17.4) + 17.4 + + + + + + + cbrt + + cbrt ( double precision ) + double precision + + + Cube root + + + cbrt(64.0) + 4 + + + + + + + ceil + + ceil ( numeric ) + numeric + + + ceil ( double precision ) + double precision + + + Nearest integer greater than or equal to argument + + + ceil(42.2) + 43 + + + ceil(-42.8) + -42 + + + + + + + ceiling + + ceiling ( numeric ) + numeric + + + ceiling ( double precision ) + double precision + + + Nearest integer greater than or equal to argument (same + as ceil) + + + ceiling(95.3) + 96 + + + + + + + degrees + + degrees ( double precision ) + double precision + + + Converts radians to degrees + + + degrees(0.5) + 28.64788975654116 + + + + + + + div + + div ( y numeric, + x numeric ) + numeric + + + Integer quotient of y/x + (truncates towards zero) + + + div(9, 4) + 2 + + + + + + + erf + + erf ( double precision ) + double precision + + + Error function + + + erf(1.0) + 0.8427007929497149 + + + + + + + erfc + + erfc ( double precision ) + double precision + + + Complementary error function (1 - erf(x), without + loss of precision for large inputs) + + + erfc(1.0) + 0.15729920705028513 + + + + + + + exp + + exp ( numeric ) + numeric + + + exp ( double precision ) + double precision + + + Exponential (e raised to the given power) + + + exp(1.0) + 2.7182818284590452 + + + + + + + factorial + + factorial ( bigint ) + numeric + + + Factorial + + + factorial(5) + 120 + + + + + + + floor + + floor ( numeric ) + numeric + + + floor ( double precision ) + double precision + + + Nearest integer less than or equal to argument + + + floor(42.8) + 42 + + + floor(-42.8) + -43 + + + + + + + gamma + + gamma ( double precision ) + double precision + + + Gamma function + + + gamma(0.5) + 1.772453850905516 + + + gamma(6) + 120 + + + + + + + gcd + + gcd ( numeric_type, numeric_type ) + numeric_type + + + Greatest common divisor (the largest positive number that divides both + inputs with no remainder); returns 0 if both inputs + are zero; available for integer, bigint, + and numeric + + + gcd(1071, 462) + 21 + + + + + + + lcm + + lcm ( numeric_type, numeric_type ) + numeric_type + + + Least common multiple (the smallest strictly positive number that is + an integral multiple of both inputs); returns 0 if + either input is zero; available for integer, + bigint, and numeric + + + lcm(1071, 462) + 23562 + + + + + + + lgamma + + lgamma ( double precision ) + double precision + + + Natural logarithm of the absolute value of the gamma function + + + lgamma(1000) + 5905.220423209181 + + + + + + + ln + + ln ( numeric ) + numeric + + + ln ( double precision ) + double precision + + + Natural logarithm + + + ln(2.0) + 0.6931471805599453 + + + + + + + log + + log ( numeric ) + numeric + + + log ( double precision ) + double precision + + + Base 10 logarithm + + + log(100) + 2 + + + + + + 
+ log10 + + log10 ( numeric ) + numeric + + + log10 ( double precision ) + double precision + + + Base 10 logarithm (same as log) + + + log10(1000) + 3 + + + + + + log ( b numeric, + x numeric ) + numeric + + + Logarithm of x to base b + + + log(2.0, 64.0) + 6.0000000000000000 + + + + + + + min_scale + + min_scale ( numeric ) + integer + + + Minimum scale (number of fractional decimal digits) needed + to represent the supplied value precisely + + + min_scale(8.4100) + 2 + + + + + + + mod + + mod ( y numeric_type, + x numeric_type ) + numeric_type + + + Remainder of y/x; + available for smallint, integer, + bigint, and numeric + + + mod(9, 4) + 1 + + + + + + + pi + + pi ( ) + double precision + + + Approximate value of π + + + pi() + 3.141592653589793 + + + + + + + power + + power ( a numeric, + b numeric ) + numeric + + + power ( a double precision, + b double precision ) + double precision + + + a raised to the power of b + + + power(9, 3) + 729 + + + + + + + radians + + radians ( double precision ) + double precision + + + Converts degrees to radians + + + radians(45.0) + 0.7853981633974483 + + + + + + + round + + round ( numeric ) + numeric + + + round ( double precision ) + double precision + + + Rounds to nearest integer. For numeric, ties are + broken by rounding away from zero. For double precision, + the tie-breaking behavior is platform dependent, but + round to nearest even is the most common rule. + + + round(42.4) + 42 + + + + + + round ( v numeric, s integer ) + numeric + + + Rounds v to s decimal + places. Ties are broken by rounding away from zero. + + + round(42.4382, 2) + 42.44 + + + round(1234.56, -1) + 1230 + + + + + + + scale + + scale ( numeric ) + integer + + + Scale of the argument (the number of decimal digits in the fractional part) + + + scale(8.4100) + 4 + + + + + + + sign + + sign ( numeric ) + numeric + + + sign ( double precision ) + double precision + + + Sign of the argument (-1, 0, or +1) + + + sign(-8.4) + -1 + + + + + + + sqrt + + sqrt ( numeric ) + numeric + + + sqrt ( double precision ) + double precision + + + Square root + + + sqrt(2) + 1.4142135623730951 + + + + + + + trim_scale + + trim_scale ( numeric ) + numeric + + + Reduces the value's scale (number of fractional decimal digits) by + removing trailing zeroes + + + trim_scale(8.4100) + 8.41 + + + + + + + trunc + + trunc ( numeric ) + numeric + + + trunc ( double precision ) + double precision + + + Truncates to integer (towards zero) + + + trunc(42.8) + 42 + + + trunc(-42.8) + -42 + + + + + + trunc ( v numeric, s integer ) + numeric + + + Truncates v to s + decimal places + + + trunc(42.4382, 2) + 42.43 + + + + + + + width_bucket + + width_bucket ( operand numeric, low numeric, high numeric, count integer ) + integer + + + width_bucket ( operand double precision, low double precision, high double precision, count integer ) + integer + + + Returns the number of the bucket in + which operand falls in a histogram + having count equal-width buckets spanning the + range low to high. + The buckets have inclusive lower bounds and exclusive upper bounds. + Returns 0 for an input less + than low, + or count+1 for an input + greater than or equal to high. + If low > high, + the behavior is mirror-reversed, with bucket 1 + now being the one just below low, and the + inclusive bounds now being on the upper side. 
+ + + width_bucket(5.35, 0.024, 10.06, 5) + 3 + + + width_bucket(9, 10, 0, 10) + 2 + + + + + + width_bucket ( operand anycompatible, thresholds anycompatiblearray ) + integer + + + Returns the number of the bucket in + which operand falls given an array listing the + inclusive lower bounds of the buckets. + Returns 0 for an input less than the first lower + bound. operand and the array elements can be + of any type having standard comparison operators. + The thresholds array must be + sorted, smallest first, or unexpected results will be + obtained. + + + width_bucket(now(), array['yesterday', 'today', 'tomorrow']::timestamptz[]) + 2 + + + + +
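+
+ As an illustrative sketch of width_bucket in a
+ histogram-style query (the inline VALUES list stands
+ in for a real table):
+
+SELECT width_bucket(v, 0, 100, 10) AS bucket, count(*)
+FROM (VALUES (5), (37), (42), (99)) AS data(v)
+GROUP BY bucket ORDER BY bucket;
+Result: buckets 1, 4, 5, and 10, each with a count of 1
+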
+ + + shows functions for + generating random numbers. + + + + Random Functions + + + + + + Function + + + Description + + + Example(s) + + + + + + + + + random + + random ( ) + double precision + + + Returns a random value in the range 0.0 <= x < 1.0 + + + random() + 0.897124072839091 + + + + + + + random + + random ( min integer, max integer ) + integer + + + random ( min bigint, max bigint ) + bigint + + + random ( min numeric, max numeric ) + numeric + + + Returns a random value in the range + min <= x <= max. + For type numeric, the result will have the same number of + fractional decimal digits as min or + max, whichever has more. + + + random(1, 10) + 7 + + + random(-0.499, 0.499) + 0.347 + + + + + + + random_normal + + + random_normal ( + mean double precision + , stddev double precision ) + double precision + + + Returns a random value from the normal distribution with the given + parameters; mean defaults to 0.0 + and stddev defaults to 1.0 + + + random_normal(0.0, 1.0) + 0.051285419 + + + + + + + setseed + + setseed ( double precision ) + void + + + Sets the seed for subsequent random() and + random_normal() calls; + argument must be between -1.0 and 1.0, inclusive + + + setseed(0.12345) + + + + +
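+
+ A minimal sketch of how setseed makes the series
+ of random() results repeatable within a session
+ (actual values are omitted, since they depend on the generator):
+
+SELECT setseed(0.42);
+SELECT random();   -- some value r1
+SELECT random();   -- some value r2
+SELECT setseed(0.42);
+SELECT random();   -- returns r1 again
+SELECT random();   -- returns r2 again
+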
+ + + The random() and random_normal() + functions listed in and + use a + deterministic pseudo-random number generator. + It is fast but not suitable for cryptographic + applications; see the module for a more + secure alternative. + If setseed() is called, the series of results of + subsequent calls to these functions in the current session + can be repeated by re-issuing setseed() with the same + argument. + Without any prior setseed() call in the same + session, the first call to any of these functions obtains a seed + from a platform-dependent source of random bits. + + + + shows the + available trigonometric functions. Each of these functions comes in + two variants, one that measures angles in radians and one that + measures angles in degrees. + + + + Trigonometric Functions + + + + + + Function + + + Description + + + Example(s) + + + + + + + + + acos + + acos ( double precision ) + double precision + + + Inverse cosine, result in radians + + + acos(1) + 0 + + + + + + + acosd + + acosd ( double precision ) + double precision + + + Inverse cosine, result in degrees + + + acosd(0.5) + 60 + + + + + + + asin + + asin ( double precision ) + double precision + + + Inverse sine, result in radians + + + asin(1) + 1.5707963267948966 + + + + + + + asind + + asind ( double precision ) + double precision + + + Inverse sine, result in degrees + + + asind(0.5) + 30 + + + + + + + atan + + atan ( double precision ) + double precision + + + Inverse tangent, result in radians + + + atan(1) + 0.7853981633974483 + + + + + + + atand + + atand ( double precision ) + double precision + + + Inverse tangent, result in degrees + + + atand(1) + 45 + + + + + + + atan2 + + atan2 ( y double precision, + x double precision ) + double precision + + + Inverse tangent of + y/x, + result in radians + + + atan2(1, 0) + 1.5707963267948966 + + + + + + + atan2d + + atan2d ( y double precision, + x double precision ) + double precision + + + Inverse tangent of + y/x, + result in degrees + + + atan2d(1, 0) + 90 + + + + + + + cos + + cos ( double precision ) + double precision + + + Cosine, argument in radians + + + cos(0) + 1 + + + + + + + cosd + + cosd ( double precision ) + double precision + + + Cosine, argument in degrees + + + cosd(60) + 0.5 + + + + + + + cot + + cot ( double precision ) + double precision + + + Cotangent, argument in radians + + + cot(0.5) + 1.830487721712452 + + + + + + + cotd + + cotd ( double precision ) + double precision + + + Cotangent, argument in degrees + + + cotd(45) + 1 + + + + + + + sin + + sin ( double precision ) + double precision + + + Sine, argument in radians + + + sin(1) + 0.8414709848078965 + + + + + + + sind + + sind ( double precision ) + double precision + + + Sine, argument in degrees + + + sind(30) + 0.5 + + + + + + + tan + + tan ( double precision ) + double precision + + + Tangent, argument in radians + + + tan(1) + 1.5574077246549023 + + + + + + + tand + + tand ( double precision ) + double precision + + + Tangent, argument in degrees + + + tand(45) + 1 + + + + +
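+
+ The practical difference between the radian-based and degree-based
+ functions shows up in special cases; this comparison (output as
+ observed on a typical platform) motivates the tip below:
+
+SELECT sin(radians(30));
+Result: 0.49999999999999994
+SELECT sind(30);
+Result: 0.5
+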
+ + + + Another way to work with angles measured in degrees is to use the unit + transformation functions radians() + and degrees() shown earlier. + However, using the degree-based trigonometric functions is preferred, + as that way avoids round-off error for special cases such + as sind(30). + + + + + shows the + available hyperbolic functions. + + + + Hyperbolic Functions + + + + + + Function + + + Description + + + Example(s) + + + + + + + + + sinh + + sinh ( double precision ) + double precision + + + Hyperbolic sine + + + sinh(1) + 1.1752011936438014 + + + + + + + cosh + + cosh ( double precision ) + double precision + + + Hyperbolic cosine + + + cosh(0) + 1 + + + + + + + tanh + + tanh ( double precision ) + double precision + + + Hyperbolic tangent + + + tanh(1) + 0.7615941559557649 + + + + + + + asinh + + asinh ( double precision ) + double precision + + + Inverse hyperbolic sine + + + asinh(1) + 0.881373587019543 + + + + + + + acosh + + acosh ( double precision ) + double precision + + + Inverse hyperbolic cosine + + + acosh(1) + 0 + + + + + + + atanh + + atanh ( double precision ) + double precision + + + Inverse hyperbolic tangent + + + atanh(0.5) + 0.5493061443340548 + + + + +
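+
+ As a quick sketch, the identity cosh(x)^2 - sinh(x)^2 = 1 can be
+ checked directly, up to floating-point round-off:
+
+SELECT cosh(2.0)^2 - sinh(2.0)^2;
+Result: 1 (approximately; the last digits depend on round-off)
+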
+ +
diff --git a/doc/src/sgml/func/func-merge-support.sgml b/doc/src/sgml/func/func-merge-support.sgml new file mode 100644 index 0000000000000..7f084271c13ae --- /dev/null +++ b/doc/src/sgml/func/func-merge-support.sgml @@ -0,0 +1,78 @@ + + Merge Support Functions + + + MERGE + RETURNING + + + + PostgreSQL includes one merge support function + that may be used in the RETURNING list of a + command to identify the action taken for each + row; see . + + + + Merge Support Functions + + + + + + Function + + + Description + + + + + + + + + merge_action + + merge_action ( ) + text + + + Returns the merge action command executed for the current row. This + will be 'INSERT', 'UPDATE', or + 'DELETE'. + + + + +
+ + + Example:
+
+MERGE INTO products p
+  USING stock s ON p.product_id = s.product_id
+  WHEN MATCHED AND s.quantity > 0 THEN
+    UPDATE SET in_stock = true, quantity = s.quantity
+  WHEN MATCHED THEN
+    UPDATE SET in_stock = false, quantity = 0
+  WHEN NOT MATCHED THEN
+    INSERT (product_id, in_stock, quantity)
+      VALUES (s.product_id, true, s.quantity)
+  RETURNING merge_action(), p.*;
+
+ merge_action | product_id | in_stock | quantity
+--------------+------------+----------+----------
+ UPDATE       | 1001       | t        |       50
+ UPDATE       | 1002       | f        |        0
+ INSERT       | 1003       | t        |       10
+
+
+ Note that this function can only be used in the RETURNING
+ list of a MERGE command. It is an error to use it in any
+ other part of a query.
+
diff --git a/doc/src/sgml/func/func-net.sgml b/doc/src/sgml/func/func-net.sgml new file mode 100644 index 0000000000000..1361a44c19767 --- /dev/null +++ b/doc/src/sgml/func/func-net.sgml @@ -0,0 +1,592 @@ + + Network Address Functions and Operators + + + The IP network address types, cidr and inet, + support the usual comparison operators shown in + + as well as the specialized operators and functions shown in + and + . + + + + Any cidr value can be cast to inet implicitly; + therefore, the operators and functions shown below as operating on + inet also work on cidr values. (Where there are + separate functions for inet and cidr, it is + because the behavior should be different for the two cases.) + Also, it is permitted to cast an inet value + to cidr. When this is done, any bits to the right of the + netmask are silently zeroed to create a valid cidr value. + + + + IP Address Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + inet << inet + boolean + + + Is subnet strictly contained by subnet? + This operator, and the next four, test for subnet inclusion. They + consider only the network parts of the two addresses (ignoring any + bits to the right of the netmasks) and determine whether one network + is identical to or a subnet of the other. + + + inet '192.168.1.5' << inet '192.168.1/24' + t + + + inet '192.168.0.5' << inet '192.168.1/24' + f + + + inet '192.168.1/24' << inet '192.168.1/24' + f + + + + + + inet <<= inet + boolean + + + Is subnet contained by or equal to subnet? + + + inet '192.168.1/24' <<= inet '192.168.1/24' + t + + + + + + inet >> inet + boolean + + + Does subnet strictly contain subnet? + + + inet '192.168.1/24' >> inet '192.168.1.5' + t + + + + + + inet >>= inet + boolean + + + Does subnet contain or equal subnet? + + + inet '192.168.1/24' >>= inet '192.168.1/24' + t + + + + + + inet && inet + boolean + + + Does either subnet contain or equal the other? + + + inet '192.168.1/24' && inet '192.168.1.80/28' + t + + + inet '192.168.1/24' && inet '192.168.2.0/28' + f + + + + + + ~ inet + inet + + + Computes bitwise NOT. + + + ~ inet '192.168.1.6' + 63.87.254.249 + + + + + + inet & inet + inet + + + Computes bitwise AND. + + + inet '192.168.1.6' & inet '0.0.0.255' + 0.0.0.6 + + + + + + inet | inet + inet + + + Computes bitwise OR. + + + inet '192.168.1.6' | inet '0.0.0.255' + 192.168.1.255 + + + + + + inet + bigint + inet + + + Adds an offset to an address. + + + inet '192.168.1.6' + 25 + 192.168.1.31 + + + + + + bigint + inet + inet + + + Adds an offset to an address. + + + 200 + inet '::ffff:fff0:1' + ::ffff:255.240.0.201 + + + + + + inet - bigint + inet + + + Subtracts an offset from an address. + + + inet '192.168.1.43' - 36 + 192.168.1.7 + + + + + + inet - inet + bigint + + + Computes the difference of two addresses. + + + inet '192.168.1.43' - inet '192.168.1.19' + 24 + + + inet '::1' - inet '::ffff:1' + -4294901760 + + + + +
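+
+ To illustrate the inet-to-cidr cast
+ described above (a sketch, not an official example), note how address
+ bits to the right of the netmask are zeroed:
+
+SELECT (inet '192.168.1.5/24')::cidr;
+Result: 192.168.1.0/24
+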
+ + + IP Address Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + abbrev + + abbrev ( inet ) + text + + + Creates an abbreviated display format as text. + (The result is the same as the inet output function + produces; it is abbreviated only in comparison to the + result of an explicit cast to text, which for historical + reasons will never suppress the netmask part.) + + + abbrev(inet '10.1.0.0/32') + 10.1.0.0 + + + + + + abbrev ( cidr ) + text + + + Creates an abbreviated display format as text. + (The abbreviation consists of dropping all-zero octets to the right + of the netmask; more examples are in + .) + + + abbrev(cidr '10.1.0.0/16') + 10.1/16 + + + + + + + broadcast + + broadcast ( inet ) + inet + + + Computes the broadcast address for the address's network. + + + broadcast(inet '192.168.1.5/24') + 192.168.1.255/24 + + + + + + + family + + family ( inet ) + integer + + + Returns the address's family: 4 for IPv4, + 6 for IPv6. + + + family(inet '::1') + 6 + + + + + + + host + + host ( inet ) + text + + + Returns the IP address as text, ignoring the netmask. + + + host(inet '192.168.1.0/24') + 192.168.1.0 + + + + + + + hostmask + + hostmask ( inet ) + inet + + + Computes the host mask for the address's network. + + + hostmask(inet '192.168.23.20/30') + 0.0.0.3 + + + + + + + inet_merge + + inet_merge ( inet, inet ) + cidr + + + Computes the smallest network that includes both of the given networks. + + + inet_merge(inet '192.168.1.5/24', inet '192.168.2.5/24') + 192.168.0.0/22 + + + + + + + inet_same_family + + inet_same_family ( inet, inet ) + boolean + + + Tests whether the addresses belong to the same IP family. + + + inet_same_family(inet '192.168.1.5/24', inet '::1') + f + + + + + + + masklen + + masklen ( inet ) + integer + + + Returns the netmask length in bits. + + + masklen(inet '192.168.1.5/24') + 24 + + + + + + + netmask + + netmask ( inet ) + inet + + + Computes the network mask for the address's network. + + + netmask(inet '192.168.1.5/24') + 255.255.255.0 + + + + + + + network + + network ( inet ) + cidr + + + Returns the network part of the address, zeroing out + whatever is to the right of the netmask. + (This is equivalent to casting the value to cidr.) + + + network(inet '192.168.1.5/24') + 192.168.1.0/24 + + + + + + + set_masklen + + set_masklen ( inet, integer ) + inet + + + Sets the netmask length for an inet value. + The address part does not change. + + + set_masklen(inet '192.168.1.5/24', 16) + 192.168.1.5/16 + + + + + + set_masklen ( cidr, integer ) + cidr + + + Sets the netmask length for a cidr value. + Address bits to the right of the new netmask are set to zero. + + + set_masklen(cidr '192.168.1.0/24', 16) + 192.168.0.0/16 + + + + + + + text + + text ( inet ) + text + + + Returns the unabbreviated IP address and netmask length as text. + (This has the same result as an explicit cast to text.) + + + text(inet '192.168.1.5') + 192.168.1.5/32 + + + + +
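+
+ A typical use of these functions together with the subnet-inclusion
+ operators, sketched against a hypothetical connlog
+ table with an inet column
+ client_addr:
+
+SELECT host(client_addr) AS client, count(*)
+FROM connlog
+WHERE client_addr <<= inet '10.0.0.0/8'
+GROUP BY client;
+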
+ + + + The abbrev, host, + and text functions are primarily intended to offer + alternative display formats for IP addresses. + + + + + The MAC address types, macaddr and macaddr8, + support the usual comparison operators shown in + + as well as the specialized functions shown in + . + In addition, they support the bitwise logical operators + ~, & and | + (NOT, AND and OR), just as shown above for IP addresses. + + + + MAC Address Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + trunc + + trunc ( macaddr ) + macaddr + + + Sets the last 3 bytes of the address to zero. The remaining prefix + can be associated with a particular manufacturer (using data not + included in PostgreSQL). + + + trunc(macaddr '12:34:56:78:90:ab') + 12:34:56:00:00:00 + + + + + + trunc ( macaddr8 ) + macaddr8 + + + Sets the last 5 bytes of the address to zero. The remaining prefix + can be associated with a particular manufacturer (using data not + included in PostgreSQL). + + + trunc(macaddr8 '12:34:56:78:90:ab:cd:ef') + 12:34:56:00:00:00:00:00 + + + + + + + macaddr8_set7bit + + macaddr8_set7bit ( macaddr8 ) + macaddr8 + + + Sets the 7th bit of the address to one, creating what is known as + modified EUI-64, for inclusion in an IPv6 address. + + + macaddr8_set7bit(macaddr8 '00:34:56:ab:cd:ef') + 02:34:56:ff:fe:ab:cd:ef + + + + +
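+
+ One common use of trunc on MAC addresses is
+ grouping devices by manufacturer prefix; a sketch against a
+ hypothetical devices table with a
+ macaddr column mac:
+
+SELECT trunc(mac) AS vendor_prefix, count(*)
+FROM devices
+GROUP BY vendor_prefix;
+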
+ +
diff --git a/doc/src/sgml/func/func-range.sgml b/doc/src/sgml/func/func-range.sgml new file mode 100644 index 0000000000000..3c5a34796a1d6 --- /dev/null +++ b/doc/src/sgml/func/func-range.sgml @@ -0,0 +1,1095 @@ + + Range/Multirange Functions and Operators + + + See for an overview of range types. + + + + shows the specialized operators + available for range types. + shows the specialized operators + available for multirange types. + In addition to those, the usual comparison operators shown in + are available for range + and multirange types. The comparison operators order first by the range lower + bounds, and only if those are equal do they compare the upper bounds. The + multirange operators compare each range until one is unequal. This + does not usually result in a useful overall ordering, but the operators are + provided to allow unique indexes to be constructed on ranges. + + + + Range Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + anyrange @> anyrange + boolean + + + Does the first range contain the second? + + + int4range(2,4) @> int4range(2,3) + t + + + + + + anyrange @> anyelement + boolean + + + Does the range contain the element? + + + '[2011-01-01,2011-03-01)'::tsrange @> '2011-01-10'::timestamp + t + + + + + + anyrange <@ anyrange + boolean + + + Is the first range contained by the second? + + + int4range(2,4) <@ int4range(1,7) + t + + + + + + anyelement <@ anyrange + boolean + + + Is the element contained in the range? + + + 42 <@ int4range(1,7) + f + + + + + + anyrange && anyrange + boolean + + + Do the ranges overlap, that is, have any elements in common? + + + int8range(3,7) && int8range(4,12) + t + + + + + + anyrange << anyrange + boolean + + + Is the first range strictly left of the second? + + + int8range(1,10) << int8range(100,110) + t + + + + + + anyrange >> anyrange + boolean + + + Is the first range strictly right of the second? + + + int8range(50,60) >> int8range(20,30) + t + + + + + + anyrange &< anyrange + boolean + + + Does the first range not extend to the right of the second? + + + int8range(1,20) &< int8range(18,20) + t + + + + + + anyrange &> anyrange + boolean + + + Does the first range not extend to the left of the second? + + + int8range(7,20) &> int8range(5,10) + t + + + + + + anyrange -|- anyrange + boolean + + + Are the ranges adjacent? + + + numrange(1.1,2.2) -|- numrange(2.2,3.3) + t + + + + + + anyrange + anyrange + anyrange + + + Computes the union of the ranges. The ranges must overlap or be + adjacent, so that the union is a single range (but + see range_merge()). + + + numrange(5,15) + numrange(10,20) + [5,20) + + + + + + anyrange * anyrange + anyrange + + + Computes the intersection of the ranges. + + + int8range(5,15) * int8range(10,20) + [10,15) + + + + + + anyrange - anyrange + anyrange + + + Computes the difference of the ranges. The second range must not be + contained in the first in such a way that the difference would not be + a single range. + + + int8range(5,15) - int8range(10,20) + [5,10) + + + + +
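+
+ The restriction on the range difference operator can be seen
+ directly; a sketch (the exact error text may vary):
+
+SELECT int8range(5,20) - int8range(10,15);
+ERROR:  result of range difference would not be contiguous
+
+ Casting to multiranges, or using range_minus_multi,
+ avoids this, as discussed below.
+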
+ + + Multirange Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + anymultirange @> anymultirange + boolean + + + Does the first multirange contain the second? + + + '{[2,4)}'::int4multirange @> '{[2,3)}'::int4multirange + t + + + + + + anymultirange @> anyrange + boolean + + + Does the multirange contain the range? + + + '{[2,4)}'::int4multirange @> int4range(2,3) + t + + + + + + anymultirange @> anyelement + boolean + + + Does the multirange contain the element? + + + '{[2011-01-01,2011-03-01)}'::tsmultirange @> '2011-01-10'::timestamp + t + + + + + + anyrange @> anymultirange + boolean + + + Does the range contain the multirange? + + + '[2,4)'::int4range @> '{[2,3)}'::int4multirange + t + + + + + + anymultirange <@ anymultirange + boolean + + + Is the first multirange contained by the second? + + + '{[2,4)}'::int4multirange <@ '{[1,7)}'::int4multirange + t + + + + + + anymultirange <@ anyrange + boolean + + + Is the multirange contained by the range? + + + '{[2,4)}'::int4multirange <@ int4range(1,7) + t + + + + + + anyrange <@ anymultirange + boolean + + + Is the range contained by the multirange? + + + int4range(2,4) <@ '{[1,7)}'::int4multirange + t + + + + + + anyelement <@ anymultirange + boolean + + + Is the element contained by the multirange? + + + 4 <@ '{[1,7)}'::int4multirange + t + + + + + + anymultirange && anymultirange + boolean + + + Do the multiranges overlap, that is, have any elements in common? + + + '{[3,7)}'::int8multirange && '{[4,12)}'::int8multirange + t + + + + + + anymultirange && anyrange + boolean + + + Does the multirange overlap the range? + + + '{[3,7)}'::int8multirange && int8range(4,12) + t + + + + + + anyrange && anymultirange + boolean + + + Does the range overlap the multirange? + + + int8range(3,7) && '{[4,12)}'::int8multirange + t + + + + + + anymultirange << anymultirange + boolean + + + Is the first multirange strictly left of the second? + + + '{[1,10)}'::int8multirange << '{[100,110)}'::int8multirange + t + + + + + + anymultirange << anyrange + boolean + + + Is the multirange strictly left of the range? + + + '{[1,10)}'::int8multirange << int8range(100,110) + t + + + + + + anyrange << anymultirange + boolean + + + Is the range strictly left of the multirange? + + + int8range(1,10) << '{[100,110)}'::int8multirange + t + + + + + + anymultirange >> anymultirange + boolean + + + Is the first multirange strictly right of the second? + + + '{[50,60)}'::int8multirange >> '{[20,30)}'::int8multirange + t + + + + + + anymultirange >> anyrange + boolean + + + Is the multirange strictly right of the range? + + + '{[50,60)}'::int8multirange >> int8range(20,30) + t + + + + + + anyrange >> anymultirange + boolean + + + Is the range strictly right of the multirange? + + + int8range(50,60) >> '{[20,30)}'::int8multirange + t + + + + + + anymultirange &< anymultirange + boolean + + + Does the first multirange not extend to the right of the second? + + + '{[1,20)}'::int8multirange &< '{[18,20)}'::int8multirange + t + + + + + + anymultirange &< anyrange + boolean + + + Does the multirange not extend to the right of the range? + + + '{[1,20)}'::int8multirange &< int8range(18,20) + t + + + + + + anyrange &< anymultirange + boolean + + + Does the range not extend to the right of the multirange? + + + int8range(1,20) &< '{[18,20)}'::int8multirange + t + + + + + + anymultirange &> anymultirange + boolean + + + Does the first multirange not extend to the left of the second? 
+ + + '{[7,20)}'::int8multirange &> '{[5,10)}'::int8multirange + t + + + + + + anymultirange &> anyrange + boolean + + + Does the multirange not extend to the left of the range? + + + '{[7,20)}'::int8multirange &> int8range(5,10) + t + + + + + + anyrange &> anymultirange + boolean + + + Does the range not extend to the left of the multirange? + + + int8range(7,20) &> '{[5,10)}'::int8multirange + t + + + + + + anymultirange -|- anymultirange + boolean + + + Are the multiranges adjacent? + + + '{[1.1,2.2)}'::nummultirange -|- '{[2.2,3.3)}'::nummultirange + t + + + + + + anymultirange -|- anyrange + boolean + + + Is the multirange adjacent to the range? + + + '{[1.1,2.2)}'::nummultirange -|- numrange(2.2,3.3) + t + + + + + + anyrange -|- anymultirange + boolean + + + Is the range adjacent to the multirange? + + + numrange(1.1,2.2) -|- '{[2.2,3.3)}'::nummultirange + t + + + + + + anymultirange + anymultirange + anymultirange + + + Computes the union of the multiranges. The multiranges need not overlap + or be adjacent. + + + '{[5,10)}'::nummultirange + '{[15,20)}'::nummultirange + {[5,10), [15,20)} + + + + + + anymultirange * anymultirange + anymultirange + + + Computes the intersection of the multiranges. + + + '{[5,15)}'::int8multirange * '{[10,20)}'::int8multirange + {[10,15)} + + + + + + anymultirange - anymultirange + anymultirange + + + Computes the difference of the multiranges. + + + '{[5,20)}'::int8multirange - '{[10,15)}'::int8multirange + {[5,10), [15,20)} + + + + +
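+
+ As the notes below explain, converting ranges to multiranges lets
+ union and difference succeed even when the result is disjoint; an
+ illustrative sketch using the multirange
+ constructor function:
+
+SELECT multirange(int8range(5,20)) - multirange(int8range(10,15));
+Result: {[5,10), [15,20)}
+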
+ + + The left-of/right-of/adjacent operators always return false when an empty + range or multirange is involved; that is, an empty range is not considered to + be either before or after any other range. + + + + Elsewhere empty ranges and multiranges are treated as the additive identity: + anything unioned with an empty value is itself. Anything minus an empty + value is itself. An empty multirange has exactly the same points as an empty + range. Every range contains the empty range. Every multirange contains as many + empty ranges as you like. + + + + The range union and difference operators will fail if the resulting range would + need to contain two disjoint sub-ranges, as such a range cannot be + represented. There are separate operators for union and difference that take + multirange parameters and return a multirange, and they do not fail even if + their arguments are disjoint. So if you need a union or difference operation + for ranges that may be disjoint, you can avoid errors by first casting your + ranges to multiranges. + + + + shows the functions + available for use with range types. + shows the functions + available for use with multirange types. + + + + Range Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + lower + + lower ( anyrange ) + anyelement + + + Extracts the lower bound of the range (NULL if the + range is empty or has no lower bound). + + + lower(numrange(1.1,2.2)) + 1.1 + + + + + + + upper + + upper ( anyrange ) + anyelement + + + Extracts the upper bound of the range (NULL if the + range is empty or has no upper bound). + + + upper(numrange(1.1,2.2)) + 2.2 + + + + + + + isempty + + isempty ( anyrange ) + boolean + + + Is the range empty? + + + isempty(numrange(1.1,2.2)) + f + + + + + + + lower_inc + + lower_inc ( anyrange ) + boolean + + + Is the range's lower bound inclusive? + + + lower_inc(numrange(1.1,2.2)) + t + + + + + + + upper_inc + + upper_inc ( anyrange ) + boolean + + + Is the range's upper bound inclusive? + + + upper_inc(numrange(1.1,2.2)) + f + + + + + + + lower_inf + + lower_inf ( anyrange ) + boolean + + + Does the range have no lower bound? (A lower bound of + -Infinity returns false.) + + + lower_inf('(,)'::daterange) + t + + + + + + + upper_inf + + upper_inf ( anyrange ) + boolean + + + Does the range have no upper bound? (An upper bound of + Infinity returns false.) + + + upper_inf('(,)'::daterange) + t + + + + + + + range_merge + + range_merge ( anyrange, anyrange ) + anyrange + + + Computes the smallest range that includes both of the given ranges. + + + range_merge('[1,2)'::int4range, '[3,4)'::int4range) + [1,4) + + + + + + + range_minus_multi + + range_minus_multi ( anyrange, anyrange ) + setof anyrange + + + Returns the non-empty range(s) remaining after subtracting the second range from the first. + One row is returned for each range, so if the second range splits the first into two parts, + there will be two results. If the subtraction yields an empty range, no rows are returned. + + + range_minus_multi('[0,10)'::int4range, '[3,4)'::int4range) + + + [0,3) + [4,10) + + + + + +
+ + + Multirange Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + lower + + lower ( anymultirange ) + anyelement + + + Extracts the lower bound of the multirange (NULL if the + multirange is empty or has no lower bound). + + + lower('{[1.1,2.2)}'::nummultirange) + 1.1 + + + + + + + upper + + upper ( anymultirange ) + anyelement + + + Extracts the upper bound of the multirange (NULL if the + multirange is empty or has no upper bound). + + + upper('{[1.1,2.2)}'::nummultirange) + 2.2 + + + + + + + isempty + + isempty ( anymultirange ) + boolean + + + Is the multirange empty? + + + isempty('{[1.1,2.2)}'::nummultirange) + f + + + + + + + lower_inc + + lower_inc ( anymultirange ) + boolean + + + Is the multirange's lower bound inclusive? + + + lower_inc('{[1.1,2.2)}'::nummultirange) + t + + + + + + + upper_inc + + upper_inc ( anymultirange ) + boolean + + + Is the multirange's upper bound inclusive? + + + upper_inc('{[1.1,2.2)}'::nummultirange) + f + + + + + + + lower_inf + + lower_inf ( anymultirange ) + boolean + + + Does the multirange have no lower bound? (A lower bound of + -Infinity returns false.) + + + lower_inf('{(,)}'::datemultirange) + t + + + + + + + upper_inf + + upper_inf ( anymultirange ) + boolean + + + Does the multirange have no upper bound? (An upper bound of + Infinity returns false.) + + + upper_inf('{(,)}'::datemultirange) + t + + + + + + + range_merge + + range_merge ( anymultirange ) + anyrange + + + Computes the smallest range that includes the entire multirange. + + + range_merge('{[1,2), [3,4)}'::int4multirange) + [1,4) + + + + + + + multirange (function) + + multirange ( anyrange ) + anymultirange + + + Returns a multirange containing just the given range. + + + multirange('[1,2)'::int4range) + {[1,2)} + + + + + + + unnest + for multirange + + unnest ( anymultirange ) + setof anyrange + + + Expands a multirange into a set of ranges in ascending order. + + + unnest('{[1,2), [3,4)}'::int4multirange) + + + [1,2) + [3,4) + + + + + + + + multirange_minus_multi + + multirange_minus_multi ( anymultirange, anymultirange ) + setof anymultirange + + + Returns the non-empty multirange(s) remaining after subtracting the second multirange from the first. + If the subtraction yields an empty multirange, no rows are returned. + Two rows are never returned, because a single multirange can always accommodate any result. + + + multirange_minus_multi('{[0,10)}'::int4multirange, '{[3,4)}'::int4multirange) + {[0,3), [4,10)} + + + + +
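+
+ Since unnest expands a multirange in ascending
+ order, it combines naturally with ordinary aggregation; a sketch that
+ computes the total number of integers covered by a multirange:
+
+SELECT sum(upper(r) - lower(r)) AS covered
+FROM unnest('{[1,4), [10,12)}'::int4multirange) AS t(r);
+Result: 5
+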
+ + + The lower_inc, upper_inc, + lower_inf, and upper_inf + functions all return false for an empty range or multirange. + +
diff --git a/doc/src/sgml/func/func-sequence.sgml b/doc/src/sgml/func/func-sequence.sgml new file mode 100644 index 0000000000000..4a2a6dc9369d6 --- /dev/null +++ b/doc/src/sgml/func/func-sequence.sgml @@ -0,0 +1,223 @@ + + Sequence Manipulation Functions + + + sequence + + + + This section describes functions for operating on sequence + objects, also called sequence generators or just sequences. + Sequence objects are special single-row tables created with . + Sequence objects are commonly used to generate unique identifiers + for rows of a table. The sequence functions, listed in , provide simple, multiuser-safe + methods for obtaining successive sequence values from sequence + objects. + + + + Sequence Functions + + + + + Function + + + Description + + + + + + + + + nextval + + nextval ( regclass ) + bigint + + + Advances the sequence object to its next value and returns that value. + This is done atomically: even if multiple sessions + execute nextval concurrently, each will safely + receive a distinct sequence value. + If the sequence object has been created with default parameters, + successive nextval calls will return successive + values beginning with 1. Other behaviors can be obtained by using + appropriate parameters in the + command. + + + This function requires USAGE + or UPDATE privilege on the sequence. + + + + + + + setval + + setval ( regclass, bigint , boolean ) + bigint + + + Sets the sequence object's current value, and optionally + its is_called flag. The two-parameter + form sets the sequence's last_value field to the + specified value and sets its is_called field to + true, meaning that the next + nextval will advance the sequence before + returning a value. The value that will be reported + by currval is also set to the specified value. + In the three-parameter form, is_called can be set + to either true + or false. true has the same + effect as the two-parameter form. If it is set + to false, the next nextval + will return exactly the specified value, and sequence advancement + commences with the following nextval. + Furthermore, the value reported by currval is not + changed in this case. For example, + +SELECT setval('myseq', 42); Next nextval will return 43 +SELECT setval('myseq', 42, true); Same as above +SELECT setval('myseq', 42, false); Next nextval will return 42 + + The result returned by setval is just the value of its + second argument. + + + This function requires UPDATE privilege on the + sequence. + + + + + + + currval + + currval ( regclass ) + bigint + + + Returns the value most recently obtained + by nextval for this sequence in the current + session. (An error is reported if nextval has + never been called for this sequence in this session.) Because this is + returning a session-local value, it gives a predictable answer whether + or not other sessions have executed nextval since + the current session did. + + + This function requires USAGE + or SELECT privilege on the sequence. + + + + + + + lastval + + lastval () + bigint + + + Returns the value most recently returned by + nextval in the current session. This function is + identical to currval, except that instead + of taking the sequence name as an argument it refers to whichever + sequence nextval was most recently applied to + in the current session. It is an error to call + lastval if nextval + has not yet been called in the current session. + + + This function requires USAGE + or SELECT privilege on the last used sequence. 
+ + + + + + + pg_get_sequence_data + + pg_get_sequence_data ( regclass ) + record + ( last_value bigint, + is_called bool, + page_lsn pg_lsn ) + + + Returns information about the sequence. + last_value is the last sequence value + written to disk. If caching is used, this value can be greater than the + last value handed out from the sequence. + is_called indicates whether the sequence has + been used. page_lsn is the LSN corresponding + to the most recent WAL record that modified this sequence relation. + + + This function is primarily intended for internal use by pg_dump and by + logical replication to synchronize sequences. It requires + USAGE or SELECT privilege on the + sequence. + + + + +
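+
+ A sketch of inspecting a sequence's state with
+ pg_get_sequence_data (the sequence name
+ myseq and the output values shown are
+ illustrative):
+
+SELECT * FROM pg_get_sequence_data('myseq');
+ last_value | is_called | page_lsn
+------------+-----------+-----------
+        101 | t         | 0/1A2B3C4
+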
+ + + + To avoid blocking concurrent transactions that obtain numbers from + the same sequence, the value obtained by nextval + is not reclaimed for re-use if the calling transaction later aborts. + This means that transaction aborts or database crashes can result in + gaps in the sequence of assigned values. That can happen without a + transaction abort, too. For example an INSERT with + an ON CONFLICT clause will compute the to-be-inserted + tuple, including doing any required nextval + calls, before detecting any conflict that would cause it to follow + the ON CONFLICT rule instead. + Thus, PostgreSQL sequence + objects cannot be used to obtain gapless + sequences. + + + + Likewise, sequence state changes made by setval + are immediately visible to other transactions, and are not undone if + the calling transaction rolls back. + + + + If the database cluster crashes before committing a transaction + containing a nextval + or setval call, the sequence state change might + not have made its way to persistent storage, so that it is uncertain + whether the sequence will have its original or updated state after the + cluster restarts. This is harmless for usage of the sequence within + the database, since other effects of uncommitted transactions will not + be visible either. However, if you wish to use a sequence value for + persistent outside-the-database purposes, make sure that the + nextval call has been committed before doing so. + + + + + The sequence to be operated on by a sequence function is specified by + a regclass argument, which is simply the OID of the sequence in the + pg_class system catalog. You do not have to look up the + OID by hand, however, since the regclass data type's input + converter will do the work for you. See + for details. + +
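+
+ A short session sketch tying these functions together (the sequence
+ name serial is arbitrary; note that plain text is
+ accepted for the regclass argument):
+
+CREATE SEQUENCE serial START 101;
+SELECT nextval('serial');
+Result: 101
+SELECT currval('serial');
+Result: 101
+SELECT setval('serial', 200, false);
+Result: 200
+SELECT nextval('serial');
+Result: 200
+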
diff --git a/doc/src/sgml/func/func-srf.sgml b/doc/src/sgml/func/func-srf.sgml new file mode 100644 index 0000000000000..34a45971aadf5 --- /dev/null +++ b/doc/src/sgml/func/func-srf.sgml @@ -0,0 +1,306 @@ + + Set Returning Functions + + + set returning functions + functions + + + + This section describes functions that possibly return more than one row. + The most widely used functions in this class are series generating + functions, as detailed in and + . Other, more specialized + set-returning functions are described elsewhere in this manual. + See for ways to combine multiple + set-returning functions. + + + + Series Generating Functions + + + + + Function + + + Description + + + + + + + + + generate_series + + generate_series ( start integer, stop integer , step integer ) + setof integer + + + generate_series ( start bigint, stop bigint , step bigint ) + setof bigint + + + generate_series ( start numeric, stop numeric , step numeric ) + setof numeric + + + Generates a series of values from start + to stop, with a step size + of step. step + defaults to 1. + + + + + + generate_series ( start timestamp, stop timestamp, step interval ) + setof timestamp + + + generate_series ( start timestamp with time zone, stop timestamp with time zone, step interval , timezone text ) + setof timestamp with time zone + + + Generates a series of values from start + to stop, with a step size + of step. + In the timezone-aware form, times of day and daylight-savings + adjustments are computed according to the time zone named by + the timezone argument, or the current + setting if that is omitted. + + + + +
+ + + When step is positive, zero rows are returned if + start is greater than stop. + Conversely, when step is negative, zero rows are + returned if start is less than stop. + Zero rows are also returned if any input is NULL. + It is an error + for step to be zero. Some examples follow: + +SELECT * FROM generate_series(2,4); + generate_series +----------------- + 2 + 3 + 4 +(3 rows) + +SELECT * FROM generate_series(5,1,-2); + generate_series +----------------- + 5 + 3 + 1 +(3 rows) + +SELECT * FROM generate_series(4,3); + generate_series +----------------- +(0 rows) + +SELECT generate_series(1.1, 4, 1.3); + generate_series +----------------- + 1.1 + 2.4 + 3.7 +(3 rows) + +-- this example relies on the date-plus-integer operator: +SELECT current_date + s.a AS dates FROM generate_series(0,14,7) AS s(a); + dates +------------ + 2004-02-05 + 2004-02-12 + 2004-02-19 +(3 rows) + +SELECT * FROM generate_series('2008-03-01 00:00'::timestamp, + '2008-03-04 12:00', '10 hours'); + generate_series +--------------------- + 2008-03-01 00:00:00 + 2008-03-01 10:00:00 + 2008-03-01 20:00:00 + 2008-03-02 06:00:00 + 2008-03-02 16:00:00 + 2008-03-03 02:00:00 + 2008-03-03 12:00:00 + 2008-03-03 22:00:00 + 2008-03-04 08:00:00 +(9 rows) + +-- this example assumes that TimeZone is set to UTC; note the DST transition: +SELECT * FROM generate_series('2001-10-22 00:00 -04:00'::timestamptz, + '2001-11-01 00:00 -05:00'::timestamptz, + '1 day'::interval, 'America/New_York'); + generate_series +------------------------ + 2001-10-22 04:00:00+00 + 2001-10-23 04:00:00+00 + 2001-10-24 04:00:00+00 + 2001-10-25 04:00:00+00 + 2001-10-26 04:00:00+00 + 2001-10-27 04:00:00+00 + 2001-10-28 04:00:00+00 + 2001-10-29 05:00:00+00 + 2001-10-30 05:00:00+00 + 2001-10-31 05:00:00+00 + 2001-11-01 05:00:00+00 +(11 rows) + + + + + Subscript Generating Functions + + + + + Function + + + Description + + + + + + + + + generate_subscripts + + generate_subscripts ( array anyarray, dim integer ) + setof integer + + + Generates a series comprising the valid subscripts of + the dim'th dimension of the given array. + + + + + + generate_subscripts ( array anyarray, dim integer, reverse boolean ) + setof integer + + + Generates a series comprising the valid subscripts of + the dim'th dimension of the given array. + When reverse is true, returns the series in + reverse order. + + + + +
+ + + generate_subscripts is a convenience function that generates + the set of valid subscripts for the specified dimension of the given + array. + Zero rows are returned for arrays that do not have the requested dimension, + or if any input is NULL. + Some examples follow: + +-- basic usage: +SELECT generate_subscripts('{NULL,1,NULL,2}'::int[], 1) AS s; + s +--- + 1 + 2 + 3 + 4 +(4 rows) + +-- presenting an array, the subscript and the subscripted +-- value requires a subquery: +SELECT * FROM arrays; + a +-------------------- + {-1,-2} + {100,200,300} +(2 rows) + +SELECT a AS array, s AS subscript, a[s] AS value +FROM (SELECT generate_subscripts(a, 1) AS s, a FROM arrays) foo; + array | subscript | value +---------------+-----------+------- + {-1,-2} | 1 | -1 + {-1,-2} | 2 | -2 + {100,200,300} | 1 | 100 + {100,200,300} | 2 | 200 + {100,200,300} | 3 | 300 +(5 rows) + +-- unnest a 2D array: +CREATE OR REPLACE FUNCTION unnest2(anyarray) +RETURNS SETOF anyelement AS $$ +SELECT $1[i][j] + FROM generate_subscripts($1,1) g1(i), + generate_subscripts($1,2) g2(j); +$$ LANGUAGE sql IMMUTABLE; +CREATE FUNCTION +SELECT * FROM unnest2(ARRAY[[1,2],[3,4]]); + unnest2 +--------- + 1 + 2 + 3 + 4 +(4 rows) + + + + + ordinality + + + + When a function in the FROM clause is suffixed + by WITH ORDINALITY, a bigint column is + appended to the function's output column(s), which starts from 1 and + increments by 1 for each row of the function's output. + This is most useful in the case of set returning + functions such as unnest(). + + +-- set returning function WITH ORDINALITY: +SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n); + ls | n +-----------------+---- + pg_serial | 1 + pg_twophase | 2 + postmaster.opts | 3 + pg_notify | 4 + postgresql.conf | 5 + pg_tblspc | 6 + logfile | 7 + base | 8 + postmaster.pid | 9 + pg_ident.conf | 10 + global | 11 + pg_xact | 12 + pg_snapshots | 13 + pg_multixact | 14 + PG_VERSION | 15 + pg_wal | 16 + pg_hba.conf | 17 + pg_stat_tmp | 18 + pg_subtrans | 19 +(19 rows) + + + +
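+
+ A more compact illustration of WITH ORDINALITY,
+ using unnest (a sketch):
+
+SELECT * FROM unnest(ARRAY['a','b','c']) WITH ORDINALITY AS t(elem, n);
+ elem | n
+------+---
+ a    | 1
+ b    | 2
+ c    | 3
+(3 rows)
+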
diff --git a/doc/src/sgml/func/func-statistics.sgml b/doc/src/sgml/func/func-statistics.sgml new file mode 100644 index 0000000000000..22dee263cc2a0 --- /dev/null +++ b/doc/src/sgml/func/func-statistics.sgml @@ -0,0 +1,85 @@ + + Statistics Information Functions + + + function + statistics + + + + PostgreSQL provides a function to inspect complex + statistics defined using the CREATE STATISTICS command. + + + + Inspecting MCV Lists + + + pg_mcv_list_items + + + +pg_mcv_list_items ( pg_mcv_list ) setof record + + + + pg_mcv_list_items returns a set of records describing + all items stored in a multi-column MCV list. It + returns the following columns: + + + + + + Name + Type + Description + + + + + + index + integer + index of the item in the MCV list + + + values + text[] + values stored in the MCV item + + + nulls + boolean[] + flags identifying NULL values + + + frequency + double precision + frequency of this MCV item + + + base_frequency + double precision + base frequency of this MCV item + + + + + + + + The pg_mcv_list_items function can be used like this: + + +SELECT m.* FROM pg_statistic_ext join pg_statistic_ext_data on (oid = stxoid), + pg_mcv_list_items(stxdmcv) m WHERE stxname = 'stts'; + + + Values of the pg_mcv_list type can be obtained only from the + pg_statistic_ext_data.stxdmcv + column. + + + + diff --git a/doc/src/sgml/func/func-string.sgml b/doc/src/sgml/func/func-string.sgml new file mode 100644 index 0000000000000..7ad1436e5f82e --- /dev/null +++ b/doc/src/sgml/func/func-string.sgml @@ -0,0 +1,1827 @@ + + String Functions and Operators + + + This section describes functions and operators for examining and + manipulating string values. Strings in this context include values + of the types character, character varying, + and text. Except where noted, these functions and operators + are declared to accept and return type text. They will + interchangeably accept character varying arguments. + Values of type character will be converted + to text before the function or operator is applied, resulting + in stripping any trailing spaces in the character value. + + + + SQL defines some string functions that use + key words, rather than commas, to separate + arguments. Details are in + . + PostgreSQL also provides versions of these functions + that use the regular function invocation syntax + (see ). + + + + + The string concatenation operator (||) will accept + non-string input, so long as at least one input is of string type, as shown + in . For other cases, inserting an + explicit coercion to text can be used to have non-string input + accepted. + + + + + <acronym>SQL</acronym> String Functions and Operators + + + + + Function/Operator + + + Description + + + Example(s) + + + + + + + + + character string + concatenation + + text || text + text + + + Concatenates the two strings. + + + 'Post' || 'greSQL' + PostgreSQL + + + + + + text || anynonarray + text + + + anynonarray || text + text + + + Converts the non-string input to text, then concatenates the two + strings. (The non-string input cannot be of an array type, because + that would create ambiguity with the array || + operators. If you want to concatenate an array's text equivalent, + cast it to text explicitly.) + + + 'Value: ' || 42 + Value: 42 + + + + + + + btrim + + btrim ( string text + , characters text ) + text + + + Removes the longest string containing only characters + in characters (a space by default) + from the start and end of string. 
+ + + btrim('xyxtrimyyx', 'xyz') + trim + + + + + + + normalized + + + Unicode normalization + + text IS NOT form NORMALIZED + boolean + + + Checks whether the string is in the specified Unicode normalization + form. The optional form key word specifies the + form: NFC (the default), NFD, + NFKC, or NFKD. This expression can + only be used when the server encoding is UTF8. Note + that checking for normalization using this expression is often faster + than normalizing possibly already normalized strings. + + + U&'\0061\0308bc' IS NFD NORMALIZED + t + + + + + + + bit_length + + bit_length ( text ) + integer + + + Returns number of bits in the string (8 + times the octet_length). + + + bit_length('jose') + 32 + + + + + + + char_length + + + character string + length + + + length + of a character string + character string, length + + char_length ( text ) + integer + + + + character_length + + character_length ( text ) + integer + + + Returns number of characters in the string. + + + char_length('josé') + 4 + + + + + + + lower + + lower ( text ) + text + + + Converts the string to all lower case, according to the rules of the + database's locale. + + + lower('TOM') + tom + + + + + + + lpad + + lpad ( string text, + length integer + , fill text ) + text + + + Extends the string to length + length by prepending the characters + fill (a space by default). If the + string is already longer than + length then it is truncated (on the right). + + + lpad('hi', 5, 'xy') + xyxhi + + + + + + + ltrim + + ltrim ( string text + , characters text ) + text + + + Removes the longest string containing only characters in + characters (a space by default) from the start of + string. + + + ltrim('zzzytest', 'xyz') + test + + + + + + + normalize + + + Unicode normalization + + normalize ( text + , form ) + text + + + Converts the string to the specified Unicode + normalization form. The optional form key word + specifies the form: NFC (the default), + NFD, NFKC, or + NFKD. This function can only be used when the + server encoding is UTF8. + + + normalize(U&'\0061\0308bc', NFC) + U&'\00E4bc' + + + + + + + octet_length + + octet_length ( text ) + integer + + + Returns number of bytes in the string. + + + octet_length('josé') + 5 (if server encoding is UTF8) + + + + + + + octet_length + + octet_length ( character ) + integer + + + Returns number of bytes in the string. Since this version of the + function accepts type character directly, it will not + strip trailing spaces. + + + octet_length('abc '::character(4)) + 4 + + + + + + + overlay + + overlay ( string text PLACING newsubstring text FROM start integer FOR count integer ) + text + + + Replaces the substring of string that starts at + the start'th character and extends + for count characters + with newsubstring. + If count is omitted, it defaults to the length + of newsubstring. + + + overlay('Txxxxas' PLACING 'hom' FROM 2 FOR 4) + Thomas + + + + + + + position + + position ( substring text IN string text ) + integer + + + Returns first starting index of the specified + substring within + string, or zero if it's not present. + + + position('om' IN 'Thomas') + 3 + + + + + + + rpad + + rpad ( string text, + length integer + , fill text ) + text + + + Extends the string to length + length by appending the characters + fill (a space by default). If the + string is already longer than + length then it is truncated. 
+ + + rpad('hi', 5, 'xy') + hixyx + + + + + + + rtrim + + rtrim ( string text + , characters text ) + text + + + Removes the longest string containing only characters in + characters (a space by default) from the end of + string. + + + rtrim('testxxzx', 'xyz') + test + + + + + + + substring + + substring ( string text FROM start integer FOR count integer ) + text + + + Extracts the substring of string starting at + the start'th character if that is specified, + and stopping after count characters if that is + specified. Provide at least one of start + and count. + + + substring('Thomas' FROM 2 FOR 3) + hom + + + substring('Thomas' FROM 3) + omas + + + substring('Thomas' FOR 2) + Th + + + + + + substring ( string text FROM pattern text ) + text + + + Extracts the first substring matching POSIX regular expression; see + . + + + substring('Thomas' FROM '...$') + mas + + + + + + substring ( string text SIMILAR pattern text ESCAPE escape text ) + text + + + substring ( string text FROM pattern text FOR escape text ) + text + + + Extracts the first substring matching SQL regular expression; + see . The first form has + been specified since SQL:2003; the second form was only in SQL:1999 + and should be considered obsolete. + + + substring('Thomas' SIMILAR '%#"o_a#"_' ESCAPE '#') + oma + + + + + + + trim + + trim ( LEADING | TRAILING | BOTH + characters text FROM + string text ) + text + + + Removes the longest string containing only characters in + characters (a space by default) from the + start, end, or both ends (BOTH is the default) + of string. + + + trim(both 'xyz' from 'yxTomxx') + Tom + + + + + + trim ( LEADING | TRAILING | BOTH FROM + string text , + characters text ) + text + + + This is a non-standard syntax for trim(). + + + trim(both from 'yxTomxx', 'xyz') + Tom + + + + + + + unicode_assigned + + unicode_assigned ( text ) + boolean + + + Returns true if all characters in the string are + assigned Unicode codepoints; false otherwise. This + function can only be used when the server encoding is + UTF8. + + + + + + + upper + + upper ( text ) + text + + + Converts the string to all upper case, according to the rules of the + database's locale. + + + upper('tom') + TOM + + + + +
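+
+   As a brief sketch of the coercion note at the start of this section:
+   an array operand must be cast to text explicitly, since
+   a bare array would otherwise be taken as array concatenation:
+
+SELECT 'coords: ' || ARRAY[1,2]::text;
+   ?column?
+---------------
+ coords: {1,2}
+(1 row)
+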
+ + + Additional string manipulation functions and operators are available + and are listed in . (Some of + these are used internally to implement + the SQL-standard string functions listed in + .) + There are also pattern-matching operators, which are described in + , and operators for full-text + search, which are described in . + + + + Other String Functions and Operators + + + + + Function/Operator + + + Description + + + Example(s) + + + + + + + + + character string + prefix test + + text ^@ text + boolean + + + Returns true if the first string starts with the second string + (equivalent to the starts_with() function). + + + 'alphabet' ^@ 'alph' + t + + + + + + + ascii + + ascii ( text ) + integer + + + Returns the numeric code of the first character of the argument. + In UTF8 encoding, returns the Unicode code point + of the character. In other multibyte encodings, the argument must + be an ASCII character. + + + ascii('x') + 120 + + + + + + + chr + + chr ( integer ) + text + + + Returns the character with the given code. In UTF8 + encoding the argument is treated as a Unicode code point. In other + multibyte encodings the argument must designate + an ASCII character. chr(0) is + disallowed because text data types cannot store that character. + + + chr(65) + A + + + + + + + concat + + concat ( val1 "any" + , val2 "any" , ... ) + text + + + Concatenates the text representations of all the arguments. + NULL arguments are ignored. + + + concat('abcde', 2, NULL, 22) + abcde222 + + + + + + + concat_ws + + concat_ws ( sep text, + val1 "any" + , val2 "any" , ... ) + text + + + Concatenates all but the first argument, with separators. The first + argument is used as the separator string, and should not be NULL. + Other NULL arguments are ignored. + + + concat_ws(',', 'abcde', 2, NULL, 22) + abcde,2,22 + + + + + + + format + + format ( formatstr text + , formatarg "any" , ... ) + text + + + Formats arguments according to a format string; + see . + This function is similar to the C function sprintf. + + + format('Hello %s, %1$s', 'World') + Hello World, World + + + + + + + initcap + + initcap ( text ) + text + + + Converts the first letter of each word to upper case (or title case + if the letter is a digraph and locale is ICU or + builtin PG_UNICODE_FAST) + and the rest to lower case. When using the libc or + builtin locale provider, words are sequences of + alphanumeric characters separated by non-alphanumeric characters; + when using the ICU locale provider, words are separated according to + u_strToTitle ICU function. + + + This function is primarily used for convenient + display, and the specific result should not be relied upon because of + the differences between locale providers and between different + ICU versions. If specific word boundary rules are desired, + it is recommended to write a custom function. + + + initcap('hi THOMAS') + Hi Thomas + + + + + + + casefold + + casefold ( text ) + text + + + Performs case folding of the input string according to the collation. + Case folding is similar to case conversion, but the purpose of case + folding is to facilitate case-insensitive matching of strings, + whereas the purpose of case conversion is to convert to a particular + cased form. This function can only be used when the server encoding + is UTF8. + + + Ordinarily, case folding simply converts to lowercase, but there may + be exceptions depending on the collation. For instance, some + characters have more than two lowercase variants, or fold to uppercase. 
+ + + Case folding may change the length of the string. For instance, in + the PG_UNICODE_FAST collation, ß + (U+00DF) folds to ss. + + + casefold can be used for Unicode Default Caseless + Matching. It does not always preserve the normalized form of the + input string (see ). + + + The libc provider doesn't support case folding, so + casefold is identical to . + + + + + + + left + + left ( string text, + n integer ) + text + + + Returns first n characters in the + string, or when n is negative, returns + all but last |n| characters. + + + left('abcde', 2) + ab + + + + + + + length + + length ( text ) + integer + + + Returns the number of characters in the string. + + + length('jose') + 4 + + + + + + + md5 + + md5 ( text ) + text + + + Computes the MD5 hash of + the argument, with the result written in hexadecimal. + + + md5('abc') + 900150983cd24fb0&zwsp;d6963f7d28e17f72 + + + + + + + parse_ident + + parse_ident ( qualified_identifier text + , strict_mode boolean DEFAULT true ) + text[] + + + Splits qualified_identifier into an array of + identifiers, removing any quoting of individual identifiers. By + default, extra characters after the last identifier are considered an + error; but if the second parameter is false, then such + extra characters are ignored. (This behavior is useful for parsing + names for objects like functions.) Note that this function does not + truncate over-length identifiers. If you want truncation you can cast + the result to name[]. + + + parse_ident('"SomeSchema".someTable') + {SomeSchema,sometable} + + + + + + + pg_client_encoding + + pg_client_encoding ( ) + name + + + Returns current client encoding name. + + + pg_client_encoding() + UTF8 + + + + + + + quote_ident + + quote_ident ( text ) + text + + + Returns the given string suitably quoted to be used as an identifier + in an SQL statement string. + Quotes are added only if necessary (i.e., if the string contains + non-identifier characters or would be case-folded). + Embedded quotes are properly doubled. + See also . + + + quote_ident('Foo bar') + "Foo bar" + + + + + + + quote_literal + + quote_literal ( text ) + text + + + Returns the given string suitably quoted to be used as a string literal + in an SQL statement string. + Embedded single-quotes and backslashes are properly doubled. + Note that quote_literal returns null on null + input; if the argument might be null, + quote_nullable is often more suitable. + See also . + + + quote_literal(E'O\'Reilly') + 'O''Reilly' + + + + + + quote_literal ( anyelement ) + text + + + Converts the given value to text and then quotes it as a literal. + Embedded single-quotes and backslashes are properly doubled. + + + quote_literal(42.5) + '42.5' + + + + + + + quote_nullable + + quote_nullable ( text ) + text + + + Returns the given string suitably quoted to be used as a string literal + in an SQL statement string; or, if the argument + is null, returns NULL. + Embedded single-quotes and backslashes are properly doubled. + See also . + + + quote_nullable(NULL) + NULL + + + + + + quote_nullable ( anyelement ) + text + + + Converts the given value to text and then quotes it as a literal; + or, if the argument is null, returns NULL. + Embedded single-quotes and backslashes are properly doubled. + + + quote_nullable(42.5) + '42.5' + + + + + + + regexp_count + + regexp_count ( string text, pattern text + , start integer + , flags text ) + integer + + + Returns the number of times the POSIX regular + expression pattern matches in + the string; see + . 
+ + + regexp_count('123456789012', '\d\d\d', 2) + 3 + + + + + + + regexp_instr + + regexp_instr ( string text, pattern text + , start integer + , N integer + , endoption integer + , flags text + , subexpr integer ) + integer + + + Returns the position within string where + the N'th match of the POSIX regular + expression pattern occurs, or zero if there is + no such match; see . + + + regexp_instr('ABCDEF', 'c(.)(..)', 1, 1, 0, 'i') + 3 + + + regexp_instr('ABCDEF', 'c(.)(..)', 1, 1, 0, 'i', 2) + 5 + + + + + + + regexp_like + + regexp_like ( string text, pattern text + , flags text ) + boolean + + + Checks whether a match of the POSIX regular + expression pattern occurs + within string; see + . + + + regexp_like('Hello World', 'world$', 'i') + t + + + + + + + regexp_match + + regexp_match ( string text, pattern text , flags text ) + text[] + + + Returns substrings within the first match of the POSIX regular + expression pattern to + the string; see + . + + + regexp_match('foobarbequebaz', '(bar)(beque)') + {bar,beque} + + + + + + + regexp_matches + + regexp_matches ( string text, pattern text , flags text ) + setof text[] + + + Returns substrings within the first match of the POSIX regular + expression pattern to + the string, or substrings within all + such matches if the g flag is used; + see . + + + regexp_matches('foobarbequebaz', 'ba.', 'g') + + + {bar} + {baz} + + + + + + + + regexp_replace + + regexp_replace ( string text, pattern text, replacement text + , flags text ) + text + + + Replaces the substring that is the first match to the POSIX + regular expression pattern, or all such + matches if the g flag is used; see + . + + + regexp_replace('Thomas', '.[mN]a.', 'M') + ThM + + + + + + regexp_replace ( string text, pattern text, replacement text, + start integer + , N integer + , flags text ) + text + + + Replaces the substring that is the N'th + match to the POSIX regular expression pattern, + or all such matches if N is zero, with the + search beginning at the start'th character + of string. If N is + omitted, it defaults to 1. See + . + + + regexp_replace('Thomas', '.', 'X', 3, 2) + ThoXas + + + regexp_replace(string=>'hello world', pattern=>'l', replacement=>'XX', start=>1, "N"=>2) + helXXo world + + + + + + + regexp_split_to_array + + regexp_split_to_array ( string text, pattern text , flags text ) + text[] + + + Splits string using a POSIX regular + expression as the delimiter, producing an array of results; see + . + + + regexp_split_to_array('hello world', '\s+') + {hello,world} + + + + + + + regexp_split_to_table + + regexp_split_to_table ( string text, pattern text , flags text ) + setof text + + + Splits string using a POSIX regular + expression as the delimiter, producing a set of results; see + . + + + regexp_split_to_table('hello world', '\s+') + + + hello + world + + + + + + + + regexp_substr + + regexp_substr ( string text, pattern text + , start integer + , N integer + , flags text + , subexpr integer ) + text + + + Returns the substring within string that + matches the N'th occurrence of the POSIX + regular expression pattern, + or NULL if there is no such match; see + . + + + regexp_substr('ABCDEF', 'c(.)(..)', 1, 1, 'i') + CDEF + + + regexp_substr('ABCDEF', 'c(.)(..)', 1, 1, 'i', 2) + EF + + + + + + + repeat + + repeat ( string text, number integer ) + text + + + Repeats string the specified + number of times. 
+ + + repeat('Pg', 4) + PgPgPgPg + + + + + + + replace + + replace ( string text, + from text, + to text ) + text + + + Replaces all occurrences in string of + substring from with + substring to. + + + replace('abcdefabcdef', 'cd', 'XX') + abXXefabXXef + + + + + + + reverse + + reverse ( text ) + text + + + Reverses the order of the characters in the string. + + + reverse('abcde') + edcba + + + + + + + right + + right ( string text, + n integer ) + text + + + Returns last n characters in the string, + or when n is negative, returns all but + first |n| characters. + + + right('abcde', 2) + de + + + + + + + split_part + + split_part ( string text, + delimiter text, + n integer ) + text + + + Splits string at occurrences + of delimiter and returns + the n'th field (counting from one), + or when n is negative, returns + the |n|'th-from-last field. + + + split_part('abc~@~def~@~ghi', '~@~', 2) + def + + + split_part('abc,def,ghi,jkl', ',', -2) + ghi + + + + + + + starts_with + + starts_with ( string text, prefix text ) + boolean + + + Returns true if string starts + with prefix. + + + starts_with('alphabet', 'alph') + t + + + + + + + string_to_array + + string_to_array ( string text, delimiter text , null_string text ) + text[] + + + Splits the string at occurrences + of delimiter and forms the resulting fields + into a text array. + If delimiter is NULL, + each character in the string will become a + separate element in the array. + If delimiter is an empty string, then + the string is treated as a single field. + If null_string is supplied and is + not NULL, fields matching that string are + replaced by NULL. + See also array_to_string. + + + string_to_array('xx~~yy~~zz', '~~', 'yy') + {xx,NULL,zz} + + + + + + + string_to_table + + string_to_table ( string text, delimiter text , null_string text ) + setof text + + + Splits the string at occurrences + of delimiter and returns the resulting fields + as a set of text rows. + If delimiter is NULL, + each character in the string will become a + separate row of the result. + If delimiter is an empty string, then + the string is treated as a single field. + If null_string is supplied and is + not NULL, fields matching that string are + replaced by NULL. + + + string_to_table('xx~^~yy~^~zz', '~^~', 'yy') + + + xx + NULL + zz + + + + + + + + strpos + + strpos ( string text, substring text ) + integer + + + Returns first starting index of the specified substring + within string, or zero if it's not present. + (Same as position(substring in + string), but note the reversed + argument order.) + + + strpos('high', 'ig') + 2 + + + + + + + substr + + substr ( string text, start integer , count integer ) + text + + + Extracts the substring of string starting at + the start'th character, + and extending for count characters if that is + specified. (Same + as substring(string + from start + for count).) + + + substr('alphabet', 3) + phabet + + + substr('alphabet', 3, 2) + ph + + + + + + + to_ascii + + to_ascii ( string text ) + text + + + to_ascii ( string text, + encoding name ) + text + + + to_ascii ( string text, + encoding integer ) + text + + + Converts string to ASCII + from another encoding, which may be identified by name or number. + If encoding is omitted the database encoding + is assumed (which in practice is the only useful case). + The conversion consists primarily of dropping accents. + Conversion is only supported + from LATIN1, LATIN2, + LATIN9, and WIN1250 encodings. + (See the module for another, more flexible + solution.) 
+ + + to_ascii('Karél') + Karel + + + + + + + to_bin + + to_bin ( integer ) + text + + + to_bin ( bigint ) + text + + + Converts the number to its equivalent two's complement binary + representation. + + + to_bin(2147483647) + 1111111111111111111111111111111 + + + to_bin(-1234) + 11111111111111111111101100101110 + + + + + + + to_hex + + to_hex ( integer ) + text + + + to_hex ( bigint ) + text + + + Converts the number to its equivalent two's complement hexadecimal + representation. + + + to_hex(2147483647) + 7fffffff + + + to_hex(-1234) + fffffb2e + + + + + + + to_oct + + to_oct ( integer ) + text + + + to_oct ( bigint ) + text + + + Converts the number to its equivalent two's complement octal + representation. + + + to_oct(2147483647) + 17777777777 + + + to_oct(-1234) + 37777775456 + + + + + + + translate + + translate ( string text, + from text, + to text ) + text + + + Replaces each character in string that + matches a character in the from set with the + corresponding character in the to + set. If from is longer than + to, occurrences of the extra characters in + from are deleted. + + + translate('12345', '143', 'ax') + a2x5 + + + + + + + unistr + + unistr ( text ) + text + + + Evaluate escaped Unicode characters in the argument. Unicode characters + can be specified as + \XXXX (4 hexadecimal + digits), \+XXXXXX (6 + hexadecimal digits), + \uXXXX (4 hexadecimal + digits), or \UXXXXXXXX + (8 hexadecimal digits). To specify a backslash, write two + backslashes. All other characters are taken literally. + + + + If the server encoding is not UTF-8, the Unicode code point identified + by one of these escape sequences is converted to the actual server + encoding; an error is reported if that's not possible. + + + + This function provides a (non-standard) alternative to string + constants with Unicode escapes (see ). + + + + unistr('d\0061t\+000061') + data + + + unistr('d\u0061t\U00000061') + data + + + + + +
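+
+   A small combined sketch: string_to_table can be paired
+   with WITH ORDINALITY to number the fields of a delimited
+   string while mapping a chosen marker to NULL:
+
+SELECT n, field
+FROM string_to_table('one,two,N/A,four', ',', 'N/A') WITH ORDINALITY AS t(field, n);
+ n | field
+---+-------
+ 1 | one
+ 2 | two
+ 3 |
+ 4 | four
+(4 rows)
+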
+ + + The concat, concat_ws and + format functions are variadic, so it is possible to + pass the values to be concatenated or formatted as an array marked with + the VARIADIC keyword (see ). The array's elements are + treated as if they were separate ordinary arguments to the function. + If the variadic array argument is NULL, concat + and concat_ws return NULL, but + format treats a NULL as a zero-element array. + + + + See also the aggregate function string_agg in + , and the functions for + converting between strings and the bytea type in + . + + + + <function>format</function> + + + format + + + + The function format produces output formatted according to + a format string, in a style similar to the C function + sprintf. + + + + +format(formatstr text , formatarg "any" , ... ) + + formatstr is a format string that specifies how the + result should be formatted. Text in the format string is copied + directly to the result, except where format specifiers are + used. Format specifiers act as placeholders in the string, defining how + subsequent function arguments should be formatted and inserted into the + result. Each formatarg argument is converted to text + according to the usual output rules for its data type, and then formatted + and inserted into the result string according to the format specifier(s). + + + + Format specifiers are introduced by a % character and have + the form + +%[position][flags][width]type + + where the component fields are: + + + + position (optional) + + + A string of the form n$ where + n is the index of the argument to print. + Index 1 means the first argument after + formatstr. If the position is + omitted, the default is to use the next argument in sequence. + + + + + + flags (optional) + + + Additional options controlling how the format specifier's output is + formatted. Currently the only supported flag is a minus sign + (-) which will cause the format specifier's output to be + left-justified. This has no effect unless the width + field is also specified. + + + + + + width (optional) + + + Specifies the minimum number of characters to use to + display the format specifier's output. The output is padded on the + left or right (depending on the - flag) with spaces as + needed to fill the width. A too-small width does not cause + truncation of the output, but is simply ignored. The width may be + specified using any of the following: a positive integer; an + asterisk (*) to use the next function argument as the + width; or a string of the form *n$ to + use the nth function argument as the width. + + + + If the width comes from a function argument, that argument is + consumed before the argument that is used for the format specifier's + value. If the width argument is negative, the result is left + aligned (as if the - flag had been specified) within a + field of length abs(width). + + + + + + type (required) + + + The type of format conversion to use to produce the format + specifier's output. The following types are supported: + + + + s formats the argument value as a simple + string. A null value is treated as an empty string. + + + + + I treats the argument value as an SQL + identifier, double-quoting it if necessary. + It is an error for the value to be null (equivalent to + quote_ident). + + + + + L quotes the argument value as an SQL literal. + A null value is displayed as the string NULL, without + quotes (equivalent to quote_nullable). 
+ + + + + + + + + + + In addition to the format specifiers described above, the special sequence + %% may be used to output a literal % character. + + + + Here are some examples of the basic format conversions: + + +SELECT format('Hello %s', 'World'); +Result: Hello World + +SELECT format('Testing %s, %s, %s, %%', 'one', 'two', 'three'); +Result: Testing one, two, three, % + +SELECT format('INSERT INTO %I VALUES(%L)', 'Foo bar', E'O\'Reilly'); +Result: INSERT INTO "Foo bar" VALUES('O''Reilly') + +SELECT format('INSERT INTO %I VALUES(%L)', 'locations', 'C:\Program Files'); +Result: INSERT INTO locations VALUES('C:\Program Files') + + + + + Here are examples using width fields + and the - flag: + + +SELECT format('|%10s|', 'foo'); +Result: | foo| + +SELECT format('|%-10s|', 'foo'); +Result: |foo | + +SELECT format('|%*s|', 10, 'foo'); +Result: | foo| + +SELECT format('|%*s|', -10, 'foo'); +Result: |foo | + +SELECT format('|%-*s|', 10, 'foo'); +Result: |foo | + +SELECT format('|%-*s|', -10, 'foo'); +Result: |foo | + + + + + These examples show use of position fields: + + +SELECT format('Testing %3$s, %2$s, %1$s', 'one', 'two', 'three'); +Result: Testing three, two, one + +SELECT format('|%*2$s|', 'foo', 10, 'bar'); +Result: | bar| + +SELECT format('|%1$*2$s|', 'foo', 10, 'bar'); +Result: | foo| + + + + + Unlike the standard C function sprintf, + PostgreSQL's format function allows format + specifiers with and without position fields to be mixed + in the same format string. A format specifier without a + position field always uses the next argument after the + last argument consumed. + In addition, the format function does not require all + function arguments to be used in the format string. + For example: + + +SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three'); +Result: Testing three, two, three + + + + + The %I and %L format specifiers are particularly + useful for safely constructing dynamic SQL statements. See + . + + + +
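+
+   As noted earlier in this section, these functions are variadic; a
+   brief sketch of passing the arguments as an array marked with
+   VARIADIC:
+
+SELECT concat_ws(', ', VARIADIC ARRAY['a', 'b', 'c']);
+Result: a, b, c
+
+SELECT format('%s and %s', VARIADIC ARRAY['one', 'two']);
+Result: one and two
+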
diff --git a/doc/src/sgml/func/func-subquery.sgml b/doc/src/sgml/func/func-subquery.sgml new file mode 100644 index 0000000000000..a9f2b12e48c66 --- /dev/null +++ b/doc/src/sgml/func/func-subquery.sgml @@ -0,0 +1,349 @@ + + Subquery Expressions + + + EXISTS + + + + IN + + + + NOT IN + + + + ANY + + + + ALL + + + + SOME + + + + subquery + + + + This section describes the SQL-compliant subquery + expressions available in PostgreSQL. + All of the expression forms documented in this section return + Boolean (true/false) results. + + + + <literal>EXISTS</literal> + + +EXISTS (subquery) + + + + The argument of EXISTS is an arbitrary SELECT statement, + or subquery. The + subquery is evaluated to determine whether it returns any rows. + If it returns at least one row, the result of EXISTS is + true; if the subquery returns no rows, the result of EXISTS + is false. + + + + The subquery can refer to variables from the surrounding query, + which will act as constants during any one evaluation of the subquery. + + + + The subquery will generally only be executed long enough to determine + whether at least one row is returned, not all the way to completion. + It is unwise to write a subquery that has side effects (such as + calling sequence functions); whether the side effects occur + might be unpredictable. + + + + Since the result depends only on whether any rows are returned, + and not on the contents of those rows, the output list of the + subquery is normally unimportant. A common coding convention is + to write all EXISTS tests in the form + EXISTS(SELECT 1 WHERE ...). There are exceptions to + this rule however, such as subqueries that use INTERSECT. + + + + This simple example is like an inner join on col2, but + it produces at most one output row for each tab1 row, + even if there are several matching tab2 rows: + +SELECT col1 +FROM tab1 +WHERE EXISTS (SELECT 1 FROM tab2 WHERE col2 = tab1.col2); + + + + + + <literal>IN</literal> + + +expression IN (subquery) + + + + The right-hand side is a parenthesized + subquery, which must return exactly one column. The left-hand expression + is evaluated and compared to each row of the subquery result. + The result of IN is true if any equal subquery row is found. + The result is false if no equal row is found (including the + case where the subquery returns no rows). + + + + Note that if the left-hand expression yields null, or if there are + no equal right-hand values and at least one right-hand row yields + null, the result of the IN construct will be null, not false. + This is in accordance with SQL's normal rules for Boolean combinations + of null values. + + + + As with EXISTS, it's unwise to assume that the subquery will + be evaluated completely. + + + +row_constructor IN (subquery) + + + + The left-hand side of this form of IN is a row constructor, + as described in . + The right-hand side is a parenthesized + subquery, which must return exactly as many columns as there are + expressions in the left-hand row. The left-hand expressions are + evaluated and compared row-wise to each row of the subquery result. + The result of IN is true if any equal subquery row is found. + The result is false if no equal row is found (including the + case where the subquery returns no rows). + + + + As usual, null values in the rows are combined per + the normal rules of SQL Boolean expressions. 
Two rows are considered + equal if all their corresponding members are non-null and equal; the rows + are unequal if any corresponding members are non-null and unequal; + otherwise the result of that row comparison is unknown (null). + If all the per-row results are either unequal or null, with at least one + null, then the result of IN is null. + + + + + <literal>NOT IN</literal> + + +expression NOT IN (subquery) + + + + The right-hand side is a parenthesized + subquery, which must return exactly one column. The left-hand expression + is evaluated and compared to each row of the subquery result. + The result of NOT IN is true if only unequal subquery rows + are found (including the case where the subquery returns no rows). + The result is false if any equal row is found. + + + + Note that if the left-hand expression yields null, or if there are + no equal right-hand values and at least one right-hand row yields + null, the result of the NOT IN construct will be null, not true. + This is in accordance with SQL's normal rules for Boolean combinations + of null values. + + + + As with EXISTS, it's unwise to assume that the subquery will + be evaluated completely. + + + +row_constructor NOT IN (subquery) + + + + The left-hand side of this form of NOT IN is a row constructor, + as described in . + The right-hand side is a parenthesized + subquery, which must return exactly as many columns as there are + expressions in the left-hand row. The left-hand expressions are + evaluated and compared row-wise to each row of the subquery result. + The result of NOT IN is true if only unequal subquery rows + are found (including the case where the subquery returns no rows). + The result is false if any equal row is found. + + + + As usual, null values in the rows are combined per + the normal rules of SQL Boolean expressions. Two rows are considered + equal if all their corresponding members are non-null and equal; the rows + are unequal if any corresponding members are non-null and unequal; + otherwise the result of that row comparison is unknown (null). + If all the per-row results are either unequal or null, with at least one + null, then the result of NOT IN is null. + + + + + <literal>ANY</literal>/<literal>SOME</literal> + + +expression operator ANY (subquery) +expression operator SOME (subquery) + + + + The right-hand side is a parenthesized + subquery, which must return exactly one column. The left-hand expression + is evaluated and compared to each row of the subquery result using the + given operator, which must yield a Boolean + result. + The result of ANY is true if any true result is obtained. + The result is false if no true result is found (including the + case where the subquery returns no rows). + + + + SOME is a synonym for ANY. + IN is equivalent to = ANY. + + + + Note that if there are no successes and at least one right-hand row yields + null for the operator's result, the result of the ANY construct + will be null, not false. + This is in accordance with SQL's normal rules for Boolean combinations + of null values. + + + + As with EXISTS, it's unwise to assume that the subquery will + be evaluated completely. + + + +row_constructor operator ANY (subquery) +row_constructor operator SOME (subquery) + + + + The left-hand side of this form of ANY is a row constructor, + as described in . + The right-hand side is a parenthesized + subquery, which must return exactly as many columns as there are + expressions in the left-hand row. 
The left-hand expressions are + evaluated and compared row-wise to each row of the subquery result, + using the given operator. + The result of ANY is true if the comparison + returns true for any subquery row. + The result is false if the comparison returns false for every + subquery row (including the case where the subquery returns no + rows). + The result is NULL if no comparison with a subquery row returns true, + and at least one comparison returns NULL. + + + + See for details about the meaning + of a row constructor comparison. + + + + + <literal>ALL</literal> + + +expression operator ALL (subquery) + + + + The right-hand side is a parenthesized + subquery, which must return exactly one column. The left-hand expression + is evaluated and compared to each row of the subquery result using the + given operator, which must yield a Boolean + result. + The result of ALL is true if all rows yield true + (including the case where the subquery returns no rows). + The result is false if any false result is found. + The result is NULL if no comparison with a subquery row returns false, + and at least one comparison returns NULL. + + + + NOT IN is equivalent to <> ALL. + + + + As with EXISTS, it's unwise to assume that the subquery will + be evaluated completely. + + + +row_constructor operator ALL (subquery) + + + + The left-hand side of this form of ALL is a row constructor, + as described in . + The right-hand side is a parenthesized + subquery, which must return exactly as many columns as there are + expressions in the left-hand row. The left-hand expressions are + evaluated and compared row-wise to each row of the subquery result, + using the given operator. + The result of ALL is true if the comparison + returns true for all subquery rows (including the + case where the subquery returns no rows). + The result is false if the comparison returns false for any + subquery row. + The result is NULL if no comparison with a subquery row returns false, + and at least one comparison returns NULL. + + + + See for details about the meaning + of a row constructor comparison. + + + + + Single-Row Comparison + + + comparison + subquery result row + + + +row_constructor operator (subquery) + + + + The left-hand side is a row constructor, + as described in . + The right-hand side is a parenthesized subquery, which must return exactly + as many columns as there are expressions in the left-hand row. Furthermore, + the subquery cannot return more than one row. (If it returns zero rows, + the result is taken to be null.) The left-hand side is evaluated and + compared row-wise to the single subquery result row. + + + + See for details about the meaning + of a row constructor comparison. + + + diff --git a/doc/src/sgml/func/func-textsearch.sgml b/doc/src/sgml/func/func-textsearch.sgml new file mode 100644 index 0000000000000..290ad81d6979b --- /dev/null +++ b/doc/src/sgml/func/func-textsearch.sgml @@ -0,0 +1,1046 @@ + + Text Search Functions and Operators + + + full text search + functions and operators + + + + text search + functions and operators + + + + , + and + + summarize the functions and operators that are provided + for full text searching. See for a detailed + explanation of PostgreSQL's text search + facility. + + + + Text Search Operators + + + + + Operator + + + Description + + + Example(s) + + + + + + + + tsvector @@ tsquery + boolean + + + tsquery @@ tsvector + boolean + + + Does tsvector match tsquery? + (The arguments can be given in either order.) 
+ + + to_tsvector('fat cats ate rats') @@ to_tsquery('cat & rat') + t + + + + + + text @@ tsquery + boolean + + + Does text string, after implicit invocation + of to_tsvector(), match tsquery? + + + 'fat cats ate rats' @@ to_tsquery('cat & rat') + t + + + + + + tsvector || tsvector + tsvector + + + Concatenates two tsvectors. If both inputs contain + lexeme positions, the second input's positions are adjusted + accordingly. + + + 'a:1 b:2'::tsvector || 'c:1 d:2 b:3'::tsvector + 'a':1 'b':2,5 'c':3 'd':4 + + + + + + tsquery && tsquery + tsquery + + + ANDs two tsquerys together, producing a query that + matches documents that match both input queries. + + + 'fat | rat'::tsquery && 'cat'::tsquery + ( 'fat' | 'rat' ) & 'cat' + + + + + + tsquery || tsquery + tsquery + + + ORs two tsquerys together, producing a query that + matches documents that match either input query. + + + 'fat | rat'::tsquery || 'cat'::tsquery + 'fat' | 'rat' | 'cat' + + + + + + !! tsquery + tsquery + + + Negates a tsquery, producing a query that matches + documents that do not match the input query. + + + !! 'cat'::tsquery + !'cat' + + + + + + tsquery <-> tsquery + tsquery + + + Constructs a phrase query, which matches if the two input queries + match at successive lexemes. + + + to_tsquery('fat') <-> to_tsquery('rat') + 'fat' <-> 'rat' + + + + + + tsquery @> tsquery + boolean + + + Does first tsquery contain the second? (This considers + only whether all the lexemes appearing in one query appear in the + other, ignoring the combining operators.) + + + 'cat'::tsquery @> 'cat & rat'::tsquery + f + + + + + + tsquery <@ tsquery + boolean + + + Is first tsquery contained in the second? (This + considers only whether all the lexemes appearing in one query appear + in the other, ignoring the combining operators.) + + + 'cat'::tsquery <@ 'cat & rat'::tsquery + t + + + 'cat'::tsquery <@ '!cat & rat'::tsquery + t + + + + +
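+
+   As an illustrative sketch (the documents table and its
+   title and body columns are hypothetical),
+   these operators are typically combined with the conversion functions
+   described below to filter rows:
+
+SELECT title
+FROM documents
+WHERE to_tsvector('english', body) @@
+      websearch_to_tsquery('english', '"fat rat" or cat');
+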
+ + + In addition to these specialized operators, the usual comparison + operators shown in are + available for types tsvector and tsquery. + These are not very + useful for text searching but allow, for example, unique indexes to be + built on columns of these types. + + + + Text Search Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + array_to_tsvector + + array_to_tsvector ( text[] ) + tsvector + + + Converts an array of text strings to a tsvector. + The given strings are used as lexemes as-is, without further + processing. Array elements must not be empty strings + or NULL. + + + array_to_tsvector('{fat,cat,rat}'::text[]) + 'cat' 'fat' 'rat' + + + + + + + get_current_ts_config + + get_current_ts_config ( ) + regconfig + + + Returns the OID of the current default text search configuration + (as set by ). + + + get_current_ts_config() + english + + + + + + + length + + length ( tsvector ) + integer + + + Returns the number of lexemes in the tsvector. + + + length('fat:2,4 cat:3 rat:5A'::tsvector) + 3 + + + + + + + numnode + + numnode ( tsquery ) + integer + + + Returns the number of lexemes plus operators in + the tsquery. + + + numnode('(fat & rat) | cat'::tsquery) + 5 + + + + + + + plainto_tsquery + + plainto_tsquery ( + config regconfig, + query text ) + tsquery + + + Converts text to a tsquery, normalizing words according to + the specified or default configuration. Any punctuation in the string + is ignored (it does not determine query operators). The resulting + query matches documents containing all non-stopwords in the text. + + + plainto_tsquery('english', 'The Fat Rats') + 'fat' & 'rat' + + + + + + + phraseto_tsquery + + phraseto_tsquery ( + config regconfig, + query text ) + tsquery + + + Converts text to a tsquery, normalizing words according to + the specified or default configuration. Any punctuation in the string + is ignored (it does not determine query operators). The resulting + query matches phrases containing all non-stopwords in the text. + + + phraseto_tsquery('english', 'The Fat Rats') + 'fat' <-> 'rat' + + + phraseto_tsquery('english', 'The Cat and Rats') + 'cat' <2> 'rat' + + + + + + + websearch_to_tsquery + + websearch_to_tsquery ( + config regconfig, + query text ) + tsquery + + + Converts text to a tsquery, normalizing words according + to the specified or default configuration. Quoted word sequences are + converted to phrase tests. The word or is understood + as producing an OR operator, and a dash produces a NOT operator; + other punctuation is ignored. + This approximates the behavior of some common web search tools. + + + websearch_to_tsquery('english', '"fat rat" or cat dog') + 'fat' <-> 'rat' | 'cat' & 'dog' + + + + + + + querytree + + querytree ( tsquery ) + text + + + Produces a representation of the indexable portion of + a tsquery. A result that is empty or + just T indicates a non-indexable query. + + + querytree('foo & ! bar'::tsquery) + 'foo' + + + + + + + setweight + + setweight ( vector tsvector, weight "char" ) + tsvector + + + Assigns the specified weight to each element + of the vector. + + + setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A') + 'cat':3A 'fat':2A,4A 'rat':5A + + + + + + + setweight + setweight for specific lexeme(s) + + setweight ( vector tsvector, weight "char", lexemes text[] ) + tsvector + + + Assigns the specified weight to elements + of the vector that are listed + in lexemes. + The strings in lexemes are taken as lexemes + as-is, without further processing. 
Strings that do not match any + lexeme in vector are ignored. + + + setweight('fat:2,4 cat:3 rat:5,6B'::tsvector, 'A', '{cat,rat}') + 'cat':3A 'fat':2,4 'rat':5A,6A + + + + + + + strip + + strip ( tsvector ) + tsvector + + + Removes positions and weights from the tsvector. + + + strip('fat:2,4 cat:3 rat:5A'::tsvector) + 'cat' 'fat' 'rat' + + + + + + + to_tsquery + + to_tsquery ( + config regconfig, + query text ) + tsquery + + + Converts text to a tsquery, normalizing words according to + the specified or default configuration. The words must be combined + by valid tsquery operators. + + + to_tsquery('english', 'The & Fat & Rats') + 'fat' & 'rat' + + + + + + + to_tsvector + + to_tsvector ( + config regconfig, + document text ) + tsvector + + + Converts text to a tsvector, normalizing words according + to the specified or default configuration. Position information is + included in the result. + + + to_tsvector('english', 'The Fat Rats') + 'fat':2 'rat':3 + + + + + + to_tsvector ( + config regconfig, + document json ) + tsvector + + + to_tsvector ( + config regconfig, + document jsonb ) + tsvector + + + Converts each string value in the JSON document to + a tsvector, normalizing words according to the specified + or default configuration. The results are then concatenated in + document order to produce the output. Position information is + generated as though one stopword exists between each pair of string + values. (Beware that document order of the fields of a + JSON object is implementation-dependent when the input + is jsonb; observe the difference in the examples.) + + + to_tsvector('english', '{"aa": "The Fat Rats", "b": "dog"}'::json) + 'dog':5 'fat':2 'rat':3 + + + to_tsvector('english', '{"aa": "The Fat Rats", "b": "dog"}'::jsonb) + 'dog':1 'fat':4 'rat':5 + + + + + + + json_to_tsvector + + json_to_tsvector ( + config regconfig, + document json, + filter jsonb ) + tsvector + + + + jsonb_to_tsvector + + jsonb_to_tsvector ( + config regconfig, + document jsonb, + filter jsonb ) + tsvector + + + Selects each item in the JSON document that is requested by + the filter and converts each one to + a tsvector, normalizing words according to the specified + or default configuration. The results are then concatenated in + document order to produce the output. Position information is + generated as though one stopword exists between each pair of selected + items. (Beware that document order of the fields of a + JSON object is implementation-dependent when the input + is jsonb.) + The filter must be a jsonb + array containing zero or more of these keywords: + "string" (to include all string values), + "numeric" (to include all numeric values), + "boolean" (to include all boolean values), + "key" (to include all keys), or + "all" (to include all the above). + As a special case, the filter can also be a + simple JSON value that is one of these keywords. + + + json_to_tsvector('english', '{"a": "The Fat Rats", "b": 123}'::json, '["string", "numeric"]') + '123':5 'fat':2 'rat':3 + + + json_to_tsvector('english', '{"cat": "The Fat Rats", "dog": 123}'::json, '"all"') + '123':9 'cat':1 'dog':7 'fat':4 'rat':5 + + + + + + + ts_delete + + ts_delete ( vector tsvector, lexeme text ) + tsvector + + + Removes any occurrence of the given lexeme + from the vector. + The lexeme string is treated as a lexeme as-is, + without further processing. 
+ + + ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat') + 'cat':3 'rat':5A + + + + + + ts_delete ( vector tsvector, lexemes text[] ) + tsvector + + + Removes any occurrences of the lexemes + in lexemes + from the vector. + The strings in lexemes are taken as lexemes + as-is, without further processing. Strings that do not match any + lexeme in vector are ignored. + + + ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat']) + 'cat':3 + + + + + + + ts_filter + + ts_filter ( vector tsvector, weights "char"[] ) + tsvector + + + Selects only elements with the given weights + from the vector. + + + ts_filter('fat:2,4 cat:3b,7c rat:5A'::tsvector, '{a,b}') + 'cat':3B 'rat':5A + + + + + + + ts_headline + + ts_headline ( + config regconfig, + document text, + query tsquery + , options text ) + text + + + Displays, in an abbreviated form, the match(es) for + the query in + the document, which must be raw text not + a tsvector. Words in the document are normalized + according to the specified or default configuration before matching to + the query. Use of this function is discussed in + , which also describes the + available options. + + + ts_headline('The fat cat ate the rat.', 'cat') + The fat <b>cat</b> ate the rat. + + + + + + ts_headline ( + config regconfig, + document json, + query tsquery + , options text ) + text + + + ts_headline ( + config regconfig, + document jsonb, + query tsquery + , options text ) + text + + + Displays, in an abbreviated form, match(es) for + the query that occur in string values + within the JSON document. + See for more details. + + + ts_headline('{"cat":"raining cats and dogs"}'::jsonb, 'cat') + {"cat": "raining <b>cats</b> and dogs"} + + + + + + + ts_rank + + ts_rank ( + weights real[], + vector tsvector, + query tsquery + , normalization integer ) + real + + + Computes a score showing how well + the vector matches + the query. See + for details. + + + ts_rank(to_tsvector('raining cats and dogs'), 'cat') + 0.06079271 + + + + + + + ts_rank_cd + + ts_rank_cd ( + weights real[], + vector tsvector, + query tsquery + , normalization integer ) + real + + + Computes a score showing how well + the vector matches + the query, using a cover density + algorithm. See for details. + + + ts_rank_cd(to_tsvector('raining cats and dogs'), 'cat') + 0.1 + + + + + + + ts_rewrite + + ts_rewrite ( query tsquery, + target tsquery, + substitute tsquery ) + tsquery + + + Replaces occurrences of target + with substitute + within the query. + See for details. + + + ts_rewrite('a & b'::tsquery, 'a'::tsquery, 'foo|bar'::tsquery) + 'b' & ( 'foo' | 'bar' ) + + + + + + ts_rewrite ( query tsquery, + select text ) + tsquery + + + Replaces portions of the query according to + target(s) and substitute(s) obtained by executing + a SELECT command. + See for details. + + + SELECT ts_rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases') + 'b' & ( 'foo' | 'bar' ) + + + + + + + tsquery_phrase + + tsquery_phrase ( query1 tsquery, query2 tsquery ) + tsquery + + + Constructs a phrase query that searches + for matches of query1 + and query2 at successive lexemes (same + as <-> operator). + + + tsquery_phrase(to_tsquery('fat'), to_tsquery('cat')) + 'fat' <-> 'cat' + + + + + + tsquery_phrase ( query1 tsquery, query2 tsquery, distance integer ) + tsquery + + + Constructs a phrase query that searches + for matches of query1 and + query2 that occur exactly + distance lexemes apart. 
+ + + tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10) + 'fat' <10> 'cat' + + + + + + + tsvector_to_array + + tsvector_to_array ( tsvector ) + text[] + + + Converts a tsvector to an array of lexemes. + + + tsvector_to_array('fat:2,4 cat:3 rat:5A'::tsvector) + {cat,fat,rat} + + + + + + + unnest + for tsvector + + unnest ( tsvector ) + setof record + ( lexeme text, + positions smallint[], + weights text ) + + + Expands a tsvector into a set of rows, one per lexeme. + + + SELECT * FROM unnest('cat:3 fat:2,4 rat:5A'::tsvector) + + + lexeme | positions | weights +--------+-----------+--------- + cat | {3} | {D} + fat | {2,4} | {D,D} + rat | {5} | {A} + + + + + +
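+
+   A brief sketch combining several of these functions (the
+   apod table and its body column are
+   hypothetical): rank the matching rows and show highlighted excerpts:
+
+SELECT ts_rank(to_tsvector('english', body), query) AS rank,
+       ts_headline('english', body, query) AS excerpt
+FROM apod, websearch_to_tsquery('english', 'supernova') AS query
+WHERE to_tsvector('english', body) @@ query
+ORDER BY rank DESC
+LIMIT 10;
+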
+ + + + All the text search functions that accept an optional regconfig + argument will use the configuration specified by + + when that argument is omitted. + + + + + The functions in + + are listed separately because they are not usually used in everyday text + searching operations. They are primarily helpful for development and + debugging of new text search configurations. + + + + Text Search Debugging Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + + ts_debug + + ts_debug ( + config regconfig, + document text ) + setof record + ( alias text, + description text, + token text, + dictionaries regdictionary[], + dictionary regdictionary, + lexemes text[] ) + + + Extracts and normalizes tokens from + the document according to the specified or + default text search configuration, and returns information about how + each token was processed. + See for details. + + + ts_debug('english', 'The Brightest supernovaes') + (asciiword,"Word, all ASCII",The,{english_stem},english_stem,{}) ... + + + + + + + ts_lexize + + ts_lexize ( dict regdictionary, token text ) + text[] + + + Returns an array of replacement lexemes if the input token is known to + the dictionary, or an empty array if the token is known to the + dictionary but it is a stop word, or NULL if it is not a known word. + See for details. + + + ts_lexize('english_stem', 'stars') + {star} + + + + + + + ts_parse + + ts_parse ( parser_name text, + document text ) + setof record + ( tokid integer, + token text ) + + + Extracts tokens from the document using the + named parser. + See for details. + + + ts_parse('default', 'foo - bar') + (1,foo) ... + + + + + + ts_parse ( parser_oid oid, + document text ) + setof record + ( tokid integer, + token text ) + + + Extracts tokens from the document using a + parser specified by OID. + See for details. + + + ts_parse(3722, 'foo - bar') + (1,foo) ... + + + + + + + ts_token_type + + ts_token_type ( parser_name text ) + setof record + ( tokid integer, + alias text, + description text ) + + + Returns a table that describes each type of token the named parser can + recognize. + See for details. + + + ts_token_type('default') + (1,asciiword,"Word, all ASCII") ... + + + + + + ts_token_type ( parser_oid oid ) + setof record + ( tokid integer, + alias text, + description text ) + + + Returns a table that describes each type of token a parser specified + by OID can recognize. + See for details. + + + ts_token_type(3722) + (1,asciiword,"Word, all ASCII") ... + + + + + + + ts_stat + + ts_stat ( sqlquery text + , weights text ) + setof record + ( word text, + ndoc integer, + nentry integer ) + + + Executes the sqlquery, which must return a + single tsvector column, and returns statistics about each + distinct lexeme contained in the data. + See for details. + + + ts_stat('SELECT vector FROM apod') + (foo,10,15) ... + + + + +
+ +
diff --git a/doc/src/sgml/func/func-trigger.sgml b/doc/src/sgml/func/func-trigger.sgml new file mode 100644 index 0000000000000..94b40adbdb84a --- /dev/null +++ b/doc/src/sgml/func/func-trigger.sgml @@ -0,0 +1,135 @@ + + Trigger Functions + + + While many uses of triggers involve user-written trigger functions, + PostgreSQL provides a few built-in trigger + functions that can be used directly in user-defined triggers. These + are summarized in . + (Additional built-in trigger functions exist, which implement foreign + key constraints and deferred index constraints. Those are not documented + here since users need not use them directly.) + + + + For more information about creating triggers, see + . + + + + Built-In Trigger Functions + + + + + Function + + + Description + + + Example Usage + + + + + + + + + suppress_redundant_updates_trigger + + suppress_redundant_updates_trigger ( ) + trigger + + + Suppresses do-nothing update operations. See below for details. + + + CREATE TRIGGER ... suppress_redundant_updates_trigger() + + + + + + + tsvector_update_trigger + + tsvector_update_trigger ( ) + trigger + + + Automatically updates a tsvector column from associated + plain-text document column(s). The text search configuration to use + is specified by name as a trigger argument. See + for details. + + + CREATE TRIGGER ... tsvector_update_trigger(tsvcol, 'pg_catalog.swedish', title, body) + + + + + + + tsvector_update_trigger_column + + tsvector_update_trigger_column ( ) + trigger + + + Automatically updates a tsvector column from associated + plain-text document column(s). The text search configuration to use + is taken from a regconfig column of the table. See + for details. + + + CREATE TRIGGER ... tsvector_update_trigger_column(tsvcol, tsconfigcol, title, body) + + + + +
+ + + The suppress_redundant_updates_trigger function, + when applied as a row-level BEFORE UPDATE trigger, + will prevent any update that does not actually change the data in the + row from taking place. This overrides the normal behavior which always + performs a physical row update + regardless of whether or not the data has changed. (This normal behavior + makes updates run faster, since no checking is required, and is also + useful in certain cases.) + + + + Ideally, you should avoid running updates that don't actually + change the data in the record. Redundant updates can cost considerable + unnecessary time, especially if there are lots of indexes to alter, + and space in dead rows that will eventually have to be vacuumed. + However, detecting such situations in client code is not + always easy, or even possible, and writing expressions to detect + them can be error-prone. An alternative is to use + suppress_redundant_updates_trigger, which will skip + updates that don't change the data. You should use this with care, + however. The trigger takes a small but non-trivial time for each record, + so if most of the records affected by updates do actually change, + use of this trigger will make updates run slower on average. + + + + The suppress_redundant_updates_trigger function can be + added to a table like this: + +CREATE TRIGGER z_min_update +BEFORE UPDATE ON tablename +FOR EACH ROW EXECUTE FUNCTION suppress_redundant_updates_trigger(); + + In most cases, you need to fire this trigger last for each row, so that + it does not override other triggers that might wish to alter the row. + Bearing in mind that triggers fire in name order, you would therefore + choose a trigger name that comes after the name of any other trigger + you might have on the table. (Hence the z prefix in the + example.) + +
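+
+   Similarly, a minimal sketch of attaching
+   tsvector_update_trigger (the messages
+   table here is hypothetical):
+
+CREATE TABLE messages (title text, body text, tsv tsvector);
+
+CREATE TRIGGER tsvectorupdate BEFORE INSERT OR UPDATE
+ON messages FOR EACH ROW EXECUTE FUNCTION
+tsvector_update_trigger(tsv, 'pg_catalog.english', title, body);
+
+   Thereafter, every INSERT or UPDATE on
+   messages recomputes tsv from
+   title and body automatically.
+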
diff --git a/doc/src/sgml/func/func-uuid.sgml b/doc/src/sgml/func/func-uuid.sgml new file mode 100644 index 0000000000000..2638e2bf855f2 --- /dev/null +++ b/doc/src/sgml/func/func-uuid.sgml @@ -0,0 +1,179 @@ + + UUID Functions + + + UUID + generating + + + + gen_random_uuid + + + + uuidv4 + + + + uuidv7 + + + + uuid_extract_timestamp + + + + uuid_extract_version + + + + shows the PostgreSQL + functions that can be used to generate UUIDs. + + + + <acronym>UUID</acronym> Generation Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + gen_random_uuid ( ) + uuid + + + uuidv4 ( ) + uuid + + + Generates a version 4 (random) UUID + + + gen_random_uuid() + 5b30857f-0bfa-48b5-ac0b-5c64e28078d1 + + + uuidv4() + b42410ee-132f-42ee-9e4f-09a6485c95b8 + + + + + uuidv7 + ( shift interval ) + uuid + + + Generates a version 7 (time-ordered) UUID. The timestamp is + computed using UNIX timestamp with millisecond precision + + sub-millisecond timestamp + random. The optional + parameter shift will shift the computed + timestamp by the given interval. + + + uuidv7() + 019535d9-3df7-79fb-b466-fa907fa17f9e + + + + +
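+
+   A common usage sketch (the orders table is
+   hypothetical): a generated UUID often serves as a surrogate key via a
+   column default:
+
+CREATE TABLE orders (
+    id    uuid DEFAULT gen_random_uuid() PRIMARY KEY,
+    note  text
+);
+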
+ + + + The module provides additional functions that + implement other standard algorithms for generating UUIDs. + + + + + shows the PostgreSQL + functions that can be used to extract information from UUIDs. + + + + <acronym>UUID</acronym> Extraction Functions + + + + + Function + + + Description + + + Example(s) + + + + + + + + uuid_extract_timestamp + ( uuid ) + timestamp with time zone + + + Extracts a timestamp with time zone from a UUID of + version 1 or 7. For other versions, this function returns null. + Note that the extracted timestamp is not necessarily exactly equal + to the time the UUID was generated; this depends on the + implementation that generated the UUID. + + + uuid_extract_timestamp('019535d9-3df7-79fb-b466-&zwsp;fa907fa17f9e'::uuid) + 2025-02-23 21:46:24.503-05 + + + + + uuid_extract_version + ( uuid ) + smallint + + + Extracts the version from a UUID of one of the variants described by + RFC + 9562. For other variants, this function returns null. + For example, for a UUID generated + by gen_random_uuid(), this function will + return 4. + + + uuid_extract_version('41db1265-8bc1-4ab3-992f-&zwsp;885799a4af1d'::uuid) + 4 + + + uuid_extract_version('019535d9-3df7-79fb-b466-&zwsp;fa907fa17f9e'::uuid) + 7 + + + + +
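+ 
+  As a round-trip sketch (results vary with the system clock), the
+  extraction functions can be applied directly to a freshly
+  generated version 7 UUID:
+ 
+SELECT uuid_extract_version(u)   AS version,    -- 7
+       uuid_extract_timestamp(u) AS created_at  -- approximately now()
+FROM uuidv7() AS u;
+ 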
+ + + PostgreSQL also provides the usual comparison + operators shown in for + UUIDs. + + + See for details on the data type + uuid in PostgreSQL. + +
diff --git a/doc/src/sgml/func/func-window.sgml b/doc/src/sgml/func/func-window.sgml new file mode 100644 index 0000000000000..bcf755c9ebcab --- /dev/null +++ b/doc/src/sgml/func/func-window.sgml @@ -0,0 +1,292 @@ + + Window Functions + + + window function + built-in + + + + Window functions provide the ability to perform + calculations across sets of rows that are related to the current query + row. See for an introduction to this + feature, and for syntax + details. + + + + The built-in window functions are listed in + . Note that these functions + must be invoked using window function syntax, i.e., an + OVER clause is required. + + + + In addition to these functions, any built-in or user-defined + ordinary aggregate (i.e., not ordered-set or hypothetical-set aggregates) + can be used as a window function; see + for a list of the built-in aggregates. + Aggregate functions act as window functions only when an OVER + clause follows the call; otherwise they act as plain aggregates + and return a single row for the entire set. + + + + General-Purpose Window Functions + + + + + Function + + + Description + + + + + + + + + row_number + + row_number () + bigint + + + Returns the number of the current row within its partition, counting + from 1. + + + + + + + rank + + rank () + bigint + + + Returns the rank of the current row, with gaps; that is, + the row_number of the first row in its peer + group. + + + + + + + dense_rank + + dense_rank () + bigint + + + Returns the rank of the current row, without gaps; this function + effectively counts peer groups. + + + + + + + percent_rank + + percent_rank () + double precision + + + Returns the relative rank of the current row, that is + (rank - 1) / (total partition rows - 1). + The value thus ranges from 0 to 1 inclusive. + + + + + + + cume_dist + + cume_dist () + double precision + + + Returns the cumulative distribution, that is (number of partition rows + preceding or peers with current row) / (total partition rows). + The value thus ranges from 1/N to 1. + + + + + + + ntile + + ntile ( num_buckets integer ) + integer + + + Returns an integer ranging from 1 to the argument value, dividing the + partition as equally as possible. + + + + + + + lag + + lag ( value anycompatible + , offset integer + , default anycompatible ) null treatment + anycompatible + + + Returns value evaluated at + the row that is offset + rows before the current row within the partition; if there is no such + row, instead returns default + (which must be of a type compatible with + value). + Both offset and + default are evaluated + with respect to the current row. If omitted, + offset defaults to 1 and + default to NULL. + + + + + + + lead + + lead ( value anycompatible + , offset integer + , default anycompatible ) null treatment + anycompatible + + + Returns value evaluated at + the row that is offset + rows after the current row within the partition; if there is no such + row, instead returns default + (which must be of a type compatible with + value). + Both offset and + default are evaluated + with respect to the current row. If omitted, + offset defaults to 1 and + default to NULL. + + + + + + + first_value + + first_value ( value anyelement ) null treatment + anyelement + + + Returns value evaluated + at the row that is the first row of the window frame. + + + + + + + last_value + + last_value ( value anyelement ) null treatment + anyelement + + + Returns value evaluated + at the row that is the last row of the window frame. 
+ + + + + + + nth_value + + nth_value ( value anyelement, n integer ) null treatment + anyelement + + + Returns value evaluated + at the row that is the n'th + row of the window frame (counting from 1); + returns NULL if there is no such row. + + + + +
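+ 
+  A sketch tying several of the ranking functions together (the
+  empsalary table is hypothetical); a named WINDOW
+  clause keeps the query readable:
+ 
+SELECT depname, empno, salary,
+       row_number() OVER w,
+       rank()       OVER w,
+       dense_rank() OVER w
+FROM empsalary
+WINDOW w AS (PARTITION BY depname ORDER BY salary DESC);
+ 
+  Rows with equal salary within a department are peers:
+  rank leaves gaps after each peer group, while
+  dense_rank does not.
+ 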
+ + + All of the functions listed in + depend on the sort ordering + specified by the ORDER BY clause of the associated window + definition. Rows that are not distinct when considering only the + ORDER BY columns are said to be peers. + The four ranking functions (including cume_dist) are + defined so that they give the same answer for all rows of a peer group. + + + Note that first_value, last_value, and + nth_value consider only the rows within the window + frame, which by default contains the rows from the start of the + partition through the last peer of the current row. This is + likely to give unhelpful results for last_value and + sometimes also nth_value. You can redefine the frame by + adding a suitable frame specification (RANGE, + ROWS or GROUPS) to + the OVER clause. + See for more information + about frame specifications. + + + When an aggregate function is used as a window function, it aggregates + over the rows within the current row's window frame. + An aggregate used with ORDER BY and the default window frame + definition produces a running sum type of behavior, which may or + may not be what's wanted. To obtain + aggregation over the whole partition, omit ORDER BY or use + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING. + Other frame specifications can be used to obtain other effects. + + + The null treatment option must be one of: + + RESPECT NULLS + IGNORE NULLS + + If unspecified, the default is RESPECT NULLS, which includes NULL + values in any result calculation. IGNORE NULLS ignores NULL values. + This option is only allowed for the following functions: lag, + lead, first_value, last_value, + nth_value. + + + + The SQL standard defines a FROM FIRST or FROM LAST + option for nth_value. This is not implemented in + PostgreSQL: only the default FROM FIRST + behavior is supported. (You can achieve the result of FROM LAST by + reversing the ORDER BY + ordering.) + + +
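+ 
+  For example (again using a hypothetical empsalary table), the
+  contrast between the default frame and an explicit whole-partition
+  frame can be seen with sum:
+ 
+SELECT salary,
+       sum(salary) OVER (ORDER BY salary) AS running_sum,
+       sum(salary) OVER (ORDER BY salary
+                         ROWS BETWEEN UNBOUNDED PRECEDING
+                              AND UNBOUNDED FOLLOWING) AS total_sum
+FROM empsalary;
+ 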
diff --git a/doc/src/sgml/func/func-xml.sgml b/doc/src/sgml/func/func-xml.sgml new file mode 100644 index 0000000000000..511bc90852a58 --- /dev/null +++ b/doc/src/sgml/func/func-xml.sgml @@ -0,0 +1,1283 @@ + + + XML Functions + + + XML Functions + + + + The functions and function-like expressions described in this + section operate on values of type xml. See for information about the xml + type. The function-like expressions xmlparse + and xmlserialize for converting to and from + type xml are documented there, not in this section. + + + + Use of most of these functions + requires PostgreSQL to have been built + with configure --with-libxml. + + + + Producing XML Content + + + A set of functions and function-like expressions is available for + producing XML content from SQL data. As such, they are + particularly suitable for formatting query results into XML + documents for processing in client applications. + + + + <literal>xmltext</literal> + + + xmltext + + + +xmltext ( text ) xml + + + + The function xmltext returns an XML value with a single + text node containing the input argument as its content. Predefined entities + like ampersand (), left and right angle brackets + (]]>), and quotation marks () + are escaped. + + + + Example: +'); + xmltext +------------------------- + < foo & bar > +]]> + + + + + <literal>xmlcomment</literal> + + + xmlcomment + + + +xmlcomment ( text ) xml + + + + The function xmlcomment creates an XML value + containing an XML comment with the specified text as content. + The text cannot contain -- or end with a + -, otherwise the resulting construct + would not be a valid XML comment. + If the argument is null, the result is null. + + + + Example: + +]]> + + + + + <literal>xmlconcat</literal> + + + xmlconcat + + + +xmlconcat ( xml , ... ) xml + + + + The function xmlconcat concatenates a list + of individual XML values to create a single value containing an + XML content fragment. Null values are omitted; the result is + only null if there are no nonnull arguments. + + + + Example: +', 'foo'); + + xmlconcat +---------------------- + foo +]]> + + + + XML declarations, if present, are combined as follows. If all + argument values have the same XML version declaration, that + version is used in the result, else no version is used. If all + argument values have the standalone declaration value + yes, then that value is used in the result. If + all argument values have a standalone declaration value and at + least one is no, then that is used in the result. + Else the result will have no standalone declaration. If the + result is determined to require a standalone declaration but no + version declaration, a version declaration with version 1.0 will + be used because XML requires an XML declaration to contain a + version declaration. Encoding declarations are ignored and + removed in all cases. + + + + Example: +', ''); + + xmlconcat +----------------------------------- + +]]> + + + + + <literal>xmlelement</literal> + + + xmlelement + + + +xmlelement ( NAME name , XMLATTRIBUTES ( attvalue AS attname , ... ) , content , ... ) xml + + + + The xmlelement expression produces an XML + element with the given name, attributes, and content. + The name + and attname items shown in the syntax are + simple identifiers, not values. The attvalue + and content items are expressions, which can + yield any PostgreSQL data type. The + argument(s) within XMLATTRIBUTES generate attributes + of the XML element; the content value(s) are + concatenated to form its content. 
+ + + + Examples: + + +SELECT xmlelement(NAME foo, xmlattributes('xyz' AS bar)); + + xmlelement +------------------ + + +SELECT xmlelement(NAME foo, xmlattributes(current_date AS bar), 'cont', 'ent'); + + xmlelement +------------------------------------- + content +]]> + + + + Element and attribute names that are not valid XML names are + escaped by replacing the offending characters by the sequence + _xHHHH_, where + HHHH is the character's Unicode + codepoint in hexadecimal notation. For example: + +]]> + + + + An explicit attribute name need not be specified if the attribute + value is a column reference, in which case the column's name will + be used as the attribute name by default. In other cases, the + attribute must be given an explicit name. So this example is + valid: + +CREATE TABLE test (a xml, b xml); +SELECT xmlelement(NAME test, xmlattributes(a, b)) FROM test; + + But these are not: + +SELECT xmlelement(NAME test, xmlattributes('constant'), a, b) FROM test; +SELECT xmlelement(NAME test, xmlattributes(func(a, b))) FROM test; + + + + + Element content, if specified, will be formatted according to + its data type. If the content is itself of type xml, + complex XML documents can be constructed. For example: + +]]> + + Content of other types will be formatted into valid XML character + data. This means in particular that the characters <, >, + and & will be converted to entities. Binary data (data type + bytea) will be represented in base64 or hex + encoding, depending on the setting of the configuration parameter + . The particular behavior for + individual data types is expected to evolve in order to align the + PostgreSQL mappings with those specified in SQL:2006 and later, + as discussed in . + + + + + <literal>xmlforest</literal> + + + xmlforest + + + +xmlforest ( content AS name , ... ) xml + + + + The xmlforest expression produces an XML + forest (sequence) of elements using the given names and content. + As for xmlelement, + each name must be a simple identifier, while + the content expressions can have any data + type. + + + + Examples: + +SELECT xmlforest('abc' AS foo, 123 AS bar); + + xmlforest +------------------------------ + <foo>abc</foo><bar>123</bar> + + +SELECT xmlforest(table_name, column_name) +FROM information_schema.columns +WHERE table_schema = 'pg_catalog'; + + xmlforest +------------------------------------&zwsp;----------------------------------- + <table_name>pg_authid</table_name>&zwsp;<column_name>rolname</column_name> + <table_name>pg_authid</table_name>&zwsp;<column_name>rolsuper</column_name> + ... + + + As seen in the second example, the element name can be omitted if + the content value is a column reference, in which case the column + name is used by default. Otherwise, a name must be specified. + + + + Element names that are not valid XML names are escaped as shown + for xmlelement above. Similarly, content + data is escaped to make valid XML content, unless it is already + of type xml. + + + + Note that XML forests are not valid XML documents if they consist + of more than one element, so it might be useful to wrap + xmlforest expressions in + xmlelement. + + + + + <literal>xmlpi</literal> + + + xmlpi + + + +xmlpi ( NAME name , content ) xml + + + + The xmlpi expression creates an XML + processing instruction. + As for xmlelement, + the name must be a simple identifier, while + the content expression can have any data type. + The content, if present, must not contain the + character sequence ?>. 
+ + + + Example: + +]]> + + + + + <literal>xmlroot</literal> + + + xmlroot + + + +xmlroot ( xml, VERSION {text|NO VALUE} , STANDALONE {YES|NO|NO VALUE} ) xml + + + + The xmlroot expression alters the properties + of the root node of an XML value. If a version is specified, + it replaces the value in the root node's version declaration; if a + standalone setting is specified, it replaces the value in the + root node's standalone declaration. + + + +abc'), + version '1.0', standalone yes); + + xmlroot +---------------------------------------- + + abc +]]> + + + + + <literal>xmlagg</literal> + + + xmlagg + + + +xmlagg ( xml ) xml + + + + The function xmlagg is, unlike the other + functions described here, an aggregate function. It concatenates the + input values to the aggregate function call, + much like xmlconcat does, except that concatenation + occurs across rows rather than across expressions in a single row. + See for additional information + about aggregate functions. + + + + Example: +abc'); +INSERT INTO test VALUES (2, ''); +SELECT xmlagg(x) FROM test; + xmlagg +---------------------- + abc +]]> + + + + To determine the order of the concatenation, an ORDER BY + clause may be added to the aggregate call as described in + . For example: + +abc +]]> + + + + The following non-standard approach used to be recommended + in previous versions, and may still be useful in specific + cases: + +abc +]]> + + + + + + XML Predicates + + + The expressions described in this section check properties + of xml values. + + + + <literal>IS DOCUMENT</literal> + + + IS DOCUMENT + + + +xml IS DOCUMENT boolean + + + + The expression IS DOCUMENT returns true if the + argument XML value is a proper XML document, false if it is not + (that is, it is a content fragment), or null if the argument is + null. See about the difference + between documents and content fragments. + + + + + <literal>IS NOT DOCUMENT</literal> + + + IS NOT DOCUMENT + + + +xml IS NOT DOCUMENT boolean + + + + The expression IS NOT DOCUMENT returns false if the + argument XML value is a proper XML document, true if it is not (that is, + it is a content fragment), or null if the argument is null. + + + + + <literal>XMLEXISTS</literal> + + + XMLEXISTS + + + +XMLEXISTS ( text PASSING BY {REF|VALUE} xml BY {REF|VALUE} ) boolean + + + + The function xmlexists evaluates an XPath 1.0 + expression (the first argument), with the passed XML value as its context + item. The function returns false if the result of that evaluation + yields an empty node-set, true if it yields any other value. The + function returns null if any argument is null. A nonnull value + passed as the context item must be an XML document, not a content + fragment or any non-XML value. + + + + Example: + TorontoOttawa'); + + xmlexists +------------ + t +(1 row) +]]> + + + + The BY REF and BY VALUE clauses + are accepted in PostgreSQL, but are ignored, + as discussed in . + + + + In the SQL standard, the xmlexists function + evaluates an expression in the XML Query language, + but PostgreSQL allows only an XPath 1.0 + expression, as discussed in + . + + + + + <literal>xml_is_well_formed</literal> + + + xml_is_well_formed + + + + xml_is_well_formed_document + + + + xml_is_well_formed_content + + + +xml_is_well_formed ( text ) boolean +xml_is_well_formed_document ( text ) boolean +xml_is_well_formed_content ( text ) boolean + + + + These functions check whether a text string represents + well-formed XML, returning a Boolean result. 
+ xml_is_well_formed_document checks for a well-formed + document, while xml_is_well_formed_content checks + for well-formed content. xml_is_well_formed does + the former if the configuration + parameter is set to DOCUMENT, or the latter if it is set to + CONTENT. This means that + xml_is_well_formed is useful for seeing whether + a simple cast to type xml will succeed, whereas the other two + functions are useful for seeing whether the corresponding variants of + XMLPARSE will succeed. + + + + Examples: + +'); + xml_is_well_formed +-------------------- + f +(1 row) + +SELECT xml_is_well_formed(''); + xml_is_well_formed +-------------------- + t +(1 row) + +SET xmloption TO CONTENT; +SELECT xml_is_well_formed('abc'); + xml_is_well_formed +-------------------- + t +(1 row) + +SELECT xml_is_well_formed_document('bar'); + xml_is_well_formed_document +----------------------------- + t +(1 row) + +SELECT xml_is_well_formed_document('bar'); + xml_is_well_formed_document +----------------------------- + f +(1 row) +]]> + + The last example shows that the checks include whether + namespaces are correctly matched. + + + + + + Processing XML + + + To process values of data type xml, PostgreSQL offers + the functions xpath and + xpath_exists, which evaluate XPath 1.0 + expressions, and the XMLTABLE + table function. + + + + <literal>xpath</literal> + + + XPath + + + +xpath ( xpath text, xml xml , nsarray text[] ) xml[] + + + + The function xpath evaluates the XPath 1.0 + expression xpath (given as text) + against the XML value + xml. It returns an array of XML values + corresponding to the node-set produced by the XPath expression. + If the XPath expression returns a scalar value rather than a node-set, + a single-element array is returned. + + + + The second argument must be a well formed XML document. In particular, + it must have a single root node element. + + + + The optional third argument of the function is an array of namespace + mappings. This array should be a two-dimensional text array with + the length of the second axis being equal to 2 (i.e., it should be an + array of arrays, each of which consists of exactly 2 elements). + The first element of each array entry is the namespace name (alias), the + second the namespace URI. It is not required that aliases provided in + this array be the same as those being used in the XML document itself (in + other words, both in the XML document and in the xpath + function context, aliases are local). + + + + Example: +test', + ARRAY[ARRAY['my', 'http://example.com']]); + + xpath +-------- + {test} +(1 row) +]]> + + + + To deal with default (anonymous) namespaces, do something like this: +test', + ARRAY[ARRAY['mydefns', 'http://example.com']]); + + xpath +-------- + {test} +(1 row) +]]> + + + + + <literal>xpath_exists</literal> + + + xpath_exists + + + +xpath_exists ( xpath text, xml xml , nsarray text[] ) boolean + + + + The function xpath_exists is a specialized form + of the xpath function. Instead of returning the + individual XML values that satisfy the XPath 1.0 expression, this function + returns a Boolean indicating whether the query was satisfied or not + (specifically, whether it produced any value other than an empty node-set). + This function is equivalent to the XMLEXISTS predicate, + except that it also offers support for a namespace mapping argument. 
+ + + + Example: +test', + ARRAY[ARRAY['my', 'http://example.com']]); + + xpath_exists +-------------- + t +(1 row) +]]> + + + + + <literal>xmltable</literal> + + + xmltable + + + + table function + XMLTABLE + + + +XMLTABLE ( + XMLNAMESPACES ( namespace_uri AS namespace_name , ... ), + row_expression PASSING BY {REF|VALUE} document_expression BY {REF|VALUE} + COLUMNS name { type PATH column_expression DEFAULT default_expression NOT NULL | NULL + | FOR ORDINALITY } + , ... +) setof record + + + + The xmltable expression produces a table based + on an XML value, an XPath filter to extract rows, and a + set of column definitions. + Although it syntactically resembles a function, it can only appear + as a table in a query's FROM clause. + + + + The optional XMLNAMESPACES clause gives a + comma-separated list of namespace definitions, where + each namespace_uri is a text + expression and each namespace_name is a simple + identifier. It specifies the XML namespaces used in the document and + their aliases. A default namespace specification is not currently + supported. + + + + The required row_expression argument is an + XPath 1.0 expression (given as text) that is evaluated, + passing the XML value document_expression as + its context item, to obtain a set of XML nodes. These nodes are what + xmltable transforms into output rows. No rows + will be produced if the document_expression + is null, nor if the row_expression produces + an empty node-set or any value other than a node-set. + + + + document_expression provides the context + item for the row_expression. It must be a + well-formed XML document; fragments/forests are not accepted. + The BY REF and BY VALUE clauses + are accepted but ignored, as discussed in + . + + + + In the SQL standard, the xmltable function + evaluates expressions in the XML Query language, + but PostgreSQL allows only XPath 1.0 + expressions, as discussed in + . + + + + The required COLUMNS clause specifies the + column(s) that will be produced in the output table. + See the syntax summary above for the format. + A name is required for each column, as is a data type + (unless FOR ORDINALITY is specified, in which case + type integer is implicit). The path, default and + nullability clauses are optional. + + + + A column marked FOR ORDINALITY will be populated + with row numbers, starting with 1, in the order of nodes retrieved from + the row_expression's result node-set. + At most one column may be marked FOR ORDINALITY. + + + + + XPath 1.0 does not specify an order for nodes in a node-set, so code + that relies on a particular order of the results will be + implementation-dependent. Details can be found in + . + + + + + The column_expression for a column is an + XPath 1.0 expression that is evaluated for each row, with the current + node from the row_expression result as its + context item, to find the value of the column. If + no column_expression is given, then the + column name is used as an implicit path. + + + + If a column's XPath expression returns a non-XML value (which is limited + to string, boolean, or double in XPath 1.0) and the column has a + PostgreSQL type other than xml, the column will be set + as if by assigning the value's string representation to the PostgreSQL + type. (If the value is a boolean, its string representation is taken + to be 1 or 0 if the output + column's type category is numeric, otherwise true or + false.) 
+ + + + If a column's XPath expression returns a non-empty set of XML nodes + and the column's PostgreSQL type is xml, the column will + be assigned the expression result exactly, if it is of document or + content form. + + + A result containing more than one element node at the top level, or + non-whitespace text outside of an element, is an example of content form. + An XPath result can be of neither form, for example if it returns an + attribute node selected from the element that contains it. Such a result + will be put into content form with each such disallowed node replaced by + its string value, as defined for the XPath 1.0 + string function. + + + + + + A non-XML result assigned to an xml output column produces + content, a single text node with the string value of the result. + An XML result assigned to a column of any other type may not have more than + one node, or an error is raised. If there is exactly one node, the column + will be set as if by assigning the node's string + value (as defined for the XPath 1.0 string function) + to the PostgreSQL type. + + + + The string value of an XML element is the concatenation, in document order, + of all text nodes contained in that element and its descendants. The string + value of an element with no descendant text nodes is an + empty string (not NULL). + Any xsi:nil attributes are ignored. + Note that the whitespace-only text() node between two non-text + elements is preserved, and that leading whitespace on a text() + node is not flattened. + The XPath 1.0 string function may be consulted for the + rules defining the string value of other XML node types and non-XML values. + + + + The conversion rules presented here are not exactly those of the SQL + standard, as discussed in . + + + + If the path expression returns an empty node-set + (typically, when it does not match) + for a given row, the column will be set to NULL, unless + a default_expression is specified; then the + value resulting from evaluating that expression is used. + + + + A default_expression, rather than being + evaluated immediately when xmltable is called, + is evaluated each time a default is needed for the column. + If the expression qualifies as stable or immutable, the repeat + evaluation may be skipped. + This means that you can usefully use volatile functions like + nextval in + default_expression. + + + + Columns may be marked NOT NULL. If the + column_expression for a NOT + NULL column does not match anything and there is + no DEFAULT or + the default_expression also evaluates to null, + an error is reported. 
+ + + + Examples: + + + AU + Australia + + + JP + Japan + Shinzo Abe + 145935 + + + SG + Singapore + 697 + + +$$ AS data; + +SELECT xmltable.* + FROM xmldata, + XMLTABLE('//ROWS/ROW' + PASSING data + COLUMNS id int PATH '@id', + ordinality FOR ORDINALITY, + "COUNTRY_NAME" text, + country_id text PATH 'COUNTRY_ID', + size_sq_km float PATH 'SIZE[@unit = "sq_km"]', + size_other text PATH + 'concat(SIZE[@unit!="sq_km"], " ", SIZE[@unit!="sq_km"]/@unit)', + premier_name text PATH 'PREMIER_NAME' DEFAULT 'not specified'); + + id | ordinality | COUNTRY_NAME | country_id | size_sq_km | size_other | premier_name +----+------------+--------------+------------+------------+--------------+--------------- + 1 | 1 | Australia | AU | | | not specified + 5 | 2 | Japan | JP | | 145935 sq_mi | Shinzo Abe + 6 | 3 | Singapore | SG | 697 | | not specified +]]> + + The following example shows concatenation of multiple text() nodes, + usage of the column name as XPath filter, and the treatment of whitespace, + XML comments and processing instructions: + + + Hello2a2 bbbxxxCC + +$$ AS data; + +SELECT xmltable.* + FROM xmlelements, XMLTABLE('/root' PASSING data COLUMNS element text); + element +------------------------- + Hello2a2 bbbxxxCC +]]> + + + + The following example illustrates how + the XMLNAMESPACES clause can be used to specify + a list of namespaces + used in the XML document as well as in the XPath expressions: + + + + + +'::xml) +) +SELECT xmltable.* + FROM XMLTABLE(XMLNAMESPACES('http://example.com/myns' AS x, + 'http://example.com/b' AS "B"), + '/x:example/x:item' + PASSING (SELECT data FROM xmldata) + COLUMNS foo int PATH '@foo', + bar int PATH '@B:bar'); + foo | bar +-----+----- + 1 | 2 + 3 | 4 + 4 | 5 +(3 rows) +]]> + + + + + + Mapping Tables to XML + + + XML export + + + + The following functions map the contents of relational tables to + XML values. They can be thought of as XML export functionality: + +table_to_xml ( table regclass, nulls boolean, + tableforest boolean, targetns text ) xml +query_to_xml ( query text, nulls boolean, + tableforest boolean, targetns text ) xml +cursor_to_xml ( cursor refcursor, count integer, nulls boolean, + tableforest boolean, targetns text ) xml + + + + + table_to_xml maps the content of the named + table, passed as parameter table. The + regclass type accepts strings identifying tables using the + usual notation, including optional schema qualification and + double quotes (see for details). + query_to_xml executes the + query whose text is passed as parameter + query and maps the result set. + cursor_to_xml fetches the indicated number of + rows from the cursor specified by the parameter + cursor. This variant is recommended if + large tables have to be mapped, because the result value is built + up in memory by each function. + + + + If tableforest is false, then the resulting + XML document looks like this: + + + data + data + + + + ... + + + ... + +]]> + + If tableforest is true, the result is an + XML content fragment that looks like this: + + data + data + + + + ... + + +... +]]> + + If no table name is available, that is, when mapping a query or a + cursor, the string table is used in the first + format, row in the second format. + + + + The choice between these formats is up to the user. The first + format is a proper XML document, which will be important in many + applications. The second format tends to be more useful in the + cursor_to_xml function if the result values are to be + reassembled into one document later on. 
The functions for + producing XML content discussed above, in particular + xmlelement, can be used to alter the results + to taste. + + + + The data values are mapped in the same way as described for the + function xmlelement above. + + + + The parameter nulls determines whether null + values should be included in the output. If true, null values in + columns are represented as: + +]]> + where xsi is the XML namespace prefix for XML + Schema Instance. An appropriate namespace declaration will be + added to the result value. If false, columns containing null + values are simply omitted from the output. + + + + The parameter targetns specifies the + desired XML namespace of the result. If no particular namespace + is wanted, an empty string should be passed. + + + + The following functions return XML Schema documents describing the + mappings performed by the corresponding functions above: + +table_to_xmlschema ( table regclass, nulls boolean, + tableforest boolean, targetns text ) xml +query_to_xmlschema ( query text, nulls boolean, + tableforest boolean, targetns text ) xml +cursor_to_xmlschema ( cursor refcursor, nulls boolean, + tableforest boolean, targetns text ) xml + + It is essential that the same parameters are passed in order to + obtain matching XML data mappings and XML Schema documents. + + + + The following functions produce XML data mappings and the + corresponding XML Schema in one document (or forest), linked + together. They can be useful where self-contained and + self-describing results are wanted: + +table_to_xml_and_xmlschema ( table regclass, nulls boolean, + tableforest boolean, targetns text ) xml +query_to_xml_and_xmlschema ( query text, nulls boolean, + tableforest boolean, targetns text ) xml + + + + + In addition, the following functions are available to produce + analogous mappings of entire schemas or the entire current + database: + +schema_to_xml ( schema name, nulls boolean, + tableforest boolean, targetns text ) xml +schema_to_xmlschema ( schema name, nulls boolean, + tableforest boolean, targetns text ) xml +schema_to_xml_and_xmlschema ( schema name, nulls boolean, + tableforest boolean, targetns text ) xml + +database_to_xml ( nulls boolean, + tableforest boolean, targetns text ) xml +database_to_xmlschema ( nulls boolean, + tableforest boolean, targetns text ) xml +database_to_xml_and_xmlschema ( nulls boolean, + tableforest boolean, targetns text ) xml + + + These functions ignore tables that are not readable by the current user. + The database-wide functions additionally ignore schemas that the current + user does not have USAGE (lookup) privilege for. + + + + Note that these potentially produce a lot of data, which needs to + be built up in memory. When requesting content mappings of large + schemas or databases, it might be worthwhile to consider mapping the + tables separately instead, possibly even through a cursor. + + + + The result of a schema content mapping looks like this: + + + +table1-mapping + +table2-mapping + +... + +]]> + + where the format of a table mapping depends on the + tableforest parameter as explained above. + + + + The result of a database content mapping looks like this: + + + + + ... + + + + ... + + +... + +]]> + + where the schema mapping is as above. + + + + As an example of using the output produced by these functions, + shows an XSLT stylesheet that + converts the output of + table_to_xml_and_xmlschema to an HTML + document containing a tabular rendition of the table data. 
In a + similar manner, the results from these functions can be + converted into other XML-based formats. + + + + XSLT Stylesheet for Converting SQL/XML Output to HTML + + + + + + + + + + + + + <xsl:value-of select="name(current())"/> + + + + + + + + + + + + + + + + +
+ + +
+ +
+]]>
+
+
+
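+ 
+  As a usage sketch for the mapping functions described above (the
+  orders table and the query text are hypothetical; an empty string
+  requests no target namespace):
+ 
+-- whole table, nulls included, document (non-forest) format
+SELECT table_to_xml('orders', true, false, '');
+
+-- arbitrary query, nulls omitted, forest format
+SELECT query_to_xml('SELECT * FROM orders WHERE amount > 100',
+                    false, true, '');
+ 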
diff --git a/doc/src/sgml/func/func.sgml b/doc/src/sgml/func/func.sgml new file mode 100644 index 0000000000000..f351ef53f63d4 --- /dev/null +++ b/doc/src/sgml/func/func.sgml @@ -0,0 +1,84 @@ + + + + Functions and Operators + + + function + + + + operator + + + + PostgreSQL provides a large number of + functions and operators for the built-in data types. This chapter + describes most of them, although additional special-purpose functions + appear in relevant sections of the manual. Users can also + define their own functions and operators, as described in + . The + psql commands \df and + \do can be used to list all + available functions and operators, respectively. + + + + The notation used throughout this chapter to describe the argument and + result data types of a function or operator is like this: + +repeat ( text, integer ) text + + which says that the function repeat takes one text and + one integer argument and returns a result of type text. The right arrow + is also used to indicate the result of an example, thus: + +repeat('Pg', 4) PgPgPgPg + + + + + If you are concerned about portability then note that most of + the functions and operators described in this chapter, with the + exception of the most trivial arithmetic and comparison operators + and some explicitly marked functions, are not specified by the + SQL standard. Some of this extended functionality + is present in other SQL database management + systems, and in many cases this functionality is compatible and + consistent between the various implementations. + + + +&func-logical; +&func-comparison; +&func-math; +&func-string; +&func-binarystring; +&func-bitstring; +&func-matching; +&func-formatting; +&func-datetime; +&func-enum; +&func-geometry; +&func-net; +&func-textsearch; +&func-uuid; +&func-xml; +&func-json; +&func-sequence; +&func-conditional; +&func-array; +&func-range; +&func-aggregate; +&func-window; +&func-merge-support; +&func-subquery; +&func-comparisons; +&func-srf; +&func-info; +&func-admin; +&func-trigger; +&func-event-triggers; +&func-statistics; + + diff --git a/doc/src/sgml/gin.sgml b/doc/src/sgml/gin.sgml index 46e87e01324dd..82410b1fbdfa1 100644 --- a/doc/src/sgml/gin.sgml +++ b/doc/src/sgml/gin.sgml @@ -394,7 +394,11 @@ Pointer extra_data) - Compare a partial-match query key to an index key. Returns an integer + Compare a partial-match query key to an index key. + partial_key is a query key that was returned + by extractQuery with an indication that it + requires partial match, and key is an index entry. + Returns an integer whose sign indicates the result: less than zero means the index key does not match the query, but the index scan should continue; zero means that the index key does match the query; greater than zero diff --git a/doc/src/sgml/gist.sgml b/doc/src/sgml/gist.sgml index a373a8aa4b2fc..1871f74272176 100644 --- a/doc/src/sgml/gist.sgml +++ b/doc/src/sgml/gist.sgml @@ -291,7 +291,7 @@ CREATE INDEX ON my_table USING GIST (my_inet_column inet_ops); speed up building a GiST index. The optional twelfth method stratnum is used to translate compare types (from - src/include/nodes/primnodes.h) into strategy numbers + src/include/access/cmptype.h) into strategy numbers used by the operator class. This lets the core code look up operators for temporal constraint indexes. 
@@ -506,11 +506,11 @@ my_compress(PG_FUNCTION_ARGS) if (entry->leafkey) { /* replace entry->key with a compressed version */ - compressed_data_type *compressed_data = palloc(sizeof(compressed_data_type)); + compressed_data_type *compressed_data = palloc_object(compressed_data_type); /* fill *compressed_data from entry->key ... */ - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(compressed_data), entry->rel, entry->page, entry->offset, FALSE); } @@ -921,8 +921,8 @@ my_fetch(PG_FUNCTION_ARGS) fetched_data_type *fetched_data; GISTENTRY *retval; - retval = palloc(sizeof(GISTENTRY)); - fetched_data = palloc(sizeof(fetched_data_type)); + retval = palloc_object(GISTENTRY); + fetched_data = palloc_object(fetched_data_type); /* * Convert 'fetched_data' into the a Datum of the original datatype. @@ -1170,11 +1170,11 @@ my_sortsupport(PG_FUNCTION_ARGS) - stratnum + translate_cmptype Given a CompareType value from - src/include/nodes/primnodes.h, returns a strategy + src/include/access/cmptype.h, returns a strategy number used by this operator class for matching functionality. The function should return InvalidStrategy if the operator class has no matching strategy. @@ -1188,12 +1188,23 @@ my_sortsupport(PG_FUNCTION_ARGS) non-WITHOUT OVERLAPS part(s) of an index constraint. + + This support function corresponds to the index access method callback + function amtranslatecmptype (see ). The + amtranslatecmptype callback function for + GiST indexes merely calls down to the + translate_cmptype support function of the + respective operator family, since the GiST index access method has no + fixed strategy numbers itself. + + The SQL declaration of the function must look like this: -CREATE OR REPLACE FUNCTION my_stratnum(integer) +CREATE OR REPLACE FUNCTION my_translate_cmptype(integer) RETURNS smallint AS 'MODULE_PATHNAME' LANGUAGE C STRICT; @@ -1202,7 +1213,7 @@ LANGUAGE C STRICT; And the operator family registration must look like this: ALTER OPERATOR FAMILY my_opfamily USING gist ADD - FUNCTION 12 ("any", "any") my_stratnum(int); + FUNCTION 12 ("any", "any") my_translate_cmptype(int); @@ -1210,10 +1221,10 @@ ALTER OPERATOR FAMILY my_opfamily USING gist ADD The matching code in the C module could then follow this skeleton: -PG_FUNCTION_INFO_V1(my_stratnum); +PG_FUNCTION_INFO_V1(my_translate_cmptype); Datum -my_stratnum(PG_FUNCTION_ARGS) +my_translate_cmptype(PG_FUNCTION_ARGS) { CompareType cmptype = PG_GETARG_INT32(0); StrategyNumber ret = InvalidStrategy; @@ -1232,11 +1243,11 @@ my_stratnum(PG_FUNCTION_ARGS) One translation function is provided by PostgreSQL: - gist_stratnum_common is for operator classes that + gist_translate_cmptype_common is for operator classes that use the RT*StrategyNumber constants. The btree_gist extension defines a second translation function, - gist_stratnum_btree, for operator classes that use + gist_translate_cmptype_btree, for operator classes that use the BT*StrategyNumber constants. diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index b88cac598e901..a76cf5c383fc8 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -81,6 +81,21 @@ + + Application time + + + In a temporal table, + the dimension of time that represents when the entity described by the table + changed (as opposed to the table itself). + + + For more information, see + . 
+ + + + Asynchronous I/O AIO @@ -1419,11 +1434,15 @@ Relation - The generic term for all objects in a - database - that have a name and a list of - attributes - defined in a specific order. + Mathematically, a relation is a set of + tuples; + this is the sense meant in the term "relational database". + + + + In PostgreSQL, "relation" is commonly used to + mean an SQL object + that has a name and a list of attributes defined in a specific order. Tables, sequences, views, @@ -1431,15 +1450,14 @@ materialized views, composite types, and indexes are all relations. + A relation in this sense is a container or a descriptor for a set of tuples. + - More generically, a relation is a set of tuples; for example, - the result of a query is also a relation. - - - In PostgreSQL, - Class is an archaic synonym for - relation. + Class is an alternative but archaic term. + The system catalog + pg_class + holds an entry for each PostgreSQL relation. @@ -1844,6 +1862,22 @@ + + System time + + + In a temporal table, + the dimension of time that represents when the table itself was changed + (as opposed to the entity the table describes). + Often used for auditing, compliance, and debugging. + + + For more information, see + . + + + + Table @@ -1882,6 +1916,22 @@ + + Temporal table + + + Tables + that track application time + or system time (or both). + Not to be confused with temporary tables. + + + For more information, see + . + + + + Temporary table diff --git a/doc/src/sgml/hash.sgml b/doc/src/sgml/hash.sgml index 9e69ef91fe834..34f3b2cb0c1b3 100644 --- a/doc/src/sgml/hash.sgml +++ b/doc/src/sgml/hash.sgml @@ -125,11 +125,10 @@ Both scanning the index and inserting tuples require locating the bucket where a given tuple ought to be located. To do this, we need the bucket count, highmask, and lowmask from the metapage; however, it's undesirable - for performance reasons to have to have to lock and pin the metapage for - every such operation. Instead, we retain a cached copy of the metapage - in each backend's relcache entry. This will produce the correct bucket - mapping as long as the target bucket hasn't been split since the last - cache refresh. + for performance reasons to have to lock and pin the metapage for every such + operation. Instead, we retain a cached copy of the metapage in each + backend's relcache entry. This will produce the correct bucket mapping as + long as the target bucket hasn't been split since the last cache refresh.
diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml index b47d8b4106efb..33ca3f0286c17 100644 --- a/doc/src/sgml/high-availability.sgml +++ b/doc/src/sgml/high-availability.sgml @@ -151,7 +151,7 @@ protocol to make nodes agree on a serializable transactional order. A standby server can be implemented using file-based log shipping - () or streaming replication (see + () or streaming physical replication (see ), or a combination of both. For information on hot standby, see . @@ -628,7 +628,7 @@ protocol to make nodes agree on a serializable transactional order. In standby mode, the server continuously applies WAL received from the primary server. The standby server can read WAL from a WAL archive (see ) or directly from the primary - over a TCP connection (streaming replication). The standby server will + over a TCP connection (streaming physical replication). The standby server will also attempt to restore any WAL found in the standby cluster's pg_wal directory. That typically happens after a server restart, when the standby replays again WAL that was streamed from the @@ -745,8 +745,8 @@ protocol to make nodes agree on a serializable transactional order. A simple example of configuration is: primary_conninfo = 'host=192.168.1.50 port=5432 user=foo password=foopass options=''-c wal_sender_timeout=5000''' -restore_command = 'cp /path/to/archive/%f %p' -archive_cleanup_command = 'pg_archivecleanup /path/to/archive %r' +restore_command = 'cp "/path/to/archive/%f" "%p"' +archive_cleanup_command = 'pg_archivecleanup /path/to/archive "%r"' @@ -772,6 +772,14 @@ archive_cleanup_command = 'pg_archivecleanup /path/to/archive %r' generated, without waiting for the WAL file to be filled. + + + This discussion of streaming replication assumes physical replication. + Although you could treat a logical replication subscriber as a warm standby, + it would require some differences to what is described here. + + + Streaming replication is asynchronous by default (see ), in which case there is @@ -925,11 +933,11 @@ primary_conninfo = 'host=192.168.1.50 port=5432 user=foo password=foopass' Replication slots provide an automated way to ensure that the - primary server does - not remove WAL segments until they have been received by all standbys, - and that the primary does not remove rows which could cause a - recovery conflict even when the - standby is disconnected. + primary server does not remove WAL segments until they have been + received, physically or logically, by all standbys/subscribers, + and that the primary does not remove rows which could cause a recovery conflict on physical + replicas even when the standby is disconnected. In lieu of using replication slots, it is possible to prevent the removal @@ -943,9 +951,9 @@ primary_conninfo = 'host=192.168.1.50 port=5432 user=foo password=foopass' Similarly, on its own, without - also using a replication slot, provides protection against relevant rows + also using a physical replication slot, provides protection against relevant rows being removed by vacuum, but provides no protection during any time period - when the standby is not connected. + when the physical replication standby is disconnected. 
@@ -978,7 +986,7 @@ primary_conninfo = 'host=192.168.1.50 port=5432 user=foo password=foopass' Configuration Example - You can create a replication slot like this: + You can create a physical replication slot on the primary like this: postgres=# SELECT * FROM pg_create_physical_replication_slot('node_a_slot'); slot_name | lsn @@ -1376,6 +1384,60 @@ synchronous_standby_names = 'ANY 2 (s1, s2, s3)' + + Read-Your-Writes Consistency + + + In asynchronous replication, there is always a short window where changes + on the primary may not yet be visible on the standby due to replication + lag. This can lead to inconsistencies when an application writes data on + the primary and then immediately issues a read query on the standby. + However, it is possible to address this without switching to synchronous + replication. + + + + To address this, PostgreSQL offers a mechanism for read-your-writes + consistency. The key idea is to ensure that a client sees its own writes + by synchronizing the WAL replay on the standby with the known point of + change on the primary. + + + + This is achieved by the following steps. After performing write + operations, the application retrieves the current WAL location using a + function call like this. + + +postgres=# SELECT pg_current_wal_insert_lsn(); +pg_current_wal_insert_lsn +-------------------- +0/306EE20 +(1 row) + + + + + The LSN obtained from the primary is then communicated + to the standby server. This can be managed at the application level or + via the connection pooler. On the standby, the application issues the + command to block further processing until + the standby's WAL replay process reaches (or exceeds) the specified + LSN. + + +postgres=# WAIT FOR LSN '0/306EE20'; + status +-------- + success +(1 row) + + Once the command returns a status of success, it guarantees that all + changes up to the provided LSN have been applied, + ensuring that subsequent read queries will reflect the latest updates. + + + Continuous Archiving in Standby diff --git a/doc/src/sgml/hstore.sgml b/doc/src/sgml/hstore.sgml index 44325e0bba0c4..5f8d1d1ff4303 100644 --- a/doc/src/sgml/hstore.sgml +++ b/doc/src/sgml/hstore.sgml @@ -600,7 +600,7 @@ b Extracts an hstore's keys and values as a set of records. - select * from each('a=>1,b=>2') + SELECT * FROM each('a=>1,b=>2') key | value @@ -799,7 +799,7 @@ UPDATE tab SET h = h || hstore('c', '3'); If multiple keys are to be added or changed in one operation, the concatenation approach is more efficient than subscripting: -UPDATE tab SET h = h || hstore(array['q', 'w'], array['11', '12']); +UPDATE tab SET h = h || hstore(ARRAY['q', 'w'], ARRAY['11', '12']); diff --git a/doc/src/sgml/images/Makefile b/doc/src/sgml/images/Makefile index 645519095d066..fd55b9ad23f4b 100644 --- a/doc/src/sgml/images/Makefile +++ b/doc/src/sgml/images/Makefile @@ -5,7 +5,9 @@ ALL_IMAGES = \ genetic-algorithm.svg \ gin.svg \ - pagelayout.svg + pagelayout.svg \ + temporal-entities.svg \ + temporal-references.svg DITAA = ditaa DOT = dot diff --git a/doc/src/sgml/images/temporal-entities.svg b/doc/src/sgml/images/temporal-entities.svg new file mode 100644 index 0000000000000..23958c3203c2a --- /dev/null +++ b/doc/src/sgml/images/temporal-entities.svg @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + products + (5, 5.00, [1 Jan 2020,1 Jan 2022)) + 2020 + products + (6, 9.00, [1 Jan 2021,1 Jan 2024)) + 2021 + 2022 + products + (5, 8.00, [1 Jan 2022,)) + 2023 + 2024 + ... 
+ + diff --git a/doc/src/sgml/images/temporal-entities.txt b/doc/src/sgml/images/temporal-entities.txt new file mode 100644 index 0000000000000..0def28e0a5922 --- /dev/null +++ b/doc/src/sgml/images/temporal-entities.txt @@ -0,0 +1,14 @@ ++-------------------------------------+-------------------------------------------------------+ +| cGRE | cGRE | +| products | products | +| (5, 5.00, [1 Jan 2020,1 Jan 2022)) | (5, 8.00, [1 Jan 2022,)) | +| | | ++------------------+------------------+-------------------------------------+-----------------+ + | cGRE | + | products | + | (6, 9.00, [1 Jan 2021,1 Jan 2024)) | + | | + +--------------------------------------------------------+ + +| | | | | | +2020 2021 2022 2023 2024 ... diff --git a/doc/src/sgml/images/temporal-references.svg b/doc/src/sgml/images/temporal-references.svg new file mode 100644 index 0000000000000..09230c4e4e90b --- /dev/null +++ b/doc/src/sgml/images/temporal-references.svg @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + products + (5, 5.00, [1 Jan 2020,1 Jan 2022)) + 2021 + variants + (8, 5, 'Medium', [1 Jan 2021,1 Jun 2023)) + 2020 + variants + (9, 5, 'XXL', [1 Mar 2022,1 Jun 2024)) + products + (5, 8.00, [1 Jan 2022,)) + 2023 + 2022 + 2024 + ... + + diff --git a/doc/src/sgml/images/temporal-references.txt b/doc/src/sgml/images/temporal-references.txt new file mode 100644 index 0000000000000..57dedc32e0b34 --- /dev/null +++ b/doc/src/sgml/images/temporal-references.txt @@ -0,0 +1,19 @@ ++------------------------------------+------------------------------------------------------+ +| cGRE | cGRE | +| products | products | +| (5, 5.00, [1 Jan 2020,1 Jan 2022)) | (5, 8.00, [1 Jan 2022,)) | +| | | ++------------------+-----------------+----------------------------+-------------------------+ + | cYEL | + | variants | + | (8, 5, 'Medium', [1 Jan 2021,1 Jun 2023)) | + | | + +-----------------------+----------------------+------------------+ + | cYEL | + | variants | + | (9, 5, 'XXL', [1 Mar 2022,1 Jun 2024)) | + | | + +-----------------------------------------+ + +| | | | | | +2020 2021 2022 2023 2024 ... diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 1aa4741a8eaee..63d7e376f195e 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -147,7 +147,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; - aminsertcleanup_function aminsertcleanup; + aminsertcleanup_function aminsertcleanup; /* can be NULL */ ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml index 9c4f76abf0dcd..55f39b0df2f2e 100644 --- a/doc/src/sgml/indices.sgml +++ b/doc/src/sgml/indices.sgml @@ -593,7 +593,7 @@ CREATE INDEX test2_mm_idx ON test2 (major, minor); By default, B-tree indexes store their entries in ascending order with nulls last (table TID is treated as a tiebreaker column among otherwise equal entries). This means that a forward scan of an - index on column x produces output satisfying ORDER BY x + index on column x produces output satisfying ORDER BY x (or more verbosely, ORDER BY x ASC NULLS LAST). The index can also be scanned backward, producing output satisfying ORDER BY x DESC @@ -698,23 +698,23 @@ CREATE INDEX test3_desc_index ON test3 (id DESC NULLS LAST); indexes are best, but sometimes it's better to create separate indexes and rely on the index-combination feature. 
For example, if your workload includes a mix of queries that sometimes involve only column - x, sometimes only column y, and sometimes both + x, sometimes only column y, and sometimes both columns, you might choose to create two separate indexes on - x and y, relying on index combination to + x and y, relying on index combination to process the queries that use both columns. You could also create a multicolumn index on (x, y). This index would typically be more efficient than index combination for queries involving both columns, but as discussed in , it would be less useful for queries involving only y. Just how useful will depend on how effective the B-tree index skip scan - optimization is; if x has no more than several hundred + optimization is; if x has no more than several hundred distinct values, skip scan will make searches for specific - y values execute reasonably efficiently. A combination + y values execute reasonably efficiently. A combination of a multicolumn index on (x, y) and a separate index on - y might also serve reasonably well. For - queries involving only x, the multicolumn index could be + y might also serve reasonably well. For + queries involving only x, the multicolumn index could be used, though it would be larger and hence slower than an index on - x alone. The last alternative is to create all three + x alone. The last alternative is to create all three indexes, but this is probably only reasonable if the table is searched much more often than it is updated and all three types of query are common. If one of the types of query is much less common than the @@ -949,19 +949,19 @@ WHERE url = '/index.html' AND client_ip = inet '192.168.100.23'; command to create the index would look like this: CREATE INDEX orders_unbilled_index ON orders (order_nr) - WHERE billed is not true; + WHERE billed IS NOT TRUE; A possible query to use this index would be: -SELECT * FROM orders WHERE billed is not true AND order_nr < 10000; +SELECT * FROM orders WHERE billed IS NOT TRUE AND order_nr < 10000; However, the index can also be used in queries that do not involve order_nr at all, e.g.: -SELECT * FROM orders WHERE billed is not true AND amount > 5000.00; +SELECT * FROM orders WHERE billed IS NOT TRUE AND amount > 5000.00; This is not as efficient as a partial index on the amount column would be, since the system has to @@ -1179,9 +1179,9 @@ CREATE INDEX mytable_cat_data ON mytable (category, data); The query must reference only columns stored in the index. For - example, given an index on columns x - and y of a table that also has a - column z, these queries could use index-only scans: + example, given an index on columns x + and y of a table that also has a + column z, these queries could use index-only scans: SELECT x, y FROM tab WHERE x = 'key'; SELECT x FROM tab WHERE x = 'key' AND y < 42; @@ -1262,15 +1262,15 @@ CREATE INDEX tab_x_y ON tab(x) INCLUDE (y); - Because column y is not part of the index's search + Because column y is not part of the index's search key, it does not have to be of a data type that the index can handle; it's merely stored in the index and is not interpreted by the index machinery. Also, if the index is a unique index, that is CREATE UNIQUE INDEX tab_x_y ON tab(x) INCLUDE (y); - the uniqueness condition applies to just column x, - not to the combination of x and y. + the uniqueness condition applies to just column x, + not to the combination of x and y. 
(An INCLUDE clause can also be written in UNIQUE and PRIMARY KEY constraints, providing alternative syntax for setting up an index like @@ -1300,7 +1300,7 @@ CREATE UNIQUE INDEX tab_x_y ON tab(x) INCLUDE (y); CREATE INDEX tab_x_y ON tab(x, y); - even though they had no intention of ever using y as + even though they had no intention of ever using y as part of a WHERE clause. This works fine as long as the extra columns are trailing columns; making them be leading columns is unwise for the reasons explained in . @@ -1340,7 +1340,7 @@ SELECT f(x) FROM tab WHERE f(x) < 1; context f(x), but the planner does not notice that and concludes that an index-only scan is not possible. If an index-only scan seems sufficiently worthwhile, this can be worked around by - adding x as an included column, for example + adding x as an included column, for example CREATE INDEX tab_f_x ON tab (f(x)) INCLUDE (x); diff --git a/doc/src/sgml/information_schema.sgml b/doc/src/sgml/information_schema.sgml index 19dffe7be6aa7..60b4c4ae8c091 100644 --- a/doc/src/sgml/information_schema.sgml +++ b/doc/src/sgml/information_schema.sgml @@ -2089,15 +2089,15 @@ Since data types can be defined in a variety of ways in SQL, and PostgreSQL contains additional ways to define data types, their representation in the information schema - can be somewhat difficult. The column data_type + can be somewhat difficult. The column data_type is supposed to identify the underlying built-in type of the column. In PostgreSQL, this means that the type is defined in the system catalog schema pg_catalog. This column might be useful if the application can handle the well-known built-in types specially (for example, format the numeric types differently or use the data in - the precision columns). The columns udt_name, - udt_schema, and udt_catalog + the precision columns). The columns udt_name, + udt_schema, and udt_catalog always identify the underlying data type of the column, even if the column is based on a domain. (Since PostgreSQL treats built-in types like @@ -2107,8 +2107,8 @@ type, because in that case it wouldn't matter if the column is really based on a domain. If the column is based on a domain, the identity of the domain is stored in the columns - domain_name, domain_schema, - and domain_catalog. If you want to pair up + domain_name, domain_schema, + and domain_catalog. If you want to pair up columns with their associated data types and treat domains as separate types, you could write coalesce(domain_name, udt_name), etc. @@ -6376,7 +6376,7 @@ ORDER BY c.ordinal_position; the sequence data type (see above). The precision indicates the number of significant digits. It can be expressed in decimal (base 10) or binary (base 2) terms, as specified in the - column numeric_precision_radix. + column numeric_precision_radix.
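For example, assuming a hypothetical table mytab, a query such as the following pairs each numeric column with its precision, scale, and radix as reported by the information schema:

SELECT column_name, data_type,
       numeric_precision, numeric_precision_radix, numeric_scale
FROM information_schema.columns
WHERE table_name = 'mytab' AND numeric_precision IS NOT NULL
ORDER BY ordinal_position;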
@@ -6386,8 +6386,8 @@ ORDER BY c.ordinal_position; This column indicates in which base the values in the columns - numeric_precision and - numeric_scale are expressed. The value is + numeric_precision and + numeric_scale are expressed. The value is either 2 or 10. @@ -6402,7 +6402,7 @@ ORDER BY c.ordinal_position; of significant digits to the right of the decimal point. It can be expressed in decimal (base 10) or binary (base 2) terms, as specified in the column - numeric_precision_radix. + numeric_precision_radix. @@ -6461,10 +6461,10 @@ ORDER BY c.ordinal_position; - <literal>sql_features</literal> + <structname>sql_features</structname> - The table sql_features contains information + The table sql_features contains information about which formal features defined in the SQL standard are supported by PostgreSQL. This is the same information that is presented in . @@ -6556,10 +6556,10 @@ ORDER BY c.ordinal_position; - <literal>sql_implementation_info</literal> + <structname>sql_implementation_info</structname> - The table sql_implementation_info contains + The table sql_implementation_info contains information about various aspects that are left implementation-defined by the SQL standard. This information is primarily intended for use in the context of the ODBC interface; @@ -6638,10 +6638,10 @@ ORDER BY c.ordinal_position; - <literal>sql_parts</literal> + <structname>sql_parts</structname> - The table sql_parts contains information about + The table sql_parts contains information about which of the several parts of the SQL standard are supported by PostgreSQL. @@ -6714,10 +6714,10 @@ ORDER BY c.ordinal_position; - <literal>sql_sizing</literal> + <structname>sql_sizing</structname> - The table sql_sizing contains information about + The table sql_sizing contains information about various size limits and maximum values in PostgreSQL. This information is primarily intended for use in the context of the ODBC interface; @@ -7843,7 +7843,7 @@ ORDER BY c.ordinal_position; in PostgreSQL) and distinct types (not implemented in PostgreSQL). To be future-proof, use the - column user_defined_type_category to + column user_defined_type_category to differentiate between these. Other user-defined types such as base types and enums, which are PostgreSQL extensions, are not shown here. For domains, diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml index de19f3ad92952..fe8d73e1f8c01 100644 --- a/doc/src/sgml/installation.sgml +++ b/doc/src/sgml/installation.sgml @@ -65,16 +65,15 @@ - The minimum required version of Meson is 0.54. + The minimum required version of Meson is 0.57.2.
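Before configuring a build, the installed version can be checked from the shell; the version shown here is only an illustration, and any release at or above the minimum will do:

$ meson --version
1.3.2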
- You need an ISO/ANSI C compiler (at least - C99-compliant). Recent - versions of GCC are recommended, but - PostgreSQL is known to build using a wide variety + You need a C compiler that supports at least C11. Recent versions of + GCC are recommended, but + PostgreSQL is known to build using a variety of compilers from different vendors. @@ -1677,10 +1676,6 @@ build-postgresql: using the GCC compiler: ./configure CC='gcc -m64' --enable-dtrace DTRACEFLAGS='-64' ... - - Using Sun's compiler: - -./configure CC='/opt/SUNWspro/bin/cc -xtarget=native64' --enable-dtrace DTRACEFLAGS='-64' ... @@ -3175,7 +3170,7 @@ ninja install Enable additional test suites, which are not run by default because they are not secure to run on a multiuser system, require special - software to run, or are resource intensive. The argument is a + software to run, or are resource-intensive. The argument is a whitespace-separated list of tests to enable. See for details. If the PG_TEST_EXTRA environment variable is set when the @@ -3714,24 +3709,13 @@ xcrun --show-sdk-path Required Tools - You can build with either GCC or Sun's compiler suite. For - better code optimization, Sun's compiler is strongly recommended - on the SPARC architecture. If - you are using Sun's compiler, be careful not to select - /usr/ucb/cc; - use /opt/SUNWspro/bin/cc. + Only GCC is supported as the compiler. Sun's compiler suite is no longer + supported. - You can download Sun Studio - from . - Many GNU tools are integrated into Solaris 10, or they are - present on the Solaris companion CD. If you need packages for - older versions of Solaris, you can find these tools - at . - If you prefer - sources, look - at . + Many additional dependencies can be installed via the package management + system. @@ -3754,27 +3738,6 @@ configure ... LDFLAGS="-R /usr/sfw/lib:/opt/sfw/lib:/usr/local/lib" - - Compiling for Optimal Performance - - - On the SPARC architecture, Sun Studio is strongly recommended for - compilation. Try using the optimization - flag to generate significantly faster binaries. Do not use any - flags that modify behavior of floating-point operations - and errno processing (e.g., - ). - - - - If you do not have a reason to use 64-bit binaries on SPARC, - prefer the 32-bit version. The 64-bit operations are slower and - 64-bit binaries are slower than the 32-bit variants. On the - other hand, 32-bit code on the AMD64 CPU family is not native, - so 32-bit code is significantly slower on that CPU family. - - - Using DTrace for Tracing PostgreSQL @@ -3782,22 +3745,6 @@ configure ... LDFLAGS="-R /usr/sfw/lib:/opt/sfw/lib:/usr/local/lib" Yes, using DTrace is possible. See for further information. - - - If you see the linking of the postgres executable abort with an - error message like: - -Undefined first referenced - symbol in file -AbortTransaction utils/probes.o -CommitTransaction utils/probes.o -ld: fatal: Symbol referencing errors. No output written to postgres -collect2: ld returned 1 exit status -make: *** [postgres] Error 1 - - your DTrace installation is too old to handle probes in static - functions. You need Solaris 10u4 or newer to use DTrace. - @@ -3847,17 +3794,13 @@ make: *** [postgres] Error 1 Both 32-bit and 64-bit builds are possible with the Microsoft Compiler suite. 32-bit PostgreSQL builds are possible with - Visual Studio 2015 to + Visual Studio 2019 to Visual Studio 2022, as well as standalone Windows SDK releases 10 and above. 
64-bit PostgreSQL builds are supported with Microsoft Windows SDK version 10 and above or - Visual Studio 2015 and above. + Visual Studio 2019 and above. - + <application>libpq</application> — C Library @@ -2168,6 +2168,24 @@ postgresql://%2Fvar%2Flib%2Fpostgresql/dbname + + ssl_max_protocol_version + + + This parameter specifies the maximum SSL/TLS protocol version to allow + for the connection. Valid values are TLSv1, + TLSv1.1, TLSv1.2 and + TLSv1.3. The supported protocols depend on the + version of OpenSSL used, older versions + not supporting the most modern protocol versions. If not set, this + parameter is ignored and the connection will use the maximum bound + defined by the backend, if set. Setting the maximum protocol version + is mainly useful for testing or if some component has issues working + with a newer protocol. + + + + min_protocol_version @@ -2202,7 +2220,7 @@ postgresql://%2Fvar%2Flib%2Fpostgresql/dbname server does not support the protocol version requested by the client, the connection is automatically downgraded to a lower minor protocol version that the server supports. After the connection attempt has - completed you can use to + completed you can use to find out which exact protocol version was negotiated. @@ -2216,24 +2234,6 @@ postgresql://%2Fvar%2Flib%2Fpostgresql/dbname - - ssl_max_protocol_version - - - This parameter specifies the maximum SSL/TLS protocol version to allow - for the connection. Valid values are TLSv1, - TLSv1.1, TLSv1.2 and - TLSv1.3. The supported protocols depend on the - version of OpenSSL used, older versions - not supporting the most modern protocol versions. If not set, this - parameter is ignored and the connection will use the maximum bound - defined by the backend, if set. Setting the maximum protocol version - is mainly useful for testing or if some component has issues working - with a newer protocol. - - - - krbsrvname @@ -2320,6 +2320,19 @@ postgresql://%2Fvar%2Flib%2Fpostgresql/dbname + + servicefile + + + This option specifies the name of the per-user connection service file + (see ). + Defaults to ~/.pg_service.conf, or + %APPDATA%\postgresql\.pg_service.conf on + Microsoft Windows. + + + + target_session_attrs @@ -2740,26 +2753,6 @@ char *PQport(const PGconn *conn); - - PQservicePQservice - - - - Returns the service of the active connection. - - -char *PQservice(const PGconn *conn); - - - - - returns NULL if the - conn argument is NULL. - Otherwise, if there was no service provided, it returns an empty string. - - - - PQttyPQtty @@ -4515,7 +4508,7 @@ Oid PQftable(const PGresult *res, InvalidOid is returned if the column number is out of range, or if the specified column is not a simple reference to a table column. - You can query the system table pg_class to determine + You can query the system table pg_class to determine exactly which table is referenced. @@ -4585,7 +4578,7 @@ Oid PQftype(const PGresult *res, - You can query the system table pg_type to + You can query the system table pg_type to obtain the names and properties of the various data types. The OIDs of the built-in data types are defined in the file catalog/pg_type_d.h @@ -9160,12 +9153,8 @@ myEventProc(PGEventId evtId, void *evtInfo, void *passThrough) PGSERVICEFILE - PGSERVICEFILE specifies the name of the per-user - connection service file - (see ). - Defaults to ~/.pg_service.conf, or - %APPDATA%\postgresql\.pg_service.conf on - Microsoft Windows. + PGSERVICEFILE behaves the same as the + connection parameter. 
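For illustration, given a hypothetical service file /path/to/custom_service.conf containing

[mydb]
host=localhost
port=5432
dbname=mydb

a client could select it explicitly, for example:

$ psql "service=mydb servicefile=/path/to/custom_service.conf"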
@@ -9596,7 +9585,8 @@ myEventProc(PGEventId evtId, void *evtInfo, void *passThrough) On Microsoft Windows, it is named %APPDATA%\postgresql\.pg_service.conf (where %APPDATA% refers to the Application Data subdirectory - in the user's profile). A different file name can be specified by + in the user's profile). A different file name can be specified using the + servicefile key word in a libpq connection string or by setting the environment variable PGSERVICEFILE. The system-wide file is named pg_service.conf. By default it is sought in the etc directory @@ -10432,10 +10422,14 @@ typedef struct PGoauthBearerRequest /* Hook outputs */ - /* Callback implementing a custom asynchronous OAuth flow. */ + /* + * Callback implementing a custom asynchronous OAuth flow. The signature is + * platform-dependent: PQ_SOCKTYPE is SOCKET on Windows, and int everywhere + * else. + */ PostgresPollingStatusType (*async) (PGconn *conn, struct PGoauthBearerRequest *request, - SOCKTYPE *altsock); + PQ_SOCKTYPE *altsock); /* Callback to clean up custom allocations. */ void (*cleanup) (PGconn *conn, struct PGoauthBearerRequest *request); @@ -10492,7 +10486,7 @@ typedef struct PGoauthBearerRequest hook. When the callback cannot make further progress without blocking, it should return either PGRES_POLLING_READING or PGRES_POLLING_WRITING after setting - *pgsocket to the file descriptor that will be marked + *altsock to the file descriptor that will be marked ready to read/write when progress can be made again. (This descriptor is then provided to the top-level polling loop via PQsocket().) Return PGRES_POLLING_OK @@ -10871,7 +10865,7 @@ main(int argc, char **argv) /* * Our test case here involves using a cursor, for which we must be inside * a transaction block. We could do the whole thing with a single - * PQexec() of "select * from pg_database", but that's too trivial to make + * PQexec() of "SELECT * FROM pg_database", but that's too trivial to make * a good example. */ @@ -10888,7 +10882,7 @@ main(int argc, char **argv) /* * Fetch rows from pg_database, the system catalog of databases */ - res = PQexec(conn, "DECLARE myportal CURSOR FOR select * from pg_database"); + res = PQexec(conn, "DECLARE myportal CURSOR FOR SELECT * FROM pg_database"); if (PQresultStatus(res) != PGRES_COMMAND_OK) { fprintf(stderr, "DECLARE CURSOR failed: %s", PQerrorMessage(conn)); diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml index 686dd441d0223..58ce75d8b63c9 100644 --- a/doc/src/sgml/logical-replication.sgml +++ b/doc/src/sgml/logical-replication.sgml @@ -6,7 +6,7 @@ Logical replication is a method of replicating data objects and their changes, based upon their replication identity (usually a primary key). We - use the term logical in contrast to physical replication, which uses exact + use the term logical replication in contrast to physical replication, which uses exact block addresses and byte-by-byte replication. PostgreSQL supports both mechanisms concurrently, see . Logical replication allows fine-grained control over both data replication and @@ -41,27 +41,27 @@ Sending incremental changes in a single database or a subset of a - database to subscribers as they occur. + database to subscribers as they occur - Firing triggers for individual changes as they arrive on the - subscriber. + Sending a subset of the database to multiple databases (i.e., + broadcast) - Consolidating multiple databases into a single one (for example for - analytical purposes). 
+ Consolidating multiple databases into a single one (e.g., for + analytics). - Replicating between different major versions of PostgreSQL. + Replicating between different major versions of PostgreSQL @@ -80,7 +80,8 @@ - Sharing a subset of the database between multiple databases. + Firing triggers for individual changes as they arrive on the + subscriber. @@ -102,16 +103,20 @@ A publication can be defined on any physical replication primary. The node where a publication is defined is referred to as publisher. A publication is a set of changes - generated from a table or a group of tables, and might also be described as - a change set or replication set. Each publication exists in only one database. + generated from a table, a group of tables or the current state of all + sequences, and might also be described as a change set or replication set. + Each publication exists in only one database. Publications are different from schemas and do not affect how the table is accessed. Each table can be added to multiple publications if needed. - Publications may currently only contain tables and all tables in schema. - Objects must be added explicitly, except when a publication is created for - ALL TABLES. + Publications may currently only contain tables or sequences. Objects must be + added explicitly, except when a publication is created using + FOR TABLES IN SCHEMA, FOR ALL TABLES, + or FOR ALL SEQUENCES. Unlike tables, sequences can be + synchronized at any time. For more information, see + . @@ -220,8 +225,10 @@ - Each subscription will receive changes via one replication slot (see - ). Additional replication + By default a new subscription creates a logical replication slot on + the publisher and then uses this slot to track relevant transaction + activity and preserve necessary WAL (see ). Additional replication slots may be required for the initial data synchronization of pre-existing table data and those will be dropped at the end of data synchronization. @@ -283,10 +290,10 @@ - Replication Slot Management + Logical Replication Slot Management - As mentioned earlier, each (active) subscription receives changes from a + As mentioned earlier, each (active) subscription uses a logical replication slot on the remote (publishing) side. @@ -298,7 +305,7 @@ Table relid, system identifier sysid) - Normally, the remote replication slot is created automatically when the + Normally, the remote logical replication slot is created automatically when the subscription is created using CREATE SUBSCRIPTION and it is dropped automatically when the subscription is dropped using @@ -437,7 +444,7 @@ Furthermore, because the initial data copy ignores the publish operation, and because publication pub3a has no row filter, - it means the copied table t3 contains all rows even when + it means the copied table t3 contains all rows even when they do not match the row filter of publication pub3b. /* sub # */ SELECT * FROM t3; @@ -533,17 +540,17 @@ - Examples: Deferred Replication Slot Creation + Examples: Deferred Logical Replication Slot Creation There are some cases (e.g. ) where, if the - remote replication slot was not created automatically, the user must create + remote logical replication slot was not created automatically, the user must create it manually before the subscription can be activated. The steps to create the slot and activate the subscription are shown in the following examples. 
These examples specify the standard logical decoding output plugin - (pgoutput), which is what the built-in logical - replication uses. + (), + which is what the built-in logical replication uses. First, create a publication for the examples to use. @@ -575,8 +582,8 @@ HINT: To initiate replication, you must manually create the replication slot, e /* pub # */ SELECT * FROM pg_create_logical_replication_slot('sub1', 'pgoutput'); slot_name | lsn ------------+----------- - sub1 | 0/19404D0 +-----------+------------ + sub1 | 0/019404D0 (1 row) @@ -617,8 +624,8 @@ HINT: To initiate replication, you must manually create the replication slot, e /* pub # */ SELECT * FROM pg_create_logical_replication_slot('myslot', 'pgoutput'); slot_name | lsn ------------+----------- - myslot | 0/19059A0 +-----------+------------ + myslot | 0/019059A0 (1 row) @@ -655,8 +662,8 @@ HINT: To initiate replication, you must manually create the replication slot, e /* pub # */ SELECT * FROM pg_create_logical_replication_slot('myslot', 'pgoutput'); slot_name | lsn ------------+----------- - myslot | 0/1905930 +-----------+------------ + myslot | 0/01905930 (1 row) @@ -709,8 +716,8 @@ HINT: To initiate replication, you must manually create the replication slot, e - To confirm that the standby server is indeed ready for failover, follow these - steps to verify that all necessary logical replication slots have been + To confirm that the standby server is indeed ready for failover for a given subscriber, follow these + steps to verify that all the logical replication slots required by that subscriber have been synchronized to the standby server: @@ -764,7 +771,7 @@ HINT: To initiate replication, you must manually create the replication slot, e Check that the logical replication slots identified above exist on the standby server and are ready for failover. -/* standby # */ SELECT slot_name, (synced AND NOT temporary AND NOT conflicting) AS failover_ready +/* standby # */ SELECT slot_name, (synced AND NOT temporary AND invalidation_reason IS NULL) AS failover_ready FROM pg_replication_slots WHERE slot_name IN ('sub1','sub2','sub3', 'pg_16394_sync_16385_7394666715149055164'); @@ -782,10 +789,42 @@ HINT: To initiate replication, you must manually create the replication slot, e If all the slots are present on the standby server and the result (failover_ready) of the above SQL query is true, then - existing subscriptions can continue subscribing to publications now on the - new primary server. + existing subscriptions can continue subscribing to publications on the new + primary server. + + The first two steps in the above procedure are meant for a + PostgreSQL subscriber. It is recommended to run + these steps on each subscriber node, that will be served by the designated + standby after failover, to obtain the complete list of replication + slots. This list can then be verified in Step 3 to ensure failover readiness. + Non-PostgreSQL subscribers, on the other hand, may + use their own methods to identify the replication slots used by their + respective subscriptions. + + + + In some cases, such as during a planned failover, it is necessary to confirm + that all subscribers, whether PostgreSQL or + non-PostgreSQL, will be able to continue + replication after failover to a given standby server. In such cases, use the + following SQL, instead of performing the first two steps above, to identify + which replication slots on the primary need to be synced to the standby that + is intended for promotion. 
This query returns the relevant replication slots + associated with all the failover-enabled subscriptions. + + + + +/* primary # */ SELECT array_agg(quote_literal(r.slot_name)) AS slots + FROM pg_replication_slots r + WHERE r.failover AND NOT r.temporary; + slots +------- + {'sub1','sub2','sub3', 'pg_16394_sync_16385_7394666715149055164'} +(1 row) + @@ -1002,8 +1041,8 @@ HINT: To initiate replication, you must manually create the replication slot, e Create some publications. Publication p1 has one table (t1) and that table has a row filter. Publication - p2 has two tables. Table t1 has no row - filter, and table t2 has a row filter. Publication + p2 has two tables. Table t1 has no row + filter, and table t2 has a row filter. Publication p3 has two tables, and both of them have a row filter. 5 AND c = 'NSW'); @@ -1016,35 +1055,35 @@ HINT: To initiate replication, you must manually create the replication slot, e defined) for each publication. 5) AND (c = 'NSW'::text)) + "public.t1" WHERE ((a > 5) AND (c = 'NSW'::text)) - Publication p2 - Owner | All tables | Inserts | Updates | Deletes | Truncates | Via root -----------+------------+---------+---------+---------+-----------+---------- - postgres | f | t | t | t | t | f + Publication p2 + Owner | All tables | All sequences | Inserts | Updates | Deletes | Truncates | Generated columns | Via root +----------+------------+---------------+---------+---------+---------+-----------+-------------------+---------- + postgres | f | f | t | t | t | t | none | f Tables: - "public.t1" - "public.t2" WHERE (e = 99) + "public.t1" + "public.t2" WHERE (e = 99) - Publication p3 - Owner | All tables | Inserts | Updates | Deletes | Truncates | Via root -----------+------------+---------+---------+---------+-----------+---------- - postgres | f | t | t | t | t | f + Publication p3 + Owner | All tables | All sequences | Inserts | Updates | Deletes | Truncates | Generated columns | Via root +----------+------------+---------------+---------+---------+---------+-----------+-------------------+---------- + postgres | f | f | t | t | t | t | none | f Tables: - "public.t2" WHERE (d = 10) - "public.t3" WHERE (g = 10) + "public.t2" WHERE (d = 10) + "public.t3" WHERE (g = 10) ]]> psql can be used to show the row filter expressions (if - defined) for each table. See that table t1 is a member + defined) for each table. See that table t1 is a member of two publications, but has a row filter only in p1. - See that table t2 is a member of two publications, and + See that table t2 is a member of two publications, and has a different row filter in each of them. - On the subscriber node, create a table t1 with the same + On the subscriber node, create a table t1 with the same definition as the one on the publisher, and also create the subscription s1 that subscribes to the publication p1. @@ -1440,14 +1479,14 @@ Publications: Examples - Create a table t1 to be used in the following example. + Create a table t1 to be used in the following example. /* pub # */ CREATE TABLE t1(id int, a text, b text, c text, d text, e text, PRIMARY KEY(id)); Create a publication p1. A column list is defined for - table t1 to reduce the number of columns that will be + table t1 to reduce the number of columns that will be replicated. Notice that the order of column names in the column list does not matter. @@ -1459,10 +1498,10 @@ Publications: for each publication. 
/* pub # */ \dRp+ - Publication p1 - Owner | All tables | Inserts | Updates | Deletes | Truncates | Via root -----------+------------+---------+---------+---------+-----------+---------- - postgres | f | t | t | t | t | f + Publication p1 + Owner | All tables | All sequences | Inserts | Updates | Deletes | Truncates | Generated columns | Via root +----------+------------+---------------+---------+---------+---------+-----------+-------------------+---------- + postgres | f | f | t | t | t | t | none | f Tables: "public.t1" (id, a, b, d) @@ -1488,9 +1527,9 @@ Publications: - On the subscriber node, create a table t1 which now + On the subscriber node, create a table t1 which now only needs a subset of the columns that were on the publisher table - t1, and also create the subscription + t1, and also create the subscription s1 that subscribes to the publication p1. @@ -1501,7 +1540,7 @@ Publications: - On the publisher node, insert some rows to table t1. + On the publisher node, insert some rows to table t1. /* pub # */ INSERT INTO t1 VALUES(1, 'a-1', 'b-1', 'c-1', 'd-1', 'e-1'); /* pub # */ INSERT INTO t1 VALUES(2, 'a-2', 'b-2', 'c-2', 'd-2', 'e-2'); @@ -1560,7 +1599,7 @@ Publications: /* sub # */ CREATE TABLE tab_gen_to_gen (a int, b int GENERATED ALWAYS AS (a * 100) STORED); /* sub # */ CREATE SUBSCRIPTION sub1 CONNECTION 'dbname=test_pub' PUBLICATION pub1; -/* sub # */ SELECT * from tab_gen_to_gen; +/* sub # */ SELECT * FROM tab_gen_to_gen; a | b ---+---- 1 | 100 @@ -1711,6 +1750,247 @@ Publications: + + Replicating Sequences + + + To synchronize sequences from a publisher to a subscriber, first publish + them using + CREATE PUBLICATION ... FOR ALL SEQUENCES and then + on the subscriber: + + + + + + + use CREATE SUBSCRIPTION + to initially synchronize the published sequences. + + + + + use + ALTER SUBSCRIPTION ... REFRESH PUBLICATION + to synchronize only newly added sequences. + + + + + use + ALTER SUBSCRIPTION ... REFRESH SEQUENCES + to re-synchronize all sequences currently known to the subscription. + + + + + + + A sequence synchronization worker will be started + after executing any of the above subscriber commands, and will exit once the + sequences are synchronized. + + + The ability to launch a sequence synchronization worker is limited by the + + max_sync_workers_per_subscription + configuration. + + + + Sequence Definition Mismatches + + The sequence synchronization worker validates that sequence definitions + match between publisher and subscriber. If mismatches exist, the worker + logs an error identifying them and exits. The apply worker continues + respawning the sequence synchronization worker until synchronization + succeeds. See also + wal_retrieve_retry_interval. + + + To resolve this, use + ALTER SEQUENCE + to align the subscriber's sequence parameters with those of the publisher. + + + + + Refreshing Out-of-Sync Sequences + + Subscriber sequence values will become out of sync as the publisher + advances them. + + + To detect this, compare the + pg_subscription_rel.srsublsn + on the subscriber with the page_lsn obtained + from the pg_get_sequence_data + function for the sequence on the publisher. Then run + + ALTER SUBSCRIPTION ... REFRESH SEQUENCES to + re-synchronize if necessary. + + + + Each sequence caches a block of values (typically 32) in memory before + generating a new WAL record, so its LSN advances only after the entire + cached batch has been consumed. 
As a result, sequence value drift cannot + be detected by LSN comparison when sequence increments fall within the + same cached block (typically 32 values). + + + + + + Examples + + + Create some sequences on the publisher. + +/* pub # */ CREATE SEQUENCE s1 START WITH 10 INCREMENT BY 1; +/* pub # */ CREATE SEQUENCE s2 START WITH 100 INCREMENT BY 10; + + + + Create the same sequences on the subscriber. + +/* sub # */ CREATE SEQUENCE s1 START WITH 10 INCREMENT BY 1; +/* sub # */ CREATE SEQUENCE s2 START WITH 100 INCREMENT BY 10; + + + + Advance the sequences on the publisher a few times. + +/* pub # */ SELECT nextval('s1'); + nextval +--------- + 10 +(1 row) +/* pub # */ SELECT nextval('s1'); + nextval +--------- + 11 +(1 row) +/* pub # */ SELECT nextval('s2'); + nextval +--------- + 100 +(1 row) +/* pub # */ SELECT nextval('s2'); + nextval +--------- + 110 +(1 row) + + + + Check the sequence page LSNs on the publisher. + +/* pub # */ SELECT * FROM pg_get_sequence_data('s1'); + last_value | is_called | page_lsn +------------+-----------+------------ + 11 | t | 0/0178F9E0 +(1 row) +/* pub # */ SELECT * FROM pg_get_sequence_data('s2'); + last_value | is_called | page_lsn +------------+-----------+------------ + 110 | t | 0/0178FAB0 +(1 row) + + + + Create a publication for the sequences. + +/* pub # */ CREATE PUBLICATION pub1 FOR ALL SEQUENCES; + + + + Subscribe to the publication. + +/* sub # */ CREATE SUBSCRIPTION sub1 +/* sub - */ CONNECTION 'host=localhost dbname=test_pub application_name=sub1' +/* sub - */ PUBLICATION pub1; + + + + Verify that the initial sequence values are synchronized. + +/* sub # */ SELECT last_value, is_called FROM s1; + last_value | is_called +------------+----------- + 11 | t +(1 row) + +/* sub # */ SELECT last_value, is_called FROM s2; + last_value | is_called +------------+----------- + 110 | t +(1 row) + + + + Confirm that the sequence page LSNs on the publisher have been recorded + on the subscriber. + +/* sub # */ SELECT srrelid::regclass, srsublsn FROM pg_subscription_rel; + srrelid | srsublsn +---------+------------ + s1 | 0/0178F9E0 + s2 | 0/0178FAB0 +(2 rows) + + + + Advance the sequences on the publisher 50 more times. + +/* pub # */ SELECT nextval('s1') FROM generate_series(1,50); +/* pub # */ SELECT nextval('s2') FROM generate_series(1,50); + + + + Check the sequence page LSNs on the publisher. + +/* pub # */ SELECT * FROM pg_get_sequence_data('s1'); + last_value | is_called | page_lsn +------------+-----------+------------ + 61 | t | 0/017CED28 +(1 row) + +/* pub # */ SELECT * FROM pg_get_sequence_data('s2'); + last_value | is_called | page_lsn +------------+-----------+------------ + 610 | t | 0/017CEDF8 +(1 row) + + + + The difference between the sequence page LSNs on the publisher and the + sequence page LSNs on the subscriber indicates that the sequences are out + of sync. Re-synchronize all sequences known to the subscriber using + + ALTER SUBSCRIPTION ... REFRESH SEQUENCES. + +/* sub # */ ALTER SUBSCRIPTION sub1 REFRESH SEQUENCES; + + + + Recheck the sequences on the subscriber. + +/* sub # */ SELECT last_value, is_called FROM s1; + last_value | is_called +------------+----------- + 61 | t +(1 row) + +/* sub # */ SELECT last_value, is_called FROM s2; + last_value | is_called +------------+----------- + 610 | t +(1 row) + + + + Conflicts @@ -1772,11 +2052,27 @@ Publications: + + update_deleted + + + The tuple to be updated was concurrently deleted by another origin. The + update will simply be skipped in this scenario. 
Note that this conflict + can only be detected when + track_commit_timestamp + and retain_dead_tuples + are enabled. If a tuple cannot be found due to the table being + truncated, only an update_missing conflict will + arise. Additionally, if the tuple was deleted by the same origin, an + update_missing conflict will arise. + + + + update_missing - The tuple to be updated was not found. The update will simply be + The row to be updated was not found. The update will simply be skipped in this scenario. @@ -1797,7 +2093,7 @@ Publications: delete_missing - The tuple to be deleted was not found. The delete will simply be + The row to be deleted was not found. The delete will simply be skipped in this scenario. @@ -1831,8 +2127,8 @@ DETAIL: detailed_explanation. where detail_values is one of: Key (column_name , ...)=(column_value , ...) - existing local tuple (column_name , ...)=(column_value , ...) - remote tuple (column_name , ...)=(column_value , ...) + existing local row (column_name , ...)=(column_value , ...) + remote row (column_name , ...)=(column_value , ...) replica identity {(column_name , ...)=(column_value , ...) | full (column_name , ...)=(column_value , ...)} @@ -1866,32 +2162,32 @@ DETAIL: detailed_explanation. detailed_explanation includes the origin, transaction ID, and commit timestamp of the transaction that - modified the existing local tuple, if available. + modified the existing local row, if available. The Key section includes the key values of the local - tuple that violated a unique constraint for + row that violated a unique constraint for insert_exists, update_exists or multiple_unique_conflicts conflicts. - The existing local tuple section includes the local - tuple if its origin differs from the remote tuple for + The existing local row section includes the local + row if its origin differs from the remote row for update_origin_differs or delete_origin_differs - conflicts, or if the key value conflicts with the remote tuple for + conflicts, or if the key value conflicts with the remote row for insert_exists, update_exists or multiple_unique_conflicts conflicts. - The remote tuple section includes the new tuple from + The remote row section includes the new row from the remote insert or update operation that caused the conflict. Note that - for an update operation, the column value of the new tuple will be null + for an update operation, the column value of the new row will be null if the value is unchanged and toasted. @@ -1899,7 +2195,7 @@ DETAIL: detailed_explanation. The replica identity section includes the replica identity key values that were used to search for the existing local - tuple to be updated or deleted. This may include the full tuple value + row to be updated or deleted. This may include the full row value if the local relation is marked with REPLICA IDENTITY FULL. @@ -1907,7 +2203,7 @@ DETAIL: detailed_explanation. column_name is the column name. - For existing local tuple, remote tuple, + For existing local row, remote row, and replica identity full cases, column names are logged only if the user lacks the privilege to access all columns of the table. If column names are present, they appear in the same order @@ -1964,16 +2260,16 @@ DETAIL: detailed_explanation. ERROR: conflict detected on relation "public.test": conflict=insert_exists DETAIL: Key already exists in unique index "t_pkey", which was modified locally in transaction 740 at 2024-06-26 10:47:04.727375+08.
-Key (c)=(1); existing local tuple (1, 'local'); remote tuple (1, 'remote'). -CONTEXT: processing remote data for replication origin "pg_16395" during "INSERT" for replication target relation "public.test" in transaction 725 finished at 0/14C0378 +Key (c)=(1); existing local row (1, 'local'); remote row (1, 'remote'). +CONTEXT: processing remote data for replication origin "pg_16395" during "INSERT" for replication target relation "public.test" in transaction 725 finished at 0/014C0378 The LSN of the transaction that contains the change violating the constraint and - the replication origin name can be found from the server log (LSN 0/14C0378 and + the replication origin name can be found from the server log (LSN 0/014C0378 and replication origin pg_16395 in the above case). The transaction that produced the conflict can be skipped by using ALTER SUBSCRIPTION ... SKIP with the finish LSN - (i.e., LSN 0/14C0378). The finish LSN could be an LSN at which the transaction + (i.e., LSN 0/014C0378). The finish LSN could be an LSN at which the transaction is committed or prepared on the publisher. Alternatively, the transaction can also be skipped by calling the pg_replication_origin_advance() function. @@ -1984,7 +2280,7 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER disable_on_error option. Then, you can use pg_replication_origin_advance() function with the node_name (i.e., pg_16395) - and the next LSN of the finish LSN (i.e., 0/14C0379). The current position of + and the next LSN of the finish LSN (i.e., 0/014C0379). The current position of origins can be seen in the pg_replication_origin_status system view. Please note that skipping the whole transaction includes skipping changes that @@ -2040,16 +2336,19 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER - Sequence data is not replicated. The data in serial or identity columns - backed by sequences will of course be replicated as part of the table, - but the sequence itself would still show the start value on the - subscriber. If the subscriber is used as a read-only database, then this - should typically not be a problem. If, however, some kind of switchover - or failover to the subscriber database is intended, then the sequences - would need to be updated to the latest values, either by copying the - current data from the publisher (perhaps - using pg_dump) or by determining a sufficiently high - value from the tables themselves. + Incremental sequence changes are not replicated. Although the data in + serial or identity columns backed by sequences will be replicated as part + of the table, the sequences themselves do not replicate ongoing changes. + On the subscriber, a sequence will retain the last value it synchronized + from the publisher. If the subscriber is used as a read-only database, + then this should typically not be a problem. If, however, some kind of + switchover or failover to the subscriber database is intended, then the + sequences would need to be updated to the latest values, either by + executing + ALTER SUBSCRIPTION ... REFRESH SEQUENCES + or by copying the current data from the publisher (perhaps using + pg_dump) or by determining a sufficiently high value + from the tables themselves. @@ -2125,8 +2424,8 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER implemented by walsender and apply processes. 
The walsender process starts logical decoding (described in ) of the WAL and loads the standard - logical decoding output plugin (pgoutput). The plugin - transforms the changes read + logical decoding output plugin (). + The plugin transforms the changes read from WAL to the logical replication protocol (see ) and filters the data according to the publication specification. The data is then continuously @@ -2197,8 +2496,8 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER Monitoring - Because logical replication is based on a similar architecture as - physical streaming replication, + Because streaming logical replication is based on a similar architecture as + streaming physical replication, the monitoring on a publication node is similar to monitoring of a physical replication primary (see ). @@ -2240,9 +2539,9 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER - In order to be able to copy the initial table data, the role used for the - replication connection must have the SELECT privilege on - a published table (or be a superuser). + In order to be able to copy the initial table or sequence data, the role + used for the replication connection must have the SELECT + privilege on a published table or sequence (or be a superuser). @@ -2251,10 +2550,14 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER - To add tables to a publication, the user must have ownership rights on the - table. To add all tables in schema to a publication, the user must be a - superuser. To create a publication that publishes all tables or all tables in - schema automatically, the user must be a superuser. + To create a publication using FOR TABLE, the user must + have ownership rights on all the listed tables. To create a publication + using any of FOR ALL TABLES, + FOR ALL SEQUENCES, + or FOR TABLES IN SCHEMA, the user must be a superuser. To + alter a publication using ADD TABLE, the user must have + ownership rights on all the listed tables. To alter a publication using + ADD TABLES IN SCHEMA, the user must be a superuser. @@ -2279,8 +2582,11 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER privileges of the subscription owner. However, when performing an insert, update, delete, or truncate operation on a particular table, it will switch roles to the table owner and perform the operation with the table owner's - privileges. This means that the subscription owner needs to be able to - SET ROLE to each role that owns a replicated table. + privileges. Similarly, when synchronizing sequence data, it will switch to + the sequence owner's role and perform the operation using the sequence + owner's privileges. This means that the subscription owner needs to be able + to SET ROLE to each role that owns a replicated table or + sequence. @@ -2327,7 +2633,7 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER wal_level must be - set to logical. + set to replica or logical. @@ -2364,11 +2670,17 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER the subscriber, plus some reserve for table synchronization. + + max_replication_slots + must be set to at least 1 when retain_dead_tuples + is enabled for any subscription. + + max_logical_replication_workers must be set to at least the number of subscriptions (for leader apply - workers), plus some reserve for the table synchronization workers and - parallel apply workers. 
+ workers), plus some reserve for the parallel apply workers, and + table/sequence synchronization workers. @@ -2381,8 +2693,9 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER max_sync_workers_per_subscription - controls the amount of parallelism of the initial data copy during the - subscription initialization or when new tables are added. + controls how many tables can be synchronized in parallel during + subscription initialization or when new tables are added. One additional + worker is also needed for sequence synchronization. @@ -2413,7 +2726,7 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER - Prepare for publisher upgrades + Prepare for Publisher Upgrades pg_upgrade attempts to migrate logical @@ -2442,7 +2755,7 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER The new cluster must have wal_level as - logical. + replica or logical. @@ -2485,7 +2798,7 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER - Prepare for subscriber upgrades + Prepare for Subscriber Upgrades Setup the @@ -2500,6 +2813,22 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER dependencies on clusters before version 17.0 will silently be ignored. + + + Commit timestamps and origin data are not preserved during the upgrade. + As a result, even if + retain_dead_tuples + is enabled, the upgraded subscriber may be unable to detect conflicts or + log relevant commit timestamps and origins when applying changes from the + publisher occurred before the upgrade. Additionally, immediately after the + upgrade, the vacuum may remove the deleted rows that are required for + conflict detection. This can affect the changes that were not replicated + before the upgrade. To ensure consistent conflict tracking, users should + ensure that all potentially conflicting changes are replicated to the + subscriber before initiating the upgrade. + + + There are some prerequisites for pg_upgrade to be able to upgrade the subscriptions. If these are not met an error @@ -2531,11 +2860,21 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER subscriptions present in the old cluster. + + + If there are subscriptions with retain_dead_tuples enabled, the reserved + replication slot pg_conflict_detection + must not exist on the new cluster. Additionally, the + wal_level on the + new cluster must be set to replica or + logical. + + - Upgrading logical replication clusters + Upgrading Logical Replication Clusters While upgrading a subscriber, write operations can be performed in the @@ -2599,7 +2938,7 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER - Steps to upgrade a two-node logical replication cluster + Steps to Upgrade a Two-node Logical Replication Cluster Let's say publisher is in node1 and subscriber is in node2. The subscriber node2 has @@ -2743,7 +3082,7 @@ pg_ctl -D /opt/PostgreSQL/data2_upgraded start -l logfile - Steps to upgrade a cascaded logical replication cluster + Steps to Upgrade a Cascaded Logical Replication Cluster Let's say we have a cascaded logical replication setup node1->node2->node3. 
@@ -2972,7 +3311,7 @@ pg_ctl -D /opt/PostgreSQL/data3_upgraded start -l logfile - Steps to upgrade a two-node circular logical replication cluster + Steps to Upgrade a Two-node Circular Logical Replication Cluster Let's say we have a circular logical replication setup node1->node2 and @@ -3203,8 +3542,8 @@ CREATE SUBSCRIPTION mysub CONNECTION 'dbname=foo host=bar user=repuser' PUBLICAT The above will start the replication process, which synchronizes the - initial table contents of the tables users and - departments and then starts replicating + initial table contents of the tables users and + departments and then starts replicating incremental changes to those tables. diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml index dd9e83b08eaf1..f36bf9462fa0d 100644 --- a/doc/src/sgml/logicaldecoding.sgml +++ b/doc/src/sgml/logicaldecoding.sgml @@ -47,7 +47,7 @@ Before you can use logical decoding, you must set - to logical and + to replica or higher and to at least 1. Then, you should connect to the target database (in the example below, postgres) as a superuser. @@ -57,14 +57,14 @@ postgres=# -- Create a slot named 'regression_slot' using the output plugin 'test_decoding' postgres=# SELECT * FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding', false, true); slot_name | lsn ------------------+----------- - regression_slot | 0/16B1970 +-----------------+------------ + regression_slot | 0/016B1970 (1 row) postgres=# SELECT slot_name, plugin, slot_type, database, active, restart_lsn, confirmed_flush_lsn FROM pg_replication_slots; slot_name | plugin | slot_type | database | active | restart_lsn | confirmed_flush_lsn ------------------+---------------+-----------+----------+--------+-------------+----------------- - regression_slot | test_decoding | logical | postgres | f | 0/16A4408 | 0/16A4440 +-----------------+---------------+-----------+----------+--------+-------------+--------------------- + regression_slot | test_decoding | logical | postgres | f | 0/016A4408 | 0/016A4440 (1 row) postgres=# -- There are no changes to see yet @@ -73,15 +73,15 @@ postgres=# SELECT * FROM pg_logical_slot_get_changes('regression_slot', NULL, NU -----+-----+------ (0 rows) -postgres=# CREATE TABLE data(id serial primary key, data text); +postgres=# CREATE TABLE data(id serial PRIMARY KEY, data text); CREATE TABLE postgres=# -- DDL isn't replicated, so all you'll see is the transaction postgres=# SELECT * FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL); - lsn | xid | data ------------+-------+-------------- - 0/BA2DA58 | 10297 | BEGIN 10297 - 0/BA5A5A0 | 10297 | COMMIT 10297 + lsn | xid | data +------------+-------+-------------- + 0/0BA2DA58 | 10297 | BEGIN 10297 + 0/0BA5A5A0 | 10297 | COMMIT 10297 (2 rows) postgres=# -- Once changes are read, they're consumed and not emitted @@ -97,41 +97,41 @@ postgres=*# INSERT INTO data(data) VALUES('2'); postgres=*# COMMIT; postgres=# SELECT * FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL); - lsn | xid | data ------------+-------+--------------------------------------------------------- - 0/BA5A688 | 10298 | BEGIN 10298 - 0/BA5A6F0 | 10298 | table public.data: INSERT: id[integer]:1 data[text]:'1' - 0/BA5A7F8 | 10298 | table public.data: INSERT: id[integer]:2 data[text]:'2' - 0/BA5A8A8 | 10298 | COMMIT 10298 + lsn | xid | data +------------+-------+--------------------------------------------------------- + 0/0BA5A688 | 10298 | BEGIN 10298 + 0/0BA5A6F0 | 10298 | table public.data: INSERT: 
id[integer]:1 data[text]:'1' + 0/0BA5A7F8 | 10298 | table public.data: INSERT: id[integer]:2 data[text]:'2' + 0/0BA5A8A8 | 10298 | COMMIT 10298 (4 rows) postgres=# INSERT INTO data(data) VALUES('3'); postgres=# -- You can also peek ahead in the change stream without consuming changes postgres=# SELECT * FROM pg_logical_slot_peek_changes('regression_slot', NULL, NULL); - lsn | xid | data ------------+-------+--------------------------------------------------------- - 0/BA5A8E0 | 10299 | BEGIN 10299 - 0/BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3' - 0/BA5A990 | 10299 | COMMIT 10299 + lsn | xid | data +------------+-------+--------------------------------------------------------- + 0/0BA5A8E0 | 10299 | BEGIN 10299 + 0/0BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3' + 0/0BA5A990 | 10299 | COMMIT 10299 (3 rows) postgres=# -- The next call to pg_logical_slot_peek_changes() returns the same changes again postgres=# SELECT * FROM pg_logical_slot_peek_changes('regression_slot', NULL, NULL); - lsn | xid | data ------------+-------+--------------------------------------------------------- - 0/BA5A8E0 | 10299 | BEGIN 10299 - 0/BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3' - 0/BA5A990 | 10299 | COMMIT 10299 + lsn | xid | data +------------+-------+--------------------------------------------------------- + 0/0BA5A8E0 | 10299 | BEGIN 10299 + 0/0BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3' + 0/0BA5A990 | 10299 | COMMIT 10299 (3 rows) postgres=# -- options can be passed to output plugin, to influence the formatting postgres=# SELECT * FROM pg_logical_slot_peek_changes('regression_slot', NULL, NULL, 'include-timestamp', 'on'); - lsn | xid | data ------------+-------+--------------------------------------------------------- - 0/BA5A8E0 | 10299 | BEGIN 10299 - 0/BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3' - 0/BA5A990 | 10299 | COMMIT 10299 (at 2017-05-10 12:07:21.272494-04) + lsn | xid | data +------------+-------+--------------------------------------------------------- + 0/0BA5A8E0 | 10299 | BEGIN 10299 + 0/0BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3' + 0/0BA5A990 | 10299 | COMMIT 10299 (at 2017-05-10 12:07:21.272494-04) (3 rows) postgres=# -- Remember to destroy a slot you no longer need to stop it consuming @@ -169,7 +169,7 @@ COMMIT 693 $ pg_recvlogical -d postgres --slot=test --drop-slot Example 2: -$ pg_recvlogical -d postgres --slot=test --create-slot --two-phase +$ pg_recvlogical -d postgres --slot=test --create-slot --enable-two-phase $ pg_recvlogical -d postgres --slot=test --start -f - ControlZ $ psql -d postgres -c "BEGIN;INSERT INTO data(data) VALUES('5');PREPARE TRANSACTION 'test';" @@ -200,37 +200,37 @@ postgres=*# INSERT INTO data(data) VALUES('5'); postgres=*# PREPARE TRANSACTION 'test_prepared1'; postgres=# SELECT * FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL); - lsn | xid | data ------------+-----+--------------------------------------------------------- - 0/1689DC0 | 529 | BEGIN 529 - 0/1689DC0 | 529 | table public.data: INSERT: id[integer]:3 data[text]:'5' - 0/1689FC0 | 529 | PREPARE TRANSACTION 'test_prepared1', txid 529 + lsn | xid | data +------------+-----+--------------------------------------------------------- + 0/01689DC0 | 529 | BEGIN 529 + 0/01689DC0 | 529 | table public.data: INSERT: id[integer]:3 data[text]:'5' + 0/01689FC0 | 529 | PREPARE TRANSACTION 'test_prepared1', txid 529 (3 rows) 
postgres=# COMMIT PREPARED 'test_prepared1'; -postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NULL); - lsn | xid | data ------------+-----+-------------------------------------------- - 0/168A060 | 529 | COMMIT PREPARED 'test_prepared1', txid 529 +postgres=# SELECT * from pg_logical_slot_get_changes('regression_slot', NULL, NULL); + lsn | xid | data +------------+-----+-------------------------------------------- + 0/0168A060 | 529 | COMMIT PREPARED 'test_prepared1', txid 529 (4 row) postgres=#-- you can also rollback a prepared transaction postgres=# BEGIN; postgres=*# INSERT INTO data(data) VALUES('6'); postgres=*# PREPARE TRANSACTION 'test_prepared2'; -postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NULL); - lsn | xid | data ------------+-----+--------------------------------------------------------- - 0/168A180 | 530 | BEGIN 530 - 0/168A1E8 | 530 | table public.data: INSERT: id[integer]:4 data[text]:'6' - 0/168A430 | 530 | PREPARE TRANSACTION 'test_prepared2', txid 530 +postgres=# SELECT * from pg_logical_slot_get_changes('regression_slot', NULL, NULL); + lsn | xid | data +------------+-----+--------------------------------------------------------- + 0/0168A180 | 530 | BEGIN 530 + 0/0168A1E8 | 530 | table public.data: INSERT: id[integer]:4 data[text]:'6' + 0/0168A430 | 530 | PREPARE TRANSACTION 'test_prepared2', txid 530 (3 rows) postgres=# ROLLBACK PREPARED 'test_prepared2'; -postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NULL); - lsn | xid | data ------------+-----+---------------------------------------------- - 0/168A4B8 | 530 | ROLLBACK PREPARED 'test_prepared2', txid 530 +postgres=# SELECT * from pg_logical_slot_get_changes('regression_slot', NULL, NULL); + lsn | xid | data +------------+-----+---------------------------------------------- + 0/0168A4B8 | 530 | ROLLBACK PREPARED 'test_prepared2', txid 530 (1 row)
@@ -257,6 +257,47 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU log, which describe changes on a storage level, into an application-specific form such as a stream of tuples or SQL statements. + + + Logical decoding becomes available under either of two conditions: + + + + + When wal_level is set to logical. + + + + + When wal_level is set to replica + and at least one valid logical replication slot exists on the system. + + + + + If either condition is met, the operational WAL level becomes equivalent + to logical, which can be monitored through the + effective_wal_level parameter. + + + When wal_level is set to replica, + logical decoding is automatically activated upon creation of the first + logical replication slot. This activation process involves several steps + and requires synchronization among processes, ensuring system-wide + consistency. Conversely, if wal_level is set to + replica and the last logical replication slot is dropped + or invalidated, logical decoding is automatically disabled. Note that the + deactivation of logical decoding might take some time as it is performed + asynchronously by the checkpointer process. + + + + + When wal_level is set to replica, + dropping or invalidating the last logical slot disables logical decoding + on the primary, resulting in slots on standbys being invalidated. + + @@ -275,9 +316,9 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU - PostgreSQL also has streaming replication slots - (see ), but they are used somewhat - differently there. + PostgreSQL can also use streaming replication slots + to maintain a standby server (see ), but + typically those use physical replication, not logical. @@ -290,7 +331,7 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU A logical slot will emit each change just once in normal operation. The current position of each slot is persisted only at checkpoint, so in - the case of a crash the slot may return to an earlier LSN, which will + the case of a crash the slot might return to an earlier LSN, which will then cause recent changes to be sent again when the server restarts. Logical decoding clients are responsible for avoiding ill effects from handling the same message more than once. Clients may wish to record @@ -328,7 +369,7 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU that could be needed by the logical decoding on the standby (as it does not know about the catalog_xmin on the standby). Existing logical slots on standby also get invalidated if - wal_level on the primary is reduced to less than + effective_wal_level on the primary is reduced to less than logical. This is done as soon as the standby detects such a change in the WAL stream. It means that, for walsenders that are lagging (if any), some WAL records up @@ -370,10 +411,10 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU pg_create_logical_replication_slot, or by using the failover option of - CREATE SUBSCRIPTION during slot creation, and then calling - - pg_sync_replication_slots - on the standby. By setting + CREATE SUBSCRIPTION during slot creation. + Additionally, enabling + sync_replication_slots on the standby + is required. By enabling sync_replication_slots on the standby, the failover slots can be synchronized periodically in the slotsync worker. For the synchronization to work, it is mandatory to
For the synchronization to work, it is mandatory to @@ -398,6 +439,50 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU receiving the WAL up to the latest flushed position on the primary server. + + + While enabling + sync_replication_slots allows for automatic + periodic synchronization of failover slots, they can also be manually + synchronized using the + pg_sync_replication_slots function on the standby. + However, unlike automatic synchronization, it does not perform incremental + updates. It retries cyclically until all the failover slots that existed on + primary at the start of the function call are synchronized. Any slots created + after the function begins will not be synchronized. In contrast, automatic + synchronization via sync_replication_slots provides + continuous slot updates, enabling seamless failover and supporting high + availability. Therefore, it is the recommended method for synchronizing slots. + + + + + When slot synchronization is configured as recommended, + and the initial synchronization is performed either automatically or + manually via pg_sync_replication_slots, the standby + can persist the synchronized slot only if the following condition is met: + The logical replication slot on the primary must retain WALs and system + catalog rows that are still available on the standby. This ensures data + integrity and allows logical replication to continue smoothly after + promotion. + If the required WALs or catalog rows have already been purged from the + standby, the slot will not be persisted to avoid data loss. In such + cases, the following log message may appear: + +LOG: could not synchronize replication slot "failover_slot" +DETAIL: Synchronization could lead to data loss, because the remote slot needs WAL at LSN 0/03003F28 and catalog xmin 754, but the standby has LSN 0/03003F28 and catalog xmin 756. + + If the logical replication slot is actively used by a consumer, no + manual intervention is needed; the slot will advance automatically, + and synchronization will resume in the next cycle. However, if no + consumer is configured, it is advisable to manually advance the slot + on the primary using + pg_logical_slot_get_changes or + + pg_logical_slot_get_binary_changes, + allowing synchronization to proceed. + + The ability to resume logical replication after failover depends upon the pg_replication_slots.synced @@ -528,6 +613,170 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU Logical Decoding Output Plugins + + + PostgreSQL provides two logical decoding + output plugins, and + . You can also develop custom output plugins + (see for details). + + + + pgoutput — Standard Logical Decoding Output Plugin + + + pgoutput + + + + pgoutput is the standard logical decoding output + plugin provided by PostgreSQL. + It's used for the built-in + logical replication. + + + + Options + + + + proto_version (integer) + + + Specifies the protocol version. + Currently versions 1, 2, + 3, and 4 are supported. A valid + version is required. + + + Version 2 is supported on server version 14 + and above, and is required when streaming + is set to on to stream large in-progress + transactions. + + + Version 3 is supported on server version 15 + and above, and is required when two_phase + is enabled to stream two-phase commits. + + + Version 4 is supported on server version 16 + and above, and is required when streaming + is set to parallel to stream large in-progress + transactions to be applied in parallel. 
+ + + + + + publication_names (string) + + + A comma-separated list of publication names to subscribe to. + The individual publication names are treated + as standard objects names and can be quoted the same as needed. + At least one publication name is required. + + + + + + binary (boolean) + + + Enables binary transfer mode. Binary mode is faster + than the text mode but slightly less robust. + The default is off. + + + + + + messages (boolean) + + + Enables sending the messages that are written by + pg_logical_emit_message. + The default is off. + + + + + + streaming (enum) + + + Enables streaming of in-progress transactions. Valid values are + off (the default), on and + parallel. + + + When set to off, pgoutput + fully decodes a transaction before sending it as a whole. + This mode works with any protocol version. + + + When set to on, pgoutput + streams large in-progress transactions. + This requires protocol version 2 or higher. + + + When set to parallel, pgoutput + streams large in-progress transactions and also sends + extra information in some messages to support parallel processing. + This requires protocol version 4 or higher. + + + + + + two_phase (boolean) + + + Enables sending two-phase transactions. + Minimum protocol version 3 is required to turn it on. + The default is off. + + + + + + origin (enum) + + + Specifies whether to send changes by their origin. Possible values are + none to only send the changes that have no origin + associated, or any + to send the changes regardless of their origin. This can be used + to avoid loops (infinite replication of the same data) among + replication nodes. + The default is any. + + + + + + + + + Notes + + + pgoutput produces binary output, + so functions expecting textual data ( + pg_logical_slot_peek_changes and + pg_logical_slot_get_changes) + cannot be used with it. Use + pg_logical_slot_peek_binary_changes or + pg_logical_slot_get_binary_changes + instead. + + + + + + + Writing Logical Decoding Output Plugins An example output plugin can be found in the @@ -1366,7 +1615,7 @@ commit_prepared_cb(...); <-- commit of the prepared transaction currently used for decoded changes) is selected and streamed. However, in some cases we still have to spill to disk even if streaming is enabled because we exceed the memory threshold but still have not decoded the - complete tuple e.g., only decoded toast table insert but not the main table + complete tuple e.g., only decoded TOAST table insert but not the main table insert. diff --git a/doc/src/sgml/ltree.sgml b/doc/src/sgml/ltree.sgml index 1c3543303f0ab..ff3c227727b57 100644 --- a/doc/src/sgml/ltree.sgml +++ b/doc/src/sgml/ltree.sgml @@ -645,7 +645,7 @@ Europe & Russia*@ & !Transportation siglen determines the signature length in bytes. The default signature length is 8 bytes. The length must be a positive multiple of int alignment - (4 bytes on most machines)) up to 2024. Longer + (4 bytes on most machines) up to 2024. Longer signatures lead to a more precise search (scanning a smaller fraction of the index and fewer heap pages), at the cost of a larger index. 
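+
+   For example, a GiST index with a non-default signature length can be
+   created like this (reusing the test table and
+   path column from the examples in this section):
+
+CREATE INDEX path_gist_idx ON test USING GIST (path gist_ltree_ops(siglen=100));
+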
@@ -818,7 +818,7 @@ ltreetest=> SELECT subpath(path,0,2)||'Space'||subpath(path,2) FROM test WHER at a specified position in a path: CREATE FUNCTION ins_label(ltree, int, text) RETURNS ltree - AS 'select subpath($1,0,$2) || $3 || subpath($1,$2);' + AS 'SELECT subpath($1, 0, $2) || $3 || subpath($1, $2);' LANGUAGE SQL IMMUTABLE; ltreetest=> SELECT ins_label(path,2,'Space') FROM test WHERE path <@ 'Top.Science.Astronomy'; diff --git a/doc/src/sgml/maintenance.sgml b/doc/src/sgml/maintenance.sgml index 600e4b3f2f3b8..08e6489afb8ed 100644 --- a/doc/src/sgml/maintenance.sgml +++ b/doc/src/sgml/maintenance.sgml @@ -614,8 +614,8 @@ examine this information is to execute queries such as: -SELECT c.oid::regclass as table_name, - greatest(age(c.relfrozenxid),age(t.relfrozenxid)) as age +SELECT c.oid::regclass AS table_name, + greatest(age(c.relfrozenxid), age(t.relfrozenxid)) AS age FROM pg_class c LEFT JOIN pg_class t ON c.reltoastrelid = t.oid WHERE c.relkind IN ('r', 'm'); @@ -779,7 +779,10 @@ HINT: Execute a database-wide VACUUM in that database. careful aging management, storage cleanup, and wraparound handling. There is a separate storage area which holds the list of members in each multixact, which also uses a 32-bit counter and which must also - be managed. + be managed. The system function + pg_get_multixact_members() described in + can be used to examine the + transaction IDs associated with a multixact ID. @@ -889,7 +892,8 @@ HINT: Execute a database-wide VACUUM in that database. the next database will be processed as soon as the first worker finishes. Each worker process will check each table within its database and execute VACUUM and/or ANALYZE as needed. - can be set to monitor + and + can be set to monitor autovacuum workers' activity. @@ -930,12 +934,16 @@ vacuum threshold = Minimum(vacuum max threshold, vacuum base threshold + vacuum The table is also vacuumed if the number of tuples inserted since the last vacuum has exceeded the defined insert threshold, which is defined as: -vacuum insert threshold = vacuum base insert threshold + vacuum insert scale factor * number of tuples +vacuum insert threshold = vacuum base insert threshold + vacuum insert scale factor * number of tuples * percent of table not frozen where the vacuum insert base threshold is , - and vacuum insert scale factor is - . + the vacuum insert scale factor is + , + the number of tuples is + pg_class.reltuples, + and the percent of the table not frozen is + 1 - pg_class.relallfrozen / pg_class.relpages. Such vacuums may allow portions of the table to be marked as all visible and also allow tuples to be frozen, which can reduce the work required in subsequent vacuums. @@ -1010,8 +1018,11 @@ analyze threshold = analyze base threshold + analyze scale factor * number of tu see . However, if the autovacuum is running to prevent transaction ID wraparound (i.e., the autovacuum query name in the pg_stat_activity view ends with - (to prevent wraparound)), the autovacuum is not - automatically interrupted. + (to prevent wraparound) or the + started_by column in the + pg_stat_progress_vacuum view shows + autovacuum_wraparound value), the autovacuum is + not automatically interrupted. diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 4265a22d4de35..817fd9f4ca7af 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1053,11 +1053,9 @@ postgres 27093 0.0 0.0 30096 2752 ? 
Ss 11:34 0:00 postgres: ser - BufferPin - The server process is waiting for exclusive access to - a data buffer. Buffer pin waits can be protracted if - another process holds an open cursor that last read data from the - buffer in question. See . + Buffer + The server process is waiting for access to a data buffer. + See . @@ -1137,7 +1135,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Here are examples of how wait events can be viewed: -SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event is NOT NULL; +SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event IS NOT NULL; pid | wait_event_type | wait_event ------+-----------------+------------ 2540 | Lock | relation @@ -1150,7 +1148,7 @@ SELECT a.pid, a.wait_event, w.description FROM pg_stat_activity a JOIN pg_wait_events w ON (a.wait_event_type = w.type AND a.wait_event = w.name) - WHERE a.wait_event is NOT NULL and a.state = 'active'; + WHERE a.wait_event IS NOT NULL AND a.state = 'active'; -[ RECORD 1 ]------------------------------------------------------&zwsp;------------ pid | 686674 wait_event | WALInitSync @@ -1287,6 +1285,10 @@ description | Waiting for a newly initialized WAL file to reach durable storage This standby's xmin horizon reported by . + This field will be null if a replication slot is used (in this case, + the standby's xmin is shown in + pg_replication_slots) + or if hot_standby_feedback is disabled. @@ -1620,6 +1622,17 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + mem_exceeded_countbigint + + + Number of times the memory used by logical decoding has exceeded + logical_decoding_work_mem. + + + + total_txns bigint @@ -1644,6 +1657,30 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + slotsync_skip_countbigint + + + Number of times the slot synchronization is skipped. Slot + synchronization occurs only on standby servers and thus this column has + no meaning on the primary server. + + + + + + + slotsync_last_skiptimestamp with time zone + + + Time at which last slot synchronization was skipped. Slot + synchronization occurs only on standby servers and thus this column has + no meaning on the primary server. + + + + stats_reset timestamp with time zone @@ -2030,8 +2067,9 @@ description | Waiting for a newly initialized WAL file to reach durable storage Type of the subscription worker process. Possible types are - apply, parallel apply, and - table synchronization. + apply, parallel apply, + table synchronization, and + sequence synchronization. @@ -2179,7 +2217,18 @@ description | Waiting for a newly initialized WAL file to reach durable storage - sync_error_count bigint + sync_seq_error_count bigint + + + Number of times an error occurred in the sequence synchronization + worker. A single worker synchronizes all sequences, so one error + increment may represent failures across multiple sequences. + + + + + + sync_table_error_count bigint Number of times an error occurred during the initial table @@ -2223,6 +2272,17 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + confl_update_deleted bigint + + + Number of times the tuple to be updated was concurrently deleted by + another source during the application of changes. See + for details about this conflict. 
+ + + confl_update_missing bigint @@ -3297,6 +3357,15 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + wal_fpi_bytes numeric + + + Total amount of WAL full page images in bytes + + + wal_buffers_full bigint @@ -3980,6 +4049,7 @@ description | Waiting for a newly initialized WAL file to reach durable storage Estimated number of rows inserted since this table was last vacuumed + (not counting VACUUM FULL) @@ -4066,7 +4136,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage total_vacuum_time double precision - Total time this table has been manually vacuumed, in milliseconds. + Total time this table has been manually vacuumed, in milliseconds + (not counting VACUUM FULL). (This includes the time spent sleeping due to cost-based delays.) @@ -4102,6 +4173,15 @@ description | Waiting for a newly initialized WAL file to reach durable storage cost-based delays.) + + + + stats_reset timestamp with time zone + + + Time at which these statistics were last reset + + @@ -4222,6 +4302,15 @@ description | Waiting for a newly initialized WAL file to reach durable storage index + + + + stats_reset timestamp with time zone + + + Time at which these statistics were last reset + + @@ -4419,6 +4508,15 @@ description | Waiting for a newly initialized WAL file to reach durable storage Number of buffer hits in this table's TOAST table indexes (if any) + + + + stats_reset timestamp with time zone + + + Time at which these statistics were last reset + + @@ -4519,6 +4617,15 @@ description | Waiting for a newly initialized WAL file to reach durable storage Number of buffer hits in this index + + + + stats_reset timestamp with time zone + + + Time at which these statistics were last reset + + @@ -4687,6 +4794,15 @@ description | Waiting for a newly initialized WAL file to reach durable storage other functions called by it, in milliseconds + + + + stats_reset timestamp with time zone + + + Time at which these statistics were last reset + + @@ -5649,11 +5765,36 @@ FROM pg_stat_get_backend_idset() AS backendid; Total time spent sleeping due to cost-based delay (see - , in milliseconds + ), in milliseconds (if is enabled, otherwise zero). + + + + started_by text + + + Shows what caused the current ANALYZE operation to be + started. Possible values are: + + + + manual: The analyze was started by an explicit + ANALYZE, or by VACUUM with + the option. + + + + + autovacuum: The analyze was started by an + autovacuum worker. + + + + + @@ -6594,6 +6735,81 @@ FROM pg_stat_get_backend_idset() AS backendid; stale. + + + + mode text + + + The mode in which the current VACUUM operation is + running. See for details of each + mode. Possible values are: + + + + normal: The operation is performing a standard + vacuum. It is neither required to run in aggressive mode nor operating + in failsafe mode. + + + + + aggressive: The operation is running an aggressive + vacuum, which must scan every page that is not marked all-frozen. + The parameters and + determine when a + table requires aggressive vacuuming. + + + + + failsafe: The vacuum has entered failsafe mode, + in which it performs only the minimum work necessary to avoid + transaction ID or multixact ID wraparound failure. + The parameters and + determine when the + vacuum enters failsafe mode. The vacuum may start in this mode or + switch to it while running; the value of the + mode column may transition from another + mode to failsafe during the operation. 
+ + + + + + + + + started_by text + + + Shows what caused the current VACUUM operation to be + started. Possible values are: + + + + manual: The vacuum was started by an explicit + VACUUM command. + + + + + autovacuum: The vacuum was started by an autovacuum + worker. Vacuums run by autovacuum workers may be interrupted due to + lock conflicts. + + + + + autovacuum_wraparound: The vacuum was started by an + autovacuum worker to prevent transaction ID or multixact ID + wraparound. Vacuums run for wraparound protection are not interrupted + due to lock conflicts. + + + + + @@ -6778,6 +6994,16 @@ FROM pg_stat_get_backend_idset() AS backendid; advances when the phase is streaming database files. + + + + backup_type text + + + Backup type. Either full or + incremental. + + diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml index 487c5d758ffbf..3a113439e1dc4 100644 --- a/doc/src/sgml/pageinspect.sgml +++ b/doc/src/sgml/pageinspect.sgml @@ -73,9 +73,9 @@ passed as argument. For example: test=# SELECT * FROM page_header(get_raw_page('pg_class', 0)); - lsn | checksum | flags | lower | upper | special | pagesize | version | prune_xid ------------+----------+--------+-------+-------+---------+----------+---------+----------- - 0/24A1B50 | 0 | 1 | 232 | 368 | 8192 | 8192 | 4 | 0 + lsn | checksum | flags | lower | upper | special | pagesize | version | prune_xid +------------+----------+--------+-------+-------+---------+----------+---------+----------- + 0/024A1B50 | 0 | 1 | 232 | 368 | 8192 | 8192 | 4 | 0 The returned columns correspond to the fields in the PageHeaderData struct. @@ -741,9 +741,9 @@ test=# SELECT first_tid, nbytes, tids[0:5] AS some_tids For example: test=# SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); - lsn | nsn | rightlink | flags ------+-----+-----------+-------- - 0/1 | 0/0 | 1 | {leaf} + lsn | nsn | rightlink | flags +------------+------------+-----------+-------- + 0/0B5FE088 | 0/00000000 | 1 | {leaf} (1 row) @@ -932,8 +932,8 @@ test=# SELECT * FROM hash_bitmap_info('con_hash_index', 2052); test=# SELECT magic, version, ntuples, ffactor, bsize, bmsize, bmshift, test-# maxbucket, highmask, lowmask, ovflpoint, firstfree, nmaps, procid, -test-# regexp_replace(spares::text, '(,0)*}', '}') as spares, -test-# regexp_replace(mapp::text, '(,0)*}', '}') as mapp +test-# regexp_replace(spares::text, '(,0)*}', '}') AS spares, +test-# regexp_replace(mapp::text, '(,0)*}', '}') AS mapp test-# FROM hash_metapage_info(get_raw_page('con_hash_index', 0)); -[ RECORD 1 ]-------------------------------------------------&zwsp;------------------------------ magic | 105121344 diff --git a/doc/src/sgml/parallel.sgml b/doc/src/sgml/parallel.sgml index 1ce9abf86f525..af43484703eb0 100644 --- a/doc/src/sgml/parallel.sgml +++ b/doc/src/sgml/parallel.sgml @@ -299,6 +299,15 @@ EXPLAIN SELECT * FROM pgbench_accounts WHERE filler LIKE '%x%'; within each worker process. + + + In a parallel tid range scan, the range of blocks + will be subdivided into smaller ranges which are shared among the + cooperating processes. Each worker process will complete the scanning + of its given range of blocks before requesting an additional range of + blocks. 
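+
+   For example, a query that restricts ctid such as the
+   following can use a parallel tid range scan (whether a
+   parallel plan is actually chosen depends on the usual costing and
+   parallel-safety settings; the block range here is illustrative):
+
+EXPLAIN SELECT * FROM pgbench_accounts
+    WHERE ctid >= '(0,1)' AND ctid < '(1000,1)';
+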
+ + Other scan types, such as scans of non-btree indexes, may support diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 106583fb2965d..5f6f1db0467e9 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1485,12 +1485,27 @@ CREATE STATISTICS stts (dependencies) ON city, zip FROM zipcodes; ANALYZE zipcodes; -SELECT stxname, stxkeys, stxddependencies - FROM pg_statistic_ext join pg_statistic_ext_data on (oid = stxoid) +SELECT stxkeys AS k, jsonb_pretty(stxddependencies::text::jsonb) AS dep + FROM pg_statistic_ext JOIN pg_statistic_ext_data ON (oid = stxoid) WHERE stxname = 'stts'; - stxname | stxkeys | stxddependencies ----------+---------+------------------------------------------ - stts | 1 5 | {"1 => 5": 1.000000, "5 => 1": 0.423130} +-[ RECORD 1 ]-------------------- +k | 1 5 +dep | [ + + | { + + | "degree": 1.000000,+ + | "attributes": [ + + | 1 + + | ], + + | "dependency": 5 + + | }, + + | { + + | "degree": 0.423130,+ + | "attributes": [ + + | 5 + + | ], + + | "dependency": 1 + + | } + + | ] (1 row) Here it can be seen that column 1 (zip code) fully determines column @@ -1576,12 +1591,42 @@ CREATE STATISTICS stts2 (ndistinct) ON city, state, zip FROM zipcodes; ANALYZE zipcodes; -SELECT stxkeys AS k, stxdndistinct AS nd - FROM pg_statistic_ext join pg_statistic_ext_data on (oid = stxoid) +SELECT stxkeys AS k, jsonb_pretty(stxdndistinct::text::jsonb) AS nd + FROM pg_statistic_ext JOIN pg_statistic_ext_data on (oid = stxoid) WHERE stxname = 'stts2'; --[ RECORD 1 ]------------------------------------------------------&zwsp;-- +-[ RECORD 1 ]------------------- k | 1 2 5 -nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178} +nd | [ + + | { + + | "ndistinct": 33178,+ + | "attributes": [ + + | 1, + + | 2 + + | ] + + | }, + + | { + + | "ndistinct": 33178,+ + | "attributes": [ + + | 1, + + | 5 + + | ] + + | }, + + | { + + | "ndistinct": 27435,+ + | "attributes": [ + + | 2, + + | 5 + + | ] + + | }, + + | { + + | "ndistinct": 33178,+ + | "attributes": [ + + | 1, + + | 2, + + | 5 + + | ] + + | } + + | ] (1 row) This indicates that there are three combinations of columns that diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 537d601494242..1e9aee10275f2 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -19,10 +19,18 @@ pg_buffercache_pages + + pg_buffercache_numa + + pg_buffercache_summary + + pg_buffercache_usage_counts + + pg_buffercache_evict @@ -35,16 +43,32 @@ pg_buffercache_evict_all + + pg_buffercache_mark_dirty + + + + pg_buffercache_mark_dirty_relation + + + + pg_buffercache_mark_dirty_all + + This module provides the pg_buffercache_pages() - function (wrapped in the pg_buffercache view), - pg_buffercache_numa_pages() function (wrapped in the - pg_buffercache_numa view), the + function (wrapped in the pg_buffercache view), the + pg_buffercache_os_pages() function (wrapped in the + pg_buffercache_os_pages and + pg_buffercache_numa views), the pg_buffercache_summary() function, the pg_buffercache_usage_counts() function, the - pg_buffercache_evict(), the - pg_buffercache_evict_relation() function and the - pg_buffercache_evict_all() function. + pg_buffercache_evict() function, the + pg_buffercache_evict_relation() function, the + pg_buffercache_evict_all() function, the + pg_buffercache_mark_dirty() function, the + pg_buffercache_mark_dirty_relation() function and the + pg_buffercache_mark_dirty_all() function. 
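+
+  For example (a sketch based on the descriptions below; buffer 1 is an
+  arbitrary buffer ID, and these functions are intended for developer
+  testing only):
+
+SELECT * FROM pg_buffercache_mark_dirty(1);
+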
@@ -55,12 +79,16 @@
-  The pg_buffercache_numa_pages() provides
-  NUMA node mappings for shared buffer entries. This
-  information is not part of pg_buffercache_pages()
-  itself, as it is much slower to retrieve.
-  The pg_buffercache_numa view wraps the function for
-  convenient use.
+  The pg_buffercache_os_pages() function provides OS
+  page mappings for shared buffer entries. When its argument is
+  true, it also provides NUMA node
+  mappings for shared buffer entries (this information is not part of
+  pg_buffercache_pages() itself, as it is much
+  slower to retrieve).
+  The pg_buffercache_os_pages and
+  pg_buffercache_numa views wrap the function for
+  convenient use, with its argument set to false and
+  true, respectively.
@@ -99,6 +127,25 @@
   function is restricted to superusers only.
+
+  The pg_buffercache_mark_dirty() function allows a block
+  to be marked as dirty in the buffer pool given a buffer identifier. Use of
+  this function is restricted to superusers only.
+
+
+  The pg_buffercache_mark_dirty_relation() function
+  allows all unpinned shared buffers in the relation to be marked as dirty in
+  the buffer pool given a relation identifier. Use of this function is
+  restricted to superusers only.
+
+
+  The pg_buffercache_mark_dirty_all() function allows all
+  unpinned shared buffers to be marked as dirty in the buffer pool. Use of
+  this function is restricted to superusers only.
+
  The <structname>pg_buffercache</structname> View
@@ -234,6 +281,53 @@
+
+ The <structname>pg_buffercache_os_pages</structname> View
+
+
+  The definitions of the columns exposed by the view are shown in
+  .
+
+
+
+  <structname>pg_buffercache_os_pages</structname> Columns
+
+
+
+
+   Column Type
+
+
+   Description
+
+
+
+
+
+
+   bufferid integer
+
+
+   ID, in the range 1..shared_buffers
+
+
+
+
+
+   os_page_num bigint
+
+
+   Number of OS memory page for this buffer
+
+
+
+
+
+ +
+
 The <structname>pg_buffercache_numa</structname> View
@@ -270,7 +364,7 @@
    os_page_num bigint
-     number of OS memory page for this buffer
+     Number of OS memory page for this buffer
@@ -476,10 +570,12 @@
   The pg_buffercache_evict() function takes a buffer
   identifier, as shown in the bufferid column of
   the pg_buffercache view. It returns information
-  about whether the buffer was evicted and flushed. The buffer_evicted
+  about whether the buffer was evicted and flushed.
+  The buffer_evicted
   column is true on success, and false if the buffer wasn't valid, if it
   couldn't be evicted because it was pinned, or if it became dirty again
-  after an attempt to write it out. The buffer_flushed column is true if the
+  after an attempt to write it out.
+  The buffer_flushed column is true if the
   buffer was flushed. This does not necessarily mean that buffer was flushed
   by us, it might be flushed by someone else. The result is immediately out
   of date upon return, as the buffer might become valid again at any time due
@@ -489,7 +585,7 @@
-  The <structname>pg_buffercache_evict_relation</structname> Function
+  The <function>pg_buffercache_evict_relation()</function> Function
   The pg_buffercache_evict_relation() function is very
   similar to the pg_buffercache_evict() function. The
@@ -507,7 +603,7 @@
-  The <structname>pg_buffercache_evict_all</structname> Function
+  The <function>pg_buffercache_evict_all()</function> Function
   The pg_buffercache_evict_all() function is very
   similar to the pg_buffercache_evict() function. The
@@ -522,6 +618,61 @@
+
+  The <function>pg_buffercache_mark_dirty()</function> Function
+
+  The pg_buffercache_mark_dirty() function takes a
+  buffer identifier, as shown in the bufferid
+  column of the pg_buffercache view. It returns
+  information about whether the buffer was marked as dirty.
+  The buffer_dirtied column is true on success,
+  and false if the buffer was already dirty, if the buffer was not valid, or
+  if it could not be marked as dirty because it was pinned.
+  The buffer_already_dirty column is true if
+  the buffer couldn't be marked as dirty because it was already dirty. The
+  result is immediately out of date upon return, as the buffer might become
+  valid again at any time due to concurrent activity. The function is
+  intended for developer testing only.
+
+
+
+  The <function>pg_buffercache_mark_dirty_relation()</function> Function
+
+  The pg_buffercache_mark_dirty_relation() function is
+  very similar to the
+  pg_buffercache_mark_dirty() function.
+  The difference is that the
+  pg_buffercache_mark_dirty_relation() function takes a
+  relation identifier instead of a buffer identifier. It tries to mark all
+  buffers dirty for all forks in that relation.
+  It returns the number of buffers marked as dirty, the number of buffers
+  already dirty, and the number of buffers skipped because they were pinned
+  or invalid.
+  The result is immediately out of date upon return, as the buffer might
+  become valid again at any time due to concurrent activity. The function is
+  intended for developer testing only.
+
+
+
+  The <function>pg_buffercache_mark_dirty_all()</function> Function
+
+  The pg_buffercache_mark_dirty_all() function is
+  very similar to the pg_buffercache_mark_dirty()
+  function.
+  The difference is that the
+  pg_buffercache_mark_dirty_all() function tries to mark
+  all buffers dirty in the buffer pool.
+  It returns the number of buffers marked as dirty, the number of buffers
+  already dirty, and the number of buffers skipped because they were pinned
+  or invalid.
+ The result is immediately out of date upon return, as the buffer might + become valid again at any time due to concurrent activity. The function is + intended for developer testing only. + + + Sample Output @@ -550,6 +701,46 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers public | spgist_text_tbl | 182 (10 rows) +regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count + FROM ( + SELECT bufferid, COUNT(*) as pages_per_buffer + FROM pg_buffercache_os_pages + GROUP BY bufferid + ) + GROUP BY pages_per_buffer + ORDER BY pages_per_buffer; + + pages_per_buffer | buffer_count +------------------+-------------- + 1 | 261120 + 2 | 1024 +(2 rows) + +regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages + FROM pg_buffercache b JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase IN (0, (SELECT oid FROM pg_database + WHERE datname = current_database())) + JOIN pg_namespace n ON n.oid = c.relnamespace + JOIN (SELECT bufferid FROM pg_buffercache_os_pages + GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10; + + nspname | relname | buffers_on_multiple_pages +------------+------------------------------+--------------------------- + public | delete_test_table | 3 + public | gin_test_idx | 2 + pg_catalog | pg_depend | 2 + public | quad_poly_tbl | 2 + pg_catalog | pg_depend_reference_index | 1 + pg_catalog | pg_index_indexrelid_index | 1 + pg_catalog | pg_constraint_contypid_index | 1 + pg_catalog | pg_statistic | 1 + pg_catalog | pg_depend_depender_index | 1 + pg_catalog | pg_operator | 1 +(10 rows) regression=# SELECT * FROM pg_buffercache_summary(); buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg diff --git a/doc/src/sgml/pgcrypto.sgml b/doc/src/sgml/pgcrypto.sgml index bc5c74ad017fa..6fc2069ad3ece 100644 --- a/doc/src/sgml/pgcrypto.sgml +++ b/doc/src/sgml/pgcrypto.sgml @@ -57,7 +57,7 @@ digest(data bytea, type text) returns bytea If you want the digest as a hexadecimal string, use encode() on the result. For example: -CREATE OR REPLACE FUNCTION sha1(bytea) returns text AS $$ +CREATE OR REPLACE FUNCTION sha1(bytea) RETURNS text AS $$ SELECT encode(digest($1, 'sha1'), 'hex') $$ LANGUAGE SQL STRICT IMMUTABLE; diff --git a/doc/src/sgml/pglogicalinspect.sgml b/doc/src/sgml/pglogicalinspect.sgml index 4b111f961133b..1c1a9d14e510a 100644 --- a/doc/src/sgml/pglogicalinspect.sgml +++ b/doc/src/sgml/pglogicalinspect.sgml @@ -95,7 +95,7 @@ two_phase_at | 0/40796AF8 initial_xmin_horizon | 0 building_full_snapshot | f in_slot_creation | f -last_serialized_snapshot | 0/0 +last_serialized_snapshot | 0/00000000 next_phase_at | 0 committed_count | 0 committed_xip | @@ -114,7 +114,7 @@ two_phase_at | 0/40796AF8 initial_xmin_horizon | 0 building_full_snapshot | f in_slot_creation | f -last_serialized_snapshot | 0/0 +last_serialized_snapshot | 0/00000000 next_phase_at | 0 committed_count | 0 committed_xip | diff --git a/doc/src/sgml/pgoverexplain.sgml b/doc/src/sgml/pgoverexplain.sgml index 21930fbd3bd76..e399c1cbad5f9 100644 --- a/doc/src/sgml/pgoverexplain.sgml +++ b/doc/src/sgml/pgoverexplain.sgml @@ -8,7 +8,7 @@ - The pg_overexplain extends EXPLAIN + The pg_overexplain module extends EXPLAIN with new options that provide additional output. It is mostly intended to assist with debugging of and development of the planner, rather than for general use. 
Since this module displays internal details of planner data @@ -17,6 +17,21 @@ often as) those data structures change. + + To use it, simply load it into the server. You can load it into an + individual session: + + +LOAD 'pg_overexplain'; + + + You can also preload it into some or all sessions by including + pg_overexplain in + or + in + postgresql.conf. + + EXPLAIN (DEBUG) @@ -24,8 +39,8 @@ The DEBUG option displays miscellaneous information from the plan tree that is not normally shown because it is not expected to be of general interest. For each individual plan node, it will display the - following fields. See Plan in - nodes/plannodes.h for additional documentation of these + following fields. See Plan in + nodes/plannodes.h for additional documentation of these fields. @@ -67,8 +82,8 @@ Once per query, the DEBUG option will display the - following fields. See PlannedStmt in - nodes/plannodes.h for additional detail. + following fields. See PlannedStmt in + nodes/plannodes.h for additional detail. @@ -82,7 +97,7 @@ Flags. A comma-separated list of Boolean structure - member names from the PlannedStmt that are set to + member names from the PlannedStmt that are set to true. It covers the following structure members: hasReturning, hasModifyingCTE, canSetTag, transientPlan, @@ -162,7 +177,7 @@ table entry (e.g. relation, subquery, or join), followed by the contents of various range table entry fields that are not normally part of - EXPLAIN output. Some of these fields are only displayed + EXPLAIN output. Some of these fields are only displayed for certain kinds of range table entries. For example, Eref is displayed for all types of range table entries, but CTE Name is displayed only for range table entries @@ -171,7 +186,7 @@ For more information about range table entries, see the definition of - RangeTblEntry in nodes/plannodes.h. + RangeTblEntry in nodes/parsenodes.h. diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml index 7baa07dcdbf7f..d753de5836efb 100644 --- a/doc/src/sgml/pgstatstatements.sgml +++ b/doc/src/sgml/pgstatstatements.sgml @@ -554,6 +554,24 @@ + + + generic_plan_calls bigint + + + Number of times the statement has been executed using a generic plan + + + + + + custom_plan_calls bigint + + + Number of times the statement has been executed using a custom plan + + + stats_since timestamp with time zone diff --git a/doc/src/sgml/pgstattuple.sgml b/doc/src/sgml/pgstattuple.sgml index 4071da4ed941a..54d8f90245e73 100644 --- a/doc/src/sgml/pgstattuple.sgml +++ b/doc/src/sgml/pgstattuple.sgml @@ -270,6 +270,15 @@ leaf_fragmentation | 0 page than is accounted for by internal_pages + leaf_pages + empty_pages + deleted_pages, because it also includes the index's metapage. + avg_leaf_density is the fraction of the index size that + is taken up by user data. Since indexes have a default fillfactor of 90, + this should be around 90 for newly built indexes of non-negligible size, + but usually deteriorates over time. + leaf_fragmentation represents a measure of disorder. + A higher leaf_fragmentation indicates that the + physical order of the index leaf pages increasingly deviates from their + logical order. This can have a significant impact if a large part + of the index is read from disk. @@ -368,7 +377,7 @@ pending_tuples | 0 pgstathashindex returns a record showing information about a HASH index. 
For example: -test=> select * from pgstathashindex('con_hash_index'); +test=> SELECT * FROM pgstathashindex('con_hash_index'); -[ RECORD 1 ]--+----------------- version | 4 bucket_pages | 33081 diff --git a/doc/src/sgml/pgsurgery.sgml b/doc/src/sgml/pgsurgery.sgml index 29bccd7f36d6c..68186122a2208 100644 --- a/doc/src/sgml/pgsurgery.sgml +++ b/doc/src/sgml/pgsurgery.sgml @@ -34,17 +34,17 @@ intended use of this function is to forcibly remove tuples that are not otherwise accessible. For example: -test=> select * from t1 where ctid = '(0, 1)'; +test=> SELECT * FROM t1 WHERE ctid = '(0, 1)'; ERROR: could not access status of transaction 4007513275 DETAIL: Could not open file "pg_xact/0EED": No such file or directory. -test=# select heap_force_kill('t1'::regclass, ARRAY['(0, 1)']::tid[]); +test=# SELECT heap_force_kill('t1'::regclass, ARRAY['(0, 1)']::tid[]); heap_force_kill ----------------- (1 row) -test=# select * from t1 where ctid = '(0, 1)'; +test=# SELECT * FROM t1 WHERE ctid = '(0, 1)'; (0 rows) @@ -70,19 +70,19 @@ test=> vacuum t1; ERROR: found xmin 507 from before relfrozenxid 515 CONTEXT: while scanning block 0 of relation "public.t1" -test=# select ctid from t1 where xmin = 507; +test=# SELECT ctid FROM t1 WHERE xmin = 507; ctid ------- (0,3) (1 row) -test=# select heap_force_freeze('t1'::regclass, ARRAY['(0, 3)']::tid[]); +test=# SELECT heap_force_freeze('t1'::regclass, ARRAY['(0, 3)']::tid[]); heap_force_freeze ------------------- (1 row) -test=# select ctid from t1 where xmin = 2; +test=# SELECT ctid FROM t1 WHERE xmin = 2; ctid ------- (0,3) diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml index 3a8121c70f1f1..79c3ead40bc71 100644 --- a/doc/src/sgml/pgwalinspect.sgml +++ b/doc/src/sgml/pgwalinspect.sgml @@ -73,9 +73,9 @@ postgres=# SELECT * FROM pg_get_wal_record_info('0/E419E28'); -[ RECORD 1 ]----+------------------------------------------------- -start_lsn | 0/E419E28 -end_lsn | 0/E419E68 -prev_lsn | 0/E419D78 +start_lsn | 0/0E419E28 +end_lsn | 0/0E419E68 +prev_lsn | 0/0E419D78 xid | 0 resource_manager | Heap2 record_type | VACUUM @@ -146,9 +146,9 @@ block_ref | postgres=# SELECT * FROM pg_get_wal_block_info('0/1230278', '0/12302B8'); -[ RECORD 1 ]-----+----------------------------------- -start_lsn | 0/1230278 -end_lsn | 0/12302B8 -prev_lsn | 0/122FD40 +start_lsn | 0/01230278 +end_lsn | 0/012302B8 +prev_lsn | 0/0122FD40 block_id | 0 reltablespace | 1663 reldatabase | 1 diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml index 068b804a18d70..e57867ba617f3 100644 --- a/doc/src/sgml/planstats.sgml +++ b/doc/src/sgml/planstats.sgml @@ -635,7 +635,7 @@ EXPLAIN (ANALYZE, TIMING OFF, BUFFERS OFF) SELECT * FROM t WHERE a = 1 AND b = 1 pg_mcv_list_items set-returning function. -SELECT m.* FROM pg_statistic_ext join pg_statistic_ext_data on (oid = stxoid), +SELECT m.* FROM pg_statistic_ext JOIN pg_statistic_ext_data ON (oid = stxoid), pg_mcv_list_items(stxdmcv) m WHERE stxname = 'stts2'; index | values | nulls | frequency | base_frequency -------+----------+-------+-----------+---------------- diff --git a/doc/src/sgml/plperl.sgml b/doc/src/sgml/plperl.sgml index 8007261d0224c..6f018645f1191 100644 --- a/doc/src/sgml/plperl.sgml +++ b/doc/src/sgml/plperl.sgml @@ -229,12 +229,12 @@ $$ LANGUAGE plperl; references to Perl arrays. 
Here is an example: -CREATE OR REPLACE function returns_array() +CREATE OR REPLACE FUNCTION returns_array() RETURNS text[][] AS $$ return [['a"b','c,d'],['e\\f','g']]; $$ LANGUAGE plperl; -select returns_array(); +SELECT returns_array(); @@ -468,8 +468,8 @@ optional maximum number of rows: $rv = spi_exec_query('SELECT * FROM my_table', 5); This returns up to 5 rows from the table - my_table. If my_table - has a column my_column, you can get that + my_table. If my_table + has a column my_column, you can get that value from row $i of the result like this: $foo = $rv->{rows}[$i]->{my_column}; @@ -512,7 +512,7 @@ INSERT INTO test (i, v) VALUES (3, 'third line'); INSERT INTO test (i, v) VALUES (4, 'immortal'); CREATE OR REPLACE FUNCTION test_munge() RETURNS SETOF test AS $$ - my $rv = spi_exec_query('select i, v from test;'); + my $rv = spi_exec_query('SELECT i, v FROM test;'); my $status = $rv->{status}; my $nrows = $rv->{processed}; foreach my $rn (0 .. $nrows - 1) { @@ -588,7 +588,7 @@ CREATE OR REPLACE FUNCTION lotsa_md5 (INTEGER) RETURNS SETOF foo_type AS $$ return; $$ LANGUAGE plperlu; -SELECT * from lotsa_md5(500); +SELECT * FROM lotsa_md5(500); @@ -1199,7 +1199,7 @@ $$ LANGUAGE plperl; $_TD->{new}{foo} - NEW value of column foo + NEW value of column foo @@ -1208,7 +1208,7 @@ $$ LANGUAGE plperl; $_TD->{old}{foo} - OLD value of column foo + OLD value of column foo diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml index e937491e6b89e..561f6e50d6371 100644 --- a/doc/src/sgml/plpgsql.sgml +++ b/doc/src/sgml/plpgsql.sgml @@ -1023,7 +1023,7 @@ IF count(*) > 0 FROM my_table THEN ... tax := subtotal * 0.06; my_record.user_id := 20; my_array[j] := 20; -my_array[1:3] := array[1,2,3]; +my_array[1:3] := ARRAY[1, 2, 3]; complex_array[n].realpart = 12.3; @@ -1037,7 +1037,7 @@ complex_array[n].realpart = 12.3; within a PL/pgSQL function just by writing the command. For example, you could create and fill a table by writing -CREATE TABLE mytable (id int primary key, data text); +CREATE TABLE mytable (id int PRIMARY KEY, data text); INSERT INTO mytable VALUES (1,'one'), (2,'two'); @@ -4962,13 +4962,13 @@ $$ LANGUAGE plpgsql; Variable substitution currently works only in SELECT, INSERT, UPDATE, - DELETE, and commands containing one of - these (such as EXPLAIN and CREATE TABLE - ... AS SELECT), - because the main SQL engine allows query parameters only in these - commands. To use a non-constant name or value in other statement - types (generically called utility statements), you must construct - the utility statement as a string and EXECUTE it. + DELETE, MERGE and commands + containing one of these (such as EXPLAIN and + CREATE TABLE ... AS SELECT), because the main SQL + engine allows query parameters only in these commands. To use a + non-constant name or value in other statement types (generically called + utility statements), you must construct the utility statement as a string + and EXECUTE it. @@ -5294,24 +5294,24 @@ a_output := a_output || $$ AND name LIKE 'foobar'$$ . For example: -a_output := a_output || '' if v_'' || - referrer_keys.kind || '' like '''''''''' +a_output := a_output || '' IF v_'' || + referrer_keys.kind || '' LIKE '''''''''' || referrer_keys.key_string || '''''''''' - then return '''''' || referrer_keys.referrer_type - || ''''''; end if;''; + THEN RETURN '''''' || referrer_keys.referrer_type + || ''''''; END IF;''; The value of a_output would then be: -if v_... like ''...'' then return ''...''; end if; +IF v_... 
LIKE ''...'' THEN RETURN ''...''; END IF; In the dollar-quoting approach, this becomes: -a_output := a_output || $$ if v_$$ || referrer_keys.kind || $$ like '$$ +a_output := a_output || $$ IF v_$$ || referrer_keys.kind || $$ LIKE '$$ || referrer_keys.key_string || $$' - then return '$$ || referrer_keys.referrer_type - || $$'; end if;$$; + THEN RETURN '$$ || referrer_keys.referrer_type + || $$'; END IF;$$; where we assume we only need to put single quote marks into a_output, because it will be re-quoted before use. diff --git a/doc/src/sgml/plpython.sgml b/doc/src/sgml/plpython.sgml index bee817ea822a2..c447452b7c5f0 100644 --- a/doc/src/sgml/plpython.sgml +++ b/doc/src/sgml/plpython.sgml @@ -1,6 +1,6 @@ - + PL/Python — Python Procedural Language PL/Python @@ -662,6 +662,14 @@ $$ LANGUAGE plpython3u; in PL/Python + + PL/Python can be used to define trigger + functions. + PostgreSQL requires that a function that is to + be called as a trigger must be declared as a function with no arguments and + a return type of trigger. + + When a function is used as a trigger, the dictionary TD contains trigger-related values: @@ -769,6 +777,74 @@ $$ LANGUAGE plpython3u; + + Event Trigger Functions + + + event trigger + in PL/Python + + + + PL/Python can be used to define event triggers + (see also ). + PostgreSQL requires that a function that is to + be called as an event trigger must be declared as a function with no + arguments and a return type of event_trigger. + + + + When a function is used as an event trigger, the dictionary + TD contains trigger-related values: + + + + TD["event"] + + + The event the trigger was fired for, as a string, for example + ddl_command_start. + + + + + + TD["tag"] + + + The command tag for which the trigger was fired, as a string, for + example DROP TABLE. + + + + + + + + shows an example of an + event trigger function in PL/Python. + + + + A <application>PL/Python</application> Event Trigger Function + + + This example trigger simply raises a NOTICE message + each time a supported command is executed. + + + +CREATE OR REPLACE FUNCTION pysnitch() RETURNS event_trigger +LANGUAGE plpython3u +AS $$ + plpy.notice("TD[event] => " + TD["event"] + " ; TD[tag] => " + TD["tag"]); +$$; + +CREATE EVENT TRIGGER pysnitch ON ddl_command_start EXECUTE FUNCTION pysnitch(); + + + + Database Access @@ -989,7 +1065,7 @@ $$ LANGUAGE plpython3u; CREATE FUNCTION count_odd_iterator() RETURNS integer AS $$ odd = 0 -for row in plpy.cursor("select num from largetable"): +for row in plpy.cursor("SELECT num FROM largetable"): if row['num'] % 2: odd += 1 return odd @@ -997,7 +1073,7 @@ $$ LANGUAGE plpython3u; CREATE FUNCTION count_odd_fetch(batch_size integer) RETURNS integer AS $$ odd = 0 -cursor = plpy.cursor("select num from largetable") +cursor = plpy.cursor("SELECT num FROM largetable") while True: rows = cursor.fetch(batch_size) if not rows: @@ -1010,7 +1086,7 @@ $$ LANGUAGE plpython3u; CREATE FUNCTION count_odd_prepared() RETURNS integer AS $$ odd = 0 -plan = plpy.prepare("select num from largetable where num % $1 <> 0", ["integer"]) +plan = plpy.prepare("SELECT num FROM largetable WHERE num % $1 <> 0", ["integer"]) rows = list(plpy.cursor(plan, [2])) # or: = list(plan.cursor([2])) return len(rows) diff --git a/doc/src/sgml/pltcl.sgml b/doc/src/sgml/pltcl.sgml index 5a8e4c9d37e99..9fd008a99d7cd 100644 --- a/doc/src/sgml/pltcl.sgml +++ b/doc/src/sgml/pltcl.sgml @@ -180,7 +180,7 @@ $$ LANGUAGE pltcl; column names. 
Here is an example: -CREATE FUNCTION square_cube(in int, out squared int, out cubed int) AS $$ +CREATE FUNCTION square_cube(IN int, OUT squared int, OUT cubed int) AS $$ return [list squared [expr {$1 * $1}] cubed [expr {$1 * $1 * $1}]] $$ LANGUAGE pltcl; diff --git a/doc/src/sgml/postgres-fdw.sgml b/doc/src/sgml/postgres-fdw.sgml index 781a01067f7d6..9b032fbf67509 100644 --- a/doc/src/sgml/postgres-fdw.sgml +++ b/doc/src/sgml/postgres-fdw.sgml @@ -1226,7 +1226,7 @@ postgres=# SELECT postgres_fdw_disconnect_all(); PostgresFdwCleanupResult - Waiting for transaction abort on remote server. + Waiting for transaction abort on a remote server. diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml index af476c82fcc1e..2101442c90fcb 100644 --- a/doc/src/sgml/postgres.sgml +++ b/doc/src/sgml/postgres.sgml @@ -49,7 +49,7 @@ break is not needed in a wider output rendering. - After you have successfully completed this tutorial you will want to + After you have successfully completed this tutorial, you will want to read the section to gain a better understanding of the SQL language, or for information about developing applications with diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml index c4d3853cbf2c2..41c5954a424ac 100644 --- a/doc/src/sgml/protocol.sgml +++ b/doc/src/sgml/protocol.sgml @@ -189,7 +189,7 @@
- Protocol versions + Protocol Versions The current, latest version of the protocol is version 3.2. However, for @@ -226,7 +226,7 @@ - Protocol versions + Protocol Versions @@ -537,6 +537,11 @@ The frontend should not respond to this message, but should continue listening for a ReadyForQuery message. + + The PostgreSQL server will always send this + message, but some third party backend implementations of the protocol + that don't support query cancellation are known not to. + @@ -886,6 +891,16 @@ SELCT 1/0; Errors detected at semantic analysis or later, such as a misspelled table or column name, do not have this effect. + + + Lastly, note that all the statements within the Query message will + observe the same value of statement_timestamp(), + since that timestamp is updated only upon receipt of the Query + message. This will result in them all observing the same + value of transaction_timestamp() as well, + except in cases where the query string ends a previously-started + transaction and begins a new one. + @@ -1621,7 +1636,7 @@ SELCT 1/0; Likewise the server expects the client to not begin the SSL negotiation until it receives the server's - single byte response to the SSL request. If the + single-byte response to the SSL request. If the client begins the SSL negotiation immediately without waiting for the server response to be received it can reduce connection latency by one round-trip. However this comes at the cost of not being @@ -2225,6 +2240,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" The name of the slot to create. Must be a valid replication slot name (see ). + The name cannot be pg_conflict_detection as it + is reserved for the conflict detection. @@ -2538,8 +2555,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" - - XLogData (B) + + WALData (B) @@ -2587,11 +2604,11 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" - A single WAL record is never split across two XLogData messages. + A single WAL record is never split across two WALData messages. When a WAL record crosses a WAL page boundary, and is therefore already split using continuation records, it can be split at the page boundary. In other words, the first main WAL record and its - continuation records can be sent in different XLogData messages. + continuation records can be sent in different WALData messages. @@ -2643,6 +2660,65 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" + + + Primary status update (B) + + + + Byte1('s') + + + Identifies the message as a primary status update. + + + + + + Int64 + + + The latest WAL write position on the server. + + + + + + Int64 + + + The oldest transaction ID that is currently in the commit phase on + the server, along with its epoch. The most significant 32 bits are + the epoch. The least significant 32 bits are the transaction ID. + If no transactions are active on the server, this number will be + the next transaction ID to be assigned. + + + + + + Int64 + + + The next transaction ID to be assigned on the server, along with + its epoch. The most significant 32 bits are the epoch. The least + significant 32 bits are the transaction ID. + + + + + + Int64 + + + The server's system clock at the time of transmission, as + microseconds since midnight on 2000-01-01. + + + + + + @@ -2787,6 +2863,33 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" + + + Request primary status update (F) + + + + Byte1('p') + + + Identifies the message as a request for a primary status update. 
+ + + + + + Int64 + + + The client's system clock at the time of transmission, as + microseconds since midnight on 2000-01-01. + + + + + + + @@ -2852,7 +2955,7 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" The name of an option passed to the slot's logical decoding output - plugin. See for + plugin. See for options that are accepted by the standard (pgoutput) plugin. @@ -3420,128 +3523,15 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" the physical streaming replication protocol. - - PostgreSQL logical decoding supports output - plugins. pgoutput is the standard one used for - the built-in logical replication. - - Logical Streaming Replication Parameters - Using the START_REPLICATION command, - pgoutput accepts the following options: - - - - - proto_version - - - - Protocol version. Currently versions 1, 2, - 3, and 4 are supported. A valid - version is required. - - - Version 2 is supported only for server version 14 - and above, and it allows streaming of large in-progress transactions. - - - Version 3 is supported only for server version 15 - and above, and it allows streaming of two-phase commits. - - - Version 4 is supported only for server version 16 - and above, and it allows streams of large in-progress transactions to - be applied in parallel. - - - - - - - publication_names - - - - Comma-separated list of publication names for which to subscribe - (receive changes). The individual publication names are treated - as standard objects names and can be quoted the same as needed. - At least one publication name is required. - - - - - - - binary - - - - Boolean option to use binary transfer mode. Binary mode is faster - than the text mode but slightly less robust. - - - - - - - messages - - - - Boolean option to enable sending the messages that are written - by pg_logical_emit_message. - - - - - - - streaming - - - - Boolean option to enable streaming of in-progress transactions. - It accepts an additional value "parallel" to enable sending extra - information with some messages to be used for parallelisation. - Minimum protocol version 2 is required to turn it on. Minimum protocol - version 4 is required for the "parallel" option. - - - - - - - two_phase - - - - Boolean option to enable two-phase transactions. Minimum protocol - version 3 is required to turn it on. - - - - - - - origin - - - - Option to send changes by their origin. Possible values are - none to only send the changes that have no origin - associated, or any - to send the changes regardless of their origin. This can be used - to avoid loops (infinite replication of the same data) among - replication nodes. - - - - - + The START_REPLICATION command can pass + options to the logical decoding output plugin associated + with the specified replication slot. + See for options + that are accepted by the standard (pgoutput) plugin. @@ -4146,7 +4136,7 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" message, indicated by the length field. - The maximum key length is 256 bytes. The + The minimum and maximum key length are 4 and 256 bytes, respectively. The PostgreSQL server only sends keys up to 32 bytes, but the larger maximum size allows for future server versions, as well as connection poolers and other middleware, to use @@ -4337,7 +4327,7 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" - Int32(16) + Int32 Length of message contents in bytes, including self. 
@@ -6081,13 +6071,14 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" - Int32(196608) + Int32 The protocol version number. The most significant 16 bits are - the major version number (3 for the protocol described here). - The least significant 16 bits are the minor version number - (0 for the protocol described here). + the major version number. The least significant 16 bits are the minor + version number. As an example protocol version 3.2 is represented as + 196610 in decimal or more clearly as + 0x00030002 in hexadecimal. @@ -7292,8 +7283,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" Int64 (XLogRecPtr) - The LSN of the abort. This field is available since protocol version - 4. + The LSN of the abort operation, present only when streaming is set to parallel. + This field is available since protocol version 4. @@ -7302,9 +7293,9 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" Int64 (TimestampTz) - Abort timestamp of the transaction. The value is in number - of microseconds since PostgreSQL epoch (2000-01-01). This field is - available since protocol version 4. + Abort timestamp of the transaction, present only when streaming is set to + parallel. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). + This field is available since protocol version 4. diff --git a/doc/src/sgml/queries.sgml b/doc/src/sgml/queries.sgml index a326960ff4dfb..4b522213171b8 100644 --- a/doc/src/sgml/queries.sgml +++ b/doc/src/sgml/queries.sgml @@ -410,7 +410,7 @@ FROM table_reference , table_r - To put this together, assume we have tables t1: + To put this together, assume we have tables t1: num | name -----+------ @@ -418,7 +418,7 @@ FROM table_reference , table_r 2 | b 3 | c - and t2: + and t2: num | value -----+------- @@ -1079,7 +1079,7 @@ SELECT select_list In the second query, we could not have written SELECT * FROM test1 GROUP BY x, because there is no single value - for the column y that could be associated with each + for the column y that could be associated with each group. The grouped-by columns can be referenced in the select list since they have a single value in each group. @@ -1145,10 +1145,36 @@ SELECT product_id, p.name, (sum(s.units) * p.price) AS sales In strict SQL, GROUP BY can only group by columns of - the source table but PostgreSQL extends + the source table, but PostgreSQL extends this to also allow GROUP BY to group by columns in the select list. Grouping by value expressions instead of simple - column names is also allowed. + column names is also allowed (but GROUP BY + expressions cannot contain aggregate functions or window functions). + + + + PostgreSQL also supports the syntax GROUP BY ALL, + which is equivalent to explicitly writing all select-list entries that + do not contain either an aggregate function or a window function. + This can greatly simplify ad-hoc exploration of data. + As an example, these queries are equivalent: + +=> SELECT a, b, a + b, sum(c) FROM test1 GROUP BY ALL; + a | b | ?column? | sum +---+---+----------+---- + 1 | 4 | 5 | 9 + 2 | 5 | 7 | 12 + 3 | 6 | 9 | 15 +(3 rows) + +=> SELECT a, b, a + b, sum(c) FROM test1 GROUP BY a, b, a + b; + a | b | ?column? 
| sum +---+---+----------+---- + 1 | 4 | 5 | 9 + 2 | 5 | 7 | 12 + 3 | 6 | 9 | 15 +(3 rows) + @@ -2206,7 +2232,7 @@ WITH RECURSIVE included_parts(sub_part, part, quantity) AS ( FROM included_parts pr, parts p WHERE p.part = pr.sub_part ) -SELECT sub_part, SUM(quantity) as total_quantity +SELECT sub_part, SUM(quantity) AS total_quantity FROM included_parts GROUP BY sub_part @@ -2577,7 +2603,7 @@ WHERE w2.key = 123; undesirable is WITH w AS ( - SELECT key, very_expensive_function(val) as f FROM some_table + SELECT key, very_expensive_function(val) AS f FROM some_table ) SELECT * FROM w AS w1 JOIN w AS w2 ON w1.f = w2.f; diff --git a/doc/src/sgml/query.sgml b/doc/src/sgml/query.sgml index 727a0cb185fb2..b190f28d41ea6 100644 --- a/doc/src/sgml/query.sgml +++ b/doc/src/sgml/query.sgml @@ -264,8 +264,18 @@ COPY weather FROM '/home/user/weather.txt'; where the file name for the source file must be available on the machine running the backend process, not the client, since the backend process - reads the file directly. You can read more about the - COPY command in . + reads the file directly. The data inserted above into the weather table + could also be inserted from a file containing (values are separated by a + tab character): + + +San Francisco 46 50 0.25 1994-11-27 +San Francisco 43 57 0.0 1994-11-29 +Hayward 37 54 \N 1994-11-29 + + + You can read more about the COPY command in + . diff --git a/doc/src/sgml/ref/allfiles.sgml b/doc/src/sgml/ref/allfiles.sgml index f5be638867abe..e167406c74490 100644 --- a/doc/src/sgml/ref/allfiles.sgml +++ b/doc/src/sgml/ref/allfiles.sgml @@ -188,6 +188,7 @@ Complete list of usable sgml source files in this directory. + diff --git a/doc/src/sgml/ref/alter_database.sgml b/doc/src/sgml/ref/alter_database.sgml index 9da8920e12eff..1fc051e11a311 100644 --- a/doc/src/sgml/ref/alter_database.sgml +++ b/doc/src/sgml/ref/alter_database.sgml @@ -83,7 +83,7 @@ ALTER DATABASE name RESET ALL must be empty for this database, and no one can be connected to the database. Tables and indexes in non-default tablespaces are unaffected. The method used to copy files to the new tablespace - is affected by the setting. + is affected by the setting. diff --git a/doc/src/sgml/ref/alter_publication.sgml b/doc/src/sgml/ref/alter_publication.sgml index d5ea383e8bc95..028770f214955 100644 --- a/doc/src/sgml/ref/alter_publication.sgml +++ b/doc/src/sgml/ref/alter_publication.sgml @@ -23,15 +23,24 @@ PostgreSQL documentation ALTER PUBLICATION name ADD publication_object [, ...] ALTER PUBLICATION name SET publication_object [, ...] -ALTER PUBLICATION name DROP publication_object [, ...] +ALTER PUBLICATION name DROP publication_drop_object [, ...] ALTER PUBLICATION name SET ( publication_parameter [= value] [, ... ] ) ALTER PUBLICATION name OWNER TO { new_owner | CURRENT_ROLE | CURRENT_USER | SESSION_USER } ALTER PUBLICATION name RENAME TO new_name where publication_object is one of: - TABLE [ ONLY ] table_name [ * ] [ ( column_name [, ... ] ) ] [ WHERE ( expression ) ] [, ... ] + TABLE table_and_columns [, ... ] TABLES IN SCHEMA { schema_name | CURRENT_SCHEMA } [, ... ] + +and publication_drop_object is one of: + + TABLE [ ONLY ] table_name [ * ] [, ... ] + TABLES IN SCHEMA { schema_name | CURRENT_SCHEMA } [, ... ] + +and table_and_columns is: + + [ ONLY ] table_name [ * ] [ ( column_name [, ... 
] ) ] [ WHERE ( expression ) ] @@ -57,8 +66,7 @@ ALTER PUBLICATION name RENAME TO DROP TABLES IN SCHEMA will not drop any schema tables that were specified using FOR TABLE/ - ADD TABLE, and the combination of DROP - with a WHERE clause is not allowed. + ADD TABLE. @@ -82,8 +90,9 @@ ALTER PUBLICATION name RENAME TO CREATE privilege on the database. Also, the new owner of a - FOR ALL TABLES - or FOR TABLES IN SCHEMA + FOR TABLES IN SCHEMA + or FOR ALL TABLES + or FOR ALL SEQUENCES publication must be a superuser. However, a superuser can change the ownership of a publication regardless of these restrictions. @@ -153,6 +162,7 @@ ALTER PUBLICATION name RENAME TO This clause alters publication parameters originally set by . See there for more information. + This clause is not applicable to sequences. diff --git a/doc/src/sgml/ref/alter_sequence.sgml b/doc/src/sgml/ref/alter_sequence.sgml index a998ccc7ead2f..db7b98fdf8bde 100644 --- a/doc/src/sgml/ref/alter_sequence.sgml +++ b/doc/src/sgml/ref/alter_sequence.sgml @@ -207,8 +207,8 @@ ALTER SEQUENCE [ IF EXISTS ] name S The optional clause RESTART [ WITH restart ] changes the current value of the sequence. This is similar to calling the - setval function with is_called = - false: the specified value will be returned by the + setval function with is_called + = false: the specified value will be returned by the next call of nextval. Writing RESTART with no restart value is equivalent to supplying diff --git a/doc/src/sgml/ref/alter_subscription.sgml b/doc/src/sgml/ref/alter_subscription.sgml index fdc648d007f1c..27c06439f4fd1 100644 --- a/doc/src/sgml/ref/alter_subscription.sgml +++ b/doc/src/sgml/ref/alter_subscription.sgml @@ -26,6 +26,7 @@ ALTER SUBSCRIPTION name SET PUBLICA ALTER SUBSCRIPTION name ADD PUBLICATION publication_name [, ...] [ WITH ( publication_option [= value] [, ... ] ) ] ALTER SUBSCRIPTION name DROP PUBLICATION publication_name [, ...] [ WITH ( publication_option [= value] [, ... ] ) ] ALTER SUBSCRIPTION name REFRESH PUBLICATION [ WITH ( refresh_option [= value] [, ... ] ) ] +ALTER SUBSCRIPTION name REFRESH SEQUENCES ALTER SUBSCRIPTION name ENABLE ALTER SUBSCRIPTION name DISABLE ALTER SUBSCRIPTION name SET ( subscription_parameter [= value] [, ... ] ) @@ -139,9 +140,10 @@ ALTER SUBSCRIPTION name RENAME TO < refresh (boolean) - When false, the command will not try to refresh table information. - REFRESH PUBLICATION should then be executed separately. - The default is true. + When false, the command will not try to refresh + table and sequence information. REFRESH PUBLICATION + should then be executed separately. The default is + true. @@ -158,13 +160,19 @@ ALTER SUBSCRIPTION name RENAME TO < REFRESH PUBLICATION - Fetch missing table information from publisher. This will start + Fetch missing table and sequence information from the publisher. This will start replication of tables that were added to the subscribed-to publications since CREATE SUBSCRIPTION or the last invocation of REFRESH PUBLICATION. + + The system catalog pg_subscription_rel + is updated to record all tables and sequences known to the subscription + that are still part of the publication. + + refresh_option specifies additional options for the refresh operation. The supported options are: @@ -174,14 +182,25 @@ ALTER SUBSCRIPTION name RENAME TO < copy_data (boolean) - Specifies whether to copy pre-existing data in the publications - that are being subscribed to when the replication starts. - The default is true.
+ Specifies whether to copy pre-existing data for tables and synchronize + sequences in the publications that are being subscribed to when the replication + starts. The default is true. Previously subscribed tables are not copied, even if a table's row filter WHERE clause has since been modified. + + Previously subscribed sequences are not re-synchronized. To do that, + use + ALTER SUBSCRIPTION ... REFRESH SEQUENCES. + + + See for recommendations on how + to handle any warnings about sequence definition differences between + the publisher and the subscriber, which might occur when + copy_data = true. + See for details of how copy_data = true can interact with the @@ -200,6 +219,30 @@ ALTER SUBSCRIPTION name RENAME TO < + + REFRESH SEQUENCES + + + Re-synchronize sequence data with the publisher. Unlike + + ALTER SUBSCRIPTION ... REFRESH PUBLICATION, which + can only synchronize newly added sequences, + REFRESH SEQUENCES will re-synchronize the sequence + data for all currently subscribed sequences. It does not add or remove + sequences from the subscription to match the publication. + + + See for + recommendations on how to handle any warnings about sequence definition + differences between the publisher and the subscriber. + + + See for recommendations on how to + identify and handle out-of-sync sequences. + + + + ENABLE @@ -235,8 +278,10 @@ ALTER SUBSCRIPTION name RENAME TO < password_required, run_as_owner, origin, - failover, and - two_phase. + failover, + two_phase, + retain_dead_tuples, and + max_retention_duration. Only a superuser can set password_required = false. @@ -261,8 +306,9 @@ ALTER SUBSCRIPTION name RENAME TO < - The failover - and two_phase + The failover, + two_phase, and + retain_dead_tuples parameters can only be altered when the subscription is disabled. @@ -285,6 +331,14 @@ ALTER SUBSCRIPTION name RENAME TO < option is changed from true to false, the publisher will replicate the transactions again when they are committed. + + + If the retain_dead_tuples + option is altered to false and no other subscription + has this option enabled, the replication slot named + pg_conflict_detection, created to retain + dead tuples for conflict detection, will be dropped. + diff --git a/doc/src/sgml/ref/alter_system.sgml b/doc/src/sgml/ref/alter_system.sgml index 1bde66d6ad2d3..b28919d1b262a 100644 --- a/doc/src/sgml/ref/alter_system.sgml +++ b/doc/src/sgml/ref/alter_system.sgml @@ -84,6 +84,8 @@ ALTER SYSTEM RESET ALL constants, identifiers, numbers, or comma-separated lists of these, as appropriate for the particular parameter. Values that are neither numbers nor valid identifiers must be quoted. + If the parameter accepts a list of values, NULL + can be written to specify an empty list. DEFAULT can be written to specify removing the parameter and its value from postgresql.auto.conf.
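Returning to the ALTER SUBSCRIPTION additions above, a minimal usage sketch with a hypothetical subscription mysub; as noted above, retain_dead_tuples can only be altered while the subscription is disabled:

ALTER SUBSCRIPTION mysub REFRESH SEQUENCES;

ALTER SUBSCRIPTION mysub DISABLE;
ALTER SUBSCRIPTION mysub SET (retain_dead_tuples = false);
ALTER SUBSCRIPTION mysub ENABLE;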
@@ -136,7 +138,15 @@ ALTER SYSTEM SET wal_level = replica; in postgresql.conf: ALTER SYSTEM RESET wal_level; - + + + + + Set the list of preloaded extension modules to be empty: + +ALTER SYSTEM SET shared_preload_libraries TO NULL; + + diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index d63f3a621acc6..9abd8037f2807 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -37,6 +37,12 @@ ALTER TABLE [ IF EXISTS ] name ATTACH PARTITION partition_name { FOR VALUES partition_bound_spec | DEFAULT } ALTER TABLE [ IF EXISTS ] name DETACH PARTITION partition_name [ CONCURRENTLY | FINALIZE ] +ALTER TABLE [ IF EXISTS ] name + MERGE PARTITIONS (partition_name1, partition_name2 [, ...]) INTO partition_name +ALTER TABLE [ IF EXISTS ] name + SPLIT PARTITION partition_name INTO + (PARTITION partition_name1 { FOR VALUES partition_bound_spec | DEFAULT }, + PARTITION partition_name2 { FOR VALUES partition_bound_spec | DEFAULT } [, ...]) where action is one of: @@ -210,6 +216,8 @@ WITH ( MODULUS numeric_literal, REM When this form is used, the column's statistics are removed, so running ANALYZE on the table afterwards is recommended. + For a virtual generated column, ANALYZE + is not necessary because such columns never have statistics. @@ -240,9 +248,10 @@ WITH ( MODULUS numeric_literal, REM provided none of the records in the table contain a NULL value for the column. Ordinarily this is checked during the ALTER TABLE by scanning the - entire table; however, if a valid CHECK constraint is - found which proves no NULL can exist, then the - table scan is skipped. + entire table, unless NOT VALID is specified; + however, if a valid CHECK constraint exists + (and is not dropped in the same command) which proves no + NULL can exist, then the table scan is skipped. If a column has an invalid not-null constraint, SET NOT NULL validates it. @@ -270,6 +279,15 @@ WITH ( MODULUS numeric_literal, REM in a stored generated column is rewritten and all the future changes will apply the new generation expression. + + + When this form is used on a stored generated column, its statistics + are removed, so running + ANALYZE + on the table afterwards is recommended. + For a virtual generated column, ANALYZE + is not necessary because such columns never have statistics. + @@ -364,24 +382,22 @@ WITH ( MODULUS numeric_literal, REM n_distinct_inherited, which override the number-of-distinct-values estimates made by subsequent ANALYZE - operations. n_distinct affects the statistics for the table - itself, while n_distinct_inherited affects the statistics - gathered for the table plus its inheritance children. When set to a - positive value, ANALYZE will assume that the column contains - exactly the specified number of distinct nonnull values. When set to a - negative value, which must be greater - than or equal to -1, ANALYZE will assume that the number of - distinct nonnull values in the column is linear in the size of the - table; the exact count is to be computed by multiplying the estimated - table size by the absolute value of the given number. For example, - a value of -1 implies that all values in the column are distinct, while - a value of -0.5 implies that each value appears twice on the average. - This can be useful when the size of the table changes over time, since - the multiplication by the number of rows in the table is not performed - until query planning time. Specify a value of 0 to revert to estimating - the number of distinct values normally. 
For more information on the use - of statistics by the PostgreSQL query - planner, refer to . + operations. n_distinct affects the statistics for the + table itself, while n_distinct_inherited affects the + statistics gathered for the table plus its inheritance children, and + the statistics gathered for partitioned tables. When the value + specified is positive, the query planner will assume that the + column contains exactly the specified number of distinct nonnull values. + Negative values may also be specified, but must be greater than or + equal to -1. This instructs the query planner to estimate the + number of distinct values by multiplying the absolute value of the + specified number by the estimated number of rows in the table. For + example, a value of -1 implies that all values in the column are + distinct, while a value of -0.5 implies that each value appears twice on + average. This can be useful when the size of the table changes over + time. For more information on the use of statistics by the + PostgreSQL query planner, refer to + . Changing per-attribute options acquires a @@ -460,8 +476,8 @@ WITH ( MODULUS numeric_literal, REM This form adds a new constraint to a table using the same constraint syntax as CREATE TABLE, plus the option NOT - VALID, which is currently only allowed for foreign key, - CHECK constraints and not-null constraints. + VALID, which is currently only allowed for foreign-key, + CHECK, and not-null constraints. @@ -469,7 +485,7 @@ WITH ( MODULUS numeric_literal, REM existing rows in the table satisfy the new constraint. But if the NOT VALID option is used, this potentially-lengthy scan is skipped. The constraint will still be - enforced against subsequent inserts or updates (that is, they'll fail + applied against subsequent inserts or updates (that is, they'll fail unless there is a matching row in the referenced table, in the case of foreign keys, or they'll fail unless the new row matches the specified check condition). But the @@ -591,7 +607,7 @@ WITH ( MODULUS numeric_literal, REM This form validates a foreign key, check, or not-null constraint that was previously created as NOT VALID, by scanning the table to ensure there are no rows for which the constraint is not - satisfied. If the constraint is not enforced, an error is thrown. + satisfied. If the constraint was set to NOT ENFORCED, an error is thrown. Nothing happens if the constraint is already marked valid. (See below for an explanation of the usefulness of this command.) @@ -852,7 +868,7 @@ WITH ( MODULUS numeric_literal, REM SHARE UPDATE EXCLUSIVE lock will be taken for - fillfactor, toast and autovacuum storage parameters, as well as the + fillfactor, TOAST and autovacuum storage parameters, as well as the planner parameter parallel_workers. @@ -1147,18 +1163,206 @@ WITH ( MODULUS numeric_literal, REM + + MERGE PARTITIONS (partition_name1, partition_name2 [, ...]) INTO partition_name + + + + This form merges several partitions of the target table into a new partition. + Hash-partitioned target tables are not supported. + Only simple, non-partitioned partitions can be merged. + The new partition (partition_name) + can have the same name as one of the merged partitions + (partition_name1, + partition_name2 [, ...]). + + + + If the DEFAULT partition is not in the + list of merged partitions: + + + + For range-partitioned tables, the ranges of merged partitions + must be adjacent in order to be merged.
+ The partition bounds of merged partitions are combined to form the new partition bound for + partition_name. + + + + + For list-partitioned tables, the partition bounds of + merged partitions are combined to form the new partition bound for + partition_name. + + + + If the DEFAULT partition is in the list of merged partitions: + + + + The partition partition_name + will be the new DEFAULT partition of the target table. + + + + + The partition bound specifications for merged partitions can be arbitrary. + + + + + + All merged partitions must have the same owner. + The owner of merged partitions will be the owner of the new partition. + It is the user's responsibility to set up ACLs on + the new partition. + + + + ALTER TABLE MERGE PARTITIONS uses the partitioned + table itself as the template to construct the new partition. + The new partition will inherit the same table access method, persistence + type, and tablespace as the partitioned table. + + Constraints, column defaults, column generation expressions, identity + columns, indexes, and triggers are copied from the partitioned table to + the new partition. But extended statistics, security policies, etc., + won't be copied from the partitioned table. + Indexes and identity columns copied from the partitioned table will be + created afterward, once the data has been moved into the new partition. + + + + When partitions are merged, any objects depending on the merged partitions, + such as constraints, triggers, extended statistics, etc., will be + dropped. + Eventually, all the merged partitions are dropped + (using RESTRICT mode) too; therefore, if any objects + are still dependent on them, + ALTER TABLE MERGE PARTITIONS will fail + (see ). + + + + + Merging partitions acquires an ACCESS EXCLUSIVE lock on + the parent table, in addition to the ACCESS EXCLUSIVE + locks on the tables being merged and on the default partition (if any). + + + + + ALTER TABLE MERGE PARTITIONS creates a new partition and + moves data from all the merged partitions into it, which can take a long time. + So it is not recommended to use the command to merge very big partitions + with small ones. + + + + + + + + SPLIT PARTITION partition_name INTO ( + PARTITION partition_name1 { FOR VALUES partition_bound_spec | DEFAULT }, + PARTITION partition_name2 { FOR VALUES partition_bound_spec | DEFAULT } + [, ...]) + + + + + This form splits a single partition of the target table into new + partitions. Hash-partitioned target tables are not supported. + Only a simple, non-partitioned partition can be split. + If the split partition is the DEFAULT partition, + one of the new partitions must be DEFAULT. + If the partitioned table does not have a DEFAULT + partition, a DEFAULT partition can be defined as one + of the new partitions. + + + + The bounds of new partitions should not overlap with those of new or + existing partitions (except partition_name). + The combined bounds of new partitions + partition_name1, + partition_name2[, ...] + should be equal to the bounds of the split partition + partition_name. + One of the new partitions can have the same name as the split partition + partition_name + (this is suitable in case of splitting the DEFAULT + partition: after the split, the DEFAULT partition + remains with the same name, but its partition bound changes). + + + + New partitions will have the same owner as the parent partition. + It is the user's responsibility to set up ACLs on new + partitions.
+ + + + ALTER TABLE SPLIT PARTITION uses the partitioned + table itself as the template to construct new partitions. + New partitions will inherit the same table access method, persistence + type, and tablespace as the partitioned table. + + + + Constraints, column defaults, column generation expressions, + identity columns, indexes, and triggers are copied from the partitioned + table to the new partitions. But extended statistics, security + policies, etc., won't be copied from the partitioned table. + Indexes and identity columns copied from the partitioned table will be + created afterward, once the data has been moved into the new partitions. + + + + When a partition is split, any objects that depend on this partition, + such as constraints, triggers, extended statistics, etc., will be dropped. + This occurs because ALTER TABLE SPLIT PARTITION uses + the partitioned table itself as the template to reconstruct these + objects later. + Eventually, the split partition is dropped + (using RESTRICT mode) too; therefore, if any objects + are still dependent on it, ALTER TABLE SPLIT PARTITION + will fail (see ). + + + + + Splitting a partition acquires an ACCESS EXCLUSIVE lock on + the parent table, in addition to the ACCESS EXCLUSIVE + lock on the table being split. + + + + + + ALTER TABLE SPLIT PARTITION creates new partitions and + moves data from the split partition into them, which can take a long + time. So it is not recommended to use the command for splitting a + small fraction of rows out of a very big partition. + + + + + All the forms of ALTER TABLE that act on a single table, except RENAME, SET SCHEMA, - ATTACH PARTITION, and - DETACH PARTITION can be combined into + ATTACH PARTITION, DETACH PARTITION, + MERGE PARTITIONS, and SPLIT PARTITION + can be combined into a list of multiple alterations to be applied together. For example, it is possible to add several columns and/or alter the type of several columns in a single command. This is particularly useful with large - tables, since only one pass over the table need be made. + tables, since only one pass over the table needs to be made. @@ -1397,7 +1601,19 @@ WITH ( MODULUS numeric_literal, REM partition_name - The name of the table to attach as a new partition or to detach from this table. + The name of the table to attach as a new partition or to detach from this table, + or the name of the partition being split, or the name of the new merged partition. + + + + + + partition_name1 + partition_name2 + + + The names of the partitions being merged into the new partition, or the + names of the new partitions created by the split. @@ -1466,11 +1682,11 @@ WITH ( MODULUS numeric_literal, REM - Adding an enforced CHECK or NOT NULL + Adding a CHECK or NOT NULL constraint requires scanning the table to verify that existing rows meet the constraint, but does not require a table rewrite. If a CHECK - constraint is added as NOT ENFORCED, the validation will - not be performed. + constraint is added as NOT ENFORCED, no verification will + be performed. @@ -1485,7 +1701,7 @@ WITH ( MODULUS numeric_literal, REM - Scanning a large table to verify a new foreign key or check constraint + Scanning a large table to verify new foreign-key, check, or not-null constraints can take a long time, and other updates to the table are locked out until the ALTER TABLE ADD CONSTRAINT command is committed.
The main purpose of the NOT VALID @@ -1632,7 +1848,7 @@ ALTER TABLE measurements ALTER TABLE transactions ADD COLUMN status varchar(30) DEFAULT 'old', - ALTER COLUMN status SET default 'current'; + ALTER COLUMN status SET DEFAULT 'current'; Existing rows will be filled with old, but then the default for subsequent commands will be current. @@ -1830,6 +2046,31 @@ ALTER TABLE measurement DETACH PARTITION measurement_y2015m12; + + To split a single partition of a range-partitioned table: + +ALTER TABLE sales_range SPLIT PARTITION sales_feb_mar_apr2023 INTO + (PARTITION sales_feb2023 FOR VALUES FROM ('2023-02-01') TO ('2023-03-01'), + PARTITION sales_mar2023 FOR VALUES FROM ('2023-03-01') TO ('2023-04-01'), + PARTITION sales_apr2023 FOR VALUES FROM ('2023-04-01') TO ('2023-05-01')); + + + + To split a single partition of a list-partitioned table: + +ALTER TABLE sales_list SPLIT PARTITION sales_all INTO + (PARTITION sales_west FOR VALUES IN ('Lisbon', 'New York', 'Madrid'), + PARTITION sales_east FOR VALUES IN ('Beijing', 'Delhi', 'Vladivostok'), + PARTITION sales_central FOR VALUES IN ('Warsaw', 'Berlin', 'Kyiv')); + + + + To merge several partitions into one partition of the target table: + +ALTER TABLE sales_list MERGE PARTITIONS (sales_west, sales_east, sales_central) + INTO sales_all; + + diff --git a/doc/src/sgml/ref/checkpoint.sgml b/doc/src/sgml/ref/checkpoint.sgml index db011a47d0458..cd981cf2cab9f 100644 --- a/doc/src/sgml/ref/checkpoint.sgml +++ b/doc/src/sgml/ref/checkpoint.sgml @@ -21,7 +21,12 @@ PostgreSQL documentation -CHECKPOINT +CHECKPOINT [ ( option [, ...] ) ] + +where option can be one of: + + FLUSH_UNLOGGED [ boolean ] + MODE { FAST | SPREAD } @@ -37,14 +42,24 @@ CHECKPOINT - The CHECKPOINT command forces an immediate + By default, the CHECKPOINT command forces a fast checkpoint when the command is issued, without waiting for a regular checkpoint scheduled by the system (controlled by the settings in ). + To request the checkpoint be spread over a longer interval, set the + MODE option to SPREAD. CHECKPOINT is not intended for use during normal operation. + + The server may consolidate concurrently requested checkpoints. Such + consolidated requests will contain a combined set of options. For example, + if one session requests a fast checkpoint and another requests a spread + checkpoint, the server may combine those requests and perform one fast + checkpoint. + + If executed during recovery, the CHECKPOINT command will force a restartpoint (see ) @@ -58,6 +73,55 @@ CHECKPOINT + + Parameters + + + + FLUSH_UNLOGGED + + + Normally, CHECKPOINT does not flush dirty buffers of + unlogged relations. This option, which is disabled by default, enables + flushing unlogged relations to disk. + + + + + + MODE + + + When set to FAST, which is the default, the requested + checkpoint will be completed as fast as possible, which may result in a + significantly higher rate of I/O during the checkpoint. + + + MODE can also be set to SPREAD to + request the checkpoint be spread over a longer interval (controlled via + the settings in ), like a + regular checkpoint scheduled by the system. This can reduce the rate of + I/O during the checkpoint. + + + + + + boolean + + + Specifies whether the selected option should be turned on or off. + You can write TRUE, ON, or + 1 to enable the option, and FALSE, + OFF, or 0 to disable it. The + boolean value can also + be omitted, in which case TRUE is assumed.
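A brief sketch of the CHECKPOINT options syntax described above; both statements request a spread checkpoint that also flushes unlogged relations (the second spells out the boolean that the first leaves implicit):

CHECKPOINT (MODE SPREAD, FLUSH_UNLOGGED);
CHECKPOINT (MODE SPREAD, FLUSH_UNLOGGED TRUE);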
+ + + + + + Compatibility diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml index 8811f169ea0b1..0b47460080b92 100644 --- a/doc/src/sgml/ref/cluster.sgml +++ b/doc/src/sgml/ref/cluster.sgml @@ -220,7 +220,7 @@ CLUSTER [ ( option [, ...] ) ] [ Examples - Cluster the table employees on the basis of + Cluster the table employees on the basis of its index employees_ind: CLUSTER employees USING employees_ind; @@ -228,7 +228,7 @@ CLUSTER employees USING employees_ind; - Cluster the employees table using the same + Cluster the employees table using the same index that was used before: CLUSTER employees; diff --git a/doc/src/sgml/ref/comment.sgml b/doc/src/sgml/ref/comment.sgml index 5b43c56b13359..8d81244910ba7 100644 --- a/doc/src/sgml/ref/comment.sgml +++ b/doc/src/sgml/ref/comment.sgml @@ -301,7 +301,7 @@ COMMENT ON Examples - Attach a comment to the table mytable: + Attach a comment to the table mytable: COMMENT ON TABLE mytable IS 'This is my table.'; diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 8433344e5b6f5..53b0ea8f57357 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -37,7 +37,7 @@ COPY { table_name [ ( delimiter_character' NULL 'null_string' DEFAULT 'default_string' - HEADER [ boolean | MATCH ] + HEADER [ boolean | integer | MATCH ] QUOTE 'quote_character' ESCAPE 'escape_character' FORCE_QUOTE { ( column_name [, ...] ) | * } @@ -50,7 +50,7 @@ COPY { table_name [ ( - + Description @@ -101,11 +101,11 @@ COPY { table_name [ ( - + Parameters - + table_name @@ -114,7 +114,7 @@ COPY { table_name [ ( - + column_name @@ -125,7 +125,7 @@ COPY { table_name [ ( - + query @@ -149,7 +149,7 @@ COPY { table_name [ ( - + filename @@ -161,7 +161,7 @@ COPY { table_name [ ( - + PROGRAM @@ -180,7 +180,7 @@ COPY { table_name [ ( - + STDIN @@ -189,7 +189,7 @@ COPY { table_name [ ( - + STDOUT @@ -198,7 +198,7 @@ COPY { table_name [ ( - + boolean @@ -212,7 +212,16 @@ COPY { table_name [ ( - + + integer + + + Specifies a non-negative integer value passed to the selected option. + + + + + FORMAT @@ -226,7 +235,7 @@ COPY { table_name [ ( - + FREEZE @@ -249,7 +258,7 @@ COPY { table_name [ ( - + DELIMITER @@ -262,7 +271,7 @@ COPY { table_name [ ( - + NULL @@ -286,7 +295,7 @@ COPY { table_name [ ( - + DEFAULT @@ -299,25 +308,34 @@ COPY { table_name [ ( - + HEADER - Specifies that the file contains a header line with the names of each - column in the file. On output, the first line contains the column - names from the table. On input, the first line is discarded when this - option is set to true (or equivalent Boolean value). - If this option is set to MATCH, the number and names - of the columns in the header line must match the actual column names of - the table, in order; otherwise an error is raised. + On output, if this option is set to true + (or an equivalent Boolean value), the first line of the output will + contain the column names from the table. + Integer values 0 and 1 are + accepted as Boolean values, but other integers are not allowed for + COPY TO commands. + + + On input, if this option is set to true + (or an equivalent Boolean value), the first line of the input is + discarded. If set to a non-negative integer, that number of + lines are discarded. If set to MATCH, the first line + is discarded, and it must contain column names that exactly match the + table's columns, in both number and order; otherwise, an error is raised. + The MATCH value is only valid for + COPY FROM commands. 
+ + This option is not allowed when using binary format. - The MATCH option is only valid for COPY - FROM commands. - + QUOTE @@ -329,7 +347,7 @@ COPY { table_name [ ( - + ESCAPE @@ -343,7 +361,7 @@ COPY { table_name [ ( - + FORCE_QUOTE @@ -357,7 +375,7 @@ COPY { table_name [ ( - + FORCE_NOT_NULL @@ -372,7 +390,7 @@ COPY { table_name [ ( - + FORCE_NULL @@ -387,7 +405,7 @@ COPY { table_name [ ( - + ON_ERROR @@ -415,7 +433,7 @@ COPY { table_name [ ( - + REJECT_LIMIT @@ -433,7 +451,7 @@ COPY { table_name [ ( - + ENCODING @@ -445,7 +463,7 @@ COPY { table_name [ ( - + LOG_VERBOSITY @@ -463,7 +481,7 @@ COPY { table_name [ ( - + WHERE @@ -492,7 +510,7 @@ WHERE condition - + Outputs @@ -516,18 +534,19 @@ COPY count - + Notes COPY TO can be used with plain - tables and populated materialized views. - For example, - COPY table - TO copies the same rows as + tables, populated materialized views, and partitioned tables. + For non-partitioned tables, COPY table + copies the same rows as SELECT * FROM ONLY table. + For partitioned tables, it copies the same rows as + SELECT * FROM table. However it doesn't directly support other relation types, - such as partitioned tables, inheritance child tables, or views. + such as inheritance child tables, or views. To copy all rows from such relations, use COPY (SELECT * FROM table) TO. @@ -1056,7 +1075,7 @@ versions of PostgreSQL. - + Examples @@ -1122,7 +1141,7 @@ ZW ZIMBABWE - + Compatibility diff --git a/doc/src/sgml/ref/create_aggregate.sgml b/doc/src/sgml/ref/create_aggregate.sgml index 222e0aa5c9d08..0472ac2e87459 100644 --- a/doc/src/sgml/ref/create_aggregate.sgml +++ b/doc/src/sgml/ref/create_aggregate.sgml @@ -384,9 +384,13 @@ SELECT col FROM tab ORDER BY col USING sortop LIMIT 1; The approximate average size (in bytes) of the aggregate's state value. If this parameter is omitted or is zero, a default estimate is used - based on the state_data_type. + based on the state_data_type. If set to a + negative value, it indicates the state data can grow unboundedly in + size, such as when the aggregate accumulates input rows (e.g., + array_agg, string_agg). The planner uses this value to estimate the memory required for a - grouped aggregate query. + grouped aggregate query and to avoid optimizations that may cause + excessive memory usage. @@ -568,7 +572,8 @@ SELECT col FROM tab ORDER BY col USING sortop LIMIT 1; The approximate average size (in bytes) of the aggregate's state value, when using moving-aggregate mode. This works the same as - state_data_size. + state_data_size, except that negative + values are not used to indicate unbounded state size. diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 640c0425faec5..3544b15efdafa 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -140,7 +140,7 @@ CREATE DATABASE name after the creation of the new database. In some situations, this may have a noticeable negative impact on overall system performance. The FILE_COPY strategy is affected by the setting. + linkend="guc-file-copy-method"/> setting. @@ -150,12 +150,12 @@ CREATE DATABASE name Sets the default collation order and character classification in the new database. Collation affects the sort order applied to strings, - e.g., in queries with ORDER BY, as well as the order used in indexes - on text columns. Character classification affects the categorization - of characters, e.g., lower, upper, and digit. 
Also sets the - associated aspects of the operating system environment, - LC_COLLATE and LC_CTYPE. The - default is the same setting as the template database. See ORDER BY, as well as the + order used in indexes on text columns. Character classification + affects the categorization of characters, e.g., lower, upper, and + digit. Also sets the LC_CTYPE aspect of the + operating system environment. The default is the same setting as the + template database. See and for details. @@ -189,17 +189,16 @@ CREATE DATABASE name lc_collate - Sets LC_COLLATE in the database server's operating - system environment. The default is the setting of if specified, otherwise the same - setting as the template database. See below for additional - restrictions. + If is + libc, sets the default collation order to use in + the new database, overriding the setting . Otherwise, this setting is + ignored. - If is - libc, also sets the default collation order to use - in the new database, overriding the setting . + The default is the setting of + if specified, otherwise the same setting as the template database. + See below for additional restrictions. @@ -208,16 +207,18 @@ CREATE DATABASE name Sets LC_CTYPE in the database server's operating - system environment. The default is the setting of if specified, otherwise the same - setting as the template database. See below for additional - restrictions. + system environment. If is - libc, also sets the default character - classification to use in the new database, overriding the setting - . + libc, sets the default character classification to + use in the new database, overriding the setting . + + + The default is the setting of + if specified, otherwise the same setting as the template database. + See below for additional restrictions. diff --git a/doc/src/sgml/ref/create_foreign_table.sgml b/doc/src/sgml/ref/create_foreign_table.sgml index d08834ac9d291..08a8ceeae7586 100644 --- a/doc/src/sgml/ref/create_foreign_table.sgml +++ b/doc/src/sgml/ref/create_foreign_table.sgml @@ -232,7 +232,7 @@ WITH ( MODULUS numeric_literal, REM INCLUDING COMMENTS - Comments for the copied columns, constraints, and indexes will be + Comments for the copied columns and constraints will be copied. The default behavior is to exclude comments, resulting in the copied columns and constraints in the new table having no comments. @@ -360,7 +360,7 @@ WITH ( MODULUS numeric_literal, REM Currently, CHECK expressions cannot contain subqueries nor refer to variables other than columns of the - current row. The system column tableoid + current row. The system column tableoid may be referenced, but not any other system column. diff --git a/doc/src/sgml/ref/create_function.sgml b/doc/src/sgml/ref/create_function.sgml index 0d240484cd3f0..30bd4602f8d9a 100644 --- a/doc/src/sgml/ref/create_function.sgml +++ b/doc/src/sgml/ref/create_function.sgml @@ -649,7 +649,7 @@ END parameters. Thus for example these declarations conflict: CREATE FUNCTION foo(int) ... -CREATE FUNCTION foo(int, out text) ... +CREATE FUNCTION foo(int, OUT text) ... @@ -709,7 +709,7 @@ CREATE FUNCTION foo(int, int default 42) ... 
Add two integers using an SQL function: CREATE FUNCTION add(integer, integer) RETURNS integer - AS 'select $1 + $2;' + AS 'SELECT $1 + $2;' LANGUAGE SQL IMMUTABLE RETURNS NULL ON NULL INPUT; @@ -740,7 +740,7 @@ $$ LANGUAGE plpgsql; Return a record containing multiple output parameters: -CREATE FUNCTION dup(in int, out f1 int, out f2 text) +CREATE FUNCTION dup(IN int, OUT f1 int, OUT f2 text) AS $$ SELECT $1, CAST($1 AS text) || ' is text' $$ LANGUAGE SQL; @@ -817,10 +817,10 @@ $$ LANGUAGE plpgsql SET search_path = admin, pg_temp; - This function's intention is to access a table admin.pwds. + This function's intention is to access a table admin.pwds. But without the SET clause, or with a SET clause mentioning only admin, the function could be subverted by - creating a temporary table named pwds. + creating a temporary table named pwds. diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index 147a8f7587c71..bb7505d171b6d 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -814,7 +814,7 @@ Indexes: leveraging multiple CPUs in order to process the table rows faster. This feature is known as parallel index build. For index methods that support building indexes - in parallel (currently, B-tree and BRIN), + in parallel (currently, B-tree, GIN, and BRIN), maintenance_work_mem specifies the maximum amount of memory that can be used by each index build operation as a whole, regardless of how many worker processes were started. @@ -898,17 +898,17 @@ Indexes: Examples - To create a unique B-tree index on the column title in - the table films: + To create a unique B-tree index on the column title in + the table films: CREATE UNIQUE INDEX title_idx ON films (title); - To create a unique B-tree index on the column title - with included columns director - and rating in the table films: + To create a unique B-tree index on the column title + with included columns director + and rating in the table films: CREATE UNIQUE INDEX title_idx ON films (title) INCLUDE (director, rating); @@ -960,8 +960,8 @@ CREATE INDEX gin_idx ON documents_table USING GIN (locations) WITH (fastupdate = - To create an index on the column code in the table - films and have the index reside in the tablespace + To create an index on the column code in the table + films and have the index reside in the tablespace indexspace: CREATE INDEX code_idx ON films (code) TABLESPACE indexspace; diff --git a/doc/src/sgml/ref/create_operator.sgml b/doc/src/sgml/ref/create_operator.sgml index 3553d36454185..d2ffb1b2a500f 100644 --- a/doc/src/sgml/ref/create_operator.sgml +++ b/doc/src/sgml/ref/create_operator.sgml @@ -23,7 +23,7 @@ PostgreSQL documentation CREATE OPERATOR name ( {FUNCTION|PROCEDURE} = function_name - [, LEFTARG = left_type ] [, RIGHTARG = right_type ] + [, LEFTARG = left_type ] , RIGHTARG = right_type [, COMMUTATOR = com_op ] [, NEGATOR = neg_op ] [, RESTRICT = res_proc ] [, JOIN = join_proc ] [, HASHES ] [, MERGES ] @@ -88,8 +88,8 @@ CREATE OPERATOR name ( For binary operators, both LEFTARG and - RIGHTARG must be defined. For prefix operators only - RIGHTARG should be defined. + RIGHTARG must be defined. For prefix operators, only + RIGHTARG must be defined. 
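To make the LEFTARG/RIGHTARG rules above concrete, a minimal sketch using the built-in functions int4eq and int4um; the operator names === and !!! are hypothetical and assumed to be free:

-- binary operator: both LEFTARG and RIGHTARG are required
CREATE OPERATOR === (
    FUNCTION = int4eq,
    LEFTARG = integer,
    RIGHTARG = integer
);

-- prefix operator: only RIGHTARG is defined
CREATE OPERATOR !!! (
    FUNCTION = int4um,
    RIGHTARG = integer
);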
The function_name function must have been previously defined using CREATE FUNCTION and must be defined to accept the correct number diff --git a/doc/src/sgml/ref/create_policy.sgml b/doc/src/sgml/ref/create_policy.sgml index e76c342d3da67..42d43ad7bf414 100644 --- a/doc/src/sgml/ref/create_policy.sgml +++ b/doc/src/sgml/ref/create_policy.sgml @@ -49,6 +49,8 @@ CREATE POLICY name ON WITH CHECK. When a USING expression returns true for a given row then that row is visible to the user, while if false or null is returned then the row is not visible. + Typically, no error occurs when a row is not visible, but see + for exceptions. When a WITH CHECK expression returns true for a row then that row is inserted or updated, while if false or null is returned then an error occurs. @@ -194,8 +196,9 @@ CREATE POLICY name ON SELECT), and will not be available for modification (in an UPDATE - or DELETE). Such rows are silently suppressed; no error - is reported. + or DELETE). Typically, such rows are silently + suppressed; no error is reported (but see + for exceptions). @@ -251,8 +254,10 @@ CREATE POLICY name ON INSERT or UPDATE command attempts to add rows to the table that do not pass the ALL - policy's WITH CHECK expression, the entire - command will be aborted. + policy's WITH CHECK expression (or its + USING expression, if it does not have a + WITH CHECK expression), the entire command will + be aborted. @@ -268,11 +273,50 @@ CREATE POLICY name ON SELECT policy will be returned during a SELECT query, and that queries that require SELECT permissions, such as - UPDATE, will also only see those records + UPDATE, DELETE, and + MERGE, will also only see those records that are allowed by the SELECT policy. A SELECT policy cannot have a WITH CHECK expression, as it only applies in cases where - records are being retrieved from the relation. + records are being retrieved from the relation, except as described + below. + + + If a data-modifying query has a RETURNING clause, + SELECT permissions are required on the relation, + and any newly inserted or updated rows from the relation must satisfy + the relation's SELECT policies in order to be + available to the RETURNING clause. If a newly + inserted or updated row does not satisfy the relation's + SELECT policies, an error will be thrown (inserted + or updated rows to be returned are never + silently ignored). + + + If an INSERT has an ON CONFLICT DO + NOTHING/UPDATE clause, SELECT + permissions are required on the relation, and the rows proposed for + insertion are checked using the relation's SELECT + policies. If a row proposed for insertion does not satisfy the + relation's SELECT policies, an error is thrown + (the INSERT is never silently + avoided). In addition, if the UPDATE path is + taken, the row to be updated and the new updated row are checked + against the relation's SELECT policies, and an + error is thrown if they are not satisfied (an auxiliary + UPDATE is never silently + avoided). + + + A MERGE command requires SELECT + permissions on both the source and target relations, and so each + relation's SELECT policies are applied before they + are joined, and the MERGE actions will only see + those records that are allowed by those policies. In addition, if + an UPDATE action is executed, the target relation's + SELECT policies are applied to the updated row, as + for a standalone UPDATE, except that an error is + thrown if they are not satisfied. 
@@ -292,10 +336,11 @@ CREATE POLICY name ON - Note that INSERT with ON CONFLICT DO - UPDATE checks INSERT policies' - WITH CHECK expressions only for rows appended - to the relation by the INSERT path. + Note that an INSERT with an ON CONFLICT + DO NOTHING/UPDATE clause will check the + INSERT policies' WITH CHECK + expressions for all rows proposed for insertion, regardless of + whether or not they end up being inserted. @@ -305,12 +350,12 @@ CREATE POLICY name ON Using UPDATE for a policy means that it will apply - to UPDATE, SELECT FOR UPDATE + to UPDATE, SELECT FOR UPDATE, and SELECT FOR SHARE commands, as well as auxiliary ON CONFLICT DO UPDATE clauses of - INSERT commands. - MERGE commands containing UPDATE - actions are affected as well. Since UPDATE + INSERT commands, and MERGE + commands containing UPDATE actions. + Since an UPDATE command involves pulling an existing record and replacing it with a new modified record, UPDATE policies accept both a USING expression and @@ -356,7 +401,8 @@ CREATE POLICY name ON USING expressions, an error will be thrown (the UPDATE path will never be silently - avoided). + avoided). The same applies to an UPDATE action + of a MERGE command. @@ -366,12 +412,18 @@ CREATE POLICY name ON Using DELETE for a policy means that it will apply - to DELETE commands. Only rows that pass this - policy will be seen by a DELETE command. There can - be rows that are visible through a SELECT that are - not available for deletion, if they do not pass the - USING expression for - the DELETE policy. + to DELETE commands and MERGE + commands containing DELETE actions. For a + DELETE command, only rows that pass this policy + will be seen by the DELETE command. There can + be rows that are visible through a SELECT policy + that are not available for deletion, if they do not pass the + USING expression for the DELETE + policy. Note, however, that a DELETE action in a + MERGE command will see rows that are visible + through SELECT policies, and if the + DELETE policy does not pass for such a row, an + error will be thrown. @@ -400,6 +452,15 @@ CREATE POLICY name ON + + summarizes how the different + types of policy apply to specific commands. In the table, + check means that the policy expression is checked and an + error is thrown if it returns false or null, whereas filter + means that the row is silently ignored if the policy expression returns + false or null. + +
Policies Applied by Command Type @@ -424,8 +485,8 @@ CREATE POLICY name ON - SELECT - Existing row + SELECT / COPY ... TO + Filter existing row @@ -433,63 +494,117 @@ CREATE POLICY name ON SELECT FOR UPDATE/SHARE - Existing row + Filter existing row - Existing row + Filter existing row - INSERT / MERGE ... THEN INSERT + INSERT + + Check new row  + + If read access is required to either the existing or new row (for + example, a WHERE or RETURNING + clause that refers to columns from the relation). + + + + Check new row - New row + + + UPDATE + + Filter existing row  & + check new row  + + + Filter existing row + Check new row - INSERT ... RETURNING + DELETE - New row + Filter existing row  + + + + + Filter existing row + + + INSERT ... ON CONFLICT + + Check new row  - If read access is required to the existing or new row (for example, - a WHERE or RETURNING clause - that refers to columns from the relation). + Row proposed for insertion is checked regardless of whether or not a + conflict occurs. - New row + + Check new row  + - UPDATE / MERGE ... THEN UPDATE + ON CONFLICT DO UPDATE - Existing & new rows + Check existing & new rows  + + New row of the auxiliary UPDATE command, which + might be different from the new row of the original + INSERT command. + + - Existing row - New row + Check existing row + + Check new row  + - DELETE + MERGE + Filter source & target rows + + + + + + + MERGE ... THEN INSERT - Existing row + Check new row  + Check new row - Existing row - ON CONFLICT DO UPDATE - Existing & new rows + MERGE ... THEN UPDATE + Check new row + + Check existing row + Check new row + + + + MERGE ... THEN DELETE + + - Existing row - New row + Check existing row diff --git a/doc/src/sgml/ref/create_publication.sgml b/doc/src/sgml/ref/create_publication.sgml index 802630f2df116..75a508bebfa44 100644 --- a/doc/src/sgml/ref/create_publication.sgml +++ b/doc/src/sgml/ref/create_publication.sgml @@ -22,14 +22,22 @@ PostgreSQL documentation CREATE PUBLICATION name - [ FOR ALL TABLES - | FOR publication_object [, ... ] ] + [ FOR { publication_object [, ... ] | publication_all_object [, ... ] } ] [ WITH ( publication_parameter [= value] [, ... ] ) ] where publication_object is one of: - TABLE [ ONLY ] table_name [ * ] [ ( column_name [, ... ] ) ] [ WHERE ( expression ) ] [, ... ] + TABLE table_and_columns [, ... ] TABLES IN SCHEMA { schema_name | CURRENT_SCHEMA } [, ... ] + +and publication_all_object is one of: + + ALL TABLES + ALL SEQUENCES + +and table_and_columns is: + + [ ONLY ] table_name [ * ] [ ( column_name [, ... ] ) ] [ WHERE ( expression ) ] @@ -120,16 +128,6 @@ CREATE PUBLICATION name - - FOR ALL TABLES - - - Marks the publication as one that replicates changes for all tables in - the database, including tables created in the future. - - - - FOR TABLES IN SCHEMA @@ -161,11 +159,37 @@ CREATE PUBLICATION name + + FOR ALL TABLES + + + Marks the publication as one that replicates changes for all tables in + the database, including tables created in the future. + + + + + + FOR ALL SEQUENCES + + + Marks the publication as one that synchronizes changes for all sequences + in the database, including sequences created in the future. + + + + Only persistent sequences are included in the publication. Temporary + sequences and unlogged sequences are excluded from the publication. + + + + WITH ( publication_parameter [= value] [, ... ] ) - This clause specifies optional parameters for a publication. The + This clause specifies optional parameters for a publication when + publishing tables. 
This clause is not applicable to sequences. The following parameters are supported: @@ -279,10 +303,10 @@ CREATE PUBLICATION name Notes - If FOR TABLE, FOR ALL TABLES or - FOR TABLES IN SCHEMA are not specified, then the - publication starts out with an empty set of tables. That is useful if - tables or schemas are to be added later. + If FOR TABLE, FOR TABLES IN SCHEMA, + FOR ALL TABLES or FOR ALL SEQUENCES + are not specified, then the publication starts out with an empty set of + tables. That is useful if tables or schemas are to be added later. @@ -298,8 +322,9 @@ CREATE PUBLICATION name To add a table to a publication, the invoking user must have ownership - rights on the table. The FOR ALL TABLES and - FOR TABLES IN SCHEMA clauses require the invoking + rights on the table. The FOR TABLES IN SCHEMA, + FOR ALL TABLES and + FOR ALL SEQUENCES clauses require the invoking user to be a superuser. @@ -449,6 +474,21 @@ CREATE PUBLICATION sales_publication FOR TABLES IN SCHEMA marketing, sales; CREATE PUBLICATION users_filtered FOR TABLE users (user_id, firstname); + + + Create a publication that publishes all sequences for synchronization: + +CREATE PUBLICATION all_sequences FOR ALL SEQUENCES; + + + + + Create a publication that publishes all changes in all tables, and + all sequences for synchronization: + +CREATE PUBLICATION all_tables_sequences FOR ALL TABLES, ALL SEQUENCES; + + diff --git a/doc/src/sgml/ref/create_sequence.sgml b/doc/src/sgml/ref/create_sequence.sgml index 1e283f13d15c6..0ffcd0febd1b5 100644 --- a/doc/src/sgml/ref/create_sequence.sgml +++ b/doc/src/sgml/ref/create_sequence.sgml @@ -70,7 +70,7 @@ SELECT * FROM name; to examine the parameters and current state of a sequence. In particular, - the last_value field of the sequence shows the last value + the last_value field of the sequence shows the last value allocated by any session. (Of course, this value might be obsolete by the time it's printed, if other sessions are actively doing nextval calls.) @@ -295,7 +295,7 @@ SELECT * FROM name; used for a sequence object that will be used concurrently by multiple sessions. Each session will allocate and cache successive sequence values during one access to the sequence object and - increase the sequence object's last_value accordingly. + increase the sequence object's last_value accordingly. Then, the next cache-1 uses of nextval within that session simply return the preallocated values without touching the sequence object. So, any @@ -319,7 +319,7 @@ SELECT * FROM name; class="parameter">cache setting greater than one you should only assume that the nextval values are all distinct, not that they are generated purely sequentially. Also, - last_value will reflect the latest value reserved by + last_value will reflect the latest value reserved by any session, whether or not it has yet been returned by nextval. diff --git a/doc/src/sgml/ref/create_subscription.sgml b/doc/src/sgml/ref/create_subscription.sgml index 57dec28a5df64..197be0c6f6b30 100644 --- a/doc/src/sgml/ref/create_subscription.sgml +++ b/doc/src/sgml/ref/create_subscription.sgml @@ -127,10 +127,10 @@ CREATE SUBSCRIPTION subscription_name Since no connection is made when this option is - false, no tables are subscribed. To initiate - replication, you must manually create the replication slot, enable - the failover if required, enable the subscription, and refresh the - subscription. See + false, no tables and sequences are subscribed. 
To + initiate replication, you must manually create the replication slot, + enable the failover if required, enable the subscription, and refresh + the subscription. See for examples. @@ -169,7 +169,9 @@ CREATE SUBSCRIPTION subscription_name Name of the publisher's replication slot to use. The default is - to use the name of the subscription for the slot name. + to use the name of the subscription for the slot name. The name cannot + be pg_conflict_detection as it is reserved for the + conflict detection. @@ -226,7 +228,7 @@ CREATE SUBSCRIPTION subscription_name for more about send/receive - functions). + functions). This parameter has no effect for sequences. @@ -263,6 +265,12 @@ CREATE SUBSCRIPTION subscription_namecopy_data = true can interact with the origin parameter. + + See + for recommendations on how to handle any warnings about sequence + definition differences between the publisher and the subscriber, + which might occur when copy_data = true. + @@ -278,6 +286,7 @@ CREATE SUBSCRIPTION subscription_name @@ -308,7 +317,8 @@ CREATE SUBSCRIPTION subscription_name setting within this subscription's apply worker processes. The default value - is off. + is off. This parameter has no effect for + sequences. @@ -338,7 +348,8 @@ CREATE SUBSCRIPTION subscription_name Specifies whether two-phase commit is enabled for this subscription. - The default is false. + The default is false. This parameter has no effect + for sequences. @@ -396,8 +407,8 @@ CREATE SUBSCRIPTION subscription_name If true, all replication actions are performed as the subscription owner. If false, replication workers will perform actions on each - table as the owner of that table. The latter configuration is - generally much more secure; for details, see + table or sequence as the owner of that relation. The latter + configuration is generally much more secure; for details, see . The default is false. @@ -415,6 +426,7 @@ CREATE SUBSCRIPTION subscription_nameorigin to any means that the publisher sends changes regardless of their origin. The default is any. + This parameter has no effect for sequences. See for details of how @@ -435,6 +447,133 @@ CREATE SUBSCRIPTION subscription_name + + + retain_dead_tuples (boolean) + + + Specifies whether the information (e.g., dead tuples, commit + timestamps, and origins) required for conflict detection on the + subscriber is retained. The default is false. + If set to true, the detection of + is enabled, and a physical + replication slot named pg_conflict_detection + is created on the subscriber to prevent the information for detecting + conflicts from being removed. This parameter has no effect for + sequences. + + + + Note that the information useful for conflict detection is retained + only after the creation of the slot. You can verify the existence of + this slot by querying pg_replication_slots. + And even if multiple subscriptions on one node enable this option, + only one replication slot will be created. Also, + wal_level must be set to replica + or higher to allow the replication slot to be used. + + + + + Note that the information for conflict detection cannot be purged if + the subscription is disabled; thus, the information will accumulate + until the subscription is enabled. To prevent excessive accumulation, + it is recommended to disable retain_dead_tuples + if the subscription will be inactive for an extended period. 
+ + + Additionally, when enabling retain_dead_tuples for + conflict detection in logical replication, it is important to design the + replication topology to balance data retention requirements with + overall system performance. This option provides minimal performance + overhead when applied appropriately. The following scenarios illustrate + effective usage patterns when enabling this option. + + + a. Large Tables with Bidirectional Writes: + For large tables subject to concurrent writes on both publisher and + subscriber nodes, publishers can define row filters when creating + publications to segment data. This allows multiple subscriptions + to replicate exclusive subsets of the table in parallel, optimizing + throughput. + + + b. Write-Enabled Subscribers: + If a subscriber node is expected to perform write operations, replication + can be structured using multiple publications and subscriptions. By + distributing tables across these publications, the workload is spread among + several apply workers, improving concurrency and reducing contention. + + + c. Read-Only Subscribers: + In configurations involving single or multiple publisher nodes + performing concurrent write operations, read-only subscriber nodes may + replicate changes without seeing a performance impact if they use index + scans. However, if the subscriber is impacted due to replication lag or + scan performance (for example, due to sequential scans), it needs to follow one + of the two previous strategies to distribute the workload on the + subscriber. + + + + This option cannot be enabled if the publisher is a physical standby. + + + Enabling this option ensures retention of information useful for + conflict detection solely for changes occurring locally on the + publisher. For changes originating from different origins, + reliable conflict detection cannot be guaranteed. + + + + + max_retention_duration (integer) + + + Maximum duration in milliseconds for which this subscription's apply worker + is allowed to retain the information useful for conflict detection when + retain_dead_tuples is enabled. The default value + is 0, indicating that the information is retained + until it is no longer needed for detection purposes. + + + The information useful for conflict detection is no longer retained if + all apply workers associated with the subscriptions, where + retain_dead_tuples is enabled, confirm that the + retention duration has exceeded the + max_retention_duration set within the corresponding + subscription. The retention will automatically resume when at least one + apply worker confirms that the retention duration is within the + specified limit, or when a new subscription is created with + retain_dead_tuples = true. Alternatively, retention + can be manually resumed by re-enabling retain_dead_tuples. + + + Note that overall retention will not stop if other subscriptions that + have a value greater than 0 for this parameter have not exceeded it, + or if they set this option to 0. + + + This option is effective only when + retain_dead_tuples is enabled and the apply + worker associated with the subscription is active. + + + + Note that setting a non-zero value for this option could lead to + information for conflict detection being removed prematurely, + potentially resulting in incorrect conflict detection.
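Putting the two parameters described above together, a minimal sketch of a subscription that enables conflict-detection retention with a one-minute cap; the connection string, subscription name mysub, and publication name mypub are hypothetical:

CREATE SUBSCRIPTION mysub
    CONNECTION 'host=publisher.example.com dbname=postgres'
    PUBLICATION mypub
    WITH (retain_dead_tuples = true, max_retention_duration = 60000);
-- requires wal_level = replica or higher on the subscriber, per the text above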
+ + + + diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 4a41b2f553007..77c5a763d4508 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -447,11 +447,6 @@ WITH ( MODULUS numeric_literal, REM the values in the new row, an error will be reported. - - Partitioned tables do not support EXCLUDE constraints; - however, you can define these constraints on individual partitions. - - See for more discussion on table partitioning. @@ -867,7 +862,7 @@ WITH ( MODULUS numeric_literal, REM Currently, CHECK expressions cannot contain subqueries nor refer to variables other than columns of the current row (see ). - The system column tableoid + The system column tableoid may be referenced, but not any other system column. @@ -929,6 +924,15 @@ WITH ( MODULUS numeric_literal, REM not other generated columns. Any functions and operators used must be immutable. References to other tables are not allowed. + + + A virtual generated column cannot have a user-defined type, and the + generation expression of a virtual generated column must not reference + user-defined functions or types, that is, it can only use built-in + functions or types. This applies also indirectly, such as for functions + or types that underlie operators or casts. (This restriction does not + exist for stored generated columns.) + @@ -1162,6 +1166,18 @@ WITH ( MODULUS numeric_literal, REM exclusion constraint on a subset of the table; internally this creates a partial index. Note that parentheses are required around the predicate. + + + When establishing an exclusion constraint for a multi-level partition + hierarchy, all the columns in the partition key of the target + partitioned table, as well as those of all its descendant partitioned + tables, must be included in the constraint definition. Additionally, + those columns must be compared using the equality operator. These + restrictions ensure that potentially-conflicting rows will exist in the + same partition. The constraint may also refer to other columns which + are not a part of any partition key, which can be compared using any + appropriate operator. + @@ -1363,8 +1379,8 @@ WITH ( MODULUS numeric_literal, REM REFERENCES (foreign key) constraints accept this clause. NOT NULL and CHECK constraints are not deferrable. Note that deferrable constraints cannot be used as - conflict arbitrators in an INSERT statement that - includes an ON CONFLICT DO UPDATE clause. + conflict arbiters in an INSERT statement that + includes an ON CONFLICT clause. @@ -1687,7 +1703,8 @@ WITH ( MODULUS numeric_literal, REM vacuum_truncate, toast.vacuum_truncate (boolean) - vacuum_truncate storage parameter + vacuum_truncate + storage parameter @@ -1949,6 +1966,21 @@ WITH ( MODULUS numeric_literal, REM + + log_autoanalyze_min_duration (integer) + + log_autoanalyze_min_duration + storage parameter + + + + + Per-table value for + parameter. 
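As a sketch of the exclusion-constraint rule for partitioned tables described above (hypothetical table; assumes the btree_gist extension so that scalar equality can appear in a GiST exclusion constraint): the partition key column logdate must appear in the constraint with the equality operator, while other columns such as during may use any appropriate operator:

CREATE EXTENSION IF NOT EXISTS btree_gist;

CREATE TABLE room_reservation (
    logdate date NOT NULL,
    room    text,
    during  tsrange,
    EXCLUDE USING gist (logdate WITH =, room WITH =, during WITH &&)
) PARTITION BY RANGE (logdate);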
+ + + + vacuum_max_eager_freeze_failure_rate, toast.vacuum_max_eager_freeze_failure_rate (floating point) @@ -2225,7 +2257,7 @@ CREATE TABLE employees OF employee_type ( Create a range partitioned table: CREATE TABLE measurement ( - logdate date not null, + logdate date NOT NULL, peaktemp int, unitsales int ) PARTITION BY RANGE (logdate); @@ -2235,7 +2267,7 @@ CREATE TABLE measurement ( Create a range partitioned table with multiple columns in the partition key: CREATE TABLE measurement_year_month ( - logdate date not null, + logdate date NOT NULL, peaktemp int, unitsales int ) PARTITION BY RANGE (EXTRACT(YEAR FROM logdate), EXTRACT(MONTH FROM logdate)); @@ -2245,8 +2277,8 @@ CREATE TABLE measurement_year_month ( Create a list partitioned table: CREATE TABLE cities ( - city_id bigserial not null, - name text not null, + city_id bigserial NOT NULL, + name text NOT NULL, population bigint ) PARTITION BY LIST (left(lower(name), 1)); @@ -2255,8 +2287,8 @@ CREATE TABLE cities ( Create a hash partitioned table: CREATE TABLE orders ( - order_id bigint not null, - cust_id bigint not null, + order_id bigint NOT NULL, + cust_id bigint NOT NULL, status text ) PARTITION BY HASH (order_id); diff --git a/doc/src/sgml/ref/create_table_as.sgml b/doc/src/sgml/ref/create_table_as.sgml index 8429333e3af94..6b41226cbd629 100644 --- a/doc/src/sgml/ref/create_table_as.sgml +++ b/doc/src/sgml/ref/create_table_as.sgml @@ -266,8 +266,8 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI Examples - Create a new table films_recent consisting of only - recent entries from the table films: + Create a new table films_recent consisting of only + recent entries from the table films: CREATE TABLE films_recent AS @@ -286,8 +286,8 @@ CREATE TABLE films2 AS - Create a new temporary table films_recent, consisting of - only recent entries from the table films, using a + Create a new temporary table films_recent, consisting of + only recent entries from the table films, using a prepared statement. The new table will be dropped at commit: diff --git a/doc/src/sgml/ref/create_trigger.sgml b/doc/src/sgml/ref/create_trigger.sgml index 982ab6f3ee450..bb1426f4970d5 100644 --- a/doc/src/sgml/ref/create_trigger.sgml +++ b/doc/src/sgml/ref/create_trigger.sgml @@ -29,7 +29,7 @@ PostgreSQL documentation CREATE [ OR REPLACE ] [ CONSTRAINT ] TRIGGER name { BEFORE | AFTER | INSTEAD OF } { event [ OR ... ] } ON table_name [ FROM referenced_table_name ] - [ NOT DEFERRABLE | [ DEFERRABLE ] [ INITIALLY IMMEDIATE | INITIALLY DEFERRED ] ] + [ NOT DEFERRABLE | [ DEFERRABLE ] [ INITIALLY IMMEDIATE | INITIALLY DEFERRED ] ] [ ENFORCED ] [ REFERENCING { { OLD | NEW } TABLE [ AS ] transition_relation_name } [ ... ] ] [ FOR [ EACH ] { ROW | STATEMENT } ] [ WHEN ( condition ) ] @@ -197,9 +197,11 @@ CREATE [ OR REPLACE ] [ CONSTRAINT ] TRIGGER name of the rows inserted, deleted, or modified by the current SQL statement. This feature lets the trigger see a global view of what the statement did, not just one row at a time. This option is only allowed for - an AFTER trigger that is not a constraint trigger; also, if - the trigger is an UPDATE trigger, it must not specify - a column_name list. + an AFTER trigger on a plain table (not a foreign table). + The trigger should not be a constraint trigger. Also, if the trigger is + an UPDATE trigger, it must not specify + a column_name list when using + this option. 
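To sketch the transition-table behavior described above (table and function names are hypothetical), a statement-level AFTER trigger on a plain table can see every row changed by the statement at once:

CREATE FUNCTION log_account_updates() RETURNS trigger
LANGUAGE plpgsql AS $$
BEGIN
    -- new_rows holds the after-images of all rows updated by the statement
    INSERT INTO accounts_log SELECT now(), * FROM new_rows;
    RETURN NULL;
END;
$$;

CREATE TRIGGER accounts_audit
    AFTER UPDATE ON accounts
    REFERENCING NEW TABLE AS new_rows
    FOR EACH STATEMENT
    EXECUTE FUNCTION log_account_updates();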
OLD TABLE may only be specified once, and only for a trigger that can fire on UPDATE or DELETE; it creates a transition relation containing the before-images of all rows @@ -321,6 +323,15 @@ UPDATE OF column_name1 [, column_name2 + + ENFORCED + + + This is a noise word. Constraint triggers are always enforced. + + + + REFERENCING @@ -474,7 +485,7 @@ UPDATE OF column_name1 [, column_name2BEFORE UPDATE triggers are not considered. Conversely, a command such as UPDATE ... SET x = x ... - will fire a trigger on column x, even though the column's + will fire a trigger on column x, even though the column's value did not change. @@ -587,7 +598,7 @@ UPDATE OF column_name1 [, column_name2 Execute the function check_account_update whenever - a row of the table accounts is about to be updated: + a row of the table accounts is about to be updated: CREATE TRIGGER check_update @@ -597,7 +608,7 @@ CREATE TRIGGER check_update Modify that trigger definition to only execute the function if - column balance is specified as a target in + column balance is specified as a target in the UPDATE command: @@ -607,7 +618,7 @@ CREATE OR REPLACE TRIGGER check_update EXECUTE FUNCTION check_account_update(); - This form only executes the function if column balance + This form only executes the function if column balance has in fact changed value: @@ -618,7 +629,7 @@ CREATE TRIGGER check_update EXECUTE FUNCTION check_account_update(); - Call a function to log updates of accounts, but only if + Call a function to log updates of accounts, but only if something changed: diff --git a/doc/src/sgml/ref/create_view.sgml b/doc/src/sgml/ref/create_view.sgml index e8d9d3c8d0f64..f8a4740608a18 100644 --- a/doc/src/sgml/ref/create_view.sgml +++ b/doc/src/sgml/ref/create_view.sgml @@ -415,7 +415,7 @@ CREATE VIEW vista AS SELECT text 'Hello World' AS hello; DELETE, or MERGE statement on the view into the corresponding statement on the underlying base relation. INSERT statements that have an ON - CONFLICT UPDATE clause are fully supported. + CONFLICT DO UPDATE clause are fully supported. @@ -430,7 +430,7 @@ CREATE VIEW vista AS SELECT text 'Hello World' AS hello; an INSERT or MERGE command can potentially insert base-relation rows that do not satisfy the WHERE condition and thus are not - visible through the view (ON CONFLICT UPDATE may + visible through the view (ON CONFLICT DO UPDATE may similarly affect an existing row not visible through the view). The CHECK OPTION may be used to prevent INSERT, UPDATE, and @@ -492,7 +492,7 @@ CREATE VIEW comedies AS WHERE kind = 'Comedy'; This will create a view containing the columns that are in the - film table at the time of view creation. Though + film table at the time of view creation. Though * was used to create the view, columns added later to the table will not be part of the view. @@ -507,12 +507,12 @@ CREATE VIEW universal_comedies AS WHERE classification = 'U' WITH LOCAL CHECK OPTION; - This will create a view based on the comedies view, showing + This will create a view based on the comedies view, showing only films with kind = 'Comedy' and classification = 'U'. Any attempt to INSERT or UPDATE a row in the view will be rejected if the new row doesn't have classification = 'U', but the film - kind will not be checked. + kind will not be checked. @@ -525,8 +525,8 @@ CREATE VIEW pg_comedies AS WHERE classification = 'PG' WITH CASCADED CHECK OPTION; - This will create a view that checks both the kind and - classification of new rows. 
+ This will create a view that checks both the kind and + classification of new rows. @@ -543,9 +543,9 @@ CREATE VIEW comedies AS WHERE f.kind = 'Comedy'; This view will support INSERT, UPDATE and - DELETE. All the columns from the films table will - be updatable, whereas the computed columns country and - avg_rating will be read-only. + DELETE. All the columns from the films table will + be updatable, whereas the computed columns country and + avg_rating will be read-only. diff --git a/doc/src/sgml/ref/createdb.sgml b/doc/src/sgml/ref/createdb.sgml index 5c4e0465ed9da..2ccbe13f39008 100644 --- a/doc/src/sgml/ref/createdb.sgml +++ b/doc/src/sgml/ref/createdb.sgml @@ -136,7 +136,8 @@ PostgreSQL documentation - Specifies the LC_COLLATE setting to be used in this database. + Specifies the LC_COLLATE setting to be used in this database (ignored + unless the locale provider is libc). diff --git a/doc/src/sgml/ref/delete.sgml b/doc/src/sgml/ref/delete.sgml index 29649f6afd65c..b9367f2b23cfd 100644 --- a/doc/src/sgml/ref/delete.sgml +++ b/doc/src/sgml/ref/delete.sgml @@ -285,7 +285,7 @@ DELETE FROM films WHERE kind <> 'Musical'; - Clear the table films: + Clear the table films: DELETE FROM films; @@ -323,6 +323,9 @@ DELETE FROM user_logs AS dl USING delete_batch AS del WHERE dl.ctid = del.ctid; + This use of ctid is only safe because + the query is repeatedly run, avoiding the problem of changed + ctids. diff --git a/doc/src/sgml/ref/drop_owned.sgml b/doc/src/sgml/ref/drop_owned.sgml index 46e1c229ec0fb..efda01a39e88b 100644 --- a/doc/src/sgml/ref/drop_owned.sgml +++ b/doc/src/sgml/ref/drop_owned.sgml @@ -33,7 +33,7 @@ DROP OWNED BY { name | CURRENT_ROLE database that are owned by one of the specified roles. Any privileges granted to the given roles on objects in the current database or on shared objects (databases, tablespaces, configuration - parameters, or other roles) will also be revoked. + parameters) will also be revoked. diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 6dda680aa0de8..7dee77fd366b0 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -241,7 +241,8 @@ ROLLBACK; Include information on WAL record generation. Specifically, include the number of records, number of full page images (fpi), the amount of WAL - generated in bytes and the number of times the WAL buffers became full. + generated in bytes, the amount of full page images generated in bytes, + and the number of times the WAL buffers became full. In text format, only non-zero values are printed. This parameter may only be used when ANALYZE is also enabled. It defaults to FALSE. diff --git a/doc/src/sgml/ref/grant.sgml b/doc/src/sgml/ref/grant.sgml index 999f657d5c008..043f5d5a40af4 100644 --- a/doc/src/sgml/ref/grant.sgml +++ b/doc/src/sgml/ref/grant.sgml @@ -434,7 +434,7 @@ GRANT role_name [, ...] TO Examples - Grant insert privilege to all users on table films: + Grant insert privilege to all users on table films: GRANT INSERT ON films TO PUBLIC; @@ -443,14 +443,14 @@ GRANT INSERT ON films TO PUBLIC; Grant all available privileges to user manuel on view - kinds: + kinds: GRANT ALL PRIVILEGES ON kinds TO manuel; Note that while the above will indeed grant all privileges if executed by a - superuser or the owner of kinds, when executed by someone + superuser or the owner of kinds, when executed by someone else it will only grant those permissions for which the someone else has grant options. 
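For context on the ctid note in the delete.sgml hunk above, the batch shape reads roughly as follows (a sketch; only the final DELETE appears in this hunk, so the CTE body and column names here are assumed). Each run deletes one bounded batch, and the statement is simply executed repeatedly until no rows remain:

WITH delete_batch AS (
    SELECT l.ctid FROM user_logs AS l
    WHERE l.status = 'archived'
    ORDER BY l.creation_time
    LIMIT 10000
    FOR UPDATE
)
DELETE FROM user_logs AS dl
USING delete_batch AS del
WHERE dl.ctid = del.ctid;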
diff --git a/doc/src/sgml/ref/insert.sgml b/doc/src/sgml/ref/insert.sgml index 3f13991779050..04962e39e128e 100644 --- a/doc/src/sgml/ref/insert.sgml +++ b/doc/src/sgml/ref/insert.sgml @@ -645,7 +645,7 @@ INSERT oid countExamples - Insert a single row into table films: + Insert a single row into table films: INSERT INTO films VALUES @@ -654,7 +654,7 @@ INSERT INTO films VALUES - In this example, the len column is + In this example, the len column is omitted and therefore it will have the default value: @@ -695,8 +695,8 @@ INSERT INTO films (code, title, did, date_prod, kind) VALUES This example inserts some rows into table - films from a table tmp_films - with the same column layout as films: + films from a table tmp_films + with the same column layout as films: INSERT INTO films SELECT * FROM tmp_films WHERE date_prod < '2004-05-07'; @@ -717,7 +717,7 @@ INSERT INTO tictactoe (game, board) - Insert a single row into table distributors, returning + Insert a single row into table distributors, returning the sequence number generated by the DEFAULT clause: @@ -742,8 +742,8 @@ INSERT INTO employees_log SELECT *, current_timestamp FROM upd; Insert or update new distributors as appropriate. Assumes a unique index has been defined that constrains values appearing in the - did column. Note that the special - excluded table is used to reference values originally + did column. Note that the special + excluded table is used to reference values originally proposed for insertion: INSERT INTO distributors (did, dname) @@ -754,8 +754,8 @@ INSERT INTO distributors (did, dname) Insert or update new distributors as above, returning information about any existing values that were updated, together with the new data - inserted. Note that the returned values for old_did - and old_dname will be NULL for + inserted. Note that the returned values for old_did + and old_dname will be NULL for non-conflicting rows: INSERT INTO distributors (did, dname) @@ -770,7 +770,7 @@ INSERT INTO distributors (did, dname) when an existing, excluded row (a row with a matching constrained column or columns after before row insert triggers fire) exists. Example assumes a unique index has been defined that constrains - values appearing in the did column: + values appearing in the did column: INSERT INTO distributors (did, dname) VALUES (7, 'Redline GmbH') ON CONFLICT (did) DO NOTHING; @@ -779,7 +779,7 @@ INSERT INTO distributors (did, dname) VALUES (7, 'Redline GmbH') Insert or update new distributors as appropriate. Example assumes a unique index has been defined that constrains values appearing in - the did column. WHERE clause is + the did column. WHERE clause is used to limit the rows actually updated (any existing row not updated will still be locked, though): @@ -799,8 +799,8 @@ INSERT INTO distributors (did, dname) VALUES (9, 'Antwerp Design') Insert new distributor if possible; otherwise DO NOTHING. Example assumes a unique index has been defined that constrains values appearing in the - did column on a subset of rows where the - is_active Boolean column evaluates to + did column on a subset of rows where the + is_active Boolean column evaluates to true: -- This statement could infer a partial unique index on "did" diff --git a/doc/src/sgml/ref/merge.sgml b/doc/src/sgml/ref/merge.sgml index ecbcd8345d874..c2e181066a4e1 100644 --- a/doc/src/sgml/ref/merge.sgml +++ b/doc/src/sgml/ref/merge.sgml @@ -23,37 +23,37 @@ PostgreSQL documentation [ WITH with_query [, ...] 
] MERGE INTO [ ONLY ] target_table_name [ * ] [ [ AS ] target_alias ] -USING data_source ON join_condition -when_clause [...] -[ RETURNING [ WITH ( { OLD | NEW } AS output_alias [, ...] ) ] - { * | output_expression [ [ AS ] output_name ] } [, ...] ] + USING data_source ON join_condition + when_clause [...] + [ RETURNING [ WITH ( { OLD | NEW } AS output_alias [, ...] ) ] + { * | output_expression [ [ AS ] output_name ] } [, ...] ] where data_source is: -{ [ ONLY ] source_table_name [ * ] | ( source_query ) } [ [ AS ] source_alias ] + { [ ONLY ] source_table_name [ * ] | ( source_query ) } [ [ AS ] source_alias ] and when_clause is: -{ WHEN MATCHED [ AND condition ] THEN { merge_update | merge_delete | DO NOTHING } | - WHEN NOT MATCHED BY SOURCE [ AND condition ] THEN { merge_update | merge_delete | DO NOTHING } | - WHEN NOT MATCHED [ BY TARGET ] [ AND condition ] THEN { merge_insert | DO NOTHING } } + { WHEN MATCHED [ AND condition ] THEN { merge_update | merge_delete | DO NOTHING } | + WHEN NOT MATCHED BY SOURCE [ AND condition ] THEN { merge_update | merge_delete | DO NOTHING } | + WHEN NOT MATCHED [ BY TARGET ] [ AND condition ] THEN { merge_insert | DO NOTHING } } and merge_insert is: -INSERT [( column_name [, ...] )] -[ OVERRIDING { SYSTEM | USER } VALUE ] -{ VALUES ( { expression | DEFAULT } [, ...] ) | DEFAULT VALUES } + INSERT [( column_name [, ...] )] + [ OVERRIDING { SYSTEM | USER } VALUE ] + { VALUES ( { expression | DEFAULT } [, ...] ) | DEFAULT VALUES } and merge_update is: -UPDATE SET { column_name = { expression | DEFAULT } | - ( column_name [, ...] ) = [ ROW ] ( { expression | DEFAULT } [, ...] ) | - ( column_name [, ...] ) = ( sub-SELECT ) - } [, ...] + UPDATE SET { column_name = { expression | DEFAULT } | + ( column_name [, ...] ) = [ ROW ] ( { expression | DEFAULT } [, ...] ) | + ( column_name [, ...] ) = ( sub-SELECT ) + } [, ...] and merge_delete is: -DELETE + DELETE @@ -106,10 +106,11 @@ DELETE to compute and return value(s) based on each row inserted, updated, or deleted. Any expression using the source or target table's columns, or the merge_action() - function can be computed. When an INSERT or + function can be computed. By default, when an INSERT or UPDATE action is performed, the new values of the target - table's columns are used. When a DELETE is performed, - the old values of the target table's columns are used. The syntax of the + table's columns are used, and when a DELETE is performed, + the old values of the target table's columns are used, but it is also + possible to explicitly request old and new values. The syntax of the RETURNING list is identical to that of the output list of SELECT. diff --git a/doc/src/sgml/ref/pg_amcheck.sgml b/doc/src/sgml/ref/pg_amcheck.sgml index 6bfe28799c4e6..ef2bdfd19ae5d 100644 --- a/doc/src/sgml/ref/pg_amcheck.sgml +++ b/doc/src/sgml/ref/pg_amcheck.sgml @@ -41,7 +41,7 @@ PostgreSQL documentation - Only ordinary and toast table relations, materialized views, sequences, and + Only ordinary and TOAST table relations, materialized views, sequences, and btree indexes are currently supported. Other relation types are silently skipped. @@ -276,7 +276,7 @@ PostgreSQL documentation - By default, if a table is checked, its toast table, if any, will also + By default, if a table is checked, its TOAST table, if any, will also be checked, even if it is not explicitly selected by an option such as --table or --relation. This option suppresses that behavior. 
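The old/new RETURNING behavior described in the merge.sgml hunk above can be sketched as follows (hypothetical tables, following the synopsis shown earlier); for rows handled by the INSERT action the old values are NULL:

MERGE INTO products AS p
USING price_updates AS u ON p.id = u.id
WHEN MATCHED THEN
    UPDATE SET price = u.price
WHEN NOT MATCHED THEN
    INSERT (id, price) VALUES (u.id, u.price)
RETURNING WITH (OLD AS o, NEW AS n)
    merge_action(), p.id, o.price AS old_price, n.price AS new_price;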
@@ -306,9 +306,9 @@ PostgreSQL documentation - By default, whenever a toast pointer is encountered in a table, + By default, whenever a TOAST pointer is encountered in a table, a lookup is performed to ensure that it references apparently-valid - entries in the toast table. These checks can be quite slow, and this + entries in the TOAST table. These checks can be quite slow, and this option can be used to skip them. @@ -368,9 +368,9 @@ PostgreSQL documentation End checking at the specified block number. An error will occur if the table relation being checked has fewer than this number of blocks. This option does not apply to indexes, and is probably only useful when - checking a single table relation. If both a regular table and a toast + checking a single table relation. If both a regular table and a TOAST table are checked, this option will apply to both, but higher-numbered - toast blocks may still be accessed while validating toast pointers, + TOAST blocks may still be accessed while validating TOAST pointers, unless that is suppressed using . diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml index 9659f76042c5b..fecee08b0a536 100644 --- a/doc/src/sgml/ref/pg_basebackup.sgml +++ b/doc/src/sgml/ref/pg_basebackup.sgml @@ -500,8 +500,9 @@ PostgreSQL documentation - Sets checkpoint mode to fast (immediate) or spread (the default) + Sets checkpoint mode to fast or spread (see ). + The default is spread. diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml index 95043aa329c02..e9e393495dfc7 100644 --- a/doc/src/sgml/ref/pg_checksums.sgml +++ b/doc/src/sgml/ref/pg_checksums.sgml @@ -247,5 +247,9 @@ PostgreSQL documentation remains unchanged, and pg_checksums can be re-run to perform the same operation. + + The target cluster must have the same major version as + pg_checksums. + diff --git a/doc/src/sgml/ref/pg_combinebackup.sgml b/doc/src/sgml/ref/pg_combinebackup.sgml index 330a598f7013e..9a6d201e0b8e3 100644 --- a/doc/src/sgml/ref/pg_combinebackup.sgml +++ b/doc/src/sgml/ref/pg_combinebackup.sgml @@ -314,7 +314,7 @@ PostgreSQL documentation To avoid this problem, taking a new full backup after changing the checksum - state of the cluster using is + state of the cluster using is recommended. Otherwise, you can disable and then optionally reenable checksums on the directory produced by pg_combinebackup in order to correct the problem. diff --git a/doc/src/sgml/ref/pg_createsubscriber.sgml b/doc/src/sgml/ref/pg_createsubscriber.sgml index 4b1d08d5f16da..e450c6a5b3769 100644 --- a/doc/src/sgml/ref/pg_createsubscriber.sgml +++ b/doc/src/sgml/ref/pg_createsubscriber.sgml @@ -169,36 +169,6 @@ PostgreSQL documentation - - - - - - Remove all objects of the specified type from specified databases on the - target server. - - - - - - publications: - The FOR ALL TABLES publications established for this - subscriber are always removed; specifying this object type causes all - other publications replicated from the source server to be dropped as - well. - - - - - - The objects selected to be dropped are individually logged, including during - a . There is no opportunity to affect or stop the - dropping of the selected objects, so consider taking a backup of them - using pg_dump. - - - - @@ -259,6 +229,35 @@ PostgreSQL documentation + + + + + Drop all objects of the specified type from specified databases on the + target server. 
+ + + + + + publications: + The FOR ALL TABLES publications established for this + subscriber are always dropped; specifying this object type causes all + other publications replicated from the source server to be dropped as + well. + + + + + + The objects selected to be dropped are individually logged, including during + a . There is no opportunity to affect or stop the + dropping of the selected objects, so consider taking a backup of them + using pg_dump. + + + + @@ -286,6 +285,14 @@ PostgreSQL documentation a generated name is assigned to the publication name. This option cannot be used together with . + + If a specified publication already exists on the publisher, it is reused. + It is useful to partially replicate the database if the specified + publication includes a list of tables. If the publication does not exist, + it is automatically created with FOR ALL TABLES. Use + option to preview which publications will be + reused and which will be created. + @@ -380,12 +387,12 @@ PostgreSQL documentation The source server must accept connections from the target server. The source server must not be in recovery. The source server must have as logical. The source server - must have configured to a value - greater than or equal to the number of specified databases plus existing - replication slots. The source server must have configured to a value greater than or equal - to the number of specified databases and existing WAL sender processes. + linkend="guc-wal-level"/> as replica or logical. + The source server must have + configured to a value greater than or equal to the number of specified + databases plus existing replication slots. The source server must have + configured to a value greater than or + equal to the number of specified databases and existing WAL sender processes. diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index c10bca63e55c5..688e23c0e908b 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -18,7 +18,7 @@ PostgreSQL documentation pg_dump - extract a PostgreSQL database into a script file or other archive file + export a PostgreSQL database as an SQL script or to other formats @@ -96,6 +96,18 @@ PostgreSQL documentation light of the limitations listed below. + + + Restoring a dump causes the destination to execute arbitrary code of the + source superusers' choice. Partial dumps and partial restores do not limit + that. If the source superusers are not trusted, the dumped SQL statements + must be inspected before restoring. Non-plain-text dumps can be inspected + by using pg_restore's + option. Note that the client running the dump and restore need not trust + the source or destination superusers. + + + @@ -285,8 +297,8 @@ PostgreSQL documentation file based output formats, in which case the standard output is used. It must be given for the directory output format however, where it specifies the target directory instead of a file. In this case the - directory is created by pg_dump and must not exist - before. + directory is created by pg_dump unless the directory + exists and is empty. @@ -1134,7 +1146,7 @@ PostgreSQL documentation - Do not dump statistics. + Do not dump statistics. This is the default. @@ -1252,6 +1264,29 @@ PostgreSQL documentation + + + + + Use the provided string as the psql + \restrict key in the dump output. This can only be + specified for plain-text dumps, i.e., when is + set to plain or the option + is omitted. 
If no restrict key is specified, + pg_dump will generate a random one as + needed. Keys may contain only alphanumeric characters. + + + This option is primarily intended for testing purposes and other + scenarios that require repeatable output (e.g., comparing dump files). + It is not recommended for general use, as a malicious server with + advance knowledge of the key may be able to inject arbitrary code that + will be executed on the machine that runs + psql with the dump output. + + + + @@ -1277,11 +1312,11 @@ PostgreSQL documentation The data section contains actual table data, large-object - contents, statistics for tables and materialized views and - sequence values. + contents, sequence values, and statistics for tables, + materialized views, and foreign tables. Post-data items include definitions of indexes, triggers, rules, statistics for indexes, and constraints other than validated check - constraints. + and not-null constraints. Pre-data items include all other data definition items. @@ -1354,12 +1389,22 @@ PostgreSQL documentation + + + + + Dump optimizer statistics. + + + + Dump only the statistics, not the schema (data definitions) or data. - Statistics for tables, materialized views, and indexes are dumped. + Optimizer statistics for tables, materialized views, foreign tables, + and indexes are dumped. @@ -1439,33 +1484,6 @@ PostgreSQL documentation - - - - - Dump data. This is the default. - - - - - - - - - Dump schema (data definitions). This is the default. - - - - - - - - - Dump statistics. This is the default. - - - - @@ -1681,14 +1699,15 @@ CREATE DATABASE foo WITH TEMPLATE template0; - By default, pg_dump will include most optimizer - statistics in the resulting dump file. However, some statistics may not be - included, such as those created explicitly with or custom statistics added by an - extension. Therefore, it may be useful to run ANALYZE - after restoring from a dump file to ensure optimal performance; see and for more - information. + When is specified, + pg_dump will include most optimizer statistics in the + resulting dump file. This does not include all statistics, such as + those created explicitly with , + custom statistics added by an extension, or statistics collected by the + cumulative statistics system. Therefore, it may still be useful to + run ANALYZE after restoring from a dump file to ensure + optimal performance; see and for more information. @@ -1860,7 +1879,7 @@ CREATE DATABASE foo WITH TEMPLATE template0; To dump all tables whose names start with mytable, except - for table mytable2, specify a filter file + for table mytable2, specify a filter file filter.txt like: include table mytable* diff --git a/doc/src/sgml/ref/pg_dumpall.sgml b/doc/src/sgml/ref/pg_dumpall.sgml index 8c5141d036c76..8834b7ec141ea 100644 --- a/doc/src/sgml/ref/pg_dumpall.sgml +++ b/doc/src/sgml/ref/pg_dumpall.sgml @@ -16,7 +16,7 @@ PostgreSQL documentation pg_dumpall - extract a PostgreSQL database cluster using a specified dump format + extract a PostgreSQL database cluster into a script file @@ -33,7 +33,7 @@ PostgreSQL documentation pg_dumpall is a utility for writing out (dumping) all PostgreSQL databases - of a cluster into an archive. The archive contains + of a cluster into one script file. The script file contains SQL commands that can be used as input to to restore the databases. It does this by calling for each database in the cluster. @@ -52,16 +52,11 @@ PostgreSQL documentation - Plain text SQL scripts will be written to the standard output. 
Use the + The SQL script will be written to the standard output. Use the / option or shell operators to redirect it into a file. - - Archives in other formats will be placed in a directory named using the - /, which is required in this case. - - pg_dumpall needs to connect several times to the PostgreSQL server (once per @@ -71,6 +66,16 @@ PostgreSQL documentation linkend="libpq-pgpass"/> for more information. + + + Restoring a dump causes the destination to execute arbitrary code of the + source superusers' choice. Partial dumps and partial restores do not limit + that. If the source superusers are not trusted, the dumped SQL statements + must be inspected before restoring. Note that the client running the dump + and restore need not trust the source or destination superusers. + + + @@ -126,85 +131,10 @@ PostgreSQL documentation Send output to the specified file. If this is omitted, the standard output is used. - Note: This option can only be omitted when is plain - - - - - - Specify the format of dump files. In plain format, all the dump data is - sent in a single text stream. This is the default. - - In all other modes, pg_dumpall first creates two files: - global.dat and map.dat, in the directory - specified by . - The first file contains global data, such as roles and tablespaces. The second - contains a mapping between database oids and names. These files are used by - pg_restore. Data for individual databases is placed in - databases subdirectory, named using the database's oid. - - - - d - directory - - - Output directory-format archives for each database, - suitable for input into pg_restore. The directory - will have database oid as its name. - - - - - - p - plain - - - Output a plain-text SQL script file (the default). - - - - - - c - custom - - - Output a custom-format archive for each database, - suitable for input into pg_restore. The archive - will be named dboid.dmp where dboid is the - oid of the database. - - - - - - t - tar - - - Output a tar-format archive for each database, - suitable for input into pg_restore. The archive - will be named dboid.tar where dboid is the - oid of the database. - - - - - - - Note: see for details - of how the various non plain text archives work. - - - - - @@ -567,7 +497,7 @@ exclude database PATTERN - Do not dump statistics. + Do not dump statistics. This is the default. @@ -671,6 +601,26 @@ exclude database PATTERN + + + + + Use the provided string as the psql + \restrict key in the dump output. If no restrict + key is specified, pg_dumpall will generate a + random one as needed. Keys may contain only alphanumeric characters. + + + This option is primarily intended for testing purposes and other + scenarios that require repeatable output (e.g., comparing dump files). + It is not recommended for general use, as a malicious server with + advance knowledge of the key may be able to inject arbitrary code that + will be executed on the machine that runs + psql with the dump output. + + + + @@ -685,12 +635,22 @@ exclude database PATTERN + + + + + Dump optimizer statistics. + + + + Dump only the statistics, not the schema (data definitions) or data. - Statistics for tables, materialized views, and indexes are dumped. + Optimizer statistics for tables, materialized views, foreign tables, + and indexes are dumped. @@ -719,33 +679,6 @@ exclude database PATTERN - - - - - Dump data. This is the default. - - - - - - - - - Dump schema (data definitions). This is the default. - - - - - - - - - Dump statistics. This is the default. 
- - - - @@ -957,14 +890,15 @@ exclude database PATTERN - By default, pg_dumpall will include most optimizer - statistics in the resulting dump file. However, some statistics may not be - included, such as those created explicitly with or custom statistics added by an - extension. Therefore, it may be useful to run ANALYZE - on each database after restoring from a dump file to ensure optimal - performance. You can also run vacuumdb -a -z to analyze - all databases. + When is specified, + pg_dumpall will include most optimizer statistics in the + resulting dump file. This does not include all statistics, such as + those created explicitly with , + custom statistics added by an extension, or statistics collected by the + cumulative statistics system. Therefore, it may still be useful to + run ANALYZE on each database after restoring from a dump + file to ensure optimal performance. You can also run vacuumdb -a + -z to analyze all databases. diff --git a/doc/src/sgml/ref/pg_recvlogical.sgml b/doc/src/sgml/ref/pg_recvlogical.sgml index 63a45c7018a45..5380d776bafb1 100644 --- a/doc/src/sgml/ref/pg_recvlogical.sgml +++ b/doc/src/sgml/ref/pg_recvlogical.sgml @@ -53,6 +53,16 @@ PostgreSQL documentation (ControlC) or SIGTERM signal. + + + When pg_recvlogical receives + a SIGHUP signal, it closes the current output file + and opens a new one using the filename specified by + the option. This allows us to rotate + the output file by first renaming the current file and then sending + a SIGHUP signal to + pg_recvlogical. + @@ -74,13 +84,13 @@ PostgreSQL documentation - The and are required + The and options are required for this action. - The and options - can be specified with . + The and + options can be specified with . @@ -94,7 +104,7 @@ PostgreSQL documentation - The is required for this action. + The option is required for this action. @@ -111,8 +121,8 @@ PostgreSQL documentation - The and , - are required for this action. + The , , and + options are required for this action. @@ -166,7 +176,7 @@ PostgreSQL documentation - + Enables the slot to be synchronized to the standbys. This option may @@ -196,7 +206,7 @@ PostgreSQL documentation Specifies how often pg_recvlogical should issue fsync() calls to ensure the output file is - safely flushed to disk. + safely flushed to disk. The default value is 10 seconds. @@ -265,8 +275,10 @@ PostgreSQL documentation When creating a slot, use the specified logical decoding output - plugin. See . This option has no - effect if the slot already exists. + plugin. See for + information about the plugins PostgreSQL + provides. The default is . + This option has no effect if the slot already exists. @@ -300,7 +312,8 @@ PostgreSQL documentation - + + (deprecated) Enables decoding of prepared transactions. This option may only be specified with diff --git a/doc/src/sgml/ref/pg_resetwal.sgml b/doc/src/sgml/ref/pg_resetwal.sgml index 2c019c2aac6eb..41f2b1d480c51 100644 --- a/doc/src/sgml/ref/pg_resetwal.sgml +++ b/doc/src/sgml/ref/pg_resetwal.sgml @@ -267,14 +267,17 @@ PostgreSQL documentation A safe value for the next multitransaction ID (first part) can be determined by looking for the numerically largest file name in the directory pg_multixact/offsets under the data directory, - adding one, and then multiplying by 65536 (0x10000). Conversely, a safe + adding one, and then multiplying by 32768 (0x8000). 
Conversely, a safe value for the oldest multitransaction ID (second part of ) can be determined by looking for the numerically smallest - file name in the same directory and multiplying by 65536. The file - names are in hexadecimal, so the easiest way to do this is to specify - the option value in hexadecimal and append four zeroes. + file name in the same directory and multiplying by 32768 (0x8000). + Note that the file names are in hexadecimal. It is usually easiest + to specify the option value in hexadecimal too. For example, if + 000F and 0007 are the greatest and + smallest entries in pg_multixact/offsets, + -m 0x80000,0x38000 will work. - + diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml index 2295df62d03a8..a468a38361a13 100644 --- a/doc/src/sgml/ref/pg_restore.sgml +++ b/doc/src/sgml/ref/pg_restore.sgml @@ -18,9 +18,8 @@ PostgreSQL documentation pg_restore - restore a PostgreSQL database or cluster - from an archive created by pg_dump or - pg_dumpall + restore a PostgreSQL database from an + archive file created by pg_dump @@ -39,14 +38,13 @@ PostgreSQL documentation pg_restore is a utility for restoring a - PostgreSQL database or cluster from an archive - created by or - in one of the non-plain-text + PostgreSQL database from an archive + created by in one of the non-plain-text formats. It will issue the commands necessary to reconstruct the - database or cluster to the state it was in at the time it was saved. The - archives also allow pg_restore to + database to the state it was in at the time it was saved. The + archive files also allow pg_restore to be selective about what is restored, or even to reorder the items - prior to being restored. The archive formats are designed to be + prior to being restored. The archive files are designed to be portable across architectures. @@ -54,17 +52,10 @@ PostgreSQL documentation pg_restore can operate in two modes. If a database name is specified, pg_restore connects to that database and restores archive contents directly into - the database. - When restoring from a dump made by pg_dumpall, - each database will be created and then the restoration will be run in that - database. - - Otherwise, when a database name is not specified, a script containing the SQL - commands necessary to rebuild the database or cluster is created and written + the database. Otherwise, a script containing the SQL + commands necessary to rebuild the database is created and written to a file or standard output. This script output is equivalent to - the plain text output format of pg_dump or - pg_dumpall. - + the plain text output format of pg_dump. Some of the options controlling the output are therefore analogous to pg_dump options. @@ -77,6 +68,18 @@ PostgreSQL documentation pg_restore will not be able to load the data using COPY statements. + + + + Restoring a dump causes the destination to execute arbitrary code of the + source superusers' choice. Partial dumps and partial restores do not limit + that. If the source superusers are not trusted, the dumped SQL statements + must be inspected before restoring. Non-plain-text dumps can be inspected + by using pg_restore's + option. Note that the client running the dump and restore need not trust + the source or destination superusers. + + @@ -149,8 +152,6 @@ PostgreSQL documentation commands that mention this database. Access privileges for the database itself are also restored, unless is specified. 
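A sketch of the inspection workflow recommended by the security notes above (archive and database names hypothetical): write the SQL out of a non-plain-text archive without touching any database, review it, and only then restore:

pg_restore --file=dump.sql dump.custom
# inspect dump.sql, then:
pg_restore --dbname=newdb dump.custom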
- is required when restoring multiple databases - from an archive created by pg_dumpall. @@ -246,19 +247,6 @@ PostgreSQL documentation - - - - - - Restore only global objects (roles and tablespaces), no databases. - - - This option is only relevant when restoring from an archive made using pg_dumpall. - - - - @@ -603,28 +591,6 @@ PostgreSQL documentation - - - - - Do not restore databases whose name matches - pattern. - Multiple patterns can be excluded by writing multiple - switches. The - pattern parameter is - interpreted as a pattern according to the same rules used by - psql's \d - commands (see ), - so multiple databases can also be excluded by writing wildcard - characters in the pattern. When using wildcards, be careful to - quote the pattern if needed to prevent shell wildcard expansion. - - - This option is only relevant when restoring from an archive made using pg_dumpall. - - - - @@ -842,6 +808,28 @@ PostgreSQL documentation + + + + + Use the provided string as the psql + \restrict key in the dump output. This can only be + specified for SQL script output, i.e., when the + option is used. If no restrict key is specified, + pg_restore will generate a random one as + needed. Keys may contain only alphanumeric characters. + + + This option is primarily intended for testing purposes and other + scenarios that require repeatable output (e.g., comparing dump files). + It is not recommended for general use, as a malicious server with + advance knowledge of the key may be able to inject arbitrary code that + will be executed on the machine that runs + psql with the dump output. + + + + @@ -861,6 +849,16 @@ PostgreSQL documentation + + + + + Output commands to restore statistics, if the archive contains them. + This is the default. + + + + @@ -919,33 +917,6 @@ PostgreSQL documentation - - - - - Dump data. This is the default. - - - - - - - - - Dump schema (data definitions). This is the default. - - - - - - - - - Dump statistics. This is the default. - - - - diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml index 5485033ed8c7c..5b155cfa12a7d 100644 --- a/doc/src/sgml/ref/pg_rewind.sgml +++ b/doc/src/sgml/ref/pg_rewind.sgml @@ -52,12 +52,32 @@ PostgreSQL documentation analogous to a base backup of the source data directory. Unlike taking a new base backup or using a tool like rsync, pg_rewind does not require comparing or copying - unchanged relation blocks in the cluster. Only changed blocks from existing - relation files are copied; all other files, including new relation files, - configuration files, and WAL segments, are copied in full. As such the - rewind operation is significantly faster than other approaches when the - database is large and only a small fraction of blocks differ between the - clusters. + unchanged relation blocks in the cluster: + + + + + Only changed blocks from existing relation files are copied. + + + + + WAL segments prior to the point where the source and target servers + have diverged are not copied. WAL segments generated after the source + and target servers have diverged are copied in full. + + + + + All other files, including new relation files and configuration files, + are copied in full. + + + + + As such, the rewind operation is significantly faster than other + approaches when the database is large and only a small fraction of blocks + differ between the clusters. 
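For reference, a typical pg_rewind invocation matching the description above might look like this (paths and connection string hypothetical; rewind_user is the role granted the functions shown in the next hunk):

pg_rewind --target-pgdata=/var/lib/postgresql/data \
    --source-server='host=primary.example.com user=rewind_user dbname=postgres'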
@@ -352,10 +372,10 @@ PostgreSQL documentation a role, named rewind_user here: CREATE USER rewind_user LOGIN; -GRANT EXECUTE ON function pg_catalog.pg_ls_dir(text, boolean, boolean) TO rewind_user; -GRANT EXECUTE ON function pg_catalog.pg_stat_file(text, boolean) TO rewind_user; -GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text) TO rewind_user; -GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text, bigint, bigint, boolean) TO rewind_user; +GRANT EXECUTE ON FUNCTION pg_catalog.pg_ls_dir(text, boolean, boolean) TO rewind_user; +GRANT EXECUTE ON FUNCTION pg_catalog.pg_stat_file(text, boolean) TO rewind_user; +GRANT EXECUTE ON FUNCTION pg_catalog.pg_read_binary_file(text) TO rewind_user; +GRANT EXECUTE ON FUNCTION pg_catalog.pg_read_binary_file(text, bigint, bigint, boolean) TO rewind_user; diff --git a/doc/src/sgml/ref/pgarchivecleanup.sgml b/doc/src/sgml/ref/pgarchivecleanup.sgml index cd8f49b1c5bd7..79e751381ac41 100644 --- a/doc/src/sgml/ref/pgarchivecleanup.sgml +++ b/doc/src/sgml/ref/pgarchivecleanup.sgml @@ -44,7 +44,7 @@ PostgreSQL documentation server to use pg_archivecleanup, put this into its postgresql.conf configuration file: -archive_cleanup_command = 'pg_archivecleanup archivelocation %r' +archive_cleanup_command = 'pg_archivecleanup archivelocation "%r"' where archivelocation is the directory from which WAL segment files should be removed. @@ -198,7 +198,7 @@ pg_archivecleanup: removing file "archive/00000001000000370000000E" On Linux or Unix systems, you might use: -archive_cleanup_command = 'pg_archivecleanup -d /mnt/standby/archive %r 2>>cleanup.log' +archive_cleanup_command = 'pg_archivecleanup -d /mnt/standby/archive "%r" 2>>cleanup.log' where the archive directory is physically located on the standby server, so that the archive_command is accessing it across NFS, diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml index ab252d9fc74f9..2e401d1ceb8bc 100644 --- a/doc/src/sgml/ref/pgbench.sgml +++ b/doc/src/sgml/ref/pgbench.sgml @@ -76,9 +76,8 @@ tps = 896.967014 (without initial connection time) and number of transactions per client); these will be equal unless the run failed before completion or some SQL command(s) failed. (In mode, only the actual number of transactions is printed.) - The next line reports the number of failed transactions due to - serialization or deadlock errors (see - for more information). + The next line reports the number of failed transactions (see + for more information). The last line reports the number of transactions per second. @@ -759,6 +758,26 @@ pgbench options d + + + + + Allows clients to continue running even if an SQL statement fails + due to errors other than serialization or deadlock. By default, + clients abort after such errors, but with this option enabled, + they proceed to the next transaction instead. Note that + clients still abort even with this option if an error causes + the connection to fail. + See for more information. + + + This option is useful when your custom script may raise errors + such as unique constraint violations, but you want the benchmark + to continue and measure performance including those failures. + + + + @@ -790,6 +809,9 @@ pgbench options d deadlock failures; + + other failures; + See for more information. 
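A sketch of combining the new option with per-failure reporting (script and database names hypothetical); serialization and deadlock errors are still retried up to the --max-tries limit, while other SQL errors now merely end the current transaction:

pgbench --client=10 --time=60 --max-tries=10 \
    --continue-on-error --failures-detailed \
    --file=custom.sql testdb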
@@ -1188,10 +1210,8 @@ pgbench options d - - \gset [prefix] - \aset [prefix] - + \gset [prefix] + \aset [prefix] @@ -1203,16 +1223,17 @@ pgbench options d When the \gset command is used, the preceding SQL query is expected to return one row, the columns of which are stored into variables named after column names, and prefixed with prefix - if provided. + if provided. If the query returns zero or multiple rows, an error is raised. When the \aset command is used, all combined SQL queries (separated by \;) have their columns stored into variables named after column names, and prefixed with prefix - if provided. If a query returns no row, no assignment is made and the variable - can be tested for existence to detect this. If a query returns more than one - row, the last value is kept. + if provided. If a query returns no rows, no assignment is made. + This can be detected by initializing the variable beforehand with + a value the query cannot return, then checking whether it changes. + If a query returns more than one row, the last value is kept. @@ -2409,8 +2430,8 @@ END; will be reported as failed. If you use the option, the time of the failed transaction will be reported as - serialization or - deadlock depending on the type of failure (see + serialization, deadlock, or + other depending on the type of failure (see for more information). @@ -2638,6 +2659,17 @@ END; + + + other_sql_failures + + + number of transactions that got an SQL error + (zero unless both and + are specified) + + + @@ -2646,8 +2678,8 @@ END; pgbench --aggregate-interval=10 --time=20 --client=10 --log --rate=1000 --latency-limit=10 --failures-detailed --max-tries=10 test -1650260552 5178 26171317 177284491527 1136 44462 2647617 7321113867 0 9866 64 7564 28340 4148 0 -1650260562 4808 25573984 220121792172 1171 62083 3037380 9666800914 0 9998 598 7392 26621 4527 0 +1650260552 5178 26171317 177284491527 1136 44462 2647617 7321113867 0 9866 64 7564 28340 4148 0 0 +1650260562 4808 25573984 220121792172 1171 62083 3037380 9666800914 0 9998 598 7392 26621 4527 0 0 @@ -2826,7 +2858,7 @@ statement latencies in milliseconds, failures and retries: start a connection to the database server / the socket for connecting the client to the database server has become invalid). In such cases all clients of this thread stop while other threads continue to work. - However, is specified, all of the + However, if is specified, all of the threads stop immediately in this case. @@ -2851,10 +2883,20 @@ statement latencies in milliseconds, failures and retries: A client's run is aborted in case of a serious error; for example, the connection with the database server was lost or the end of script was reached - without completing the last transaction. In addition, if execution of an SQL - or meta command fails for reasons other than serialization or deadlock errors, - the client is aborted. Otherwise, if an SQL command fails with serialization or - deadlock errors, the client is not aborted. In such cases, the current + without completing the last transaction. The client also aborts + if a meta command fails, or if an SQL command fails for reasons other than + serialization or deadlock errors when + is not specified. With , + the client does not abort on such SQL errors and instead proceeds to + the next transaction. These cases are reported as + other failures in the output. If the error occurs + in a meta command, however, the client still aborts even when this option + is specified. 
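To illustrate the \gset meta command described earlier in this hunk, a custom-script fragment might read as follows (variable names arbitrary; pgbench_accounts is the standard pgbench table):

\set aid random(1, 100000 * :scale)
SELECT abalance FROM pgbench_accounts WHERE aid = :aid \gset
\set newbal :abalance + 10
UPDATE pgbench_accounts SET abalance = :newbal WHERE aid = :aid;

The SELECT stores its single row into variables named after the output columns, so :abalance becomes available to the following \set; with zero or multiple rows the command would raise an error, as noted above.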
+ + + If an SQL command fails due to serialization or deadlock errors, the + client does not abort, regardless of whether + is used. Instead, the current transaction is rolled back, which also includes setting the client variables as they were before the run of this transaction (it is assumed that one transaction script contains only one transaction; see diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml index a5eb3aa25e02f..afe6a12be4b30 100644 --- a/doc/src/sgml/ref/pgtesttiming.sgml +++ b/doc/src/sgml/ref/pgtesttiming.sgml @@ -30,11 +30,23 @@ PostgreSQL documentation Description - pg_test_timing is a tool to measure the timing overhead - on your system and confirm that the system time never moves backwards. + pg_test_timing is a tool to measure the + timing overhead on your system and confirm that the system time never + moves backwards. It simply reads the system clock over and over again + as fast as it can for a specified length of time, and then prints + statistics about the observed differences in successive clock readings. + + + Smaller (but not zero) differences are better, since they imply both + more-precise clock hardware and less overhead to collect a clock reading. Systems that are slow to collect timing data can give less accurate EXPLAIN ANALYZE results. + + This tool is also helpful to determine if + the track_io_timing configuration parameter is likely + to produce useful results. + @@ -59,6 +71,21 @@ PostgreSQL documentation + + + + + + Specifies the cutoff percentage for the list of exact observed + timing durations (that is, the changes in the system clock value + from one reading to the next). The list will end once the running + percentage total reaches or exceeds this value, except that the + largest observed duration will always be printed. The default + cutoff is 99.99. + + + + @@ -92,205 +119,83 @@ PostgreSQL documentation Interpreting Results - Good results will show most (>90%) individual timing calls take less than - one microsecond. Average per loop overhead will be even lower, below 100 - nanoseconds. This example from an Intel i7-860 system using a TSC clock - source shows excellent performance: - - + The first block of output has four columns, with rows showing a + shifted-by-one log2(ns) histogram of timing durations (that is, the + differences between successive clock readings). This is not the + classic log2(n+1) histogram as it counts zeros separately and then + switches to log2(ns) starting from value 1. - - Note that different units are used for the per loop time than the - histogram. The loop can have resolution within a few nanoseconds (ns), - while the individual timing calls can only resolve down to one microsecond - (us). + The columns are: + + + nanosecond value that is >= the durations in this + bucket + + + percentage of durations in this bucket + + + running-sum percentage of durations in this and previous + buckets + + + count of durations in this bucket + + - - - - Measuring Executor Timing Overhead - - When the query executor is running a statement using - EXPLAIN ANALYZE, individual operations are timed as well - as showing a summary. The overhead of your system can be checked by - counting rows with the psql program: - - -CREATE TABLE t AS SELECT * FROM generate_series(1,100000); -\timing -SELECT COUNT(*) FROM t; -EXPLAIN ANALYZE SELECT COUNT(*) FROM t; - + The second block of output goes into more detail, showing the exact + timing differences observed. 
For brevity this list is cut off when the + running-sum percentage exceeds the user-selectable cutoff value. + However, the largest observed difference is always shown. - - The i7-860 system measured runs the count query in 9.8 ms while - the EXPLAIN ANALYZE version takes 16.6 ms, each - processing just over 100,000 rows. That 6.8 ms difference means the timing - overhead per row is 68 ns, about twice what pg_test_timing estimated it - would be. Even that relatively small amount of overhead is making the fully - timed count statement take almost 70% longer. On more substantial queries, - the timing overhead would be less problematic. + The example results below show that 99.99% of timing loops took between + 8 and 31 nanoseconds, with the worst case somewhere between 32768 and + 65535 nanoseconds. In the second block, we can see that typical loop + time is 16 nanoseconds, and the readings appear to have full nanosecond + precision. - - - - Changing Time Sources - On some newer Linux systems, it's possible to change the clock source used - to collect timing data at any time. A second example shows the slowdown - possible from switching to the slower acpi_pm time source, on the same - system used for the fast results above: - /sys/devices/system/clocksource/clocksource0/current_clocksource -# pg_test_timing -Per loop time including overhead: 722.92 ns +Testing timing overhead for 3 seconds. +Average loop time including overhead: 16.40 ns Histogram of timing durations: - < us % of total count - 1 27.84870 1155682 - 2 72.05956 2990371 - 4 0.07810 3241 - 8 0.01357 563 - 16 0.00007 3 + <= ns % of total running % count + 0 0.0000 0.0000 0 + 1 0.0000 0.0000 0 + 3 0.0000 0.0000 0 + 7 0.0000 0.0000 0 + 15 4.5452 4.5452 8313178 + 31 95.4527 99.9979 174581501 + 63 0.0001 99.9981 253 + 127 0.0001 99.9982 165 + 255 0.0000 99.9982 35 + 511 0.0000 99.9982 1 + 1023 0.0013 99.9994 2300 + 2047 0.0004 99.9998 690 + 4095 0.0000 99.9998 9 + 8191 0.0000 99.9998 8 + 16383 0.0002 100.0000 337 + 32767 0.0000 100.0000 2 + 65535 0.0000 100.0000 1 + +Observed timing durations up to 99.9900%: + ns % of total running % count + 15 4.5452 4.5452 8313178 + 16 58.3785 62.9237 106773354 + 17 33.6840 96.6078 61607584 + 18 3.1151 99.7229 5697480 + 19 0.2638 99.9867 482570 + 20 0.0093 99.9960 17054 +... + 38051 0.0000 100.0000 1 ]]> - - In this configuration, the sample EXPLAIN ANALYZE above - takes 115.9 ms. That's 1061 ns of timing overhead, again a small multiple - of what's measured directly by this utility. That much timing overhead - means the actual query itself is only taking a tiny fraction of the - accounted for time, most of it is being consumed in overhead instead. In - this configuration, any EXPLAIN ANALYZE totals involving - many timed operations would be inflated significantly by timing overhead. - - - - FreeBSD also allows changing the time source on the fly, and it logs - information about the timer selected during boot: - - -# dmesg | grep "Timecounter" -Timecounter "ACPI-fast" frequency 3579545 Hz quality 900 -Timecounter "i8254" frequency 1193182 Hz quality 0 -Timecounters tick every 10.000 msec -Timecounter "TSC" frequency 2531787134 Hz quality 800 -# sysctl kern.timecounter.hardware=TSC -kern.timecounter.hardware: ACPI-fast -> TSC - - - - - Other systems may only allow setting the time source on boot. On older - Linux systems the "clock" kernel setting is the only way to make this sort - of change. And even on some more recent ones, the only option you'll see - for a clock source is "jiffies". 
Jiffies are the older Linux software clock - implementation, which can have good resolution when it's backed by fast - enough timing hardware, as in this example: - - - - - - Clock Hardware and Timing Accuracy - - - Collecting accurate timing information is normally done on computers using - hardware clocks with various levels of accuracy. With some hardware the - operating systems can pass the system clock time almost directly to - programs. A system clock can also be derived from a chip that simply - provides timing interrupts, periodic ticks at some known time interval. In - either case, operating system kernels provide a clock source that hides - these details. But the accuracy of that clock source and how quickly it can - return results varies based on the underlying hardware. - - - - Inaccurate time keeping can result in system instability. Test any change - to the clock source very carefully. Operating system defaults are sometimes - made to favor reliability over best accuracy. And if you are using a virtual - machine, look into the recommended time sources compatible with it. Virtual - hardware faces additional difficulties when emulating timers, and there are - often per operating system settings suggested by vendors. - - - - The Time Stamp Counter (TSC) clock source is the most accurate one available - on current generation CPUs. It's the preferred way to track the system time - when it's supported by the operating system and the TSC clock is - reliable. There are several ways that TSC can fail to provide an accurate - timing source, making it unreliable. Older systems can have a TSC clock that - varies based on the CPU temperature, making it unusable for timing. Trying - to use TSC on some older multicore CPUs can give a reported time that's - inconsistent among multiple cores. This can result in the time going - backwards, a problem this program checks for. And even the newest systems - can fail to provide accurate TSC timing with very aggressive power saving - configurations. - - - - Newer operating systems may check for the known TSC problems and switch to a - slower, more stable clock source when they are seen. If your system - supports TSC time but doesn't default to that, it may be disabled for a good - reason. And some operating systems may not detect all the possible problems - correctly, or will allow using TSC even in situations where it's known to be - inaccurate. - - - - The High Precision Event Timer (HPET) is the preferred timer on systems - where it's available and TSC is not accurate. The timer chip itself is - programmable to allow up to 100 nanosecond resolution, but you may not see - that much accuracy in your system clock. - - - - Advanced Configuration and Power Interface (ACPI) provides a Power - Management (PM) Timer, which Linux refers to as the acpi_pm. The clock - derived from acpi_pm will at best provide 300 nanosecond resolution. - - - - Timers used on older PC hardware include the 8254 Programmable Interval - Timer (PIT), the real-time clock (RTC), the Advanced Programmable Interrupt - Controller (APIC) timer, and the Cyclone timer. These timers aim for - millisecond resolution. 
- - @@ -298,6 +203,8 @@ Histogram of timing durations: + Wiki + discussion about timing diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index aeeed297437e6..38ca09b423c32 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -70,6 +70,14 @@ PostgreSQL documentation pg_upgrade supports upgrades from 9.2.X and later to the current major release of PostgreSQL, including snapshot and beta releases. + + + + Upgrading a cluster causes the destination to execute arbitrary code of the + source superusers' choice. Ensure that the source superusers are trusted + before upgrading. + + @@ -825,10 +833,10 @@ psql --username=postgres --file=script.sql postgres Unless the option is specified, pg_upgrade will transfer most optimizer statistics - from the old cluster to the new cluster. However, some statistics may - not be transferred, such as those created explicitly with or custom statistics added by an - extension. + from the old cluster to the new cluster. This does not transfer + all statistics, such as those created explicitly with + , custom statistics added by + an extension, or statistics collected by the cumulative statistics system. @@ -1110,7 +1118,8 @@ psql --username=postgres --file=script.sql postgres regproc regprocedure - (regclass, regrole, and regtype can be upgraded.) + (regclass, regdatabase, regrole, and + regtype can be upgraded.) diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index 8f7d8758ca02f..f56c70263e07b 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -1067,8 +1067,8 @@ INSERT INTO tbls1 VALUES ($1, $2) \parse stmt1 - - \close prepared_statement_name + + \close_prepared prepared_statement_name @@ -1081,7 +1081,7 @@ INSERT INTO tbls1 VALUES ($1, $2) \parse stmt1 Example: SELECT $1 \parse stmt1 -\close stmt1 +\close_prepared stmt1 @@ -1101,7 +1101,16 @@ SELECT $1 \parse stmt1 Outputs information about the current database connection, - including TLS-related information if TLS is in use. + including SSL-related information if SSL is in use. + + + Note that the Client User field shows + the user at the time of connection, while the + Superuser field indicates whether + the current user (in the current execution context) has + superuser privileges. These users are usually the same, but they can + differ, for example, if the current user was changed with the + SET ROLE command. @@ -2522,7 +2531,7 @@ Tue Oct 26 21:40:57 CEST 1999 statement to be executed. For example, to create an index on each column of my_table: -=> SELECT format('create index on my_table(%I)', attname) +=> SELECT format('CREATE INDEX ON my_table (%I)', attname) -> FROM pg_attribute -> WHERE attrelid = 'my_table'::regclass AND attnum > 0 -> ORDER BY attnum @@ -2757,8 +2766,8 @@ hello 10 -- check for the existence of two separate records in the database and store -- the results in separate psql variables SELECT - EXISTS(SELECT 1 FROM customer WHERE customer_id = 123) as is_customer, - EXISTS(SELECT 1 FROM employee WHERE employee_id = 456) as is_employee + EXISTS (SELECT 1 FROM customer WHERE customer_id = 123) AS is_customer, + EXISTS (SELECT 1 FROM employee WHERE employee_id = 456) AS is_employee \gset \if :is_customer SELECT * FROM customer WHERE customer_id = 123; @@ -3090,6 +3099,26 @@ SELECT $1 \parse stmt1 + + display_false + + + Sets the string to be printed in place of a false value. + The default is to print f. 
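+
+  
+   For example (an illustrative sketch; like the other printing options,
+   this is set with \pset, and psql's confirmation message is omitted here):
+  
+
+testdb=> \pset display_false no
+testdb=> SELECT false AS value;
+ value
+-------
+ no
+(1 row)
+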
+ + + + + + display_true + + + Sets the string to be printed in place of a true value. + The default is to print t. + + + + expanded (or x) @@ -3542,6 +3571,24 @@ SELECT $1 \parse stmt1 + + \restrict restrict_key + + + Enter "restricted" mode with the provided key. In this mode, the only + allowed meta-command is \unrestrict, to exit + restricted mode. The key may contain only alphanumeric characters. + + + This command is primarily intended for use in plain-text dumps + generated by pg_dump, + pg_dumpall, and + pg_restore, but it may be useful elsewhere. + + + + + \s [ filename ] @@ -3701,7 +3748,7 @@ testdb=> \setenv LESS -imx4F All queries executed while a pipeline is ongoing use the extended query protocol. Queries are appended to the pipeline when ending with a semicolon. The meta-commands \bind, - \bind_named, \close or + \bind_named, \close_prepared or \parse can be used in an ongoing pipeline. While a pipeline is ongoing, \sendpipeline will append the current query buffer to the pipeline. Other meta-commands like @@ -3733,6 +3780,10 @@ testdb=> \setenv LESS -imx4F See for more details + + COPY is not supported while in pipeline mode. + + Example: @@ -3789,6 +3840,24 @@ SELECT 1 \bind \sendpipeline + + \unrestrict restrict_key + + + Exit "restricted" mode (i.e., where all other meta-commands are + blocked), provided the specified key matches the one given to + \restrict when restricted mode was entered. + + + This command is primarily intended for use in plain-text dumps + generated by pg_dump, + pg_dumpall, and + pg_restore, but it may be useful elsewhere. + + + + + \unset name @@ -3853,7 +3922,7 @@ SELECT 1 \bind \sendpipeline (if given) is reached, or the query no longer returns the minimum number of rows. Wait the specified number of seconds (default 2) between executions. The default wait can be changed with the variable - ). + . For backwards compatibility, seconds can be specified with or without an interval= prefix. @@ -3954,7 +4023,7 @@ SELECT 1 \bind \sendpipeline server as soon as it reaches the command-ending semicolon, even if more input remains on the current line. Thus for example entering -select 1; select 2; select 3; +SELECT 1; SELECT 2; SELECT 3; will result in the three SQL commands being individually sent to the server, with each one's results being displayed before @@ -3963,7 +4032,7 @@ select 1; select 2; select 3; command before it and the one after are effectively combined and sent to the server in one request. So for example -select 1\; select 2\; select 3; +SELECT 1\; SELECT 2\; SELECT 3; results in sending the three SQL commands to the server in a single request, when the non-backslashed semicolon is reached. @@ -4610,6 +4679,15 @@ bar + + SERVICEFILE + + + The service file name, if applicable. + + + + SHELL_ERROR @@ -4752,9 +4830,10 @@ bar WATCH_INTERVAL - This variable sets the default interval which \watch - waits between executing the query. Specifying an interval in the - command overrides this variable. + This variable sets the default interval, in seconds, which + \watch waits between executing the query. The + default is 2 seconds. Specifying an interval in the command overrides + this variable. @@ -4780,7 +4859,7 @@ bar testdb=> \set foo 'my_table' testdb=> SELECT * FROM :foo; - would query the table my_table. Note that this + would query the table my_table. Note that this may be unsafe: the value of the variable is copied literally, so it can contain unbalanced quotes, or even backslash commands. 
You must make sure that it makes sense where you put it. @@ -4915,6 +4994,17 @@ testdb=> INSERT INTO my_table VALUES (:'content'); + + %S + + + The current value of , or + ? if connected to a server running + PostgreSQL 17 or older. + + + + %s The name of the service. @@ -5491,7 +5581,7 @@ PSQL_EDITOR_LINENUMBER_ARG='--line ' input. Notice the changing prompt: testdb=> CREATE TABLE my_table ( -testdb(> first integer not null default 0, +testdb(> first integer NOT NULL DEFAULT 0, testdb(> second text) testdb-> ; CREATE TABLE @@ -5680,8 +5770,8 @@ testdb=> \crosstabview first second This second example shows a multiplication table with rows sorted in reverse numerical order and columns with an independent, ascending numerical order. -testdb=> SELECT t1.first as "A", t2.first+100 AS "B", t1.first*(t2.first+100) as "AxB", -testdb-> row_number() over(order by t2.first) AS ord +testdb=> SELECT t1.first AS "A", t2.first+100 AS "B", t1.first*(t2.first+100) AS "AxB", +testdb-> row_number() OVER (ORDER BY t2.first) AS ord testdb-> FROM my_table t1 CROSS JOIN my_table t2 ORDER BY 1 DESC testdb-> \crosstabview "A" "B" "AxB" ord A | 101 | 102 | 103 | 104 diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml index c405539714695..185cd75ca3012 100644 --- a/doc/src/sgml/ref/reindex.sgml +++ b/doc/src/sgml/ref/reindex.sgml @@ -528,7 +528,7 @@ REINDEX INDEX my_index; - Rebuild all the indexes on the table my_table: + Rebuild all the indexes on the table my_table: REINDEX TABLE my_table; diff --git a/doc/src/sgml/ref/reindexdb.sgml b/doc/src/sgml/ref/reindexdb.sgml index abcb041179bb9..a90e48ea86ba9 100644 --- a/doc/src/sgml/ref/reindexdb.sgml +++ b/doc/src/sgml/ref/reindexdb.sgml @@ -433,7 +433,7 @@ PostgreSQL documentation - To reindex the table foo and the index + To reindex the table foo and the index bar in a database named abcd: $ reindexdb --table=foo --index=bar abcd diff --git a/doc/src/sgml/ref/security_label.sgml b/doc/src/sgml/ref/security_label.sgml index e5e5fb483e94e..aa45c0af2487b 100644 --- a/doc/src/sgml/ref/security_label.sgml +++ b/doc/src/sgml/ref/security_label.sgml @@ -84,6 +84,10 @@ SECURITY LABEL [ FOR provider ] ON based on object labels, rather than traditional discretionary access control (DAC) concepts such as users and groups. + + + You must own the database object to use SECURITY LABEL. + diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml index d7089eac0bee7..ca5dd14d62778 100644 --- a/doc/src/sgml/ref/select.sgml +++ b/doc/src/sgml/ref/select.sgml @@ -37,7 +37,7 @@ SELECT [ ALL | DISTINCT [ ON ( expressionexpression [ [ AS ] output_name ] } [, ...] ] [ FROM from_item [, ...] ] [ WHERE condition ] - [ GROUP BY [ ALL | DISTINCT ] grouping_element [, ...] ] + [ GROUP BY { ALL | [ ALL | DISTINCT ] grouping_element [, ...] } ] [ HAVING condition ] [ WINDOW window_name AS ( window_definition ) [, ...] ] [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select ] @@ -796,7 +796,7 @@ WHERE condition The optional GROUP BY clause has the general form -GROUP BY [ ALL | DISTINCT ] grouping_element [, ...] +GROUP BY { ALL | [ ALL | DISTINCT ] grouping_element [, ...] } @@ -808,21 +808,31 @@ GROUP BY [ ALL | DISTINCT ] grouping_elementgrouping_element can be an input column name, or the name or ordinal number of an output column (SELECT list item), or an arbitrary - expression formed from input-column values. In case of ambiguity, + expression formed from input-column values; however, it cannot contain + an aggregate function or a window function. 
In case of ambiguity, a GROUP BY name will be interpreted as an input-column name rather than an output column name. + + The form GROUP BY ALL with no explicit + grouping_elements + provided is equivalent to writing GROUP BY with the + numbers of all SELECT output columns that do not + contain either an aggregate function or a window function. + + If any of GROUPING SETS, ROLLUP or CUBE are present as grouping elements, then the GROUP BY clause as a whole defines some number of independent grouping sets. The effect of this is equivalent to constructing a UNION ALL between - subqueries with the individual grouping sets as their + subqueries having the individual grouping sets as their GROUP BY clauses. The optional DISTINCT - clause removes duplicate sets before processing; it does not - transform the UNION ALL into a UNION DISTINCT. + key word removes duplicate grouping sets before processing; it does not + transform the implied UNION ALL into + a UNION DISTINCT. For further details on the handling of grouping sets see . @@ -1758,8 +1768,8 @@ SELECT * FROM name Examples - To join the table films with the table - distributors: + To join the table films with the table + distributors: SELECT f.title, f.did, d.name, f.date_prod, f.kind @@ -1774,8 +1784,8 @@ SELECT f.title, f.did, d.name, f.date_prod, f.kind - To sum the column len of all films and group - the results by kind: + To sum the column len of all films and group + the results by kind: SELECT kind, sum(len) AS total FROM films GROUP BY kind; @@ -1791,8 +1801,8 @@ SELECT kind, sum(len) AS total FROM films GROUP BY kind; - To sum the column len of all films, group - the results by kind and show those group totals + To sum the column len of all films, group + the results by kind and show those group totals that are less than 5 hours: @@ -1811,7 +1821,7 @@ SELECT kind, sum(len) AS total The following two examples are identical ways of sorting the individual results according to the contents of the second column - (name): + (name): SELECT * FROM distributors ORDER BY name; @@ -1837,8 +1847,8 @@ SELECT * FROM distributors ORDER BY 2; The next example shows how to obtain the union of the tables - distributors and - actors, restricting the results to those that begin + distributors and + actors, restricting the results to those that begin with the letter W in each table. Only distinct rows are wanted, so the key word ALL is omitted. @@ -1917,7 +1927,7 @@ SELECT * FROM unnest(ARRAY['a','b','c','d','e','f']) WITH ORDINALITY; WITH t AS ( - SELECT random() as x FROM generate_series(1, 3) + SELECT random() AS x FROM generate_series(1, 3) ) SELECT * FROM t UNION ALL diff --git a/doc/src/sgml/ref/select_into.sgml b/doc/src/sgml/ref/select_into.sgml index ae7e6bed24f25..cbf865ff8383c 100644 --- a/doc/src/sgml/ref/select_into.sgml +++ b/doc/src/sgml/ref/select_into.sgml @@ -27,15 +27,15 @@ SELECT [ ALL | DISTINCT [ ON ( expressionnew_table [ FROM from_item [, ...] ] [ WHERE condition ] - [ GROUP BY expression [, ...] ] + [ GROUP BY { ALL | [ ALL | DISTINCT ] grouping_element [, ...] } ] [ HAVING condition ] [ WINDOW window_name AS ( window_definition ) [, ...] ] [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select ] [ ORDER BY expression [ ASC | DESC | USING operator ] [ NULLS { FIRST | LAST } ] [, ...] ] [ LIMIT { count | ALL } ] [ OFFSET start [ ROW | ROWS ] ] - [ FETCH { FIRST | NEXT } [ count ] { ROW | ROWS } ONLY ] - [ FOR { UPDATE | SHARE } [ OF table_name [, ...] ] [ NOWAIT ] [...] 
] + [ FETCH { FIRST | NEXT } [ count ] { ROW | ROWS } { ONLY | WITH TIES } ] + [ FOR { UPDATE | NO KEY UPDATE | SHARE | KEY SHARE } [ OF from_reference [, ...] ] [ NOWAIT | SKIP LOCKED ] [...] ] @@ -120,8 +120,8 @@ SELECT [ ALL | DISTINCT [ ON ( expressionExamples - Create a new table films_recent consisting of only - recent entries from the table films: + Create a new table films_recent consisting of only + recent entries from the table films: SELECT * INTO films_recent FROM films WHERE date_prod >= '2002-01-01'; diff --git a/doc/src/sgml/ref/set.sgml b/doc/src/sgml/ref/set.sgml index 2218f54682ec3..16c7e9a7b2651 100644 --- a/doc/src/sgml/ref/set.sgml +++ b/doc/src/sgml/ref/set.sgml @@ -21,8 +21,8 @@ PostgreSQL documentation -SET [ SESSION | LOCAL ] configuration_parameter { TO | = } { value | 'value' | DEFAULT } -SET [ SESSION | LOCAL ] TIME ZONE { value | 'value' | LOCAL | DEFAULT } +SET [ SESSION | LOCAL ] configuration_parameter { TO | = } { value [, ...] | DEFAULT } +SET [ SESSION | LOCAL ] TIME ZONE { value | LOCAL | DEFAULT } @@ -123,7 +123,7 @@ SET [ SESSION | LOCAL ] TIME ZONE { valueconfiguration_parameter - Name of a settable run-time parameter. Available parameters are + Name of a settable configuration parameter. Available parameters are documented in and below. @@ -133,9 +133,12 @@ SET [ SESSION | LOCAL ] TIME ZONE { valuevalue - New value of parameter. Values can be specified as string + New value of the parameter. Values can be specified as string constants, identifiers, numbers, or comma-separated lists of these, as appropriate for the particular parameter. + Values that are neither numbers nor valid identifiers must be quoted. + If the parameter accepts a list of values, NULL + can be written to specify an empty list. DEFAULT can be written to specify resetting the parameter to its default value (that is, whatever value it would have had if no SET had been executed @@ -283,6 +286,19 @@ SELECT setseed(value); Set the schema search path: SET search_path TO my_schema, public; + + Note that this is not the same as + +SET search_path TO 'my_schema, public'; + + which would have the effect of setting search_path + to contain a single, probably-nonexistent schema name. + + + + Set the list of temporary tablespace names to be empty: + +SET temp_tablespaces TO NULL; diff --git a/doc/src/sgml/ref/truncate.sgml b/doc/src/sgml/ref/truncate.sgml index 9d846f88c9f60..d6efa286e993e 100644 --- a/doc/src/sgml/ref/truncate.sgml +++ b/doc/src/sgml/ref/truncate.sgml @@ -182,8 +182,8 @@ TRUNCATE [ TABLE ] [ ONLY ] name [ Examples - Truncate the tables bigtable and - fattable: + Truncate the tables bigtable and + fattable: TRUNCATE bigtable, fattable; @@ -199,8 +199,8 @@ TRUNCATE bigtable, fattable RESTART IDENTITY; - Truncate the table othertable, and cascade to any tables - that reference othertable via foreign-key + Truncate the table othertable, and cascade to any tables + that reference othertable via foreign-key constraints: diff --git a/doc/src/sgml/ref/update.sgml b/doc/src/sgml/ref/update.sgml index 12ec5ba070939..b523766abe303 100644 --- a/doc/src/sgml/ref/update.sgml +++ b/doc/src/sgml/ref/update.sgml @@ -57,7 +57,8 @@ UPDATE [ ONLY ] table_name [ * ] [ to compute and return value(s) based on each row actually updated. Any expression using the table's columns, and/or columns of other tables mentioned in FROM, can be computed. - The new (post-update) values of the table's columns are used. 
+ By default, the new (post-update) values of the table's columns are used, + but it is also possible to request the old (pre-update) values. The syntax of the RETURNING list is identical to that of the output list of SELECT. @@ -502,6 +503,9 @@ UPDATE work_item SET status = 'failed' WHERE work_item.ctid = emr.ctid; This command will need to be repeated until no rows remain to be updated. + (This use of ctid is only safe because + the query is repeatedly run, avoiding the problem of changed + ctids.) Use of an ORDER BY clause allows the command to prioritize which rows will be updated; it can also prevent deadlock with other update operations if they use the same ordering. diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml index bd5dcaf86a5cc..6d0fdd43cfb31 100644 --- a/doc/src/sgml/ref/vacuum.sgml +++ b/doc/src/sgml/ref/vacuum.sgml @@ -512,7 +512,7 @@ VACUUM [ ( option [, ...] ) ] [ Examples - To clean a single table onek, analyze it for + To clean a single table onek, analyze it for the optimizer and print a detailed vacuum activity report: diff --git a/doc/src/sgml/ref/vacuumdb.sgml b/doc/src/sgml/ref/vacuumdb.sgml index b0680a61814cc..508c8dfa14641 100644 --- a/doc/src/sgml/ref/vacuumdb.sgml +++ b/doc/src/sgml/ref/vacuumdb.sgml @@ -171,6 +171,16 @@ PostgreSQL documentation + + + + + Print, but do not execute, the vacuum and analyze commands that would + have been sent to the server. + + + + @@ -282,14 +292,24 @@ PostgreSQL documentation Only analyze relations that are missing statistics for a column, index - expression, or extended statistics object. This option prevents - vacuumdb from deleting existing statistics - so that the query optimizer's choices do not become transiently worse. + expression, or extended statistics object. When used with + , this option prevents + vacuumdb from temporarily replacing existing + statistics with ones generated with lower statistics targets, thus + avoiding transiently worse query optimizer choices. This option can only be used in conjunction with or . + + Note that requires + SELECT privileges on + pg_statistic + and + pg_statistic_ext_data, + which are restricted to superusers by default. + @@ -395,6 +415,15 @@ PostgreSQL documentation Multiple tables can be vacuumed by writing multiple switches. + + If no tables are specified with the option, + vacuumdb will clean all regular tables + and materialized views in the connected database. + If or + is also specified, + it will analyze all regular tables, partitioned tables, + and materialized views (but not foreign tables). + If you specify columns, you probably have to escape the parentheses diff --git a/doc/src/sgml/ref/wait_for.sgml b/doc/src/sgml/ref/wait_for.sgml new file mode 100644 index 0000000000000..3b8e842d1de98 --- /dev/null +++ b/doc/src/sgml/ref/wait_for.sgml @@ -0,0 +1,234 @@ + + + + + WAIT FOR + + + + WAIT FOR + 7 + SQL - Language Statements + + + + WAIT FOR + wait for target LSN to be replayed, optionally with a timeout + + + + +WAIT FOR LSN 'lsn' [ WITH ( option [, ...] ) ] + +where option can be: + + TIMEOUT 'timeout' + NO_THROW + + + + + Description + + + Waits until recovery replays lsn. + If no timeout is specified or it is set to + zero, this command waits indefinitely for the + lsn. + On timeout, or if the server is promoted before + lsn is reached, an error is emitted, + unless NO_THROW is specified in the WITH clause. + If NO_THROW is specified, then the command + doesn't throw errors. 
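+
+  
+   For example (an illustrative sketch of the two forms; complete, annotated
+   examples appear in the Examples section below):
+  
+
+-- wait indefinitely until the LSN has been replayed
+WAIT FOR LSN '0/306EE20';
+
+-- wait at most one second, reporting a status instead of raising an error
+WAIT FOR LSN '0/306EE20' WITH (TIMEOUT '1s', NO_THROW);
+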
+
+  The possible return values are success,
+  timeout, and not in recovery.
+
+
+
+ Parameters
+
+  
+   lsn
+   
+    Specifies the target LSN to wait for.
+   
+  
+
+  
+   WITH ( option [, ...] )
+   
+    This clause specifies optional parameters for the wait operation.
+    The following parameters are supported:
+
+    
+     TIMEOUT 'timeout'
+     
+      When specified and timeout is greater than zero,
+      the command waits until lsn is reached or
+      the specified timeout has elapsed.
+     
+     
+      The timeout can be given as an integer number of
+      milliseconds, or as a string literal containing either an integer
+      number of milliseconds or a number with a unit
+      (see ).
+     
+    
+
+    
+     NO_THROW
+     
+      Specifies that no error is to be thrown on timeout or when the
+      command is run on the primary. In this case the result status can
+      be obtained from the return value.
+     
+    
+   
+  
+ 
+
+ Outputs
+
+  
+   success
+   
+    This return value denotes that the target
+    lsn was successfully reached.
+   
+  
+
+  
+   timeout
+   
+    This return value denotes that the timeout elapsed before the target
+    lsn was reached.
+   
+  
+
+  
+   not in recovery
+   
+    This return value denotes that the database server is not in a recovery
+    state. This might mean either that the database server was not in
+    recovery at the moment it received the command, or that it was promoted
+    before reaching the target lsn.
+   
+  
+ 
+
+ Notes
+
+  
+   The WAIT FOR command waits until the given
+   lsn has been replayed on the standby.
+   That is, after this command completes, the value returned by
+   pg_last_wal_replay_lsn should be greater than or
+   equal to the lsn value. This is useful for
+   achieving read-your-writes consistency when using an asynchronous
+   replica for reads and the primary for writes. In that case, the
+   lsn of the last modification should be stored
+   on the client application side or the connection pooler side.
+  
+
+  
+   The WAIT FOR command should be run on a standby.
+   If a user runs WAIT FOR on a primary, it
+   will error out unless NO_THROW is specified in the WITH clause.
+   However, if WAIT FOR is run on a primary that was
+   promoted from a standby and the lsn has already
+   been replayed, then the WAIT FOR command returns
+   immediately.
+  
+ 
+
+ Examples
+
+  
+   You can use the WAIT FOR command to wait for a
+   pg_lsn value. For example, an application could update
+   the movie table and obtain the lsn
+   corresponding to the changes just made. This example uses
+   pg_current_wal_insert_lsn on the primary server to
+   obtain the lsn, given that
+   synchronous_commit could be set to
+   off.
+  
+
+
+postgres=# UPDATE movie SET genre = 'Dramatic' WHERE genre = 'Drama';
+UPDATE 100
+postgres=# SELECT pg_current_wal_insert_lsn();
+ pg_current_wal_insert_lsn
+----------------------------
+ 0/306EE20
+(1 row)
+
+
+  
+   The application could then run WAIT FOR with the
+   lsn obtained from the primary. After that, the
+   changes made on the primary are guaranteed to be visible on the replica.
+  
+
+
+postgres=# WAIT FOR LSN '0/306EE20';
+ status
+--------
+ success
+(1 row)
+postgres=# SELECT * FROM movie WHERE genre = 'Drama';
+ genre
+-------
+(0 rows)
+
+
+  
+   If the target LSN is not reached before the timeout, an error is thrown.
+  
+
+
+postgres=# WAIT FOR LSN '0/306EE20' WITH (TIMEOUT '0.1s');
+ERROR: timed out while waiting for target LSN 0/306EE20 to be replayed; current replay LSN 0/306EA60
+
+
+  
+   The same example, this time using WAIT FOR with the
+   NO_THROW option:
+  
+ +postgres=# WAIT FOR LSN '0/306EE20' WITH (TIMEOUT '100ms', NO_THROW); + status +-------- + timeout +(1 row) + + + + diff --git a/doc/src/sgml/reference.sgml b/doc/src/sgml/reference.sgml index ff85ace83fc48..2cf02c37b17bd 100644 --- a/doc/src/sgml/reference.sgml +++ b/doc/src/sgml/reference.sgml @@ -216,6 +216,7 @@ &update; &vacuum; &values; + &waitFor; diff --git a/doc/src/sgml/regress.sgml b/doc/src/sgml/regress.sgml index bf4ffb3057636..d80dd46c5fdb9 100644 --- a/doc/src/sgml/regress.sgml +++ b/doc/src/sgml/regress.sgml @@ -125,6 +125,18 @@ make installcheck-parallel + + Running Specific Tests + + + A subset of the regression tests can be run with the command + make check-tests TESTS="boolean char" or + make installcheck-tests TESTS="boolean char". + Note that sometimes tests have dependencies on objects created by other + tests, which can cause unexpected failures. + + + Additional Test Suites @@ -254,7 +266,7 @@ make check-world -j8 >/dev/null Some test suites are not run by default, either because they are not secure to run on a multiuser system, because they require special software or - because they are resource intensive. You can decide which test suites to + because they are resource-intensive. You can decide which test suites to run additionally by setting the make or environment variable PG_TEST_EXTRA to a whitespace-separated list, for example: @@ -285,75 +297,88 @@ make check-world PG_TEST_EXTRA='kerberos ldap ssl load_balance libpq_encryption' - sepgsql + libpq_encryption - Runs the test suite under contrib/sepgsql. This - requires an SELinux environment that is set up in a specific way; see - . + Runs the test src/interfaces/libpq/t/005_negotiate_encryption.pl. + This opens TCP/IP listen sockets. If PG_TEST_EXTRA + also includes kerberos, additional tests that require + an MIT Kerberos installation are enabled. - ssl + load_balance - Runs the test suite under src/test/ssl. This opens TCP/IP listen sockets. + Runs the test src/interfaces/libpq/t/004_load_balance_dns.pl. + This requires editing the system hosts file and + opens TCP/IP listen sockets. - load_balance + oauth - Runs the test src/interfaces/libpq/t/004_load_balance_dns.pl. - This requires editing the system hosts file and - opens TCP/IP listen sockets. + Runs the test suite under src/test/modules/oauth_validator. + This opens TCP/IP listen sockets for a test server running HTTPS. - libpq_encryption + regress_dump_restore - Runs the test src/interfaces/libpq/t/005_negotiate_encryption.pl. - This opens TCP/IP listen sockets. If PG_TEST_EXTRA - also includes kerberos, additional tests that require - an MIT Kerberos installation are enabled. + Runs an additional test suite in + src/bin/pg_upgrade/t/002_pg_upgrade.pl which + cycles the regression database through pg_dump/ + pg_restore. Not enabled by default because it + is resource-intensive. - wal_consistency_checking + sepgsql - Uses wal_consistency_checking=all while running - certain tests under src/test/recovery. Not - enabled by default because it is resource intensive. + Runs the test suite under contrib/sepgsql. This + requires an SELinux environment that is set up in a specific way; see + . - xid_wraparound + ssl - Runs the test suite under src/test/modules/xid_wraparound. - Not enabled by default because it is resource intensive. + Runs the test suite under src/test/ssl. This opens TCP/IP listen sockets. - oauth + wal_consistency_checking - Runs the test suite under src/test/modules/oauth_validator. 
- This opens TCP/IP listen sockets for a test server running HTTPS. + Uses wal_consistency_checking=all while running + certain tests under src/test/recovery. Not + enabled by default because it is resource-intensive. + + + + + + xid_wraparound + + + Runs the test suite under src/test/modules/xid_wraparound. + Not enabled by default because it is resource-intensive. @@ -892,6 +917,14 @@ PG_TEST_NOCLEAN=1 make -C src/bin/pg_dump check PG_TEST_TIMEOUT_DEFAULT to a higher number will change the default to avoid this. + + + For certain tests, the environment variable + PG_TEST_FILE_READ_LINES can be set to limit the number of + lines read from large output files (head and tail). This is useful when + the test output contains a lot of unnecessary content, allowing the test + framework to read only a limited number of lines for its reports. + diff --git a/doc/src/sgml/release-18.sgml b/doc/src/sgml/release-18.sgml deleted file mode 100644 index 2ae03065f9451..0000000000000 --- a/doc/src/sgml/release-18.sgml +++ /dev/null @@ -1,3534 +0,0 @@ - - - - - Release 18 - - - Release date: - 2025-??-??, CURRENT AS OF 2025-05-23 - - - - Overview - - - PostgreSQL 18 contains many new features - and enhancements, including: - - - - - - - (to be completed) - - - - - - The above items and other new features of - PostgreSQL 18 are explained in more detail - in the sections below. - - - - - - - Migration to Version 18 - - - A dump/restore using or use of - or logical replication is required for - those wishing to migrate data from any previous release. See for general information on migrating to new - major releases. - - - - Version 18 contains a number of changes that may affect compatibility - with previous releases. Observe the following incompatibilities: - - - - - - - - -Change time zone abbreviation handling (Tom Lane) -§ - - - -The system will now favor the current session's time zone abbreviations before checking the server variable timezone_abbreviations. Previously timezone_abbreviations was -checked first. - - - - - - - -Deprecate MD5 password authentication (Nathan Bossart) -§ - - - -Warnings generated by their use can be disabled by the server variable md5_password_warnings. - - - - - - - -Change VACUUM and ANALYZE to process the inheritance children of a parent (Michael Harris) -§ - - - -The previous behavior can be performed by using the new ONLY option. - - - - - - - -Prevent COPY FROM from treating \. as an end-of-file marker when reading CSV files (Daniel Vérité, Tom Lane) -§ -§ - - - -psql will still treat \. as an end-of-file marker when reading CSV files from STDIN. Older psql clients connecting to Postgres 18 servers might experience \copy problems. This -release also enforces that \. must appear alone on a line. - - - - - - - -Disallow unlogged partitioned tables (Michael Paquier) -§ - - - -Previously ALTER TABLE SET [UN]LOGGED did nothing, and the creation of an unlogged partitioned table did not cause its children to be unlogged. - - - - - - - -Remove non-functional support for RULE privileges in GRANT/REVOKE (Fujii Masao) -§ - - - -These have been non-functional since Postgres 8.2. - - - - - - - -Remove column pg_backend_memory_contexts.parent (Melih Mutlu) -§ - - - -This is now longer needed since pg_backend_memory_contexts.path was added. - - - - - - - -Change pg_backend_memory_contexts.level and pg_log_backend_memory_contexts() to be one-based (Melih Mutlu, Atsushi Torikoshi, David Rowley, Fujii Masao) -§ -§ -§ - - - -These were previously zero-based. 
- - - - - - - - - Changes - - - Below you will find a detailed account of the changes between - PostgreSQL 18 and the previous major - release. - - - - Server - - - Optimizer - - - - - - - -Remove some unnecessary table self-joins (Andrey Lepikhov, Alexander Kuzmenkov, Alexander Korotkov, Alena Rybakina) -§ - - - -This optimization can be disabled using server variable enable_self_join_elimination. - - - - - - - -Convert some 'IN (VALUES ...)' to 'x = ANY ...' for better optimizer statistics (Alena Rybakina, Andrei Lepikhov) -§ - - - - - - - -Allow transforming OR-clauses to arrays for faster index processing (Alexander Korotkov, Andrey Lepikhov) -§ - - - - - - - -Speed up the processing of INTERSECT, EXCEPT, window aggregates, and view column aliases (Tom Lane, David Rowley) -§ -§ -§ -§ - - - - - - - -Allow the keys of SELECT DISTINCT to be internally reordered to avoid sorting (Richard Guo) -§ - - - -This optimization can be disabled using enable_distinct_reordering. - - - - - - - -Ignore GROUP BY columns that are functionally dependent on other columns (Zhang Mingli, Jian He, David Rowley) -§ - - - -If a GROUP BY clause includes all columns of a unique index, as well as other columns of the same table, those other columns are redundant and can be dropped -from the grouping. This was already true for non-deferred primary keys. - - - - - - - -Allow some HAVING clauses on GROUPING SETS to be pushed to WHERE clauses (Richard Guo) -§ -§ -§ -§ - - - -This allows earlier row filtering. This release also fixes some GROUPING SETS queries that used to return incorrect results. - - - - - - - -Improve row estimates for generate_series() using numeric and timestamp values (David Rowley, Song Jinzhou) -§ -§ - - - - - - - -Allow the optimizer to use "Right Semi Join" plans (Richard Guo) -§ - - - -Semi-joins are used when needing to find if there is at least one match. - - - - - - - -Allow merge joins to use incremental sorts (Richard Guo) -§ - - - - - - - -Improve the efficiency of planning queries accessing many partitions (Ashutosh Bapat, Yuya Watari, David Rowley) -§ -§ - - - - - - - -Allow partitionwise joins in more cases, and reduce its memory usage (Richard Guo, Tom Lane, Ashutosh Bapat) -§ -§ - - - - - - - -Improve cost estimates of partition queries (Nikita Malakhov, Andrei Lepikhov) -§ - - - - - - - -Improve SQL-language function plan caching (Alexander Pyhalov, Tom Lane) -§ -§ - - - - - - - -Improve handling of disabled optimizer features (Robert Haas) -§ - - - - - - - - - Indexes - - - - - - - -Allow skip scans of btree indexes (Peter Geoghegan) -§ -§ - - - -This allows multi-column btree indexes to be used by queries that only -equality-reference the second or later indexed columns. - - - - - - - -Allow non-btree unique indexes to be used as partition keys and in materialized views (Mark Dilger) -§ -§ - - - -The index type must still support equality. - - - - - - - -Allow GIN indexes to be created in parallel (Tomas Vondra, Matthias van de Meent) -§ - - - - - - - -Allow values to be sorted to speed rangetype GiST and btree index builds (Bernd Helmle) -§ - - - - - - - - - General Performance - - - - - - - -Add an asynchronous I/O subsystem (Andres Freund, Thomas Munro, Nazir Bilal Yavuz, Melanie Plageman) -§ -§ -§ -§ -§ -§ -§ -§ -§ -§ -§ - - - -This is enabled by server variable io_method, with server variables io_combine_limit and io_max_combine_limit added to control it. 
This also enables -effective_io_concurrency and maintenance_io_concurrency values greater than zero for systems without fadvise() support. The new system view pg_aios shows the file handles being used -for asynchronous I/O. - - - - - - - -Improve the locking performance of queries that access many relations (Tomas Vondra) -§ - - - - - - - -Improve the performance and reduce memory usage of hash joins and GROUP BY (David Rowley, Jeff Davis) -§ -§ -§ -§ -§ - - - -This also improves hash set operations used by EXCEPT, and hash lookups of subplan values. - - - - - - - -Allow normal vacuums to freeze some pages, even though they are all-visible (Melanie Plageman) -§ -§ - - - -This reduces the overhead of later full-relation freezing. The aggressiveness of this can be controlled by server variable and per-table setting vacuum_max_eager_freeze_failure_rate. -Previously vacuum never processed all-visible pages until freezing was required. - - - - - - - -Add server variable vacuum_truncate to control file truncation during VACUUM (Nathan Bossart, Gurjeet Singh) -§ - - - -A storage-level parameter with the same name and behavior already existed. - - - - - - - -Increase server variables effective_io_concurrency's and maintenance_io_concurrency's default values to 16 (Melanie Plageman) -§ -§ - - - -This more accurately reflects modern hardware. - - - - - - - - - Monitoring - - - - - - - -Increase the logging granularity of server variable log_connections (Melanie Plageman) -§ -§ - - - -This server variable was previously only boolean; these options are still supported. - - - - - - - -Add log_line_prefix escape "%L" to output the client IP address (Greg Sabino Mullane) -§ - - - - - - - -Add server variable log_lock_failure to log lock acquisition failures (Yuki Seino) -§ - - - -Specifically it reports SELECT ... NOWAIT lock failures. - - - - - - - -Modify pg_stat_all_tables and its variants to report the time spent in vacuum, analyze, and their automatic variants (Sami Imseih) -§ - - - -The new columns are total_vacuum_time, total_autovacuum_time, total_analyze_time, and total_autoanalyze_time. - - - - - - - -Add delay time reporting to VACUUM and ANALYZE (Bertrand Drouvot, Nathan Bossart) -§ -§ - - - -This information appears in the autovacuum logs, the system views pg_stat_progress_vacuum and pg_stat_progress_analyze, and the output of VACUUM and ANALYZE when in VERBOSE -mode; tracking must be enabled with the server variable track_cost_delay_timing. - - - - - - - -Add per-backend I/O statistics reporting (Bertrand Drouvot) -§ -§ - - - -The statistics are accessed via pg_stat_get_backend_io(). Per-backend I/O statistics can be cleared via pg_stat_reset_backend_stats(). - - - - - - - -Add pg_stat_io columns to report I/O activity in bytes (Nazir Bilal Yavuz) -§ - - - -The new columns are read_bytes, write_bytes, and extend_bytes. The op_bytes column, which always equaled BLCKSZ, has been removed. - - - - - - - -Add WAL I/O activity rows to pg_stat_io (Nazir Bilal Yavuz, Bertrand Drouvot, Michael Paquier) -§ -§ -§ - - - -This includes WAL receiver activity and a wait event for such writes. - - - - - - - - -Change server variable track_wal_io_timing to control tracking WAL timing in pg_stat_io instead of pg_stat_wal (Bertrand Drouvot) -§ - - - - - - - -Remove read/sync columns from pg_stat_wal (Bertrand Drouvot) -§ -§ - - - -This removes columns wal_write, wal_sync, wal_write_time, and wal_sync_time. 
- - - - - - - -Add function pg_stat_get_backend_wal() to return per-backend WAL statistics (Bertrand Drouvot) -§ - - - -Per-backend WAL statistics can be cleared via pg_stat_reset_backend_stats(). - - - - - - - -Add function pg_ls_summariesdir() to specifically list the contents of PGDATA/pg_wal/summaries (Yushi Ogiwara) -§ - - - - - - - -Add column pg_stat_checkpointer.num_done to report the number of completed checkpoints (Anton A. Melnikov) -§ - - - -Columns num_timed and num_requested count both completed and skipped checkpoints. - - - - - - - -Add column pg_stat_checkpointer.slru_written to report SLRU buffers written (Nitin Jadhav) -§ - - - -Also, modify the checkpoint server log message to report separate shared buffer and SLRU buffer values. - - - - - - - -Add columns to pg_stat_database to report parallel workers activity (Benoit Lobréau) -§ - - - -The new columns are parallel_workers_to_launch and parallel_workers_launched. - - - - - - - -Have query jumbling of arrays consider only the first and last array elements (Dmitry Dolgov, Sami Imseih) -§ -§ - - - -Jumbling is used by pg_stat_statements. - - - - - - - -Adjust query jumbling to group together queries using the same relation name (Michael Paquier, Sami Imseih) -§ - - - -This is true even if the tables in different schemas have different column names. - - - - - - - -Add column pg_backend_memory_contexts.type to report the type of memory context (David Rowley) -§ - - - - - - - -Add column pg_backend_memory_contexts.path to show memory context parents (Melih Mutlu) -§ - - - - - - - - - Privileges - - - - - - - -Add function pg_get_acl() to retrieve database access control details (Joel Jacobson) -§ -§ - - - - - - - -Add function has_largeobject_privilege() to check large object privileges (Yugo Nagata) -§ - - - - - - - -Allow ALTER DEFAULT PRIVILEGES to define large object default privileges (Takatsuka Haruka, Yugo Nagata, Laurenz Albe) -§ - - - - - - - -Add predefined role pg_signal_autovacuum_worker (Kirill Reshke) -§ - - - -This allows sending signals to autovacuum workers. - - - - - - - - - Server Configuration - - - - - - - -Add support for the OAuth authentication method (Jacob Champion, Daniel Gustafsson, Thomas Munro) -§ - - - -This adds an "oauth" authentication method to pg_hba.conf, libpq OAuth options, a server variable oauth_validator_libraries to load token validation libraries, and -a configure flag --with-libcurl to add the required compile-time libraries. - - - - - - - -Add server variable ssl_tls13_ciphers to allow specification of multiple colon-separated TLSv1.3 cipher suites (Erica Zhang, Daniel Gustafsson) -§ - - - - - - - -Change server variable ssl_groups's default to include elliptic curve X25519 (Daniel Gustafsson, Jacob Champion) -§ - - - - - - - -Rename server variable ssl_ecdh_curve to ssl_groups and allow multiple colon-separated ECDH curves to be specified (Erica Zhang, Daniel Gustafsson) -§ - - -The previous name still works. - - - - - - - -Add function pg_check_fipsmode() to report the server's FIPS mode (Daniel Gustafsson) -§ - - - - - - - -Make cancel request keys 256 bits (Heikki Linnakangas, Jelte Fennema-Nio) -§ -§ - - - -This is only possible when the server and client support wire protocol version 3.2, introduced in this release. - - - - - - - -Add server variable autovacuum_worker_slots to specify the maximum number of background workers (Nathan Bossart) -§ - - - -With this variable set, autovacuum_max_workers can be adjusted at runtime up to this maximum without a server restart. 
- - - - - - - -Allow specification of the fixed number of dead tuples that will trigger an autovacuum (Nathan Bossart, Frédéric Yhuel) -§ - - - -The server variable is autovacuum_vacuum_max_threshold. Percentages are still used for triggering. - - - - - - - -Change server variable max_files_per_process to limit only files opened by a backend (Andres Freund) -§ - - - -Previously files opened by the postmaster were also counted toward this limit. - - - - - - - -Add server variable num_os_semaphores to report the required number of semaphores (Nathan Bossart) -§ - - - -This is useful for operating system configuration. - - - - - - - -Add server variable extension_control_path to specify the location of extension control files (Peter Eisentraut, Matheus Alcantara) -§ -§ - - - - - - - - - Streaming Replication and Recovery - - - - - - - -Allow inactive replication slots to be automatically invalided using server variable idle_replication_slot_timeout (Nisha Moond, Bharath Rupireddy) -§ - - - - - - - -Add server variable max_active_replication_origins to control the maximum active replication origins (Euler Taveira) -§ - - - -This was previously controlled by max_replication_slots, but this new setting allows a higher origin count in cases where fewer slots are required. - - - - - - - - - <link linkend="logical-replication">Logical Replication</link> - - - - - - - -Allow the values of generated columns to be logically replicated (Shubham Khanna, Vignesh C, Zhijie Hou, Shlok Kyal, Peter Smith) -§ -§ -§ -§ - - - -If the publication specifies a column list, all specified columns, generated and non-generated, are published. Without a specified column list, publication option publish_generated_columns -controls whether generated columns are published. Previously generated columns were not replicated and the subscriber had to compute the values if possible; this is particularly -useful for non-Postgres subscribers which lack such a capability. - - - - - - - -Change the default CREATE SUBSCRIPTION streaming option from "off" to "parallel" (Vignesh C) -§ - - - - - - - -Allow ALTER SUBSCRIPTION to change the replication slot's two-phase commit behavior (Hayato Kuroda, Ajin Cherian, Amit Kapila, Zhijie Hou) -§ -§ - - - - - - - -Log conflicts while applying logical replication changes (Zhijie Hou, Nisha Moond) -§ -§ -§ -§ -§ - - - -Also report in new columns of pg_stat_subscription_stats. - - - - - - - - - - - Utility Commands - - - - - - - -Allow generated columns to be virtual, and make them the default (Peter Eisentraut, Jian He, Richard Guo, Dean Rasheed) -§ -§ -§ - - - -Virtual generated columns generate their values when the columns are read, not written. The write behavior can still be specified via the STORED option. - - - - - - - -Add OLD/NEW support to RETURNING in DML queries (Dean Rasheed) -§ - - - -Previously RETURNING only returned new values for INSERT and UPDATE, and old values for DELETE; MERGE would return the appropriate value for the internal query executed. This new syntax -allows the RETURNING list of INSERT/UPDATE/DELETE/MERGE to explicitly return old and new values by using the special aliases "old" and "new". These aliases can be renamed to -avoid identifier conflicts. - - - - - - - -Allow foreign tables to be created like existing local tables (Zhang Mingli) -§ - - - -The syntax is CREATE FOREIGN TABLE ... LIKE. 
- - - - - - - -Allow LIKE with nondeterministic collations (Peter Eisentraut) -§ - - - - - - - -Allow text position search functions with nondeterministic collations (Peter Eisentraut) -§ - - - -These used to generate an error. - - - - - - - -Add builtin collation provider PG_UNICODE_FAST (Jeff Davis) -§ - - - -This locale supports case mapping, but sorts in code point order, not natural language order. - - - - - - - -Allow VACUUM and ANALYZE to process partitioned tables without processing their children (Michael Harris) -§ - - - -This is enabled with the new ONLY option. This is useful since autovacuum does not process partitioned tables, just its children. - - - - - - - -Add functions to modify per-relation and per-column optimizer statistics (Corey Huinker) -§ -§ -§ - - - -The functions are pg_restore_relation_stats(), pg_restore_attribute_stats(), pg_clear_relation_stats(), and pg_clear_attribute_stats. - - - - - - - - -Add server variable file_copy_method to control the file copying method (Nazir Bilal Yavuz) -§ - - - -This controls whether CREATE DATABASE ... STRATEGY=FILE_COPY and ALTER DATABASE ... SET TABLESPACE uses file copy or clone. - - - - - - - <link linkend="ddl-constraints">Constraints</link> - - - - - - - -Allow the specification of non-overlapping PRIMARY KEY and UNIQUE constraints (Paul A. Jungwirth) -§ - - - -This is specified by WITHOUT OVERLAPS on the last column. - - - - - - - -Allow CHECK and foreign key constraints to be specified as NOT ENFORCED (Amul Sul) -§ -§ - - - -This also adds column pg_constraint.conenforced. - - - - - - - -Require primary/foreign key relationships to use either deterministic collations or the the same nondeterministic collations (Peter Eisentraut) -§ - - - -The restore of a pg_dump, also used by pg_upgrade, will fail if these requirements are not met; schema changes must be made for these upgrade methods to succeed. - - - - - - - -Store column NOT NULL specifications in pg_constraint (Álvaro Herrera, Bernd Helmle) -§ - - - -This allows names to be specified for NOT NULL constraint. This also adds NOT NULL constraints to foreign tables and NOT NULL inheritance control to local tables. - - - - - - - -Allow ALTER TABLE to set the NOT VALID attribute of NOT NULL constraints (Rushabh Lathia, Jian He) -§ - - - - - - - -Allow modification of the inheritability of NOT NULL constraints (Suraj Kharage, Álvaro Herrera) -§ -§ - - - -The syntax is ALTER TABLE ... ALTER CONSTRAINT ... [NO] INHERIT. - - - - - - - -Allow NOT VALID foreign key constraints on partitioned tables (Amul Sul) -§ - - - - - - - -Allow dropping of constraints ONLY on partitioned tables (Álvaro Herrera) -§ - - - -This was previously erroneously prohibited. - - - - - - - - <link linkend="sql-copy"><command>COPY</command></link> - - - - - - - -Add REJECT_LIMIT to control the number of invalid rows COPY FROM can ignore (Atsushi Torikoshi) -§ - - - -This is available when ON_ERROR = 'ignore'. - - - - - - - -Allow COPY TO to copy rows from populated materialized view (Jian He) -§ - - - - - - - -Add COPY LOG_VERBOSITY level "silent" to suppress log output of ignored rows (Atsushi Torikoshi) -§ - - - -This new level suppresses output for discarded input rows when on_error = 'ignore'. - - - - - - - -Disallow COPY FREEZE on foreign tables (Nathan Bossart) -§ - - - -Previously, the COPY worked but the FREEZE was ignored, so disallow this command. 
- - - - - - - - <link linkend="sql-explain"><command>EXPLAIN</command></link> - - - - - - - -Automatically include BUFFERS output in EXPLAIN ANALYZE (Guillaume Lelarge, David Rowley) -§ - - - - - - - -Add WAL, CPU, and average read statistics output to EXPLAIN ANALYZE VERBOSE (Anthonin Bonnefoy) -§ -§ - - - - - - - -Add full WAL buffer count to EXPLAIN (WAL), VACUUM/ANALYZE (VERBOSE), and autovacuum log output (Bertrand Drouvot) -§ -§ - - - - - - - -In EXPLAIN ANALYZE, report the number of index lookups used per index scan node (Peter Geoghegan) -§ - - - - - - - -Modify EXPLAIN to output fractional row counts (Ibrar Ahmed, Ilia Evdokimov, Robert Haas) -§ -§ - - - - - - - -Add memory and disk usage details to Material, Window Aggregate, and common table expression nodes in EXPLAIN (David Rowley, Tatsuo Ishii) -§ -§ -§ -§ - - - - - - - - -Add details about window function arguments to EXPLAIN output (Tom Lane) -§ - - - - - - - -Add "Parallel Bitmap Heap Scan" worker cache statistics to EXPLAIN ANALYZE (David Geier, Heikki Linnakangas, Donghang Lin, Alena Rybakina, David Rowley) -§ - - - - - - - -Indicate disabled nodes in EXPLAIN ANALYZE output (Robert Haas, David Rowley, Laurenz Albe) -§ -§ -§ - - - - - - - - - - - Data Types - - - - - - - -Improve Unicode full case mapping and conversion (Jeff Davis) -§ -§ - - - -This adds the ability to do conditional and title case mapping, and case map single characters to multiple characters. - - - - - - - -Allow jsonb "null" values to be cast to scalar types as NULL (Tom Lane) -§ - - - -Previously such casts generated an error. - - - - - - - -Add optional parameter to json{b}_strip_nulls to allow removal of null array elements (Florents Tselai) -§ - - - - - - - -Add function array_sort() which sorts an array's first dimension (Junwang Zhao, Jian He) -§ - - - - - - - -Add function array_reverse() which reverses an array's first dimension (Aleksander Alekseev) -§ - - - - - - - -Add function reverse() to reverse bytea bytes (Aleksander Alekseev) -§ - - - - - - - -Allow casting between integer types and bytea (Aleksander Alekseev) -§ - - - -The integer values are stored as bytea two's complement values. - - - - - - - -Update Unicode data to Unicode 16.0.0 (Peter Eisentraut) -§ - - - - - - - -Add full text search stemming for Estonian (Tom Lane) -§ - - - - - - - -Improve the XML error codes to more closely match the SQL standard (Tom Lane) -§ - - - -These errors are reported via SQLSTATE. - - - - - - - - - Functions - - - - - - - -Add function CASEFOLD() to allow for more sophisticated case-insensitive matching (Jeff Davis) -§ - - - -Allows more accurate comparison, i.e., a character can have multiple upper or lower case equivalents, or upper or lower case conversion changes the number of characters. - - - - - - - -Allow MIN()/MAX() aggregates on arrays and composite types (Aleksander Alekseev, Marat Buharov) -§ -§ - - - - - - - -Add a WEEK option to EXTRACT() (Tom Lane) -§ - - - - - - - -Improve the output EXTRACT(QUARTER ...) for negative values (Tom Lane) -§ - - - - - - - -Add roman numeral support to to_number() (Hunaid Sohail) -§ - - - -This is accessed via the "RN" pattern. - - - - - - - -Add UUID version 7 generation function uuidv7() (Andrey Borodin) -§ - - - -This UUID value is temporally sortable. Function alias uuidv4() has been added to explicitly generate version 4 UUIDs. 
- - - - - - - -Add functions crc32() and crc32c() to compute CRC values (Aleksander Alekseev) -§ - - - - - - - -Add math functions gamma() and lgamma() (Dean Rasheed) -§ - - - - - - - -Allow "=>" syntax for named cursor arguments in plpgsql (Pavel Stehule) -§ - - - -We previously only accepted ":=". - - - - - - - -Allow regexp_match[es]/regexp_like/regexp_replace/regexp_count/regexp_instr/regexp_substr/regexp_split_to_table/regexp_split_to_array() to use named arguments (Jian He) -§ - - - - - - - - - <link linkend="libpq">libpq</link> - - - - - - - -Add function PQfullProtocolVersion() to report the full, including minor, protocol version number (Jacob Champion, Jelte Fennema-Nio) -§ - - - - - - - -Add libpq connection parameters and environment variables to specify the minimum and maximum acceptable protocol version for connections (Jelte Fennema-Nio) -§ -§ - - - - - - - -Add libpq function PQservice() to return the connection service name (Michael Banck) -§ - - - - - - - -Report search_path changes to the client (Alexander Kukushkin, Jelte Fennema-Nio, Tomas Vondra) -§ -§ - - - - - - - -Add PQtrace() output for all message types, including authentication (Jelte Fennema-Nio) -§ -§ -§ -§ -§ - - - - - - - -Add libpq connection parameter sslkeylogfile which dumps out SSL key material (Abhishek Chanda, Daniel Gustafsson) -§ - - - -This is useful for debugging. - - - - - - - -Modify some libpq function signatures to use int64_t (Thomas Munro) -§ - - - -These previously used pg_int64, which is now deprecated. - - - - - - - - - <xref linkend="app-psql"/> - - - - - - - -Allow psql to parse, bind, and close named prepared statements (Anthonin Bonnefoy, Michael Paquier) -§ - - - -This is accomplished with new commands \parse, \bind_named, and \close. - - - - - - - -Add psql backslash commands to allowing issuance of pipeline queries (Anthonin Bonnefoy) -§ -§ -§ - - - -The new commands are \startpipeline, \syncpipeline, \sendpipeline, \endpipeline, \flushrequest, \flush, and \getresults. - - - - - - - -Allow adding pipeline status to the psql prompt and add related state variables (Anthonin Bonnefoy) -§ - - - -The new prompt character is "%P" and the new psql variables are PIPELINE_SYNC_COUNT, PIPELINE_COMMAND_COUNT, and PIPELINE_RESULT_COUNT. - - - - - - - -Allow adding the connection service name to the psql prompt or access it via psql variable (Michael Banck) -§ - - - - - - - -Add psql option to use expanded mode on all list commands (Dean Rasheed) -§ - - - -Adding 'x' enables this. - - - - - - - -Change psql's \conninfo to use tabular format and include more information (Álvaro Herrera, Maiquel Grassi, Hunaid Sohail) -§ - - - - - - - -Add function's leakproof indicator to psql's \df+, \do+, \dAo+, and \dC+ outputs (Yugo Nagata) -§ - - - - - - - -Add access method details for partitioned relations in \dP+ (Justin Pryzby) -§ - - - - - - - -Add "default_version" to the psql \dx extension output (Magnus Hagander) -§ - - - - - - - -Add psql variable WATCH_INTERVAL to set the default \watch wait time (Daniel Gustafsson) -§ - - - - - - - - - Server Applications - - - - - - - -Change initdb to default to enabling checksums (Greg Sabino Mullane) -§ -§ - - - -The new initdb option --no-data-checksums disables checksums. - - - - - - - -Add initdb option --no-sync-data-files to avoid syncing heap/index files (Nathan Bossart) -§ - - - -initdb --no-sync is still available to avoid syncing any files. 
- - - - - - - -Add vacuumdb option --missing-stats-only to compute only missing optimizer statistics (Corey Huinker, Nathan Bossart) -§ -§ - - - -This option can only be used by --analyze-only and --analyze-in-stages. - - - - - - - -Add pg_combinebackup option -k/--link to enable hard linking (Israel Barth Rubio, Robert Haas) -§ - - - -Only some files can be hard linked. This should not be used if the backups will be used independently. - - - - - - - -Allow pg_verifybackup to verify tar-format backups (Amul Sul) -§ - - - - - - - -If pg_rewind's --source-server specifies a database name, use it in --write-recovery-conf output (Masahiko Sawada) -§ - - - - - - - -Add pg_resetwal option --char-signedness to change the default char signedness (Masahiko Sawada) -§ - - - - - - - - <link - linkend="app-pgdump"><application>pg_dump</application></link>/<link - linkend="app-pg-dumpall"><application>pg_dumpall</application></link>/<link - linkend="app-pgrestore"><application>pg_restore</application></link> - - - - - - - -Allow pg_dumpall to dump in the same output formats as pg_dump supports (Mahendra Singh Thalor, Andrew Dunstan) -§ - - - -Also modify pg_restore to handle such dumps. Previously pg_dumpall only supported text format. - - - - - - - -Add pg_dump options --with-schema, --with-data, and --with-statistics (Jeff Davis) -§ - - - - - - - -Add pg_dump and pg_dumpall option --sequence-data to dump sequence data that would normally be excluded (Nathan Bossart) -§ -§ - - - - - - - -Add pg_dump, pg_dumpall, and pg_restore options --statistics-only, --no-statistics, --no-data, and --no-schema (Corey Huinker, Jeff Davis) -§ - - - - - - - -Add option --no-policies to disable row level security policy processing in pg_dump, pg_dumpall, pg_restore (Nikolay Samokhvalov) -§ - - - -This is useful for migrating to systems with different policies. - - - - - - - - - <link linkend="pgupgrade"><application>pg_upgrade</application></link> - - - - - - - -Allow pg_upgrade to preserve optimizer statistics (Corey Huinker, Jeff Davis, Nathan Bossart) -§ -§ -§ -§ - - - -Extended statistics are not preserved. Also add pg_upgrade option --no-statistics to disable statistics preservation. - - - - - - - -Allow pg_upgrade to process database checks in parallel (Nathan Bossart) -§ -§ -§ -§ -§ -§ -§ -§ -§ -§ -§ - - - -This is controlled by the existing --jobs option. - - - - - - - -Add pg_upgrade option --swap to swap directories rather than copy, clone, or link files (Nathan Bossart) -§ - - - -This mode is potentially the fastest. - - - - - - - -Add pg_upgrade option --set-char-signedness to set the default char signedness of new cluster (Masahiko Sawada) -§ -§ - - - -This is to handle cases where a pre-Postgres 18 cluster's default CPU signedness does not match the new cluster. 
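Here "missing" statistics means roughly what the catalog query below finds: relations for which no optimizer statistics exist at all. This is an illustration only, not the exact check vacuumdb performs (which also considers extended statistics):

SELECT c.oid::regclass AS missing_stats_table
FROM pg_class c
WHERE c.relkind IN ('r', 'm')
  AND NOT EXISTS (SELECT 1 FROM pg_statistic s WHERE s.starelid = c.oid);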
- - - - - - - - - Logical Replication Applications - - - - - - - -Add pg_createsubscriber option --all to create logical replicas for all databases (Shubham Khanna) -§ - - - - - - - -Add pg_createsubscriber option --remove to remove publications (Shubham Khanna) -§ - - - - - - - -Add pg_createsubscriber option --enable-two-phase to enable prepared transactions (Shubham Khanna) -§ - - - - - - - -Add pg_recvlogical option --failover to specify failover slots (Hayato Kuroda) -§ - - - - - - - -Allow pg_recvlogical --drop-slot to work without --dbname (Hayato Kuroda) -§ - - - - - - - - - - - Source Code - - - - - - - -Separate the loading and running of injection points (Michael Paquier, Heikki Linnakangas) -§ -§ - - - -Injection points can now be created, but not run, via INJECTION_POINT_LOAD(), and such injection points can be run via INJECTION_POINT_CACHED(). - - - - - - - -Support runtime arguments in injection points (Michael Paquier) -§ - - - - - - - -Allow inline injection point test code with IS_INJECTION_POINT_ATTACHED() (Heikki Linnakangas) -§ - - - - - - - -Improve the performance of processing long JSON strings using SIMD instructions (David Rowley) -§ - - - - - - - -Speed up CRC32C calculations using x86 AVX-512 instructions (Raghuveer Devulapalli, Paul Amonson) -§ - - - - - - - -Add ARM Neon and SVE CPU intrinsics for popcount (integer bit counting) (Chiranmoy Bhattacharya, Devanga Susmitha, Rama Malladi) -§ -§ - - - - - - - -Improve the speed of multiplication (Joel Jacobson, Dean Rasheed) -§ - - - - - - - -Add configure option --with-libnuma to enable NUMA awareness (Jakub Wartak, Bertrand Drouvot) -§ -§ -§ - - - -The function pg_numa_available() reports on NUMA awareness, and the system views pg_shmem_allocations_numa and pg_buffercache_numa report on shared memory distribution across NUMA nodes. - - - - - - - -Add TOAST table to pg_index to allow for very large index expression indexes (Nathan Bossart) -§ - - - - - - - -Remove column pg_attribute.attcacheoff (David Rowley) -§ - - - - - - - -Add column pg_class.relallfrozen (Melanie Plageman) -§ - - - - - - - -Add amgettreeheight, amconsistentequality, and amconsistentordering to the index access method API (Mark Dilger) -§ -§ - - - - - - - -Add GiST support function stratnum (Paul A. Jungwirth) -§ - - - - - - - -Record the default CPU signedness of "char" in pg_controldata (Masahiko Sawada) -§ - - - - - - - -Add support for Python "Limited API" in PL/Python (Peter Eisentraut) -§ -§ - - - -This helps prevent problems caused by Python 3.x version mismatches. - - - - - - - -Change the minimum supported Python version to 3.6.8 (Jacob Champion) -§ - - - - - - - -Remove support for OpenSSL versions older than 1.1.1 (Daniel Gustafsson) -§ -§ - - - - - - - -If LLVM is enabled, require version 14 or later (Thomas Munro) -§ - - - - - - - -Add macro PG_MODULE_MAGIC_EXT to allow extensions to report their name and version (Andrei Lepikhov) -§ - - - -This information can be accessed via the new function pg_get_loaded_modules(). - - - - - - - -Document that SPI_connect/SPI_connect_ext() always returns success (SPI_OK_CONNECT) (Stepan Neretin) -§ - - - -Errors are always reported via ereport().
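The NUMA and module-introspection additions above are all visible from SQL (the first two queries assume a server built --with-libnuma):

SELECT pg_numa_available();

-- shared memory broken down by NUMA node
SELECT * FROM pg_shmem_allocations_numa LIMIT 5;

-- names and versions reported by modules built with PG_MODULE_MAGIC_EXT
SELECT * FROM pg_get_loaded_modules();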
- - - - - - - -Remove the experimental designation of Meson builds on Windows (Aleksander Alekseev) -§ - - - - - - - -Add documentation section about API and ABI compatibility (David Wheeler, Peter Eisentraut) -§ - - - - - - - -Remove configure options --disable-spinlocks and --disable-atomics (Thomas Munro) -§ -§ - - - -Thirty-two bit atomic operations are now required. - - - - - - - -Remove support for the HPPA/PA-RISC architecture (Tom Lane) -§ - - - - - - - - - Additional Modules - - - - - - - -Add extension pg_logicalinspect to inspect logical snapshots (Bertrand Drouvot) -§ - - - - - - - -Add extension pg_overexplain which adds debug details to EXPLAIN output (Robert Haas) -§ - - - - - - - -Add output columns to postgres_fdw_get_connections() (Hayato Kuroda, Sagar Dilip Shedge) -§ -§ -§ -§ - - - -New output column "used_in_xact" indicates if the foreign data wrapper is being used by a current transaction, "closed" indicates if it is closed, "user_name" indicates the -user name, and "remote_backend_pid" indicates the remote backend process identifier. - - - - - - - -Allow SCRAM authentication from the client to be passed to postgres_fdw servers (Matheus Alcantara, Peter Eisentraut) -§ - - - -This avoids storing postgres_fdw authentication information in the database, and is enabled with the postgres_fdw "use_scram_passthrough" connection option. libpq uses new connection -parameters scram_client_key and scram_server_key. - - - - - - - -Allow SCRAM authentication from the client to be passed to dblink servers (Matheus Alcantara) -§ - - - - - - - -Add on_error and log_verbosity options to file_fdw (Atsushi Torikoshi) -§ - - - -These control how file_fdw handles and reports invalid file rows. - - - - - - - -Add "reject_limit" to control the number of invalid rows file_fdw can ignore (Atsushi Torikoshi) -§ - - - -This is active when ON_ERROR = 'ignore'. - - - - - - - -Add configurable variable min_password_length to passwordcheck (Emanuele Musella, Maurizio Boriani) -§ - - - -This controls the minimum password length. - - - - - - - -Have pgbench report the number of failed, retried, or skipped transactions in per-script reports (Yugo Nagata) -§ - - - - - - - -Add isn server variable "weak" to control invalid check digit acceptance (Viktor Holmberg) -§ - - - -This was previously only controlled by function isn_weak(). - - - - - - - -Allow values to be sorted to speed btree_gist index builds (Bernd Helmle, Andrey Borodin) -§ - - - - - - - -Add amcheck function gin_index_check() to verify GIN indexes (Grigory Kryachko, Heikki Linnakangas, Andrey Borodin) -§ - - - - - - - -Add functions pg_buffercache_evict_relation() and pg_buffercache_evict_all() to evict unpinned shared buffers (Nazir Bilal Yavuz) -§ - - - -The existing function pg_buffercache_evict() now returns the buffer flush status. - - - - - - - -Allow extensions to install custom EXPLAIN options (Robert Haas, Sami Imseih) -§ -§ -§ - - - - - - - -Allow extensions to use the server's cumulative statistics API (Michael Paquier) -§ -§ - - - - - - - <link linkend="pgstatstatements"><application>pg_stat_statements</application></link> - - - - - - - -Allow the queries of CREATE TABLE AS and DECLARE to be tracked by pg_stat_statements (Anthonin Bonnefoy) -§ - - - -They are also now assigned query ids. - - - - - - - -Allow the parameterization of SET values in pg_stat_statements (Greg Sabino Mullane, Michael Paquier) -§ - - - -This reduces the bloat caused by SET statements with differing constants. 
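For example, the new file_fdw error-handling options combine like this (a sketch; the server, table, and file names are made up for illustration):

CREATE EXTENSION file_fdw;
CREATE SERVER csv_files FOREIGN DATA WRAPPER file_fdw;

CREATE FOREIGN TABLE staging_rows (id int, payload text)
  SERVER csv_files OPTIONS (
    filename      '/tmp/staging.csv',
    format        'csv',
    on_error      'ignore',   -- skip rows that fail input conversion
    log_verbosity 'verbose',  -- report each skipped row
    reject_limit  '5'         -- give up after five bad rows
  );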
- - - - - - - -Add pg_stat_statements columns to report parallel activity (Guillaume Lelarge) -§ - - - -The new columns are parallel_workers_to_launch and parallel_workers_launched. - - - - - - - -Add pg_stat_statements.wal_buffers_full to report full WAL buffers (Bertrand Drouvot) -§ - - - - - - - - - <link linkend="pgcrypto"><application>pgcrypto</application></link> - - - - - - - -Add pgcrypto functions sha256crypt() and sha512crypt() (Bernd Helmle) -§ - - - - - - - -Add CFB mode to pgcrypto encryption and decryption (Umar Hayat) -§ - - - - - - - -Add pgcrypto server variable builtin_crypto_enabled to allow disabling builtin non-FIPS mode cryptographic functions (Daniel Gustafsson, Joe Conway) -§ - - - -This is useful for guaranteeing FIPS mode behavior. - - - - - - - - - - - - Acknowledgments - - - The following individuals (in alphabetical order) have contributed - to this release as patch authors, committers, reviewers, testers, - or reporters of issues. - - - - (to be completed) - - - - diff --git a/doc/src/sgml/release-19.sgml b/doc/src/sgml/release-19.sgml new file mode 100644 index 0000000000000..8d242b5b28141 --- /dev/null +++ b/doc/src/sgml/release-19.sgml @@ -0,0 +1,16 @@ + + + + + Release 19 + + + Release date: + 2026-??-?? + + + + This is just a placeholder for now. + + + diff --git a/doc/src/sgml/release.sgml b/doc/src/sgml/release.sgml index cee577ff8d353..a659d382db95c 100644 --- a/doc/src/sgml/release.sgml +++ b/doc/src/sgml/release.sgml @@ -70,7 +70,7 @@ For new features, add links to the documentation sections. All the active branches have to be edited concurrently when doing that. --> -&release-18; +&release-19; Prior Releases diff --git a/doc/src/sgml/rules.sgml b/doc/src/sgml/rules.sgml index 8467d961fd0a0..7f23962f524c4 100644 --- a/doc/src/sgml/rules.sgml +++ b/doc/src/sgml/rules.sgml @@ -60,6 +60,7 @@ SQL statement where the single parts that it is built from are stored separately. These query trees can be shown in the server log if you set the configuration parameters + debug_print_raw_parse, debug_print_parse, debug_print_rewritten, or debug_print_plan. The rule actions are also @@ -661,8 +662,8 @@ SELECT shoe_ready.shoename, shoe_ready.sh_avail, command other than a SELECT, the result relation points to the range-table entry where the result should go. Everything else is absolutely the same. So having two tables - t1 and t2 with columns a and - b, the query trees for the two statements: + t1 and t2 with columns a and + b, the query trees for the two statements: SELECT t2.b FROM t1, t2 WHERE t1.a = t2.a; @@ -675,27 +676,27 @@ UPDATE t1 SET b = t2.b FROM t2 WHERE t1.a = t2.a; - The range tables contain entries for the tables t1 and t2. + The range tables contain entries for the tables t1 and t2. The target lists contain one variable that points to column - b of the range table entry for table t2. + b of the range table entry for table t2. - The qualification expressions compare the columns a of both + The qualification expressions compare the columns a of both range-table entries for equality. - The join trees show a simple join between t1 and t2. + The join trees show a simple join between t1 and t2. @@ -704,7 +705,7 @@ UPDATE t1 SET b = t2.b FROM t2 WHERE t1.a = t2.a; The consequence is, that both query trees result in similar execution plans: They are both joins over the two tables. 
For the - UPDATE the missing columns from t1 are added to + UPDATE the missing columns from t1 are added to the target list by the planner and the final query tree will read as: @@ -726,7 +727,7 @@ SELECT t1.a, t2.b FROM t1, t2 WHERE t1.a = t2.a; one is a SELECT command and the other is an UPDATE is handled higher up in the executor, where it knows that this is an UPDATE, and it knows that - this result should go into table t1. But which of the rows + this result should go into table t1. But which of the rows that are there has to be replaced by the new row? @@ -738,7 +739,7 @@ SELECT t1.a, t2.b FROM t1, t2 WHERE t1.a = t2.a; This is a system column containing the file block number and position in the block for the row. Knowing the table, the CTID can be used to retrieve the - original row of t1 to be updated. After adding the + original row of t1 to be updated. After adding the CTID to the target list, the query actually looks like: @@ -967,7 +968,7 @@ CREATE MATERIALIZED VIEW sales_summary AS SELECT seller_no, invoice_date, - sum(invoice_amt)::numeric(13,2) as sales_amt + sum(invoice_amt)::numeric(13,2) AS sales_amt FROM invoice WHERE invoice_date < CURRENT_DATE GROUP BY @@ -1690,7 +1691,7 @@ CREATE RULE shoelace_ok_ins AS ON INSERT TO shoelace_ok WHERE sl_name = NEW.ok_name; - Now you can fill the table shoelace_arrive with + Now you can fill the table shoelace_arrive with the data from the parts list: @@ -2354,7 +2355,7 @@ CREATE RULE computer_del AS ON DELETE TO computer DELETE FROM computer WHERE hostname = 'mypc.local.net'; - the table computer is scanned by index (fast), and the + the table computer is scanned by index (fast), and the command issued by the trigger would also use an index scan (also fast). The extra command from the rule would be: @@ -2420,16 +2421,16 @@ Nestloop This shows, that the planner does not realize that the qualification for hostname in - computer could also be used for an index scan on - software when there are multiple qualification + computer could also be used for an index scan on + software when there are multiple qualification expressions combined with AND, which is what it does in the regular-expression version of the command. The trigger will get invoked once for each of the 2000 old computers that have to be deleted, and that will result in one index scan over - computer and 2000 index scans over - software. The rule implementation will do it with two + computer and 2000 index scans over + software. The rule implementation will do it with two commands that use indexes. And it depends on the overall size of - the table software whether the rule will still be faster in the + the table software whether the rule will still be faster in the sequential scan situation. 2000 command executions from the trigger over the SPI manager take some time, even if all the index blocks will soon be in the cache. @@ -2442,7 +2443,7 @@ DELETE FROM computer WHERE manufacturer = 'bim'; Again this could result in many rows to be deleted from - computer. So the trigger will again run many commands + computer. So the trigger will again run many commands through the executor. 
The command generated by the rule will be: @@ -2451,7 +2452,7 @@ DELETE FROM software WHERE computer.manufacturer = 'bim' The plan for that command will again be the nested loop over two - index scans, only using a different index on computer: + index scans, only using a different index on computer: Nestloop diff --git a/doc/src/sgml/seg.sgml b/doc/src/sgml/seg.sgml index dc66e24f2f514..2e879c3e45202 100644 --- a/doc/src/sgml/seg.sgml +++ b/doc/src/sgml/seg.sgml @@ -46,7 +46,7 @@ when you fetch it? Watch: -test=> select 6.50 :: float8 as "pH"; +test=> SELECT 6.50::float8 AS "pH"; pH --- 6.5 @@ -72,7 +72,7 @@ test=> select 6.50 :: float8 as "pH"; Check this out: -test=> select '6.25 .. 6.50'::seg as "pH"; +test=> SELECT '6.25 .. 6.50'::seg AS "pH"; pH ------------ 6.25 .. 6.50 @@ -377,7 +377,7 @@ test=> select '6.25 .. 6.50'::seg as "pH"; boundary if the resulting interval includes a power of ten: -postgres=> select '10(+-)1'::seg as seg; +postgres=> SELECT '10(+-)1'::seg AS seg; seg --------- 9.0 .. 11 -- should be: 9 .. 11 diff --git a/doc/src/sgml/sepgsql.sgml b/doc/src/sgml/sepgsql.sgml index 03ed7d1c90d15..ddac625355737 100644 --- a/doc/src/sgml/sepgsql.sgml +++ b/doc/src/sgml/sepgsql.sgml @@ -442,7 +442,7 @@ UPDATE t1 SET x = 2, y = func1(y) WHERE z = 100; The default database privilege system allows database superusers to modify system catalogs using DML commands, and reference or modify - toast tables. These operations are prohibited when + TOAST tables. These operations are prohibited when sepgsql is enabled. @@ -613,7 +613,7 @@ postgres=# SELECT cid, cname, show_credit(cid) FROM customer; the original one. For example: -regression=# select sepgsql_getcon(); +regression=# SELECT sepgsql_getcon(); sepgsql_getcon ------------------------------------------------------- unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 diff --git a/doc/src/sgml/sourcerepo.sgml b/doc/src/sgml/sourcerepo.sgml index 6c13c5a30cde6..f4368e83ab39d 100644 --- a/doc/src/sgml/sourcerepo.sgml +++ b/doc/src/sgml/sourcerepo.sgml @@ -40,7 +40,7 @@ - To begin using the Git repository, make a clone of the official mirror: + To begin using the Git repository, make a clone of the official mirror: git clone https://git.postgresql.org/git/postgresql.git @@ -51,16 +51,6 @@ git clone https://git.postgresql.org/git/postgresql.git The files will be placed in a new subdirectory postgresql of your current directory. - - - The Git mirror can also be reached via the Git protocol. Just change the URL - prefix to git, as in: - - -git clone git://git.postgresql.org/git/postgresql.git - - - diff --git a/doc/src/sgml/sources.sgml b/doc/src/sgml/sources.sgml index fa68d4d024a93..760f9b69d4778 100644 --- a/doc/src/sgml/sources.sgml +++ b/doc/src/sgml/sources.sgml @@ -153,11 +153,12 @@ ereport(ERROR, errmsg("function %s is not unique", func_signature_string(funcname, nargs, NIL, actual_arg_types)), - errhint("Unable to choose a best candidate function. " - "You might need to add explicit typecasts.")); + errdetail("Could not choose a best candidate function."), + errhint("You might need to add explicit type casts.")); This illustrates the use of format codes to embed run-time values into - a message text. Also, an optional hint message is provided. + a message text. Also, optional detail + and hint messages are provided. The auxiliary function calls can be written in any order, but conventionally errcode and errmsg appear first. 
@@ -907,12 +908,12 @@ BETTER: unrecognized node type: 42 C Standard Code in PostgreSQL should only rely on language - features available in the C99 standard. That means a conforming - C99 compiler has to be able to compile postgres, at least aside + features available in the C11 standard. That means a conforming + C11 compiler has to be able to compile postgres, at least aside from a few platform dependent pieces. - A few features included in the C99 standard are, at this time, not + A few features included in the C11 standard are, at this time, not permitted to be used in core PostgreSQL code. This currently includes variable length arrays, intermingled declarations and code, // comments, universal @@ -924,13 +925,11 @@ BETTER: unrecognized node type: 42 features can be used, if a fallback is provided. - For example _Static_assert() and + For example typeof() and __builtin_constant_p are currently used, even though they are from newer revisions of the C standard and a GCC extension respectively. If not available - we respectively fall back to using a C99 compatible replacement that - performs the same checks, but emits rather cryptic messages and do not - use __builtin_constant_p. + we do not use them. diff --git a/doc/src/sgml/spi.sgml b/doc/src/sgml/spi.sgml index 7e2f2df965dba..e30d0962ae761 100644 --- a/doc/src/sgml/spi.sgml +++ b/doc/src/sgml/spi.sgml @@ -846,7 +846,7 @@ int SPI_execute_extended(const char *command, int SPI_execute_with_args(const char *command, int nargs, Oid *argtypes, - Datum *values, const char *nulls, + const Datum *values, const char *nulls, bool read_only, long count) @@ -1671,7 +1671,7 @@ bool SPI_is_cursor_plan(SPIPlanPtr plan) -int SPI_execute_plan(SPIPlanPtr plan, Datum * values, const char * nulls, +int SPI_execute_plan(SPIPlanPtr plan, const Datum * values, const char * nulls, bool read_only, long count) @@ -2317,7 +2317,7 @@ Portal SPI_cursor_open(const char * name, SPIPlanPtr name, const char *command, int nargs, Oid *argtypes, - Datum *values, const char *nulls, + const Datum *values, const char *nulls, bool read_only, int cursorOptions) diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml index 61250799ec076..02ddfda834a2e 100644 --- a/doc/src/sgml/storage.sgml +++ b/doc/src/sgml/storage.sgml @@ -39,6 +39,8 @@ these required items, the cluster configuration files
Contents of <varname>PGDATA</varname> + + @@ -743,6 +745,8 @@ There are five parts to each page. Overall Page Layout Page Layout + + diff --git a/doc/src/sgml/stylesheet-fo.xsl b/doc/src/sgml/stylesheet-fo.xsl index e7916a6a88347..aec6de7064a7b 100644 --- a/doc/src/sgml/stylesheet-fo.xsl +++ b/doc/src/sgml/stylesheet-fo.xsl @@ -14,24 +14,11 @@ 3 - - - - - - - - - - - 1.5em +0 +0 @@ -42,6 +29,8 @@ an "Unresolved ID reference found" warning during PDF builds. solid 1pt black + 0.25in + 0.25in 12pt 12pt 6pt @@ -415,5 +404,21 @@ an "Unresolved ID reference found" warning during PDF builds. + + + + + + + + + + + + diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index 916189a7d68ce..34c83880a66e3 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -1834,8 +1834,8 @@ FROM generate_series(1,10) AS s(i); The syntax of a window function call is one of the following: -function_name (expression , expression ... ) [ FILTER ( WHERE filter_clause ) ] OVER window_name -function_name (expression , expression ... ) [ FILTER ( WHERE filter_clause ) ] OVER ( window_definition ) +function_name (expression , expression ... ) null treatment [ FILTER ( WHERE filter_clause ) ] OVER window_name +function_name (expression , expression ... ) null treatment [ FILTER ( WHERE filter_clause ) ] OVER ( window_definition ) function_name ( * ) [ FILTER ( WHERE filter_clause ) ] OVER window_name function_name ( * ) [ FILTER ( WHERE filter_clause ) ] OVER ( window_definition ) @@ -1873,7 +1873,9 @@ EXCLUDE NO OTHERS Here, expression represents any value - expression that does not itself contain window function calls. + expression that does not itself contain window function calls. Some + non-aggregate functions allow a null treatment clause, + described in . @@ -2048,7 +2050,7 @@ EXCLUDE NO OTHERS The built-in window functions are described in . Other window functions can be added by + linkend="functions-window-table"/>. Other window functions can be added by the user. Also, any built-in or user-defined general-purpose or statistical aggregate can be used as a window function. (Ordered-set and hypothetical-set aggregates cannot presently be used as window functions.) @@ -2428,8 +2430,8 @@ SELECT ROW(1,2.5,'this is a test'); which will be expanded to a list of the elements of the row value, just as occurs when the .* syntax is used at the top level of a SELECT list (see ). 
- For example, if table t has - columns f1 and f2, these are the same: + For example, if table t has + columns f1 and f2, these are the same: SELECT ROW(t.*, 42) FROM t; SELECT ROW(t.f1, t.f2, 42) FROM t; diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index b58c52ea50f5a..7ff7ca4f7193d 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -53,7 +53,7 @@ pg_aios - In-use asynchronous IO handles + in-use asynchronous IO handles @@ -81,6 +81,11 @@ open cursors + + pg_dsm_registry_allocations + shared memory allocations tracked in the DSM registry + + pg_file_settings summary of configuration file contents @@ -131,6 +136,11 @@ prepared transactions + + pg_publication_sequences + publications and information of their associated sequences + + pg_publication_tables publications and information of their associated tables @@ -1086,6 +1096,75 @@ AND c1.path[c2.level] = c2.path[c2.level]; + + <structname>pg_dsm_registry_allocations</structname> + + + pg_dsm_registry_allocations + + + + The pg_dsm_registry_allocations view shows shared + memory allocations tracked in the dynamic shared memory (DSM) registry. + This includes memory allocated by extensions using the mechanisms detailed + in . + + +
+ <structname>pg_dsm_registry_allocations</structname> Columns + + + + + Column Type + + + Description + + + + + + + + name text + + + The name of the allocation in the DSM registry. + + + + + + type text + + + The type of allocation. Possible values are segment, + area, and hash, which correspond + to dynamic shared memory segments, areas, and hash tables, respectively. + + + + + + size int8 + + + Size of the allocation in bytes. NULL for entries that failed + initialization. + + + + +
+ + + By default, the pg_dsm_registry_allocations view + can be read only by superusers or roles with privileges of the + pg_read_all_stats role. + + + <structname>pg_file_settings</structname> @@ -2475,6 +2554,67 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx + + <structname>pg_publication_sequences</structname> + + + pg_publication_sequences + + + + The view pg_publication_sequences provides + information about the mapping between publications and sequences. + + + + <structname>pg_publication_sequences</structname> Columns + + + + + Column Type + + + Description + + + + + + + + pubname name + (references pg_publication.pubname) + + + Name of publication + + + + + + schemaname name + (references pg_namespace.nspname) + + + Name of schema containing sequence + + + + + + sequencename name + (references pg_class.relname) + + + Name of sequence + + + + +
+
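Both new views can be inspected directly, for example (assuming the privileges described above; column names follow the tables in this section):

-- DSM registry entries: name, type (segment/area/hash), size in bytes
SELECT name, type, size FROM pg_dsm_registry_allocations;

-- sequences included in each publication
SELECT pubname, schemaname, sequencename FROM pg_publication_sequences;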
+ <structname>pg_publication_tables</structname> @@ -2819,21 +2959,18 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx unreserved means that the slot no longer retains the required WAL files and some of them are to be removed at - the next checkpoint. This state can return + the next checkpoint. This typically occurs when + is set to + a non-negative value. This state can return to reserved or extended. - lost means that some required WAL files have - been removed and this slot is no longer usable. + lost means that this slot is no longer usable. - The last two states are seen only when - is - non-negative. If restart_lsn is NULL, this - field is null. @@ -2925,14 +3062,15 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx wal_level_insufficient means that the - primary doesn't have a sufficient to - perform logical decoding. It is set only for logical slots. + primary doesn't have an + sufficient to perform logical decoding. It is set only for logical + slots. idle_timeout means that the slot has remained - idle longer than the configured + inactive longer than the configured duration. @@ -2965,6 +3103,49 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx + + + slotsync_skip_reasontext + + + The reason for the last slot synchronization skip. Slot + synchronization occurs only on standby servers and thus this column has + no meaning on the primary server. It is relevant mainly for logical slots + on standby servers whose synced field is + true. It is NULL if slot + synchronization is successful. + Possible values are: + + + + wal_or_rows_removed means that the required WALs or + catalog rows have already been removed or are at the risk of removal + from the standby. + + + + + wal_not_flushed means that the standby had not + flushed the WAL corresponding to the position reserved on the failover + slot. + + + + + no_consistent_snapshot means that the standby could + not build a consistent snapshot to decode WALs from + restart_lsn. + + + + + slot_invalidated means that the synced slot is + invalidated. + + + + + @@ -3932,7 +4113,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx passwd text - Password (possibly encrypted); null if none. See + Encrypted password; null if none. See pg_authid for details of how encrypted passwords are stored. 
diff --git a/doc/src/sgml/tablefunc.sgml b/doc/src/sgml/tablefunc.sgml index e10fe7009d163..69cafa00ad66e 100644 --- a/doc/src/sgml/tablefunc.sgml +++ b/doc/src/sgml/tablefunc.sgml @@ -293,10 +293,10 @@ INSERT INTO ct(rowid, attribute, value) VALUES('test2','att4','val8'); SELECT * FROM crosstab( - 'select rowid, attribute, value - from ct - where attribute = ''att2'' or attribute = ''att3'' - order by 1,2') + 'SELECT rowid, attribute, value + FROM ct + WHERE attribute = ''att2'' OR attribute = ''att3'' + ORDER BY 1, 2') AS ct(row_name text, category_1 text, category_2 text, category_3 text); row_name | category_1 | category_2 | category_3 @@ -371,10 +371,10 @@ CREATE TYPE tablefunc_crosstab_N AS ( SELECT * FROM crosstab3( - 'select rowid, attribute, value - from ct - where attribute = ''att2'' or attribute = ''att3'' - order by 1,2'); + 'SELECT rowid, attribute, value + FROM ct + WHERE attribute = ''att2'' OR attribute = ''att3'' + ORDER BY 1, 2'); @@ -407,7 +407,7 @@ CREATE TYPE my_crosstab_float8_5_cols AS ( ); CREATE OR REPLACE FUNCTION crosstab_float8_5_cols(text) - RETURNS setof my_crosstab_float8_5_cols + RETURNS SETOF my_crosstab_float8_5_cols AS '$libdir/tablefunc','crosstab' LANGUAGE C STABLE STRICT; @@ -426,7 +426,7 @@ CREATE OR REPLACE FUNCTION crosstab_float8_5_cols( OUT my_category_3 float8, OUT my_category_4 float8, OUT my_category_5 float8) - RETURNS setof record + RETURNS SETOF record AS '$libdir/tablefunc','crosstab' LANGUAGE C STABLE STRICT; @@ -572,18 +572,18 @@ row_name extra cat1 cat2 cat3 cat4 Here are two complete examples: -create table sales(year int, month int, qty int); -insert into sales values(2007, 1, 1000); -insert into sales values(2007, 2, 1500); -insert into sales values(2007, 7, 500); -insert into sales values(2007, 11, 1500); -insert into sales values(2007, 12, 2000); -insert into sales values(2008, 1, 1000); - -select * from crosstab( - 'select year, month, qty from sales order by 1', - 'select m from generate_series(1,12) m' -) as ( +CREATE TABLE sales (year int, month int, qty int); +INSERT INTO sales VALUES (2007, 1, 1000); +INSERT INTO sales VALUES (2007, 2, 1500); +INSERT INTO sales VALUES (2007, 7, 500); +INSERT INTO sales VALUES (2007, 11, 1500); +INSERT INTO sales VALUES (2007, 12, 2000); +INSERT INTO sales VALUES (2008, 1, 1000); + +SELECT * FROM crosstab( + 'SELECT year, month, qty FROM sales ORDER BY 1', + 'SELECT m FROM generate_series(1, 12) m' +) AS ( year int, "Jan" int, "Feb" int, diff --git a/doc/src/sgml/tcn.sgml b/doc/src/sgml/tcn.sgml index 32a1025cc6b79..98278fbee3730 100644 --- a/doc/src/sgml/tcn.sgml +++ b/doc/src/sgml/tcn.sgml @@ -43,32 +43,32 @@ A brief example of using the extension follows. 
-test=# create table tcndata +test=# CREATE TABLE tcndata test-# ( -test(# a int not null, -test(# b date not null, +test(# a int NOT NULL, +test(# b date NOT NULL, test(# c text, -test(# primary key (a, b) +test(# PRIMARY KEY (a, b) test(# ); CREATE TABLE -test=# create trigger tcndata_tcn_trigger -test-# after insert or update or delete on tcndata -test-# for each row execute function triggered_change_notification(); +test=# CREATE TRIGGER tcndata_tcn_trigger +test-# AFTER INSERT OR UPDATE OR DELETE ON tcndata +test-# FOR EACH ROW EXECUTE FUNCTION triggered_change_notification(); CREATE TRIGGER -test=# listen tcn; +test=# LISTEN tcn; LISTEN -test=# insert into tcndata values (1, date '2012-12-22', 'one'), +test=# INSERT INTO tcndata VALUES (1, date '2012-12-22', 'one'), test-# (1, date '2012-12-23', 'another'), test-# (2, date '2012-12-23', 'two'); INSERT 0 3 Asynchronous notification "tcn" with payload ""tcndata",I,"a"='1',"b"='2012-12-22'" received from server process with PID 22770. Asynchronous notification "tcn" with payload ""tcndata",I,"a"='1',"b"='2012-12-23'" received from server process with PID 22770. Asynchronous notification "tcn" with payload ""tcndata",I,"a"='2',"b"='2012-12-23'" received from server process with PID 22770. -test=# update tcndata set c = 'uno' where a = 1; +test=# UPDATE tcndata SET c = 'uno' WHERE a = 1; UPDATE 2 Asynchronous notification "tcn" with payload ""tcndata",U,"a"='1',"b"='2012-12-22'" received from server process with PID 22770. Asynchronous notification "tcn" with payload ""tcndata",U,"a"='1',"b"='2012-12-23'" received from server process with PID 22770. -test=# delete from tcndata where a = 1 and b = date '2012-12-22'; +test=# DELETE FROM tcndata WHERE a = 1 AND b = date '2012-12-22'; DELETE 1 Asynchronous notification "tcn" with payload ""tcndata",D,"a"='1',"b"='2012-12-22'" received from server process with PID 22770. 
diff --git a/doc/src/sgml/test-decoding.sgml b/doc/src/sgml/test-decoding.sgml index 5d1ae8f4f52e2..7d3d590471a32 100644 --- a/doc/src/sgml/test-decoding.sgml +++ b/doc/src/sgml/test-decoding.sgml @@ -25,16 +25,16 @@ postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'include-xids', '0'); - lsn | xid | data ------------+-----+-------------------------------------------------- - 0/16D30F8 | 691 | BEGIN - 0/16D32A0 | 691 | table public.data: INSERT: id[int4]:2 data[text]:'arg' - 0/16D32A0 | 691 | table public.data: INSERT: id[int4]:3 data[text]:'demo' - 0/16D32A0 | 691 | COMMIT - 0/16D32D8 | 692 | BEGIN - 0/16D3398 | 692 | table public.data: DELETE: id[int4]:2 - 0/16D3398 | 692 | table public.data: DELETE: id[int4]:3 - 0/16D3398 | 692 | COMMIT + lsn | xid | data +------------+-----+-------------------------------------------------- + 0/016D30F8 | 691 | BEGIN + 0/016D32A0 | 691 | table public.data: INSERT: id[int4]:2 data[text]:'arg' + 0/016D32A0 | 691 | table public.data: INSERT: id[int4]:3 data[text]:'demo' + 0/016D32A0 | 691 | COMMIT + 0/016D32D8 | 692 | BEGIN + 0/016D3398 | 692 | table public.data: DELETE: id[int4]:2 + 0/016D3398 | 692 | table public.data: DELETE: id[int4]:3 + 0/016D3398 | 692 | COMMIT (8 rows) @@ -45,18 +45,18 @@ postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'i postgres[33712]=#* SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'stream-changes', '1'); - lsn | xid | data ------------+-----+-------------------------------------------------- - 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503 - 0/16B21F8 | 503 | streaming change for TXN 503 - 0/16B2300 | 503 | streaming change for TXN 503 - 0/16B2408 | 503 | streaming change for TXN 503 - 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503 - 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503 - 0/16BECA8 | 503 | streaming change for TXN 503 - 0/16BEDB0 | 503 | streaming change for TXN 503 - 0/16BEEB8 | 503 | streaming change for TXN 503 - 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503 + lsn | xid | data +------------+-----+-------------------------------------------------- + 0/016B21F8 | 503 | opening a streamed block for transaction TXN 503 + 0/016B21F8 | 503 | streaming change for TXN 503 + 0/016B2300 | 503 | streaming change for TXN 503 + 0/016B2408 | 503 | streaming change for TXN 503 + 0/016BEBA0 | 503 | closing a streamed block for transaction TXN 503 + 0/016B21F8 | 503 | opening a streamed block for transaction TXN 503 + 0/016BECA8 | 503 | streaming change for TXN 503 + 0/016BEDB0 | 503 | streaming change for TXN 503 + 0/016BEEB8 | 503 | streaming change for TXN 503 + 0/016BEBA0 | 503 | closing a streamed block for transaction TXN 503 (10 rows) diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 908857a54af5f..d20484cb232f4 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1355,7 +1355,7 @@ ts_headline( config - Warning: Cross-site scripting (XSS) safety + Warning: Cross-site Scripting (XSS) Safety The output from ts_headline is not guaranteed to be safe for direct inclusion in web pages. 
When @@ -1974,12 +1974,12 @@ SELECT title, body FROM messages WHERE tsv @@ to_tsquery('title & body'); CREATE FUNCTION messages_trigger() RETURNS trigger AS $$ -begin +BEGIN new.tsv := setweight(to_tsvector('pg_catalog.english', coalesce(new.title,'')), 'A') || setweight(to_tsvector('pg_catalog.english', coalesce(new.body,'')), 'D'); - return new; -end + RETURN new; +END $$ LANGUAGE plpgsql; CREATE TRIGGER tsvectorupdate BEFORE INSERT OR UPDATE diff --git a/doc/src/sgml/trigger.sgml b/doc/src/sgml/trigger.sgml index e9214dcf1b1bd..0062f1a3fd152 100644 --- a/doc/src/sgml/trigger.sgml +++ b/doc/src/sgml/trigger.sgml @@ -129,10 +129,9 @@ In all cases, a trigger is executed as part of the same transaction as the statement that triggered it, so if either the statement or the trigger causes an error, the effects of both will be rolled back. - Also, the trigger will always run in the security context of the role - that executed the statement that caused the trigger to fire, unless - the trigger function is defined as SECURITY DEFINER, - in which case it will run as the function owner. + Also, the trigger will always run as the role that queued the trigger + event, unless the trigger function is marked as SECURITY + DEFINER, in which case it will run as the function owner. @@ -824,7 +823,7 @@ typedef struct Trigger attnum (1-based) is a member of this bitmap set, call bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber, - trigdata->tg_updatedcols)). + trigdata->tg_updatedcols). diff --git a/doc/src/sgml/typeconv.sgml b/doc/src/sgml/typeconv.sgml index 2874874248668..96aa02e4fabe0 100644 --- a/doc/src/sgml/typeconv.sgml +++ b/doc/src/sgml/typeconv.sgml @@ -465,9 +465,9 @@ try a similar case with ~, we get: SELECT ~ '20' AS "negation"; -ERROR: operator is not unique: ~ "unknown" -HINT: Could not choose a best candidate operator. You might need to add -explicit type casts. +ERROR: operator is not unique: ~ unknown +DETAIL: Could not choose a best candidate operator. +HINT: You might need to add explicit type casts. This happens because the system cannot decide which of the several possible ~ operators should be preferred. We can help @@ -490,7 +490,7 @@ SELECT ~ CAST('20' AS int8) AS "negation"; Here is another example of resolving an operator with one known and one unknown input: -SELECT array[1,2] <@ '{1,2,3}' as "is subset"; +SELECT ARRAY[1, 2] <@ '{1,2,3}' AS "is subset"; is subset ----------- @@ -901,8 +901,8 @@ the parser will try to convert that to text: SELECT substr(1234, 3); ERROR: function substr(integer, integer) does not exist -HINT: No function matches the given name and argument types. You might need -to add explicit type casts. +DETAIL: No function of that name accepts the given argument types. +HINT: You might need to add explicit type casts. 
This does not work because integer does not have an implicit cast diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml index 94100ed26091a..744821ca997ca 100644 --- a/doc/src/sgml/unaccent.sgml +++ b/doc/src/sgml/unaccent.sgml @@ -144,7 +144,7 @@ mydb=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules'); To test the dictionary, you can try: -mydb=# select ts_lexize('unaccent','Hôtel'); +mydb=# SELECT ts_lexize('unaccent', 'Hôtel'); ts_lexize ----------- {Hotel} @@ -160,19 +160,19 @@ mydb=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french ); mydb=# ALTER TEXT SEARCH CONFIGURATION fr ALTER MAPPING FOR hword, hword_part, word WITH unaccent, french_stem; -mydb=# select to_tsvector('fr','Hôtels de la Mer'); +mydb=# SELECT to_tsvector('fr', 'Hôtels de la Mer'); to_tsvector ------------------- 'hotel':1 'mer':4 (1 row) -mydb=# select to_tsvector('fr','Hôtel de la Mer') @@ to_tsquery('fr','Hotels'); +mydb=# SELECT to_tsvector('fr', 'Hôtel de la Mer') @@ to_tsquery('fr', 'Hotels'); ?column? ---------- t (1 row) -mydb=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels')); +mydb=# SELECT ts_headline('fr', 'Hôtel de la Mer', to_tsquery('fr', 'Hotels')); ts_headline ------------------------ <b>Hôtel</b> de la Mer diff --git a/doc/src/sgml/xfunc.sgml b/doc/src/sgml/xfunc.sgml index 2d81afce8cb9b..e9288bd6b5e20 100644 --- a/doc/src/sgml/xfunc.sgml +++ b/doc/src/sgml/xfunc.sgml @@ -397,8 +397,8 @@ SELECT tf1(17, 100.0); In this example, we chose the name accountno for the first argument, but this is the same as the name of a column in the - bank table. Within the UPDATE command, - accountno refers to the column bank.accountno, + bank table. Within the UPDATE command, + accountno refers to the column bank.accountno, so tf1.accountno must be used to refer to the argument. We could of course avoid this by using a different name for the argument. @@ -1016,7 +1016,7 @@ SELECT *, upper(fooname) FROM getfoo(1) AS t1; This feature is normally used when calling the function in the FROM clause. In this case each row returned by the function becomes a row of the table seen by the query. For example, assume that - table foo has the same contents as above, and we say: + table foo has the same contents as above, and we say: CREATE FUNCTION getfoo(int) RETURNS SETOF foo AS $$ @@ -1409,7 +1409,7 @@ DETAIL: A result of type anyelement requires at least one input of type anyelem For example: CREATE FUNCTION dup (f1 anyelement, OUT f2 anyelement, OUT f3 anyarray) -AS 'select $1, array[$1,$1]' LANGUAGE SQL; +AS 'SELECT $1, ARRAY[$1, $1]' LANGUAGE SQL; SELECT * FROM dup(22); f2 | f3 @@ -2051,8 +2051,7 @@ PG_MODULE_MAGIC_EXT( - By-value types can only be 1, 2, or 4 bytes in length - (also 8 bytes, if sizeof(Datum) is 8 on your machine). + By-value types can only be 1, 2, 4, or 8 bytes in length. You should be careful to define your types such that they will be the same size (in bytes) on all architectures. For example, the long type is dangerous because it is 4 bytes on some @@ -2165,7 +2164,7 @@ memcpy(destination->data, buffer, 40); it's considered good style to use the macro VARHDRSZ to refer to the size of the overhead for a variable-length type. Also, the length field must be set using the - SET_VARSIZE macro, not by simple assignment. + SET_VARSIZE function, not by simple assignment. @@ -2400,7 +2399,7 @@ PG_FUNCTION_INFO_V1(funcname); To call another version-1 function, you can use DirectFunctionCalln(func, arg1, ..., argn). 
This is particularly useful when you want - to call functions defined in the standard internal library, by using an + to call functions defined in the standard internal function library by using an interface similar to their SQL signature. @@ -2492,7 +2491,7 @@ makepoint(PG_FUNCTION_ARGS) /* Here, the pass-by-reference nature of Point is not hidden. */ Point *pointx = PG_GETARG_POINT_P(0); Point *pointy = PG_GETARG_POINT_P(1); - Point *new_point = (Point *) palloc(sizeof(Point)); + Point *new_point = palloc_object(Point); new_point->x = pointx->x; new_point->y = pointy->y; @@ -3669,11 +3668,14 @@ LWLockRelease(AddinShmemInitLock); shmem_startup_hook provides a convenient place for the initialization code, but it is not strictly required that all such code - be placed in this hook. Each backend will execute the registered - shmem_startup_hook shortly after it attaches to shared - memory. Note that add-ins should still acquire + be placed in this hook. On Windows (and anywhere else where + EXEC_BACKEND is defined), each backend executes the + registered shmem_startup_hook shortly after it + attaches to shared memory, so add-ins should still acquire AddinShmemInitLock within this hook, as shown in the - example above. + example above. On other platforms, only the postmaster process executes + the shmem_startup_hook, and each backend automatically + inherits the pointers to shared memory. @@ -3694,15 +3696,18 @@ LWLockRelease(AddinShmemInitLock); use the shared memory should obtain a pointer to it by calling: void *GetNamedDSMSegment(const char *name, size_t size, - void (*init_callback) (void *ptr), - bool *found) + void (*init_callback) (void *ptr, void *arg), + bool *found, void *arg) If a dynamic shared memory segment with the given name does not yet exist, this function will allocate it and initialize it with the provided init_callback callback function. If the segment has already been allocated and initialized by another backend, this function simply attaches the existing dynamic shared memory segment to the current - backend. + backend. In the former case, GetNamedDSMSegment + passes the void *arg argument to the + init_callback. This is particularly useful for + reusing an initialization callback function for multiple DSM segments. @@ -3760,7 +3765,7 @@ LWLockPadded *GetNamedLWLockTranche(const char *tranche_name) shmem_request_hook. To do so, first allocate a tranche_id by calling: -int LWLockNewTrancheId(void) +int LWLockNewTrancheId(const char *name) Next, initialize each LWLock, passing the new tranche_id as an argument: @@ -3778,17 +3783,8 @@ void LWLockInitialize(LWLock *lock, int tranche_id) - Finally, each backend using the tranche_id should - associate it with a tranche_name by calling: - -void LWLockRegisterTranche(int tranche_id, const char *tranche_name) - - - - - A complete usage example of LWLockNewTrancheId, - LWLockInitialize, and - LWLockRegisterTranche can be found in + A complete usage example of LWLockNewTrancheId and + LWLockInitialize can be found in contrib/pg_prewarm/autoprewarm.c in the PostgreSQL source tree. @@ -3947,7 +3943,7 @@ extern bool InjectionPointDetach(const char *name); - Enabling injections points requires + Enabling injection points requires with configure or with Meson. @@ -4012,7 +4008,7 @@ extern PgStat_Kind pgstat_register_kind(PgStat_Kind kind, An example describing how to register and use custom statistics can be - found in src/test/modules/injection_points. + found in src/test/modules/test_custom_stats.
@@ -4166,6 +4162,31 @@ supportfn(internal) returns internal expression and an actual execution of the target function. + + SupportRequestSimplify is not used + for set-returning + functions. Instead, support functions can implement + the SupportRequestInlineInFrom request to expand + function calls appearing in the FROM clause of a + query. (It's also allowed to support this request for + non-set-returning functions, although + typically SupportRequestSimplify would serve as + well.) For this request type, a successful result must be + a SELECT Query tree, which will replace + the FROM item as though a sub-select had been + written instead. The Query tree must appear as it would after parse + analysis and rewrite processing. One way to ensure that that's true + is to build a SQL string then feed it + through pg_parse_query + and pg_analyze_and_rewrite, or related + functions. PARAM_EXTERN Param + nodes can appear within the Query to represent the function's + arguments; they will be replaced by the actual argument expressions. + As with SupportRequestSimplify, it is the support + function's responsibility that the replacement Query be equivalent to + normal execution of the target function. + + For target functions that return boolean, it is often useful to estimate the fraction of rows that will be selected by a WHERE clause using that diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml index 7e23a7b6e4323..3d315df2f9803 100644 --- a/doc/src/sgml/xindex.sgml +++ b/doc/src/sgml/xindex.sgml @@ -598,7 +598,7 @@ 11 - stratnum + translate_cmptype translate compare types to strategy numbers used by the operator class (optional) 12 diff --git a/doc/src/sgml/xoper.sgml b/doc/src/sgml/xoper.sgml index 954a90d77d0ed..853b07a9f1489 100644 --- a/doc/src/sgml/xoper.sgml +++ b/doc/src/sgml/xoper.sgml @@ -21,7 +21,7 @@ PostgreSQL supports prefix - and infix operators. Operators can be + and binary (or infix) operators. Operators can be overloaded;overloadingoperators that is, the same operator name can be used for different operators that have different numbers and types of operands. When a query is diff --git a/doc/src/sgml/xtypes.sgml b/doc/src/sgml/xtypes.sgml index e67e5bdf4c4ac..df56d1c3ace68 100644 --- a/doc/src/sgml/xtypes.sgml +++ b/doc/src/sgml/xtypes.sgml @@ -89,7 +89,7 @@ complex_in(PG_FUNCTION_ARGS) errmsg("invalid input syntax for type %s: \"%s\"", "complex", str))); - result = (Complex *) palloc(sizeof(Complex)); + result = palloc_object(Complex); result->x = x; result->y = y; PG_RETURN_POINTER(result); diff --git a/meson.build b/meson.build index d142e3e408b38..d7c5193d4cee4 100644 --- a/meson.build +++ b/meson.build @@ -8,13 +8,14 @@ project('postgresql', ['c'], - version: '18beta1', + version: '19devel', license: 'PostgreSQL', - # We want < 0.56 for python 3.5 compatibility on old platforms. EPEL for - # RHEL 7 has 0.55. < 0.54 would require replacing some uses of the fs - # module, < 0.53 all uses of fs. So far there's no need to go to >=0.56. - meson_version: '>=0.54', + # We want < 0.62 for python 3.6 compatibility on old platforms. + # RHEL 8 has 0.58. < 0.57 would require various additional + # backward-compatibility conditionals. + # Meson 0.57.0 and 0.57.1 are buggy, therefore >=0.57.2. 
+ meson_version: '>=0.57.2', default_options: [ 'warning_level=1', #-Wall equivalent 'b_pch=false', @@ -349,6 +350,7 @@ missing = find_program('config/missing', native: true) cp = find_program('cp', required: false, native: true) xmllint_bin = find_program(get_option('XMLLINT'), native: true, required: false) xsltproc_bin = find_program(get_option('XSLTPROC'), native: true, required: false) +nm = find_program('nm', required: false, native: true) bison_flags = [] if bison.found() @@ -451,6 +453,14 @@ else segsize = (get_option('segsize') * 1024 * 1024 * 1024) / blocksize endif +# If we don't have largefile support, can't handle segment size >= 2GB. +if cc.sizeof('off_t', args: test_c_args) < 8 + segsize_bytes = segsize * blocksize + if segsize_bytes >= (2 * 1024 * 1024 * 1024) + error('Large file support is not enabled. Segment size cannot be larger than 1GB.') + endif +endif + cdata.set('BLCKSZ', blocksize, description: '''Size of a disk block --- this also limits the size of a tuple. You can set it bigger if you need bigger tuples (although TOAST should reduce the need @@ -545,6 +555,33 @@ dir_doc_extension = dir_doc / 'extension' # used, they need to be added to test_c_args as well. ############################################################### +# Do we need an option to enable C11? +c11_test = ''' +#if !defined __STDC_VERSION__ || __STDC_VERSION__ < 201112L +# error "Compiler does not advertise C11 conformance" +#endif +''' + +if not cc.compiles(c11_test, name: 'C11') + c11_ok = false + if cc.get_id() == 'msvc' + c11_test_args = ['/std:c11'] + else + c11_test_args = ['-std=gnu11', '-std=c11'] + endif + foreach arg : c11_test_args + if cc.compiles(c11_test, name: 'C11 with @0@'.format(arg), args: [arg]) + c11_ok = true + cflags += arg + break + endif + endforeach + if not c11_ok + error('C compiler does not support C11') + endif +endif + + postgres_inc = [include_directories(postgres_inc_d)] test_lib_d = postgres_lib_d test_c_args = cppflags + cflags @@ -943,10 +980,10 @@ if not libcurlopt.disabled() # libcurl and one of either epoll or kqueue. 
oauth_flow_supported = ( libcurl.found() - and (cc.check_header('sys/event.h', required: false, - args: test_c_args, include_directories: postgres_inc) - or cc.check_header('sys/epoll.h', required: false, - args: test_c_args, include_directories: postgres_inc)) + and (cc.has_header('sys/event.h', + args: test_c_args, include_directories: postgres_inc) + or cc.has_header('sys/epoll.h', + args: test_c_args, include_directories: postgres_inc)) ) if oauth_flow_supported @@ -990,6 +1027,12 @@ liburingopt = get_option('liburing') liburing = dependency('liburing', required: liburingopt) if liburing.found() cdata.set('USE_LIBURING', 1) + + if cc.has_function('io_uring_queue_init_mem', + dependencies: liburing, args: test_c_args) + cdata.set('HAVE_LIBURING_QUEUE_INIT_MEM', 1) + endif + endif @@ -1205,7 +1248,7 @@ if not perlopt.disabled() if cc.get_id() == 'msvc' # prevent binary mismatch between MSVC built plperl and Strawberry or # msys ucrt perl libraries - perl_v = run_command(perl, '-V').stdout() + perl_v = run_command(perl, '-V', check: false).stdout() if not perl_v.contains('USE_THREAD_SAFE_LOCALE') perl_ccflags += ['-DNO_THREAD_SAFE_LOCALE'] endif @@ -1284,7 +1327,7 @@ pyopt = get_option('plpython') python3_dep = not_found_dep if not pyopt.disabled() pm = import('python') - python3_inst = pm.find_installation(python.path(), required: pyopt) + python3_inst = pm.find_installation(python.full_path(), required: pyopt) if python3_inst.found() python3_dep = python3_inst.dependency(embed: true, required: pyopt) # Remove this check after we depend on Meson >= 1.1.0 @@ -1573,7 +1616,10 @@ if uuidopt != 'none' elif uuidopt == 'ossp' # In upstream, the package and library is called just 'uuid', but many # distros change it to 'ossp-uuid'. - uuid = dependency('ossp-uuid', 'uuid', required: false) + uuid = dependency('ossp-uuid', required: false) + if not uuid.found() + uuid = dependency('uuid', required: false) + endif uuidfunc = 'uuid_export' uuidheader = 'uuid.h' @@ -1693,49 +1739,6 @@ endif # Compiler tests ############################################################### -# Do we need -std=c99 to compile C99 code? We don't want to add -std=c99 -# unnecessarily, because we optionally rely on newer features. 
-c99_test = ''' -#include -#include -#include -#include - -struct named_init_test { - int a; - int b; -}; - -extern void structfunc(struct named_init_test); - -int main(int argc, char **argv) -{ - struct named_init_test nit = { - .a = 3, - .b = 5, - }; - - for (int loop_var = 0; loop_var < 3; loop_var++) - { - nit.a += nit.b; - } - - structfunc((struct named_init_test){1, 0}); - - return nit.a != 0; -} -''' - -if not cc.compiles(c99_test, name: 'c99', args: test_c_args) - if cc.compiles(c99_test, name: 'c99 with -std=c99', - args: test_c_args + ['-std=c99']) - test_c_args += '-std=c99' - cflags += '-std=c99' - else - error('C compiler does not support C99') - endif -endif - if host_machine.endian() == 'big' cdata.set('WORDS_BIGENDIAN', 1) endif @@ -1773,6 +1776,8 @@ cdata.set('SIZEOF_LONG', cc.sizeof('long', args: test_c_args)) cdata.set('SIZEOF_LONG_LONG', cc.sizeof('long long', args: test_c_args)) cdata.set('SIZEOF_VOID_P', cc.sizeof('void *', args: test_c_args)) cdata.set('SIZEOF_SIZE_T', cc.sizeof('size_t', args: test_c_args)) +cdata.set('SIZEOF_INTMAX_T', cc.sizeof('intmax_t', args: test_c_args, + prefix: '#include ')) # Check if __int128 is a working 128 bit integer type, and if so @@ -1818,7 +1823,7 @@ if cc.links(''' if not meson.is_cross_build() r = cc.run(''' /* This must match the corresponding code in c.h: */ - #if defined(__GNUC__) || defined(__SUNPRO_C) + #if defined(__GNUC__) #define pg_attribute_aligned(a) __attribute__((aligned(a))) #elif defined(_MSC_VER) #define pg_attribute_aligned(a) __declspec(align(a)) @@ -1875,23 +1880,21 @@ if cc.compiles(''' endif -# Check if the C compiler understands _Static_assert(), -# and define HAVE__STATIC_ASSERT if so. -# -# We actually check the syntax ({ _Static_assert(...) }), because we need -# gcc-style compound expressions to be able to wrap the thing into macros. +# Check if the C compiler supports GCC-style statement expressions. if cc.compiles(''' int main(int arg, char **argv) { ({ _Static_assert(1, "foo"); }); } ''', - name: '_Static_assert', + name: 'statement expressions', args: test_c_args) - cdata.set('HAVE__STATIC_ASSERT', 1) + cdata.set('HAVE_STATEMENT_EXPRESSIONS', 1) endif +# Select the format archetype to be used to check printf-type functions. +# # Need to check a call with %m because netbsd supports gnu_printf but emits a # warning for each use of %m. printf_attributes = ['gnu_printf', '__syslog__', 'printf'] @@ -1906,11 +1909,24 @@ attrib_error_args = cc.get_supported_arguments('-Werror=format', '-Werror=ignore foreach a : printf_attributes if cc.compiles(testsrc.format(a), args: test_c_args + attrib_error_args, name: 'format ' + a) - cdata.set('PG_PRINTF_ATTRIBUTE', a) + cdata.set('PG_C_PRINTF_ATTRIBUTE', a) break endif endforeach +# We need to repeat the test for C++ because gcc and clang prefer different +# format archetypes. +if llvm.found() + attrib_error_args = cpp.get_supported_arguments('-Werror=format', '-Werror=ignored-attributes') + foreach a : printf_attributes + if cpp.compiles(testsrc.format(a), + args: attrib_error_args, name: 'cppformat ' + a) + cdata.set('PG_CXX_PRINTF_ATTRIBUTE', a) + break + endif + endforeach +endif + if cc.has_function_attribute('visibility:default') and \ cc.has_function_attribute('visibility:hidden') @@ -1985,10 +2001,7 @@ if cc.links(''' cdata.set('HAVE__BUILTIN_OP_OVERFLOW', 1) endif - -# XXX: The configure.ac check for __cpuid() is broken, we don't copy that -# here. To prevent problems due to two detection methods working, stop -# checking after one. 
+# Check for __get_cpuid() and __cpuid().
 if cc.links('''
     #include <cpuid.h>
     int main(int arg, char **argv)
@@ -2156,13 +2169,20 @@
 endforeach
 
 if cc.get_id() == 'msvc'
   cflags_warn += [
-    '/wd4018', # signed/unsigned mismatch
-    '/wd4244', # conversion from 'type1' to 'type2', possible loss of data
-    '/wd4273', # inconsistent DLL linkage
-    '/wd4101', # unreferenced local variable
-    '/wd4102', # unreferenced label
+    # Warnings to disable:
+    # from /W1:
     '/wd4090', # different 'modifier' qualifiers
+    # from /W2:
+    '/wd4244', # conversion from 'type1' to 'type2', possible loss of data
+    # from /W3:
+    '/wd4018', # signed/unsigned mismatch
+    '/wd4101', # unreferenced local variable [like -Wunused-variable, but there is no "unused" attribute, so too noisy]
     '/wd4267', # conversion from 'size_t' to 'type', possible loss of data
+
+    # Additional warnings to enable:
+    '/w24062', # enumerator 'identifier' in switch of enum 'enumeration' is not handled [like -Wswitch]
+    '/w24102', # unreferenced label [like -Wunused-label]
+    '/w24777', # 'function' : format string 'string' requires an argument of type 'type1', but variadic argument number has type 'type2' [like -Wformat]
   ]
 
   cppflags += [
@@ -2465,6 +2485,7 @@
 int main(void)
 {
     __m128i z;
 
+    x = _mm512_xor_si512(_mm512_zextsi128_si512(_mm_cvtsi32_si128(0)), x);
     y = _mm512_clmulepi64_epi128(x, y, 0);
     z = _mm_ternarylogic_epi64(
             _mm512_castsi512_si128(y),
@@ -2621,6 +2642,7 @@ header_checks = [
   'sys/signalfd.h',
   'sys/ucred.h',
   'termios.h',
+  'uchar.h',
   'ucred.h',
   'xlocale.h',
 ]
@@ -2811,13 +2833,12 @@ int main(void)
 endforeach
 
-# MSVC doesn't cope well with defining restrict to __restrict, the spelling it
-# understands, because it conflicts with __declspec(restrict). Therefore we
-# define pg_restrict to the appropriate definition, which presumably won't
-# conflict.
-#
-# We assume C99 support, so we don't need to make this conditional.
-cdata.set('pg_restrict', '__restrict')
+# Even though restrict is in C99 and should be supported by all
+# supported compilers, this indirection is useful because __restrict
+# also works in C++ in all supported compilers. (If not, then we
+# might have to write a real test.) (restrict is not part of the C++
+# standard.)
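To make the comment above concrete, here is a minimal sketch (illustrative, not from the patch; the #define guard stands in for what the cdata.set() line just below emits into pg_config.h):

/* With "#define restrict __restrict" in effect, restrict-qualified
 * prototypes compile both as C, where restrict is a C99 keyword, and as
 * C++, where plain restrict is not a keyword but __restrict is a widely
 * supported spelling (also understood by MSVC). */
#if defined(__cplusplus) || defined(_MSC_VER)
#define restrict __restrict
#endif

#include <stddef.h>

/* The qualifier promises dst and src never alias, which lets the
 * compiler generate more aggressive, vectorization-friendly code. */
static void
add_arrays(double *restrict dst, const double *restrict src, size_t n)
{
    for (size_t i = 0; i < n; i++)
        dst[i] += src[i];
}

int
main(void)
{
    double a[4] = {1, 2, 3, 4};
    double b[4] = {4, 3, 2, 1};

    add_arrays(a, b, 4);
    return (int) a[0] - 5;    /* exits 0 */
}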
+cdata.set('restrict', '__restrict') # Most libraries are included only if they demonstrably provide a function we @@ -3124,6 +3145,8 @@ gen_export_kwargs = { 'install': false, } +# command to create stamp files on all OSs +stamp_cmd = [python, '-c', 'import sys; open(sys.argv[1], "w")', '@OUTPUT0@'] ### @@ -3145,13 +3168,13 @@ gen_kwlist_cmd = [ ### if host_system == 'windows' - pg_ico = meson.source_root() / 'src' / 'port' / 'win32.ico' + pg_ico = meson.project_source_root() / 'src' / 'port' / 'win32.ico' win32ver_rc = files('src/port/win32ver.rc') rcgen = find_program('src/tools/rcgen', native: true) rcgen_base_args = [ '--srcdir', '@SOURCE_DIR@', - '--builddir', meson.build_root(), + '--builddir', meson.project_build_root(), '--rcout', '@OUTPUT0@', '--out', '@OUTPUT1@', '--input', '@INPUT@', @@ -3160,11 +3183,11 @@ if host_system == 'windows' if cc.get_argument_syntax() == 'msvc' rc = find_program('rc', required: true) - rcgen_base_args += ['--rc', rc.path()] + rcgen_base_args += ['--rc', rc.full_path()] rcgen_outputs = ['@BASENAME@.rc', '@BASENAME@.res'] else windres = find_program('windres', required: true) - rcgen_base_args += ['--windres', windres.path()] + rcgen_base_args += ['--windres', windres.full_path()] rcgen_outputs = ['@BASENAME@.rc', '@BASENAME@.obj'] endif @@ -3241,14 +3264,14 @@ subdir('src/port') frontend_common_code = declare_dependency( compile_args: ['-DFRONTEND'], include_directories: [postgres_inc], - sources: generated_headers, + sources: generated_headers_stamp, dependencies: [os_deps, zlib, zstd, lz4], ) backend_common_code = declare_dependency( compile_args: ['-DBUILDING_DLL'], include_directories: [postgres_inc], - sources: generated_headers, + sources: generated_headers_stamp, dependencies: [os_deps, zlib, zstd], ) @@ -3263,7 +3286,7 @@ shlib_code = declare_dependency( frontend_stlib_code = declare_dependency( include_directories: [postgres_inc], link_with: [common_static, pgport_static], - sources: generated_headers, + sources: generated_headers_stamp, dependencies: [os_deps, libintl], ) @@ -3271,7 +3294,7 @@ frontend_stlib_code = declare_dependency( frontend_shlib_code = declare_dependency( include_directories: [postgres_inc], link_with: [common_shlib, pgport_shlib], - sources: generated_headers, + sources: generated_headers_stamp, dependencies: [shlib_code, os_deps, libintl], ) @@ -3281,7 +3304,7 @@ frontend_shlib_code = declare_dependency( frontend_no_fe_utils_code = declare_dependency( include_directories: [postgres_inc], link_with: [common_static, pgport_static], - sources: generated_headers, + sources: generated_headers_stamp, dependencies: [os_deps, libintl], ) @@ -3296,6 +3319,7 @@ libpq_deps += [ ] libpq_oauth_deps += [ + thread_dep, libcurl, ] @@ -3308,7 +3332,7 @@ subdir('src/interfaces/libpq-oauth') frontend_code = declare_dependency( include_directories: [postgres_inc], link_with: [fe_utils, common_static, pgport_static], - sources: generated_headers, + sources: generated_headers_stamp, dependencies: [os_deps, libintl], ) @@ -3338,7 +3362,7 @@ backend_code = declare_dependency( include_directories: [postgres_inc], link_args: ldflags_be, link_with: [], - sources: generated_headers + generated_backend_headers, + sources: generated_backend_headers_stamp, dependencies: os_deps + backend_both_deps + backend_deps, ) @@ -3397,7 +3421,7 @@ foreach t1 : configure_files potentially_conflicting_files += meson.current_build_dir() / t endforeach foreach sub, fnames : generated_sources_ac - sub = meson.build_root() / sub + sub = meson.project_build_root() / 
sub foreach fname : fnames potentially_conflicting_files += sub / fname endforeach @@ -3463,6 +3487,13 @@ installed_targets = [ ecpg_targets, ] +if oauth_flow_supported + installed_targets += [ + libpq_oauth_so, + libpq_oauth_st, + ] +endif + # all targets that require building code all_built = [ installed_targets, @@ -3497,7 +3528,7 @@ run_target('install-test-files', ############################################################### # DESTDIR for the installation we'll run tests in -test_install_destdir = meson.build_root() / 'tmp_install/' +test_install_destdir = meson.project_build_root() / 'tmp_install/' # DESTDIR + prefix appropriately munged if build_system != 'windows' @@ -3540,7 +3571,7 @@ test('install_test_files', is_parallel: false, suite: ['setup']) -test_result_dir = meson.build_root() / 'testrun' +test_result_dir = meson.project_build_root() / 'testrun' # XXX: pg_regress doesn't assign unique ports on windows. To avoid the @@ -3551,12 +3582,12 @@ testport = 40000 test_env = environment() -test_initdb_template = meson.build_root() / 'tmp_install' / 'initdb-template' +test_initdb_template = meson.project_build_root() / 'tmp_install' / 'initdb-template' test_env.set('PG_REGRESS', pg_regress.full_path()) test_env.set('REGRESS_SHLIB', regress_module.full_path()) test_env.set('INITDB_TEMPLATE', test_initdb_template) # for Cluster.pm's portlock logic -test_env.set('top_builddir', meson.build_root()) +test_env.set('top_builddir', meson.project_build_root()) # Add the temporary installation to the library search path on platforms where # that works (everything but windows, basically). On windows everything @@ -3600,26 +3631,20 @@ sys.exit(sp.returncode) # Test Generation ############################################################### -# When using a meson version understanding exclude_suites, define a -# 'tmp_install' test setup (the default) that excludes tests running against a -# pre-existing install and a 'running' setup that conflicts with creation of -# the temporary installation and tap tests (which don't support running -# against a running server). +# Define a 'tmp_install' test setup (the default) that excludes tests +# running against a pre-existing install and a 'running' setup that +# conflicts with creation of the temporary installation and tap tests +# (which don't support running against a running server). running_suites = [] install_suites = [] -if meson.version().version_compare('>=0.57') - runningcheck = true -else - runningcheck = false -endif testwrap = files('src/tools/testwrap') foreach test_dir : tests testwrap_base = [ testwrap, - '--basedir', meson.build_root(), + '--basedir', meson.project_build_root(), '--srcdir', test_dir['sd'], # Some test suites are not run by default but can be run if selected by the # user via variable PG_TEST_EXTRA. 
Pass configuration time value of @@ -3668,11 +3693,9 @@ foreach test_dir : tests '--dbname', dbname, ] + t.get('regress_args', []) - test_selection = [] - if t.has_key('schedule') - test_selection += ['--schedule', t['schedule'],] - endif + test_schedule = t.get('schedule', []) + test_selection = [] if kind == 'isolation' test_selection += t.get('specs', []) else @@ -3696,12 +3719,13 @@ foreach test_dir : tests testwrap_base, '--testgroup', test_group, '--testname', kind, + '--schedule', test_schedule, + '--tests', test_selection, '--', test_command_base, '--outputdir', test_output, '--temp-instance', test_output / 'tmp_check', '--port', testport.to_string(), - test_selection, ], suite: test_group, kwargs: test_kwargs, @@ -3709,17 +3733,18 @@ foreach test_dir : tests install_suites += test_group # some tests can't support running against running DB - if runningcheck and t.get('runningcheck', true) + if t.get('runningcheck', true) test(test_group_running / kind, python, args: [ testwrap_base, '--testgroup', test_group_running, '--testname', kind, + '--schedule', test_schedule, + '--tests', test_selection, '--', test_command_base, '--outputdir', test_output_running, - test_selection, ], is_parallel: t.get('runningcheck-parallel', true), suite: test_group_running, @@ -3736,8 +3761,8 @@ foreach test_dir : tests endif test_command = [ - perl.path(), - '-I', meson.source_root() / 'src/test/perl', + perl.full_path(), + '-I', meson.project_source_root() / 'src/test/perl', '-I', test_dir['sd'], ] @@ -3792,13 +3817,11 @@ foreach test_dir : tests endforeach # directories with tests # repeat condition so meson realizes version dependency -if meson.version().version_compare('>=0.57') - add_test_setup('tmp_install', - is_default: true, - exclude_suites: running_suites) - add_test_setup('running', - exclude_suites: ['setup'] + install_suites) -endif +add_test_setup('tmp_install', + is_default: true, + exclude_suites: running_suites) +add_test_setup('running', + exclude_suites: ['setup'] + install_suites) @@ -3855,7 +3878,7 @@ tar_gz = custom_target('tar.gz', '--format', 'tar.gz', '-9', '--prefix', distdir + '/', - '-o', join_paths(meson.build_root(), '@OUTPUT@'), + '-o', join_paths(meson.project_build_root(), '@OUTPUT@'), pg_git_revision], output: distdir + '.tar.gz', ) @@ -3865,11 +3888,11 @@ if bzip2.found() build_always_stale: true, command: [git, '-C', '@SOURCE_ROOT@', '-c', 'core.autocrlf=false', - '-c', 'tar.tar.bz2.command="@0@" -c'.format(bzip2.path()), + '-c', 'tar.tar.bz2.command="@0@" -c'.format(bzip2.full_path()), 'archive', '--format', 'tar.bz2', '--prefix', distdir + '/', - '-o', join_paths(meson.build_root(), '@OUTPUT@'), + '-o', join_paths(meson.project_build_root(), '@OUTPUT@'), pg_git_revision], output: distdir + '.tar.bz2', ) @@ -3886,10 +3909,7 @@ alias_target('pgdist', [tar_gz, tar_bz2]) # But not if we are in a subproject, in case the parent project wants to # create a dist using the standard Meson command. 
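As an aside on the dist machinery here (a usage sketch, not part of the patch): the add_dist_script() call guarded below registers a script that simply exits nonzero, so a plain "meson dist" fails on purpose; release tarballs come from the alias target defined above instead, for example:

    meson compile -C build pgdist    # builds both the .tar.gz and the .tar.bz2

The subproject guard below keeps that deliberate failure out of parent projects that embed PostgreSQL and may legitimately want their own "meson dist" to work.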
if not meson.is_subproject() - # We can only pass the identifier perl here when we depend on >= 0.55 - if meson.version().version_compare('>=0.55') - meson.add_dist_script(perl, '-e', 'exit 1') - endif + meson.add_dist_script(perl, '-e', 'exit 1') endif @@ -3898,106 +3918,102 @@ endif # The End, The End, My Friend ############################################################### -if meson.version().version_compare('>=0.57') +summary( + { + 'data block size': '@0@ kB'.format(cdata.get('BLCKSZ') / 1024), + 'WAL block size': '@0@ kB'.format(cdata.get('XLOG_BLCKSZ') / 1024), + 'segment size': get_option('segsize_blocks') != 0 ? + '@0@ blocks'.format(cdata.get('RELSEG_SIZE')) : + '@0@ GB'.format(get_option('segsize')), + }, + section: 'Data layout', +) + +summary( + { + 'host system': '@0@ @1@'.format(host_system, host_cpu), + 'build system': '@0@ @1@'.format(build_machine.system(), + build_machine.cpu_family()), + }, + section: 'System', +) - summary( - { - 'data block size': '@0@ kB'.format(cdata.get('BLCKSZ') / 1024), - 'WAL block size': '@0@ kB'.format(cdata.get('XLOG_BLCKSZ') / 1024), - 'segment size': get_option('segsize_blocks') != 0 ? - '@0@ blocks'.format(cdata.get('RELSEG_SIZE')) : - '@0@ GB'.format(get_option('segsize')), - }, - section: 'Data layout', - ) +summary( + { + 'linker': '@0@'.format(cc.get_linker_id()), + 'C compiler': '@0@ @1@'.format(cc.get_id(), cc.version()), + }, + section: 'Compiler', +) - summary( - { - 'host system': '@0@ @1@'.format(host_system, host_cpu), - 'build system': '@0@ @1@'.format(build_machine.system(), - build_machine.cpu_family()), - }, - section: 'System', - ) +summary( + { + 'CPP FLAGS': ' '.join(cppflags), + 'C FLAGS, functional': ' '.join(cflags), + 'C FLAGS, warnings': ' '.join(cflags_warn), + 'C FLAGS, modules': ' '.join(cflags_mod), + 'C FLAGS, user specified': ' '.join(get_option('c_args')), + 'LD FLAGS': ' '.join(ldflags + get_option('c_link_args')), + }, + section: 'Compiler Flags', +) +if llvm.found() summary( { - 'linker': '@0@'.format(cc.get_linker_id()), - 'C compiler': '@0@ @1@'.format(cc.get_id(), cc.version()), + 'C++ compiler': '@0@ @1@'.format(cpp.get_id(), cpp.version()), }, section: 'Compiler', ) summary( { - 'CPP FLAGS': ' '.join(cppflags), - 'C FLAGS, functional': ' '.join(cflags), - 'C FLAGS, warnings': ' '.join(cflags_warn), - 'C FLAGS, modules': ' '.join(cflags_mod), - 'C FLAGS, user specified': ' '.join(get_option('c_args')), - 'LD FLAGS': ' '.join(ldflags + get_option('c_link_args')), + 'C++ FLAGS, functional': ' '.join(cxxflags), + 'C++ FLAGS, warnings': ' '.join(cxxflags_warn), + 'C++ FLAGS, user specified': ' '.join(get_option('cpp_args')), }, section: 'Compiler Flags', ) +endif - if llvm.found() - summary( - { - 'C++ compiler': '@0@ @1@'.format(cpp.get_id(), cpp.version()), - }, - section: 'Compiler', - ) - - summary( - { - 'C++ FLAGS, functional': ' '.join(cxxflags), - 'C++ FLAGS, warnings': ' '.join(cxxflags_warn), - 'C++ FLAGS, user specified': ' '.join(get_option('cpp_args')), - }, - section: 'Compiler Flags', - ) - endif - - summary( - { - 'bison': '@0@ @1@'.format(bison.full_path(), bison_version), - 'dtrace': dtrace, - 'flex': '@0@ @1@'.format(flex.full_path(), flex_version), - }, - section: 'Programs', - ) - - summary( - { - 'bonjour': bonjour, - 'bsd_auth': bsd_auth, - 'docs': docs_dep, - 'docs_pdf': docs_pdf_dep, - 'gss': gssapi, - 'icu': icu, - 'ldap': ldap, - 'libcurl': libcurl, - 'libnuma': libnuma, - 'liburing': liburing, - 'libxml': libxml, - 'libxslt': libxslt, - 'llvm': llvm, - 'lz4': lz4, - 'nls': 
libintl, - 'openssl': ssl, - 'pam': pam, - 'plperl': [perl_dep, perlversion], - 'plpython': python3_dep, - 'pltcl': tcl_dep, - 'readline': readline, - 'selinux': selinux, - 'systemd': systemd, - 'uuid': uuid, - 'zlib': zlib, - 'zstd': zstd, - }, - section: 'External libraries', - list_sep: ' ', - ) +summary( + { + 'bison': '@0@ @1@'.format(bison.full_path(), bison_version), + 'dtrace': dtrace, + 'flex': '@0@ @1@'.format(flex.full_path(), flex_version), + }, + section: 'Programs', +) -endif +summary( + { + 'bonjour': bonjour, + 'bsd_auth': bsd_auth, + 'docs': docs_dep, + 'docs_pdf': docs_pdf_dep, + 'gss': gssapi, + 'icu': icu, + 'ldap': ldap, + 'libcurl': libcurl, + 'libnuma': libnuma, + 'liburing': liburing, + 'libxml': libxml, + 'libxslt': libxslt, + 'llvm': llvm, + 'lz4': lz4, + 'nls': libintl, + 'openssl': ssl, + 'pam': pam, + 'plperl': [perl_dep, perlversion], + 'plpython': python3_dep, + 'pltcl': tcl_dep, + 'readline': readline, + 'selinux': selinux, + 'systemd': systemd, + 'uuid': uuid, + 'zlib': zlib, + 'zstd': zstd, + }, + section: 'External libraries', + list_sep: ' ', +) diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 04952b533ded9..371cd7eba2c18 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -254,7 +254,7 @@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ PG_SYSROOT = @PG_SYSROOT@ -override CPPFLAGS := $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) $(CPPFLAGS) +override CPPFLAGS += $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) ifdef PGXS override CPPFLAGS := -I$(includedir_server) -I$(includedir_internal) $(CPPFLAGS) @@ -267,7 +267,6 @@ endif # not PGXS CC = @CC@ GCC = @GCC@ -SUN_STUDIO_CC = @SUN_STUDIO_CC@ CXX = @CXX@ CFLAGS = @CFLAGS@ CFLAGS_SL = @CFLAGS_SL@ @@ -293,6 +292,7 @@ FLEX = @FLEX@ FLEXFLAGS = @FLEXFLAGS@ $(LFLAGS) DTRACE = @DTRACE@ DTRACEFLAGS = @DTRACEFLAGS@ +NM = @NM@ ZIC = @ZIC@ # Linking @@ -796,9 +796,6 @@ ifeq ($(PORTNAME),win32) LIBS += -lws2_32 endif -# Not really standard libc functions, used by the backend. -TAS = @TAS@ - ########################################################################## # diff --git a/src/Makefile.shlib b/src/Makefile.shlib index fa81f6ffdd6d9..3825af5b22872 100644 --- a/src/Makefile.shlib +++ b/src/Makefile.shlib @@ -112,7 +112,7 @@ ifeq ($(PORTNAME), darwin) ifneq ($(SO_MAJOR_VERSION), 0) version_link = -compatibility_version $(SO_MAJOR_VERSION) -current_version $(SO_MAJOR_VERSION).$(SO_MINOR_VERSION) endif - LINK.shared = $(COMPILER) -dynamiclib -install_name '$(libdir)/lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX)' $(version_link) $(exported_symbols_list) + LINK.shared = $(COMPILER) -dynamiclib -install_name '$(libdir)/lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX)' $(version_link) shlib = lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX) shlib_major = lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX) else @@ -122,7 +122,7 @@ ifeq ($(PORTNAME), darwin) BUILD.exports = $(AWK) '/^[^\#]/ {printf "_%s\n",$$1}' $< >$@ exports_file = $(SHLIB_EXPORTS:%.txt=%.list) ifneq (,$(exports_file)) - exported_symbols_list = -exported_symbols_list $(exports_file) + LINK.shared += -exported_symbols_list $(exports_file) endif endif diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 01e1db7f856be..45d306037a4bb 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -68,7 +68,7 @@ typedef struct BrinShared int scantuplesortstates; /* Query ID, for report in worker processes */ - uint64 queryid; + int64 queryid; /* * workersdonecv is used to monitor the progress of workers. 
All parallel @@ -318,7 +318,7 @@ initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context); - bistate = palloc0(sizeof(BrinInsertState)); + bistate = palloc0_object(BrinInsertState); bistate->bis_desc = brin_build_desc(idxRel); bistate->bis_rmAccess = brinRevmapInitialize(idxRel, &bistate->bis_pages_per_range); @@ -573,7 +573,6 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) Relation heapRel; BrinOpaque *opaque; BlockNumber nblocks; - BlockNumber heapBlk; int64 totalpages = 0; FmgrInfo *consistentFn; MemoryContext oldcxt; @@ -735,9 +734,10 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) /* * Now scan the revmap. We start by querying for heap page 0, * incrementing by the number of pages per range; this gives us a full - * view of the table. + * view of the table. We make use of uint64 for heapBlk as a BlockNumber + * could wrap for tables with close to 2^32 pages. */ - for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) + for (uint64 heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) { bool addrange; bool gottuple = false; @@ -749,7 +749,7 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) MemoryContextReset(perRangeCxt); - tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, + tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, (BlockNumber) heapBlk, &buf, &off, &size, BUFFER_LOCK_SHARE); if (tup) { @@ -924,7 +924,7 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) /* add the pages in the range to the output bitmap, if needed */ if (addrange) { - BlockNumber pageno; + uint64 pageno; for (pageno = heapBlk; pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1; @@ -1185,7 +1185,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) { SortCoordinate coordinate; - coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); + coordinate = palloc0_object(SortCoordinateData); coordinate->isWorker = false; coordinate->nParticipants = state->bs_leader->nparticipanttuplesorts; @@ -1478,8 +1478,8 @@ brin_summarize_range(PG_FUNCTION_ARGS) /* Restore userid and security context */ SetUserIdAndSecContext(save_userid, save_sec_context); - relation_close(indexRel, ShareUpdateExclusiveLock); - relation_close(heapRel, ShareUpdateExclusiveLock); + index_close(indexRel, ShareUpdateExclusiveLock); + table_close(heapRel, ShareUpdateExclusiveLock); PG_RETURN_INT32((int32) numSummarized); } @@ -1568,8 +1568,8 @@ brin_desummarize_range(PG_FUNCTION_ARGS) errmsg("index \"%s\" is not valid", RelationGetRelationName(indexRel)))); - relation_close(indexRel, ShareUpdateExclusiveLock); - relation_close(heapRel, ShareUpdateExclusiveLock); + index_close(indexRel, ShareUpdateExclusiveLock); + table_close(heapRel, ShareUpdateExclusiveLock); PG_RETURN_VOID(); } @@ -1608,7 +1608,7 @@ brin_build_desc(Relation rel) opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO); opcinfo[keyno] = (BrinOpcInfo *) - DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid)); + DatumGetPointer(FunctionCall1(opcInfoFn, ObjectIdGetDatum(attr->atttypid))); totalstored += opcinfo[keyno]->oi_nstored; } @@ -2171,28 +2171,42 @@ union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b) static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy) { - BlockNumber nblocks; - BlockNumber blkno; + BlockRangeReadStreamPrivate p; + ReadStream *stream; + Buffer buf; + + p.current_blocknum = 0; + p.last_exclusive = 
RelationGetNumberOfBlocks(idxrel); + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. + */ + stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE | + READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, + strategy, + idxrel, + MAIN_FORKNUM, + block_range_read_stream_cb, + &p, + 0); /* * Scan the index in physical order, and clean up any possible mess in * each page. */ - nblocks = RelationGetNumberOfBlocks(idxrel); - for (blkno = 0; blkno < nblocks; blkno++) + while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer) { - Buffer buf; - CHECK_FOR_INTERRUPTS(); - buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno, - RBM_NORMAL, strategy); - brin_page_cleanup(idxrel, buf); ReleaseBuffer(buf); } + read_stream_end(stream); + /* * Update all upper pages in the index's FSM, as well. This ensures not * only that we propagate leaf-page FSM updates made by brin_page_cleanup, @@ -2262,7 +2276,7 @@ add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], - nulls[keyno]); + BoolGetDatum(nulls[keyno])); /* if that returned true, we need to insert the updated tuple */ modified |= DatumGetBool(result); @@ -2370,7 +2384,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, Size estsort; BrinShared *brinshared; Sharedsort *sharedsort; - BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader)); + BrinLeader *brinleader = palloc0_object(BrinLeader); WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; @@ -2814,7 +2828,7 @@ _brin_parallel_scan_and_build(BrinBuildState *state, IndexInfo *indexInfo; /* Initialize local tuplesort coordination state */ - coordinate = palloc0(sizeof(SortCoordinateData)); + coordinate = palloc0_object(SortCoordinateData); coordinate->isWorker = true; coordinate->nParticipants = -1; coordinate->sharedsort = sharedsort; diff --git a/src/backend/access/brin/brin_bloom.c b/src/backend/access/brin/brin_bloom.c index 82b425ce37daa..64dbb7b853265 100644 --- a/src/backend/access/brin/brin_bloom.c +++ b/src/backend/access/brin/brin_bloom.c @@ -114,6 +114,7 @@ */ #include "postgres.h" +#include #include #include "access/brin.h" @@ -126,6 +127,7 @@ #include "catalog/pg_am.h" #include "catalog/pg_type.h" #include "common/hashfn.h" +#include "port/pg_bitutils.h" #include "utils/fmgrprotos.h" #include "utils/rel.h" @@ -540,7 +542,7 @@ brin_bloom_add_value(PG_FUNCTION_ARGS) BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); Datum newval = PG_GETARG_DATUM(2); - bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3); BloomOptions *opts = (BloomOptions *) PG_GET_OPCLASS_OPTIONS(); Oid colloid = PG_GET_COLLATION(); FmgrInfo *hashFn; diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c index d21ab3a668cce..79c5a0aa18578 100644 --- a/src/backend/access/brin/brin_minmax.c +++ b/src/backend/access/brin/brin_minmax.c @@ -66,7 +66,7 @@ brin_minmax_add_value(PG_FUNCTION_ARGS) BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); Datum newval = PG_GETARG_DATUM(2); - bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3); Oid colloid = PG_GET_COLLATION(); FmgrInfo *cmpFn; Datum compar; @@ -225,8 +225,8 @@ 
brin_minmax_union(PG_FUNCTION_ARGS) /* Adjust minimum, if B's min is less than A's min */ finfo = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, BTLessStrategyNumber); - needsadj = FunctionCall2Coll(finfo, colloid, col_b->bv_values[0], - col_a->bv_values[0]); + needsadj = DatumGetBool(FunctionCall2Coll(finfo, colloid, col_b->bv_values[0], + col_a->bv_values[0])); if (needsadj) { if (!attr->attbyval) @@ -238,8 +238,8 @@ brin_minmax_union(PG_FUNCTION_ARGS) /* Adjust maximum, if B's max is greater than A's max */ finfo = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, BTGreaterStrategyNumber); - needsadj = FunctionCall2Coll(finfo, colloid, col_b->bv_values[1], - col_a->bv_values[1]); + needsadj = DatumGetBool(FunctionCall2Coll(finfo, colloid, col_b->bv_values[1], + col_a->bv_values[1])); if (needsadj) { if (!attr->attbyval) diff --git a/src/backend/access/brin/brin_minmax_multi.c b/src/backend/access/brin/brin_minmax_multi.c index 0d1507a2a3624..0298a9da8ba74 100644 --- a/src/backend/access/brin/brin_minmax_multi.c +++ b/src/backend/access/brin/brin_minmax_multi.c @@ -276,7 +276,7 @@ static int compare_values(const void *a, const void *b, void *arg); * function (which should be BTLessStrategyNumber). */ static void -AssertArrayOrder(FmgrInfo *cmp, Oid colloid, Datum *values, int nvalues) +AssertArrayOrder(FmgrInfo *cmp, Oid colloid, const Datum *values, int nvalues) { int i; Datum lt; @@ -624,7 +624,7 @@ brin_range_serialize(Ranges *range) for (i = 0; i < nvalues; i++) { - len += VARSIZE_ANY(range->values[i]); + len += VARSIZE_ANY(DatumGetPointer(range->values[i])); } } else if (typlen == -2) /* cstring */ @@ -1340,7 +1340,7 @@ build_distances(FmgrInfo *distanceFn, Oid colloid, return NULL; ndistances = (neranges - 1); - distances = (DistanceValue *) palloc0(sizeof(DistanceValue) * ndistances); + distances = palloc0_array(DistanceValue, ndistances); /* * Walk through the ranges once and compute the distance between the @@ -1504,7 +1504,7 @@ reduce_expanded_ranges(ExpandedRange *eranges, int neranges, /* allocate space for the boundary values */ nvalues = 0; - values = (Datum *) palloc(sizeof(Datum) * max_values); + values = palloc_array(Datum, max_values); /* add the global min/max values, from the first/last range */ values[nvalues++] = eranges[0].minval; @@ -1992,8 +1992,8 @@ brin_minmax_multi_distance_tid(PG_FUNCTION_ARGS) double da1, da2; - ItemPointer pa1 = (ItemPointer) PG_GETARG_DATUM(0); - ItemPointer pa2 = (ItemPointer) PG_GETARG_DATUM(1); + ItemPointer pa1 = (ItemPointer) PG_GETARG_POINTER(0); + ItemPointer pa2 = (ItemPointer) PG_GETARG_POINTER(1); /* * We know the values are range boundaries, but the range may be collapsed @@ -2032,7 +2032,7 @@ brin_minmax_multi_distance_numeric(PG_FUNCTION_ARGS) d = DirectFunctionCall2(numeric_sub, a2, a1); /* a2 - a1 */ - PG_RETURN_FLOAT8(DirectFunctionCall1(numeric_float8, d)); + PG_RETURN_DATUM(DirectFunctionCall1(numeric_float8, d)); } /* @@ -2414,7 +2414,7 @@ brin_minmax_multi_add_value(PG_FUNCTION_ARGS) BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); Datum newval = PG_GETARG_DATUM(2); - bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3); MinMaxMultiOptions *opts = (MinMaxMultiOptions *) PG_GET_OPCLASS_OPTIONS(); Oid colloid = PG_GET_COLLATION(); bool modified = false; diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index 
6d8dd1512d6a7..c80b87da3d248 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -176,7 +176,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, brin_can_do_samepage_update(oldbuf, origsz, newsz)) { START_CRIT_SECTION(); - if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz)) + if (!PageIndexTupleOverwrite(oldpage, oldoff, newtup, newsz)) elog(ERROR, "failed to replace BRIN tuple"); MarkBufferDirty(oldbuf); @@ -250,8 +250,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, brin_page_init(newpage, BRIN_PAGETYPE_REGULAR); PageIndexTupleDeleteNoCompact(oldpage, oldoff); - newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz, - InvalidOffsetNumber, false, false); + newoff = PageAddItem(newpage, newtup, newsz, InvalidOffsetNumber, false, false); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple to new page"); MarkBufferDirty(oldbuf); @@ -341,7 +340,7 @@ brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz) OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, - BrinTuple *tup, Size itemsz) + const BrinTuple *tup, Size itemsz) { Page page; BlockNumber blk; @@ -408,8 +407,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, START_CRIT_SECTION(); if (extended) brin_page_init(page, BRIN_PAGETYPE_REGULAR); - off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber, - false, false); + off = PageAddItem(page, tup, itemsz, InvalidOffsetNumber, false, false); if (off == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple to new page"); MarkBufferDirty(*buffer); @@ -893,7 +891,11 @@ brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer) page = BufferGetPage(buffer); brin_page_init(page, BRIN_PAGETYPE_REGULAR); MarkBufferDirty(buffer); - log_newpage_buffer(buffer, true); + + /* XLOG stuff */ + if (RelationNeedsWAL(idxrel)) + log_newpage_buffer(buffer, true); + END_CRIT_SECTION(); /* diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c index 4e380ecc71097..644f9889a8aec 100644 --- a/src/backend/access/brin/brin_revmap.c +++ b/src/backend/access/brin/brin_revmap.c @@ -79,7 +79,7 @@ brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange) page = BufferGetPage(meta); metadata = (BrinMetaPageData *) PageGetContents(page); - revmap = palloc(sizeof(BrinRevmap)); + revmap = palloc_object(BrinRevmap); revmap->rm_irel = idxrel; revmap->rm_pagesPerRange = metadata->pagesPerRange; revmap->rm_lastRevmapPage = metadata->lastRevmapPage; diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c index 861f397e6db58..43850ce8f48c5 100644 --- a/src/backend/access/brin/brin_tuple.c +++ b/src/backend/access/brin/brin_tuple.c @@ -119,13 +119,12 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, Assert(brdesc->bd_totalstored > 0); - values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored); - nulls = (bool *) palloc0(sizeof(bool) * brdesc->bd_totalstored); - phony_nullbitmap = (bits8 *) - palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored)); + values = palloc_array(Datum, brdesc->bd_totalstored); + nulls = palloc0_array(bool, brdesc->bd_totalstored); + phony_nullbitmap = palloc_array(bits8, BITMAPLEN(brdesc->bd_totalstored)); #ifdef TOAST_INDEX_HACK - untoasted_values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored); + 
untoasted_values = palloc_array(Datum, brdesc->bd_totalstored); #endif /* @@ -488,9 +487,9 @@ brin_new_memtuple(BrinDesc *brdesc) sizeof(BrinValues) * brdesc->bd_tupdesc->natts); dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored); - dtup->bt_values = palloc(sizeof(Datum) * brdesc->bd_totalstored); - dtup->bt_allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); - dtup->bt_hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + dtup->bt_values = palloc_array(Datum, brdesc->bd_totalstored); + dtup->bt_allnulls = palloc_array(bool, brdesc->bd_tupdesc->natts); + dtup->bt_hasnulls = palloc_array(bool, brdesc->bd_tupdesc->natts); dtup->bt_empty_range = true; diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index 85db2f0fd5ace..c8b4decb3ec4e 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -31,7 +31,7 @@ brin_xlog_createidx(XLogReaderState *record) /* create the index' metapage */ buf = XLogInitBufferForRedo(record, 0); Assert(BufferIsValid(buf)); - page = (Page) BufferGetPage(buf); + page = BufferGetPage(buf); brin_metapage_init(page, xlrec->pagesPerRange, xlrec->version); PageSetLSN(page, lsn); MarkBufferDirty(buf); @@ -82,12 +82,12 @@ brin_xlog_insert_update(XLogReaderState *record, Assert(tuple->bt_blkno == xlrec->heapBlk); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); offnum = xlrec->offnum; if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "brin_xlog_insert_update: invalid max offset number"); - offnum = PageAddItem(page, (Item) tuple, tuplen, offnum, true, false); + offnum = PageAddItem(page, tuple, tuplen, offnum, true, false); if (offnum == InvalidOffsetNumber) elog(PANIC, "brin_xlog_insert_update: failed to add tuple"); @@ -104,7 +104,7 @@ brin_xlog_insert_update(XLogReaderState *record, ItemPointerData tid; ItemPointerSet(&tid, regpgno, xlrec->offnum); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, tid); @@ -146,7 +146,7 @@ brin_xlog_update(XLogReaderState *record) Page page; OffsetNumber offnum; - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); offnum = xlrec->oldOffnum; @@ -185,11 +185,11 @@ brin_xlog_samepage_update(XLogReaderState *record) brintuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); offnum = xlrec->offnum; - if (!PageIndexTupleOverwrite(page, offnum, (Item) brintuple, tuplen)) + if (!PageIndexTupleOverwrite(page, offnum, brintuple, tuplen)) elog(PANIC, "brin_xlog_samepage_update: failed to replace tuple"); PageSetLSN(page, lsn); @@ -254,7 +254,7 @@ brin_xlog_revmap_extend(XLogReaderState *record) */ buf = XLogInitBufferForRedo(record, 1); - page = (Page) BufferGetPage(buf); + page = BufferGetPage(buf); brin_page_init(page, BRIN_PAGETYPE_REVMAP); PageSetLSN(page, lsn); diff --git a/src/backend/access/common/attmap.c b/src/backend/access/common/attmap.c index 4901ebecef73f..4cfa9ee78f990 100644 --- a/src/backend/access/common/attmap.c +++ b/src/backend/access/common/attmap.c @@ -41,9 +41,9 @@ make_attrmap(int maplen) { AttrMap *res; - res = (AttrMap *) palloc0(sizeof(AttrMap)); + res = palloc0_object(AttrMap); res->maplen = maplen; - res->attnums = (AttrNumber *) palloc0(sizeof(AttrNumber) * maplen); + res->attnums = palloc0_array(AttrNumber, maplen); return res; } diff --git a/src/backend/access/common/heaptuple.c 
b/src/backend/access/common/heaptuple.c index 969d1028cae89..b7820d692e2ea 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -105,7 +105,7 @@ missing_hash(const void *key, Size keysize) { const missing_cache_key *entry = (missing_cache_key *) key; - return hash_bytes((const unsigned char *) entry->value, entry->len); + return hash_bytes((const unsigned char *) DatumGetPointer(entry->value), entry->len); } static int @@ -123,7 +123,7 @@ missing_match(const void *key1, const void *key2, Size keysize) } static void -init_missing_cache() +init_missing_cache(void) { HASHCTL hash_ctl; @@ -189,7 +189,7 @@ getmissingattr(TupleDesc tupleDesc, if (att->attlen > 0) key.len = att->attlen; else - key.len = VARSIZE_ANY(attrmiss->am_value); + key.len = VARSIZE_ANY(DatumGetPointer(attrmiss->am_value)); key.value = attrmiss->am_value; entry = hash_search(missing_cache, &key, HASH_ENTER, &found); @@ -901,9 +901,9 @@ expand_tuple(HeapTuple *targetHeapTuple, att->attlen, attrmiss[attnum].am_value); - targetDataLen = att_addlength_pointer(targetDataLen, - att->attlen, - attrmiss[attnum].am_value); + targetDataLen = att_addlength_datum(targetDataLen, + att->attlen, + attrmiss[attnum].am_value); } else { @@ -1230,8 +1230,8 @@ heap_modify_tuple(HeapTuple tuple, * O(N^2) if there are many non-replaced columns, so it seems better to * err on the side of linear cost. */ - values = (Datum *) palloc(numberOfAttributes * sizeof(Datum)); - isnull = (bool *) palloc(numberOfAttributes * sizeof(bool)); + values = palloc_array(Datum, numberOfAttributes); + isnull = palloc_array(bool, numberOfAttributes); heap_deform_tuple(tuple, tupleDesc, values, isnull); @@ -1292,8 +1292,8 @@ heap_modify_tuple_by_cols(HeapTuple tuple, * allocate and fill values and isnull arrays from the tuple, then replace * selected columns from the input arrays. */ - values = (Datum *) palloc(numberOfAttributes * sizeof(Datum)); - isnull = (bool *) palloc(numberOfAttributes * sizeof(bool)); + values = palloc_array(Datum, numberOfAttributes); + isnull = palloc_array(bool, numberOfAttributes); heap_deform_tuple(tuple, tupleDesc, values, isnull); @@ -1502,7 +1502,6 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor, * Allocate and zero the space needed. */ mem = palloc0(len + extra); - memset(mem, 0, extra); tuple = (MinimalTuple) (mem + extra); /* diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c index 1986b943a28be..3efa3889c6f5b 100644 --- a/src/backend/access/common/indextuple.c +++ b/src/backend/access/common/indextuple.c @@ -172,7 +172,7 @@ index_form_tuple_context(TupleDesc tupleDescriptor, values, #endif isnull, - (char *) tp + hoff, + tp + hoff, data_size, &tupmask, (hasnull ? (bits8 *) tp + sizeof(IndexTupleData) : NULL)); diff --git a/src/backend/access/common/printsimple.c b/src/backend/access/common/printsimple.c index f346ab3e8125b..756f1c4822d72 100644 --- a/src/backend/access/common/printsimple.c +++ b/src/backend/access/common/printsimple.c @@ -23,6 +23,7 @@ #include "libpq/pqformat.h" #include "libpq/protocol.h" #include "utils/builtins.h" +#include "varatt.h" /* * At startup time, send a RowDescription message. 
@@ -123,7 +124,7 @@ printsimple(TupleTableSlot *slot, DestReceiver *self) case OIDOID: { - Oid num = ObjectIdGetDatum(value); + Oid num = DatumGetObjectId(value); char str[10]; /* 10 digits */ int len; diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index 830a3d883aa2e..47b2b2d2335cc 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -22,6 +22,7 @@ #include "utils/lsyscache.h" #include "utils/memdebug.h" #include "utils/memutils.h" +#include "varatt.h" static void printtup_startup(DestReceiver *self, int operation, @@ -70,7 +71,7 @@ typedef struct DestReceiver * printtup_create_DR(CommandDest dest) { - DR_printtup *self = (DR_printtup *) palloc0(sizeof(DR_printtup)); + DR_printtup *self = palloc0_object(DR_printtup); self->pub.receiveSlot = printtup; /* might get changed later */ self->pub.rStartup = printtup_startup; @@ -350,7 +351,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self) */ if (thisState->typisvarlena) VALGRIND_CHECK_MEM_IS_DEFINED(DatumGetPointer(attr), - VARSIZE_ANY(attr)); + VARSIZE_ANY(DatumGetPointer(attr))); if (thisState->format == 0) { @@ -430,7 +431,7 @@ printatt(unsigned attributeId, value != NULL ? " = \"" : "", value != NULL ? value : "", value != NULL ? "\"" : "", - (unsigned int) (attributeP->atttypid), + attributeP->atttypid, attributeP->attlen, attributeP->atttypmod, attributeP->attbyval ? 't' : 'f'); diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 46c1dce222d10..31926d8a368ae 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -322,12 +322,21 @@ static relopt_int intRelOpts[] = { { "log_autovacuum_min_duration", - "Sets the minimum execution time above which autovacuum actions will be logged", + "Sets the minimum execution time above which vacuum actions by autovacuum will be logged", RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, ShareUpdateExclusiveLock }, -1, -1, INT_MAX }, + { + { + "log_autoanalyze_min_duration", + "Sets the minimum execution time above which analyze actions by autovacuum will be logged", + RELOPT_KIND_HEAP, + ShareUpdateExclusiveLock + }, + -1, -1, INT_MAX + }, { { "toast_tuple_target", @@ -767,7 +776,7 @@ register_reloptions_validator(local_relopts *relopts, relopts_validator validato static void add_local_reloption(local_relopts *relopts, relopt_gen *newoption, int offset) { - local_relopt *opt = palloc(sizeof(*opt)); + local_relopt *opt = palloc_object(local_relopt); Assert(offset < relopts->relopt_struct_size); @@ -1164,7 +1173,7 @@ add_local_string_reloption(local_relopts *relopts, const char *name, * but we declare them as Datums to avoid including array.h in reloptions.h. 
*/ Datum -transformRelOptions(Datum oldOptions, List *defList, const char *namspace, +transformRelOptions(Datum oldOptions, List *defList, const char *nameSpace, const char *const validnsps[], bool acceptOidsOff, bool isReset) { Datum result; @@ -1179,7 +1188,7 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, astate = NULL; /* Copy any oldOptions that aren't to be replaced */ - if (PointerIsValid(DatumGetPointer(oldOptions))) + if (DatumGetPointer(oldOptions) != NULL) { ArrayType *array = DatumGetArrayTypeP(oldOptions); Datum *oldoptions; @@ -1190,8 +1199,8 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, for (i = 0; i < noldoptions; i++) { - char *text_str = VARDATA(oldoptions[i]); - int text_len = VARSIZE(oldoptions[i]) - VARHDRSZ; + char *text_str = VARDATA(DatumGetPointer(oldoptions[i])); + int text_len = VARSIZE(DatumGetPointer(oldoptions[i])) - VARHDRSZ; /* Search for a match in defList */ foreach(cell, defList) @@ -1200,14 +1209,14 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, int kw_len; /* ignore if not in the same namespace */ - if (namspace == NULL) + if (nameSpace == NULL) { if (def->defnamespace != NULL) continue; } else if (def->defnamespace == NULL) continue; - else if (strcmp(def->defnamespace, namspace) != 0) + else if (strcmp(def->defnamespace, nameSpace) != 0) continue; kw_len = strlen(def->defname); @@ -1243,8 +1252,9 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, } else { - text *t; + const char *name; const char *value; + text *t; Size len; /* @@ -1276,14 +1286,14 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, } /* ignore if not in the same namespace */ - if (namspace == NULL) + if (nameSpace == NULL) { if (def->defnamespace != NULL) continue; } else if (def->defnamespace == NULL) continue; - else if (strcmp(def->defnamespace, namspace) != 0) + else if (strcmp(def->defnamespace, nameSpace) != 0) continue; /* @@ -1291,11 +1301,19 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, * have just "name", assume "name=true" is meant. Note: the * namespace is not output. */ + name = def->defname; if (def->arg != NULL) value = defGetString(def); else value = "true"; + /* Insist that name not contain "=", else "a=b=c" is ambiguous */ + if (strchr(name, '=') != NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid option name \"%s\": must not contain \"=\"", + name))); + /* * This is not a great place for this test, but there's no other * convenient place to filter the option out. As WITH (oids = @@ -1303,7 +1321,7 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, * amount of ugly. 
*/ if (acceptOidsOff && def->defnamespace == NULL && - strcmp(def->defname, "oids") == 0) + strcmp(name, "oids") == 0) { if (defGetBoolean(def)) ereport(ERROR, @@ -1313,11 +1331,11 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, continue; } - len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value); + len = VARHDRSZ + strlen(name) + 1 + strlen(value); /* +1 leaves room for sprintf's trailing null */ t = (text *) palloc(len + 1); SET_VARSIZE(t, len); - sprintf(VARDATA(t), "%s=%s", def->defname, value); + sprintf(VARDATA(t), "%s=%s", name, value); astate = accumArrayResult(astate, PointerGetDatum(t), false, TEXTOID, @@ -1348,7 +1366,7 @@ untransformRelOptions(Datum options) int i; /* Nothing to do if no options */ - if (!PointerIsValid(DatumGetPointer(options))) + if (DatumGetPointer(options) == NULL) return result; array = DatumGetArrayTypeP(options); @@ -1447,8 +1465,8 @@ parseRelOptionsInternal(Datum options, bool validate, for (i = 0; i < noptions; i++) { - char *text_str = VARDATA(optiondatums[i]); - int text_len = VARSIZE(optiondatums[i]) - VARHDRSZ; + char *text_str = VARDATA(DatumGetPointer(optiondatums[i])); + int text_len = VARSIZE(DatumGetPointer(optiondatums[i])) - VARHDRSZ; int j; /* Search for a match in reloptions */ @@ -1540,7 +1558,7 @@ parseRelOptions(Datum options, bool validate, relopt_kind kind, } /* Done if no options */ - if (PointerIsValid(DatumGetPointer(options))) + if (DatumGetPointer(options) != NULL) parseRelOptionsInternal(options, validate, reloptions, numoptions); *numrelopts = numoptions; @@ -1552,7 +1570,7 @@ static relopt_value * parseLocalRelOptions(local_relopts *relopts, Datum options, bool validate) { int nopts = list_length(relopts->options); - relopt_value *values = palloc(sizeof(*values) * nopts); + relopt_value *values = palloc_array(relopt_value, nopts); ListCell *lc; int i = 0; @@ -1886,7 +1904,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) {"autovacuum_multixact_freeze_table_age", RELOPT_TYPE_INT, offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, multixact_freeze_table_age)}, {"log_autovacuum_min_duration", RELOPT_TYPE_INT, - offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, log_min_duration)}, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, log_vacuum_min_duration)}, + {"log_autoanalyze_min_duration", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, log_analyze_min_duration)}, {"toast_tuple_target", RELOPT_TYPE_INT, offsetof(StdRdOptions, toast_tuple_target)}, {"autovacuum_vacuum_cost_delay", RELOPT_TYPE_REAL, @@ -1971,7 +1991,7 @@ void * build_local_reloptions(local_relopts *relopts, Datum options, bool validate) { int noptions = list_length(relopts->options); - relopt_parse_elt *elems = palloc(sizeof(*elems) * noptions); + relopt_parse_elt *elems = palloc_array(relopt_parse_elt, noptions); relopt_value *vals; void *opts; int i = 0; @@ -2083,7 +2103,7 @@ index_reloptions(amoptions_function amoptions, Datum reloptions, bool validate) Assert(amoptions != NULL); /* Assume function is strict */ - if (!PointerIsValid(DatumGetPointer(reloptions))) + if (DatumGetPointer(reloptions) == NULL) return NULL; return amoptions(reloptions, validate); diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c index 5bd75fb499cef..6fae41e7b6d19 100644 --- a/src/backend/access/common/tidstore.c +++ b/src/backend/access/common/tidstore.c @@ -166,7 +166,7 @@ TidStoreCreateLocal(size_t max_bytes, bool insert_only) 
size_t minContextSize = ALLOCSET_DEFAULT_MINSIZE; size_t maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE; - ts = palloc0(sizeof(TidStore)); + ts = palloc0_object(TidStore); /* choose the maxBlockSize to be no larger than 1/16 of max_bytes */ while (16 * maxBlockSize > max_bytes) @@ -212,7 +212,7 @@ TidStoreCreateShared(size_t max_bytes, int tranche_id) size_t dsa_init_size = DSA_DEFAULT_INIT_SEGMENT_SIZE; size_t dsa_max_size = DSA_MAX_SEGMENT_SIZE; - ts = palloc0(sizeof(TidStore)); + ts = palloc0_object(TidStore); /* * Choose the initial and maximum DSA segment sizes to be no longer than @@ -250,7 +250,7 @@ TidStoreAttach(dsa_handle area_handle, dsa_pointer handle) Assert(DsaPointerIsValid(handle)); /* create per-backend state */ - ts = palloc0(sizeof(TidStore)); + ts = palloc0_object(TidStore); area = dsa_attach(area_handle); @@ -418,7 +418,7 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, /* Return true if the given TID is present in the TidStore */ bool -TidStoreIsMember(TidStore *ts, ItemPointer tid) +TidStoreIsMember(TidStore *ts, const ItemPointerData *tid) { int wordnum; int bitnum; @@ -472,7 +472,7 @@ TidStoreBeginIterate(TidStore *ts) { TidStoreIter *iter; - iter = palloc0(sizeof(TidStoreIter)); + iter = palloc0_object(TidStoreIter); iter->ts = ts; if (TidStoreIsShared(ts)) diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c index 21f2f4af97e3f..926f1e4008abe 100644 --- a/src/backend/access/common/toast_compression.c +++ b/src/backend/access/common/toast_compression.c @@ -25,11 +25,11 @@ /* GUC */ int default_toast_compression = TOAST_PGLZ_COMPRESSION; -#define NO_LZ4_SUPPORT() \ +#define NO_COMPRESSION_SUPPORT(method) \ ereport(ERROR, \ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ - errmsg("compression method lz4 not supported"), \ - errdetail("This functionality requires the server to be built with lz4 support."))) + errmsg("compression method %s not supported", method), \ + errdetail("This functionality requires the server to be built with %s support.", method))) /* * Compress a varlena using PGLZ. 
@@ -139,7 +139,7 @@ struct varlena * lz4_compress_datum(const struct varlena *value) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 valsize; @@ -182,7 +182,7 @@ struct varlena * lz4_decompress_datum(const struct varlena *value) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; @@ -215,7 +215,7 @@ struct varlena * lz4_decompress_datum_slice(const struct varlena *value, int32 slicelength) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; @@ -289,7 +289,7 @@ CompressionNameToMethod(const char *compression) else if (strcmp(compression, "lz4") == 0) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); #endif return TOAST_LZ4_COMPRESSION; } diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 7d8be8346ce52..d06af82de15d9 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -64,11 +64,11 @@ toast_compress_datum(Datum value, char cmethod) switch (cmethod) { case TOAST_PGLZ_COMPRESSION: - tmp = pglz_compress_datum((const struct varlena *) value); + tmp = pglz_compress_datum((const struct varlena *) DatumGetPointer(value)); cmid = TOAST_PGLZ_COMPRESSION_ID; break; case TOAST_LZ4_COMPRESSION: - tmp = lz4_compress_datum((const struct varlena *) value); + tmp = lz4_compress_datum((const struct varlena *) DatumGetPointer(value)); cmid = TOAST_LZ4_COMPRESSION_ID; break; default: @@ -121,22 +121,10 @@ toast_save_datum(Relation rel, Datum value, { Relation toastrel; Relation *toastidxs; - HeapTuple toasttup; TupleDesc toasttupDesc; - Datum t_values[3]; - bool t_isnull[3]; CommandId mycid = GetCurrentCommandId(true); struct varlena *result; struct varatt_external toast_pointer; - union - { - struct varlena hdr; - /* this is to make the union big enough for a chunk: */ - char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; - /* ensure union is aligned well enough: */ - int32 align_it; - } chunk_data; - int32 chunk_size; int32 chunk_seq = 0; char *data_p; int32 data_todo; @@ -144,7 +132,7 @@ toast_save_datum(Relation rel, Datum value, int num_indexes; int validIndex; - Assert(!VARATT_IS_EXTERNAL(value)); + Assert(!VARATT_IS_EXTERNAL(dval)); /* * Open the toast relation and its indexes. 
We can use the index to check @@ -289,21 +277,21 @@ toast_save_datum(Relation rel, Datum value, } } - /* - * Initialize constant parts of the tuple data - */ - t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid); - t_values[2] = PointerGetDatum(&chunk_data); - t_isnull[0] = false; - t_isnull[1] = false; - t_isnull[2] = false; - /* * Split up the item into chunks */ while (data_todo > 0) { - int i; + HeapTuple toasttup; + Datum t_values[3]; + bool t_isnull[3] = {0}; + union + { + alignas(int32) struct varlena hdr; + /* this is to make the union big enough for a chunk: */ + char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; + } chunk_data; + int32 chunk_size; CHECK_FOR_INTERRUPTS(); @@ -315,9 +303,12 @@ toast_save_datum(Relation rel, Datum value, /* * Build a tuple and store it */ + t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid); t_values[1] = Int32GetDatum(chunk_seq++); SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ); memcpy(VARDATA(&chunk_data), data_p, chunk_size); + t_values[2] = PointerGetDatum(&chunk_data); + toasttup = heap_form_tuple(toasttupDesc, t_values, t_isnull); heap_insert(toastrel, toasttup, mycid, options, NULL); @@ -333,7 +324,7 @@ toast_save_datum(Relation rel, Datum value, * Note also that there had better not be any user-created index on * the TOAST table, since we don't bother to update anything else. */ - for (i = 0; i < num_indexes; i++) + for (int i = 0; i < num_indexes; i++) { /* Only index relations marked as ready can be updated */ if (toastidxs[i]->rd_index->indisready) @@ -577,7 +568,7 @@ toast_open_indexes(Relation toastrel, *num_indexes = list_length(indexlist); /* Open all the index relations */ - *toastidxs = (Relation *) palloc(*num_indexes * sizeof(Relation)); + *toastidxs = palloc_array(Relation, *num_indexes); foreach(lc, indexlist) (*toastidxs)[i++] = index_open(lfirst_oid(lc), lock); diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c index 54dc2f4ab80ab..99c197e361f68 100644 --- a/src/backend/access/common/tupconvert.c +++ b/src/backend/access/common/tupconvert.c @@ -18,6 +18,7 @@ */ #include "postgres.h" +#include "access/htup_details.h" #include "access/tupconvert.h" #include "executor/tuptable.h" @@ -74,17 +75,17 @@ convert_tuples_by_position(TupleDesc indesc, } /* Prepare the map structure */ - map = (TupleConversionMap *) palloc(sizeof(TupleConversionMap)); + map = palloc_object(TupleConversionMap); map->indesc = indesc; map->outdesc = outdesc; map->attrMap = attrMap; /* preallocate workspace for Datum arrays */ n = outdesc->natts + 1; /* +1 for NULL */ - map->outvalues = (Datum *) palloc(n * sizeof(Datum)); - map->outisnull = (bool *) palloc(n * sizeof(bool)); + map->outvalues = palloc_array(Datum, n); + map->outisnull = palloc_array(bool, n); n = indesc->natts + 1; /* +1 for NULL */ - map->invalues = (Datum *) palloc(n * sizeof(Datum)); - map->inisnull = (bool *) palloc(n * sizeof(bool)); + map->invalues = palloc_array(Datum, n); + map->inisnull = palloc_array(bool, n); map->invalues[0] = (Datum) 0; /* set up the NULL entry */ map->inisnull[0] = true; @@ -131,16 +132,16 @@ convert_tuples_by_name_attrmap(TupleDesc indesc, Assert(attrMap != NULL); /* Prepare the map structure */ - map = (TupleConversionMap *) palloc(sizeof(TupleConversionMap)); + map = palloc_object(TupleConversionMap); map->indesc = indesc; map->outdesc = outdesc; map->attrMap = attrMap; /* preallocate workspace for Datum arrays */ - map->outvalues = (Datum *) palloc(n * sizeof(Datum)); - map->outisnull = (bool *) palloc(n * 
sizeof(bool)); + map->outvalues = palloc_array(Datum, n); + map->outisnull = palloc_array(bool, n); n = indesc->natts + 1; /* +1 for NULL */ - map->invalues = (Datum *) palloc(n * sizeof(Datum)); - map->inisnull = (bool *) palloc(n * sizeof(bool)); + map->invalues = palloc_array(Datum, n); + map->inisnull = palloc_array(bool, n); map->invalues[0] = (Datum) 0; /* set up the NULL entry */ map->inisnull[0] = true; diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c index ffd0c78f905a5..bcd1ddcc68b43 100644 --- a/src/backend/access/common/tupdesc.c +++ b/src/backend/access/common/tupdesc.c @@ -142,10 +142,17 @@ void verify_compact_attribute(TupleDesc tupdesc, int attnum) { #ifdef USE_ASSERT_CHECKING - CompactAttribute *cattr = &tupdesc->compact_attrs[attnum]; + CompactAttribute cattr; Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum); CompactAttribute tmp; + /* + * Make a temp copy of the TupleDesc's CompactAttribute. This may be a + * shared TupleDesc and the attcacheoff might get changed by another + * backend. + */ + memcpy(&cattr, &tupdesc->compact_attrs[attnum], sizeof(CompactAttribute)); + /* * Populate the temporary CompactAttribute from the corresponding * Form_pg_attribute @@ -156,11 +163,11 @@ verify_compact_attribute(TupleDesc tupdesc, int attnum) * Make the attcacheoff match since it's been reset to -1 by * populate_compact_attribute_internal. Same with attnullability. */ - tmp.attcacheoff = cattr->attcacheoff; - tmp.attnullability = cattr->attnullability; + tmp.attcacheoff = cattr.attcacheoff; + tmp.attnullability = cattr.attnullability; /* Check the freshly populated CompactAttribute matches the TupleDesc's */ - Assert(memcmp(&tmp, cattr, sizeof(CompactAttribute)) == 0); + Assert(memcmp(&tmp, &cattr, sizeof(CompactAttribute)) == 0); #endif } @@ -354,7 +361,7 @@ CreateTupleDescCopyConstr(TupleDesc tupdesc) /* Copy the TupleConstr data structure, if any */ if (constr) { - TupleConstr *cpy = (TupleConstr *) palloc0(sizeof(TupleConstr)); + TupleConstr *cpy = palloc0_object(TupleConstr); cpy->has_not_null = constr->has_not_null; cpy->has_generated_stored = constr->has_generated_stored; @@ -467,8 +474,8 @@ TupleDescCopyEntry(TupleDesc dst, AttrNumber dstAttno, /* * sanity checks */ - Assert(PointerIsValid(src)); - Assert(PointerIsValid(dst)); + Assert(src); + Assert(dst); Assert(srcAttno >= 1); Assert(srcAttno <= src->natts); Assert(dstAttno >= 1); @@ -808,10 +815,10 @@ hashRowType(TupleDesc desc) uint32 s; int i; - s = hash_combine(0, hash_uint32(desc->natts)); - s = hash_combine(s, hash_uint32(desc->tdtypeid)); + s = hash_combine(0, hash_bytes_uint32(desc->natts)); + s = hash_combine(s, hash_bytes_uint32(desc->tdtypeid)); for (i = 0; i < desc->natts; ++i) - s = hash_combine(s, hash_uint32(TupleDescAttr(desc, i)->atttypid)); + s = hash_combine(s, hash_bytes_uint32(TupleDescAttr(desc, i)->atttypid)); return s; } @@ -846,7 +853,7 @@ TupleDescInitEntry(TupleDesc desc, /* * sanity checks */ - Assert(PointerIsValid(desc)); + Assert(desc); Assert(attributeNumber >= 1); Assert(attributeNumber <= desc->natts); Assert(attdim >= 0); @@ -918,7 +925,7 @@ TupleDescInitBuiltinEntry(TupleDesc desc, Form_pg_attribute att; /* sanity checks */ - Assert(PointerIsValid(desc)); + Assert(desc); Assert(attributeNumber >= 1); Assert(attributeNumber <= desc->natts); Assert(attdim >= 0); @@ -986,7 +993,7 @@ TupleDescInitBuiltinEntry(TupleDesc desc, case INT8OID: att->attlen = 8; - att->attbyval = FLOAT8PASSBYVAL; + att->attbyval = true; att->attalign = TYPALIGN_DOUBLE; 
att->attstorage = TYPSTORAGE_PLAIN; att->attcompression = InvalidCompressionMethod; @@ -1023,7 +1030,7 @@ TupleDescInitEntryCollation(TupleDesc desc, /* * sanity checks */ - Assert(PointerIsValid(desc)); + Assert(desc); Assert(attributeNumber >= 1); Assert(attributeNumber <= desc->natts); diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 644d484ea53c6..47f881ae59685 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -85,7 +85,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode, { GinBtreeStack *stack; - stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); + stack = palloc_object(GinBtreeStack); stack->blkno = btree->rootBlkno; stack->buffer = ReadBuffer(btree->index, btree->rootBlkno); stack->parent = NULL; @@ -152,7 +152,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode, } else { - GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); + GinBtreeStack *ptr = palloc_object(GinBtreeStack); ptr->parent = stack; stack = ptr; @@ -246,7 +246,7 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack) blkno = root->blkno; buffer = root->buffer; - ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); + ptr = palloc_object(GinBtreeStack); for (;;) { diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c index 302cb2092a9a8..f73ee7b1fa4ba 100644 --- a/src/backend/access/gin/ginbulk.c +++ b/src/backend/access/gin/ginbulk.c @@ -93,7 +93,7 @@ ginAllocEntryAccumulator(void *arg) */ if (accum->entryallocator == NULL || accum->eas_used >= DEF_NENTRY) { - accum->entryallocator = palloc(sizeof(GinEntryAccumulator) * DEF_NENTRY); + accum->entryallocator = palloc_array(GinEntryAccumulator, DEF_NENTRY); accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator); accum->eas_used = 0; } @@ -177,8 +177,7 @@ ginInsertBAEntry(BuildAccumulator *accum, ea->maxcount = DEF_NPTR; ea->count = 1; ea->shouldSort = false; - ea->list = - (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR); + ea->list = palloc_array(ItemPointerData, DEF_NPTR); ea->list[0] = *heapptr; accum->allocatedMemory += GetMemoryChunkSpace(ea->list); } diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 6c2c61947204a..0140bd4904b24 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -140,20 +140,20 @@ GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast) { GinPostingList *seg = GinDataLeafPageGetPostingList(page); Size len = GinDataLeafPageGetPostingListSize(page); - Pointer endptr = ((Pointer) seg) + len; + char *endptr = (char *) seg + len; GinPostingList *next; /* Skip to the segment containing advancePast+1 */ if (ItemPointerIsValid(&advancePast)) { next = GinNextPostingListSegment(seg); - while ((Pointer) next < endptr && + while ((char *) next < endptr && ginCompareItemPointers(&next->first, &advancePast) <= 0) { seg = next; next = GinNextPostingListSegment(seg); } - len = endptr - (Pointer) seg; + len = endptr - (char *) seg; } if (len > 0) @@ -607,11 +607,11 @@ dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, if (append) elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", - maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, + maxitems, BufferGetBlockNumber(buf), leaf->lsize, items->nitem - items->curitem - maxitems); else elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)", - maxitems, BufferGetBlockNumber(buf), (int) 
leaf->lsize, + maxitems, BufferGetBlockNumber(buf), leaf->lsize, items->nitem - items->curitem - maxitems); } else @@ -693,11 +693,11 @@ dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, if (append) elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)", - maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + maxitems, BufferGetBlockNumber(buf), leaf->lsize, leaf->rsize, items->nitem - items->curitem - maxitems); else elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)", - maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + maxitems, BufferGetBlockNumber(buf), leaf->lsize, leaf->rsize, items->nitem - items->curitem - maxitems); } @@ -1332,7 +1332,7 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, static void * dataPrepareDownlink(GinBtree btree, Buffer lbuf) { - PostingItem *pitem = palloc(sizeof(PostingItem)); + PostingItem *pitem = palloc_object(PostingItem); Page lpage = BufferGetPage(lbuf); PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf)); @@ -1371,10 +1371,10 @@ disassembleLeaf(Page page) { disassembledLeaf *leaf; GinPostingList *seg; - Pointer segbegin; - Pointer segend; + char *segbegin; + char *segend; - leaf = palloc0(sizeof(disassembledLeaf)); + leaf = palloc0_object(disassembledLeaf); dlist_init(&leaf->segments); if (GinPageIsCompressed(page)) @@ -1383,11 +1383,11 @@ disassembleLeaf(Page page) * Create a leafSegmentInfo entry for each segment. */ seg = GinDataLeafPageGetPostingList(page); - segbegin = (Pointer) seg; + segbegin = (char *) seg; segend = segbegin + GinDataLeafPageGetPostingListSize(page); - while ((Pointer) seg < segend) + while ((char *) seg < segend) { - leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo)); + leafSegmentInfo *seginfo = palloc_object(leafSegmentInfo); seginfo->action = GIN_SEGMENT_UNMODIFIED; seginfo->seg = seg; @@ -1414,7 +1414,7 @@ disassembleLeaf(Page page) if (nuncompressed > 0) { - seginfo = palloc(sizeof(leafSegmentInfo)); + seginfo = palloc_object(leafSegmentInfo); seginfo->action = GIN_SEGMENT_REPLACE; seginfo->seg = NULL; @@ -1455,7 +1455,7 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) */ if (dlist_is_empty(&leaf->segments)) { - newseg = palloc(sizeof(leafSegmentInfo)); + newseg = palloc_object(leafSegmentInfo); newseg->seg = NULL; newseg->items = newItems; newseg->nitems = nNewItems; @@ -1512,7 +1512,7 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) cur->seg != NULL && SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize) { - newseg = palloc(sizeof(leafSegmentInfo)); + newseg = palloc_object(leafSegmentInfo); newseg->seg = NULL; newseg->items = nextnew; newseg->nitems = nthis; @@ -1629,7 +1629,7 @@ leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining) if (seginfo->action != GIN_SEGMENT_INSERT) seginfo->action = GIN_SEGMENT_REPLACE; - nextseg = palloc(sizeof(leafSegmentInfo)); + nextseg = palloc_object(leafSegmentInfo); nextseg->action = GIN_SEGMENT_INSERT; nextseg->seg = NULL; nextseg->items = &seginfo->items[npacked]; @@ -1779,7 +1779,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, Buffer buffer; Page tmppage; Page page; - Pointer ptr; + char *ptr; int nrootitems; int rootsize; bool is_build = (buildStats != NULL); @@ -1795,7 +1795,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, */ nrootitems = 0; rootsize = 0; - ptr = (Pointer) 
GinDataLeafPageGetPostingList(tmppage); + ptr = (char *) GinDataLeafPageGetPostingList(tmppage); while (nrootitems < nitems) { GinPostingList *segment; diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index ba6bbd562df0c..23a41ce9486b1 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -171,7 +171,7 @@ ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup, { if (nipd > 0) { - ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); + ipd = ginPostingListDecode(ptr, &ndecoded); if (nipd != ndecoded) elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", nipd, ndecoded); @@ -183,7 +183,7 @@ ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup, } else { - ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); + ipd = palloc_array(ItemPointerData, nipd); memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); } *nitems = nipd; @@ -563,7 +563,7 @@ entryExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, entryPreparePage(btree, page, off, insertData, updateblkno); placed = PageAddItem(page, - (Item) insertData->entry, + insertData->entry, IndexTupleSize(insertData->entry), off, false, false); if (placed != off) @@ -684,7 +684,7 @@ entrySplitPage(GinBtree btree, Buffer origbuf, lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); } - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + if (PageAddItem(page, itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); ptr += MAXALIGN(IndexTupleSize(itup)); @@ -708,7 +708,7 @@ entryPrepareDownlink(GinBtree btree, Buffer lbuf) itup = getRightMostTuple(lpage); - insertData = palloc(sizeof(GinBtreeEntryInsertData)); + insertData = palloc_object(GinBtreeEntryInsertData); insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno); insertData->isDelete = false; @@ -727,12 +727,12 @@ ginEntryFillRoot(GinBtree btree, Page root, IndexTuple itup; itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno); - if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + if (PageAddItem(root, itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); pfree(itup); itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno); - if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + if (PageAddItem(root, itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); pfree(itup); } diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index a6d88572cc29b..33816f8551f15 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -57,7 +57,7 @@ typedef struct KeyArray */ static int32 writeListPage(Relation index, Buffer buffer, - IndexTuple *tuples, int32 ntuples, BlockNumber rightlink) + const IndexTuple *tuples, int32 ntuples, BlockNumber rightlink) { Page page = BufferGetPage(buffer); int32 i, @@ -83,7 +83,7 @@ writeListPage(Relation index, Buffer buffer, ptr += this_size; size += this_size; - l = PageAddItem(page, (Item) tuples[i], this_size, off, 
false, false); + l = PageAddItem(page, tuples[i], this_size, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", @@ -384,7 +384,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) for (i = 0; i < collector->ntuples; i++) { tupsize = IndexTupleSize(collector->tuples[i]); - l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false); + l = PageAddItem(page, collector->tuples[i], tupsize, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index f29ccd3c2d1ff..b3e2e9d5f6e30 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -489,7 +489,7 @@ startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot) static int entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg) { - const GinScanKey key = (const GinScanKey) arg; + const GinScanKeyData *key = arg; int i1 = *(const int *) a1; int i2 = *(const int *) a2; uint32 n1 = key->scanEntry[i1]->predictNumberResult; @@ -552,7 +552,7 @@ startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key) { MemoryContextSwitchTo(so->tempCtx); - entryIndexes = (int *) palloc(sizeof(int) * key->nentries); + entryIndexes = palloc_array(int, key->nentries); for (i = 0; i < key->nentries; i++) entryIndexes[i] = i; qsort_arg(entryIndexes, key->nentries, sizeof(int), @@ -1327,6 +1327,8 @@ scanGetItem(IndexScanDesc scan, ItemPointerData advancePast, */ do { + CHECK_FOR_INTERRUPTS(); + ItemPointerSetMin(item); match = true; for (i = 0; i < so->nkeys && match; i++) @@ -1871,7 +1873,7 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) LockBuffer(pos.pendingBuffer, GIN_SHARE); pos.firstOffset = FirstOffsetNumber; UnlockReleaseBuffer(metabuffer); - pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys); + pos.hasMatchKey = palloc_array(bool, so->nkeys); /* * loop for each heap row. scanGetCandidate returns full row or row's @@ -1966,8 +1968,6 @@ gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm) for (;;) { - CHECK_FOR_INTERRUPTS(); - if (!scanGetItem(scan, iptr, &iptr, &recheck)) break; diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index a65acd8910493..fc6af7c751b5b 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -152,7 +152,9 @@ typedef struct * only in the leader process. 
*/ GinLeader *bs_leader; - int bs_worker_id; + + /* number of participating workers (including leader) */ + int bs_num_workers; /* used to pass information from workers to leader */ double bs_numtuples; @@ -218,7 +220,8 @@ addItemPointersToLeafTuple(GinState *ginstate, ItemPointerData *newItems, *oldItems; int oldNPosting, - newNPosting; + newNPosting, + nwritten; GinPostingList *compressedList; Assert(!GinIsPostingTree(old)); @@ -235,18 +238,19 @@ addItemPointersToLeafTuple(GinState *ginstate, /* Compress the posting list, and try to a build tuple with room for it */ res = NULL; - compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize, - NULL); - pfree(newItems); - if (compressedList) + compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize, &nwritten); + if (nwritten == newNPosting) { res = GinFormTuple(ginstate, attnum, key, category, (char *) compressedList, SizeOfGinPostingList(compressedList), newNPosting, false); - pfree(compressedList); } + + pfree(newItems); + pfree(compressedList); + if (!res) { /* posting list would be too big, convert to posting tree */ @@ -293,17 +297,19 @@ buildFreshLeafTuple(GinState *ginstate, { IndexTuple res = NULL; GinPostingList *compressedList; + int nwritten; /* try to build a posting list tuple with all the items */ - compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, NULL); - if (compressedList) + compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, &nwritten); + if (nwritten == nitem) { res = GinFormTuple(ginstate, attnum, key, category, (char *) compressedList, SizeOfGinPostingList(compressedList), nitem, false); - pfree(compressedList); } + pfree(compressedList); + if (!res) { /* posting list would be too big, build posting tree */ @@ -479,6 +485,15 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values, /* * ginFlushBuildState * Write all data from BuildAccumulator into the tuplesort. + * + * The number of TIDs written to the tuplesort at once is limited, to reduce + * the amount of memory needed when merging the intermediate results later. + * The leader will see up to two chunks per worker, so calculate the limit to + * not need more than MaxAllocSize overall. + * + * We don't need to worry about overflowing maintenance_work_mem. We can't + * build chunks larger than work_mem, and that limit was set so that workers + * produce sufficiently small chunks. 
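To make the bound described above concrete, here is a rough worked example, assuming the stock MaxAllocSize of 0x3fffffff bytes and the standard 6-byte ItemPointerData (both from PostgreSQL headers; the participant count of 4 is made up for illustration):

    uint32    maxlen;

    maxlen = 0x3fffffff / 6;    /* MaxAllocSize / sizeof(ItemPointerData), ~179M TIDs */
    maxlen /= (2 * 4);          /* 4 participants, two chunks each: ~22.4M TIDs/chunk */

Eight such chunks of 6-byte TIDs total just under MaxAllocSize, which is the invariant the comment is after.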
*/ static void ginFlushBuildState(GinBuildState *buildstate, Relation index) @@ -489,28 +504,44 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index) uint32 nlist; OffsetNumber attnum; TupleDesc tdesc = RelationGetDescr(index); + uint32 maxlen; + + /* maximum number of TIDs per chunk (two chunks per worker) */ + maxlen = MaxAllocSize / sizeof(ItemPointerData); + maxlen /= (2 * buildstate->bs_num_workers); ginBeginBAScan(&buildstate->accum); while ((list = ginGetBAEntry(&buildstate->accum, &attnum, &key, &category, &nlist)) != NULL) { /* information about the key */ - Form_pg_attribute attr = TupleDescAttr(tdesc, (attnum - 1)); + CompactAttribute *attr = TupleDescCompactAttr(tdesc, (attnum - 1)); - /* GIN tuple and tuple length */ - GinTuple *tup; - Size tuplen; + /* start of the chunk */ + uint32 offset = 0; - /* there could be many entries, so be willing to abort here */ - CHECK_FOR_INTERRUPTS(); + /* split the entry into smaller chunks with up to maxlen items */ + while (offset < nlist) + { + /* GIN tuple and tuple length */ + GinTuple *tup; + Size tuplen; + uint32 len = Min(maxlen, nlist - offset); - tup = _gin_build_tuple(attnum, category, - key, attr->attlen, attr->attbyval, - list, nlist, &tuplen); + /* there could be many entries, so be willing to abort here */ + CHECK_FOR_INTERRUPTS(); + + tup = _gin_build_tuple(attnum, category, + key, attr->attlen, attr->attbyval, + &list[offset], len, + &tuplen); - tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen); + offset += len; - pfree(tup); + tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen); + + pfree(tup); + } } MemoryContextReset(buildstate->tmpCtx); @@ -676,7 +707,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) { SortCoordinate coordinate; - coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); + coordinate = palloc0_object(SortCoordinateData); coordinate->isWorker = false; coordinate->nParticipants = state->bs_leader->nparticipanttuplesorts; @@ -760,7 +791,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) /* * Return statistics */ - result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + result = palloc_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; @@ -836,7 +867,7 @@ gininsert(Relation index, Datum *values, bool *isnull, if (ginstate == NULL) { oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context); - ginstate = (GinState *) palloc(sizeof(GinState)); + ginstate = palloc_object(GinState); initGinState(ginstate, index); indexInfo->ii_AmCache = ginstate; MemoryContextSwitchTo(oldCtx); @@ -903,7 +934,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, Size estsort; GinBuildShared *ginshared; Sharedsort *sharedsort; - GinLeader *ginleader = (GinLeader *) palloc0(sizeof(GinLeader)); + GinLeader *ginleader = palloc0_object(GinLeader); WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; @@ -1228,7 +1259,7 @@ AssertCheckGinBuffer(GinBuffer *buffer) static GinBuffer * GinBufferInit(Relation index) { - GinBuffer *buffer = palloc0(sizeof(GinBuffer)); + GinBuffer *buffer = palloc0_object(GinBuffer); int i, nKeys; TupleDesc desc = RelationGetDescr(index); @@ -1242,7 +1273,7 @@ GinBufferInit(Relation index) nKeys = IndexRelationGetNumberOfKeyAttributes(index); - buffer->ssup = palloc0(sizeof(SortSupportData) * nKeys); + buffer->ssup = palloc0_array(SortSupportData, nKeys); /* * Lookup ordering operator for the index key data type, and
initialize @@ -1370,10 +1401,46 @@ GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup) * enough TIDs to trim (with values less than "first" TID from the new tuple), * we do the trim. By enough we mean at least 128 TIDs (mostly an arbitrary * number). + * + * We try freezing TIDs at the beginning of the list first, before attempting + * to trim the buffer. This may allow trimming the data earlier, reducing the + * memory usage and excluding it from the mergesort. */ static bool GinBufferShouldTrim(GinBuffer *buffer, GinTuple *tup) { + /* + * Check if the last TID in the current list is frozen. This is the case + * when merging non-overlapping lists, e.g. in each parallel worker. + */ + if ((buffer->nitems > 0) && + (ItemPointerCompare(&buffer->items[buffer->nitems - 1], + GinTupleGetFirst(tup)) == 0)) + buffer->nfrozen = buffer->nitems; + + /* + * Now find the last TID we know to be frozen, i.e. the last TID right + * before the new GIN tuple. + * + * Start with the first not-yet-frozen tuple, and walk until we find the + * first TID that's higher. If we already know the whole list is frozen + * (i.e. nfrozen == nitems), this does nothing. + * + * XXX This might do a binary search for sufficiently long lists, but it + * does not seem worth the complexity. Overlapping lists should be rare, + * TID comparisons are cheap, and we should quickly freeze most of + * the list. + */ + for (int i = buffer->nfrozen; i < buffer->nitems; i++) + { + /* Is the TID after the first TID of the new tuple? Can't freeze.
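On the XXX note above: a binary-search variant would look roughly like the hypothetical helper below (not part of this patch; it assumes the surrounding file's includes and mirrors the linear loop's semantics, returning the index of the first buffered TID that sorts after the new tuple's first TID):

    static int
    gin_first_unfrozen(ItemPointerData *items, int nfrozen, int nitems,
                       ItemPointer first)
    {
        int     lo = nfrozen;
        int     hi = nitems;

        while (lo < hi)
        {
            int     mid = lo + (hi - lo) / 2;

            if (ItemPointerCompare(&items[mid], first) > 0)
                hi = mid;       /* items[mid] is past 'first': search left half */
            else
                lo = mid + 1;   /* still freezable: search right half */
        }
        return lo;              /* caller would set buffer->nfrozen = lo */
    }

As the comment argues, the linear walk is probably cheap enough that the extra complexity is not warranted.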
*/ - if (ItemPointerCompare(&buffer->items[i], - GinTupleGetFirst(tup)) > 0) - break; - - buffer->nfrozen++; - } - /* add the new TIDs into the buffer, combine using merge-sort */ { int nnew; @@ -1759,7 +1784,7 @@ _gin_parallel_merge(GinBuildState *state) ++numtuples); } - /* relase all the memory */ + /* release all the memory */ GinBufferFree(buffer); tuplesort_end(state->bs_sortstate); @@ -1947,7 +1972,7 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort, GinBufferReset(buffer); } - /* relase all the memory */ + /* release all the memory */ GinBufferFree(buffer); tuplesort_end(worker_sort); @@ -2005,7 +2030,7 @@ _gin_parallel_scan_and_build(GinBuildState *state, IndexInfo *indexInfo; /* Initialize local tuplesort coordination state */ - coordinate = palloc0(sizeof(SortCoordinateData)); + coordinate = palloc0_object(SortCoordinateData); coordinate->isWorker = true; coordinate->nParticipants = -1; coordinate->sharedsort = sharedsort; @@ -2013,6 +2038,9 @@ _gin_parallel_scan_and_build(GinBuildState *state, /* remember how much space is allowed for the accumulated entries */ state->work_mem = (sortmem / 2); + /* remember how many workers participate in the build */ + state->bs_num_workers = ginshared->scantuplesortstates; + /* Begin "partial" tuplesort */ state->bs_sortstate = tuplesort_begin_index_gin(heap, index, state->work_mem, @@ -2187,9 +2215,12 @@ typedef struct * * For by-reference data types, we store the actual data. For by-val types * we simply copy the whole Datum, so that we don't have to care about stuff - * like endianess etc. We could make it a little bit smaller, but it's not + * like endianness etc. We could make it a little bit smaller, but it's not * worth it - it's a tiny fraction of the data, and we need to MAXALIGN the - * start of the TID list anyway. So we wouldn't save anything. + * start of the TID list anyway. So we wouldn't save anything. (This would + * not be a good idea for the permanent in-index data, since we'd prefer + * that that not depend on sizeof(Datum). But this is just a transient + * representation to use while sorting the data.) * * The TID list is serialized as compressed - it's highly compressible, and * we already have ginCompressPostingList for this purpose. The list may be @@ -2233,7 +2264,7 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category, else if (typlen > 0) keylen = typlen; else if (typlen == -1) - keylen = VARSIZE_ANY(key); + keylen = VARSIZE_ANY(DatumGetPointer(key)); else if (typlen == -2) keylen = strlen(DatumGetPointer(key)) + 1; else @@ -2248,7 +2279,7 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category, while (ncompressed < nitems) { int cnt; - GinSegmentInfo *seginfo = palloc(sizeof(GinSegmentInfo)); + GinSegmentInfo *seginfo = palloc_object(GinSegmentInfo); seginfo->seg = ginCompressPostingList(&items[ncompressed], (nitems - ncompressed), @@ -2381,7 +2412,7 @@ _gin_parse_tuple_items(GinTuple *a) Assert(ndecoded == a->nitems); - return (ItemPointer) items; + return items; } /* diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c index 48eadec87b0b1..1bf061803dadd 100644 --- a/src/backend/access/gin/ginpostinglist.c +++ b/src/backend/access/gin/ginpostinglist.c @@ -84,7 +84,7 @@ #define MaxBytesPerInteger 7 static inline uint64 -itemptr_to_uint64(const ItemPointer iptr) +itemptr_to_uint64(const ItemPointerData *iptr) { uint64 val; @@ -194,7 +194,7 @@ decode_varbyte(unsigned char **ptr) * byte at the end, if any, is zero. 
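The varbyte coding referenced throughout ginpostinglist.c is the usual 7-bits-per-byte scheme with the high bit as a continuation flag. A self-contained sketch in the same spirit (the file's own encode_varbyte/decode_varbyte are equivalent in effect, though the real decoder is hand-unrolled; the names here are illustrative):

    static void
    encode_varbyte_simple(uint64 val, unsigned char **ptr)
    {
        unsigned char *p = *ptr;

        while (val > 0x7F)
        {
            *(p++) = 0x80 | (val & 0x7F);   /* low 7 bits, continuation set */
            val >>= 7;
        }
        *(p++) = (unsigned char) val;       /* final byte, continuation clear */
        *ptr = p;
    }

    static uint64
    decode_varbyte_simple(unsigned char **ptr)
    {
        uint64      val = 0;
        unsigned char *p = *ptr;
        int         shift = 0;
        unsigned char c;

        do
        {
            c = *(p++);
            val |= (uint64) (c & 0x7F) << shift;
            shift += 7;
        } while (c & 0x80);                 /* stop at clear continuation bit */
        *ptr = p;
        return val;
    }

Deltas between consecutive TIDs are small, which is what makes this encoding compact.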
*/ GinPostingList * -ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, +ginCompressPostingList(const ItemPointerData *ipd, int nipd, int maxsize, int *nwritten) { uint64 prev; diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c index c2d1771bd77b5..2ca635909f9cc 100644 --- a/src/backend/access/gin/ginscan.c +++ b/src/backend/access/gin/ginscan.c @@ -33,7 +33,7 @@ ginbeginscan(Relation rel, int nkeys, int norderbys) scan = RelationGetIndexScan(rel, nkeys, norderbys); /* allocate private workspace */ - so = (GinScanOpaque) palloc(sizeof(GinScanOpaqueData)); + so = (GinScanOpaque) palloc_object(GinScanOpaqueData); so->keys = NULL; so->nkeys = 0; so->tempCtx = AllocSetContextCreate(CurrentMemoryContext, @@ -98,7 +98,7 @@ ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum, } /* Nope, create a new entry */ - scanEntry = (GinScanEntry) palloc(sizeof(GinScanEntryData)); + scanEntry = palloc_object(GinScanEntryData); scanEntry->queryKey = queryKey; scanEntry->queryCategory = queryCategory; scanEntry->isPartialMatch = isPartialMatch; @@ -123,8 +123,7 @@ ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum, if (so->totalentries >= so->allocentries) { so->allocentries *= 2; - so->entries = (GinScanEntry *) - repalloc(so->entries, so->allocentries * sizeof(GinScanEntry)); + so->entries = repalloc_array(so->entries, GinScanEntry, so->allocentries); } so->entries[so->totalentries++] = scanEntry; @@ -170,10 +169,8 @@ ginFillScanKey(GinScanOpaque so, OffsetNumber attnum, key->nuserentries = nQueryValues; /* Allocate one extra array slot for possible "hidden" entry */ - key->scanEntry = (GinScanEntry *) palloc(sizeof(GinScanEntry) * - (nQueryValues + 1)); - key->entryRes = (GinTernaryValue *) palloc0(sizeof(GinTernaryValue) * - (nQueryValues + 1)); + key->scanEntry = palloc_array(GinScanEntry, nQueryValues + 1); + key->entryRes = palloc0_array(GinTernaryValue, nQueryValues + 1); key->query = query; key->queryValues = queryValues; @@ -271,6 +268,7 @@ ginNewScanKey(IndexScanDesc scan) ScanKey scankey = scan->keyData; GinScanOpaque so = (GinScanOpaque) scan->opaque; int i; + int numExcludeOnly; bool hasNullQuery = false; bool attrHasNormalScan[INDEX_MAX_KEYS] = {false}; MemoryContext oldCtx; @@ -393,6 +391,7 @@ ginNewScanKey(IndexScanDesc scan) * excludeOnly scan key must receive a GIN_CAT_EMPTY_QUERY hidden entry * and be set to normal (excludeOnly = false). */ + numExcludeOnly = 0; for (i = 0; i < so->nkeys; i++) { GinScanKey key = &so->keys[i]; @@ -406,6 +405,47 @@ ginNewScanKey(IndexScanDesc scan) ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_QUERY); attrHasNormalScan[key->attnum - 1] = true; } + else + numExcludeOnly++; + } + + /* + * If we left any excludeOnly scan keys as-is, move them to the end of the + * scan key array: they must appear after normal key(s). + */ + if (numExcludeOnly > 0) + { + GinScanKey tmpkeys; + int iNormalKey; + int iExcludeOnly; + + /* We'd better have made at least one normal key */ + Assert(numExcludeOnly < so->nkeys); + /* Make a temporary array to hold the re-ordered scan keys */ + tmpkeys = (GinScanKey) palloc(so->nkeys * sizeof(GinScanKeyData)); + /* Re-order the keys ... 
*/ + iNormalKey = 0; + iExcludeOnly = so->nkeys - numExcludeOnly; + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = &so->keys[i]; + + if (key->excludeOnly) + { + memcpy(tmpkeys + iExcludeOnly, key, sizeof(GinScanKeyData)); + iExcludeOnly++; + } + else + { + memcpy(tmpkeys + iNormalKey, key, sizeof(GinScanKeyData)); + iNormalKey++; + } + } + Assert(iNormalKey == so->nkeys - numExcludeOnly); + Assert(iExcludeOnly == so->nkeys); + /* ... and copy them back to so->keys[] */ + memcpy(so->keys, tmpkeys, so->nkeys * sizeof(GinScanKeyData)); + pfree(tmpkeys); } /* diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 78f7b7a2495cf..605f80aad3918 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -500,9 +500,9 @@ ginExtractEntries(GinState *ginstate, OffsetNumber attnum, if (isNull) { *nentries = 1; - entries = (Datum *) palloc(sizeof(Datum)); + entries = palloc_object(Datum); entries[0] = (Datum) 0; - *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory)); + *categories = palloc_object(GinNullCategory); (*categories)[0] = GIN_CAT_NULL_ITEM; return entries; } @@ -522,9 +522,9 @@ ginExtractEntries(GinState *ginstate, OffsetNumber attnum, if (entries == NULL || *nentries <= 0) { *nentries = 1; - entries = (Datum *) palloc(sizeof(Datum)); + entries = palloc_object(Datum); entries[0] = (Datum) 0; - *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory)); + *categories = palloc_object(GinNullCategory); (*categories)[0] = GIN_CAT_EMPTY_ITEM; return entries; } @@ -548,7 +548,7 @@ ginExtractEntries(GinState *ginstate, OffsetNumber attnum, keyEntryData *keydata; cmpEntriesArg arg; - keydata = (keyEntryData *) palloc(*nentries * sizeof(keyEntryData)); + keydata = palloc_array(keyEntryData, *nentries); for (i = 0; i < *nentries; i++) { keydata[i].datum = entries[i]; diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index fbbe3a6dd7046..d7baf7c847c4c 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -65,7 +65,7 @@ ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items, * First TID to be deleted: allocate memory to hold the * remaining items. 
*/ - tmpitems = palloc(sizeof(ItemPointerData) * nitem); + tmpitems = palloc_array(ItemPointerData, nitem); memcpy(tmpitems, items, sizeof(ItemPointerData) * i); } } @@ -260,7 +260,7 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, { if (!parent->child) { - me = (DataPageDeleteStack *) palloc0(sizeof(DataPageDeleteStack)); + me = palloc0_object(DataPageDeleteStack); me->parent = parent; parent->child = me; me->leftBuffer = InvalidBuffer; @@ -547,7 +547,7 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 pfree(plist); PageIndexTupleDelete(tmppage, i); - if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i) + if (PageAddItem(tmppage, itup, IndexTupleSize(itup), i, false, false) != i) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gvs->index)); @@ -584,7 +584,7 @@ ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, if (stats == NULL) { /* Yes, so initialize stats to zeroes */ - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); /* * and cleanup any pending inserts @@ -714,7 +714,7 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) */ if (stats == NULL) { - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); initGinState(&ginstate, index); ginInsertCleanup(&ginstate, !AmAutoVacuumWorkerProcess(), false, true, stats); @@ -753,7 +753,7 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIN_SHARE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (GinPageIsRecyclable(page)) { diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 55a1ec09776ba..34c01a01165c3 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -30,7 +30,7 @@ ginRedoClearIncompleteSplit(XLogReaderState *record, uint8 block_id) if (XLogReadBufferForRedo(record, block_id, &buffer) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); GinPageGetOpaque(page)->flags &= ~GIN_INCOMPLETE_SPLIT; PageSetLSN(page, lsn); @@ -50,7 +50,7 @@ ginRedoCreatePTree(XLogReaderState *record) Page page; buffer = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); GinInitBuffer(buffer, GIN_DATA | GIN_LEAF | GIN_COMPRESSED); @@ -93,7 +93,7 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda itup = &data->tuple; - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), offset, false, false) == InvalidOffsetNumber) + if (PageAddItem(page, itup, IndexTupleSize(itup), offset, false, false) == InvalidOffsetNumber) { RelFileLocator locator; ForkNumber forknum; @@ -119,12 +119,12 @@ ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) int actionno; int segno; GinPostingList *oldseg; - Pointer segmentend; + char *segmentend; char *walbuf; int totalsize; - Pointer tailCopy = NULL; - Pointer writePtr; - Pointer segptr; + void *tailCopy = NULL; + char *writePtr; + char *segptr; /* * If the page is in pre-9.4 format, convert to new format first. 
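A note on the Pointer-to-char-* conversions in the redo and leaf-page hunks above: these are purely cosmetic. Pointer is a historical typedef from src/include/c.h:

    typedef char *Pointer;

so the two spellings below are byte-for-byte equivalent; plain char * is simply the more conventional way to write byte arithmetic today:

    char    *endptr = (char *) seg + len;       /* new style */
    Pointer  endptr2 = ((Pointer) seg) + len;   /* old style */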
@@ -164,8 +164,8 @@ ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) } oldseg = GinDataLeafPageGetPostingList(page); - writePtr = (Pointer) oldseg; - segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page); + writePtr = (char *) oldseg; + segmentend = (char *) oldseg + GinDataLeafPageGetPostingListSize(page); segno = 0; walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf); @@ -212,7 +212,7 @@ ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) if (tailCopy) { Assert(writePtr + segsize < PageGetSpecialPointer(page)); - memcpy(writePtr, (Pointer) oldseg, segsize); + memcpy(writePtr, oldseg, segsize); } writePtr += segsize; oldseg = GinNextPostingListSegment(oldseg); @@ -243,7 +243,7 @@ ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) a_action = GIN_SEGMENT_REPLACE; } - segptr = (Pointer) oldseg; + segptr = (char *) oldseg; if (segptr != segmentend) segsize = SizeOfGinPostingList(oldseg); else @@ -264,7 +264,7 @@ ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) { int tailSize = segmentend - segptr; - tailCopy = (Pointer) palloc(tailSize); + tailCopy = palloc(tailSize); memcpy(tailCopy, segptr, tailSize); segptr = tailCopy; oldseg = (GinPostingList *) segptr; @@ -301,7 +301,7 @@ ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) } /* Copy the rest of unmodified segments if any. */ - segptr = (Pointer) oldseg; + segptr = (char *) oldseg; if (segptr != segmentend && tailCopy) { int restSize = segmentend - segptr; @@ -311,7 +311,7 @@ ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) writePtr += restSize; } - totalsize = writePtr - (Pointer) GinDataLeafPageGetPostingList(page); + totalsize = writePtr - (char *) GinDataLeafPageGetPostingList(page); GinDataPageSetDataSize(page, totalsize); } @@ -368,7 +368,6 @@ ginRedoInsert(XLogReaderState *record) #endif payload += sizeof(BlockIdData); rightChildBlkno = BlockIdGetBlockNumber((BlockId) payload); - payload += sizeof(BlockIdData); ginRedoClearIncompleteSplit(record, 1); } @@ -574,8 +573,7 @@ ginRedoUpdateMetapage(XLogReaderState *record) { tupsize = IndexTupleSize(tuples); - if (PageAddItem(page, (Item) tuples, tupsize, off, - false, false) == InvalidOffsetNumber) + if (PageAddItem(page, tuples, tupsize, off, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page"); tuples = (IndexTuple) (((char *) tuples) + tupsize); @@ -655,7 +653,7 @@ ginRedoInsertListPage(XLogReaderState *record) { tupsize = IndexTupleSize(tuples); - l = PageAddItem(page, (Item) tuples, tupsize, off, false, false); + l = PageAddItem(page, tuples, tupsize, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page"); diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 8015ff19f05bc..76e0e11f2283a 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -172,7 +172,7 @@ it splits the page, and constructs the new downlink tuples for the split pages. The caller must then call gistplacetopage() on the parent page to insert the downlink tuples. The parent page that holds the downlink to the child might have migrated as a result of concurrent splits of the -parent, gistFindCorrectParent() is used to find the parent page. +parent, so gistFindCorrectParent() is used to find the parent page. Splitting the root page works slightly differently. 
At root split, gistplacetopage() allocates the new child pages and replaces the old root @@ -291,7 +291,7 @@ Buffering build algorithm ------------------------- In the buffering index build algorithm, some or all internal nodes have a -buffer attached to them. When a tuple is inserted at the top, the descend down +buffer attached to them. When a tuple is inserted at the top, the descent down the tree is stopped as soon as a buffer is reached, and the tuple is pushed to the buffer. When a buffer gets too full, all the tuples in it are flushed to the lower level, where they again hit lower level buffers or leaf pages. This @@ -455,8 +455,8 @@ be reused. In order to delete an empty page, its downlink must be removed from the parent. We scan all the internal pages, whose block numbers we memorized in the first stage, and look for downlinks to pages that we have memorized as being empty. Whenever we find one, we acquire a lock on the parent and child -page, re-check that the child page is still empty. Then, we remove the -downlink and mark the child as deleted, and release the locks. +page and re-check that the child page is still empty. Then we remove the +downlink, mark the child as deleted, and release the locks. The insertion algorithm would get confused, if an internal page was completely empty. So we never delete the last child of an internal page, even if it's diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 7b24380c97801..c26d8538f0539 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -43,7 +43,7 @@ static void gistprunepage(Relation rel, Page page, Buffer buffer, #define ROTATEDIST(d) do { \ - SplitPageLayout *tmp = (SplitPageLayout *) palloc0(sizeof(SplitPageLayout)); \ + SplitPageLayout *tmp = palloc0_object(SplitPageLayout); \ tmp->block.blkno = InvalidBlockNumber; \ tmp->buffer = InvalidBuffer; \ tmp->next = (d); \ @@ -392,7 +392,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, /* Prepare a vector of all the downlinks */ for (ptr = dist; ptr; ptr = ptr->next) ndownlinks++; - downlinks = palloc(sizeof(IndexTuple) * ndownlinks); + downlinks = palloc_array(IndexTuple, ndownlinks); for (i = 0, ptr = dist; ptr; ptr = ptr->next) downlinks[i++] = ptr->itup; @@ -410,7 +410,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, /* Prepare split-info to be returned to caller */ for (ptr = dist; ptr; ptr = ptr->next) { - GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + GISTPageSplitInfo *si = palloc_object(GISTPageSplitInfo); si->buf = ptr->buffer; si->downlink = ptr->itup; @@ -430,7 +430,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, { IndexTuple thistup = (IndexTuple) data; - if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) + if (PageAddItem(ptr->page, data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); /* @@ -551,8 +551,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, if (ntup == 1) { /* One-for-one replacement, so use PageIndexTupleOverwrite */ - if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup, - IndexTupleSize(*itup))) + if (!PageIndexTupleOverwrite(page, oldoffnum, *itup, IndexTupleSize(*itup))) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); } @@ -683,7 +682,7 @@ 
gistdoinsert(Relation r, IndexTuple itup, Size freespace, state.stack = stack = stack->parent; } - if (XLogRecPtrIsInvalid(stack->lsn)) + if (!XLogRecPtrIsValid(stack->lsn)) stack->buffer = ReadBuffer(state.r, stack->blkno); /* @@ -696,10 +695,10 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, gistcheckpage(state.r, stack->buffer); } - stack->page = (Page) BufferGetPage(stack->buffer); + stack->page = BufferGetPage(stack->buffer); stack->lsn = xlocked ? PageGetLSN(stack->page) : BufferGetLSNAtomic(stack->buffer); - Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); + Assert(!RelationNeedsWAL(state.r) || XLogRecPtrIsValid(stack->lsn)); /* * If this page was split but the downlink was never inserted to the @@ -783,7 +782,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; - stack->page = (Page) BufferGetPage(stack->buffer); + stack->page = BufferGetPage(stack->buffer); if (PageGetLSN(stack->page) != stack->lsn) { @@ -824,7 +823,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, xlocked = false; /* descend to the chosen child */ - item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + item = palloc0_object(GISTInsertStack); item->blkno = childblkno; item->parent = stack; item->downlinkoffnum = downlinkoffnum; @@ -847,7 +846,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; - stack->page = (Page) BufferGetPage(stack->buffer); + stack->page = BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); if (stack->blkno == GIST_ROOT_BLKNO) @@ -924,7 +923,7 @@ gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) *ptr; BlockNumber blkno; - top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + top = palloc0_object(GISTInsertStack); top->blkno = GIST_ROOT_BLKNO; top->downlinkoffnum = InvalidOffsetNumber; @@ -938,7 +937,7 @@ gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) buffer = ReadBuffer(r, top->blkno); LockBuffer(buffer, GIST_SHARE); gistcheckpage(r, buffer); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (GistPageIsLeaf(page)) { @@ -976,7 +975,7 @@ gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) * leaf pages, and we assume that there can't be any non-leaf * pages behind leaf pages. 
*/ - ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + ptr = palloc0_object(GISTInsertStack); ptr->blkno = GistPageGetOpaque(page)->rightlink; ptr->downlinkoffnum = InvalidOffsetNumber; ptr->parent = top->parent; @@ -1001,7 +1000,7 @@ gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) else { /* Append this child to the list of pages to visit later */ - ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + ptr = palloc0_object(GISTInsertStack); ptr->blkno = blkno; ptr->downlinkoffnum = i; ptr->parent = top; @@ -1033,7 +1032,7 @@ gistFindCorrectParent(Relation r, GISTInsertStack *child, bool is_build) GISTInsertStack *ptr; gistcheckpage(r, parent->buffer); - parent->page = (Page) BufferGetPage(parent->buffer); + parent->page = BufferGetPage(parent->buffer); maxoff = PageGetMaxOffsetNumber(parent->page); /* Check if the downlink is still where it was before */ @@ -1098,7 +1097,7 @@ gistFindCorrectParent(Relation r, GISTInsertStack *child, bool is_build) parent->buffer = ReadBuffer(r, parent->blkno); LockBuffer(parent->buffer, GIST_EXCLUSIVE); gistcheckpage(r, parent->buffer); - parent->page = (Page) BufferGetPage(parent->buffer); + parent->page = BufferGetPage(parent->buffer); } /* @@ -1121,7 +1120,7 @@ gistFindCorrectParent(Relation r, GISTInsertStack *child, bool is_build) while (ptr) { ptr->buffer = ReadBuffer(r, ptr->blkno); - ptr->page = (Page) BufferGetPage(ptr->buffer); + ptr->page = BufferGetPage(ptr->buffer); ptr = ptr->parent; } @@ -1219,7 +1218,7 @@ gistfixsplit(GISTInsertState *state, GISTSTATE *giststate) */ for (;;) { - GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + GISTPageSplitInfo *si = palloc_object(GISTPageSplitInfo); IndexTuple downlink; page = BufferGetPage(buf); @@ -1483,8 +1482,8 @@ gistSplit(Relation r, gistSplitByKey(r, page, itup, len, giststate, &v, 0); /* form left and right vector */ - lvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1)); - rvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1)); + lvectup = palloc_array(IndexTuple, len + 1); + rvectup = palloc_array(IndexTuple, len + 1); for (i = 0; i < v.splitVector.spl_nleft; i++) lvectup[i] = itup[v.splitVector.spl_left[i] - 1]; @@ -1553,7 +1552,7 @@ initGISTstate(Relation index) oldCxt = MemoryContextSwitchTo(scanCxt); /* Create and fill in the GISTSTATE */ - giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE)); + giststate = palloc_object(GISTSTATE); giststate->scanCxt = scanCxt; giststate->tempCxt = scanCxt; /* caller must change this if needed */ diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 9e707167d984b..b9fa196149dc1 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -346,7 +346,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) /* * Return statistics */ - result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + result = palloc_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; @@ -409,7 +409,7 @@ gist_indexsortbuild(GISTBuildState *state) state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM); /* Allocate a temporary buffer for the first leaf page batch. 
*/ - levelstate = palloc0(sizeof(GistSortedBuildLevelState)); + levelstate = palloc0_object(GistSortedBuildLevelState); levelstate->pages[0] = palloc(BLCKSZ); levelstate->parent = NULL; gistinitpage(levelstate->pages[0], F_LEAF); @@ -526,7 +526,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, else { /* Create split layout from single page */ - dist = (SplitPageLayout *) palloc0(sizeof(SplitPageLayout)); + dist = palloc0_object(SplitPageLayout); union_tuple = gistunion(state->indexrel, itvec, vect_len, state->giststate); dist->itup = union_tuple; @@ -558,7 +558,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, { IndexTuple thistup = (IndexTuple) data; - if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) + if (PageAddItem(target, data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel)); data += IndexTupleSize(thistup); @@ -597,7 +597,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, parent = levelstate->parent; if (parent == NULL) { - parent = palloc0(sizeof(GistSortedBuildLevelState)); + parent = palloc0_object(GistSortedBuildLevelState); parent->pages[0] = palloc(BLCKSZ); parent->parent = NULL; gistinitpage(parent->pages[0], 0); @@ -969,7 +969,7 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, buffer = ReadBuffer(indexrel, blkno); LockBuffer(buffer, GIST_EXCLUSIVE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); childoffnum = gistchoose(indexrel, page, itup, giststate); iid = PageGetItemId(page, childoffnum); idxtuple = (IndexTuple) PageGetItem(page, iid); @@ -1154,7 +1154,7 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level, /* Create an array of all the downlink tuples */ ndownlinks = list_length(splitinfo); - downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks); + downlinks = palloc_array(IndexTuple, ndownlinks); i = 0; foreach(lc, splitinfo) { @@ -1448,7 +1448,7 @@ gistGetMaxLevel(Relation index) * pro forma. */ LockBuffer(buffer, GIST_SHARE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (GistPageIsLeaf(page)) { diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c index 0707254d18ea1..c86d17c4a562b 100644 --- a/src/backend/access/gist/gistbuildbuffers.c +++ b/src/backend/access/gist/gistbuildbuffers.c @@ -46,7 +46,7 @@ gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) GISTBuildBuffers *gfbb; HASHCTL hashCtl; - gfbb = palloc(sizeof(GISTBuildBuffers)); + gfbb = palloc_object(GISTBuildBuffers); gfbb->pagesPerBuffer = pagesPerBuffer; gfbb->levelStep = levelStep; @@ -60,7 +60,7 @@ gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) /* Initialize free page management. */ gfbb->nFreeBlocks = 0; gfbb->freeBlocksLen = 32; - gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long)); + gfbb->freeBlocks = palloc_array(long, gfbb->freeBlocksLen); /* * Current memory context will be used for all in-memory data structures @@ -87,8 +87,7 @@ gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) * buffers are inserted here when they are created. 
*/ gfbb->buffersOnLevelsLen = 1; - gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) * - gfbb->buffersOnLevelsLen); + gfbb->buffersOnLevels = palloc_array(List *, gfbb->buffersOnLevelsLen); gfbb->buffersOnLevels[0] = NIL; /* @@ -96,8 +95,7 @@ gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) * into main memory. */ gfbb->loadedBuffersLen = 32; - gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen * - sizeof(GISTNodeBuffer *)); + gfbb->loadedBuffers = palloc_array(GISTNodeBuffer *, gfbb->loadedBuffersLen); gfbb->loadedBuffersCount = 0; gfbb->rootlevel = maxLevel; @@ -582,9 +580,7 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, * Allocate memory for information about relocation buffers. */ splitPagesCount = list_length(splitinfo); - relocationBuffersInfos = - (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) * - splitPagesCount); + relocationBuffersInfos = palloc_array(RelocationBufferInfo, splitPagesCount); /* * Fill relocation buffers information for node buffers of pages produced diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 387d997234537..9ba45acfff384 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -46,7 +46,7 @@ gistkillitems(IndexScanDesc scan) bool killedsomething = false; Assert(so->curBlkno != InvalidBlockNumber); - Assert(!XLogRecPtrIsInvalid(so->curPageLSN)); + Assert(XLogRecPtrIsValid(so->curPageLSN)); Assert(so->killedItems != NULL); buffer = ReadBuffer(scan->indexRelation, so->curBlkno); @@ -353,7 +353,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, * parentlsn < nsn), or if the system crashed after a page split but * before the downlink was inserted into the parent. 
*/ - if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) && + if (XLogRecPtrIsValid(pageItem->data.parentlsn) && (GistFollowRight(page) || pageItem->data.parentlsn < GistPageGetNSN(page)) && opaque->rightlink != InvalidBlockNumber /* sanity check */ ) diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c index 392163cb22900..9ac06504be17e 100644 --- a/src/backend/access/gist/gistproc.c +++ b/src/backend/access/gist/gistproc.c @@ -171,7 +171,7 @@ gist_box_union(PG_FUNCTION_ARGS) *pageunion; numranges = entryvec->n; - pageunion = (BOX *) palloc(sizeof(BOX)); + pageunion = palloc_object(BOX); cur = DatumGetBoxP(entryvec->vector[0].key); memcpy(pageunion, cur, sizeof(BOX)); @@ -237,7 +237,7 @@ fallbackSplit(GistEntryVector *entryvec, GIST_SPLITVEC *v) v->spl_left[v->spl_nleft] = i; if (unionL == NULL) { - unionL = (BOX *) palloc(sizeof(BOX)); + unionL = palloc_object(BOX); *unionL = *cur; } else @@ -250,7 +250,7 @@ fallbackSplit(GistEntryVector *entryvec, GIST_SPLITVEC *v) v->spl_right[v->spl_nright] = i; if (unionR == NULL) { - unionR = (BOX *) palloc(sizeof(BOX)); + unionR = palloc_object(BOX); *unionR = *cur; } else @@ -698,8 +698,8 @@ gist_box_picksplit(PG_FUNCTION_ARGS) v->spl_nright = 0; /* Allocate bounding boxes of left and right groups */ - leftBox = palloc0(sizeof(BOX)); - rightBox = palloc0(sizeof(BOX)); + leftBox = palloc0_object(BOX); + rightBox = palloc0_object(BOX); /* * Allocate an array for "common entries" - entries which can be placed to @@ -1042,10 +1042,10 @@ gist_poly_compress(PG_FUNCTION_ARGS) POLYGON *in = DatumGetPolygonP(entry->key); BOX *r; - r = (BOX *) palloc(sizeof(BOX)); + r = palloc_object(BOX); memcpy(r, &(in->boundbox), sizeof(BOX)); - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, false); @@ -1107,13 +1107,13 @@ gist_circle_compress(PG_FUNCTION_ARGS) CIRCLE *in = DatumGetCircleP(entry->key); BOX *r; - r = (BOX *) palloc(sizeof(BOX)); + r = palloc_object(BOX); r->high.x = float8_pl(in->center.x, in->radius); r->low.x = float8_mi(in->center.x, in->radius); r->high.y = float8_pl(in->center.y, in->radius); r->low.y = float8_mi(in->center.y, in->radius); - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, false); @@ -1171,9 +1171,9 @@ gist_point_compress(PG_FUNCTION_ARGS) if (entry->leafkey) /* Point, actually */ { - BOX *box = palloc(sizeof(BOX)); + BOX *box = palloc_object(BOX); Point *point = DatumGetPointP(entry->key); - GISTENTRY *retval = palloc(sizeof(GISTENTRY)); + GISTENTRY *retval = palloc_object(GISTENTRY); box->high = box->low = *point; @@ -1200,9 +1200,9 @@ gist_point_fetch(PG_FUNCTION_ARGS) Point *r; GISTENTRY *retval; - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); - r = (Point *) palloc(sizeof(Point)); + r = palloc_object(Point); r->x = in->high.x; r->y = in->high.y; gistentryinit(*retval, PointerGetDatum(r), @@ -1707,8 +1707,8 @@ gist_bbox_zorder_cmp(Datum a, Datum b, SortSupport ssup) * Abbreviated version of Z-order comparison * * The abbreviated format is a Z-order value computed from the two 32-bit - * floats. If SIZEOF_DATUM == 8, the 64-bit Z-order value fits fully in the - * abbreviated Datum, otherwise use its most significant bits. + * floats. 
Now that sizeof(Datum) is always 8, the 64-bit Z-order value + * always fits fully in the abbreviated Datum. */ static Datum gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup) @@ -1718,11 +1718,7 @@ gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup) z = point_zorder_internal(p->x, p->y); -#if SIZEOF_DATUM == 8 - return (Datum) z; -#else - return (Datum) (z >> 32); -#endif + return UInt64GetDatum(z); } /* diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 700fa959d03d8..01b8ff0b6facf 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -90,7 +90,7 @@ gistbeginscan(Relation r, int nkeys, int norderbys) oldCxt = MemoryContextSwitchTo(giststate->scanCxt); /* initialize opaque data */ - so = (GISTScanOpaque) palloc0(sizeof(GISTScanOpaqueData)); + so = palloc0_object(GISTScanOpaqueData); so->giststate = giststate; giststate->tempCxt = createTempGistContext(); so->queue = NULL; @@ -101,8 +101,8 @@ gistbeginscan(Relation r, int nkeys, int norderbys) so->qual_ok = true; /* in case there are zero keys */ if (scan->numberOfOrderBys > 0) { - scan->xs_orderbyvals = palloc0(sizeof(Datum) * scan->numberOfOrderBys); - scan->xs_orderbynulls = palloc(sizeof(bool) * scan->numberOfOrderBys); + scan->xs_orderbyvals = palloc0_array(Datum, scan->numberOfOrderBys); + scan->xs_orderbynulls = palloc_array(bool, scan->numberOfOrderBys); memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys); } diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c index 49838ceb07b19..21fea505d9c6d 100644 --- a/src/backend/access/gist/gistsplit.c +++ b/src/backend/access/gist/gistsplit.c @@ -51,7 +51,7 @@ gistunionsubkeyvec(GISTSTATE *giststate, IndexTuple *itvec, int i, cleanedLen = 0; - cleanedItVec = (IndexTuple *) palloc(sizeof(IndexTuple) * gsvp->len); + cleanedItVec = palloc_array(IndexTuple, gsvp->len); for (i = 0; i < gsvp->len; i++) { @@ -501,7 +501,7 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec * Locate don't-care tuples, if any. If there are none, the split is * optimal, so just fall out and return false. */ - v->spl_dontcare = (bool *) palloc0(sizeof(bool) * (entryvec->n + 1)); + v->spl_dontcare = palloc0_array(bool, entryvec->n + 1); NumDontCare = findDontCares(r, giststate, entryvec->vector, v, attno); @@ -738,9 +738,9 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, * call will overwrite that with its own result. 
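For readers unfamiliar with the abbreviation scheme mentioned here: a Z-order (Morton) key interleaves the bits of two 32-bit values into one 64-bit value. An illustrative-only sketch follows; the real point_zorder_internal() additionally remaps the float bit patterns so that their unsigned-integer order matches their numeric order, which this sketch omits:

    static uint64
    zorder64(uint32 x, uint32 y)
    {
        uint64      z = 0;

        for (int i = 0; i < 32; i++)
        {
            z |= ((uint64) ((x >> i) & 1)) << (2 * i + 1);  /* x in odd bits */
            z |= ((uint64) ((y >> i) & 1)) << (2 * i);      /* y in even bits */
        }
        return z;
    }

With sizeof(Datum) now fixed at 8, the whole 64-bit key fits in the abbreviated Datum, so the 32-bit truncation branch can be dropped, as the hunk above does.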
*/ backupSplit = v->splitVector; - backupSplit.spl_left = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); + backupSplit.spl_left = palloc_array(OffsetNumber, len); memcpy(backupSplit.spl_left, v->splitVector.spl_left, sizeof(OffsetNumber) * v->splitVector.spl_nleft); - backupSplit.spl_right = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); + backupSplit.spl_right = palloc_array(OffsetNumber, len); memcpy(backupSplit.spl_right, v->splitVector.spl_right, sizeof(OffsetNumber) * v->splitVector.spl_nright); /* Recursively decide how to split the don't-care tuples */ diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index a6b701943d3de..6fc56c3c893f8 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -44,10 +44,10 @@ gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off) Size sz = IndexTupleSize(itup[i]); OffsetNumber l; - l = PageAddItem(page, (Item) itup[i], sz, off, false, false); + l = PageAddItem(page, itup[i], sz, off, false, false); if (l == InvalidOffsetNumber) - elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes", - i, len, (int) sz); + elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %zu bytes", + i, len, sz); off++; } } @@ -100,7 +100,7 @@ gistextractpage(Page page, int *len /* out */ ) maxoff = PageGetMaxOffsetNumber(page); *len = maxoff; - itvec = palloc(sizeof(IndexTuple) * maxoff); + itvec = palloc_array(IndexTuple, maxoff); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) itvec[i - FirstOffsetNumber] = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); @@ -113,7 +113,7 @@ gistextractpage(Page page, int *len /* out */ ) IndexTuple * gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen) { - itvec = (IndexTuple *) repalloc(itvec, sizeof(IndexTuple) * ((*len) + addlen)); + itvec = repalloc_array(itvec, IndexTuple, (*len) + addlen); memmove(&itvec[*len], additvec, sizeof(IndexTuple) * addlen); *len += addlen; return itvec; @@ -157,7 +157,7 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, { int i; GistEntryVector *evec; - int attrsize; + int attrsize = 0; /* silence compiler warning */ evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ); @@ -242,7 +242,7 @@ gistMakeUnionKey(GISTSTATE *giststate, int attno, char padding[2 * sizeof(GISTENTRY) + GEVHDRSZ]; } storage; GistEntryVector *evec = &storage.gev; - int dstsize; + int dstsize = 0; /* silence compiler warning */ evec->n = 2; @@ -1040,7 +1040,7 @@ gistGetFakeLSN(Relation rel) Assert(!RelationNeedsWAL(rel)); /* No need for an actual record if we already have a distinct LSN */ - if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn) + if (XLogRecPtrIsValid(lastlsn) && lastlsn == currlsn) currlsn = gistXLogAssignLSN(); lastlsn = currlsn; @@ -1058,11 +1058,11 @@ gistGetFakeLSN(Relation rel) } /* - * This is a stratnum support function for GiST opclasses that use the - * RT*StrategyNumber constants. + * This is a stratnum translation support function for GiST opclasses that use + * the RT*StrategyNumber constants. */ Datum -gist_stratnum_common(PG_FUNCTION_ARGS) +gist_translate_cmptype_common(PG_FUNCTION_ARGS) { CompareType cmptype = PG_GETARG_INT32(0); @@ -1090,9 +1090,9 @@ gist_stratnum_common(PG_FUNCTION_ARGS) /* * Returns the opclass's private stratnum used for the given compare type. * - * Calls the opclass's GIST_STRATNUM_PROC support function, if any, - * and returns the result. 
- * Returns InvalidStrategy if the function is not defined. + * Calls the opclass's GIST_TRANSLATE_CMPTYPE_PROC support function, if any, + * and returns the result. Returns InvalidStrategy if the function is not + * defined. */ StrategyNumber gisttranslatecmptype(CompareType cmptype, Oid opfamily) @@ -1101,7 +1101,7 @@ gisttranslatecmptype(CompareType cmptype, Oid opfamily) Datum result; /* Check whether the function is provided. */ - funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_STRATNUM_PROC); + funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_TRANSLATE_CMPTYPE_PROC); if (!OidIsValid(funcid)) return InvalidStrategy; diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index dca236b6e5735..7591ad4da1acd 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -61,7 +61,7 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, { /* allocate stats if first time through, else re-use existing struct */ if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); gistvacuumscan(info, stats, callback, callback_state); @@ -85,7 +85,7 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) */ if (stats == NULL) { - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); gistvacuumscan(info, stats, NULL, NULL); } @@ -330,7 +330,7 @@ gistvacuumpage(GistVacState *vstate, Buffer buffer) * exclusive lock. */ LockBuffer(buffer, GIST_EXCLUSIVE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (gistPageRecyclable(page)) { @@ -528,7 +528,7 @@ gistvacuum_delete_empty_pages(IndexVacuumInfo *info, GistVacState *vstate) RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page)) { diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c index 2a49e6d20f049..2ed6f74fce97b 100644 --- a/src/backend/access/gist/gistvalidate.c +++ b/src/backend/access/gist/gistvalidate.c @@ -138,7 +138,7 @@ gistvalidate(Oid opclassoid) ok = check_amproc_signature(procform->amproc, VOIDOID, true, 1, 1, INTERNALOID); break; - case GIST_STRATNUM_PROC: + case GIST_TRANSLATE_CMPTYPE_PROC: ok = check_amproc_signature(procform->amproc, INT2OID, true, 1, 1, INT4OID) && procform->amproclefttype == ANYOID && @@ -265,7 +265,7 @@ gistvalidate(Oid opclassoid) if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC || i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC || i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC || - i == GIST_STRATNUM_PROC) + i == GIST_TRANSLATE_CMPTYPE_PROC) continue; /* optional methods */ ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -336,7 +336,7 @@ gistadjustmembers(Oid opfamilyoid, case GIST_FETCH_PROC: case GIST_OPTIONS_PROC: case GIST_SORTSUPPORT_PROC: - case GIST_STRATNUM_PROC: + case GIST_TRANSLATE_CMPTYPE_PROC: /* Optional, so force it to be a soft family dependency */ op->ref_is_hard = false; op->ref_is_family = true; diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index b354e4ba5d1b7..6484ca5e2cac2 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -83,7 +83,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record) data = begin = 
XLogRecGetBlockData(record, 0, &datalen); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (xldata->ntodelete == 1 && xldata->ntoinsert == 1) { @@ -98,9 +98,8 @@ gistRedoPageUpdateRecord(XLogReaderState *record) data += sizeof(OffsetNumber); itup = (IndexTuple) data; itupsize = IndexTupleSize(itup); - if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itupsize)) - elog(ERROR, "failed to add item to GiST index page, size %d bytes", - (int) itupsize); + if (!PageIndexTupleOverwrite(page, offnum, itup, itupsize)) + elog(ERROR, "failed to add item to GiST index page, size %zu bytes", itupsize); data += itupsize; /* should be nothing left after consuming 1 tuple */ Assert(data - begin == datalen); @@ -133,10 +132,9 @@ gistRedoPageUpdateRecord(XLogReaderState *record) data += sz; - l = PageAddItem(page, (Item) itup, sz, off, false, false); + l = PageAddItem(page, itup, sz, off, false, false); if (l == InvalidOffsetNumber) - elog(ERROR, "failed to add item to GiST index page, size %d bytes", - (int) sz); + elog(ERROR, "failed to add item to GiST index page, size %zu bytes", sz); off++; ninserted++; } @@ -201,7 +199,7 @@ gistRedoDeleteRecord(XLogReaderState *record) if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); PageIndexMultiDelete(page, toDelete, xldata->ntodelete); @@ -280,7 +278,7 @@ gistRedoPageSplitRecord(XLogReaderState *record) } buffer = XLogInitBufferForRedo(record, i + 1); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); data = XLogRecGetBlockData(record, i + 1, &datalen); tuples = decodePageSplitRecord(data, datalen, &num); @@ -348,7 +346,7 @@ gistRedoPageDelete(XLogReaderState *record) if (XLogReadBufferForRedo(record, 0, &leafBuffer) == BLK_NEEDS_REDO) { - Page page = (Page) BufferGetPage(leafBuffer); + Page page = BufferGetPage(leafBuffer); GistPageSetDeleted(page, xldata->deleteXid); @@ -358,7 +356,7 @@ gistRedoPageDelete(XLogReaderState *record) if (XLogReadBufferForRedo(record, 1, &parentBuffer) == BLK_NEEDS_REDO) { - Page page = (Page) BufferGetPage(parentBuffer); + Page page = BufferGetPage(parentBuffer); PageIndexTupleDelete(page, xldata->downlinkOffset); diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README index 13dc59c124a75..fc9031117c98b 100644 --- a/src/backend/access/hash/README +++ b/src/backend/access/hash/README @@ -171,11 +171,10 @@ Metapage Caching Both scanning the index and inserting tuples require locating the bucket where a given tuple ought to be located. To do this, we need the bucket count, highmask, and lowmask from the metapage; however, it's undesirable -for performance reasons to have to have to lock and pin the metapage for -every such operation. Instead, we retain a cached copy of the metapage -in each backend's relcache entry. This will produce the correct -bucket mapping as long as the target bucket hasn't been split since the -last cache refresh. +for performance reasons to have to lock and pin the metapage for every such +operation. Instead, we retain a cached copy of the metapage in each backend's +relcache entry. This will produce the correct bucket mapping as long as the +target bucket hasn't been split since the last cache refresh. 
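As a sketch of how those cached fields yield a bucket mapping (an illustration
of the mask-based scheme described here, with a hypothetical helper name, not
a verbatim copy of the source):

    static uint32
    hashkey_to_bucket(uint32 hashkey, uint32 maxbucket,
                      uint32 highmask, uint32 lowmask)
    {
        uint32      bucket = hashkey & highmask;

        /*
         * Masking with highmask can name a bucket that hasn't been created
         * yet; in that case the key still lives in its pre-split bucket, so
         * fall back to the smaller mask.
         */
        if (bucket > maxbucket)
            bucket = bucket & lowmask;

        return bucket;
    }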
To guard against the possibility that such a split has occurred, the primary page of each bucket chain stores the number of buckets that diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 53061c819fbf0..e388252afdcc8 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -193,7 +193,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) /* * Return statistics */ - result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + result = palloc_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; @@ -318,8 +318,7 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) * entries. */ if (so->killedItems == NULL) - so->killedItems = (int *) - palloc(MaxIndexTuplesPerPage * sizeof(int)); + so->killedItems = palloc_array(int, MaxIndexTuplesPerPage); if (so->numKilled < MaxIndexTuplesPerPage) so->killedItems[so->numKilled++] = so->currPos.itemIndex; @@ -381,7 +380,7 @@ hashbeginscan(Relation rel, int nkeys, int norderbys) scan = RelationGetIndexScan(rel, nkeys, norderbys); - so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); + so = (HashScanOpaque) palloc_object(HashScanOpaqueData); HashScanPosInvalidate(so->currPos); so->hashso_bucket_buf = InvalidBuffer; so->hashso_split_bucket_buf = InvalidBuffer; @@ -633,7 +632,7 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* return statistics */ if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index 8d97067fe5403..b3c0294884c07 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -38,7 +38,7 @@ hash_xlog_init_meta_page(XLogReaderState *record) Assert(BufferIsValid(metabuf)); _hash_init_metabuffer(metabuf, xlrec->num_tuples, xlrec->procid, xlrec->ffactor, true); - page = (Page) BufferGetPage(metabuf); + page = BufferGetPage(metabuf); PageSetLSN(page, lsn); MarkBufferDirty(metabuf); @@ -137,8 +137,7 @@ hash_xlog_insert(XLogReaderState *record) page = BufferGetPage(buffer); - if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) + if (PageAddItem(page, datapos, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber) elog(PANIC, "hash_xlog_insert: failed to add item"); PageSetLSN(page, lsn); @@ -235,7 +234,7 @@ hash_xlog_add_ovfl_page(XLogReaderState *record) if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) { - Page mappage = (Page) BufferGetPage(mapbuffer); + Page mappage = BufferGetPage(mapbuffer); uint32 *freep = NULL; uint32 *bitmap_page_bit; @@ -315,8 +314,6 @@ hash_xlog_split_allocate_page(XLogReaderState *record) Buffer oldbuf; Buffer newbuf; Buffer metabuf; - Size datalen PG_USED_FOR_ASSERTS_ONLY; - char *data; XLogRedoAction action; /* @@ -376,6 +373,10 @@ hash_xlog_split_allocate_page(XLogReaderState *record) { Page page; HashMetaPage metap; + Size datalen; + char *data; + uint32 *uidata; + int uidatacount; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); @@ -383,34 +384,31 @@ hash_xlog_split_allocate_page(XLogReaderState *record) data = XLogRecGetBlockData(record, 2, &datalen); + /* + * This cast is ok because XLogRecGetBlockData() returns a MAXALIGNed + * buffer. 
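+	 *
+	 * With both XLH_SPLIT_META_UPDATE_MASKS and
+	 * XLH_SPLIT_META_UPDATE_SPLITPOINT set, the block data is consumed
+	 * below as four consecutive uint32 values, in order: lowmask,
+	 * highmask, ovflpoint, ovflpages.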
+ */ + uidata = (uint32 *) data; + uidatacount = 0; + if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) { - uint32 lowmask; - uint32 *highmask; - - /* extract low and high masks. */ - memcpy(&lowmask, data, sizeof(uint32)); - highmask = (uint32 *) ((char *) data + sizeof(uint32)); + uint32 lowmask = uidata[uidatacount++]; + uint32 highmask = uidata[uidatacount++]; /* update metapage */ metap->hashm_lowmask = lowmask; - metap->hashm_highmask = *highmask; - - data += sizeof(uint32) * 2; + metap->hashm_highmask = highmask; } if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) { - uint32 ovflpoint; - uint32 *ovflpages; - - /* extract information of overflow pages. */ - memcpy(&ovflpoint, data, sizeof(uint32)); - ovflpages = (uint32 *) ((char *) data + sizeof(uint32)); + uint32 ovflpoint = uidata[uidatacount++]; + uint32 ovflpages = uidata[uidatacount++]; /* update metapage */ - metap->hashm_spares[ovflpoint] = *ovflpages; metap->hashm_ovflpoint = ovflpoint; + metap->hashm_spares[ovflpoint] = ovflpages; } MarkBufferDirty(metabuf); @@ -538,7 +536,7 @@ hash_xlog_move_page_contents(XLogReaderState *record) data = begin = XLogRecGetBlockData(record, 1, &datalen); - writepage = (Page) BufferGetPage(writebuf); + writepage = BufferGetPage(writebuf); if (xldata->ntups > 0) { @@ -557,10 +555,9 @@ hash_xlog_move_page_contents(XLogReaderState *record) data += itemsz; - l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + l = PageAddItem(writepage, itup, itemsz, towrite[ninserted], false, false); if (l == InvalidOffsetNumber) - elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes", - (int) itemsz); + elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %zu bytes", itemsz); ninserted++; } @@ -584,7 +581,7 @@ hash_xlog_move_page_contents(XLogReaderState *record) ptr = XLogRecGetBlockData(record, 2, &len); - page = (Page) BufferGetPage(deletebuf); + page = BufferGetPage(deletebuf); if (len > 0) { @@ -592,7 +589,7 @@ hash_xlog_move_page_contents(XLogReaderState *record) OffsetNumber *unend; unused = (OffsetNumber *) ptr; - unend = (OffsetNumber *) ((char *) ptr + len); + unend = (OffsetNumber *) (ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); @@ -670,7 +667,7 @@ hash_xlog_squeeze_page(XLogReaderState *record) data = begin = XLogRecGetBlockData(record, 1, &datalen); - writepage = (Page) BufferGetPage(writebuf); + writepage = BufferGetPage(writebuf); if (xldata->ntups > 0) { @@ -689,10 +686,9 @@ hash_xlog_squeeze_page(XLogReaderState *record) data += itemsz; - l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + l = PageAddItem(writepage, itup, itemsz, towrite[ninserted], false, false); if (l == InvalidOffsetNumber) - elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", - (int) itemsz); + elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %zu bytes", itemsz); ninserted++; } @@ -807,7 +803,7 @@ hash_xlog_squeeze_page(XLogReaderState *record) /* replay the record for bitmap page */ if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) { - Page mappage = (Page) BufferGetPage(mapbuf); + Page mappage = BufferGetPage(mapbuf); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; @@ -895,7 +891,7 @@ hash_xlog_delete(XLogReaderState *record) ptr = XLogRecGetBlockData(record, 1, &len); - page = (Page) BufferGetPage(deletebuf); + page = 
BufferGetPage(deletebuf); if (len > 0) { @@ -903,7 +899,7 @@ hash_xlog_delete(XLogReaderState *record) OffsetNumber *unend; unused = (OffsetNumber *) ptr; - unend = (OffsetNumber *) ((char *) ptr + len); + unend = (OffsetNumber *) (ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); @@ -946,7 +942,7 @@ hash_xlog_split_cleanup(XLogReaderState *record) { HashPageOpaque bucket_opaque; - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); bucket_opaque = HashPageGetOpaque(page); bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; @@ -1029,7 +1025,7 @@ hash_xlog_vacuum_one_page(XLogReaderState *record) if (action == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); PageIndexMultiDelete(page, toDelete, xldata->ntuples); diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 10de1580dc211..0f9f97f7e3d09 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -310,10 +310,8 @@ _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup, itup_off = _hash_binsearch(page, hashkey); } - if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) - == InvalidOffsetNumber) - elog(ERROR, "failed to add index item to \"%s\"", - RelationGetRelationName(rel)); + if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); return itup_off; } @@ -352,10 +350,8 @@ _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, itup_offsets[i] = itup_off; - if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) - == InvalidOffsetNumber) - elog(ERROR, "failed to add index item to \"%s\"", - RelationGetRelationName(rel)); + if (PageAddItem(page, itups[i], itemsize, itup_off, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); } } diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c index 6e8c0e68a92c8..0fd41f4f4ca89 100644 --- a/src/backend/access/hash/hashsort.c +++ b/src/backend/access/hash/hashsort.c @@ -59,7 +59,7 @@ struct HSpool HSpool * _h_spoolinit(Relation heap, Relation index, uint32 num_buckets) { - HSpool *hspool = (HSpool *) palloc0(sizeof(HSpool)); + HSpool *hspool = palloc0_object(HSpool); hspool->index = index; @@ -106,7 +106,7 @@ _h_spooldestroy(HSpool *hspool) * spool an index entry into the sort file. 
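 * The sort key is the bucket number computed from the hash of the index
 * key, so at build time the spooled tuples come back grouped by bucket.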
*/ void -_h_spool(HSpool *hspool, ItemPointer self, const Datum *values, const bool *isnull) +_h_spool(HSpool *hspool, const ItemPointerData *self, const Datum *values, const bool *isnull) { tuplesort_putindextuplevalues(hspool->sortstate, hspool->index, self, values, isnull); diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index 66c39f606540b..f41233fcd07d4 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -316,7 +316,7 @@ _hash_get_indextuple_hashkey(IndexTuple itup) */ bool _hash_convert_tuple(Relation index, - Datum *user_values, bool *user_isnull, + const Datum *user_values, const bool *user_isnull, Datum *index_values, bool *index_isnull) { uint32 hashkey; diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock index 843c2e58f929d..16f7d78b7d232 100644 --- a/src/backend/access/heap/README.tuplock +++ b/src/backend/access/heap/README.tuplock @@ -199,3 +199,35 @@ under a reader holding a pin. A reader of a heap_fetch() result tuple may witness a torn read. Current inplace-updated fields are aligned and are no wider than four bytes, and current readers don't need consistency across fields. Hence, they get by with just fetching each field once. + +During logical decoding, caches reflect an inplace update no later than the +next XLOG_XACT_INVALIDATIONS. That record witnesses the end of a command. +Tuples of its cmin are then visible to decoding, as are inplace updates of any +lower LSN. Inplace updates of a higher LSN may also be visible, even if those +updates would have been invisible to a non-historic snapshot matching +decoding's historic snapshot. (In other words, decoding may see inplace +updates that were not visible to a similar snapshot taken during original +transaction processing.) That's a consequence of inplace update violating +MVCC: there are no snapshot-specific versions of inplace-updated values. This +all makes it hard to reason about inplace-updated column reads during logical +decoding, but the behavior does suffice for relhasindex. A relhasindex=t in +CREATE INDEX becomes visible no later than the new pg_index row. While it may +be visible earlier, that's harmless. Finding zero indexes despite +relhasindex=t is normal in more cases than this, e.g. after DROP INDEX. +Example of a case that meaningfully reacts to the inplace inval: + +CREATE TABLE cat (c int) WITH (user_catalog_table = true); +CREATE TABLE normal (d int); +... +CREATE INDEX ON cat (c)\; INSERT INTO normal VALUES (1); + +If the output plugin reads "cat" during decoding of the INSERT, it's fair to +want that read to see relhasindex=t and use the new index. + +An alternative would be to have decoding of XLOG_HEAP_INPLACE immediately +execute its invals. That would behave more like invals during original +transaction processing. It would remove the decoding-specific delay in e.g. a +decoding plugin witnessing a relfrozenxid change. However, a good use case +for that is unlikely, since the plugin would still witness relfrozenxid +changes prematurely. Hence, inplace update takes the trivial approach of +delegating to XLOG_XACT_INVALIDATIONS. 
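(A note on the signature changes in the heapam.c hunks that follow: the patch
spells read-only TID parameters as const ItemPointerData * rather than const
ItemPointer. A minimal sketch of the distinction, using the standard typedefs
from itemptr.h, paraphrased; f and g are illustrative declarations only:)

    typedef struct ItemPointerData
    {
        BlockIdData     ip_blkid;
        OffsetNumber    ip_posid;
    } ItemPointerData;

    typedef ItemPointerData *ItemPointer;

    /*
     * 'const' binds to the pointer hidden inside the typedef, so this is an
     * immutable pointer to mutable data:
     */
    extern void f(const ItemPointer tid);       /* ItemPointerData *const */

    /* ...whereas this is a pointer to immutable data, the intended reading: */
    extern void g(const ItemPointerData *tid);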
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 9ec8cda1c6801..469397e734439 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -63,7 +63,7 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, bool all_visible_cleared, bool new_all_visible_cleared); #ifdef USE_ASSERT_CHECKING static void check_lock_if_inplace_updateable_rel(Relation relation, - ItemPointer otid, + const ItemPointerData *otid, HeapTuple newtup); static void check_inplace_rel_lock(HeapTuple oldtup); #endif @@ -72,7 +72,7 @@ static Bitmapset *HeapDetermineColumnsInfo(Relation relation, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external); -static bool heap_acquire_tuplock(Relation relation, ItemPointer tid, +static bool heap_acquire_tuplock(Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock); static inline BlockNumber heapgettup_advance_block(HeapScanDesc scan, @@ -85,8 +85,11 @@ static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2); -static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, - ItemPointer ctid, TransactionId xid, +static TM_Result heap_lock_updated_tuple(Relation rel, + uint16 prior_infomask, + TransactionId prior_raw_xmax, + const ItemPointerData *prior_ctid, + TransactionId xid, LockTupleMode mode); static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2); @@ -95,7 +98,7 @@ static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member); static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, - Relation rel, ItemPointer ctid, XLTW_Oper oper, + Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining); static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining, @@ -213,6 +216,27 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = #define TUPLOCK_from_mxstatus(status) \ (MultiXactStatusLock[(status)]) +/* + * Check that we have a valid snapshot if we might need TOAST access.
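+ *
+ * Fetching or storing externally-toasted values scans the TOAST table,
+ * which requires a registered or active snapshot; asserting that here
+ * catches callers that would otherwise only fail once a tuple actually
+ * needed TOAST access.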
+ */ +static inline void +AssertHasSnapshotForToast(Relation rel) +{ +#ifdef USE_ASSERT_CHECKING + + /* bootstrap mode in particular breaks this rule */ + if (!IsNormalProcessingMode()) + return; + + /* if the relation doesn't have a TOAST table, we are good */ + if (!OidIsValid(rel->rd_rel->reltoastrelid)) + return; + + Assert(HaveRegisteredOrActiveSnapshot()); + +#endif /* USE_ASSERT_CHECKING */ +} + /* ---------------------------------------------------------------- * heap support routines * ---------------------------------------------------------------- @@ -237,7 +261,9 @@ heap_scan_stream_read_next_parallel(ReadStream *stream, /* parallel scan */ table_block_parallelscan_startblock_init(scan->rs_base.rs_rd, scan->rs_parallelworkerdata, - (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel); + (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel, + scan->rs_startblock, + scan->rs_numblocks); /* may return InvalidBlockNumber if there are no more blocks */ scan->rs_prefetch_block = table_block_parallelscan_nextpage(scan->rs_base.rs_rd, @@ -392,28 +418,41 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) scan->rs_base.rs_flags |= SO_ALLOW_SYNC; else scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; - } - else if (keep_startblock) - { + /* - * When rescanning, we want to keep the previous startblock setting, - * so that rewinding a cursor doesn't generate surprising results. - * Reset the active syncscan setting, though. + * If not rescanning, initialize the startblock. Finding the actual + * start location is done in table_block_parallelscan_startblock_init, + * based on whether an alternative start location has been set with + * heap_setscanlimits, or using the syncscan location, when syncscan + * is enabled. */ - if (allow_sync && synchronize_seqscans) - scan->rs_base.rs_flags |= SO_ALLOW_SYNC; - else - scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; - } - else if (allow_sync && synchronize_seqscans) - { - scan->rs_base.rs_flags |= SO_ALLOW_SYNC; - scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks); + if (!keep_startblock) + scan->rs_startblock = InvalidBlockNumber; } else { - scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; - scan->rs_startblock = 0; + if (keep_startblock) + { + /* + * When rescanning, we want to keep the previous startblock + * setting, so that rewinding a cursor doesn't generate surprising + * results. Reset the active syncscan setting, though. 
+ */ + if (allow_sync && synchronize_seqscans) + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + } + else if (allow_sync && synchronize_seqscans) + { + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks); + } + else + { + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + scan->rs_startblock = 0; + } } scan->rs_numblocks = InvalidBlockNumber; @@ -1097,7 +1136,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, */ if (flags & SO_TYPE_BITMAPSCAN) { - BitmapHeapScanDesc bscan = palloc(sizeof(BitmapHeapScanDescData)); + BitmapHeapScanDesc bscan = palloc_object(BitmapHeapScanDescData); /* * Bitmap Heap scans do not have any fields that a normal Heap Scan @@ -1106,7 +1145,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan = (HeapScanDesc) bscan; } else - scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData)); + scan = (HeapScanDesc) palloc_object(HeapScanDescData); scan->rs_base.rs_rd = relation; scan->rs_base.rs_snapshot = snapshot; @@ -1122,6 +1161,17 @@ heap_beginscan(Relation relation, Snapshot snapshot, if (!(snapshot && IsMVCCSnapshot(snapshot))) scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE; + /* Check that a historic snapshot is not used for non-catalog tables */ + if (snapshot && + IsHistoricMVCCSnapshot(snapshot) && + !RelationIsAccessibleInLogicalDecoding(relation)) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot query non-catalog table \"%s\" during logical decoding", + RelationGetRelationName(relation)))); + } + /* * For seqscan and sample scans in a serializable transaction, acquire a * predicate lock on the entire relation. This is required not only to @@ -1154,7 +1204,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, * when doing a parallel scan. */ if (parallel_scan != NULL) - scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData)); + scan->rs_parallelworkerdata = palloc_object(ParallelBlockTableScanWorkerData); else scan->rs_parallelworkerdata = NULL; @@ -1163,7 +1213,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, * initscan() and we don't want to allocate memory again */ if (nkeys > 0) - scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); + scan->rs_base.rs_key = palloc_array(ScanKeyData, nkeys); else scan->rs_base.rs_key = NULL; @@ -1990,7 +2040,7 @@ GetBulkInsertState(void) { BulkInsertState bistate; - bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData)); + bistate = (BulkInsertState) palloc_object(BulkInsertStateData); bistate->strategy = GetAccessStrategy(BAS_BULKWRITE); bistate->current_buf = InvalidBuffer; bistate->next_free = InvalidBlockNumber; @@ -2066,6 +2116,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, Assert(HeapTupleHeaderGetNatts(tup->t_data) <= RelationGetNumberOfAttributes(relation)); + AssertHasSnapshotForToast(relation); + /* * Fill in tuple header fields and toast the tuple if necessary. * @@ -2214,7 +2266,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, ReleaseBuffer(vmbuffer); /* - * If tuple is cachable, mark it for invalidation from the caches in case + * If tuple is cacheable, mark it for invalidation from the caches in case * we abort. Note it is OK to do this after releasing the buffer, because * the heaptup data structure is all in local memory, not in the shared * buffer. 
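(Flattened for readability, the startblock policy that the restructured
initscan() hunk above implements looks like this; a paraphrase for this review,
not the literal source:)

    if (scan->rs_base.rs_parallel != NULL)
    {
        /*
         * Parallel scan: InvalidBlockNumber defers the choice to
         * table_block_parallelscan_startblock_init().
         */
        if (!keep_startblock)
            scan->rs_startblock = InvalidBlockNumber;
    }
    else if (keep_startblock)
    {
        /* Rescan: keep the previous start so cursor rewind is stable. */
    }
    else if (allow_sync && synchronize_seqscans)
        scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd,
                                              scan->rs_nblocks);
    else
        scan->rs_startblock = 0;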
@@ -2343,6 +2395,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, /* currently not needed (thus unsupported) for heap_multi_insert() */ Assert(!(options & HEAP_INSERT_NO_LOGICAL)); + AssertHasSnapshotForToast(relation); + needwal = RelationNeedsWAL(relation); saveFreeSpace = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); @@ -2430,7 +2484,11 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0; if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN)) + { all_frozen_set = true; + /* Lock the vmbuffer before entering the critical section */ + LockBuffer(vmbuffer, BUFFER_LOCK_EXCLUSIVE); + } /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -2470,7 +2528,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, * going to add further frozen rows to it. * * If we're only adding already frozen rows to a previously empty - * page, mark it as all-visible. + * page, mark it as all-frozen and update the visibility map. We're + * already holding a pin on the vmbuffer. */ if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN)) { @@ -2481,7 +2540,14 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, vmbuffer, VISIBILITYMAP_VALID_BITS); } else if (all_frozen_set) + { PageSetAllVisible(page); + visibilitymap_set_vmbits(BufferGetBlockNumber(buffer), + vmbuffer, + VISIBILITYMAP_ALL_VISIBLE | + VISIBILITYMAP_ALL_FROZEN, + relation->rd_locator); + } /* * XXX Should we set PageSetPrunable on this page ? See heap_insert() @@ -2529,6 +2595,12 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, xlrec->flags = 0; if (all_visible_cleared) xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED; + + /* + * We don't have to worry about including a conflict xid in the + * WAL record, as HEAP_INSERT_FROZEN intentionally violates + * visibility rules. + */ if (all_frozen_set) xlrec->flags = XLH_INSERT_ALL_FROZEN_SET; @@ -2592,6 +2664,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, XLogBeginInsert(); XLogRegisterData(xlrec, tupledata - scratch.data); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); + if (all_frozen_set) + XLogRegisterBuffer(1, vmbuffer, 0); XLogRegisterBufData(0, tupledata, totaldatalen); @@ -2601,29 +2675,17 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, recptr = XLogInsert(RM_HEAP2_ID, info); PageSetLSN(page, recptr); + if (all_frozen_set) + { + Assert(BufferIsDirty(vmbuffer)); + PageSetLSN(BufferGetPage(vmbuffer), recptr); + } } END_CRIT_SECTION(); - /* - * If we've frozen everything on the page, update the visibilitymap. - * We're already holding pin on the vmbuffer. - */ if (all_frozen_set) - { - Assert(PageIsAllVisible(page)); - Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); - - /* - * It's fine to use InvalidTransactionId here - this is only used - * when HEAP_INSERT_FROZEN is specified, which intentionally - * violates visibility rules. 
- */ - visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer, - InvalidXLogRecPtr, vmbuffer, - InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); - } + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(buffer); ndone += nthispage; @@ -2656,7 +2718,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); /* - * If tuples are cachable, mark them for invalidation from the caches in + * If tuples are cacheable, mark them for invalidation from the caches in * case we abort. Note it is OK to do this after releasing the buffer, * because the heaptuples data structure is all in local memory, not in * the shared buffer. @@ -2742,7 +2804,7 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) * generated by another transaction). */ TM_Result -heap_delete(Relation relation, ItemPointer tid, +heap_delete(Relation relation, const ItemPointerData *tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart) { @@ -2765,6 +2827,8 @@ heap_delete(Relation relation, ItemPointer tid, Assert(ItemPointerIsValid(tid)); + AssertHasSnapshotForToast(relation); + /* * Forbid this during a parallel operation, lest it allocate a combo CID. * Other workers might need that combo CID for visibility checks, and we @@ -3163,7 +3227,7 @@ heap_delete(Relation relation, ItemPointer tid, * via ereport(). */ void -simple_heap_delete(Relation relation, ItemPointer tid) +simple_heap_delete(Relation relation, const ItemPointerData *tid) { TM_Result result; TM_FailureData tmfd; @@ -3209,7 +3273,7 @@ simple_heap_delete(Relation relation, ItemPointer tid) * generated by another transaction). */ TM_Result -heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, +heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes) @@ -3260,6 +3324,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, Assert(HeapTupleHeaderGetNatts(newtup->t_data) <= RelationGetNumberOfAttributes(relation)); + AssertHasSnapshotForToast(relation); + /* * Forbid this during a parallel operation, lest it allocate a combo CID. * Other workers might need that combo CID for visibility checks, and we @@ -4190,7 +4256,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, */ static void check_lock_if_inplace_updateable_rel(Relation relation, - ItemPointer otid, + const ItemPointerData *otid, HeapTuple newtup) { /* LOCKTAG_TUPLE acceptable for any catalog */ @@ -4451,7 +4517,7 @@ HeapDetermineColumnsInfo(Relation relation, * via ereport(). */ void -simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup, +simple_heap_update(Relation relation, const ItemPointerData *otid, HeapTuple tup, TU_UpdateIndexes *update_indexes) { TM_Result result; @@ -4515,7 +4581,6 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) * * Input parameters: * relation: relation containing tuple (caller must hold suitable lock) - * tid: TID of tuple to lock * cid: current command ID (used for visibility test, and stored into * tuple's cmax if lock is successful) * mode: indicates if shared or exclusive tuple lock is desired @@ -4754,11 +4819,13 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * If there are updates, follow the update chain; bail out if * that cannot be done. 
*/ - if (follow_updates && updated) + if (follow_updates && updated && + !ItemPointerEquals(&tuple->t_self, &t_ctid)) { TM_Result res; - res = heap_lock_updated_tuple(relation, tuple, &t_ctid, + res = heap_lock_updated_tuple(relation, + infomask, xwait, &t_ctid, GetCurrentTransactionId(), mode); if (res != TM_Ok) @@ -4953,7 +5020,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, case LockWaitError: if (!ConditionalMultiXactIdWait((MultiXactId) xwait, status, infomask, relation, - NULL, log_lock_failure)) + NULL, log_lock_failures)) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", @@ -4991,7 +5058,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, } break; case LockWaitError: - if (!ConditionalXactLockTableWait(xwait, log_lock_failure)) + if (!ConditionalXactLockTableWait(xwait, log_lock_failures)) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", @@ -5001,11 +5068,13 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, } /* if there are updates, follow the update chain */ - if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) + if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask) && + !ItemPointerEquals(&tuple->t_self, &t_ctid)) { TM_Result res; - res = heap_lock_updated_tuple(relation, tuple, &t_ctid, + res = heap_lock_updated_tuple(relation, + infomask, xwait, &t_ctid, GetCurrentTransactionId(), mode); if (res != TM_Ok) @@ -5238,7 +5307,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * wait_policy is Skip. */ static bool -heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, +heap_acquire_tuplock(Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock) { if (*have_tuple_lock) @@ -5256,7 +5325,7 @@ heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, break; case LockWaitError: - if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failure)) + if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failures)) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", @@ -5659,7 +5728,8 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, * version as well. */ static TM_Result -heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, +heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax, + const ItemPointerData *tid, TransactionId xid, LockTupleMode mode) { TM_Result result; @@ -5672,7 +5742,6 @@ heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, old_infomask2; TransactionId xmax, new_xmax; - TransactionId priorXmax = InvalidTransactionId; bool cleared_all_frozen = false; bool pinned_desired_page; Buffer vmbuffer = InvalidBuffer; @@ -5986,7 +6055,10 @@ heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, * Follow update chain when locking an updated tuple, acquiring locks (row * marks) on the updated versions. * - * The initial tuple is assumed to be already locked. + * 'prior_infomask', 'prior_raw_xmax' and 'prior_ctid' are the corresponding + * fields from the initial tuple. We will lock the tuples starting from the + * one that 'prior_ctid' points to. Note: This function does not lock the + * initial tuple itself. * * This function doesn't check visibility, it just unconditionally marks the * tuple(s) as locked. 
If any tuple in the updated chain is being deleted @@ -6004,16 +6076,22 @@ heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, * levels, because that would lead to a serializability failure. */ static TM_Result -heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, +heap_lock_updated_tuple(Relation rel, + uint16 prior_infomask, + TransactionId prior_raw_xmax, + const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode) { + INJECTION_POINT("heap_lock_updated_tuple", NULL); + /* - * If the tuple has not been updated, or has moved into another partition - * (effectively a delete) stop here. + * If the tuple has moved into another partition (effectively a delete) + * stop here. */ - if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) && - !ItemPointerEquals(&tuple->t_self, ctid)) + if (!ItemPointerIndicatesMovedPartitions(prior_ctid)) { + TransactionId prior_xmax; + /* * If this is the first possibly-multixact-able operation in the * current transaction, set my per-backend OldestMemberMXactId @@ -6025,7 +6103,9 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, */ MultiXactIdSetOldestMember(); - return heap_lock_updated_tuple_rec(rel, ctid, xid, mode); + prior_xmax = (prior_infomask & HEAP_XMAX_IS_MULTI) ? + MultiXactIdGetUpdateXid(prior_raw_xmax, prior_infomask) : prior_raw_xmax; + return heap_lock_updated_tuple_rec(rel, prior_xmax, prior_ctid, xid, mode); } /* nothing to lock */ @@ -6049,23 +6129,23 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, * An explicit confirmation WAL record also makes logical decoding simpler. */ void -heap_finish_speculative(Relation relation, ItemPointer tid) +heap_finish_speculative(Relation relation, const ItemPointerData *tid) { Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); offnum = ItemPointerGetOffsetNumber(tid); - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(ERROR, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(ERROR, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -6136,7 +6216,7 @@ heap_finish_speculative(Relation relation, ItemPointer tid) * confirmation records. */ void -heap_abort_speculative(Relation relation, ItemPointer tid) +heap_abort_speculative(Relation relation, const ItemPointerData *tid) { TransactionId xid = GetCurrentTransactionId(); ItemId lp; @@ -6287,10 +6367,13 @@ heap_abort_speculative(Relation relation, ItemPointer tid) * Since this is intended for system catalogs and SERIALIZABLE doesn't cover * DDL, this doesn't guarantee any particular predicate locking. * - * One could modify this to return true for tuples with delete in progress, - * All inplace updaters take a lock that conflicts with DROP. If explicit - * "DELETE FROM pg_class" is in progress, we'll wait for it like we would an - * update. + * heap_delete() is a rarer source of blocking transactions (xwait). We'll + * wait for such a transaction just like for the normal heap_update() case. 
+ * Normal concurrent DROP commands won't cause that, because all inplace + * updaters take some lock that conflicts with DROP. An explicit SQL "DELETE + * FROM pg_class" can cause it. By waiting, if the concurrent transaction + * executed both "DELETE FROM pg_class" and "INSERT INTO pg_class", our caller + * can find the successor tuple. * * Readers of inplace-updated fields expect changes to those fields are * durable. For example, vac_truncate_clog() reads datfrozenxid from @@ -6331,15 +6414,17 @@ heap_inplace_lock(Relation relation, Assert(BufferIsValid(buffer)); /* - * Construct shared cache inval if necessary. Because we pass a tuple - * version without our own inplace changes or inplace changes other - * sessions complete while we wait for locks, inplace update mustn't - * change catcache lookup keys. But we aren't bothering with index - * updates either, so that's true a fortiori. After LockBuffer(), it - * would be too late, because this might reach a - * CatalogCacheInitializeCache() that locks "buffer". + * Register shared cache invals if necessary. Other sessions may finish + * inplace updates of this tuple between this step and LockTuple(). Since + * inplace updates don't change cache keys, that's harmless. + * + * While it's tempting to register invals only after confirming we can + * return true, the following obstacle precludes reordering steps that + * way. Registering invals might reach a CatalogCacheInitializeCache() + * that locks "buffer". That would hang indefinitely if running after our + * own LockBuffer(). Hence, we must register invals before LockBuffer(). */ - CacheInvalidateHeapTupleInplace(relation, oldtup_ptr, NULL); + CacheInvalidateHeapTupleInplace(relation, oldtup_ptr); LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); @@ -6577,10 +6662,6 @@ heap_inplace_update_and_unlock(Relation relation, /* * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we * do this before UnlockTuple(). - * - * If we're mutating a tuple visible only to this transaction, there's an - * equivalent transactional inval from the action that created the tuple, - * and this inval is superfluous. */ AtInplace_Inval(); @@ -6591,10 +6672,10 @@ heap_inplace_update_and_unlock(Relation relation, AcceptInvalidationMessages(); /* local processing of just-sent inval */ /* - * Queue a transactional inval. The immediate invalidation we just sent - * is the only one known to be necessary. To reduce risk from the - * transition to immediate invalidation, continue sending a transactional - * invalidation like we've long done. Third-party code might rely on it. + * Queue a transactional inval, for logical decoding and for third-party + * code that might have been relying on it since long before inplace + * update adopted immediate invalidation. See README.tuplock section + * "Reading inplace-updated columns" for logical decoding details. */ if (!IsBootstrapProcessingMode()) CacheInvalidateHeapTuple(relation, tuple, NULL); @@ -6833,7 +6914,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, * even member XIDs >= OldestXmin often won't be kept by second pass. 
*/ nnewmembers = 0; - newmembers = palloc(sizeof(MultiXactMember) * nmembers); + newmembers = palloc_array(MultiXactMember, nmembers); has_lockers = false; update_xid = InvalidTransactionId; update_committed = false; @@ -7658,7 +7739,7 @@ DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, static bool Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, - Relation rel, ItemPointer ctid, XLTW_Oper oper, + Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining, bool logLockFailure) { bool result = true; @@ -7735,7 +7816,7 @@ Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, */ static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, - Relation rel, ItemPointer ctid, XLTW_Oper oper, + Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining) { (void) Do_MultiXactIdWait(multi, status, infomask, false, @@ -8021,7 +8102,7 @@ index_delete_prefetch_buffer(Relation rel, static inline void index_delete_check_htid(TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, - ItemPointer htid, TM_IndexStatus *istatus) + const ItemPointerData *htid, TM_IndexStatus *istatus) { OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid); ItemId iid; @@ -8649,7 +8730,7 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) Assert(delstate->ndeltids > 0); /* Calculate per-heap-block count of TIDs */ - blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids); + blockgroups = palloc_array(IndexDeleteCounts, delstate->ndeltids); for (int i = 0; i < delstate->ndeltids; i++) { TM_IndexDelete *ideltid = &delstate->deltids[i]; @@ -8844,8 +8925,8 @@ log_heap_update(Relation reln, Buffer oldbuf, * * Skip this if we're taking a full-page image of the new page, as we * don't include the new tuple in the WAL record in that case. Also - * disable if wal_level='logical', as logical decoding needs to be able to - * read the new tuple in whole from the WAL record alone. + * disable if effective_wal_level='logical', as logical decoding needs to + * be able to read the new tuple in whole from the WAL record alone. */ if (oldbuf == newbuf && !need_tuple_data && !XLogCheckBufferNeedsBackup(newbuf)) @@ -9017,8 +9098,8 @@ log_heap_update(Relation reln, Buffer oldbuf, /* * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record * - * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog - * tuples. + * This is only used when effective_wal_level is logical, and only for + * catalog tuples. 
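+ *
+ * Logical decoding uses these records to recover the cmin/cmax of catalog
+ * tuples modified by the transaction being decoded, information it needs
+ * to build historic snapshots and that is not otherwise recoverable from
+ * WAL.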
*/ static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ac082fefa77a7..dd4fe6bf62ff4 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -81,7 +81,7 @@ heapam_slot_callbacks(Relation relation) static IndexFetchTableData * heapam_index_fetch_begin(Relation rel) { - IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); + IndexFetchHeapData *hscan = palloc0_object(IndexFetchHeapData); hscan->xs_base.rel = rel; hscan->xs_cbuf = InvalidBuffer; @@ -464,7 +464,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, return TM_WouldBlock; break; case LockWaitError: - if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failure)) + if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failures)) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", @@ -717,8 +717,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, /* Preallocate values/isnull arrays */ natts = newTupDesc->natts; - values = (Datum *) palloc(natts * sizeof(Datum)); - isnull = (bool *) palloc(natts * sizeof(bool)); + values = palloc_array(Datum, natts); + isnull = palloc_array(bool, natts); /* Initialize the rewrite operation */ rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff, @@ -2280,7 +2280,7 @@ heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, if (!pagemode) LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - page = (Page) BufferGetPage(hscan->rs_cbuf); + page = BufferGetPage(hscan->rs_cbuf); all_visible = PageIsAllVisible(page) && !scan->rs_snapshot->takenDuringRecovery; maxoffset = PageGetMaxOffsetNumber(page); diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 05f6946fe60d2..bf899c2d2c698 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -144,6 +144,55 @@ HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, SetHintBits(tuple, buffer, infomask, xid); } +/* + * If HEAP_MOVED_OFF or HEAP_MOVED_IN are set on the tuple, remove them and + * adjust hint bits. See the comment for SetHintBits() for more background. + * + * This helper returns false if the row ought to be invisible, true otherwise. 
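+ *
+ * Since only pre-9.0 VACUUM FULL ever set these bits, the xvac transaction
+ * can no longer be current or in progress on a server running this code,
+ * which is why those states are reported as errors below.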
+ */ +static inline bool +HeapTupleCleanMoved(HeapTupleHeader tuple, Buffer buffer) +{ + TransactionId xvac; + + /* only used by pre-9.0 binary upgrades */ + if (likely(!(tuple->t_infomask & (HEAP_MOVED_OFF | HEAP_MOVED_IN)))) + return true; + + xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + elog(ERROR, "encountered tuple with HEAP_MOVED considered current"); + + if (TransactionIdIsInProgress(xvac)) + elog(ERROR, "encountered tuple with HEAP_MOVED considered in-progress"); + + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + + return true; +} /* * HeapTupleSatisfiesSelf @@ -179,45 +228,8 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) if (HeapTupleHeaderXminInvalid(tuple)) return false; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } + if (!HeapTupleCleanMoved(tuple, buffer)) + return false; else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ @@ -372,45 +384,8 @@ HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, if (HeapTupleHeaderXminInvalid(tuple)) return false; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } + if (!HeapTupleCleanMoved(tuple, buffer)) + return false; /* * An invalid Xmin can 
be left behind by a speculative insertion that @@ -468,45 +443,8 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, if (HeapTupleHeaderXminInvalid(tuple)) return TM_Invisible; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return TM_Invisible; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return TM_Invisible; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return TM_Invisible; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return TM_Invisible; - } - } - } + else if (!HeapTupleCleanMoved(tuple, buffer)) + return TM_Invisible; else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (HeapTupleHeaderGetCmin(tuple) >= curcid) @@ -756,45 +694,8 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, if (HeapTupleHeaderXminInvalid(tuple)) return false; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } + if (!HeapTupleCleanMoved(tuple, buffer)) + return false; else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ @@ -979,45 +880,8 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, if (HeapTupleHeaderXminInvalid(tuple)) return false; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!XidInMVCCSnapshot(xvac, snapshot)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (XidInMVCCSnapshot(xvac, snapshot)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, 
buffer, HEAP_XMIN_COMMITTED,
-								InvalidTransactionId);
-				else
-				{
-					SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
-								InvalidTransactionId);
-					return false;
-				}
-			}
-		}
+		if (!HeapTupleCleanMoved(tuple, buffer))
+			return false;
 		else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple)))
 		{
 			if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid)
@@ -1222,43 +1086,8 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de
 {
 	if (HeapTupleHeaderXminInvalid(tuple))
 		return HEAPTUPLE_DEAD;
-	/* Used by pre-9.0 binary upgrades */
-	else if (tuple->t_infomask & HEAP_MOVED_OFF)
-	{
-		TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
-
-		if (TransactionIdIsCurrentTransactionId(xvac))
-			return HEAPTUPLE_DELETE_IN_PROGRESS;
-		if (TransactionIdIsInProgress(xvac))
-			return HEAPTUPLE_DELETE_IN_PROGRESS;
-		if (TransactionIdDidCommit(xvac))
-		{
-			SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
-						InvalidTransactionId);
-			return HEAPTUPLE_DEAD;
-		}
-		SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
-					InvalidTransactionId);
-	}
-	/* Used by pre-9.0 binary upgrades */
-	else if (tuple->t_infomask & HEAP_MOVED_IN)
-	{
-		TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
-
-		if (TransactionIdIsCurrentTransactionId(xvac))
-			return HEAPTUPLE_INSERT_IN_PROGRESS;
-		if (TransactionIdIsInProgress(xvac))
-			return HEAPTUPLE_INSERT_IN_PROGRESS;
-		if (TransactionIdDidCommit(xvac))
-			SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
-						InvalidTransactionId);
-		else
-		{
-			SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
-						InvalidTransactionId);
-			return HEAPTUPLE_DEAD;
-		}
-	}
+	else if (!HeapTupleCleanMoved(tuple, buffer))
+		return HEAPTUPLE_DEAD;
 	else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple)))
 	{
 		if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
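Before this refactoring, each of the six visibility routines in heapam_visibility.c open-coded roughly forty lines of the xvac handling removed above; afterwards every caller reduces to a single guard. A minimal sketch of the shared caller pattern, for orientation only (HeapTupleSatisfiesExample is an invented name, not part of the patch):

```c
/*
 * Illustrative sketch (not patch code): the common shape each visibility
 * routine now takes.  HeapTupleCleanMoved() returns false only when the
 * xvac outcome proves the tuple dead, after setting the appropriate hint
 * bits; the routine-specific xmin/xmax logic follows.
 */
static bool
HeapTupleSatisfiesExample(HeapTupleHeader tuple, Buffer buffer)
{
	if (HeapTupleHeaderXminInvalid(tuple))
		return false;

	/* consolidated pre-9.0 HEAP_MOVED handling */
	if (!HeapTupleCleanMoved(tuple, buffer))
		return false;

	/* ... routine-specific visibility checks on xmin/xmax go here ... */
	return true;
}
```

Note the one intentional asymmetry visible in the hunks: HeapTupleSatisfiesUpdate() keeps the guard in its else-if chain and returns TM_Invisible rather than false, while HeapTupleSatisfiesVacuumHorizon() maps a moved-dead tuple to HEAPTUPLE_DEAD.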
diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c
index 30f4c2d3c6719..1823feff298f8 100644
--- a/src/backend/access/heap/heapam_xlog.c
+++ b/src/backend/access/heap/heapam_xlog.c
@@ -35,7 +35,9 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 	Buffer		buffer;
 	RelFileLocator rlocator;
 	BlockNumber blkno;
-	XLogRedoAction action;
+	Buffer		vmbuffer = InvalidBuffer;
+	uint8		vmflags = 0;
+	Size		freespace = 0;
 
 	XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
 	memcpy(&xlrec, maindataptr, SizeOfHeapPrune);
@@ -50,11 +52,22 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 	Assert((xlrec.flags & XLHP_CLEANUP_LOCK) != 0 ||
 		   (xlrec.flags & (XLHP_HAS_REDIRECTIONS | XLHP_HAS_DEAD_ITEMS)) == 0);
 
+	if (xlrec.flags & XLHP_VM_ALL_VISIBLE)
+	{
+		vmflags = VISIBILITYMAP_ALL_VISIBLE;
+		if (xlrec.flags & XLHP_VM_ALL_FROZEN)
+			vmflags |= VISIBILITYMAP_ALL_FROZEN;
+	}
+
 	/*
-	 * We are about to remove and/or freeze tuples.  In Hot Standby mode,
-	 * ensure that there are no queries running for which the removed tuples
-	 * are still visible or which still consider the frozen xids as running.
-	 * The conflict horizon XID comes after xl_heap_prune.
+	 * After xl_heap_prune is the optional snapshot conflict horizon.
+	 *
+	 * In Hot Standby mode, we must ensure that there are no running queries
+	 * which would conflict with the changes in this record. That means we
+	 * can't replay this record if it removes tuples that are still visible to
+	 * transactions on the standby, freeze tuples with xids that are still
+	 * considered running on the standby, or set a page as all-visible in the
+	 * VM if it isn't all-visible to all transactions on the standby.
 	 */
 	if ((xlrec.flags & XLHP_HAS_CONFLICT_HORIZON) != 0)
 	{
@@ -71,14 +84,14 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 	}
 
 	/*
-	 * If we have a full-page image, restore it and we're done.
+	 * If we have a full-page image of the heap block, restore it and we're
+	 * done with the heap block.
 	 */
-	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL,
-										   (xlrec.flags & XLHP_CLEANUP_LOCK) != 0,
-										   &buffer);
-	if (action == BLK_NEEDS_REDO)
+	if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL,
+									  (xlrec.flags & XLHP_CLEANUP_LOCK) != 0,
+									  &buffer) == BLK_NEEDS_REDO)
 	{
-		Page		page = (Page) BufferGetPage(buffer);
+		Page		page = BufferGetPage(buffer);
 		OffsetNumber *redirected;
 		OffsetNumber *nowdead;
 		OffsetNumber *nowunused;
@@ -90,6 +103,7 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 		xlhp_freeze_plan *plans;
 		OffsetNumber *frz_offsets;
 		char	   *dataptr = XLogRecGetBlockData(record, 0, &datalen);
+		bool		do_prune;
 
 		heap_xlog_deserialize_prune_and_freeze(dataptr, xlrec.flags,
 											   &nplans, &plans, &frz_offsets,
@@ -97,11 +111,16 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 											   &ndead, &nowdead,
 											   &nunused, &nowunused);
 
+		do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
+
+		/* Ensure the record does something */
+		Assert(do_prune || nplans > 0 || vmflags & VISIBILITYMAP_VALID_BITS);
+
 		/*
 		 * Update all line pointers per the record, and repair fragmentation
 		 * if needed.
 		 */
-		if (nredirected > 0 || ndead > 0 || nunused > 0)
+		if (do_prune)
 			heap_page_prune_execute(buffer,
 									(xlrec.flags & XLHP_CLEANUP_LOCK) == 0,
 									redirected, nredirected,
@@ -139,35 +158,95 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 		Assert((char *) frz_offsets == dataptr + datalen);
 
 		/*
-		 * Note: we don't worry about updating the page's prunability hints.
-		 * At worst this will cause an extra prune cycle to occur soon.
+		 * The critical integrity requirement here is that we must never end
+		 * up with the visibility map bit set and the page-level
+		 * PD_ALL_VISIBLE bit unset. If that were to occur, a subsequent page
+		 * modification would fail to clear the visibility map bit.
 		 */
+		if (vmflags & VISIBILITYMAP_VALID_BITS)
+			PageSetAllVisible(page);
 
-		PageSetLSN(page, lsn);
 		MarkBufferDirty(buffer);
+
+		/*
+		 * See log_heap_prune_and_freeze() for commentary on when we set the
+		 * heap page LSN.
+		 */
+		if (do_prune || nplans > 0 ||
+			((vmflags & VISIBILITYMAP_VALID_BITS) && XLogHintBitIsNeeded()))
+			PageSetLSN(page, lsn);
+
+		/*
+		 * Note: we don't worry about updating the page's prunability hints.
+		 * At worst this will cause an extra prune cycle to occur soon.
+		 */
 	}
 
 	/*
-	 * If we released any space or line pointers, update the free space map.
+	 * If we 1) released any space or line pointers or 2) set PD_ALL_VISIBLE
+	 * or the VM, update the free space map.
+	 *
+	 * Even when no actual space is freed (when only marking the page
+	 * all-visible or frozen), we still update the FSM. Because the FSM is
+	 * unlogged and maintained heuristically, it often becomes stale on
+	 * standbys. If such a standby is later promoted and runs VACUUM, it will
+	 * skip recalculating free space for pages that were marked
+	 * all-visible/all-frozen. FreeSpaceMapVacuum() can then propagate overly
+	 * optimistic free space values upward, causing future insertions to
+	 * select pages that turn out to be unusable. In bulk, this can lead to
+	 * long stalls.
+	 *
+	 * To prevent this, always update the FSM even when only marking a page
+	 * all-visible/all-frozen.
* - * Do this regardless of a full-page image being applied, since the FSM - * data is not in the page anyway. + * Do this regardless of whether a full-page image is logged, since FSM + * data is not part of the page itself. */ if (BufferIsValid(buffer)) { - if (xlrec.flags & (XLHP_HAS_REDIRECTIONS | - XLHP_HAS_DEAD_ITEMS | - XLHP_HAS_NOW_UNUSED_ITEMS)) - { - Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); + if ((xlrec.flags & (XLHP_HAS_REDIRECTIONS | + XLHP_HAS_DEAD_ITEMS | + XLHP_HAS_NOW_UNUSED_ITEMS)) || + (vmflags & VISIBILITYMAP_VALID_BITS)) + freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); - UnlockReleaseBuffer(buffer); + /* + * We want to avoid holding an exclusive lock on the heap buffer while + * doing IO (either of the FSM or the VM), so we'll release it now. + */ + UnlockReleaseBuffer(buffer); + } - XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); - } - else - UnlockReleaseBuffer(buffer); + /* + * Now read and update the VM block. + * + * We must redo changes to the VM even if the heap page was skipped due to + * LSN interlock. See comment in heap_xlog_multi_insert() for more details + * on replaying changes to the VM. + */ + if ((vmflags & VISIBILITYMAP_VALID_BITS) && + XLogReadBufferForRedoExtended(record, 1, + RBM_ZERO_ON_ERROR, + false, + &vmbuffer) == BLK_NEEDS_REDO) + { + Page vmpage = BufferGetPage(vmbuffer); + + /* initialize the page if it was read as zeros */ + if (PageIsNew(vmpage)) + PageInit(vmpage, BLCKSZ, 0); + + visibilitymap_set_vmbits(blkno, vmbuffer, vmflags, rlocator); + + Assert(BufferIsDirty(vmbuffer)); + PageSetLSN(vmpage, lsn); } + + if (BufferIsValid(vmbuffer)) + UnlockReleaseBuffer(vmbuffer); + + if (freespace > 0) + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); } /* @@ -295,7 +374,6 @@ heap_xlog_visible(XLogReaderState *record) LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); reln = CreateFakeRelcacheEntry(rlocator); - visibilitymap_pin(reln, blkno, &vmbuffer); visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, xlrec->snapshotConflictHorizon, vmbits); @@ -344,7 +422,7 @@ heap_xlog_delete(XLogReaderState *record) xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record); Buffer buffer; Page page; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; BlockNumber blkno; RelFileLocator target_locator; @@ -373,10 +451,10 @@ heap_xlog_delete(XLogReaderState *record) { page = BufferGetPage(buffer); - if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) - lp = PageGetItemId(page, xlrec->offnum); - - if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp)) + if (xlrec->offnum < 1 || xlrec->offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, xlrec->offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -438,6 +516,9 @@ heap_xlog_insert(XLogReaderState *record) ItemPointerSetBlockNumber(&target_tid, blkno); ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + /* No freezing in the heap_insert() code path */ + Assert(!(xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)); + /* * The visibility map may need to be fixed even if the heap page is * already up-to-date. 
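Two redo routines in this file now share the same visibility-map replay idiom: read the VM block with RBM_ZERO_ON_ERROR, initialize it if it reads back as all zeros, set the bits, and stamp the VM page's LSN. It appears in heap_xlog_prune_freeze() above and in heap_xlog_multi_insert() below. A condensed sketch of the idiom, assuming the visibilitymap_set_vmbits() signature used in those hunks and the usual heapam_xlog.c includes (the helper name redo_vm_bits is invented for illustration):

```c
/*
 * Hypothetical condensation of the VM redo steps used by this patch.
 * Names other than the patch's own are invented for illustration.
 */
static void
redo_vm_bits(XLogReaderState *record, uint8 block_id, BlockNumber heapblk,
			 uint8 vmflags, RelFileLocator rlocator, XLogRecPtr lsn)
{
	Buffer		vmbuffer = InvalidBuffer;

	/* Replay even if the heap page itself was skipped by the LSN interlock */
	if (XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_ON_ERROR,
									  false, &vmbuffer) == BLK_NEEDS_REDO)
	{
		Page		vmpage = BufferGetPage(vmbuffer);

		/* a VM page read back as zeros has never been initialized */
		if (PageIsNew(vmpage))
			PageInit(vmpage, BLCKSZ, 0);

		visibilitymap_set_vmbits(heapblk, vmbuffer, vmflags, rlocator);

		Assert(BufferIsDirty(vmbuffer));
		PageSetLSN(vmpage, lsn);
	}

	if (BufferIsValid(vmbuffer))
		UnlockReleaseBuffer(vmbuffer);
}
```

The ordering matters: the heap page is replayed and released first, preserving the invariant that PD_ALL_VISIBLE is set whenever the VM bit is, and the VM update proceeds without the heap page lock, which is safe in recovery because there are no concurrent writers.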
@@ -497,8 +578,7 @@ heap_xlog_insert(XLogReaderState *record) HeapTupleHeaderSetCmin(htup, FirstCommandId); htup->t_ctid = target_tid; - if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, - true, true) == InvalidOffsetNumber) + if (PageAddItem(page, htup, newlen, xlrec->offnum, true, true) == InvalidOffsetNumber) elog(PANIC, "failed to add tuple"); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ @@ -508,10 +588,6 @@ heap_xlog_insert(XLogReaderState *record) if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); - /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ - if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) - PageSetAllVisible(page); - MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) @@ -553,6 +629,7 @@ heap_xlog_multi_insert(XLogReaderState *record) int i; bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; XLogRedoAction action; + Buffer vmbuffer = InvalidBuffer; /* * Insertion doesn't overwrite MVCC data, so no conflict processing is @@ -573,11 +650,11 @@ heap_xlog_multi_insert(XLogReaderState *record) if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(rlocator); - Buffer vmbuffer = InvalidBuffer; visibilitymap_pin(reln, blkno, &vmbuffer); visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; FreeFakeRelcacheEntry(reln); } @@ -600,7 +677,7 @@ heap_xlog_multi_insert(XLogReaderState *record) tupdata = XLogRecGetBlockData(record, 0, &len); endptr = tupdata + len; - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); for (i = 0; i < xlrec->ntuples; i++) { @@ -641,7 +718,7 @@ heap_xlog_multi_insert(XLogReaderState *record) ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + offnum = PageAddItem(page, htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple"); } @@ -664,6 +741,53 @@ heap_xlog_multi_insert(XLogReaderState *record) if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); + buffer = InvalidBuffer; + + /* + * Read and update the visibility map (VM) block. + * + * We must always redo VM changes, even if the corresponding heap page + * update was skipped due to the LSN interlock. Each VM block covers + * multiple heap pages, so later WAL records may update other bits in the + * same block. If this record includes an FPI (full-page image), + * subsequent WAL records may depend on it to guard against torn pages. + * + * Heap page changes are replayed first to preserve the invariant: + * PD_ALL_VISIBLE must be set on the heap page if the VM bit is set. + * + * Note that we released the heap page lock above. During normal + * operation, this would be unsafe — a concurrent modification could + * clear PD_ALL_VISIBLE while the VM bit remained set, violating the + * invariant. + * + * During recovery, however, no concurrent writers exist. Therefore, + * updating the VM without holding the heap page lock is safe enough. This + * same approach is taken when replaying xl_heap_visible records (see + * heap_xlog_visible()). 
+ */ + if ((xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) && + XLogReadBufferForRedoExtended(record, 1, RBM_ZERO_ON_ERROR, false, + &vmbuffer) == BLK_NEEDS_REDO) + { + Page vmpage = BufferGetPage(vmbuffer); + + /* initialize the page if it was read as zeros */ + if (PageIsNew(vmpage)) + PageInit(vmpage, BLCKSZ, 0); + + visibilitymap_set_vmbits(blkno, + vmbuffer, + VISIBILITYMAP_ALL_VISIBLE | + VISIBILITYMAP_ALL_FROZEN, + rlocator); + + Assert(BufferIsDirty(vmbuffer)); + PageSetLSN(vmpage, lsn); + } + + if (BufferIsValid(vmbuffer)) + UnlockReleaseBuffer(vmbuffer); + /* * If the page is running low on free space, update the FSM as well. * Arbitrarily, our definition of "low" is less than 20%. We can't do much @@ -693,7 +817,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) nbuffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleData oldtup; HeapTupleHeader htup; uint16 prefixlen = 0, @@ -757,10 +881,10 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) { page = BufferGetPage(obuffer); offnum = xlrec->old_offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -802,7 +926,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) { nbuffer = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(nbuffer); + page = BufferGetPage(nbuffer); PageInit(page, BufferGetPageSize(nbuffer), 0); newaction = BLK_NEEDS_REDO; } @@ -915,7 +1039,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + offnum = PageAddItem(page, htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple"); @@ -963,7 +1087,7 @@ heap_xlog_confirm(XLogReaderState *record) Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) @@ -971,10 +1095,10 @@ heap_xlog_confirm(XLogReaderState *record) page = BufferGetPage(buffer); offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -1002,7 +1126,7 @@ heap_xlog_lock(XLogReaderState *record) Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; /* @@ -1028,13 +1152,13 @@ heap_xlog_lock(XLogReaderState *record) if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = 
PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -1076,7 +1200,7 @@ heap_xlog_lock_updated(XLogReaderState *record) Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; xlrec = (xl_heap_lock_updated *) XLogRecGetData(record); @@ -1107,10 +1231,10 @@ heap_xlog_lock_updated(XLogReaderState *record) page = BufferGetPage(buffer); offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -1139,7 +1263,7 @@ heap_xlog_inplace(XLogReaderState *record) Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; uint32 oldlen; Size newlen; @@ -1151,10 +1275,10 @@ heap_xlog_inplace(XLogReaderState *record) page = BufferGetPage(buffer); offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); diff --git a/src/backend/access/heap/heaptoast.c b/src/backend/access/heap/heaptoast.c index cb1e57030f64c..60e765fbfce14 100644 --- a/src/backend/access/heap/heaptoast.c +++ b/src/backend/access/heap/heaptoast.c @@ -561,15 +561,15 @@ toast_flatten_tuple_to_datum(HeapTupleHeader tup, */ HeapTuple toast_build_flattened_tuple(TupleDesc tupleDesc, - Datum *values, - bool *isnull) + const Datum *values, + const bool *isnull) { HeapTuple new_tuple; int numAttrs = tupleDesc->natts; int num_to_free; int i; Datum new_values[MaxTupleAttributeNumber]; - Pointer freeable_values[MaxTupleAttributeNumber]; + void *freeable_values[MaxTupleAttributeNumber]; /* * We can pass the caller's isnull array directly to heap_form_tuple, but @@ -593,7 +593,7 @@ toast_build_flattened_tuple(TupleDesc tupleDesc, { new_value = detoast_external_attr(new_value); new_values[i] = PointerGetDatum(new_value); - freeable_values[num_to_free++] = (Pointer) new_value; + freeable_values[num_to_free++] = new_value; } } } diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index c482c9d61b265..24c83e349c602 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -58,9 +58,7 @@ RelationPutHeapTuple(Relation relation, /* Add the tuple to the page */ pageHeader = BufferGetPage(buffer); - offnum = PageAddItem(pageHeader, (Item) tuple->t_data, - tuple->t_len, InvalidOffsetNumber, false, true); - + offnum = PageAddItem(pageHeader, tuple->t_data, tuple->t_len, InvalidOffsetNumber, false, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple to page"); diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index a8025889be088..07aa08cfe14d6 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -19,6 +19,7 @@ #include "access/htup_details.h" #include "access/multixact.h" #include "access/transam.h" +#include "access/visibilitymapdefs.h" #include 
"access/xlog.h" #include "access/xloginsert.h" #include "commands/vacuum.h" @@ -42,7 +43,7 @@ typedef struct /* whether or not dead items can be set LP_UNUSED during pruning */ bool mark_unused_now; /* whether to attempt freezing tuples */ - bool freeze; + bool attempt_freeze; struct VacuumCutoffs *cutoffs; /*------------------------------------------------------- @@ -128,6 +129,13 @@ typedef struct int lpdead_items; /* number of items in the array */ OffsetNumber *deadoffsets; /* points directly to presult->deadoffsets */ + /* + * The snapshot conflict horizon used when freezing tuples. The final + * snapshot conflict horizon for the record may be newer if pruning + * removes newer transaction IDs. + */ + TransactionId frz_conflict_horizon; + /* * all_visible and all_frozen indicate if the all-visible and all-frozen * bits in the visibility map can be set for this page after pruning. @@ -137,15 +145,11 @@ typedef struct * bits. It is only valid if we froze some tuples, and all_frozen is * true. * - * NOTE: all_visible and all_frozen don't include LP_DEAD items. That's - * convenient for heap_page_prune_and_freeze(), to use them to decide - * whether to freeze the page or not. The all_visible and all_frozen - * values returned to the caller are adjusted to include LP_DEAD items at - * the end. - * - * all_frozen should only be considered valid if all_visible is also set; - * we don't bother to clear the all_frozen flag every time we clear the - * all_visible flag. + * NOTE: all_visible and all_frozen initially don't include LP_DEAD items. + * That's convenient for heap_page_prune_and_freeze() to use them to + * decide whether to freeze the page or not. The all_visible and + * all_frozen values returned to the caller are adjusted to include + * LP_DEAD items after we determine whether to opportunistically freeze. */ bool all_visible; bool all_frozen; @@ -153,6 +157,14 @@ typedef struct } PruneState; /* Local functions */ +static void prune_freeze_setup(PruneFreezeParams *params, + TransactionId *new_relfrozen_xid, + MultiXactId *new_relmin_mxid, + PruneFreezeResult *presult, + PruneState *prstate); +static void prune_freeze_plan(Oid reloid, Buffer buffer, + PruneState *prstate, + OffsetNumber *off_loc); static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer); @@ -176,6 +188,10 @@ static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetN static void page_verify_redirects(Page page); +static bool heap_page_will_freeze(Relation relation, Buffer buffer, + bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune, + PruneState *prstate); + /* * Optionally prune and repair fragmentation in the specified page. @@ -256,12 +272,22 @@ heap_page_prune_opt(Relation relation, Buffer buffer) PruneFreezeResult presult; /* - * For now, pass mark_unused_now as false regardless of whether or - * not the relation has indexes, since we cannot safely determine - * that during on-access pruning with the current implementation. + * We don't pass the HEAP_PAGE_PRUNE_MARK_UNUSED_NOW option + * regardless of whether or not the relation has indexes, since we + * cannot safely determine that during on-access pruning with the + * current implementation. 
*/ - heap_page_prune_and_freeze(relation, buffer, vistest, 0, - NULL, &presult, PRUNE_ON_ACCESS, &dummy_off_loc, NULL, NULL); + PruneFreezeParams params = { + .relation = relation, + .buffer = buffer, + .reason = PRUNE_ON_ACCESS, + .options = 0, + .vistest = vistest, + .cutoffs = NULL, + }; + + heap_page_prune_and_freeze(¶ms, &presult, &dummy_off_loc, + NULL, NULL); /* * Report the number of tuples reclaimed to pgstats. This is @@ -293,87 +319,29 @@ heap_page_prune_opt(Relation relation, Buffer buffer) } } - /* - * Prune and repair fragmentation and potentially freeze tuples on the - * specified page. - * - * Caller must have pin and buffer cleanup lock on the page. Note that we - * don't update the FSM information for page on caller's behalf. Caller might - * also need to account for a reduction in the length of the line pointer - * array following array truncation by us. - * - * If the HEAP_PRUNE_FREEZE option is set, we will also freeze tuples if it's - * required in order to advance relfrozenxid / relminmxid, or if it's - * considered advantageous for overall system performance to do so now. The - * 'cutoffs', 'presult', 'new_relfrozen_xid' and 'new_relmin_mxid' arguments - * are required when freezing. When HEAP_PRUNE_FREEZE option is set, we also - * set presult->all_visible and presult->all_frozen on exit, to indicate if - * the VM bits can be set. They are always set to false when the - * HEAP_PRUNE_FREEZE option is not set, because at the moment only callers - * that also freeze need that information. + * Helper for heap_page_prune_and_freeze() to initialize the PruneState using + * the provided parameters. * - * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD - * (see heap_prune_satisfies_vacuum). - * - * options: - * MARK_UNUSED_NOW indicates that dead items can be set LP_UNUSED during - * pruning. - * - * FREEZE indicates that we will also freeze tuples, and will return - * 'all_visible', 'all_frozen' flags to the caller. - * - * cutoffs contains the freeze cutoffs, established by VACUUM at the beginning - * of vacuuming the relation. Required if HEAP_PRUNE_FREEZE option is set. - * cutoffs->OldestXmin is also used to determine if dead tuples are - * HEAPTUPLE_RECENTLY_DEAD or HEAPTUPLE_DEAD. - * - * presult contains output parameters needed by callers, such as the number of - * tuples removed and the offsets of dead items on the page after pruning. - * heap_page_prune_and_freeze() is responsible for initializing it. Required - * by all callers. - * - * reason indicates why the pruning is performed. It is included in the WAL - * record for debugging and analysis purposes, but otherwise has no effect. - * - * off_loc is the offset location required by the caller to use in error - * callback. - * - * new_relfrozen_xid and new_relmin_mxid must provided by the caller if the - * HEAP_PRUNE_FREEZE option is set. On entry, they contain the oldest XID and - * multi-XID seen on the relation so far. They will be updated with oldest - * values present on the page after pruning. After processing the whole - * relation, VACUUM can use these values as the new relfrozenxid/relminmxid - * for the relation. + * params, new_relfrozen_xid, new_relmin_mxid, and presult are input + * parameters and are not modified by this function. Only prstate is modified. 
*/ -void -heap_page_prune_and_freeze(Relation relation, Buffer buffer, - GlobalVisState *vistest, - int options, - struct VacuumCutoffs *cutoffs, - PruneFreezeResult *presult, - PruneReason reason, - OffsetNumber *off_loc, - TransactionId *new_relfrozen_xid, - MultiXactId *new_relmin_mxid) +static void +prune_freeze_setup(PruneFreezeParams *params, + TransactionId *new_relfrozen_xid, + MultiXactId *new_relmin_mxid, + PruneFreezeResult *presult, + PruneState *prstate) { - Page page = BufferGetPage(buffer); - BlockNumber blockno = BufferGetBlockNumber(buffer); - OffsetNumber offnum, - maxoff; - PruneState prstate; - HeapTupleData tup; - bool do_freeze; - bool do_prune; - bool do_hint; - bool hint_bit_fpi; - int64 fpi_before = pgWalUsage.wal_fpi; - /* Copy parameters to prstate */ - prstate.vistest = vistest; - prstate.mark_unused_now = (options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0; - prstate.freeze = (options & HEAP_PAGE_PRUNE_FREEZE) != 0; - prstate.cutoffs = cutoffs; + prstate->vistest = params->vistest; + prstate->mark_unused_now = + (params->options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0; + + /* cutoffs must be provided if we will attempt freezing */ + Assert(!(params->options & HEAP_PAGE_PRUNE_FREEZE) || params->cutoffs); + prstate->attempt_freeze = (params->options & HEAP_PAGE_PRUNE_FREEZE) != 0; + prstate->cutoffs = params->cutoffs; /* * Our strategy is to scan the page and make lists of items to change, @@ -386,40 +354,49 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * prunable, we will save the lowest relevant XID in new_prune_xid. Also * initialize the rest of our working state. */ - prstate.new_prune_xid = InvalidTransactionId; - prstate.latest_xid_removed = InvalidTransactionId; - prstate.nredirected = prstate.ndead = prstate.nunused = prstate.nfrozen = 0; - prstate.nroot_items = 0; - prstate.nheaponly_items = 0; + prstate->new_prune_xid = InvalidTransactionId; + prstate->latest_xid_removed = InvalidTransactionId; + prstate->nredirected = prstate->ndead = prstate->nunused = 0; + prstate->nfrozen = 0; + prstate->nroot_items = 0; + prstate->nheaponly_items = 0; /* initialize page freezing working state */ - prstate.pagefrz.freeze_required = false; - if (prstate.freeze) + prstate->pagefrz.freeze_required = false; + if (prstate->attempt_freeze) { Assert(new_relfrozen_xid && new_relmin_mxid); - prstate.pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid; - prstate.pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid; - prstate.pagefrz.FreezePageRelminMxid = *new_relmin_mxid; - prstate.pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid; + prstate->pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid; + prstate->pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid; + prstate->pagefrz.FreezePageRelminMxid = *new_relmin_mxid; + prstate->pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid; } else { - Assert(new_relfrozen_xid == NULL && new_relmin_mxid == NULL); - prstate.pagefrz.FreezePageRelminMxid = InvalidMultiXactId; - prstate.pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId; - prstate.pagefrz.FreezePageRelfrozenXid = InvalidTransactionId; - prstate.pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId; + Assert(!new_relfrozen_xid && !new_relmin_mxid); + prstate->pagefrz.FreezePageRelminMxid = InvalidMultiXactId; + prstate->pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId; + prstate->pagefrz.FreezePageRelfrozenXid = InvalidTransactionId; + prstate->pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId; } - prstate.ndeleted = 0; - prstate.live_tuples = 0; - 
prstate.recently_dead_tuples = 0; - prstate.hastup = false; - prstate.lpdead_items = 0; - prstate.deadoffsets = presult->deadoffsets; + prstate->ndeleted = 0; + prstate->live_tuples = 0; + prstate->recently_dead_tuples = 0; + prstate->hastup = false; + prstate->lpdead_items = 0; /* - * Caller may update the VM after we're done. We can keep track of + * deadoffsets are filled in during pruning but are only used to populate + * PruneFreezeResult->deadoffsets. To avoid needing two copies of the + * array, just save a pointer to the result offsets array in the + * PruneState. + */ + prstate->deadoffsets = presult->deadoffsets; + prstate->frz_conflict_horizon = InvalidTransactionId; + + /* + * Vacuum may update the VM after we're done. We can keep track of * whether the page will be all-visible and all-frozen after pruning and * freezing to help the caller to do that. * @@ -436,15 +413,16 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * are tuples present that are not visible to everyone or if there are * dead tuples which are not yet removable. However, dead tuples which * will be removed by the end of vacuuming should not preclude us from - * opportunistically freezing. Because of that, we do not clear - * all_visible when we see LP_DEAD items. We fix that at the end of the - * function, when we return the value to the caller, so that the caller - * doesn't set the VM bit incorrectly. + * opportunistically freezing. Because of that, we do not immediately + * clear all_visible and all_frozen when we see LP_DEAD items. We fix + * that after scanning the line pointers. We must correct all_visible and + * all_frozen before we return them to the caller, so that the caller + * doesn't set the VM bits incorrectly. */ - if (prstate.freeze) + if (prstate->attempt_freeze) { - prstate.all_visible = true; - prstate.all_frozen = true; + prstate->all_visible = true; + prstate->all_frozen = true; } else { @@ -452,8 +430,8 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * Initializing to false allows skipping the work to update them in * heap_prune_record_unchanged_lp_normal(). */ - prstate.all_visible = false; - prstate.all_frozen = false; + prstate->all_visible = false; + prstate->all_frozen = false; } /* @@ -464,10 +442,29 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * running transaction on the standby does not see tuples on the page as * all-visible, so the conflict horizon remains InvalidTransactionId. */ - prstate.visibility_cutoff_xid = InvalidTransactionId; + prstate->visibility_cutoff_xid = InvalidTransactionId; +} - maxoff = PageGetMaxOffsetNumber(page); - tup.t_tableOid = RelationGetRelid(relation); +/* + * Helper for heap_page_prune_and_freeze(). Iterates over every tuple on the + * page, examines its visibility information, and determines the appropriate + * action for each tuple. All tuples are processed and classified during this + * phase, but no modifications are made to the page until the later execution + * stage. + * + * *off_loc is used for error callback and cleared before returning. 
+ */ +static void +prune_freeze_plan(Oid reloid, Buffer buffer, PruneState *prstate, + OffsetNumber *off_loc) +{ + Page page = BufferGetPage(buffer); + BlockNumber blockno = BufferGetBlockNumber(buffer); + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + OffsetNumber offnum; + HeapTupleData tup; + + tup.t_tableOid = reloid; /* * Determine HTSV for all tuples, and queue them up for processing as HOT @@ -502,13 +499,13 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, */ *off_loc = offnum; - prstate.processed[offnum] = false; - prstate.htsv[offnum] = -1; + prstate->processed[offnum] = false; + prstate->htsv[offnum] = -1; /* Nothing to do if slot doesn't contain a tuple */ if (!ItemIdIsUsed(itemid)) { - heap_prune_record_unchanged_lp_unused(page, &prstate, offnum); + heap_prune_record_unchanged_lp_unused(page, prstate, offnum); continue; } @@ -518,17 +515,17 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * If the caller set mark_unused_now true, we can set dead line * pointers LP_UNUSED now. */ - if (unlikely(prstate.mark_unused_now)) - heap_prune_record_unused(&prstate, offnum, false); + if (unlikely(prstate->mark_unused_now)) + heap_prune_record_unused(prstate, offnum, false); else - heap_prune_record_unchanged_lp_dead(page, &prstate, offnum); + heap_prune_record_unchanged_lp_dead(page, prstate, offnum); continue; } if (ItemIdIsRedirected(itemid)) { /* This is the start of a HOT chain */ - prstate.root_items[prstate.nroot_items++] = offnum; + prstate->root_items[prstate->nroot_items++] = offnum; continue; } @@ -542,21 +539,15 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, tup.t_len = ItemIdGetLength(itemid); ItemPointerSet(&tup.t_self, blockno, offnum); - prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, - buffer); + prstate->htsv[offnum] = heap_prune_satisfies_vacuum(prstate, &tup, + buffer); if (!HeapTupleHeaderIsHeapOnly(htup)) - prstate.root_items[prstate.nroot_items++] = offnum; + prstate->root_items[prstate->nroot_items++] = offnum; else - prstate.heaponly_items[prstate.nheaponly_items++] = offnum; + prstate->heaponly_items[prstate->nheaponly_items++] = offnum; } - /* - * If checksums are enabled, heap_prune_satisfies_vacuum() may have caused - * an FPI to be emitted. - */ - hint_bit_fpi = fpi_before != pgWalUsage.wal_fpi; - /* * Process HOT chains. * @@ -568,30 +559,30 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * the page instead of using the root_items array, also did it in * ascending offset number order.) */ - for (int i = prstate.nroot_items - 1; i >= 0; i--) + for (int i = prstate->nroot_items - 1; i >= 0; i--) { - offnum = prstate.root_items[i]; + offnum = prstate->root_items[i]; /* Ignore items already processed as part of an earlier chain */ - if (prstate.processed[offnum]) + if (prstate->processed[offnum]) continue; /* see preceding loop */ *off_loc = offnum; /* Process this item or chain of items */ - heap_prune_chain(page, blockno, maxoff, offnum, &prstate); + heap_prune_chain(page, blockno, maxoff, offnum, prstate); } /* * Process any heap-only tuples that were not already processed as part of * a HOT chain. 
*/ - for (int i = prstate.nheaponly_items - 1; i >= 0; i--) + for (int i = prstate->nheaponly_items - 1; i >= 0; i--) { - offnum = prstate.heaponly_items[i]; + offnum = prstate->heaponly_items[i]; - if (prstate.processed[offnum]) + if (prstate->processed[offnum]) continue; /* see preceding loop */ @@ -610,7 +601,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * return true for an XMIN_INVALID tuple, so this code will work even * when there were sequential updates within the aborted transaction.) */ - if (prstate.htsv[offnum] == HEAPTUPLE_DEAD) + if (prstate->htsv[offnum] == HEAPTUPLE_DEAD) { ItemId itemid = PageGetItemId(page, offnum); HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid); @@ -618,8 +609,8 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, if (likely(!HeapTupleHeaderIsHotUpdated(htup))) { HeapTupleHeaderAdvanceConflictHorizon(htup, - &prstate.latest_xid_removed); - heap_prune_record_unused(&prstate, offnum, true); + &prstate->latest_xid_removed); + heap_prune_record_unused(prstate, offnum, true); } else { @@ -636,7 +627,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, } } else - heap_prune_record_unchanged_lp_normal(page, &prstate, offnum); + heap_prune_record_unchanged_lp_normal(page, prstate, offnum); } /* We should now have processed every tuple exactly once */ @@ -647,75 +638,90 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, { *off_loc = offnum; - Assert(prstate.processed[offnum]); + Assert(prstate->processed[offnum]); } #endif /* Clear the offset information once we have processed the given page. */ *off_loc = InvalidOffsetNumber; +} - do_prune = prstate.nredirected > 0 || - prstate.ndead > 0 || - prstate.nunused > 0; +/* + * Decide whether to proceed with freezing according to the freeze plans + * prepared for the given heap buffer. If freezing is chosen, this function + * performs several pre-freeze checks. + * + * The values of do_prune, do_hint_prune, and did_tuple_hint_fpi must be + * determined before calling this function. + * + * prstate is both an input and output parameter. + * + * Returns true if we should apply the freeze plans and freeze tuples on the + * page, and false otherwise. + */ +static bool +heap_page_will_freeze(Relation relation, Buffer buffer, + bool did_tuple_hint_fpi, + bool do_prune, + bool do_hint_prune, + PruneState *prstate) +{ + bool do_freeze = false; /* - * Even if we don't prune anything, if we found a new value for the - * pd_prune_xid field or the page was marked full, we will update the hint - * bit. + * If the caller specified we should not attempt to freeze any tuples, + * validate that everything is in the right state and return. */ - do_hint = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || - PageIsFull(page); + if (!prstate->attempt_freeze) + { + Assert(!prstate->all_frozen && prstate->nfrozen == 0); + Assert(prstate->lpdead_items == 0 || !prstate->all_visible); + return false; + } - /* - * Decide if we want to go ahead with freezing according to the freeze - * plans we prepared, or not. - */ - do_freeze = false; - if (prstate.freeze) + if (prstate->pagefrz.freeze_required) { - if (prstate.pagefrz.freeze_required) - { - /* - * heap_prepare_freeze_tuple indicated that at least one XID/MXID - * from before FreezeLimit/MultiXactCutoff is present. Must - * freeze to advance relfrozenxid/relminmxid. 
- */ - do_freeze = true; - } - else + /* + * heap_prepare_freeze_tuple indicated that at least one XID/MXID from + * before FreezeLimit/MultiXactCutoff is present. Must freeze to + * advance relfrozenxid/relminmxid. + */ + do_freeze = true; + } + else + { + /* + * Opportunistically freeze the page if we are generating an FPI + * anyway and if doing so means that we can set the page all-frozen + * afterwards (might not happen until VACUUM's final heap pass). + * + * XXX: Previously, we knew if pruning emitted an FPI by checking + * pgWalUsage.wal_fpi before and after pruning. Once the freeze and + * prune records were combined, this heuristic couldn't be used + * anymore. The opportunistic freeze heuristic must be improved; + * however, for now, try to approximate the old logic. + */ + if (prstate->all_frozen && prstate->nfrozen > 0) { + Assert(prstate->all_visible); + /* - * Opportunistically freeze the page if we are generating an FPI - * anyway and if doing so means that we can set the page - * all-frozen afterwards (might not happen until VACUUM's final - * heap pass). - * - * XXX: Previously, we knew if pruning emitted an FPI by checking - * pgWalUsage.wal_fpi before and after pruning. Once the freeze - * and prune records were combined, this heuristic couldn't be - * used anymore. The opportunistic freeze heuristic must be - * improved; however, for now, try to approximate the old logic. + * Freezing would make the page all-frozen. Have already emitted + * an FPI or will do so anyway? */ - if (prstate.all_visible && prstate.all_frozen && prstate.nfrozen > 0) + if (RelationNeedsWAL(relation)) { - /* - * Freezing would make the page all-frozen. Have already - * emitted an FPI or will do so anyway? - */ - if (RelationNeedsWAL(relation)) + if (did_tuple_hint_fpi) + do_freeze = true; + else if (do_prune) { - if (hint_bit_fpi) + if (XLogCheckBufferNeedsBackup(buffer)) + do_freeze = true; + } + else if (do_hint_prune) + { + if (XLogHintBitIsNeeded() && XLogCheckBufferNeedsBackup(buffer)) do_freeze = true; - else if (do_prune) - { - if (XLogCheckBufferNeedsBackup(buffer)) - do_freeze = true; - } - else if (do_hint) - { - if (XLogHintBitIsNeeded() && XLogCheckBufferNeedsBackup(buffer)) - do_freeze = true; - } } } } @@ -727,18 +733,34 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * Validate the tuples we will be freezing before entering the * critical section. */ - heap_pre_freeze_checks(buffer, prstate.frozen, prstate.nfrozen); + heap_pre_freeze_checks(buffer, prstate->frozen, prstate->nfrozen); + + /* + * Calculate what the snapshot conflict horizon should be for a record + * freezing tuples. We can use the visibility_cutoff_xid as our cutoff + * for conflicts when the whole page is eligible to become all-frozen + * in the VM once we're done with it. Otherwise, we generate a + * conservative cutoff by stepping back from OldestXmin. + */ + if (prstate->all_frozen) + prstate->frz_conflict_horizon = prstate->visibility_cutoff_xid; + else + { + /* Avoids false conflicts when hot_standby_feedback in use */ + prstate->frz_conflict_horizon = prstate->cutoffs->OldestXmin; + TransactionIdRetreat(prstate->frz_conflict_horizon); + } } - else if (prstate.nfrozen > 0) + else if (prstate->nfrozen > 0) { /* * The page contained some tuples that were not already frozen, and we * chose not to freeze them now. The page won't be all-frozen then. 
*/ - Assert(!prstate.pagefrz.freeze_required); + Assert(!prstate->pagefrz.freeze_required); - prstate.all_frozen = false; - prstate.nfrozen = 0; /* avoid miscounts in instrumentation */ + prstate->all_frozen = false; + prstate->nfrozen = 0; /* avoid miscounts in instrumentation */ } else { @@ -750,10 +772,130 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, */ } + return do_freeze; +} + + +/* + * Prune and repair fragmentation and potentially freeze tuples on the + * specified page. + * + * Caller must have pin and buffer cleanup lock on the page. Note that we + * don't update the FSM information for page on caller's behalf. Caller might + * also need to account for a reduction in the length of the line pointer + * array following array truncation by us. + * + * params contains the input parameters used to control freezing and pruning + * behavior. See the definition of PruneFreezeParams for more on what each + * parameter does. + * + * If the HEAP_PAGE_PRUNE_FREEZE option is set in params, we will freeze + * tuples if it's required in order to advance relfrozenxid / relminmxid, or + * if it's considered advantageous for overall system performance to do so + * now. The 'params.cutoffs', 'presult', 'new_relfrozen_xid' and + * 'new_relmin_mxid' arguments are required when freezing. When + * HEAP_PAGE_PRUNE_FREEZE option is passed, we also set presult->all_visible + * and presult->all_frozen after determining whether or not to + * opportunistically freeze, to indicate if the VM bits can be set. They are + * always set to false when the HEAP_PAGE_PRUNE_FREEZE option is not passed, + * because at the moment only callers that also freeze need that information. + * + * presult contains output parameters needed by callers, such as the number of + * tuples removed and the offsets of dead items on the page after pruning. + * heap_page_prune_and_freeze() is responsible for initializing it. Required + * by all callers. + * + * off_loc is the offset location required by the caller to use in error + * callback. + * + * new_relfrozen_xid and new_relmin_mxid must be provided by the caller if the + * HEAP_PAGE_PRUNE_FREEZE option is set in params. On entry, they contain the + * oldest XID and multi-XID seen on the relation so far. They will be updated + * with the oldest values present on the page after pruning. After processing + * the whole relation, VACUUM can use these values as the new + * relfrozenxid/relminmxid for the relation. + */ +void +heap_page_prune_and_freeze(PruneFreezeParams *params, + PruneFreezeResult *presult, + OffsetNumber *off_loc, + TransactionId *new_relfrozen_xid, + MultiXactId *new_relmin_mxid) +{ + Buffer buffer = params->buffer; + Page page = BufferGetPage(buffer); + PruneState prstate; + bool do_freeze; + bool do_prune; + bool do_hint_prune; + bool did_tuple_hint_fpi; + int64 fpi_before = pgWalUsage.wal_fpi; + + /* Initialize prstate */ + prune_freeze_setup(params, + new_relfrozen_xid, new_relmin_mxid, + presult, &prstate); + + /* + * Examine all line pointers and tuple visibility information to determine + * which line pointers should change state and which tuples may be frozen. + * Prepare queue of state changes to later be executed in a critical + * section. + */ + prune_freeze_plan(RelationGetRelid(params->relation), + buffer, &prstate, off_loc); + + /* + * If checksums are enabled, calling heap_prune_satisfies_vacuum() while + * checking tuple visibility information in prune_freeze_plan() may have + * caused an FPI to be emitted. 
+ */ + did_tuple_hint_fpi = fpi_before != pgWalUsage.wal_fpi; + + do_prune = prstate.nredirected > 0 || + prstate.ndead > 0 || + prstate.nunused > 0; + + /* + * Even if we don't prune anything, if we found a new value for the + * pd_prune_xid field or the page was marked full, we will update the hint + * bit. + */ + do_hint_prune = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || + PageIsFull(page); + + /* + * Decide if we want to go ahead with freezing according to the freeze + * plans we prepared, or not. + */ + do_freeze = heap_page_will_freeze(params->relation, buffer, + did_tuple_hint_fpi, + do_prune, + do_hint_prune, + &prstate); + + /* + * While scanning the line pointers, we did not clear + * all_visible/all_frozen when encountering LP_DEAD items because we + * wanted the decision whether or not to freeze the page to be unaffected + * by the short-term presence of LP_DEAD items. These LP_DEAD items are + * effectively assumed to be LP_UNUSED items in the making. It doesn't + * matter which vacuum heap pass (initial pass or final pass) ends up + * setting the page all-frozen, as long as the ongoing VACUUM does it. + * + * Now that we finished determining whether or not to freeze the page, + * update all_visible and all_frozen so that they reflect the true state + * of the page for setting PD_ALL_VISIBLE and VM bits. + */ + if (prstate.lpdead_items > 0) + prstate.all_visible = prstate.all_frozen = false; + + Assert(!prstate.all_frozen || prstate.all_visible); + /* Any error while applying the changes is critical */ START_CRIT_SECTION(); - if (do_hint) + if (do_hint_prune) { /* * Update the page's pd_prune_xid field to either zero, or the lowest @@ -794,9 +936,9 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, MarkBufferDirty(buffer); /* - * Emit a WAL XLOG_HEAP2_PRUNE_FREEZE record showing what we did + * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did */ - if (RelationNeedsWAL(relation)) + if (RelationNeedsWAL(params->relation)) { /* * The snapshotConflictHorizon for the whole record should be the @@ -808,35 +950,19 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * on the standby with xids older than the youngest tuple this * record will freeze will conflict. */ - TransactionId frz_conflict_horizon = InvalidTransactionId; TransactionId conflict_xid; - /* - * We can use the visibility_cutoff_xid as our cutoff for - * conflicts when the whole page is eligible to become all-frozen - * in the VM once we're done with it. Otherwise we generate a - * conservative cutoff by stepping back from OldestXmin. 
- */ - if (do_freeze) - { - if (prstate.all_visible && prstate.all_frozen) - frz_conflict_horizon = prstate.visibility_cutoff_xid; - else - { - /* Avoids false conflicts when hot_standby_feedback in use */ - frz_conflict_horizon = prstate.cutoffs->OldestXmin; - TransactionIdRetreat(frz_conflict_horizon); - } - } - - if (TransactionIdFollows(frz_conflict_horizon, prstate.latest_xid_removed)) - conflict_xid = frz_conflict_horizon; + if (TransactionIdFollows(prstate.frz_conflict_horizon, + prstate.latest_xid_removed)) + conflict_xid = prstate.frz_conflict_horizon; else conflict_xid = prstate.latest_xid_removed; - log_heap_prune_and_freeze(relation, buffer, + log_heap_prune_and_freeze(params->relation, buffer, + InvalidBuffer, /* vmbuffer */ + 0, /* vmflags */ conflict_xid, - true, reason, + true, params->reason, prstate.frozen, prstate.nfrozen, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, @@ -852,30 +978,8 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, presult->nfrozen = prstate.nfrozen; presult->live_tuples = prstate.live_tuples; presult->recently_dead_tuples = prstate.recently_dead_tuples; - - /* - * It was convenient to ignore LP_DEAD items in all_visible earlier on to - * make the choice of whether or not to freeze the page unaffected by the - * short-term presence of LP_DEAD items. These LP_DEAD items were - * effectively assumed to be LP_UNUSED items in the making. It doesn't - * matter which vacuum heap pass (initial pass or final pass) ends up - * setting the page all-frozen, as long as the ongoing VACUUM does it. - * - * Now that freezing has been finalized, unset all_visible if there are - * any LP_DEAD items on the page. It needs to reflect the present state - * of the page, as expected by our caller. - */ - if (prstate.all_visible && prstate.lpdead_items == 0) - { - presult->all_visible = prstate.all_visible; - presult->all_frozen = prstate.all_frozen; - } - else - { - presult->all_visible = false; - presult->all_frozen = false; - } - + presult->all_visible = prstate.all_visible; + presult->all_frozen = prstate.all_frozen; presult->hastup = prstate.hastup; /* @@ -894,7 +998,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, presult->lpdead_items = prstate.lpdead_items; /* the presult->deadoffsets array was already filled in */ - if (prstate.freeze) + if (prstate.attempt_freeze) { if (presult->nfrozen > 0) { @@ -1254,8 +1358,9 @@ heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum, prstate->ndead++; /* - * Deliberately delay unsetting all_visible until later during pruning. - * Removable dead tuples shouldn't preclude freezing the page. + * Deliberately delay unsetting all_visible and all_frozen until later + * during pruning. Removable dead tuples shouldn't preclude freezing the + * page. */ /* Record the dead offset for vacuum */ @@ -1383,6 +1488,7 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb if (!HeapTupleHeaderXminCommitted(htup)) { prstate->all_visible = false; + prstate->all_frozen = false; break; } @@ -1397,14 +1503,15 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb /* * For now always use prstate->cutoffs for this test, because - * we only update 'all_visible' when freezing is requested. We - * could use GlobalVisTestIsRemovableXid instead, if a - * non-freezing caller wanted to set the VM bit. + * we only update 'all_visible' and 'all_frozen' when freezing + * is requested. 
We could use GlobalVisTestIsRemovableXid + * instead, if a non-freezing caller wanted to set the VM bit. */ Assert(prstate->cutoffs); if (!TransactionIdPrecedes(xmin, prstate->cutoffs->OldestXmin)) { prstate->all_visible = false; + prstate->all_frozen = false; break; } @@ -1418,6 +1525,7 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb case HEAPTUPLE_RECENTLY_DEAD: prstate->recently_dead_tuples++; prstate->all_visible = false; + prstate->all_frozen = false; /* * This tuple will soon become DEAD. Update the hint field so @@ -1437,6 +1545,7 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb * does, so be consistent. */ prstate->all_visible = false; + prstate->all_frozen = false; /* * If we wanted to optimize for aborts, we might consider marking @@ -1455,6 +1564,7 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb */ prstate->live_tuples++; prstate->all_visible = false; + prstate->all_frozen = false; /* * This tuple may soon become DEAD. Update the hint field so that @@ -1476,7 +1586,7 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb } /* Consider freezing any normal tuples which will not be removed */ - if (prstate->freeze) + if (prstate->attempt_freeze) { bool totally_frozen; @@ -1519,10 +1629,10 @@ heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber * hastup/nonempty_pages as provisional no matter how LP_DEAD items are * handled (handled here, or handled later on). * - * Similarly, don't unset all_visible until later, at the end of - * heap_page_prune_and_freeze(). This will allow us to attempt to freeze - * the page after pruning. As long as we unset it before updating the - * visibility map, this will be correct. + * Similarly, don't unset all_visible and all_frozen until later, at the + * end of heap_page_prune_and_freeze(). This will allow us to attempt to + * freeze the page after pruning. As long as we unset it before updating + * the visibility map, this will be correct. */ /* Record the dead offset for vacuum */ @@ -1563,7 +1673,7 @@ heap_page_prune_execute(Buffer buffer, bool lp_truncate_only, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused) { - Page page = (Page) BufferGetPage(buffer); + Page page = BufferGetPage(buffer); OffsetNumber *offnum; HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY; @@ -2026,7 +2136,7 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, } /* - * Write an XLOG_HEAP2_PRUNE_FREEZE WAL record + * Write an XLOG_HEAP2_PRUNE* WAL record * * This is used for several different page maintenance operations: * @@ -2045,12 +2155,17 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, * replaying 'unused' items depends on whether they were all previously marked * as dead. * + * If the VM is being updated, vmflags will contain the bits to set. In this + * case, vmbuffer should already have been updated and marked dirty and should + * still be pinned and locked. + * * Note: This function scribbles on the 'frozen' array. * * Note: This is called in a critical section, so careful what you do here. 
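+ *
+ * A sketch of the expected calling pattern when VM bits are set in the
+ * same record (modeled on lazy_vacuum_heap_page() below; details elided,
+ * not a verbatim caller):
+ *
+ *    LockBuffer(vmbuffer, BUFFER_LOCK_EXCLUSIVE);
+ *    START_CRIT_SECTION();
+ *    ... apply heap page changes, PageSetAllVisible(page) ...
+ *    visibilitymap_set_vmbits(blkno, vmbuffer, vmflags, rel->rd_locator);
+ *    MarkBufferDirty(buffer);
+ *    if (RelationNeedsWAL(rel))
+ *        log_heap_prune_and_freeze(rel, buffer, vmbuffer, vmflags, ...);
+ *    END_CRIT_SECTION();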
*/ void log_heap_prune_and_freeze(Relation relation, Buffer buffer, + Buffer vmbuffer, uint8 vmflags, TransactionId conflict_xid, bool cleanup_lock, PruneReason reason, @@ -2062,6 +2177,7 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, xl_heap_prune xlrec; XLogRecPtr recptr; uint8 info; + uint8 regbuf_flags_heap; /* The following local variables hold data registered in the WAL record: */ xlhp_freeze_plan plans[MaxHeapTuplesPerPage]; @@ -2070,8 +2186,26 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, xlhp_prune_items dead_items; xlhp_prune_items unused_items; OffsetNumber frz_offsets[MaxHeapTuplesPerPage]; + bool do_prune = nredirected > 0 || ndead > 0 || nunused > 0; + bool do_set_vm = vmflags & VISIBILITYMAP_VALID_BITS; + + Assert((vmflags & VISIBILITYMAP_VALID_BITS) == vmflags); xlrec.flags = 0; + regbuf_flags_heap = REGBUF_STANDARD; + + /* + * We can avoid an FPI of the heap page if the only modification we are + * making to it is to set PD_ALL_VISIBLE and checksums/wal_log_hints are + * disabled. Note that if we explicitly skip an FPI, we must not stamp the + * heap page with this record's LSN. Recovery skips records <= the stamped + * LSN, so this could lead to skipping an earlier FPI needed to repair a + * torn page. + */ + if (!do_prune && + nfrozen == 0 && + (!do_set_vm || !XLogHintBitIsNeeded())) + regbuf_flags_heap |= REGBUF_NO_IMAGE; /* * Prepare data for the buffer. The arrays are not actually in the @@ -2079,7 +2213,11 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, * page image, the arrays can be omitted. */ XLogBeginInsert(); - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBuffer(0, buffer, regbuf_flags_heap); + + if (do_set_vm) + XLogRegisterBuffer(1, vmbuffer, 0); + if (nfrozen > 0) { int nplans; @@ -2136,6 +2274,12 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, * Prepare the main xl_heap_prune record. We already set the XLHP_HAS_* * flag above. */ + if (vmflags & VISIBILITYMAP_ALL_VISIBLE) + { + xlrec.flags |= XLHP_VM_ALL_VISIBLE; + if (vmflags & VISIBILITYMAP_ALL_FROZEN) + xlrec.flags |= XLHP_VM_ALL_FROZEN; + } if (RelationIsAccessibleInLogicalDecoding(relation)) xlrec.flags |= XLHP_IS_CATALOG_REL; if (TransactionIdIsValid(conflict_xid)) @@ -2168,5 +2312,19 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, } recptr = XLogInsert(RM_HEAP2_ID, info); - PageSetLSN(BufferGetPage(buffer), recptr); + if (do_set_vm) + { + Assert(BufferIsDirty(vmbuffer)); + PageSetLSN(BufferGetPage(vmbuffer), recptr); + } + + /* + * See comment at the top of the function about regbuf_flags_heap for + * details on when we can advance the page LSN. 
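+ *
+ * In brief, a summary of the rules above (the comment at the top of the
+ * function is authoritative):
+ *
+ *    pruned or froze?  set VM bits?  hint bits WAL-logged?  stamp heap LSN?
+ *    yes               any           any                    yes
+ *    no                yes           yes                    yes
+ *    no                yes           no                     no
+ *    no                no            any                    no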
+ */ + if (do_prune || nfrozen > 0 || (do_set_vm && XLogHintBitIsNeeded())) + { + Assert(BufferIsDirty(buffer)); + PageSetLSN(BufferGetPage(buffer), recptr); + } } diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index e6d2b5fced198..3b3c87fd62ed8 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -150,7 +150,7 @@ typedef struct RewriteStateData HTAB *rs_old_new_tid_map; /* unmatched B tuples */ HTAB *rs_logical_mappings; /* logical remapping files */ uint32 rs_num_rewrite_mappings; /* # in memory mappings */ -} RewriteStateData; +} RewriteStateData; /* * The lookup keys for the hash tables are tuple TID and xmin (we must check @@ -249,7 +249,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm old_cxt = MemoryContextSwitchTo(rw_cxt); /* Create and fill in the state struct */ - state = palloc0(sizeof(RewriteStateData)); + state = palloc0_object(RewriteStateData); state->rs_old_rel = old_heap; state->rs_new_rel = new_heap; @@ -673,8 +673,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) } /* And now we can insert the tuple into the page */ - newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len, - InvalidOffsetNumber, false, true); + newoff = PageAddItem(page, heaptup->t_data, heaptup->t_len, InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add tuple"); @@ -1170,7 +1169,7 @@ CheckPointLogicalRewriteHeap(void) cutoff = ReplicationSlotsComputeLogicalRestartLSN(); /* don't start earlier than the restart lsn */ - if (cutoff != InvalidXLogRecPtr && redo < cutoff) + if (XLogRecPtrIsValid(cutoff) && redo < cutoff) cutoff = redo; mappings_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR); @@ -1205,7 +1204,7 @@ CheckPointLogicalRewriteHeap(void) lsn = ((uint64) hi) << 32 | lo; - if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + if (lsn < cutoff || !XLogRecPtrIsValid(cutoff)) { elog(DEBUG1, "removing logical rewrite file \"%s\"", path); if (unlink(path) < 0) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index f28326bad0951..30778a15639e7 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -140,7 +140,6 @@ #include "access/visibilitymap.h" #include "access/xloginsert.h" #include "catalog/storage.h" -#include "commands/dbcommands.h" #include "commands/progress.h" #include "commands/vacuum.h" #include "common/int.h" @@ -423,7 +422,7 @@ typedef struct LVSavedErrInfo /* non-export function prototypes */ static void lazy_scan_heap(LVRelState *vacrel); static void heap_vacuum_eager_scan_setup(LVRelState *vacrel, - VacuumParams *params); + const VacuumParams params); static BlockNumber heap_vac_scan_next_block(ReadStream *stream, void *callback_private_data, void *per_buffer_data); @@ -431,7 +430,7 @@ static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis); static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, bool sharelock, Buffer vmbuffer); -static void lazy_scan_prune(LVRelState *vacrel, Buffer buf, +static int lazy_scan_prune(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, Buffer vmbuffer, bool all_visible_according_to_vm, bool *has_lpdead_items, bool *vm_page_frozen); @@ -464,8 +463,21 @@ static void dead_items_add(LVRelState *vacrel, BlockNumber blkno, OffsetNumber * int num_offsets); static void dead_items_reset(LVRelState *vacrel); static void 
dead_items_cleanup(LVRelState *vacrel); -static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, - TransactionId *visibility_cutoff_xid, bool *all_frozen); + +#ifdef USE_ASSERT_CHECKING +static bool heap_page_is_all_visible(Relation rel, Buffer buf, + TransactionId OldestXmin, + bool *all_frozen, + TransactionId *visibility_cutoff_xid, + OffsetNumber *logging_offnum); +#endif +static bool heap_page_would_be_all_visible(Relation rel, Buffer buf, + TransactionId OldestXmin, + OffsetNumber *deadoffsets, + int ndeadoffsets, + bool *all_frozen, + TransactionId *visibility_cutoff_xid, + OffsetNumber *logging_offnum); static void update_relstats_all_indexes(LVRelState *vacrel); static void vacuum_error_callback(void *arg); static void update_vacuum_error_info(LVRelState *vacrel, @@ -485,7 +497,7 @@ static void restore_vacuum_error_info(LVRelState *vacrel, * vacuum options or for relfrozenxid/relminmxid advancement. */ static void -heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) +heap_vacuum_eager_scan_setup(LVRelState *vacrel, const VacuumParams params) { uint32 randseed; BlockNumber allvisible; @@ -504,7 +516,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) vacrel->eager_scan_remaining_successes = 0; /* If eager scanning is explicitly disabled, just return. */ - if (params->max_eager_freeze_failure_rate == 0) + if (params.max_eager_freeze_failure_rate == 0) return; /* @@ -581,11 +593,11 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) vacrel->next_eager_scan_region_start = randseed % EAGER_SCAN_REGION_SIZE; - Assert(params->max_eager_freeze_failure_rate > 0 && - params->max_eager_freeze_failure_rate <= 1); + Assert(params.max_eager_freeze_failure_rate > 0 && + params.max_eager_freeze_failure_rate <= 1); vacrel->eager_scan_max_fails_per_region = - params->max_eager_freeze_failure_rate * + params.max_eager_freeze_failure_rate * EAGER_SCAN_REGION_SIZE; /* @@ -612,7 +624,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) * and locked the relation. */ void -heap_vacuum_rel(Relation rel, VacuumParams *params, +heap_vacuum_rel(Relation rel, const VacuumParams params, BufferAccessStrategy bstrategy) { LVRelState *vacrel; @@ -634,9 +646,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, ErrorContextCallback errcallback; char **indnames = NULL; - verbose = (params->options & VACOPT_VERBOSE) != 0; + verbose = (params.options & VACOPT_VERBOSE) != 0; instrument = (verbose || (AmAutoVacuumWorkerProcess() && - params->log_min_duration >= 0)); + params.log_vacuum_min_duration >= 0)); if (instrument) { pg_rusage_init(&ru0); @@ -652,6 +664,14 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM, RelationGetRelid(rel)); + if (AmAutoVacuumWorkerProcess()) + pgstat_progress_update_param(PROGRESS_VACUUM_STARTED_BY, + params.is_wraparound + ? PROGRESS_VACUUM_STARTED_BY_AUTOVACUUM_WRAPAROUND + : PROGRESS_VACUUM_STARTED_BY_AUTOVACUUM); + else + pgstat_progress_update_param(PROGRESS_VACUUM_STARTED_BY, + PROGRESS_VACUUM_STARTED_BY_MANUAL); /* * Setup error traceback support for ereport() first. The idea is to set @@ -665,7 +685,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * of each rel. It's convenient for code in lazy_scan_heap to always use * these temp copies. 
*/ - vacrel = (LVRelState *) palloc0(sizeof(LVRelState)); + vacrel = palloc0_object(LVRelState); vacrel->dbname = get_database_name(MyDatabaseId); vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel)); vacrel->relname = pstrdup(RelationGetRelationName(rel)); @@ -685,7 +705,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, if (instrument && vacrel->nindexes > 0) { /* Copy index names used by instrumentation (not error reporting) */ - indnames = palloc(sizeof(char *) * vacrel->nindexes); + indnames = palloc_array(char *, vacrel->nindexes); for (int i = 0; i < vacrel->nindexes; i++) indnames[i] = pstrdup(RelationGetRelationName(vacrel->indrels[i])); } @@ -699,9 +719,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * The truncate param allows user to avoid attempting relation truncation, * though it can't force truncation to happen. */ - Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED); - Assert(params->truncate != VACOPTVALUE_UNSPECIFIED && - params->truncate != VACOPTVALUE_AUTO); + Assert(params.index_cleanup != VACOPTVALUE_UNSPECIFIED); + Assert(params.truncate != VACOPTVALUE_UNSPECIFIED && + params.truncate != VACOPTVALUE_AUTO); /* * While VacuumFailSafeActive is reset to false before calling this, we @@ -711,14 +731,14 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->consider_bypass_optimization = true; vacrel->do_index_vacuuming = true; vacrel->do_index_cleanup = true; - vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED); - if (params->index_cleanup == VACOPTVALUE_DISABLED) + vacrel->do_rel_truncate = (params.truncate != VACOPTVALUE_DISABLED); + if (params.index_cleanup == VACOPTVALUE_DISABLED) { /* Force disable index vacuuming up-front */ vacrel->do_index_vacuuming = false; vacrel->do_index_cleanup = false; } - else if (params->index_cleanup == VACOPTVALUE_ENABLED) + else if (params.index_cleanup == VACOPTVALUE_ENABLED) { /* Force index vacuuming. Note that failsafe can still bypass. */ vacrel->consider_bypass_optimization = false; @@ -726,7 +746,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, else { /* Default/auto, make all decisions dynamically */ - Assert(params->index_cleanup == VACOPTVALUE_AUTO); + Assert(params.index_cleanup == VACOPTVALUE_AUTO); } /* Initialize page counters explicitly (be tidy) */ @@ -757,7 +777,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->vm_new_visible_pages = 0; vacrel->vm_new_visible_frozen_pages = 0; vacrel->vm_new_frozen_pages = 0; - vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel); /* * Get cutoffs that determine which deleted tuples are considered DEAD, @@ -776,7 +795,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * to increase the number of dead tuples it can prune away.) 
*/ vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs); + vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel); vacrel->vistest = GlobalVisTestFor(rel); + /* Initialize state used to track oldest extant XID/MXID */ vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin; vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact; @@ -788,7 +809,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, */ vacrel->skippedallvis = false; skipwithvm = true; - if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) + if (params.options & VACOPT_DISABLE_PAGE_SKIPPING) { /* * Force aggressive mode, and disable skipping blocks using the @@ -807,6 +828,12 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, */ heap_vacuum_eager_scan_setup(vacrel, params); + /* Report the vacuum mode: 'normal' or 'aggressive' */ + pgstat_progress_update_param(PROGRESS_VACUUM_MODE, + vacrel->aggressive + ? PROGRESS_VACUUM_MODE_AGGRESSIVE + : PROGRESS_VACUUM_MODE_NORMAL); + if (verbose) { if (vacrel->aggressive) @@ -829,7 +856,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * is already dangerously old.) */ lazy_check_wraparound_failsafe(vacrel); - dead_items_alloc(vacrel, params->nworkers); + dead_items_alloc(vacrel, params.nworkers); /* * Call lazy_scan_heap to perform all required heap pruning, index @@ -934,8 +961,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * soon in cases where the failsafe prevented significant amounts of heap * vacuuming. */ - pgstat_report_vacuum(RelationGetRelid(rel), - rel->rd_rel->relisshared, + pgstat_report_vacuum(rel, Max(vacrel->new_live_tuples, 0), vacrel->recently_dead_tuples + vacrel->missed_dead_tuples, @@ -946,9 +972,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); - if (verbose || params->log_min_duration == 0 || + if (verbose || params.log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, - params->log_min_duration)) + params.log_vacuum_min_duration)) { long secs_dur; int usecs_dur; @@ -983,10 +1009,10 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * Aggressiveness already reported earlier, in dedicated * VACUUM VERBOSE ereport */ - Assert(!params->is_wraparound); + Assert(!params.is_wraparound); msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n"); } - else if (params->is_wraparound) + else if (params.is_wraparound) { /* * While it's possible for a VACUUM to be both is_wraparound @@ -1135,10 +1161,11 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, total_blks_read, total_blks_dirtied); appendStringInfo(&buf, - _("WAL usage: %" PRId64 " records, %" PRId64 " full page images, %" PRIu64 " bytes, %" PRId64 " buffers full\n"), + _("WAL usage: %" PRId64 " records, %" PRId64 " full page images, %" PRIu64 " bytes, %" PRIu64 " full page image bytes, %" PRId64 " buffers full\n"), walusage.wal_records, walusage.wal_fpi, walusage.wal_bytes, + walusage.wal_fpi_bytes, walusage.wal_buffers_full); appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0)); @@ -1244,6 +1271,7 @@ lazy_scan_heap(LVRelState *vacrel) Buffer buf; Page page; uint8 blk_info = 0; + int ndeleted = 0; bool has_lpdead_items; void *per_buffer_data = NULL; bool vm_page_frozen = false; @@ -1386,10 +1414,10 @@ lazy_scan_heap(LVRelState *vacrel) * line pointers previously marked LP_DEAD. 
*/ if (got_cleanup_lock) - lazy_scan_prune(vacrel, buf, blkno, page, - vmbuffer, - blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM, - &has_lpdead_items, &vm_page_frozen); + ndeleted = lazy_scan_prune(vacrel, buf, blkno, page, + vmbuffer, + blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM, + &has_lpdead_items, &vm_page_frozen); /* * Count an eagerly scanned page as a failure or a success. @@ -1413,11 +1441,25 @@ lazy_scan_heap(LVRelState *vacrel) if (vm_page_frozen) { - Assert(vacrel->eager_scan_remaining_successes > 0); - vacrel->eager_scan_remaining_successes--; + if (vacrel->eager_scan_remaining_successes > 0) + vacrel->eager_scan_remaining_successes--; if (vacrel->eager_scan_remaining_successes == 0) { + /* + * Report only once that we disabled eager scanning. We + * may eagerly read ahead blocks in excess of the success + * or failure caps before attempting to freeze them, so we + * could reach here even after disabling additional eager + * scanning. + */ + if (vacrel->eager_scan_max_fails_per_region > 0) + ereport(vacrel->verbose ? INFO : DEBUG2, + (errmsg("disabling eager scanning after freezing %u eagerly scanned blocks of relation \"%s.%s.%s\"", + orig_eager_scan_success_limit, + vacrel->dbname, vacrel->relnamespace, + vacrel->relname))); + /* * If we hit our success cap, permanently disable eager * scanning by setting the other eager scan management @@ -1426,19 +1468,10 @@ lazy_scan_heap(LVRelState *vacrel) vacrel->eager_scan_remaining_fails = 0; vacrel->next_eager_scan_region_start = InvalidBlockNumber; vacrel->eager_scan_max_fails_per_region = 0; - - ereport(vacrel->verbose ? INFO : DEBUG2, - (errmsg("disabling eager scanning after freezing %u eagerly scanned blocks of \"%s.%s.%s\"", - orig_eager_scan_success_limit, - vacrel->dbname, vacrel->relnamespace, - vacrel->relname))); } } - else - { - Assert(vacrel->eager_scan_remaining_fails > 0); + else if (vacrel->eager_scan_remaining_fails > 0) vacrel->eager_scan_remaining_fails--; - } } /* @@ -1475,7 +1508,7 @@ lazy_scan_heap(LVRelState *vacrel) * table has indexes. There will only be newly-freed space if we * held the cleanup lock and lazy_scan_prune() was called. */ - if (got_cleanup_lock && vacrel->nindexes == 0 && has_lpdead_items && + if (got_cleanup_lock && vacrel->nindexes == 0 && ndeleted > 0 && blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) { FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, @@ -1866,8 +1899,6 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, */ if (!PageIsAllVisible(page)) { - uint8 old_vmbits; - START_CRIT_SECTION(); /* mark buffer dirty before writing a WAL record */ @@ -1883,28 +1914,20 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, * WAL-logged, and if not, do that now. */ if (RelationNeedsWAL(vacrel->rel) && - PageGetLSN(page) == InvalidXLogRecPtr) + !XLogRecPtrIsValid(PageGetLSN(page))) log_newpage_buffer(buf, true); PageSetAllVisible(page); - old_vmbits = visibilitymap_set(vacrel->rel, blkno, buf, - InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | - VISIBILITYMAP_ALL_FROZEN); + visibilitymap_set(vacrel->rel, blkno, buf, + InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | + VISIBILITYMAP_ALL_FROZEN); END_CRIT_SECTION(); - /* - * If the page wasn't already set all-visible and/or all-frozen in - * the VM, count it as newly set for logging. 
- */ - if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0) - { - vacrel->vm_new_visible_pages++; - vacrel->vm_new_visible_frozen_pages++; - } - else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0) - vacrel->vm_new_frozen_pages++; + /* Count the newly all-frozen pages for logging */ + vacrel->vm_new_visible_pages++; + vacrel->vm_new_visible_frozen_pages++; } freespace = PageGetHeapFreeSpace(page); @@ -1940,8 +1963,10 @@ cmpOffsetNumbers(const void *a, const void *b) * *vm_page_frozen is set to true if the page is newly set all-frozen in the * VM. The caller currently only uses this for determining whether an eagerly * scanned page was successfully set all-frozen. + * + * Returns the number of tuples deleted from the page during HOT pruning. */ -static void +static int lazy_scan_prune(LVRelState *vacrel, Buffer buf, BlockNumber blkno, @@ -1953,7 +1978,14 @@ lazy_scan_prune(LVRelState *vacrel, { Relation rel = vacrel->rel; PruneFreezeResult presult; - int prune_options = 0; + PruneFreezeParams params = { + .relation = rel, + .buffer = buf, + .reason = PRUNE_VACUUM_SCAN, + .options = HEAP_PAGE_PRUNE_FREEZE, + .vistest = vacrel->vistest, + .cutoffs = &vacrel->cutoffs, + }; Assert(BufferGetBlockNumber(buf) == blkno); @@ -1972,12 +2004,11 @@ lazy_scan_prune(LVRelState *vacrel, * tuples. Pruning will have determined whether or not the page is * all-visible. */ - prune_options = HEAP_PAGE_PRUNE_FREEZE; if (vacrel->nindexes == 0) - prune_options |= HEAP_PAGE_PRUNE_MARK_UNUSED_NOW; + params.options |= HEAP_PAGE_PRUNE_MARK_UNUSED_NOW; - heap_page_prune_and_freeze(rel, buf, vacrel->vistest, prune_options, - &vacrel->cutoffs, &presult, PRUNE_VACUUM_SCAN, + heap_page_prune_and_freeze(¶ms, + &presult, &vacrel->offnum, &vacrel->NewRelfrozenXid, &vacrel->NewRelminMxid); @@ -2003,7 +2034,6 @@ lazy_scan_prune(LVRelState *vacrel, * agreement with heap_page_is_all_visible() using an assertion. 
*/ #ifdef USE_ASSERT_CHECKING - /* Note that all_frozen value does not matter when !all_visible */ if (presult.all_visible) { TransactionId debug_cutoff; @@ -2011,9 +2041,9 @@ lazy_scan_prune(LVRelState *vacrel, Assert(presult.lpdead_items == 0); - if (!heap_page_is_all_visible(vacrel, buf, - &debug_cutoff, &debug_all_frozen)) - Assert(false); + Assert(heap_page_is_all_visible(vacrel->rel, buf, + vacrel->cutoffs.OldestXmin, &debug_all_frozen, + &debug_cutoff, &vacrel->offnum)); Assert(presult.all_frozen == debug_all_frozen); @@ -2056,6 +2086,7 @@ lazy_scan_prune(LVRelState *vacrel, *has_lpdead_items = (presult.lpdead_items > 0); Assert(!presult.all_visible || !(*has_lpdead_items)); + Assert(!presult.all_frozen || presult.all_visible); /* * Handle setting visibility map bit based on information from the VM (as @@ -2123,8 +2154,11 @@ lazy_scan_prune(LVRelState *vacrel, else if (all_visible_according_to_vm && !PageIsAllVisible(page) && visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0) { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + vacrel->relname, blkno))); + visibilitymap_clear(vacrel->rel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); } @@ -2145,8 +2179,11 @@ lazy_scan_prune(LVRelState *vacrel, */ else if (presult.lpdead_items > 0 && PageIsAllVisible(page)) { - elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", - vacrel->relname, blkno); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", + vacrel->relname, blkno))); + PageClearAllVisible(page); MarkBufferDirty(buf); visibilitymap_clear(vacrel->rel, blkno, vmbuffer, @@ -2155,11 +2192,10 @@ lazy_scan_prune(LVRelState *vacrel, /* * If the all-visible page is all-frozen but not marked as such yet, mark - * it as all-frozen. Note that all_frozen is only valid if all_visible is - * true, so we must check both all_visible and all_frozen. + * it as all-frozen. */ - else if (all_visible_according_to_vm && presult.all_visible && - presult.all_frozen && !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) + else if (all_visible_according_to_vm && presult.all_frozen && + !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) { uint8 old_vmbits; @@ -2212,6 +2248,8 @@ lazy_scan_prune(LVRelState *vacrel, *vm_page_frozen = true; } } + + return presult.ndeleted; } /* @@ -2842,8 +2880,10 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, OffsetNumber unused[MaxHeapTuplesPerPage]; int nunused = 0; TransactionId visibility_cutoff_xid; + TransactionId conflict_xid = InvalidTransactionId; bool all_frozen; LVSavedErrInfo saved_err_info; + uint8 vmflags = 0; Assert(vacrel->do_index_vacuuming); @@ -2854,6 +2894,35 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno, InvalidOffsetNumber); + /* + * Before marking dead items unused, check whether the page will become + * all-visible once that change is applied. This lets us reap the tuples + * and mark the page all-visible within the same critical section, + * enabling both changes to be emitted in a single WAL record. Since the + * visibility checks may perform I/O and allocate memory, they must be + * done outside the critical section. 
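+ *
+ * The intended ordering, roughly (the code below is authoritative), is:
+ *
+ *    heap_page_would_be_all_visible()   -- outside the critical section
+ *    LockBuffer(vmbuffer, BUFFER_LOCK_EXCLUSIVE)
+ *    START_CRIT_SECTION()
+ *    ... mark dead items LP_UNUSED, PageSetAllVisible(page) ...
+ *    visibilitymap_set_vmbits()
+ *    log_heap_prune_and_freeze()        -- one record covers both changes
+ *    END_CRIT_SECTION()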
+ */ + if (heap_page_would_be_all_visible(vacrel->rel, buffer, + vacrel->cutoffs.OldestXmin, + deadoffsets, num_offsets, + &all_frozen, &visibility_cutoff_xid, + &vacrel->offnum)) + { + vmflags |= VISIBILITYMAP_ALL_VISIBLE; + if (all_frozen) + { + vmflags |= VISIBILITYMAP_ALL_FROZEN; + Assert(!TransactionIdIsValid(visibility_cutoff_xid)); + } + + /* + * Take the lock on the vmbuffer before entering a critical section. + * The heap page lock must also be held while updating the VM to + * ensure consistency. + */ + LockBuffer(vmbuffer, BUFFER_LOCK_EXCLUSIVE); + } + START_CRIT_SECTION(); for (int i = 0; i < num_offsets; i++) @@ -2873,6 +2942,19 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, /* Attempt to truncate line pointer array now */ PageTruncateLinePointerArray(page); + if ((vmflags & VISIBILITYMAP_VALID_BITS) != 0) + { + /* + * The page is guaranteed to have had dead line pointers, so we always + * set PD_ALL_VISIBLE. + */ + PageSetAllVisible(page); + visibilitymap_set_vmbits(blkno, + vmbuffer, vmflags, + vacrel->rel->rd_locator); + conflict_xid = visibility_cutoff_xid; + } + /* * Mark buffer dirty before we write WAL. */ @@ -2882,7 +2964,9 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, if (RelationNeedsWAL(vacrel->rel)) { log_heap_prune_and_freeze(vacrel->rel, buffer, - InvalidTransactionId, + vmflags != 0 ? vmbuffer : InvalidBuffer, + vmflags, + conflict_xid, false, /* no cleanup lock required */ PRUNE_VACUUM_CLEANUP, NULL, 0, /* frozen */ @@ -2891,53 +2975,15 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, unused, nunused); } - /* - * End critical section, so we safely can do visibility tests (which - * possibly need to perform IO and allocate memory!). If we crash now the - * page (including the corresponding vm bit) might not be marked all - * visible, but that's fine. A later vacuum will fix that. - */ END_CRIT_SECTION(); - /* - * Now that we have removed the LP_DEAD items from the page, once again - * check if the page has become all-visible. The page is already marked - * dirty, exclusively locked, and, if needed, a full page image has been - * emitted. - */ - Assert(!PageIsAllVisible(page)); - if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid, - &all_frozen)) + if ((vmflags & VISIBILITYMAP_ALL_VISIBLE) != 0) { - uint8 old_vmbits; - uint8 flags = VISIBILITYMAP_ALL_VISIBLE; - + /* Count the newly set VM page for logging */ + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); + vacrel->vm_new_visible_pages++; if (all_frozen) - { - Assert(!TransactionIdIsValid(visibility_cutoff_xid)); - flags |= VISIBILITYMAP_ALL_FROZEN; - } - - PageSetAllVisible(page); - old_vmbits = visibilitymap_set(vacrel->rel, blkno, buffer, - InvalidXLogRecPtr, - vmbuffer, visibility_cutoff_xid, - flags); - - /* - * If the page wasn't already set all-visible and/or all-frozen in the - * VM, count it as newly set for logging. 
- */
- if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
- {
- vacrel->vm_new_visible_pages++;
- if (all_frozen)
- vacrel->vm_new_visible_frozen_pages++;
- }
-
- else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 &&
- all_frozen)
- vacrel->vm_new_frozen_pages++;
+ vacrel->vm_new_visible_frozen_pages++;
}

/* Revert to the previous phase information for error traceback */
@@ -2967,9 +3013,10 @@ lazy_check_wraparound_failsafe(LVRelState *vacrel)
{
const int progress_index[] = {
PROGRESS_VACUUM_INDEXES_TOTAL,
- PROGRESS_VACUUM_INDEXES_PROCESSED
+ PROGRESS_VACUUM_INDEXES_PROCESSED,
+ PROGRESS_VACUUM_MODE
};
- int64 progress_val[2] = {0, 0};
+ int64 progress_val[3] = {0, 0, PROGRESS_VACUUM_MODE_FAILSAFE};

VacuumFailsafeActive = true;

@@ -2985,8 +3032,8 @@ lazy_check_wraparound_failsafe(LVRelState *vacrel)
vacrel->do_index_cleanup = false;
vacrel->do_rel_truncate = false;

- /* Reset the progress counters */
- pgstat_progress_update_multi_param(2, progress_index, progress_val);
+ /* Reset the progress counters and set the failsafe mode */
+ pgstat_progress_update_multi_param(3, progress_index, progress_val);

ereport(WARNING,
(errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans",
@@ -3340,6 +3387,9 @@ lazy_truncate_heap(LVRelState *vacrel)
static BlockNumber
count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
{
+ StaticAssertDecl((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
+ "prefetch size must be power of 2");
+
BlockNumber blkno;
BlockNumber prefetchedUntil;
instr_time starttime;
@@ -3354,8 +3404,6 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
* in forward direction, so that OS-level readahead can kick in.
*/
blkno = vacrel->rel_pages;
- StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
- "prefetch size must be power of 2");
prefetchedUntil = InvalidBlockNumber;
while (blkno > vacrel->nonempty_pages)
{
@@ -3533,7 +3581,7 @@ dead_items_alloc(LVRelState *vacrel, int nworkers)
* locally.
*/
- dead_items_info = (VacDeadItemsInfo *) palloc(sizeof(VacDeadItemsInfo));
+ dead_items_info = palloc_object(VacDeadItemsInfo);
dead_items_info->max_bytes = vac_work_mem * (Size) 1024;
dead_items_info->num_items = 0;
vacrel->dead_items_info = dead_items_info;
@@ -3572,6 +3620,8 @@ dead_items_reset(LVRelState *vacrel)
if (ParallelVacuumIsActive(vacrel))
{
parallel_vacuum_reset_dead_items(vacrel->pvs);
+ vacrel->dead_items = parallel_vacuum_get_dead_items(vacrel->pvs,
+ &vacrel->dead_items_info);
return;
}

@@ -3600,31 +3650,90 @@ dead_items_cleanup(LVRelState *vacrel)
vacrel->pvs = NULL;
}

+#ifdef USE_ASSERT_CHECKING
+
/*
- * Check if every tuple in the given page is visible to all current and future
- * transactions. Also return the visibility_cutoff_xid which is the highest
- * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
- * on this page is frozen.
- *
- * This is a stripped down version of lazy_scan_prune(). If you change
- * anything here, make sure that everything stays in sync. Note that an
- * assertion calls us to verify that everybody still agrees. Be sure to avoid
- * introducing new side-effects here.
+ * Wrapper for heap_page_would_be_all_visible() which can be used by callers
+ * that expect no LP_DEAD items on the page. Currently assert-only, but there
+ * is no reason not to use it outside of asserts.
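+ *
+ * Typical usage, as in the assertion in lazy_scan_prune():
+ *
+ *    Assert(heap_page_is_all_visible(vacrel->rel, buf,
+ *                                    vacrel->cutoffs.OldestXmin,
+ *                                    &debug_all_frozen,
+ *                                    &debug_cutoff, &vacrel->offnum));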
*/ static bool -heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, +heap_page_is_all_visible(Relation rel, Buffer buf, + TransactionId OldestXmin, + bool *all_frozen, TransactionId *visibility_cutoff_xid, - bool *all_frozen) + OffsetNumber *logging_offnum) +{ + + return heap_page_would_be_all_visible(rel, buf, + OldestXmin, + NULL, 0, + all_frozen, + visibility_cutoff_xid, + logging_offnum); +} +#endif + +/* + * Check whether the heap page in buf is all-visible except for the dead + * tuples referenced in the deadoffsets array. + * + * Vacuum uses this to check if a page would become all-visible after reaping + * known dead tuples. This function does not remove the dead items. + * + * This cannot be called in a critical section, as the visibility checks may + * perform IO and allocate memory. + * + * Returns true if the page is all-visible other than the provided + * deadoffsets and false otherwise. + * + * OldestXmin is used to determine visibility. + * + * Output parameters: + * + * - *all_frozen: true if every tuple on the page is frozen + * - *visibility_cutoff_xid: newest xmin; valid only if page is all-visible + * - *logging_offnum: OffsetNumber of current tuple being processed; + * used by vacuum's error callback system. + * + * Callers looking to verify that the page is already all-visible can call + * heap_page_is_all_visible(). + * + * This logic is closely related to heap_prune_record_unchanged_lp_normal(). + * If you modify this function, ensure consistency with that code. An + * assertion cross-checks that both remain in agreement. Do not introduce new + * side-effects. + */ +static bool +heap_page_would_be_all_visible(Relation rel, Buffer buf, + TransactionId OldestXmin, + OffsetNumber *deadoffsets, + int ndeadoffsets, + bool *all_frozen, + TransactionId *visibility_cutoff_xid, + OffsetNumber *logging_offnum) { Page page = BufferGetPage(buf); BlockNumber blockno = BufferGetBlockNumber(buf); OffsetNumber offnum, maxoff; bool all_visible = true; + int matched_dead_count = 0; *visibility_cutoff_xid = InvalidTransactionId; *all_frozen = true; + Assert(ndeadoffsets == 0 || deadoffsets); + +#ifdef USE_ASSERT_CHECKING + /* Confirm input deadoffsets[] is strictly sorted */ + if (ndeadoffsets > 1) + { + for (int i = 1; i < ndeadoffsets; i++) + Assert(deadoffsets[i - 1] < deadoffsets[i]); + } +#endif + maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff && all_visible; @@ -3637,7 +3746,7 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, * Set the offset number so that we can display it along with any * error that occurred while processing this tuple. 
*/ - vacrel->offnum = offnum; + *logging_offnum = offnum; itemid = PageGetItemId(page, offnum); /* Unused or redirect line pointers are of no interest */ @@ -3652,19 +3761,26 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, */ if (ItemIdIsDead(itemid)) { - all_visible = false; - *all_frozen = false; - break; + if (!deadoffsets || + matched_dead_count >= ndeadoffsets || + deadoffsets[matched_dead_count] != offnum) + { + *all_frozen = all_visible = false; + break; + } + matched_dead_count++; + continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(vacrel->rel); + tuple.t_tableOid = RelationGetRelid(rel); - switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin, - buf)) + /* Visibility checks may do IO or allocate memory */ + Assert(CritSectionCount == 0); + switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) { case HEAPTUPLE_LIVE: { @@ -3683,8 +3799,7 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, * that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); - if (!TransactionIdPrecedes(xmin, - vacrel->cutoffs.OldestXmin)) + if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; *all_frozen = false; @@ -3719,7 +3834,7 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, } /* scan along page */ /* Clear the offset information once we have processed the given page. */ - vacrel->offnum = InvalidOffsetNumber; + *logging_offnum = InvalidOffsetNumber; return all_visible; } diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 745a04ef26e29..d14588e92ae74 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -14,7 +14,8 @@ * visibilitymap_clear - clear bits for one page in the visibility map * visibilitymap_pin - pin a map page for setting a bit * visibilitymap_pin_ok - check whether correct map page is already pinned - * visibilitymap_set - set a bit in a previously pinned page + * visibilitymap_set - set bit(s) in a previously pinned page and log + * visibilitymap_set_vmbits - set bit(s) in a pinned page * visibilitymap_get_status - get status of bits * visibilitymap_count - count number of bits set in visibility map * visibilitymap_prepare_truncate - @@ -255,11 +256,12 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, uint8 status; #ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); + elog(DEBUG1, "vm_set flags 0x%02X for %s %d", + flags, RelationGetRelationName(rel), heapBlk); #endif - Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); - Assert(InRecovery || PageIsAllVisible((Page) BufferGetPage(heapBuf))); + Assert(InRecovery || !XLogRecPtrIsValid(recptr)); + Assert(InRecovery || PageIsAllVisible(BufferGetPage(heapBuf))); Assert((flags & VISIBILITYMAP_VALID_BITS) == flags); /* Must never set all_frozen bit without also setting all_visible bit */ @@ -269,6 +271,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); + Assert(!BufferIsValid(heapBuf) || + BufferIsLockedByMeInMode(heapBuf, BUFFER_LOCK_EXCLUSIVE)); + /* Check that we have the right VM page pinned */ if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) elog(ERROR, "wrong VM buffer passed to 
visibilitymap_set"); @@ -287,7 +292,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, if (RelationNeedsWAL(rel)) { - if (XLogRecPtrIsInvalid(recptr)) + if (!XLogRecPtrIsValid(recptr)) { Assert(!InRecovery); recptr = log_heap_visible(rel, heapBuf, vmBuf, cutoff_xid, flags); @@ -318,6 +323,73 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, return status; } +/* + * Set VM (visibility map) flags in the VM block in vmBuf. + * + * This function is intended for callers that log VM changes together + * with the heap page modifications that rendered the page all-visible. + * Callers that log VM changes separately should use visibilitymap_set(). + * + * vmBuf must be pinned and exclusively locked, and it must cover the VM bits + * corresponding to heapBlk. + * + * In normal operation (not recovery), this must be called inside a critical + * section that also applies the necessary heap page changes and, if + * applicable, emits WAL. + * + * The caller is responsible for ensuring consistency between the heap page + * and the VM page by holding a pin and exclusive lock on the buffer + * containing heapBlk. + * + * rlocator is used only for debugging messages. + */ +uint8 +visibilitymap_set_vmbits(BlockNumber heapBlk, + Buffer vmBuf, uint8 flags, + const RelFileLocator rlocator) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + Page page; + uint8 *map; + uint8 status; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_set flags 0x%02X for %s %d", + flags, + relpathbackend(rlocator, MyProcNumber, MAIN_FORKNUM).str, + heapBlk); +#endif + + /* Call in same critical section where WAL is emitted. */ + Assert(InRecovery || CritSectionCount > 0); + + /* Flags should be valid. Also never clear bits with this function */ + Assert((flags & VISIBILITYMAP_VALID_BITS) == flags); + + /* Must never set all_frozen bit without also setting all_visible bit */ + Assert(flags != VISIBILITYMAP_ALL_FROZEN); + + /* Check that we have the right VM page pinned */ + if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) + elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); + + Assert(BufferIsLockedByMeInMode(vmBuf, BUFFER_LOCK_EXCLUSIVE)); + + page = BufferGetPage(vmBuf); + map = (uint8 *) PageGetContents(page); + + status = (map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS; + if (flags != status) + { + map[mapByte] |= (flags << mapOffset); + MarkBufferDirty(vmBuf); + } + + return status; +} + /* * visibilitymap_get_status - get status of bits * @@ -364,7 +436,7 @@ visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) { *vmbuf = vm_readbuf(rel, mapBlock, false); if (!BufferIsValid(*vmbuf)) - return false; + return (uint8) 0; } map = PageGetContents(BufferGetPage(*vmbuf)); diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index f0f4f974bcedb..60684c5342279 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -42,6 +42,19 @@ GetIndexAmRoutine(Oid amhandler) elog(ERROR, "index access method handler function %u did not return an IndexAmRoutine struct", amhandler); + /* Assert that all required callbacks are present. 
*/ + Assert(routine->ambuild != NULL); + Assert(routine->ambuildempty != NULL); + Assert(routine->aminsert != NULL); + Assert(routine->ambulkdelete != NULL); + Assert(routine->amvacuumcleanup != NULL); + Assert(routine->amcostestimate != NULL); + Assert(routine->amoptions != NULL); + Assert(routine->amvalidate != NULL); + Assert(routine->ambeginscan != NULL); + Assert(routine->amrescan != NULL); + Assert(routine->amendscan != NULL); + return routine; } diff --git a/src/backend/access/index/amvalidate.c b/src/backend/access/index/amvalidate.c index 4cf237019adaf..8d7e7171bd7e1 100644 --- a/src/backend/access/index/amvalidate.c +++ b/src/backend/access/index/amvalidate.c @@ -118,7 +118,7 @@ identify_opfamily_groups(CatCList *oprlist, CatCList *proclist) } /* Time for a new group */ - thisgroup = (OpFamilyOpFuncGroup *) palloc(sizeof(OpFamilyOpFuncGroup)); + thisgroup = palloc_object(OpFamilyOpFuncGroup); if (oprform && (!procform || (oprform->amoplefttype < procform->amproclefttype || diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 0cb27af131095..b7f10a1aed0bb 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -81,7 +81,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) { IndexScanDesc scan; - scan = (IndexScanDesc) palloc(sizeof(IndexScanDescData)); + scan = palloc_object(IndexScanDescData); scan->heapRelation = NULL; /* may be set later */ scan->xs_heapfetch = NULL; @@ -94,11 +94,11 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) * We allocate key workspace here, but it won't get filled until amrescan. */ if (nkeys > 0) - scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); + scan->keyData = palloc_array(ScanKeyData, nkeys); else scan->keyData = NULL; if (norderbys > 0) - scan->orderByData = (ScanKey) palloc(sizeof(ScanKeyData) * norderbys); + scan->orderByData = palloc_array(ScanKeyData, norderbys); else scan->orderByData = NULL; @@ -310,8 +310,8 @@ index_compute_xid_horizon_for_tuples(Relation irel, delstate.bottomup = false; delstate.bottomupfreespace = 0; delstate.ndeltids = 0; - delstate.deltids = palloc(nitems * sizeof(TM_IndexDelete)); - delstate.status = palloc(nitems * sizeof(TM_IndexStatus)); + delstate.deltids = palloc_array(TM_IndexDelete, nitems); + delstate.status = palloc_array(TM_IndexStatus, nitems); /* identify what the index tuples about to be deleted point to */ for (int i = 0; i < nitems; i++) @@ -401,7 +401,7 @@ systable_beginscan(Relation heapRelation, else irel = NULL; - sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData)); + sysscan = palloc_object(SysScanDescData); sysscan->heap_rel = heapRelation; sysscan->irel = irel; @@ -488,7 +488,7 @@ systable_beginscan(Relation heapRelation, * is declared. 
*/ static inline void -HandleConcurrentAbort() +HandleConcurrentAbort(void) { if (TransactionIdIsValid(CheckXidAlive) && !TransactionIdIsInProgress(CheckXidAlive) && @@ -667,7 +667,7 @@ systable_beginscan_ordered(Relation heapRelation, elog(WARNING, "using index \"%s\" despite IgnoreSystemIndexes", RelationGetRelationName(indexRelation)); - sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData)); + sysscan = palloc_object(SysScanDescData); sysscan->heap_rel = heapRelation; sysscan->irel = indexRelation; @@ -781,10 +781,11 @@ systable_endscan_ordered(SysScanDesc sysscan) * systable_inplace_update_begin --- update a row "in place" (overwrite it) * * Overwriting violates both MVCC and transactional safety, so the uses of - * this function in Postgres are extremely limited. Nonetheless we find some - * places to use it. See README.tuplock section "Locking to write - * inplace-updated tables" and later sections for expectations of readers and - * writers of a table that gets inplace updates. Standard flow: + * this function in Postgres are extremely limited. This makes no effort to + * support updating cache key columns or other indexed columns. Nonetheless + * we find some places to use it. See README.tuplock section "Locking to + * write inplace-updated tables" and later sections for expectations of + * readers and writers of a table that gets inplace updates. Standard flow: * * ... [any slow preparation not requiring oldtup] ... * systable_inplace_update_begin([...], &tup, &inplace_state); diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 219df1971da66..0492d92d23b13 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -75,7 +75,7 @@ #define RELATION_CHECKS \ do { \ Assert(RelationIsValid(indexRelation)); \ - Assert(PointerIsValid(indexRelation->rd_indam)); \ + Assert(indexRelation->rd_indam); \ if (unlikely(ReindexIsProcessingIndex(RelationGetRelid(indexRelation)))) \ ereport(ERROR, \ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ @@ -85,9 +85,9 @@ do { \ #define SCAN_CHECKS \ ( \ - AssertMacro(IndexScanIsValid(scan)), \ + AssertMacro(scan), \ AssertMacro(RelationIsValid(scan->indexRelation)), \ - AssertMacro(PointerIsValid(scan->indexRelation->rd_indam)) \ + AssertMacro(scan->indexRelation->rd_indam) \ ) #define CHECK_REL_PROCEDURE(pname) \ @@ -263,6 +263,16 @@ index_beginscan(Relation heapRelation, Assert(snapshot != InvalidSnapshot); + /* Check that a historic snapshot is not used for non-catalog tables */ + if (IsHistoricMVCCSnapshot(snapshot) && + !RelationIsAccessibleInLogicalDecoding(heapRelation)) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot query non-catalog table \"%s\" during logical decoding", + RelationGetRelationName(heapRelation)))); + } + scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false); /* @@ -986,11 +996,6 @@ index_store_float8_orderby_distances(IndexScanDesc scan, Oid *orderByTypes, { if (orderByTypes[i] == FLOAT8OID) { -#ifndef USE_FLOAT8_BYVAL - /* must free any old value to avoid memory leakage */ - if (!scan->xs_orderbynulls[i]) - pfree(DatumGetPointer(scan->xs_orderbyvals[i])); -#endif if (distances && !distances[i].isnull) { scan->xs_orderbyvals[i] = Float8GetDatum(distances[i].value); diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index c5cd4e0177fa5..0daf640af96c7 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -18,6 +18,7 @@ OBJS 
= \ nbtinsert.o \ nbtpage.o \ nbtpreprocesskeys.o \ + nbtreadpage.o \ nbtree.o \ nbtsearch.o \ nbtsort.o \ diff --git a/src/backend/access/nbtree/meson.build b/src/backend/access/nbtree/meson.build index 80962de6e6ed9..027b891966440 100644 --- a/src/backend/access/nbtree/meson.build +++ b/src/backend/access/nbtree/meson.build @@ -6,6 +6,7 @@ backend_sources += files( 'nbtinsert.c', 'nbtpage.c', 'nbtpreprocesskeys.c', + 'nbtreadpage.c', 'nbtree.c', 'nbtsearch.c', 'nbtsort.c', diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 4da5a3c1d161d..188c27b4925f7 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -278,32 +278,12 @@ btint8cmp(PG_FUNCTION_ARGS) PG_RETURN_INT32(A_LESS_THAN_B); } -#if SIZEOF_DATUM < 8 -static int -btint8fastcmp(Datum x, Datum y, SortSupport ssup) -{ - int64 a = DatumGetInt64(x); - int64 b = DatumGetInt64(y); - - if (a > b) - return A_GREATER_THAN_B; - else if (a == b) - return 0; - else - return A_LESS_THAN_B; -} -#endif - Datum btint8sortsupport(PG_FUNCTION_ARGS) { SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); -#if SIZEOF_DATUM >= 8 ssup->comparator = ssup_datum_signed_cmp; -#else - ssup->comparator = btint8fastcmp; -#endif PG_RETURN_VOID(); } @@ -555,7 +535,7 @@ btcharcmp(PG_FUNCTION_ARGS) static Datum char_decrement(Relation rel, Datum existing, bool *underflow) { - uint8 cexisting = UInt8GetDatum(existing); + uint8 cexisting = DatumGetUInt8(existing); if (cexisting == 0) { @@ -571,7 +551,7 @@ char_decrement(Relation rel, Datum existing, bool *underflow) static Datum char_increment(Relation rel, Datum existing, bool *overflow) { - uint8 cexisting = UInt8GetDatum(existing); + uint8 cexisting = DatumGetUInt8(existing); if (cexisting == UCHAR_MAX) { diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index 08884116aecbe..4c8ac8e753a83 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -16,6 +16,7 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "access/xloginsert.h" #include "miscadmin.h" #include "utils/rel.h" @@ -81,7 +82,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, * That ought to leave us with a good split point when pages full of * duplicates can be split several times. 
*/ - state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state = palloc_object(BTDedupStateData); state->deduplicate = true; state->nmaxitems = 0; state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK); @@ -125,8 +126,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, Size hitemsz = ItemIdGetLength(hitemid); IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid); - if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY, - false, false) == InvalidOffsetNumber) + if (PageAddItem(newpage, hitem, hitemsz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "deduplication failed to add highkey"); } @@ -321,7 +321,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, newitemsz += sizeof(ItemIdData); /* Initialize deduplication state */ - state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state = palloc_object(BTDedupStateData); state->deduplicate = true; state->nmaxitems = 0; state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */ @@ -355,8 +355,8 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, delstate.bottomup = true; delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz); delstate.ndeltids = 0; - delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); - delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); + delstate.deltids = palloc_array(TM_IndexDelete, MaxTIDsPerBTreePage); + delstate.status = palloc_array(TM_IndexStatus, MaxTIDsPerBTreePage); minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@ -569,8 +569,7 @@ _bt_dedup_finish_pending(Page newpage, BTDedupState state) tuplesz = IndexTupleSize(state->base); Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base))); Assert(tuplesz <= BTMaxItemSize); - if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff, - false, false) == InvalidOffsetNumber) + if (PageAddItem(newpage, state->base, tuplesz, tupoff, false, false) == InvalidOffsetNumber) elog(ERROR, "deduplication failed to add tuple to page"); spacesaving = 0; @@ -589,8 +588,7 @@ _bt_dedup_finish_pending(Page newpage, BTDedupState state) Assert(tuplesz == MAXALIGN(IndexTupleSize(final))); Assert(tuplesz <= BTMaxItemSize); - if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false, - false) == InvalidOffsetNumber) + if (PageAddItem(newpage, final, tuplesz, tupoff, false, false) == InvalidOffsetNumber) elog(ERROR, "deduplication failed to add tuple to page"); pfree(final); @@ -861,7 +859,7 @@ _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz) * returned posting list tuple (they must be included in htids array.) 
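+ *
+ * A minimal usage sketch (assumes 'base' is a regular index tuple and
+ * 'htids' holds nhtids heap TIDs in ascending order; the returned tuple
+ * is palloc'd, so the caller may pfree it when done):
+ *
+ *    IndexTuple posting = _bt_form_posting(base, htids, nhtids);
+ *    ... add 'posting' to a page in place of 'base' ...
+ *    pfree(posting);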
*/ IndexTuple -_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids) +_bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids) { uint32 keysize, newsize; diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index aa82cede30aa4..031eb76ba8cff 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -17,6 +17,7 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "access/transam.h" #include "access/xloginsert.h" #include "common/int.h" @@ -25,6 +26,7 @@ #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "utils/injection_point.h" /* Minimum tree height for application of fastpath optimization */ #define BTREE_FASTPATH_MIN_LEVEL 2 @@ -60,7 +62,7 @@ static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly); static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf); -static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, +static inline bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem); static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, @@ -1238,6 +1240,13 @@ _bt_insertonpg(Relation rel, * page. *---------- */ +#ifdef USE_INJECTION_POINTS + if (P_ISLEAF(opaque)) + INJECTION_POINT("nbtree-leave-leaf-split-incomplete", NULL); + else + INJECTION_POINT("nbtree-leave-internal-split-incomplete", NULL); +#endif + _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly); } else @@ -1277,8 +1286,7 @@ _bt_insertonpg(Relation rel, if (postingoff != 0) memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); - if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false, - false) == InvalidOffsetNumber) + if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add new item to block %u in index \"%s\"", BufferGetBlockNumber(buf), RelationGetRelationName(rel)); @@ -1472,6 +1480,8 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Page origpage; Page leftpage, rightpage; + PGAlignedBlock leftpage_buf, + rightpage_buf; BlockNumber origpagenumber, rightpagenumber; BTPageOpaque ropaque, @@ -1542,8 +1552,8 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, newitem, &newitemonleft); - /* Allocate temp buffer for leftpage */ - leftpage = PageGetTempPage(origpage); + /* Use temporary buffer for leftpage */ + leftpage = leftpage_buf.data; _bt_pageinit(leftpage, BufferGetPageSize(buf)); lopaque = BTPageGetOpaque(leftpage); @@ -1697,8 +1707,7 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Assert(BTreeTupleGetNAtts(lefthighkey, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey))); - if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false, - false) == InvalidOffsetNumber) + if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add high key to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1706,19 
+1715,23 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, /* * Acquire a new right page to split into, now that left page has a new - * high key. From here on, it's not okay to throw an error without - * zeroing rightpage first. This coding rule ensures that we won't - * confuse future VACUUM operations, which might otherwise try to re-find - * a downlink to a leftover junk page as the page undergoes deletion. + * high key. * - * It would be reasonable to start the critical section just after the new - * rightpage buffer is acquired instead; that would allow us to avoid - * leftover junk pages without bothering to zero rightpage. We do it this - * way because it avoids an unnecessary PANIC when either origpage or its - * existing sibling page are corrupt. + * To avoid confusing future VACUUM operations, we zero the right page and + * work on an in-memory copy of it before writing WAL, then copy its + * contents back to the actual page once we start the critical section + * work. This simplifies the split work, since there is no need to zero + * the right page before throwing an error. */ rbuf = _bt_allocbuf(rel, heaprel); - rightpage = BufferGetPage(rbuf); + rightpage = rightpage_buf.data; + + /* + * Copy the contents of the right page into its temporary location, and + * zero the original space. + */ + memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ); + memset(BufferGetPage(rbuf), 0, BLCKSZ); rightpagenumber = BufferGetBlockNumber(rbuf); /* rightpage was initialized by _bt_allocbuf */ ropaque = BTPageGetOpaque(rightpage); @@ -1764,10 +1777,8 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0); Assert(BTreeTupleGetNAtts(righthighkey, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); - if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff, - false, false) == InvalidOffsetNumber) + if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add high key to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1815,7 +1826,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff, false)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1828,7 +1838,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, afterrightoff == minusinfoff)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1842,7 +1851,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, { if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add old item to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1854,7 +1862,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff, afterrightoff ==
minusinfoff)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add old item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1875,7 +1882,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, afterrightoff == minusinfoff)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1895,7 +1901,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, sopaque = BTPageGetOpaque(spage); if (sopaque->btpo_prev != origpagenumber) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("right sibling's left-link doesn't match: " @@ -1938,9 +1943,19 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, * original. We need to do this before writing the WAL record, so that * XLogInsert can WAL log an image of the page if necessary. */ - PageRestoreTempPage(leftpage, origpage); + memcpy(origpage, leftpage, BLCKSZ); /* leftpage, lopaque must not be used below here */ + /* + * Move the contents of the right page from its temporary location to the + * destination buffer, before writing the WAL record. Unlike the left + * page, the right page and its opaque area are still needed to complete + * the update of the page, so reinitialize them. + */ + rightpage = BufferGetPage(rbuf); + memcpy(rightpage, rightpage_buf.data, BLCKSZ); + ropaque = BTPageGetOpaque(rightpage); + MarkBufferDirty(buf); MarkBufferDirty(rbuf); @@ -2278,6 +2293,7 @@ _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack) /* Was this the only page on the level before split? */ wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop)); + INJECTION_POINT("nbtree-finish-incomplete-split", NULL); elog(DEBUG1, "finishing incomplete split of %u/%u", BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf)); @@ -2527,8 +2543,7 @@ _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) * benefit of _bt_restore_page(). 
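[Illustrative aside, not part of the patch: the change above replaces the old "zero rightpage before any error" bookkeeping with a staging pattern -- do all page modifications on a stack-allocated copy, then publish the finished page with a single memcpy() once inside the critical section. Below is a minimal hedged sketch of that pattern, assuming the usual bufmgr/critical-section/WAL APIs (postgres.h, miscadmin.h, storage/bufmgr.h, access/xloginsert.h); the function name is hypothetical. PGAlignedBlock, from c.h, supplies a BLCKSZ-sized buffer that is suitably aligned for use as a Page.]

static void
stage_page_update_sketch(Buffer buf)
{
	PGAlignedBlock tmp;			/* BLCKSZ bytes, suitably aligned for a Page */
	Page		workpage = tmp.data;

	/* Stage changes in a local copy: elog(ERROR) leaves the real page intact */
	memcpy(workpage, BufferGetPage(buf), BLCKSZ);
	/* ... modify workpage freely; no error-cleanup obligations here ... */

	START_CRIT_SECTION();
	memcpy(BufferGetPage(buf), workpage, BLCKSZ);	/* publish the result */
	MarkBufferDirty(buf);
	/* ... XLogInsert() the change before leaving the critical section ... */
	END_CRIT_SECTION();
}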
*/ Assert(BTreeTupleGetNAtts(left_item, rel) == 0); - if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY, - false, false) == InvalidOffsetNumber) + if (PageAddItem(rootpage, left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add leftkey to new root page" " while splitting block %u of index \"%s\"", BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); @@ -2539,8 +2554,7 @@ _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) Assert(BTreeTupleGetNAtts(right_item, rel) > 0); Assert(BTreeTupleGetNAtts(right_item, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); - if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY, - false, false) == InvalidOffsetNumber) + if (PageAddItem(rootpage, right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add rightkey to new root page" " while splitting block %u of index \"%s\"", BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); @@ -2629,7 +2643,7 @@ _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) static inline bool _bt_pgaddtup(Page page, Size itemsize, - IndexTuple itup, + const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem) { @@ -2644,8 +2658,7 @@ _bt_pgaddtup(Page page, itemsize = sizeof(IndexTupleData); } - if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false, - false) == InvalidOffsetNumber)) + if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)) return false; return true; @@ -2950,7 +2963,7 @@ _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, */ spacentids = ndeletable + 1; ntids = 0; - tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids); + tidblocks = palloc_array(BlockNumber, spacentids); /* * First add the table block for the incoming newitem. This is the one diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index c79dd38ee18f3..cfb07b2bca93c 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -33,6 +33,7 @@ #include "storage/indexfsm.h" #include "storage/predicate.h" #include "storage/procarray.h" +#include "utils/injection_point.h" #include "utils/memdebug.h" #include "utils/memutils.h" #include "utils/snapmgr.h" @@ -1194,8 +1195,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, itup = updatable[i]->itup; itemsz = MAXALIGN(IndexTupleSize(itup)); - if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, - itemsz)) + if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz)) elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", BufferGetBlockNumber(buf), RelationGetRelationName(rel)); } @@ -1314,8 +1314,7 @@ _bt_delitems_delete(Relation rel, Buffer buf, itup = updatable[i]->itup; itemsz = MAXALIGN(IndexTupleSize(itup)); - if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, - itemsz)) + if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz)) elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", BufferGetBlockNumber(buf), RelationGetRelationName(rel)); } @@ -2005,6 +2004,10 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) return; } } + else + { + INJECTION_POINT("nbtree-finish-half-dead-page-vacuum", NULL); + } /* * Then unlink it from its siblings. 
Each call to @@ -2239,8 +2242,7 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, else BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber); - if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple, - IndexTupleSize(&trunctuple))) + if (!PageIndexTupleOverwrite(page, P_HIKEY, &trunctuple, IndexTupleSize(&trunctuple))) elog(ERROR, "could not overwrite high key in half-dead page"); /* Must mark buffers dirty before XLogInsert */ @@ -2352,6 +2354,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, _bt_unlockbuf(rel, leafbuf); + INJECTION_POINT("nbtree-leave-page-half-dead", NULL); + /* * Check here, as calling loops will have locks held, preventing * interrupts from being processed. @@ -2978,7 +2982,7 @@ _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly) vstate->maxbufsize = (int) maxbufsize; /* Allocate buffer, indicate that there are currently 0 pending pages */ - vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize); + vstate->pendingpages = palloc_array(BTPendingFSM, vstate->bufsize); vstate->npendingpages = 0; } diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index a136e4bbfdfb5..5b251c5058dcd 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -16,10 +16,13 @@ #include "postgres.h" #include "access/nbtree.h" +#include "access/relscan.h" +#include "common/int.h" #include "lib/qunique.h" #include "utils/array.h" #include "utils/lsyscache.h" #include "utils/memutils.h" +#include "utils/rel.h" typedef struct BTScanKeyPreproc { @@ -56,13 +59,15 @@ static void _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk, BTArrayKeyInfo *array); static void _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, BTArrayKeyInfo *array); +static void _bt_unmark_keys(IndexScanDesc scan, int *keyDataMap); +static int _bt_reorder_array_cmp(const void *a, const void *b); static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys); static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); static int _bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops_out, int *numSkipArrayKeys_out); static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype, StrategyNumber strat, - Datum *elems, int nelems); + const Datum *elems, int nelems); static void _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype, FmgrInfo *orderproc, FmgrInfo **sortprocp); static int _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc, @@ -96,7 +101,7 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * incomplete sets of cross-type operators, we may fail to detect redundant * or contradictory keys, but we can survive that.) * - * The output keys must be sorted by index attribute. Presently we expect + * Required output keys are sorted by index attribute. Presently we expect * (but verify) that the input keys are already so sorted --- this is done * by match_clauses_to_index() in indxpath.c. Some reordering of the keys * within each attribute may be done as a byproduct of the processing here. @@ -117,6 +122,10 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * For the first attribute without an "=" key, any "<" and "<=" keys are * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD. 
* This can be seen to be correct by considering the above example. + * (Actually, the z key _will_ be marked SK_BT_REQFWD, since preprocessing + * will generate a skip array on y -- except when DEBUG_DISABLE_SKIP_SCAN. + * See below description of how and why we generate skip array = keys in the + * presence of a "contradictory" condition such as "y < 4".) * * If we never generated skip array scan keys, it would be possible for "gaps" * to appear that make it unsafe to mark any subsequent input scan keys @@ -127,29 +136,36 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * This has the potential to be much more efficient than a full index scan * (though it behaves like a full scan when there's many distinct "x" values). * - * If possible, redundant keys are eliminated: we keep only the tightest + * Typically, redundant keys are eliminated: we keep only the tightest * >/>= bound and the tightest </<= bound, and if there's an = key then * that's the only one returned. However, if we cannot compare two keys for * lack of a suitable cross-type operator, - * we cannot eliminate either. Suppose we have "x > - * 4::int AND x > 10::bigint", and we are unable to determine - * which key is more restrictive for lack of a suitable cross-type operator. - * _bt_first will arbitrarily pick one of the keys to do the initial - * positioning with. If it picks x > 4, then the x > 10 condition will fail - * until we reach index entries > 10; but we can't stop the scan just because - * x > 10 is failing. On the other hand, if we are scanning backwards, then - * failure of either key is indeed enough to stop the scan. (In general, when - * inequality keys are present, the initial-positioning code only promises to - * position before the first possible match, not exactly at the first match, - * for a forward scan; or after the last match for a backward scan.) + * we cannot eliminate either key. + * + * When not all redundant keys could be eliminated, we'll output a key array + * that can more or less be treated as if it had no redundant keys. Suppose + * we have "x > 4::int AND x > 10::bigint AND x < 70", and we are unable to + * determine which > key is more restrictive for lack of a suitable cross-type + * operator. We'll arbitrarily pick one of the > keys; the other > key won't + * be marked required. Obviously, the scan will be less efficient if we + * choose x > 4 over x > 10 -- but it can still largely proceed as if there + * were only a single > condition. "x > 10" will be placed at the end of the + * so->keyData[] output array. It'll always be evaluated last, after the keys + * that could be marked required in the usual way (after "x > 4 AND x < 70"). + * This can sometimes result in so->keyData[] keys that aren't even in index + * attribute order (if the qual involves multiple attributes). The scan's + * required keys will still be in attribute order, though, so it can't matter. + * + * This scheme ensures that _bt_first always uses the same set of keys at the + * start of a forwards scan as those _bt_checkkeys uses to determine when to + * end a similar backwards scan (and vice-versa). _bt_advance_array_keys + * depends on this: it expects to be able to reliably predict what the next + * _bt_first call will do by testing whether _bt_checkkeys' routines report + * that the final tuple on the page is past the end of matches for the scan's + * keys with the scan direction flipped. If it is (if continuescan=false), + * then it follows that calling _bt_first will, at a minimum, relocate the + * scan to the very next leaf page (in the current scan direction). * * As a byproduct of this work, we can detect contradictory quals such * as "x = 1 AND x > 2".
If we see that, we return so->qual_ok = false, @@ -165,13 +181,18 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * array will generate its array elements from a range that's constrained by * any merged input inequalities (which won't get output in so->keyData[]). * - * Row comparison keys currently have a couple of notable limitations. - * Right now we just transfer them into the preprocessed array without any - * editorialization. We can treat them the same as an ordinary inequality - * comparison on the row's first index column, for the purposes of the logic - * about required keys. Also, we are unable to merge a row comparison key - * into a skip array (only ordinary inequalities are merged). A key that - * comes after a Row comparison key is therefore never marked as required. + * Row compares are treated as ordinary inequality comparisons on the row's + * first index column whenever possible. We treat their first subkey as if it + * was a simple scalar inequality for the purposes of the logic about required + * keys. This also gives us limited ability to detect contradictory/redundant + * conditions involving a row compare: we can do so whenever it involves an + * SK_ISNULL condition on a row compare's first column (the same rules used + * with simple inequalities work just as well here). We have no ability to + * detect redundant/contradictory conditions in any other row compare case. + * Note in particular that we are unable to merge a row comparison key into a + * skip array (only ordinary inequalities are merged). Any so->keyData[] key + * on a column that comes after a row comparison's first column can therefore + * never be marked as required at present. * * Note: the reason we have to copy the preprocessed scan keys into private * storage is that we are modifying the array based on comparisons of the @@ -188,7 +209,8 @@ _bt_preprocess_keys(IndexScanDesc scan) int numberOfEqualCols; ScanKey inkeys; BTScanKeyPreproc xform[BTMaxStrategyNumber]; - bool test_result; + bool test_result, + redundant_key_kept = false; AttrNumber attno; ScanKey arrayKeyData; int *keyDataMap = NULL; @@ -388,7 +410,8 @@ _bt_preprocess_keys(IndexScanDesc scan) xform[j].inkey = NULL; xform[j].inkeyi = -1; } - /* else, cannot determine redundancy, keep both keys */ + else + redundant_key_kept = true; } /* track number of attrs for which we have "=" keys */ numberOfEqualCols++; @@ -409,6 +432,8 @@ _bt_preprocess_keys(IndexScanDesc scan) else xform[BTLessStrategyNumber - 1].inkey = NULL; } + else + redundant_key_kept = true; } /* try to keep only one of >, >= */ @@ -426,6 +451,8 @@ _bt_preprocess_keys(IndexScanDesc scan) else xform[BTGreaterStrategyNumber - 1].inkey = NULL; } + else + redundant_key_kept = true; } /* @@ -466,25 +493,6 @@ _bt_preprocess_keys(IndexScanDesc scan) /* check strategy this key's operator corresponds to */ j = inkey->sk_strategy - 1; - /* if row comparison, push it directly to the output array */ - if (inkey->sk_flags & SK_ROW_HEADER) - { - ScanKey outkey = &so->keyData[new_numberOfKeys++]; - - memcpy(outkey, inkey, sizeof(ScanKeyData)); - if (arrayKeyData) - keyDataMap[new_numberOfKeys - 1] = i; - if (numberOfEqualCols == attno - 1) - _bt_mark_scankey_required(outkey); - - /* - * We don't support RowCompare using equality; such a qual would - * mess up the numberOfEqualCols tracking. 
- */ - Assert(j != (BTEqualStrategyNumber - 1)); - continue; - } - if (inkey->sk_strategy == BTEqualStrategyNumber && (inkey->sk_flags & SK_SEARCHARRAY)) { @@ -593,9 +601,8 @@ _bt_preprocess_keys(IndexScanDesc scan) * the new scan key. * * Note: We do things this way around so that our arrays are - * always in the same order as their corresponding scan keys, - * even with incomplete opfamilies. _bt_advance_array_keys - * depends on this. + * always in the same order as their corresponding scan keys. + * _bt_preprocess_array_keys_final expects this. */ ScanKey outkey = &so->keyData[new_numberOfKeys++]; @@ -607,6 +614,7 @@ _bt_preprocess_keys(IndexScanDesc scan) xform[j].inkey = inkey; xform[j].inkeyi = i; xform[j].arrayidx = arrayidx; + redundant_key_kept = true; } } } @@ -622,6 +630,15 @@ _bt_preprocess_keys(IndexScanDesc scan) if (arrayKeyData) _bt_preprocess_array_keys_final(scan, keyDataMap); + /* + * If there are remaining redundant inequality keys, we must make sure + * that each index attribute has no more than one required >/>= key, and + * no more than one required qual_ok) + _bt_unmark_keys(scan, keyDataMap); + /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */ } @@ -746,9 +763,12 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) * * Depending on the operator type, the key may be required for both scan * directions or just one. Also, if the key is a row comparison header, - * we have to mark its first subsidiary ScanKey as required. (Subsequent - * subsidiary ScanKeys are normally for lower-order columns, and thus - * cannot be required, since they're after the first non-equality scankey.) + * we have to mark the appropriate subsidiary ScanKeys as required. In such + * cases, the first subsidiary key is required, but subsequent ones are + * required only as long as they correspond to successive index columns and + * match the leading column as to sort direction. Otherwise the row + * comparison ordering is different from the index ordering and so we can't + * stop the scan on the basis of those lower-order columns. * * Note: when we set required-key flag bits in a subsidiary scankey, we are * scribbling on a data structure belonging to the index AM's caller, not on @@ -786,12 +806,25 @@ _bt_mark_scankey_required(ScanKey skey) if (skey->sk_flags & SK_ROW_HEADER) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + AttrNumber attno = skey->sk_attno; /* First subkey should be same column/operator as the header */ - Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(subkey->sk_attno == skey->sk_attno); + Assert(subkey->sk_attno == attno); Assert(subkey->sk_strategy == skey->sk_strategy); - subkey->sk_flags |= addflags; + + for (;;) + { + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_attno != attno) + break; /* non-adjacent key, so not required */ + if (subkey->sk_strategy != skey->sk_strategy) + break; /* wrong direction, so not required */ + subkey->sk_flags |= addflags; + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + attno++; + } } } @@ -847,8 +880,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, cmp_op; StrategyNumber strat; - Assert(!((leftarg->sk_flags | rightarg->sk_flags) & - (SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_MEMBER)); /* * First, deal with cases where one or both args are NULL. 
This should @@ -924,6 +956,16 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, return true; } + /* + * We don't yet know how to determine redundancy when it involves a row + * compare key (barring simple cases involving IS NULL/IS NOT NULL) + */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_HEADER) + { + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP)); + return false; + } + /* * If either leftarg or rightarg are equality-type array scankeys, we need * specialized handling (since by now we know that IS NULL wasn't used) @@ -1379,6 +1421,7 @@ _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk, Datum orig_sk_argument = high_compare->sk_argument, new_sk_argument; bool uflow; + int16 lookupstrat; Assert(high_compare->sk_strategy == BTLessStrategyNumber); @@ -1400,9 +1443,14 @@ _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk, return; } - /* Look up <= operator (might fail) */ - leop = get_opfamily_member(opfamily, opcintype, opcintype, - BTLessEqualStrategyNumber); + /* + * Look up <= operator (might fail), accounting for the fact that a + * high_compare on a DESC column already had its strategy commuted + */ + lookupstrat = BTLessEqualStrategyNumber; + if (high_compare->sk_flags & SK_BT_DESC) + lookupstrat = BTGreaterEqualStrategyNumber; /* commute this too */ + leop = get_opfamily_member(opfamily, opcintype, opcintype, lookupstrat); if (!OidIsValid(leop)) return; cmp_proc = get_opcode(leop); @@ -1431,6 +1479,7 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, Datum orig_sk_argument = low_compare->sk_argument, new_sk_argument; bool oflow; + int16 lookupstrat; Assert(low_compare->sk_strategy == BTGreaterStrategyNumber); @@ -1452,9 +1501,14 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, return; } - /* Look up >= operator (might fail) */ - geop = get_opfamily_member(opfamily, opcintype, opcintype, - BTGreaterEqualStrategyNumber); + /* + * Look up >= operator (might fail), accounting for the fact that a + * low_compare on a DESC column already had its strategy commuted + */ + lookupstrat = BTGreaterEqualStrategyNumber; + if (low_compare->sk_flags & SK_BT_DESC) + lookupstrat = BTLessEqualStrategyNumber; /* commute this too */ + geop = get_opfamily_member(opfamily, opcintype, opcintype, lookupstrat); if (!OidIsValid(geop)) return; cmp_proc = get_opcode(geop); @@ -1467,6 +1521,283 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, } } +/* + * _bt_unmark_keys() -- make superfluous required keys nonrequired after all + * + * When _bt_preprocess_keys fails to eliminate one or more redundant keys, it + * calls here to make sure that no index attribute has more than one > or >= + * key marked required, and no more than one required < or <= key. Attributes + * with = keys will always get one = key as their required key. All other + * keys that were initially marked required get "unmarked" here. That way, + * _bt_first and _bt_checkkeys will reliably agree on which keys to use to + * start and/or to end the scan. + * + * We also relocate keys that become/started out nonrequired to the end of + * so->keyData[]. That way, _bt_first and _bt_checkkeys cannot fail to reach + * a required key due to some earlier nonrequired key getting in the way. + * + * Only call here when _bt_compare_scankey_args returned false at least once + * (otherwise, calling here will just waste cycles). 
+ */ +static void +_bt_unmark_keys(IndexScanDesc scan, int *keyDataMap) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + AttrNumber attno; + bool *unmarkikey; + int nunmark, + nunmarked, + nkept, + firsti; + ScanKey keepKeys, + unmarkKeys; + FmgrInfo *keepOrderProcs = NULL, + *unmarkOrderProcs = NULL; + bool haveReqEquals, + haveReqForward, + haveReqBackward; + + /* + * Do an initial pass over so->keyData[] that determines which keys to + * keep as required. We expect so->keyData[] to still be in attribute + * order when we're called (though we don't expect any particular order + * among each attribute's keys). + * + * When both equality and inequality keys remain on a single attribute, we + * *must* make sure that exactly one of the equalities remains required. + * Any requiredness markings that we might leave on later keys/attributes + * are predicated on there being required = keys on all prior columns. + */ + unmarkikey = palloc0(so->numberOfKeys * sizeof(bool)); + nunmark = 0; + + /* Set things up for first key's attribute */ + attno = so->keyData[0].sk_attno; + firsti = 0; + haveReqEquals = false; + haveReqForward = false; + haveReqBackward = false; + for (int i = 0; i < so->numberOfKeys; i++) + { + ScanKey origkey = &so->keyData[i]; + + if (origkey->sk_attno != attno) + { + /* Reset for next attribute */ + attno = origkey->sk_attno; + firsti = i; + + haveReqEquals = false; + haveReqForward = false; + haveReqBackward = false; + } + + /* Equalities get priority over inequalities */ + if (haveReqEquals) + { + /* + * We already found the first "=" key for this attribute. We've + * already decided that all its other keys will be unmarked. + */ + Assert(!(origkey->sk_flags & SK_SEARCHNULL)); + unmarkikey[i] = true; + nunmark++; + continue; + } + else if ((origkey->sk_flags & SK_BT_REQFWD) && + (origkey->sk_flags & SK_BT_REQBKWD)) + { + /* + * Found the first "=" key for attno. All other attno keys will + * be unmarked. + */ + Assert(origkey->sk_strategy == BTEqualStrategyNumber); + + haveReqEquals = true; + for (int j = firsti; j < i; j++) + { + /* Unmark any prior inequality keys on attno after all */ + if (!unmarkikey[j]) + { + unmarkikey[j] = true; + nunmark++; + } + } + continue; + } + + /* Deal with inequalities next */ + if ((origkey->sk_flags & SK_BT_REQFWD) && !haveReqForward) + { + haveReqForward = true; + continue; + } + else if ((origkey->sk_flags & SK_BT_REQBKWD) && !haveReqBackward) + { + haveReqBackward = true; + continue; + } + + /* + * We have either a redundant inequality key that will be unmarked, or + * we have a key that wasn't marked required in the first place + */ + unmarkikey[i] = true; + nunmark++; + } + + /* Should only be called when _bt_compare_scankey_args reported failure */ + Assert(nunmark > 0); + + /* + * Next, allocate temp arrays: one for required keys that'll remain + * required, the other for all remaining keys + */ + unmarkKeys = palloc(nunmark * sizeof(ScanKeyData)); + keepKeys = palloc((so->numberOfKeys - nunmark) * sizeof(ScanKeyData)); + nunmarked = 0; + nkept = 0; + if (so->numArrayKeys) + { + unmarkOrderProcs = palloc(nunmark * sizeof(FmgrInfo)); + keepOrderProcs = palloc((so->numberOfKeys - nunmark) * sizeof(FmgrInfo)); + } + + /* + * Next, copy the contents of so->keyData[] into the appropriate temp + * array. + * + * Scans with = array keys need us to maintain invariants around the order + * of so->orderProcs[] and so->arrayKeys[] relative to so->keyData[]. See + * _bt_preprocess_array_keys_final for a full explanation. 
+ */ + for (int i = 0; i < so->numberOfKeys; i++) + { + ScanKey origkey = &so->keyData[i]; + ScanKey unmark; + + if (!unmarkikey[i]) + { + /* + * Key gets to keep its original requiredness markings. + * + * Key will stay in its original position, unless we're going to + * unmark an earlier key (in which case this key gets moved back). + */ + memcpy(keepKeys + nkept, origkey, sizeof(ScanKeyData)); + + if (so->numArrayKeys) + { + keyDataMap[i] = nkept; + memcpy(keepOrderProcs + nkept, &so->orderProcs[i], + sizeof(FmgrInfo)); + } + + nkept++; + continue; + } + + /* + * Key will be unmarked as needed, and moved to the end of the array, + * next to other keys that will become (or always were) nonrequired + */ + unmark = unmarkKeys + nunmarked; + memcpy(unmark, origkey, sizeof(ScanKeyData)); + + if (so->numArrayKeys) + { + keyDataMap[i] = (so->numberOfKeys - nunmark) + nunmarked; + memcpy(&unmarkOrderProcs[nunmarked], &so->orderProcs[i], + sizeof(FmgrInfo)); + } + + /* + * Preprocessing only generates skip arrays when it knows that they'll + * be the only required = key on the attr. We'll never unmark them. + */ + Assert(!(unmark->sk_flags & SK_BT_SKIP)); + + /* + * Also shouldn't have to unmark an IS NULL or an IS NOT NULL key. + * They aren't cross-type, so an incomplete opfamily can't matter. + */ + Assert(!(unmark->sk_flags & SK_ISNULL) || + !(unmark->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))); + + /* Clear requiredness flags on redundant key (and on any subkeys) */ + unmark->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD); + if (unmark->sk_flags & SK_ROW_HEADER) + { + ScanKey subkey = (ScanKey) DatumGetPointer(unmark->sk_argument); + + Assert(subkey->sk_strategy == unmark->sk_strategy); + for (;;) + { + Assert(subkey->sk_flags & SK_ROW_MEMBER); + subkey->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD); + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + } + } + + nunmarked++; + } + + /* Copy both temp arrays back into so->keyData[] to reorder */ + Assert(nkept == so->numberOfKeys - nunmark); + Assert(nunmarked == nunmark); + memcpy(so->keyData, keepKeys, sizeof(ScanKeyData) * nkept); + memcpy(so->keyData + nkept, unmarkKeys, sizeof(ScanKeyData) * nunmarked); + + /* Done with temp arrays */ + pfree(unmarkikey); + pfree(keepKeys); + pfree(unmarkKeys); + + /* + * Now copy so->orderProcs[] temp entries needed by scans with = array + * keys back (just like with the so->keyData[] temp arrays) + */ + if (so->numArrayKeys) + { + memcpy(so->orderProcs, keepOrderProcs, sizeof(FmgrInfo) * nkept); + memcpy(so->orderProcs + nkept, unmarkOrderProcs, + sizeof(FmgrInfo) * nunmarked); + + /* Also fix-up array->scan_key references */ + for (int arridx = 0; arridx < so->numArrayKeys; arridx++) + { + BTArrayKeyInfo *array = &so->arrayKeys[arridx]; + + array->scan_key = keyDataMap[array->scan_key]; + } + + /* + * Sort so->arrayKeys[] based on its new BTArrayKeyInfo.scan_key + * offsets, so that its order matches so->keyData[] order as expected + */ + qsort(so->arrayKeys, so->numArrayKeys, sizeof(BTArrayKeyInfo), + _bt_reorder_array_cmp); + + /* Done with temp arrays */ + pfree(unmarkOrderProcs); + pfree(keepOrderProcs); + } +} + +/* + * qsort comparator for reordering so->arrayKeys[] BTArrayKeyInfo entries + */ +static int +_bt_reorder_array_cmp(const void *a, const void *b) +{ + BTArrayKeyInfo *arraya = (BTArrayKeyInfo *) a; + BTArrayKeyInfo *arrayb = (BTArrayKeyInfo *) b; + + return pg_cmp_s32(arraya->scan_key, arrayb->scan_key); +} + /* * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys * 
@@ -1532,6 +1863,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) * (also checks if we should add extra skip arrays based on input keys) */ numArrayKeys = _bt_num_array_keys(scan, skip_eq_ops, &numSkipArrayKeys); + so->skipScan = (numSkipArrayKeys > 0); /* Quit if nothing to do. */ if (numArrayKeys == 0) @@ -1561,7 +1893,6 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) arrayKeyData = (ScanKey) palloc(numArrayKeyData * sizeof(ScanKeyData)); /* Allocate space for per-array data in the workspace context */ - so->skipScan = (numSkipArrayKeys > 0); so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo)); /* Allocate space for ORDER procs used to help _bt_checkkeys */ @@ -2247,7 +2578,7 @@ _bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops_out, static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype, StrategyNumber strat, - Datum *elems, int nelems) + const Datum *elems, int nelems) { Relation rel = scan->indexRelation; Oid cmp_op; diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c new file mode 100644 index 0000000000000..b3b8b55341108 --- /dev/null +++ b/src/backend/access/nbtree/nbtreadpage.c @@ -0,0 +1,3718 @@ +/*------------------------------------------------------------------------- + * + * nbtreadpage.c + * Leaf page reading for btree index scans. + * + * NOTES + * This file contains code to return items that satisfy the scan's + * search-type scan keys within a caller-supplied btree leaf page. + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtreadpage.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/relscan.h" +#include "storage/predicate.h" +#include "utils/datum.h" +#include "utils/rel.h" + + +/* + * _bt_readpage state used across _bt_checkkeys calls for a page + */ +typedef struct BTReadPageState +{ + /* Input parameters, set by _bt_readpage for _bt_checkkeys */ + ScanDirection dir; /* current scan direction */ + OffsetNumber minoff; /* Lowest non-pivot tuple's offset */ + OffsetNumber maxoff; /* Highest non-pivot tuple's offset */ + IndexTuple finaltup; /* Needed by scans with array keys */ + Page page; /* Page being read */ + bool firstpage; /* page is first for primitive scan? */ + bool forcenonrequired; /* treat all keys as nonrequired? */ + int startikey; /* start comparisons from this scan key */ + + /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */ + OffsetNumber offnum; /* current tuple's page offset number */ + + /* Output parameters, set by _bt_checkkeys for _bt_readpage */ + OffsetNumber skip; /* Array keys "look ahead" skip offnum */ + bool continuescan; /* Terminate ongoing (primitive) index scan?
*/ + + /* + * Private _bt_checkkeys state used to manage "look ahead" optimization + * and primscan scheduling (only used during scans with array keys) + */ + int16 rechecks; + int16 targetdistance; + int16 nskipadvances; + +} BTReadPageState; + + +static void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate); +static bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple finaltup); +static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple finaltup); +static void _bt_saveitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup); +static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, const ItemPointerData *heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset); +static bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, + IndexTuple tuple, int tupnatts); +static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + bool advancenonrequired, bool forcenonrequired, + bool *continuescan, int *ikey); +static bool _bt_check_rowcompare(ScanKey header, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + ScanDirection dir, bool forcenonrequired, bool *continuescan); +static bool _bt_rowcompare_cmpresult(ScanKey subkey, int cmpresult); +static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, TupleDesc tupdesc, int tupnatts, + bool readpagetup, int sktrig, bool *scanBehind); +static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, + int tupnatts, TupleDesc tupdesc); +static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + int sktrig, bool sktrig_required); +static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, + bool *skip_array_set); +static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static void _bt_array_set_low_or_high(Relation rel, ScanKey skey, + BTArrayKeyInfo *array, bool low_not_high); +static void _bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + int32 set_elem_result, Datum tupdatum, bool tupnull); +static void _bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur); +static void _bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); +#ifdef USE_ASSERT_CHECKING +static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); +#endif + + +/* + * _bt_readpage() -- Load data from current index page into so->currPos + * + * Caller must have pinned and read-locked so->currPos.buf; the buffer's state + * is not changed here. Also, currPos.moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of so->currPos are + * initialized from scratch here. + * + * We scan the current page starting at offnum and moving in the indicated + * direction. All items matching the scan keys are loaded into currPos.items. 
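[Illustrative aside, not part of the patch: once _bt_readpage returns true, the matching items sit buffered in so->currPos.items[]. A hedged sketch of the consumer side, simplified from the pattern the scan-step code uses; the field names are the real BTScanOpaque/IndexScanDesc ones:]

	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];

	/* Return the buffered heap TID (and index tuple, if requested) */
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);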
+ * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports + * that there can be no more matching tuples in the current scan direction + * (could just be for the current primitive index scan when scan has arrays). + * + * In the case of a parallel scan, caller must have called _bt_parallel_seize + * prior to calling this function; this function will invoke + * _bt_parallel_release before returning. + * + * Returns true if any matching items found on the page, false if none. + */ +bool +_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, + bool firstpage) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + BTReadPageState pstate; + bool arrayKeys, + ignore_killed_tuples = scan->ignore_killed_tuples; + int itemIndex, + indnatts; + + /* save the page/buffer block number, along with its sibling links */ + page = BufferGetPage(so->currPos.buf); + opaque = BTPageGetOpaque(page); + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + so->currPos.prevPage = opaque->btpo_prev; + so->currPos.nextPage = opaque->btpo_next; + /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */ + pstate.dir = so->currPos.dir = dir; + so->currPos.nextTupleOffset = 0; + + /* either moreRight or moreLeft should be set now (may be unset later) */ + Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : + so->currPos.moreLeft); + Assert(!P_IGNORE(opaque)); + Assert(BTScanPosIsPinned(so->currPos)); + Assert(!so->needPrimScan); + + /* initialize local variables */ + indnatts = IndexRelationGetNumberOfAttributes(rel); + arrayKeys = so->numArrayKeys != 0; + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* initialize page-level state that we'll pass to _bt_checkkeys */ + pstate.minoff = minoff; + pstate.maxoff = maxoff; + pstate.finaltup = NULL; + pstate.page = page; + pstate.firstpage = firstpage; + pstate.forcenonrequired = false; + pstate.startikey = 0; + pstate.offnum = InvalidOffsetNumber; + pstate.skip = InvalidOffsetNumber; + pstate.continuescan = true; /* default assumption */ + pstate.rechecks = 0; + pstate.targetdistance = 0; + pstate.nskipadvances = 0; + + if (scan->parallel_scan) + { + /* allow next/prev page to be read by other worker without delay */ + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, so->currPos.nextPage, + so->currPos.currPage); + else + _bt_parallel_release(scan, so->currPos.prevPage, + so->currPos.currPage); + } + + PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + + if (ScanDirectionIsForward(dir)) + { + /* SK_SEARCHARRAY forward scans must provide high key up front */ + if (arrayKeys) + { + if (!P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + + if (unlikely(so->scanBehind) && + !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) + { + /* Schedule another primitive index scan after all */ + so->currPos.moreRight = false; + so->needPrimScan = true; + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, + so->currPos.currPage); + return false; + } + } + + so->scanBehind = so->oppositeDirCheck = false; /* reset */ + } + + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, &pstate); + + /* load items[] in 
ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual + */ + if (ignore_killed_tuples && ItemIdIsDead(iid)) + { + offnum = OffsetNumberNext(offnum); + continue; + } + + itup = (IndexTuple) PageGetItem(page, iid); + Assert(!BTreeTupleIsPivot(itup)); + + pstate.offnum = offnum; + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); + + /* + * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) + */ + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum < pstate.skip); + Assert(!pstate.forcenonrequired); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + + if (passes_quals) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + int tupleOffset; + + /* Set up posting list state (and remember first TID) */ + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + + /* Remember all later TIDs (must be at least one) */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + itemIndex++; + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!pstate.continuescan) + break; + + offnum = OffsetNumberNext(offnum); + } + + /* + * We don't need to visit page to the right when the high key + * indicates that no more matches will be found there. + * + * Checking the high key like this works out more often than you might + * think. Leaf page splits pick a split point between the two most + * dissimilar tuples (this is weighed against the need to evenly share + * free space). Leaf pages with high key attribute values that can + * only appear on non-pivot tuples on the right sibling page are + * common. 
+ */ + if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + IndexTuple itup = (IndexTuple) PageGetItem(page, iid); + int truncatt; + + /* Reset arrays, per _bt_set_startikey contract */ + if (pstate.forcenonrequired) + _bt_start_array_keys(scan, dir); + pstate.forcenonrequired = false; + pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */ + + truncatt = BTreeTupleGetNAtts(itup, rel); + _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); + } + + if (!pstate.continuescan) + so->currPos.moreRight = false; + + Assert(itemIndex <= MaxTIDsPerBTreePage); + so->currPos.firstItem = 0; + so->currPos.lastItem = itemIndex - 1; + so->currPos.itemIndex = 0; + } + else + { + /* SK_SEARCHARRAY backward scans must provide final tuple up front */ + if (arrayKeys) + { + if (minoff <= maxoff && !P_LEFTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, minoff); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + + if (unlikely(so->scanBehind) && + !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) + { + /* Schedule another primitive index scan after all */ + so->currPos.moreLeft = false; + so->needPrimScan = true; + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, + so->currPos.currPage); + return false; + } + } + + so->scanBehind = so->oppositeDirCheck = false; /* reset */ + } + + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, &pstate); + + /* load items[] in descending order */ + itemIndex = MaxTIDsPerBTreePage; + + offnum = Min(offnum, maxoff); + + while (offnum >= minoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool tuple_alive; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual. Most of the + * time, it's a win to not bother examining the tuple's index + * keys, but just skip to the next tuple (previous, actually, + * since we're scanning backwards). However, if this is the first + * tuple on the page, we do check the index keys, to prevent + * uselessly advancing to the page to the left. This is similar + * to the high key optimization used by forward scans. + */ + if (ignore_killed_tuples && ItemIdIsDead(iid)) + { + if (offnum > minoff) + { + offnum = OffsetNumberPrev(offnum); + continue; + } + + tuple_alive = false; + } + else + tuple_alive = true; + + itup = (IndexTuple) PageGetItem(page, iid); + Assert(!BTreeTupleIsPivot(itup)); + + pstate.offnum = offnum; + if (arrayKeys && offnum == minoff && pstate.forcenonrequired) + { + /* Reset arrays, per _bt_set_startikey contract */ + pstate.forcenonrequired = false; + pstate.startikey = 0; + _bt_start_array_keys(scan, dir); + } + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); + + if (arrayKeys && so->scanBehind) + { + /* + * Done scanning this page, but not done with the current + * primscan. + * + * Note: Forward scans don't check this explicitly, since they + * prefer to reuse pstate.skip for this instead. 
+ */ + Assert(!passes_quals && pstate.continuescan); + Assert(!pstate.forcenonrequired); + + break; + } + + /* + * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) + */ + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum > pstate.skip); + Assert(!pstate.forcenonrequired); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + + if (passes_quals && tuple_alive) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + uint16 nitems = BTreeTupleGetNPosting(itup); + int tupleOffset; + + /* Set up posting list state (and remember last TID) */ + itemIndex--; + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, nitems - 1), + itup); + + /* Remember all prior TIDs (must be at least one) */ + for (int i = nitems - 2; i >= 0; i--) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!pstate.continuescan) + break; + + offnum = OffsetNumberPrev(offnum); + } + + /* + * We don't need to visit page to the left when no more matches will + * be found there + */ + if (!pstate.continuescan) + so->currPos.moreLeft = false; + + Assert(itemIndex >= 0); + so->currPos.firstItem = itemIndex; + so->currPos.lastItem = MaxTIDsPerBTreePage - 1; + so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + } + + /* + * If _bt_set_startikey told us to temporarily treat the scan's keys as + * nonrequired (possible only during scans with array keys), there must be + * no lasting consequences for the scan's array keys. The scan's arrays + * should now have exactly the same elements as they would have had if the + * nonrequired behavior had never been used. (In general, a scan's arrays + * are expected to track its progress through the index's key space.) + * + * We are required (by _bt_set_startikey) to call _bt_checkkeys against + * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's + * arrays to recover. Assert that that step hasn't been missed. + */ + Assert(!pstate.forcenonrequired); + + return (so->currPos.firstItem <= so->currPos.lastItem); +} + +/* + * _bt_start_array_keys() -- Initialize array keys at start of a scan + * + * Set up the cur_elem counters and fill in the first sk_argument value for + * each array scankey. + */ +void +_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + Assert(so->numArrayKeys); + Assert(so->qual_ok); + + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + + _bt_array_set_low_or_high(rel, skey, array, + ScanDirectionIsForward(dir)); + } + so->scanBehind = so->oppositeDirCheck = false; /* reset */ +} + +/* + * Determines an offset to the first scan key (an so->keyData[]-wise offset) + * that is _not_ guaranteed to be satisfied by every tuple from pstate.page, + * which is set in pstate.startikey for _bt_checkkeys calls for the page. + * This allows caller to save cycles on comparisons of a prefix of keys while + * reading pstate.page. 
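[Illustrative aside, not part of the patch: a hedged sketch of what honoring pstate.startikey looks like on the per-tuple side. Keys below startikey were proven, once per page, to be satisfied by every non-pivot tuple, so each tuple's key check can begin partway into so->keyData[]:]

	for (int ikey = pstate->startikey; ikey < so->numberOfKeys; ikey++)
	{
		ScanKey		key = so->keyData + ikey;

		/* ... evaluate key against the current tuple as usual ... */
	}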
+ * + * Also determines if later calls to _bt_checkkeys (for pstate.page) should be + * forced to treat all required scan keys >= pstate.startikey as nonrequired + * (that is, if they're to be treated as if any SK_BT_REQFWD/SK_BT_REQBKWD + * markings that were set by preprocessing were not set at all, for the + * duration of _bt_checkkeys calls prior to the call for pstate.finaltup). + * This is indicated to caller by setting pstate.forcenonrequired. + * + * Call here at the start of reading a leaf page beyond the first one for the + * primitive index scan. We consider all non-pivot tuples, so it doesn't make + * sense to call here when only a subset of those tuples can ever be read. + * This is also a good idea on performance grounds; not calling here when on + * the first page (first for the current primitive scan) avoids wasting cycles + * during selective point queries. They typically don't stand to gain as much + * when we can set pstate.startikey, and are likely to notice the overhead of + * calling here. (Also, allowing pstate.forcenonrequired to be set on a + * primscan's first page would mislead _bt_advance_array_keys, which expects + * pstate.nskipadvances to be representative of every first page's key space.) + * + * Caller must call _bt_start_array_keys and reset startikey/forcenonrequired + * ahead of the finaltup _bt_checkkeys call when we set forcenonrequired=true. + * This will give _bt_checkkeys the opportunity to call _bt_advance_array_keys + * with sktrig_required=true, restoring the invariant that the scan's required + * arrays always track the scan's progress through the index's key space. + * Caller won't need to do this on the rightmost/leftmost page in the index + * (where pstate.finaltup isn't ever set), since forcenonrequired will never + * be set here in the first place. 
+ */ +static void +_bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + TupleDesc tupdesc = RelationGetDescr(rel); + ItemId iid; + IndexTuple firsttup, + lasttup; + int startikey = 0, + arrayidx = 0, + firstchangingattnum; + bool start_past_saop_eq = false; + + Assert(!so->scanBehind); + Assert(pstate->minoff < pstate->maxoff); + Assert(!pstate->firstpage); + Assert(pstate->startikey == 0); + Assert(!so->numArrayKeys || pstate->finaltup || + P_RIGHTMOST(BTPageGetOpaque(pstate->page)) || + P_LEFTMOST(BTPageGetOpaque(pstate->page))); + + if (so->numberOfKeys == 0) + return; + + /* minoff is an offset to the lowest non-pivot tuple on the page */ + iid = PageGetItemId(pstate->page, pstate->minoff); + firsttup = (IndexTuple) PageGetItem(pstate->page, iid); + + /* maxoff is an offset to the highest non-pivot tuple on the page */ + iid = PageGetItemId(pstate->page, pstate->maxoff); + lasttup = (IndexTuple) PageGetItem(pstate->page, iid); + + /* Determine the first attribute whose values change on caller's page */ + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + + for (; startikey < so->numberOfKeys; startikey++) + { + ScanKey key = so->keyData + startikey; + BTArrayKeyInfo *array; + Datum firstdatum, + lastdatum; + bool firstnull, + lastnull; + int32 result; + + /* + * Determine if it's safe to set pstate.startikey to an offset to a + * key that comes after this key, by examining this key + */ + if (key->sk_flags & SK_ROW_HEADER) + { + /* RowCompare inequality (header key) */ + ScanKey subkey = (ScanKey) DatumGetPointer(key->sk_argument); + bool satisfied = false; + + for (;;) + { + int cmpresult; + bool firstsatisfies = false; + + if (subkey->sk_attno > firstchangingattnum) /* >, not >= */ + break; /* unsafe, preceding attr has multiple + * distinct values */ + + if (subkey->sk_flags & SK_ISNULL) + break; /* unsafe, unsatisfiable NULL subkey arg */ + + firstdatum = index_getattr(firsttup, subkey->sk_attno, + tupdesc, &firstnull); + lastdatum = index_getattr(lasttup, subkey->sk_attno, + tupdesc, &lastnull); + + if (firstnull || lastnull) + break; /* unsafe, NULL value won't satisfy subkey */ + + /* + * Compare the first tuple's datum for this row compare member + */ + cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, + subkey->sk_collation, + firstdatum, + subkey->sk_argument)); + if (subkey->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(cmpresult); + + if (cmpresult != 0 || (subkey->sk_flags & SK_ROW_END)) + { + firstsatisfies = _bt_rowcompare_cmpresult(subkey, + cmpresult); + if (!firstsatisfies) + { + /* Unsafe, firstdatum does not satisfy subkey */ + break; + } + } + + /* + * Compare the last tuple's datum for this row compare member + */ + cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, + subkey->sk_collation, + lastdatum, + subkey->sk_argument)); + if (subkey->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(cmpresult); + + if (cmpresult != 0 || (subkey->sk_flags & SK_ROW_END)) + { + if (!firstsatisfies) + { + /* + * It's only safe to set startikey beyond the row + * compare header key when both firsttup and lasttup + * satisfy the key as a whole based on the same + * deciding subkey/attribute. That can't happen now. 
+ */ + break; /* unsafe */ + } + + satisfied = _bt_rowcompare_cmpresult(subkey, cmpresult); + break; /* safe iff 'satisfied' is true */ + } + + /* Move on to next row member/subkey */ + if (subkey->sk_flags & SK_ROW_END) + break; /* defensive */ + subkey++; + + /* + * We deliberately don't check if the next subkey has the same + * strategy as this iteration's subkey (which happens when + * subkeys for both ASC and DESC columns are used together), + * nor if any subkey is marked required. This is safe because + * in general all prior index attributes must have only one + * distinct value (across all of the tuples on the page) in + * order for us to even consider any subkey's attribute. + */ + } + + if (satisfied) + { + /* Safe, row compare satisfied by every tuple on page */ + continue; + } + + break; /* unsafe */ + } + if (key->sk_strategy != BTEqualStrategyNumber) + { + /* + * Scalar inequality key. + * + * It's definitely safe for _bt_checkkeys to avoid assessing this + * inequality when the page's first and last non-pivot tuples both + * satisfy the inequality (since the same must also be true of all + * the tuples in between these two). + * + * Unlike the "=" case, it doesn't matter if this attribute has + * more than one distinct value (though it _is_ necessary for any + * and all _prior_ attributes to contain no more than one distinct + * value amongst all of the tuples from pstate.page). + */ + if (key->sk_attno > firstchangingattnum) /* >, not >= */ + break; /* unsafe, preceding attr has multiple + * distinct values */ + + firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull); + lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull); + + if (key->sk_flags & SK_ISNULL) + { + /* IS NOT NULL key */ + Assert(key->sk_flags & SK_SEARCHNOTNULL); + + if (firstnull || lastnull) + break; /* unsafe */ + + /* Safe, IS NOT NULL key satisfied by every tuple */ + continue; + } + + /* Test firsttup */ + if (firstnull || + !DatumGetBool(FunctionCall2Coll(&key->sk_func, + key->sk_collation, firstdatum, + key->sk_argument))) + break; /* unsafe */ + + /* Test lasttup */ + if (lastnull || + !DatumGetBool(FunctionCall2Coll(&key->sk_func, + key->sk_collation, lastdatum, + key->sk_argument))) + break; /* unsafe */ + + /* Safe, scalar inequality satisfied by every tuple */ + continue; + } + + /* Some = key (could be a scalar = key, could be an array = key) */ + Assert(key->sk_strategy == BTEqualStrategyNumber); + + if (!(key->sk_flags & SK_SEARCHARRAY)) + { + /* + * Scalar = key (possibly an IS NULL key). + * + * It is unsafe to set pstate.startikey to an ikey beyond this + * key, unless the = key is satisfied by every possible tuple on + * the page (possible only when attribute has just one distinct + * value among all tuples on the page). 
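+ *
+ * (Illustration: given an index on (a, b), a qual
+ * "WHERE a = 5 AND b > 42", and a page whose firsttup and lasttup both
+ * have a = 5, firstchangingattnum must be > 1. Every tuple on the page
+ * has a = 5, so it's safe for _bt_checkkeys to skip the "a = 5" key for
+ * every tuple that it reads from the page.)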
+ */ + if (key->sk_attno >= firstchangingattnum) + break; /* unsafe, multiple distinct attr values */ + + firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, + &firstnull); + if (key->sk_flags & SK_ISNULL) + { + /* IS NULL key */ + Assert(key->sk_flags & SK_SEARCHNULL); + + if (!firstnull) + break; /* unsafe */ + + /* Safe, IS NULL key satisfied by every tuple */ + continue; + } + if (firstnull || + !DatumGetBool(FunctionCall2Coll(&key->sk_func, + key->sk_collation, firstdatum, + key->sk_argument))) + break; /* unsafe */ + + /* Safe, scalar = key satisfied by every tuple */ + continue; + } + + /* = array key (could be a SAOP array, could be a skip array) */ + array = &so->arrayKeys[arrayidx++]; + Assert(array->scan_key == startikey); + if (array->num_elems != -1) + { + /* + * SAOP array = key. + * + * Handle this like we handle scalar = keys (though binary search + * for a matching element, to avoid relying on key's sk_argument). + */ + if (key->sk_attno >= firstchangingattnum) + break; /* unsafe, multiple distinct attr values */ + + firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, + &firstnull); + _bt_binsrch_array_skey(&so->orderProcs[startikey], + false, NoMovementScanDirection, + firstdatum, firstnull, array, key, + &result); + if (result != 0) + break; /* unsafe */ + + /* Safe, SAOP = key satisfied by every tuple */ + start_past_saop_eq = true; + continue; + } + + /* + * Skip array = key + */ + Assert(key->sk_flags & SK_BT_SKIP); + if (array->null_elem) + { + /* + * Non-range skip array = key. + * + * Safe, non-range skip array "satisfied" by every tuple on page + * (safe even when "key->sk_attno > firstchangingattnum"). + */ + continue; + } + + /* + * Range skip array = key. + * + * Handle this like we handle scalar inequality keys (but avoid using + * key's sk_argument directly, as in the SAOP array case). + */ + if (key->sk_attno > firstchangingattnum) /* >, not >= */ + break; /* unsafe, preceding attr has multiple + * distinct values */ + + firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull); + lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull); + + /* Test firsttup */ + _bt_binsrch_skiparray_skey(false, ForwardScanDirection, + firstdatum, firstnull, array, key, + &result); + if (result != 0) + break; /* unsafe */ + + /* Test lasttup */ + _bt_binsrch_skiparray_skey(false, ForwardScanDirection, + lastdatum, lastnull, array, key, + &result); + if (result != 0) + break; /* unsafe */ + + /* Safe, range skip array satisfied by every tuple on page */ + } + + /* + * Use of forcenonrequired is typically undesirable, since it'll force + * _bt_readpage caller to read every tuple on the page -- even though, in + * general, it might well be possible to end the scan on an earlier tuple. + * However, caller must use forcenonrequired when start_past_saop_eq=true, + * since the usual required array behavior might fail to roll over to the + * SAOP array. + * + * We always prefer forcenonrequired=true during scans with skip arrays + * (except on the first page of each primitive index scan), though -- even + * when "startikey == 0". That way, _bt_advance_array_keys's low-order + * key precheck optimization can always be used (unless on the first page + * of the scan). It seems slightly preferable to check more tuples when + * that allows us to do significantly less skip array maintenance. 
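+ *
+ * (Illustration: given an index on (a, b) and a qual "WHERE b = 42",
+ * preprocessing generates a skip array on "a". It will often be cheaper
+ * to check "b = 42" against every tuple on the page using nonrequired
+ * semantics than to advance the "a" skip array once per distinct "a"
+ * value that the page happens to contain.)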
+ */ + pstate->forcenonrequired = (start_past_saop_eq || so->skipScan); + pstate->startikey = startikey; + + /* + * _bt_readpage caller is required to call _bt_checkkeys against page's + * finaltup with forcenonrequired=false whenever we initially set + * forcenonrequired=true. That way the scan's arrays will reliably track + * its progress through the index's key space. + * + * We don't expect this when _bt_readpage caller has no finaltup due to + * its page being the rightmost (or the leftmost, during backwards scans). + * When we see that _bt_readpage has no finaltup, back out of everything. + */ + Assert(!pstate->forcenonrequired || so->numArrayKeys); + if (pstate->forcenonrequired && !pstate->finaltup) + { + pstate->forcenonrequired = false; + pstate->startikey = 0; + } +} + +/* + * Test whether caller's finaltup tuple is still before the start of matches + * for the current array keys. + * + * Called at the start of reading a page during a scan with array keys, though + * only when the so->scanBehind flag was set on the scan's prior page. + * + * Returns false if the tuple is still before the start of matches. When that + * happens, caller should cut its losses and start a new primitive index scan. + * Otherwise returns true. + */ +static bool +_bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple finaltup) +{ + Relation rel = scan->indexRelation; + TupleDesc tupdesc = RelationGetDescr(rel); + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel); + bool scanBehind; + + Assert(so->numArrayKeys); + + if (_bt_tuple_before_array_skeys(scan, dir, finaltup, tupdesc, + nfinaltupatts, false, 0, &scanBehind)) + return false; + + /* + * If scanBehind was set, all of the untruncated attribute values from + * finaltup that correspond to an array match the array's current element, + * but there are other keys associated with truncated suffix attributes. + * Array advancement must have incremented the scan's arrays on the + * previous page, resulting in a set of array keys that happen to be an + * exact match for the current page high key's untruncated prefix values. + * + * This page definitely doesn't contain tuples that the scan will need to + * return. The next page may or may not contain relevant tuples. Handle + * this by cutting our losses and starting a new primscan. + */ + if (scanBehind) + return false; + + if (!so->oppositeDirCheck) + return true; + + return _bt_oppodir_checkkeys(scan, dir, finaltup); +} + +/* + * Test whether an indextuple fails to satisfy an inequality required in the + * opposite direction only. + * + * Caller's finaltup tuple is the page high key (for forwards scans), or the + * first non-pivot tuple (for backwards scans). Called during scans with + * required array keys and required opposite-direction inequalities. + * + * Returns false if an inequality scan key required in the opposite direction + * only isn't satisfied (and any earlier required scan keys are satisfied). + * Otherwise returns true. + * + * An unsatisfied inequality required in the opposite direction only might + * well enable skipping over many leaf pages, provided another _bt_first call + * takes place. This type of unsatisfied inequality won't usually cause + * _bt_checkkeys to stop the scan to consider array advancement/starting a new + * primitive index scan. 
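+ *
+ * For example (an illustrative scenario): during a forwards scan with a
+ * qual "WHERE a IN (1, 5) AND b > 1000" whose "a" array has already
+ * advanced to 5, the key "b > 1000" is only required in the backwards
+ * direction. If finaltup shows "a = 5, b = 42", then every "a = 5" tuple
+ * on the page must have b <= 42; starting another primitive index scan
+ * lets _bt_first reposition to the first "a = 5" tuple with b > 1000,
+ * possibly skipping many leaf pages along the way.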
+ */
+static bool
+_bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple finaltup)
+{
+ Relation rel = scan->indexRelation;
+ TupleDesc tupdesc = RelationGetDescr(rel);
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
+ bool continuescan;
+ ScanDirection flipped = -dir;
+ int ikey = 0;
+
+ Assert(so->numArrayKeys);
+
+ _bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc, false,
+ false, &continuescan,
+ &ikey);
+
+ if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber)
+ return false;
+
+ return true;
+}
+
+/* Save an index item into so->currPos.items[itemIndex] */
+static void
+_bt_saveitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = itup->t_tid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ Size itupsz = IndexTupleSize(itup);
+
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+ so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+ }
+}
+
+/*
+ * Set up state to save TIDs/items from a single posting list tuple.
+ *
+ * Saves an index item into so->currPos.items[itemIndex] for the TID that is
+ * returned to the scan first. Second or subsequent TIDs for the posting list
+ * should be saved by calling _bt_savepostingitem().
+ *
+ * Returns the offset into tuple storage space at which the main tuple is
+ * stored, when one is needed.
+ */
+static int
+_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ const ItemPointerData *heapTid, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ /* Save base IndexTuple (truncate posting list) */
+ IndexTuple base;
+ Size itupsz = BTreeTupleGetPostingOffset(itup);
+
+ itupsz = MAXALIGN(itupsz);
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
+ memcpy(base, itup, itupsz);
+ /* Defensively reduce work area index tuple header size */
+ base->t_info &= ~INDEX_SIZE_MASK;
+ base->t_info |= itupsz;
+ so->currPos.nextTupleOffset += itupsz;
+
+ return currItem->tupleOffset;
+ }
+
+ return 0;
+}
+
+/*
+ * Save an index item into so->currPos.items[itemIndex] for the current
+ * posting tuple.
+ *
+ * Assumes that _bt_setuppostingitems() has already been called for the
+ * current posting list tuple. Caller passes its return value as tupleOffset.
+ */
+static inline void
+_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+
+ /*
+ * Have index-only scans return the same base IndexTuple for every TID
+ * that originates from the same posting list
+ */
+ if (so->currTuples)
+ currItem->tupleOffset = tupleOffset;
+}
+
+#define LOOK_AHEAD_REQUIRED_RECHECKS 3
+#define LOOK_AHEAD_DEFAULT_DISTANCE 5
+#define NSKIPADVANCES_THRESHOLD 3
+
+/*
+ * Test whether an indextuple satisfies all the scankey conditions.
+ *
+ * Return true if so, false if not. 
If the tuple fails to pass the qual,
+ * we also determine whether there's any need to continue the scan beyond
+ * this tuple, and set pstate.continuescan accordingly. See comments for
+ * _bt_preprocess_keys() about how this is done.
+ *
+ * Forward scan callers can pass a high key tuple in the hopes of having
+ * us set *continuescan to false, and avoiding an unnecessary visit to
+ * the page to the right.
+ *
+ * Advances the scan's array keys when necessary for arrayKeys=true callers.
+ * Scans without any array keys must always pass arrayKeys=false.
+ *
+ * Also stops and starts primitive index scans for arrayKeys=true callers.
+ * Scans with array keys are required to set up page state that helps us with
+ * this. The page's finaltup tuple (the page high key for a forward scan, or
+ * the page's first non-pivot tuple for a backward scan) must be set in
+ * pstate.finaltup ahead of the first call here for the page. Set this to
+ * NULL for the rightmost page (or the leftmost page for backwards scans).
+ *
+ * scan: index scan descriptor (containing a search-type scankey)
+ * pstate: page level input and output parameters
+ * arrayKeys: should we advance the scan's array keys if necessary?
+ * tuple: index tuple to test
+ * tupnatts: number of attributes in tuple (high key may be truncated)
+ */
+static bool
+_bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
+ IndexTuple tuple, int tupnatts)
+{
+ TupleDesc tupdesc = RelationGetDescr(scan->indexRelation);
+ BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque;
+ ScanDirection dir = pstate->dir;
+ int ikey = pstate->startikey;
+ bool res;
+
+ Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
+ Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck);
+ Assert(arrayKeys || so->numArrayKeys == 0);
+
+ res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, arrayKeys,
+ pstate->forcenonrequired, &pstate->continuescan,
+ &ikey);
+
+ /*
+ * If _bt_check_compare relied on the pstate.startikey optimization, call
+ * again (in assert-enabled builds) to verify it didn't affect our answer.
+ *
+ * Note: we can't do this when pstate.forcenonrequired is set, since any
+ * arrays before pstate.startikey won't have advanced on this page at all.
+ */
+ Assert(!pstate->forcenonrequired || arrayKeys);
+#ifdef USE_ASSERT_CHECKING
+ if (pstate->startikey > 0 && !pstate->forcenonrequired)
+ {
+ bool dres,
+ dcontinuescan;
+ int dikey = 0;
+
+ /* Pass arrayKeys=false to avoid array side-effects */
+ dres = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false,
+ pstate->forcenonrequired, &dcontinuescan,
+ &dikey);
+ Assert(res == dres);
+ Assert(pstate->continuescan == dcontinuescan);
+
+ /*
+ * Should also get the same ikey result. We need a slightly weaker
+ * assertion during arrayKeys calls, since they might be using an
+ * array that couldn't be marked required during preprocessing.
+ */
+ Assert(arrayKeys || ikey == dikey);
+ Assert(ikey <= dikey);
+ }
+#endif
+
+ /*
+ * Only one _bt_check_compare call is required in the common case where
+ * there are no equality strategy array scan keys. Otherwise we can only
+ * accept _bt_check_compare's answer unreservedly when it didn't set
+ * pstate.continuescan=false.
+ */
+ if (!arrayKeys || pstate->continuescan)
+ return res;
+
+ /*
+ * _bt_check_compare call set continuescan=false in the presence of
+ * equality type array keys. This could mean that the tuple is just past
+ * the end of matches for the current array keys. 
+ * + * It's also possible that the scan is still _before_ the _start_ of + * tuples matching the current set of array keys. Check for that first. + */ + Assert(!pstate->forcenonrequired); + if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true, + ikey, NULL)) + { + /* Override _bt_check_compare, continue primitive scan */ + pstate->continuescan = true; + + /* + * We will end up here repeatedly given a group of tuples > the + * previous array keys and < the now-current keys (for a backwards + * scan it's just the same, though the operators swap positions). + * + * We must avoid allowing this linear search process to scan very many + * tuples from well before the start of tuples matching the current + * array keys (or from well before the point where we'll once again + * have to advance the scan's array keys). + * + * We keep the overhead under control by speculatively "looking ahead" + * to later still-unscanned items from this same leaf page. We'll + * only attempt this once the number of tuples that the linear search + * process has examined starts to get out of hand. + */ + pstate->rechecks++; + if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS) + { + /* See if we should skip ahead within the current leaf page */ + _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc); + + /* + * Might have set pstate.skip to a later page offset. When that + * happens then _bt_readpage caller will inexpensively skip ahead + * to a later tuple from the same page (the one just after the + * tuple we successfully "looked ahead" to). + */ + } + + /* This indextuple doesn't match the current qual, in any case */ + return false; + } + + /* + * Caller's tuple is >= the current set of array keys and other equality + * constraint scan keys (or <= if this is a backwards scan). It's now + * clear that we _must_ advance any required array keys in lockstep with + * the scan. + */ + return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc, + ikey, true); +} + +/* + * Test whether an indextuple satisfies current scan condition. + * + * Return true if so, false if not. If not, also sets *continuescan to false + * when it's also not possible for any later tuples to pass the current qual + * (with the scan's current set of array keys, in the current scan direction), + * in addition to setting *ikey to the so->keyData[] subscript/offset for the + * unsatisfied scan key (needed when caller must consider advancing the scan's + * array keys). + * + * This is a subroutine for _bt_checkkeys. We provisionally assume that + * reaching the end of the current set of required keys (in particular the + * current required array keys) ends the ongoing (primitive) index scan. + * Callers without array keys should just end the scan right away when they + * find that continuescan has been set to false here by us. Things are more + * complicated for callers with array keys. + * + * Callers with array keys must first consider advancing the arrays when + * continuescan has been set to false here by us. They must then consider if + * it really does make sense to end the current (primitive) index scan, in + * light of everything that is known at that point. (In general when we set + * continuescan=false for these callers it must be treated as provisional.) + * + * We deal with advancing unsatisfied non-required arrays directly, though. + * This is safe, since by definition non-required keys can't end the scan. 
+ * This is just how we determine if non-required arrays are just unsatisfied + * by the current array key, or if they're truly unsatisfied (that is, if + * they're unsatisfied by every possible array key). + * + * Pass advancenonrequired=false to avoid all array related side effects. + * This allows _bt_advance_array_keys caller to avoid infinite recursion. + * + * Pass forcenonrequired=true to instruct us to treat all keys as nonrequired. + * This is used to make it safe to temporarily stop properly maintaining the + * scan's required arrays. _bt_checkkeys caller (_bt_readpage, actually) + * determines a prefix of keys that must satisfy every possible corresponding + * index attribute value from its page, which is passed to us via *ikey arg + * (this is the first key that might be unsatisfied by tuples on the page). + * Obviously, we won't maintain any array keys from before *ikey, so it's + * quite possible for such arrays to "fall behind" the index's keyspace. + * Caller will need to "catch up" by passing forcenonrequired=true (alongside + * an *ikey=0) once the page's finaltup is reached. + * + * Note: it's safe to pass an *ikey > 0 with forcenonrequired=false, but only + * when caller determines that it won't affect array maintenance. + */ +static bool +_bt_check_compare(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + bool advancenonrequired, bool forcenonrequired, + bool *continuescan, int *ikey) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + *continuescan = true; /* default assumption */ + + for (; *ikey < so->numberOfKeys; (*ikey)++) + { + ScanKey key = so->keyData + *ikey; + Datum datum; + bool isNull; + bool requiredSameDir = false, + requiredOppositeDirOnly = false; + + /* + * Check if the key is required in the current scan direction, in the + * opposite scan direction _only_, or in neither direction (except + * when we're forced to treat all scan keys as nonrequired) + */ + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || + ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) + requiredSameDir = true; + else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || + ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) + requiredOppositeDirOnly = true; + + if (key->sk_attno > tupnatts) + { + /* + * This attribute is truncated (must be high key). The value for + * this attribute in the first non-pivot tuple on the page to the + * right could be any possible value. Assume that truncated + * attribute passes the qual. + */ + Assert(BTreeTupleIsPivot(tuple)); + continue; + } + + /* + * A skip array scan key uses one of several sentinel values. We just + * fall back on _bt_tuple_before_array_skeys when we see such a value. + */ + if (key->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR)) + { + Assert(key->sk_flags & SK_SEARCHARRAY); + Assert(key->sk_flags & SK_BT_SKIP); + Assert(requiredSameDir || forcenonrequired); + + /* + * Cannot fall back on _bt_tuple_before_array_skeys when we're + * treating the scan's keys as nonrequired, though. Just handle + * this like any other non-required equality-type array key. 
+ */ + if (forcenonrequired) + return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, + tupdesc, *ikey, false); + + *continuescan = false; + return false; + } + + /* row-comparison keys need special processing */ + if (key->sk_flags & SK_ROW_HEADER) + { + if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, + forcenonrequired, continuescan)) + continue; + return false; + } + + datum = index_getattr(tuple, + key->sk_attno, + tupdesc, + &isNull); + + if (key->sk_flags & SK_ISNULL) + { + /* Handle IS NULL/NOT NULL tests */ + if (key->sk_flags & SK_SEARCHNULL) + { + if (isNull) + continue; /* tuple satisfies this qual */ + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + Assert(!(key->sk_flags & SK_BT_SKIP)); + if (!isNull) + continue; /* tuple satisfies this qual */ + } + + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. + */ + if (requiredSameDir) + *continuescan = false; + else if (unlikely(key->sk_flags & SK_BT_SKIP)) + { + /* + * If we're treating scan keys as nonrequired, and encounter a + * skip array scan key whose current element is NULL, then it + * must be a non-range skip array. It must be satisfied, so + * there's no need to call _bt_advance_array_keys to check. + */ + Assert(forcenonrequired && *ikey > 0); + continue; + } + + /* + * This indextuple doesn't match the qual. + */ + return false; + } + + if (isNull) + { + /* + * Scalar scan key isn't satisfied by NULL tuple value. + * + * If we're treating scan keys as nonrequired, and key is for a + * skip array, then we must attempt to advance the array to NULL + * (if we're successful then the tuple might match the qual). + */ + if (unlikely(forcenonrequired && key->sk_flags & SK_BT_SKIP)) + return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, + tupdesc, *ikey, false); + + if (key->sk_flags & SK_BT_NULLS_FIRST) + { + /* + * Since NULLs are sorted before non-NULLs, we know we have + * reached the lower limit of the range of values for this + * index attr. On a backward scan, we can stop if this qual + * is one of the "must match" subset. We can stop regardless + * of whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a forward scan, however, we must keep going, because we may + * have initially positioned to the start of the index. + * (_bt_advance_array_keys also relies on this behavior during + * forward scans.) + */ + if ((requiredSameDir || requiredOppositeDirOnly) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + else + { + /* + * Since NULLs are sorted after non-NULLs, we know we have + * reached the upper limit of the range of values for this + * index attr. On a forward scan, we can stop if this qual is + * one of the "must match" subset. We can stop regardless of + * whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a backward scan, however, we must keep going, because we + * may have initially positioned to the end of the index. + * (_bt_advance_array_keys also relies on this behavior during + * backward scans.) + */ + if ((requiredSameDir || requiredOppositeDirOnly) && + ScanDirectionIsForward(dir)) + *continuescan = false; + } + + /* + * This indextuple doesn't match the qual. 
+ */ + return false; + } + + if (!DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation, + datum, key->sk_argument))) + { + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. + */ + if (requiredSameDir) + *continuescan = false; + + /* + * If this is a non-required equality-type array key, the tuple + * needs to be checked against every possible array key. Handle + * this by "advancing" the scan key's array to a matching value + * (if we're successful then the tuple might match the qual). + */ + else if (advancenonrequired && + key->sk_strategy == BTEqualStrategyNumber && + (key->sk_flags & SK_SEARCHARRAY)) + return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, + tupdesc, *ikey, false); + + /* + * This indextuple doesn't match the qual. + */ + return false; + } + } + + /* If we get here, the tuple passes all index quals. */ + return true; +} + +/* + * Test whether an indextuple satisfies a row-comparison scan condition. + * + * Return true if so, false if not. If not, also clear *continuescan if + * it's not possible for any future tuples in the current scan direction + * to pass the qual. + * + * This is a subroutine for _bt_checkkeys/_bt_check_compare. Caller passes us + * a row compare header key taken from so->keyData[]. + * + * Row value comparisons can be described in terms of logical expansions that + * use only scalar operators. Consider the following example row comparison: + * + * "(a, b, c) > (7, 'bar', 62)" + * + * This can be evaluated as: + * + * "(a = 7 AND b = 'bar' AND c > 62) OR (a = 7 AND b > 'bar') OR (a > 7)". + * + * Notice that this condition is satisfied by _all_ rows that satisfy "a > 7", + * and by a subset of all rows that satisfy "a >= 7" (possibly all such rows). + * It _can't_ be satisfied by other rows (where "a < 7" or where "a IS NULL"). + * A row comparison header key can therefore often be treated as if it was a + * simple scalar inequality on the row compare's most significant column. + * (For example, _bt_advance_array_keys and most preprocessing routines treat + * row compares like any other same-strategy inequality on the same column.) + * + * Things get more complicated for our row compare given a row where "a = 7". + * Note that a row compare isn't necessarily satisfied by _every_ tuple that + * appears between the first and last satisfied tuple returned by the scan, + * due to the way that its lower-order subkeys are only conditionally applied. + * A forwards scan that uses our example qual might initially return a tuple + * "(a, b, c) = (7, 'zebra', 54)". But it won't subsequently return a tuple + * "(a, b, c) = (7, NULL, 1)" located to the right of the first matching tuple + * (assume that "b" was declared NULLS LAST here). The scan will only return + * additional matches upon reaching tuples where "a > 7". If you rereview our + * example row comparison's logical expansion, you'll understand why this is. + * (Here we assume that all subkeys could be marked required, guaranteeing + * that row comparison order matches index order. This is the common case.) + * + * Note that a row comparison header key behaves _exactly_ the same as a + * similar scalar inequality key on the row's most significant column once the + * scan reaches the point where it no longer needs to evaluate lower-order + * subkeys (or before the point where it starts needing to evaluate them). 
+ * For example, once a forwards scan that uses our example qual reaches the + * first tuple "a > 7", we'll behave in just the same way as our caller would + * behave with a similar scalar inequality "a > 7" for the remainder of the + * scan (assuming that the scan never changes direction/never goes backwards). + * We'll even set continuescan=false according to exactly the same rules as + * the ones our caller applies with simple scalar inequalities, including the + * rules it applies when NULL tuple values don't satisfy an inequality qual. + */ +static bool +_bt_check_rowcompare(ScanKey header, IndexTuple tuple, int tupnatts, + TupleDesc tupdesc, ScanDirection dir, + bool forcenonrequired, bool *continuescan) +{ + ScanKey subkey = (ScanKey) DatumGetPointer(header->sk_argument); + int32 cmpresult = 0; + bool result; + + /* First subkey should be same as the header says */ + Assert(header->sk_flags & SK_ROW_HEADER); + Assert(subkey->sk_attno == header->sk_attno); + Assert(subkey->sk_strategy == header->sk_strategy); + + /* Loop over columns of the row condition */ + for (;;) + { + Datum datum; + bool isNull; + + Assert(subkey->sk_flags & SK_ROW_MEMBER); + + /* When a NULL row member is compared, the row never matches */ + if (subkey->sk_flags & SK_ISNULL) + { + /* + * Unlike the simple-scankey case, this isn't a disallowed case + * (except when it's the first row element that has the NULL arg). + * But it can never match. If all the earlier row comparison + * columns are required for the scan direction, we can stop the + * scan, because there can't be another tuple that will succeed. + */ + Assert(subkey != (ScanKey) DatumGetPointer(header->sk_argument)); + subkey--; + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + return false; + } + + if (subkey->sk_attno > tupnatts) + { + /* + * This attribute is truncated (must be high key). The value for + * this attribute in the first non-pivot tuple on the page to the + * right could be any possible value. Assume that truncated + * attribute passes the qual. + */ + Assert(BTreeTupleIsPivot(tuple)); + return true; + } + + datum = index_getattr(tuple, + subkey->sk_attno, + tupdesc, + &isNull); + + if (isNull) + { + int reqflags; + + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if (subkey->sk_flags & SK_BT_NULLS_FIRST) + { + /* + * Since NULLs are sorted before non-NULLs, we know we have + * reached the lower limit of the range of values for this + * index attr. On a backward scan, we can stop if this qual + * is one of the "must match" subset. However, on a forwards + * scan, we must keep going, because we may have initially + * positioned to the start of the index. + * + * All required NULLS FIRST > row members can use NULL tuple + * values to end backwards scans, just like with other values. + * A qual "WHERE (a, b, c) > (9, 42, 'foo')" can terminate a + * backwards scan upon reaching the index's rightmost "a = 9" + * tuple whose "b" column contains a NULL (if not sooner). + * Since "b" is NULLS FIRST, we can treat its NULLs as "<" 42. + */ + reqflags = SK_BT_REQBKWD; + + /* + * When a most significant required NULLS FIRST < row compare + * member sees NULL tuple values during a backwards scan, it + * signals the end of matches for the whole row compare/scan. 
+ * A qual "WHERE (a, b, c) < (9, 42, 'foo')" will terminate a + * backwards scan upon reaching the rightmost tuple whose "a" + * column has a NULL. The "a" NULL value is "<" 9, and yet + * our < row compare will still end the scan. (This isn't + * safe with later/lower-order row members. Notice that it + * can only happen with an "a" NULL some time after the scan + * completely stops needing to use its "b" and "c" members.) + */ + if (subkey == (ScanKey) DatumGetPointer(header->sk_argument)) + reqflags |= SK_BT_REQFWD; /* safe, first row member */ + + if ((subkey->sk_flags & reqflags) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + else + { + /* + * Since NULLs are sorted after non-NULLs, we know we have + * reached the upper limit of the range of values for this + * index attr. On a forward scan, we can stop if this qual is + * one of the "must match" subset. However, on a backward + * scan, we must keep going, because we may have initially + * positioned to the end of the index. + * + * All required NULLS LAST < row members can use NULL tuple + * values to end forwards scans, just like with other values. + * A qual "WHERE (a, b, c) < (9, 42, 'foo')" can terminate a + * forwards scan upon reaching the index's leftmost "a = 9" + * tuple whose "b" column contains a NULL (if not sooner). + * Since "b" is NULLS LAST, we can treat its NULLs as ">" 42. + */ + reqflags = SK_BT_REQFWD; + + /* + * When a most significant required NULLS LAST > row compare + * member sees NULL tuple values during a forwards scan, it + * signals the end of matches for the whole row compare/scan. + * A qual "WHERE (a, b, c) > (9, 42, 'foo')" will terminate a + * forwards scan upon reaching the leftmost tuple whose "a" + * column has a NULL. The "a" NULL value is ">" 9, and yet + * our > row compare will end the scan. (This isn't safe with + * later/lower-order row members. Notice that it can only + * happen with an "a" NULL some time after the scan completely + * stops needing to use its "b" and "c" members.) + */ + if (subkey == (ScanKey) DatumGetPointer(header->sk_argument)) + reqflags |= SK_BT_REQBKWD; /* safe, first row member */ + + if ((subkey->sk_flags & reqflags) && + ScanDirectionIsForward(dir)) + *continuescan = false; + } + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + /* Perform the test --- three-way comparison not bool operator */ + cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, + subkey->sk_collation, + datum, + subkey->sk_argument)); + + if (subkey->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(cmpresult); + + /* Done comparing if unequal, else advance to next column */ + if (cmpresult != 0) + break; + + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + } + + /* Final subkey/column determines if row compare is satisfied */ + result = _bt_rowcompare_cmpresult(subkey, cmpresult); + + if (!result && !forcenonrequired) + { + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will pass, + * either. Note we have to look at the deciding column, not + * necessarily the first or last column of the row condition. 
+ */ + if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + + return result; +} + +/* + * Call here when a row compare member returns a non-zero result, or with the + * result for the final ROW_END row compare member (no matter the cmpresult). + * + * cmpresult indicates the overall result of the row comparison (must already + * be commuted for DESC subkeys), and subkey is the deciding row member. + */ +static bool +_bt_rowcompare_cmpresult(ScanKey subkey, int cmpresult) +{ + bool satisfied; + + Assert(subkey->sk_flags & SK_ROW_MEMBER); + + switch (subkey->sk_strategy) + { + case BTLessStrategyNumber: + satisfied = (cmpresult < 0); + break; + case BTLessEqualStrategyNumber: + satisfied = (cmpresult <= 0); + break; + case BTGreaterEqualStrategyNumber: + satisfied = (cmpresult >= 0); + break; + case BTGreaterStrategyNumber: + satisfied = (cmpresult > 0); + break; + default: + /* EQ and NE cases aren't allowed here */ + elog(ERROR, "unexpected strategy number %d", subkey->sk_strategy); + satisfied = false; /* keep compiler quiet */ + break; + } + + return satisfied; +} + +/* + * _bt_tuple_before_array_skeys() -- too early to advance required arrays? + * + * We always compare the tuple using the current array keys (which we assume + * are already set in so->keyData[]). readpagetup indicates if tuple is the + * scan's current _bt_readpage-wise tuple. + * + * readpagetup callers must only call here when _bt_check_compare already set + * continuescan=false. We help these callers deal with _bt_check_compare's + * inability to distinguish between the < and > cases (it uses equality + * operator scan keys, whereas we use 3-way ORDER procs). These callers pass + * a _bt_check_compare-set sktrig value that indicates which scan key + * triggered the call (!readpagetup callers just pass us sktrig=0 instead). + * This information allows us to avoid wastefully checking earlier scan keys + * that were already deemed to have been satisfied inside _bt_check_compare. + * + * Returns false when caller's tuple is >= the current required equality scan + * keys (or <=, in the case of backwards scans). This happens to readpagetup + * callers when the scan has reached the point of needing its array keys + * advanced; caller will need to advance required and non-required arrays at + * scan key offsets >= sktrig, plus scan keys < sktrig iff sktrig rolls over. + * (When we return false to readpagetup callers, tuple can only be == current + * required equality scan keys when caller's sktrig indicates that the arrays + * need to be advanced due to an unsatisfied required inequality key trigger.) + * + * Returns true when caller passes a tuple that is < the current set of + * equality keys for the most significant non-equal required scan key/column + * (or > the keys, during backwards scans). This happens to readpagetup + * callers when tuple is still before the start of matches for the scan's + * required equality strategy scan keys. (sktrig can't have indicated that an + * inequality strategy scan key wasn't satisfied in _bt_check_compare when we + * return true. In fact, we automatically return false when passed such an + * inequality sktrig by readpagetup callers -- _bt_check_compare's initial + * continuescan=false doesn't really need to be confirmed here by us.) 
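+ *
+ * To make the return values concrete (an illustrative example): during a
+ * forwards scan with a qual "WHERE a IN (10, 20)" whose array is
+ * currently on 20, a tuple with a = 15 makes us return true (it's still
+ * too early to advance the array), while a tuple with a = 20 or a = 25
+ * makes us return false (it's no longer too early for caller to advance
+ * its arrays).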
+ * + * !readpagetup callers optionally pass us *scanBehind, which tracks whether + * any missing truncated attributes might have affected array advancement + * (compared to what would happen if it was shown the first non-pivot tuple on + * the page to the right of caller's finaltup/high key tuple instead). It's + * only possible that we'll set *scanBehind to true when caller passes us a + * pivot tuple (with truncated -inf attributes) that we return false for. + */ +static bool +_bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, TupleDesc tupdesc, int tupnatts, + bool readpagetup, int sktrig, bool *scanBehind) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + Assert(so->numArrayKeys); + Assert(so->numberOfKeys); + Assert(sktrig == 0 || readpagetup); + Assert(!readpagetup || scanBehind == NULL); + + if (scanBehind) + *scanBehind = false; + + for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + Datum tupdatum; + bool tupnull; + int32 result; + + /* readpagetup calls require one ORDER proc comparison (at most) */ + Assert(!readpagetup || ikey == sktrig); + + /* + * Once we reach a non-required scan key, we're completely done. + * + * Note: we deliberately don't consider the scan direction here. + * _bt_advance_array_keys caller requires that we track *scanBehind + * without concern for scan direction. + */ + if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0) + { + Assert(!readpagetup); + Assert(ikey > sktrig || ikey == 0); + return false; + } + + if (cur->sk_attno > tupnatts) + { + Assert(!readpagetup); + + /* + * When we reach a high key's truncated attribute, assume that the + * tuple attribute's value is >= the scan's equality constraint + * scan keys (but set *scanBehind to let interested callers know + * that a truncated attribute might have affected our answer). + */ + if (scanBehind) + *scanBehind = true; + + return false; + } + + /* + * Deal with inequality strategy scan keys that _bt_check_compare set + * continuescan=false for + */ + if (cur->sk_strategy != BTEqualStrategyNumber) + { + /* + * When _bt_check_compare indicated that a required inequality + * scan key wasn't satisfied, there's no need to verify anything; + * caller always calls _bt_advance_array_keys with this sktrig. + */ + if (readpagetup) + return false; + + /* + * Otherwise we can't give up, since we must check all required + * scan keys (required in either direction) in order to correctly + * track *scanBehind for caller + */ + continue; + } + + tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); + + if (likely(!(cur->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))) + { + /* Scankey has a valid/comparable sk_argument value */ + result = _bt_compare_array_skey(&so->orderProcs[ikey], + tupdatum, tupnull, + cur->sk_argument, cur); + + if (result == 0) + { + /* + * Interpret result in a way that takes NEXT/PRIOR into + * account + */ + if (cur->sk_flags & SK_BT_NEXT) + result = -1; + else if (cur->sk_flags & SK_BT_PRIOR) + result = 1; + + Assert(result == 0 || (cur->sk_flags & SK_BT_SKIP)); + } + } + else + { + BTArrayKeyInfo *array = NULL; + + /* + * Current array element/array = scan key value is a sentinel + * value that represents the lowest (or highest) possible value + * that's still within the range of the array. + * + * Like _bt_first, we only see MINVAL keys during forwards scans + * (and similarly only see MAXVAL keys during backwards scans). 
+ * Even if the scan's direction changes, we'll stop at some higher
+ * order key before we can ever reach any MAXVAL (or MINVAL) keys.
+ * (However, unlike _bt_first we _can_ get to keys marked either
+ * NEXT or PRIOR, regardless of the scan's current direction.)
+ */
+ Assert(ScanDirectionIsForward(dir) ?
+ !(cur->sk_flags & SK_BT_MAXVAL) :
+ !(cur->sk_flags & SK_BT_MINVAL));
+
+ /*
+ * There are no valid sk_argument values in MINVAL/MAXVAL keys.
+ * Check if tupdatum is within the range of the skip array instead.
+ */
+ for (int arrayidx = 0; arrayidx < so->numArrayKeys; arrayidx++)
+ {
+ array = &so->arrayKeys[arrayidx];
+ if (array->scan_key == ikey)
+ break;
+ }
+
+ _bt_binsrch_skiparray_skey(false, dir, tupdatum, tupnull,
+ array, cur, &result);
+
+ if (result == 0)
+ {
+ /*
+ * tupdatum satisfies both low_compare and high_compare, so
+ * it's time to advance the array keys.
+ *
+ * Note: It's possible that the skip array will "advance" from
+ * its MINVAL (or MAXVAL) representation to an alternative,
+ * logically equivalent representation of the same value: a
+ * representation where the = key gets a valid datum in its
+ * sk_argument. This is only possible when low_compare uses
+ * the >= strategy (or high_compare uses the <= strategy).
+ */
+ return false;
+ }
+ }
+
+ /*
+ * Does this comparison indicate that caller must _not_ advance the
+ * scan's arrays just yet?
+ */
+ if ((ScanDirectionIsForward(dir) && result < 0) ||
+ (ScanDirectionIsBackward(dir) && result > 0))
+ return true;
+
+ /*
+ * Does this comparison indicate that caller should now advance the
+ * scan's arrays? (Must be if we get here during a readpagetup call.)
+ */
+ if (readpagetup || result != 0)
+ {
+ Assert(result != 0);
+ return false;
+ }
+
+ /*
+ * Inconclusive -- need to check later scan keys, too.
+ *
+ * This must be a finaltup precheck, or a call made from an assertion.
+ */
+ Assert(result == 0);
+ }
+
+ Assert(!readpagetup);
+
+ return false;
+}
+
+/*
+ * Determine if a scan with array keys should skip over uninteresting tuples.
+ *
+ * This is a subroutine for _bt_checkkeys. Called when _bt_readpage's linear
+ * search process (started after it finishes reading an initial group of
+ * matching tuples, used to locate the start of the next group of tuples
+ * matching the next set of required array keys) has already scanned an
+ * excessive number of tuples whose key space is "between arrays".
+ *
+ * When we perform look ahead successfully, we'll set pstate.skip, which
+ * instructs _bt_readpage to skip ahead to that tuple next (could be past the
+ * end of the scan's leaf page). Pages where the optimization is effective
+ * will generally still need to skip several times. Each call here performs
+ * only a single "look ahead" comparison of a later tuple, whose distance from
+ * the current tuple's offset number is determined by applying heuristics. 
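+ *
+ * As an illustration (with made-up numbers): given a qual
+ * "WHERE a IN (10, 1000)" and a page full of tuples whose "a" values all
+ * fall between 10 and 500, the linear search would otherwise recheck
+ * hundreds of tuples that sort before the current array key of 1000.
+ * Instead, after LOOK_AHEAD_REQUIRED_RECHECKS such tuples, we probe a
+ * tuple LOOK_AHEAD_DEFAULT_DISTANCE items ahead, doubling the distance
+ * after each successful probe (and sharply reducing it after any probe
+ * that overshoots).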
+ */
+static void
+_bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
+ int tupnatts, TupleDesc tupdesc)
+{
+ ScanDirection dir = pstate->dir;
+ OffsetNumber aheadoffnum;
+ IndexTuple ahead;
+
+ Assert(!pstate->forcenonrequired);
+
+ /* Avoid looking ahead when comparing the page high key */
+ if (pstate->offnum < pstate->minoff)
+ return;
+
+ /*
+ * Don't look ahead when there aren't enough tuples remaining on the page
+ * (in the current scan direction) for it to be worth our while
+ */
+ if (ScanDirectionIsForward(dir) &&
+ pstate->offnum >= pstate->maxoff - LOOK_AHEAD_DEFAULT_DISTANCE)
+ return;
+ else if (ScanDirectionIsBackward(dir) &&
+ pstate->offnum <= pstate->minoff + LOOK_AHEAD_DEFAULT_DISTANCE)
+ return;
+
+ /*
+ * The look ahead distance starts small, and ramps up as each call here
+ * allows _bt_readpage to skip over more tuples
+ */
+ if (!pstate->targetdistance)
+ pstate->targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE;
+ else if (pstate->targetdistance < MaxIndexTuplesPerPage / 2)
+ pstate->targetdistance *= 2;
+
+ /* Don't read past the end (or before the start) of the page, though */
+ if (ScanDirectionIsForward(dir))
+ aheadoffnum = Min((int) pstate->maxoff,
+ (int) pstate->offnum + pstate->targetdistance);
+ else
+ aheadoffnum = Max((int) pstate->minoff,
+ (int) pstate->offnum - pstate->targetdistance);
+
+ ahead = (IndexTuple) PageGetItem(pstate->page,
+ PageGetItemId(pstate->page, aheadoffnum));
+ if (_bt_tuple_before_array_skeys(scan, dir, ahead, tupdesc, tupnatts,
+ false, 0, NULL))
+ {
+ /*
+ * Success -- instruct _bt_readpage to skip ahead to the very next
+ * tuple after the one we determined was still before the current
+ * array keys
+ */
+ if (ScanDirectionIsForward(dir))
+ pstate->skip = aheadoffnum + 1;
+ else
+ pstate->skip = aheadoffnum - 1;
+ }
+ else
+ {
+ /*
+ * Failure -- "ahead" tuple is too far ahead (we were too aggressive).
+ *
+ * Reset the number of rechecks, and aggressively reduce the target
+ * distance (we're much more aggressive here than we were when the
+ * distance was initially ramped up).
+ */
+ pstate->rechecks = 0;
+ pstate->targetdistance = Max(pstate->targetdistance / 8, 1);
+ }
+}
+
+/*
+ * _bt_advance_array_keys() -- Advance array elements using a tuple
+ *
+ * The scan always gets a new qual as a consequence of calling here (except
+ * when we determine that the top-level scan has run out of matching tuples).
+ * All later _bt_check_compare calls also use the same new qual that was first
+ * used here (at least until the next call here advances the keys once again).
+ * It's convenient to structure _bt_check_compare rechecks of caller's tuple
+ * (using the new qual) as one of the steps of advancing the scan's array
+ * keys, so this function works as a wrapper around _bt_check_compare.
+ *
+ * Like _bt_check_compare, we'll set pstate.continuescan on behalf of the
+ * caller, and return a boolean indicating if caller's tuple satisfies the
+ * scan's new qual. But unlike _bt_check_compare, we set so->needPrimScan
+ * when we set continuescan=false, indicating if a new primitive index scan
+ * has been scheduled (otherwise, the top-level scan has run out of tuples in
+ * the current scan direction).
+ *
+ * Caller must use _bt_tuple_before_array_skeys to determine if the current
+ * place in the scan is >= the current array keys _before_ calling here.
+ * We're responsible for ensuring that caller's tuple is <= the newly advanced
+ * required array keys once we return. 
We try to find an exact match, but + * failing that we'll advance the array keys to whatever set of array elements + * comes next in the key space for the current scan direction. Required array + * keys "ratchet forwards" (or backwards). They can only advance as the scan + * itself advances through the index/key space. + * + * (The rules are the same for backwards scans, except that the operators are + * flipped: just replace the precondition's >= operator with a <=, and the + * postcondition's <= operator with a >=. In other words, just swap the + * precondition with the postcondition.) + * + * We also deal with "advancing" non-required arrays here (or arrays that are + * treated as non-required for the duration of a _bt_readpage call). Callers + * whose sktrig scan key is non-required specify sktrig_required=false. These + * calls are the only exception to the general rule about always advancing the + * required array keys (the scan may not even have a required array). These + * callers should just pass a NULL pstate (since there is never any question + * of stopping the scan). No call to _bt_tuple_before_array_skeys is required + * ahead of these calls (it's already clear that any required scan keys must + * be satisfied by caller's tuple). + * + * Note that we deal with non-array required equality strategy scan keys as + * degenerate single element arrays here. Obviously, they can never really + * advance in the way that real arrays can, but they must still affect how we + * advance real array scan keys (exactly like true array equality scan keys). + * We have to keep around a 3-way ORDER proc for these (using the "=" operator + * won't do), since in general whether the tuple is < or > _any_ unsatisfied + * required equality key influences how the scan's real arrays must advance. + * + * Note also that we may sometimes need to advance the array keys when the + * existing required array keys (and other required equality keys) are already + * an exact match for every corresponding value from caller's tuple. We must + * do this for inequalities that _bt_check_compare set continuescan=false for. + * They'll advance the array keys here, just like any other scan key that + * _bt_check_compare stops on. (This can even happen _after_ we advance the + * array keys, in which case we'll advance the array keys a second time. That + * way _bt_checkkeys caller always has its required arrays advance to the + * maximum possible extent that its tuple will allow.) + */ +static bool +_bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + int sktrig, bool sktrig_required) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + ScanDirection dir = pstate ? 
pstate->dir : ForwardScanDirection; + int arrayidx = 0; + bool beyond_end_advance = false, + skip_array_advanced = false, + has_required_opposite_direction_only = false, + all_required_satisfied = true, + all_satisfied = true; + + Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck); + Assert(_bt_verify_keys_with_arraykeys(scan)); + + if (sktrig_required) + { + /* + * Precondition array state assertion + */ + Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, + tupnatts, false, 0, NULL)); + + /* + * Once we return we'll have a new set of required array keys, so + * reset state used by "look ahead" optimization + */ + pstate->rechecks = 0; + pstate->targetdistance = 0; + } + else if (sktrig < so->numberOfKeys - 1 && + !(so->keyData[so->numberOfKeys - 1].sk_flags & SK_SEARCHARRAY)) + { + int least_sign_ikey = so->numberOfKeys - 1; + bool continuescan; + + /* + * Optimization: perform a precheck of the least significant key + * during !sktrig_required calls when it isn't already our sktrig + * (provided the precheck key is not itself an array). + * + * When the precheck works out we'll avoid an expensive binary search + * of sktrig's array (plus any other arrays before least_sign_ikey). + */ + Assert(so->keyData[sktrig].sk_flags & SK_SEARCHARRAY); + if (!_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, + false, &continuescan, + &least_sign_ikey)) + return false; + } + + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array = NULL; + Datum tupdatum; + bool required = false, + tupnull; + int32 result; + int set_elem = 0; + + if (cur->sk_strategy == BTEqualStrategyNumber) + { + /* Manage array state */ + if (cur->sk_flags & SK_SEARCHARRAY) + { + array = &so->arrayKeys[arrayidx++]; + Assert(array->scan_key == ikey); + } + } + else + { + /* + * Are any inequalities required in the opposite direction only + * present here? + */ + if (((ScanDirectionIsForward(dir) && + (cur->sk_flags & (SK_BT_REQBKWD))) || + (ScanDirectionIsBackward(dir) && + (cur->sk_flags & (SK_BT_REQFWD))))) + has_required_opposite_direction_only = true; + } + + /* Optimization: skip over known-satisfied scan keys */ + if (ikey < sktrig) + continue; + + if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) + { + required = true; + + if (cur->sk_attno > tupnatts) + { + /* Set this just like _bt_tuple_before_array_skeys */ + Assert(sktrig < ikey); + so->scanBehind = true; + } + } + + /* + * Handle a required non-array scan key that the initial call to + * _bt_check_compare indicated triggered array advancement, if any. + * + * The non-array scan key's strategy will be <, <=, or = during a + * forwards scan (or any one of =, >=, or > during a backwards scan). + * It follows that the corresponding tuple attribute's value must now + * be either > or >= the scan key value (for backwards scans it must + * be either < or <= that value). + * + * If this is a required equality strategy scan key, this is just an + * optimization; _bt_tuple_before_array_skeys already confirmed that + * this scan key places us ahead of caller's tuple. There's no need + * to repeat that work now. (The same underlying principle also gets + * applied by the cur_elem_trig optimization used to speed up searches + * for the next array element.) + * + * If this is a required inequality strategy scan key, we _must_ rely + * on _bt_check_compare like this; we aren't capable of directly + * evaluating required inequality strategy scan keys here, on our own. 
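+ *
+ * (Example, for illustration only: during a forwards scan with a qual
+ * "WHERE a IN (1, 5) AND b < 100", reaching a tuple (1, 150) means that
+ * no later "a = 1" tuple can satisfy "b < 100" either. _bt_check_compare
+ * stops on the "b < 100" key, making it our sktrig key; we respond with
+ * "beyond end" advancement, which ultimately increments the "a" array
+ * from 1 to 5.)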
+ */ + if (ikey == sktrig && !array) + { + Assert(sktrig_required && required && all_required_satisfied); + + /* Use "beyond end" advancement. See below for an explanation. */ + beyond_end_advance = true; + all_satisfied = all_required_satisfied = false; + + continue; + } + + /* + * Nothing more for us to do with an inequality strategy scan key that + * wasn't the one that _bt_check_compare stopped on, though. + * + * Note: if our later call to _bt_check_compare (to recheck caller's + * tuple) sets continuescan=false due to finding this same inequality + * unsatisfied (possible when it's required in the scan direction), + * we'll deal with it via a recursive "second pass" call. + */ + else if (cur->sk_strategy != BTEqualStrategyNumber) + continue; + + /* + * Nothing for us to do with an equality strategy scan key that isn't + * marked required, either -- unless it's a non-required array + */ + else if (!required && !array) + continue; + + /* + * Here we perform steps for all array scan keys after a required + * array scan key whose binary search triggered "beyond end of array + * element" array advancement due to encountering a tuple attribute + * value > the closest matching array key (or < for backwards scans). + */ + if (beyond_end_advance) + { + if (array) + _bt_array_set_low_or_high(rel, cur, array, + ScanDirectionIsBackward(dir)); + + continue; + } + + /* + * Here we perform steps for all array scan keys after a required + * array scan key whose tuple attribute was < the closest matching + * array key when we dealt with it (or > for backwards scans). + * + * This earlier required array key already puts us ahead of caller's + * tuple in the key space (for the current scan direction). We must + * make sure that subsequent lower-order array keys do not put us too + * far ahead (ahead of tuples that have yet to be seen by our caller). + * For example, when a tuple "(a, b) = (42, 5)" advances the array + * keys on "a" from 40 to 45, we must also set "b" to whatever the + * first array element for "b" is. It would be wrong to allow "b" to + * be set based on the tuple value. + * + * Perform the same steps with truncated high key attributes. You can + * think of this as a "binary search" for the element closest to the + * value -inf. Again, the arrays must never get ahead of the scan. + */ + if (!all_required_satisfied || cur->sk_attno > tupnatts) + { + if (array) + _bt_array_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); + + continue; + } + + /* + * Search in scankey's array for the corresponding tuple attribute + * value from caller's tuple + */ + tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); + + if (array) + { + bool cur_elem_trig = (sktrig_required && ikey == sktrig); + + /* + * "Binary search" by checking if tupdatum/tupnull are within the + * range of the skip array + */ + if (array->num_elems == -1) + _bt_binsrch_skiparray_skey(cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); + + /* + * Binary search for the closest match from the SAOP array + */ + else + set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); + } + else + { + Assert(required); + + /* + * This is a required non-array equality strategy scan key, which + * we'll treat as a degenerate single element array. + * + * This scan key's imaginary "array" can't really advance, but it + * can still roll over like any other array. 
(Actually, this is + * no different to real single value arrays, which never advance + * without rolling over -- they can never truly advance, either.) + */ + result = _bt_compare_array_skey(&so->orderProcs[ikey], + tupdatum, tupnull, + cur->sk_argument, cur); + } + + /* + * Consider "beyond end of array element" array advancement. + * + * When the tuple attribute value is > the closest matching array key + * (or < in the backwards scan case), we need to ratchet this array + * forward (backward) by one increment, so that caller's tuple ends up + * being < final array value instead (or > final array value instead). + * This process has to work for all of the arrays, not just this one: + * it must "carry" to higher-order arrays when the set_elem that we + * just found happens to be the final one for the scan's direction. + * Incrementing (decrementing) set_elem itself isn't good enough. + * + * Our approach is to provisionally use set_elem as if it was an exact + * match now, then set each later/less significant array to whatever + * its final element is. Once outside the loop we'll then "increment + * this array's set_elem" by calling _bt_advance_array_keys_increment. + * That way the process rolls over to higher order arrays as needed. + * + * Under this scheme any required arrays only ever ratchet forwards + * (or backwards), and always do so to the maximum possible extent + * that we can know will be safe without seeing the scan's next tuple. + * We don't need any special handling for required scan keys that lack + * a real array to advance, nor for redundant scan keys that couldn't + * be eliminated by _bt_preprocess_keys. It won't matter if some of + * our "true" array scan keys (or even all of them) are non-required. + */ + if (sktrig_required && required && + ((ScanDirectionIsForward(dir) && result > 0) || + (ScanDirectionIsBackward(dir) && result < 0))) + beyond_end_advance = true; + + Assert(all_required_satisfied && all_satisfied); + if (result != 0) + { + /* + * Track whether caller's tuple satisfies our new post-advancement + * qual, for required scan keys, as well as for the entire set of + * interesting scan keys (all required scan keys plus non-required + * array scan keys are considered interesting.) + */ + all_satisfied = false; + if (sktrig_required && required) + all_required_satisfied = false; + else + { + /* + * There's no need to advance the arrays using the best + * available match for a non-required array. Give up now. + * (Though note that sktrig_required calls still have to do + * all the usual post-advancement steps, including the recheck + * call to _bt_check_compare.) + */ + break; + } + } + + /* Advance array keys, even when we don't have an exact match */ + if (array) + { + if (array->num_elems == -1) + { + /* Skip array's new element is tupdatum (or MINVAL/MAXVAL) */ + _bt_skiparray_set_element(rel, cur, array, result, + tupdatum, tupnull); + skip_array_advanced = true; + } + else if (array->cur_elem != set_elem) + { + /* SAOP array's new element is set_elem datum */ + array->cur_elem = set_elem; + cur->sk_argument = array->elem_values[set_elem]; + } + } + } + + /* + * Advance the array keys incrementally whenever "beyond end of array + * element" array advancement happens, so that advancement will carry to + * higher-order arrays (might exhaust all the scan's arrays instead, which + * ends the top-level scan). 
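+ *
+ * For example (hypothetical values), with the arrays "a = ANY('{1, 2}')
+ * AND b = ANY('{7, 9}')" currently set to (a, b) = (1, 9), a forward
+ * scan's increment rolls the "b" array over to 7 and carries into the
+ * higher-order "a" array, leaving the arrays set to (2, 7).
+ * Incrementing again from (2, 9) exhausts the scan's arrays.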
+ */ + if (beyond_end_advance && + !_bt_advance_array_keys_increment(scan, dir, &skip_array_advanced)) + goto end_toplevel_scan; + + Assert(_bt_verify_keys_with_arraykeys(scan)); + + /* + * Maintain a page-level count of the number of times the scan's array + * keys advanced in a way that affected at least one skip array + */ + if (sktrig_required && skip_array_advanced) + pstate->nskipadvances++; + + /* + * Does tuple now satisfy our new qual? Recheck with _bt_check_compare. + * + * Calls triggered by an unsatisfied required scan key, whose tuple now + * satisfies all required scan keys, but not all nonrequired array keys, + * will still require a recheck call to _bt_check_compare. They'll still + * need its "second pass" handling of required inequality scan keys. + * (Might have missed a still-unsatisfied required inequality scan key + * that caller didn't detect as the sktrig scan key during its initial + * _bt_check_compare call that used the old/original qual.) + * + * Calls triggered by an unsatisfied nonrequired array scan key never need + * "second pass" handling of required inequalities (nor any other handling + * of any required scan key). All that matters is whether caller's tuple + * satisfies the new qual, so it's safe to just skip the _bt_check_compare + * recheck when we've already determined that it can only return 'false'. + * + * Note: In practice most scan keys are marked required by preprocessing, + * if necessary by generating a preceding skip array. We nevertheless + * often handle array keys marked required as if they were nonrequired. + * This behavior is requested by our _bt_check_compare caller, though only + * when it is passed "forcenonrequired=true" by _bt_checkkeys. + */ + if ((sktrig_required && all_required_satisfied) || + (!sktrig_required && all_satisfied)) + { + int nsktrig = sktrig + 1; + bool continuescan; + + Assert(all_required_satisfied); + + /* Recheck _bt_check_compare on behalf of caller */ + if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, + !sktrig_required, &continuescan, + &nsktrig) && + !so->scanBehind) + { + /* This tuple satisfies the new qual */ + Assert(all_satisfied && continuescan); + + if (pstate) + pstate->continuescan = true; + + return true; + } + + /* + * Consider "second pass" handling of required inequalities. + * + * It's possible that our _bt_check_compare call indicated that the + * scan should end due to some unsatisfied inequality that wasn't + * initially recognized as such by us. Handle this by calling + * ourselves recursively, this time indicating that the trigger is the + * inequality that we missed first time around (and using a set of + * required array/equality keys that are now exact matches for tuple). + * + * We make a strong, general guarantee that every _bt_checkkeys call + * here will advance the array keys to the maximum possible extent + * that we can know to be safe based on caller's tuple alone. If we + * didn't perform this step, then that guarantee wouldn't quite hold. + */ + if (unlikely(!continuescan)) + { + bool satisfied PG_USED_FOR_ASSERTS_ONLY; + + Assert(sktrig_required); + Assert(so->keyData[nsktrig].sk_strategy != BTEqualStrategyNumber); + + /* + * The tuple must use "beyond end" advancement during the + * recursive call, so we cannot possibly end up back here when + * recursing. We'll consume a small, fixed amount of stack space. 
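+ *
+ * For example (hypothetical values), with "WHERE a = ANY('{1, 3}') AND
+ * b > 5", the tuple "(3, 2)" can advance the "a" array from 1 to 3 (an
+ * exact match), only for the recheck to find "b > 5" still unsatisfied.
+ * The recursive call then advances past "a" = 3 via "beyond end"
+ * advancement.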
+ */ + Assert(!beyond_end_advance); + + /* Advance the array keys a second time using same tuple */ + satisfied = _bt_advance_array_keys(scan, pstate, tuple, tupnatts, + tupdesc, nsktrig, true); + + /* This tuple doesn't satisfy the inequality */ + Assert(!satisfied); + return false; + } + + /* + * Some non-required scan key (from new qual) still not satisfied. + * + * All scan keys required in the current scan direction must still be + * satisfied, though, so we can trust all_required_satisfied below. + */ + } + + /* + * When we were called just to deal with "advancing" non-required arrays, + * this is as far as we can go (cannot stop the scan for these callers) + */ + if (!sktrig_required) + { + /* Caller's tuple doesn't match any qual */ + return false; + } + + /* + * Postcondition array state assertion (for still-unsatisfied tuples). + * + * By here we have established that the scan's required arrays (scan must + * have at least one required array) advanced, without becoming exhausted. + * + * Caller's tuple is now < the newly advanced array keys (or > when this + * is a backwards scan), except in the case where we only got this far due + * to an unsatisfied non-required scan key. Verify that with an assert. + * + * Note: we don't just quit at this point when all required scan keys were + * found to be satisfied because we need to consider edge-cases involving + * scan keys required in the opposite direction only; those aren't tracked + * by all_required_satisfied. + */ + Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, + false, 0, NULL) == + !all_required_satisfied); + + /* + * We generally permit primitive index scans to continue onto the next + * sibling page when the page's finaltup satisfies all required scan keys + * at the point where we're between pages. + * + * If caller's tuple is also the page's finaltup, and we see that required + * scan keys still aren't satisfied, start a new primitive index scan. + */ + if (!all_required_satisfied && pstate->finaltup == tuple) + goto new_prim_scan; + + /* + * Proactively check finaltup (don't wait until finaltup is reached by the + * scan) when it might well turn out to not be satisfied later on. + * + * Note: if so->scanBehind hasn't already been set for finaltup by us, + * it'll be set during this call to _bt_tuple_before_array_skeys. Either + * way, it'll be set correctly (for the whole page) after this point. + */ + if (!all_required_satisfied && pstate->finaltup && + _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc, + BTreeTupleGetNAtts(pstate->finaltup, rel), + false, 0, &so->scanBehind)) + goto new_prim_scan; + + /* + * When we encounter a truncated finaltup high key attribute, we're + * optimistic about the chances of its corresponding required scan key + * being satisfied when we go on to recheck it against tuples from this + * page's right sibling leaf page. We consider truncated attributes to be + * satisfied by required scan keys, which allows the primitive index scan + * to continue to the next leaf page. We must set so->scanBehind to true + * to remember that the last page's finaltup had "satisfied" required scan + * keys for one or more truncated attribute values (scan keys required in + * _either_ scan direction). + * + * There is a chance that _bt_readpage (which checks so->scanBehind) will + * find that even the sibling leaf page's finaltup is < the new array + * keys. 
When that happens, our optimistic policy will have incurred a + * single extra leaf page access that could have been avoided. + * + * A pessimistic policy would give backward scans a gratuitous advantage + * over forward scans. We'd punish forward scans for applying more + * accurate information from the high key, rather than just using the + * final non-pivot tuple as finaltup, in the style of backward scans. + * Being pessimistic would also give some scans with non-required arrays a + * perverse advantage over similar scans that use required arrays instead. + * + * This is similar to our scan-level heuristics, below. They also set + * scanBehind to speculatively continue the primscan onto the next page. + */ + if (so->scanBehind) + { + /* Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled */ + } + + /* + * Handle inequalities marked required in the opposite scan direction. + * They can also signal that we should start a new primitive index scan. + * + * It's possible that the scan is now positioned where "matching" tuples + * begin, and that caller's tuple satisfies all scan keys required in the + * current scan direction. But if caller's tuple still doesn't satisfy + * other scan keys that are required in the opposite scan direction only + * (e.g., a required >= strategy scan key when scan direction is forward), + * it's still possible that there are many leaf pages before the page that + * _bt_first could skip straight to. Groveling through all those pages + * will always give correct answers, but it can be very inefficient. We + * must avoid needlessly scanning extra pages. + * + * Separately, it's possible that _bt_check_compare set continuescan=false + * for a scan key that's required in the opposite direction only. This is + * a special case, that happens only when _bt_check_compare sees that the + * inequality encountered a NULL value. This signals the end of non-NULL + * values in the current scan direction, which is reason enough to end the + * (primitive) scan. If this happens at the start of a large group of + * NULL values, then we shouldn't expect to be called again until after + * the scan has already read indefinitely-many leaf pages full of tuples + * with NULL suffix values. (_bt_first is expected to skip over the group + * of NULLs by applying a similar "deduce NOT NULL" rule of its own, which + * involves consing up an explicit SK_SEARCHNOTNULL key.) + * + * Apply a test against finaltup to detect and recover from the problem: + * if even finaltup doesn't satisfy such an inequality, we just skip by + * starting a new primitive index scan. When we skip, we know for sure + * that all of the tuples on the current page following caller's tuple are + * also before the _bt_first-wise start of tuples for our new qual. That + * at least suggests many more skippable pages beyond the current page. + * (when so->scanBehind and so->oppositeDirCheck are set, this'll happen + * when we test the next page's finaltup/high key instead.) + */ + else if (has_required_opposite_direction_only && pstate->finaltup && + unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup))) + goto new_prim_scan; + +continue_scan: + + /* + * Stick with the ongoing primitive index scan for now. + * + * It's possible that later tuples will also turn out to have values that + * are still < the now-current array keys (or > the current array keys). 
+ * Our caller will handle this by performing what amounts to a linear + * search of the page, implemented by calling _bt_check_compare and then + * _bt_tuple_before_array_skeys for each tuple. + * + * This approach has various advantages over a binary search of the page. + * Repeated binary searches of the page (one binary search for every array + * advancement) won't outperform a continuous linear search. While there + * are workloads that a naive linear search won't handle well, our caller + * has a "look ahead" fallback mechanism to deal with that problem. + */ + pstate->continuescan = true; /* Override _bt_check_compare */ + so->needPrimScan = false; /* _bt_readpage has more tuples to check */ + + if (so->scanBehind) + { + /* + * Remember if recheck needs to call _bt_oppodir_checkkeys for next + * page's finaltup (see above comments about "Handle inequalities + * marked required in the opposite scan direction" for why). + */ + so->oppositeDirCheck = has_required_opposite_direction_only; + + /* + * skip by setting "look ahead" mechanism's offnum for forwards scans + * (backwards scans check scanBehind flag directly instead) + */ + if (ScanDirectionIsForward(dir)) + pstate->skip = pstate->maxoff + 1; + } + + /* Caller's tuple doesn't match the new qual */ + return false; + +new_prim_scan: + + Assert(pstate->finaltup); /* not on rightmost/leftmost page */ + + /* + * Looks like another primitive index scan is required. But consider + * continuing the current primscan based on scan-level heuristics. + * + * Continue the ongoing primitive scan (and schedule a recheck for when + * the scan arrives on the next sibling leaf page) when it has already + * read at least one leaf page before the one we're reading now. This + * makes primscan scheduling more efficient when scanning subsets of an + * index with many distinct attribute values matching many array elements. + * It encourages fewer, larger primitive scans where that makes sense. + * This will in turn encourage _bt_readpage to apply the pstate.startikey + * optimization more often. + * + * Also continue the ongoing primitive index scan when it is still on the + * first page if there have been more than NSKIPADVANCES_THRESHOLD calls + * here that each advanced at least one of the scan's skip arrays + * (deliberately ignore advancements that only affected SAOP arrays here). + * A page that cycles through this many skip array elements is quite + * likely to neighbor similar pages, that we'll also need to read. + * + * Note: These heuristics aren't as aggressive as you might think. We're + * conservative about allowing a primitive scan to step from the first + * leaf page it reads to the page's sibling page (we only allow it on + * first pages whose finaltup strongly suggests that it'll work out, as + * well as first pages that have a large number of skip array advances). + * Clearing this first page finaltup hurdle is a strong signal in itself. + * + * Note: The NSKIPADVANCES_THRESHOLD heuristic exists only to avoid + * pathological cases. Specifically, cases where a skip scan should just + * behave like a traditional full index scan, but ends up "skipping" again + * and again, descending to the prior leaf page's direct sibling leaf page + * each time. This misbehavior would otherwise be possible during scans + * that never quite manage to "clear the first page finaltup hurdle". 
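+ *
+ * For example (hypothetical), a scan of an index on (a, b) with the
+ * qual "WHERE b = 42" whose first leaf page cycles through dozens of
+ * distinct "a" values will cross NSKIPADVANCES_THRESHOLD, and so
+ * sticks with its primitive scan, much like a full index scan would.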
+ */ + if (!pstate->firstpage || pstate->nskipadvances > NSKIPADVANCES_THRESHOLD) + { + /* Schedule a recheck once on the next (or previous) page */ + so->scanBehind = true; + + /* Continue the current primitive scan after all */ + goto continue_scan; + } + + /* + * End this primitive index scan, but schedule another. + * + * Note: We make a soft assumption that the current scan direction will + * also be used within _bt_next, when it is asked to step off this page. + * It is up to _bt_next to cancel this scheduled primitive index scan + * whenever it steps to a page in the direction opposite currPos.dir. + */ + pstate->continuescan = false; /* Tell _bt_readpage we're done... */ + so->needPrimScan = true; /* ...but call _bt_first again */ + + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, so->currPos.currPage); + + /* Caller's tuple doesn't match the new qual */ + return false; + +end_toplevel_scan: + + /* + * End the current primitive index scan, but don't schedule another. + * + * This ends the entire top-level scan in the current scan direction. + * + * Note: The scan's arrays (including any non-required arrays) are now in + * their final positions for the current scan direction. If the scan + * direction happens to change, then the arrays will already be in their + * first positions for what will then be the current scan direction. + */ + pstate->continuescan = false; /* Tell _bt_readpage we're done... */ + so->needPrimScan = false; /* ...and don't call _bt_first again */ + + /* Caller's tuple doesn't match any qual */ + return false; +} + +/* + * _bt_advance_array_keys_increment() -- Advance to next set of array elements + * + * Advances the array keys by a single increment in the current scan + * direction. When there are multiple array keys this can roll over from the + * lowest order array to higher order arrays. + * + * Returns true if there is another set of values to consider, false if not. + * On true result, the scankeys are initialized with the next set of values. + * On false result, the scankeys stay the same, and the array keys are not + * advanced (every array remains at its final element for scan direction). + */ +static bool +_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, + bool *skip_array_set) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* + * We must advance the last array key most quickly, since it will + * correspond to the lowest-order index column among the available + * qualifications + */ + for (int i = so->numArrayKeys - 1; i >= 0; i--) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; + + if (array->num_elems == -1) + *skip_array_set = true; + + if (ScanDirectionIsForward(dir)) + { + if (_bt_array_increment(rel, skey, array)) + return true; + } + else + { + if (_bt_array_decrement(rel, skey, array)) + return true; + } + + /* + * Couldn't increment (or decrement) array. Handle array roll over. + * + * Start over at the array's lowest sorting value (or its highest + * value, for backward scans)... + */ + _bt_array_set_low_or_high(rel, skey, array, + ScanDirectionIsForward(dir)); + + /* ...then increment (or decrement) next most significant array */ + } + + /* + * The array keys are now exhausted. + * + * Restore the array keys to the state they were in immediately before we + * were called. This ensures that the arrays only ever ratchet in the + * current scan direction. 
+ * + * Without this, scans could overlook matching tuples when the scan + * direction gets reversed just before btgettuple runs out of items to + * return, but just after _bt_readpage prepares all the items from the + * scan's final page in so->currPos. When we're on the final page it is + * typical for so->currPos to get invalidated once btgettuple finally + * returns false, which'll effectively invalidate the scan's array keys. + * That hasn't happened yet, though -- and in general it may never happen. + */ + _bt_start_array_keys(scan, -dir); + + return false; +} + +/* + * _bt_array_increment() -- increment array scan key's sk_argument + * + * Return value indicates whether caller's array was successfully incremented. + * Cannot increment an array whose current element is already the final one. + */ +static bool +_bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + bool oflow = false; + Datum inc_sk_argument; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & (SK_BT_MINVAL | SK_BT_NEXT | SK_BT_PRIOR))); + + /* SAOP array? */ + if (array->num_elems != -1) + { + Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); + if (array->cur_elem < array->num_elems - 1) + { + /* + * Just increment current element, and assign its datum to skey + * (only skip arrays need us to free existing sk_argument memory) + */ + array->cur_elem++; + skey->sk_argument = array->elem_values[array->cur_elem]; + + /* Successfully incremented array */ + return true; + } + + /* Cannot increment past final array element */ + return false; + } + + /* Nope, this is a skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + + /* + * The sentinel value that represents the maximum value within the range + * of a skip array (often just +inf) is never incrementable + */ + if (skey->sk_flags & SK_BT_MAXVAL) + return false; + + /* + * When the current array element is NULL, and the highest sorting value + * in the index is also NULL, we cannot increment past the final element + */ + if ((skey->sk_flags & SK_ISNULL) && !(skey->sk_flags & SK_BT_NULLS_FIRST)) + return false; + + /* + * Opclasses without skip support "increment" the scan key's current + * element by setting the NEXT flag. The true next value is determined by + * repositioning to the first index tuple > existing sk_argument/current + * array element. Note that this works in the usual way when the scan key + * is already marked ISNULL (i.e. when the current element is NULL). + */ + if (!array->sksup) + { + /* Successfully "incremented" array */ + skey->sk_flags |= SK_BT_NEXT; + return true; + } + + /* + * Opclasses with skip support directly increment sk_argument + */ + if (skey->sk_flags & SK_ISNULL) + { + Assert(skey->sk_flags & SK_BT_NULLS_FIRST); + + /* + * Existing sk_argument/array element is NULL (for an IS NULL qual). + * + * "Increment" from NULL to the low_elem value provided by opclass + * skip support routine. 
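+ *
+ * (For example, an int4 skip array's low_elem holds PG_INT32_MIN,
+ * assuming that no low_compare key narrows the array's range.)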
+ */ + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + skey->sk_argument = datumCopy(array->sksup->low_elem, + array->attbyval, array->attlen); + return true; + } + + /* + * Ask opclass support routine to provide incremented copy of existing + * non-NULL sk_argument + */ + inc_sk_argument = array->sksup->increment(rel, skey->sk_argument, &oflow); + if (unlikely(oflow)) + { + /* inc_sk_argument has undefined value (so no pfree) */ + if (array->null_elem && !(skey->sk_flags & SK_BT_NULLS_FIRST)) + { + _bt_skiparray_set_isnull(rel, skey, array); + + /* Successfully "incremented" array to NULL */ + return true; + } + + /* Cannot increment past final array element */ + return false; + } + + /* + * Successfully incremented sk_argument to a non-NULL value. Make sure + * that the incremented value is still within the range of the array. + */ + if (array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + inc_sk_argument, + array->high_compare->sk_argument))) + { + /* Keep existing sk_argument after all */ + if (!array->attbyval) + pfree(DatumGetPointer(inc_sk_argument)); + + /* Cannot increment past final array element */ + return false; + } + + /* Accept value returned by opclass increment callback */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = inc_sk_argument; + + /* Successfully incremented array */ + return true; +} + +/* + * _bt_array_decrement() -- decrement array scan key's sk_argument + * + * Return value indicates whether caller's array was successfully decremented. + * Cannot decrement an array whose current element is already the first one. + */ +static bool +_bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + bool uflow = false; + Datum dec_sk_argument; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & (SK_BT_MAXVAL | SK_BT_NEXT | SK_BT_PRIOR))); + + /* SAOP array? */ + if (array->num_elems != -1) + { + Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); + if (array->cur_elem > 0) + { + /* + * Just decrement current element, and assign its datum to skey + * (only skip arrays need us to free existing sk_argument memory) + */ + array->cur_elem--; + skey->sk_argument = array->elem_values[array->cur_elem]; + + /* Successfully decremented array */ + return true; + } + + /* Cannot decrement to before first array element */ + return false; + } + + /* Nope, this is a skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + + /* + * The sentinel value that represents the minimum value within the range + * of a skip array (often just -inf) is never decrementable + */ + if (skey->sk_flags & SK_BT_MINVAL) + return false; + + /* + * When the current array element is NULL, and the lowest sorting value in + * the index is also NULL, we cannot decrement before first array element + */ + if ((skey->sk_flags & SK_ISNULL) && (skey->sk_flags & SK_BT_NULLS_FIRST)) + return false; + + /* + * Opclasses without skip support "decrement" the scan key's current + * element by setting the PRIOR flag. The true prior value is determined + * by repositioning to the last index tuple < existing sk_argument/current + * array element. Note that this works in the usual way when the scan key + * is already marked ISNULL (i.e. when the current element is NULL). 
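+ *
+ * (For example, a skip array on a text column has no skip support, so
+ * "decrementing" past 'foo' just sets SK_BT_PRIOR here; _bt_first will
+ * reposition to the last index tuple < 'foo', whatever that may be.)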
+ */ + if (!array->sksup) + { + /* Successfully "decremented" array */ + skey->sk_flags |= SK_BT_PRIOR; + return true; + } + + /* + * Opclasses with skip support directly decrement sk_argument + */ + if (skey->sk_flags & SK_ISNULL) + { + Assert(!(skey->sk_flags & SK_BT_NULLS_FIRST)); + + /* + * Existing sk_argument/array element is NULL (for an IS NULL qual). + * + * "Decrement" from NULL to the high_elem value provided by opclass + * skip support routine. + */ + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + skey->sk_argument = datumCopy(array->sksup->high_elem, + array->attbyval, array->attlen); + return true; + } + + /* + * Ask opclass support routine to provide decremented copy of existing + * non-NULL sk_argument + */ + dec_sk_argument = array->sksup->decrement(rel, skey->sk_argument, &uflow); + if (unlikely(uflow)) + { + /* dec_sk_argument has undefined value (so no pfree) */ + if (array->null_elem && (skey->sk_flags & SK_BT_NULLS_FIRST)) + { + _bt_skiparray_set_isnull(rel, skey, array); + + /* Successfully "decremented" array to NULL */ + return true; + } + + /* Cannot decrement to before first array element */ + return false; + } + + /* + * Successfully decremented sk_argument to a non-NULL value. Make sure + * that the decremented value is still within the range of the array. + */ + if (array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + dec_sk_argument, + array->low_compare->sk_argument))) + { + /* Keep existing sk_argument after all */ + if (!array->attbyval) + pfree(DatumGetPointer(dec_sk_argument)); + + /* Cannot decrement to before first array element */ + return false; + } + + /* Accept value returned by opclass decrement callback */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = dec_sk_argument; + + /* Successfully decremented array */ + return true; +} + +/* + * _bt_array_set_low_or_high() -- Set array scan key to lowest/highest element + * + * Caller also passes associated scan key, which will have its argument set to + * the lowest/highest array value in passing. 
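+ *
+ * For example, setting a SAOP array on "a = ANY('{1, 5, 9}')" to its
+ * highest element sets cur_elem to 2 and sk_argument to 9. A skip
+ * array has no elem_values[] to copy from; it is set to its MINVAL or
+ * MAXVAL sentinel instead (or to NULL, whenever NULL sorts at the
+ * requested end of the array's range).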
+ */ +static void +_bt_array_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + bool low_not_high) +{ + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (array->num_elems != -1) + { + /* set low or high element for SAOP array */ + int set_elem = 0; + + Assert(!(skey->sk_flags & SK_BT_SKIP)); + + if (!low_not_high) + set_elem = array->num_elems - 1; + + /* + * Just copy over array datum (only skip arrays require freeing and + * allocating memory for sk_argument) + */ + array->cur_elem = set_elem; + skey->sk_argument = array->elem_values[set_elem]; + + return; + } + + /* set low or high element for skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1); + + /* Free memory previously allocated for sk_argument if needed */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Reset flags */ + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR); + + if (array->null_elem && + (low_not_high == ((skey->sk_flags & SK_BT_NULLS_FIRST) != 0))) + { + /* Requested element (either lowest or highest) has the value NULL */ + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); + } + else if (low_not_high) + { + /* Setting array to lowest element (according to low_compare) */ + skey->sk_flags |= SK_BT_MINVAL; + } + else + { + /* Setting array to highest element (according to high_compare) */ + skey->sk_flags |= SK_BT_MAXVAL; + } +} + +/* + * _bt_skiparray_set_element() -- Set skip array scan key's sk_argument + * + * Caller passes set_elem_result returned by _bt_binsrch_skiparray_skey for + * caller's tupdatum/tupnull. + * + * We copy tupdatum/tupnull into skey's sk_argument iff set_elem_result == 0. + * Otherwise, we set skey to either the lowest or highest value that's within + * the range of caller's skip array (whichever is the best available match to + * tupdatum/tupnull that is still within the range of the skip array according + * to _bt_binsrch_skiparray_skey/set_elem_result). 
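+ *
+ * For example (hypothetical values), take a range skip array bounded by
+ * low_compare "a >= 1" and high_compare "a <= 10". A tupdatum of 7
+ * (set_elem_result == 0) is copied into sk_argument, whereas a tupdatum
+ * of 15 (set_elem_result == 1) sets the array to its highest element,
+ * represented by the MAXVAL sentinel.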
+ */ +static void +_bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + int32 set_elem_result, Datum tupdatum, bool tupnull) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (set_elem_result) + { + /* tupdatum/tupnull is out of the range of the skip array */ + Assert(!array->null_elem); + + _bt_array_set_low_or_high(rel, skey, array, set_elem_result < 0); + return; + } + + /* Advance skip array to tupdatum (or tupnull) value */ + if (unlikely(tupnull)) + { + _bt_skiparray_set_isnull(rel, skey, array); + return; + } + + /* Free memory previously allocated for sk_argument if needed */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* tupdatum becomes new sk_argument/new current element */ + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR); + skey->sk_argument = datumCopy(tupdatum, array->attbyval, array->attlen); +} + +/* + * _bt_skiparray_set_isnull() -- set skip array scan key to NULL + */ +static void +_bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(array->null_elem && !array->low_compare && !array->high_compare); + + /* Free memory previously allocated for sk_argument if needed */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* NULL becomes new sk_argument/new current element */ + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR); + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); +} + +/* + * _bt_compare_array_skey() -- apply array comparison function + * + * Compares caller's tuple attribute value to a scan key/array element. + * Helper function used during binary searches of SK_SEARCHARRAY arrays. + * + * This routine returns: + * <0 if tupdatum < arrdatum; + * 0 if tupdatum == arrdatum; + * >0 if tupdatum > arrdatum. + * + * This is essentially the same interface as _bt_compare: both functions + * compare the value that they're searching for to a binary search pivot. + * However, unlike _bt_compare, this function's "tuple argument" comes first, + * while its "array/scankey argument" comes second. +*/ +static inline int32 +_bt_compare_array_skey(FmgrInfo *orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur) +{ + int32 result = 0; + + Assert(cur->sk_strategy == BTEqualStrategyNumber); + Assert(!(cur->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))); + + if (tupnull) /* NULL tupdatum */ + { + if (cur->sk_flags & SK_ISNULL) + result = 0; /* NULL "=" NULL */ + else if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */ + { + if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * Like _bt_compare, we need to be careful of cross-type comparisons, + * so the left value has to be the value that came from an index tuple + */ + result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation, + tupdatum, arrdatum)); + + /* + * We flip the sign by following the obvious rule: flip whenever the + * column is a DESC column. 
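+ * (For example, hypothetically comparing tupdatum 7 against arrdatum
+ * 5 returns 1 for an ASC column, but -1 for a DESC column, where 7
+ * sorts before 5.)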
+ * + * _bt_compare does it the wrong way around (flip when *ASC*) in order + * to compensate for passing its orderproc arguments backwards. We + * don't need to play these games because we find it natural to pass + * tupdatum as the left value (and arrdatum as the right value). + */ + if (cur->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(result); + } + + return result; +} + +/* + * _bt_binsrch_array_skey() -- Binary search for next matching array key + * + * Returns an index to the first array element >= caller's tupdatum argument. + * This convention is more natural for forwards scan callers, but that can't + * really matter to backwards scan callers. Both callers require handling for + * the case where the match we return is < tupdatum, and symmetric handling + * for the case where our best match is > tupdatum. + * + * Also sets *set_elem_result to the result _bt_compare_array_skey returned + * when we used it to compare the matching array element to tupdatum/tupnull. + * + * cur_elem_trig indicates if array advancement was triggered by this array's + * scan key, and that the array is for a required scan key. We can apply this + * information to find the next matching array element in the current scan + * direction using far fewer comparisons (fewer on average, compared to naive + * binary search). This scheme takes advantage of an important property of + * required arrays: required arrays always advance in lockstep with the index + * scan's progress through the index's key space. + */ +int +_bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + int low_elem = 0, + mid_elem = -1, + high_elem = array->num_elems - 1, + result = 0; + Datum arrdatum; + + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(!(cur->sk_flags & SK_BT_SKIP)); + Assert(!(cur->sk_flags & SK_ISNULL)); /* SAOP arrays never have NULLs */ + Assert(cur->sk_strategy == BTEqualStrategyNumber); + + if (cur_elem_trig) + { + Assert(!ScanDirectionIsNoMovement(dir)); + Assert(cur->sk_flags & SK_BT_REQFWD); + + /* + * When the scan key that triggered array advancement is a required + * array scan key, it is now certain that the current array element + * (plus all prior elements relative to the current scan direction) + * cannot possibly be at or ahead of the corresponding tuple value. + * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which + * makes sure this is true as a condition of advancing the arrays.) + * + * This makes it safe to exclude array elements up to and including + * the former-current array element from our search. + * + * Separately, when array advancement was triggered by a required scan + * key, the array element immediately after the former-current element + * is often either an exact tupdatum match, or a "close by" near-match + * (a near-match tupdatum is one whose key space falls _between_ the + * former-current and new-current array elements). We'll detect both + * cases via an optimistic comparison of the new search lower bound + * (or new search upper bound in the case of backwards scans). 
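+ *
+ * For example (hypothetical values), when a forward scan's array
+ * "a = ANY('{1, 2, 3, 50}')" advances from its first element due to a
+ * tupdatum of 2, a single comparison against the new lower bound (the
+ * element 2) settles the search, avoiding a full binary search.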
+ */
+ if (ScanDirectionIsForward(dir))
+ {
+ low_elem = array->cur_elem + 1; /* old cur_elem exhausted */
+
+ /* Compare prospective new cur_elem (also the new lower bound) */
+ if (high_elem >= low_elem)
+ {
+ arrdatum = array->elem_values[low_elem];
+ result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+ arrdatum, cur);
+
+ if (result <= 0)
+ {
+ /* Optimistic comparison optimization worked out */
+ *set_elem_result = result;
+ return low_elem;
+ }
+ mid_elem = low_elem;
+ low_elem++; /* this cur_elem exhausted, too */
+ }
+
+ if (high_elem < low_elem)
+ {
+ /* Caller needs to perform "beyond end" array advancement */
+ *set_elem_result = 1;
+ return high_elem;
+ }
+ }
+ else
+ {
+ high_elem = array->cur_elem - 1; /* old cur_elem exhausted */
+
+ /* Compare prospective new cur_elem (also the new upper bound) */
+ if (high_elem >= low_elem)
+ {
+ arrdatum = array->elem_values[high_elem];
+ result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+ arrdatum, cur);
+
+ if (result >= 0)
+ {
+ /* Optimistic comparison optimization worked out */
+ *set_elem_result = result;
+ return high_elem;
+ }
+ mid_elem = high_elem;
+ high_elem--; /* this cur_elem exhausted, too */
+ }
+
+ if (high_elem < low_elem)
+ {
+ /* Caller needs to perform "beyond end" array advancement */
+ *set_elem_result = -1;
+ return low_elem;
+ }
+ }
+ }
+
+ while (high_elem > low_elem)
+ {
+ mid_elem = low_elem + ((high_elem - low_elem) / 2);
+ arrdatum = array->elem_values[mid_elem];
+
+ result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+ arrdatum, cur);
+
+ if (result == 0)
+ {
+ /*
+ * It's safe to quit as soon as we see an equal array element.
+ * This often saves an extra comparison or two...
+ */
+ low_elem = mid_elem;
+ break;
+ }
+
+ if (result > 0)
+ low_elem = mid_elem + 1;
+ else
+ high_elem = mid_elem;
+ }
+
+ /*
+ * ...but our caller also cares about how its searched-for tuple datum
+ * compares to the low_elem datum. Must always set *set_elem_result with
+ * the result of that comparison specifically.
+ */
+ if (low_elem != mid_elem)
+ result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+ array->elem_values[low_elem], cur);
+
+ *set_elem_result = result;
+
+ return low_elem;
+}
+
+/*
+ * _bt_binsrch_skiparray_skey() -- "Binary search" within a skip array
+ *
+ * Does not return an index into the array, since skip arrays don't really
+ * contain elements (they generate their array elements procedurally instead).
+ * Our interface matches that of _bt_binsrch_array_skey in every other way.
+ *
+ * Sets *set_elem_result just like _bt_binsrch_array_skey would with a true
+ * array. The value 0 indicates that tupdatum/tupnull is within the range of
+ * the skip array. We return -1 when tupdatum/tupnull is lower than any value
+ * within the range of the array, and 1 when it is higher than every value.
+ * Caller should pass *set_elem_result to _bt_skiparray_set_element to advance
+ * the array.
+ *
+ * cur_elem_trig indicates if array advancement was triggered by this array's
+ * scan key. We use this to optimize away comparisons that are known by our
+ * caller to be unnecessary from context, just like _bt_binsrch_array_skey.
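+ *
+ * For example (hypothetical values), a range skip array bounded by
+ * low_compare "a > 10" and high_compare "a < 20" sets *set_elem_result
+ * to -1 given a tupdatum of 5, to 0 given 15, and to 1 given 25.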
+ */ +static void +_bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_flags & SK_BT_REQFWD); + Assert(array->num_elems == -1); + Assert(!ScanDirectionIsNoMovement(dir)); + + if (array->null_elem) + { + Assert(!array->low_compare && !array->high_compare); + + *set_elem_result = 0; + return; + } + + if (tupnull) /* NULL tupdatum */ + { + if (cur->sk_flags & SK_BT_NULLS_FIRST) + *set_elem_result = -1; /* NULL "<" NOT_NULL */ + else + *set_elem_result = 1; /* NULL ">" NOT_NULL */ + return; + } + + /* + * Array inequalities determine whether tupdatum is within the range of + * caller's skip array + */ + *set_elem_result = 0; + if (ScanDirectionIsForward(dir)) + { + /* + * Evaluate low_compare first (unless cur_elem_trig tells us that it + * cannot possibly fail to be satisfied), then evaluate high_compare + */ + if (!cur_elem_trig && array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))) + *set_elem_result = -1; + else if (array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))) + *set_elem_result = 1; + } + else + { + /* + * Evaluate high_compare first (unless cur_elem_trig tells us that it + * cannot possibly fail to be satisfied), then evaluate low_compare + */ + if (!cur_elem_trig && array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))) + *set_elem_result = 1; + else if (array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))) + *set_elem_result = -1; + } + + /* + * Assert that any keys that were assumed to be satisfied already (due to + * caller passing cur_elem_trig=true) really are satisfied as expected + */ +#ifdef USE_ASSERT_CHECKING + if (cur_elem_trig) + { + if (ScanDirectionIsForward(dir) && array->low_compare) + Assert(DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))); + + if (ScanDirectionIsBackward(dir) && array->high_compare) + Assert(DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))); + } +#endif +} + +#ifdef USE_ASSERT_CHECKING +/* + * Verify that the scan's "so->keyData[]" scan keys are in agreement with + * its array key state + */ +static bool +_bt_verify_keys_with_arraykeys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int last_sk_attno = InvalidAttrNumber, + arrayidx = 0; + bool nonrequiredseen = false; + + if (!so->qual_ok) + return false; + + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array; + + if (cur->sk_strategy != BTEqualStrategyNumber || + !(cur->sk_flags & SK_SEARCHARRAY)) + continue; + + array = &so->arrayKeys[arrayidx++]; + if (array->scan_key != ikey) + return false; + + if (array->num_elems == 0 || array->num_elems < -1) + return false; + + if (array->num_elems != -1 && + cur->sk_argument != 
array->elem_values[array->cur_elem]) + return false; + if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) + { + if (last_sk_attno > cur->sk_attno) + return false; + if (nonrequiredseen) + return false; + } + else + nonrequiredseen = true; + + last_sk_attno = cur->sk_attno; + } + + if (arrayidx != so->numArrayKeys) + return false; + + return true; +} +#endif diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 765659887af73..b44252319357e 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -93,6 +93,7 @@ typedef struct BTParallelScanDescData typedef struct BTParallelScanDescData *BTParallelScanDesc; +static bool _bt_start_prim_scan(IndexScanDesc scan); static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so); static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, @@ -228,6 +229,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) BTScanOpaque so = (BTScanOpaque) scan->opaque; bool res; + Assert(scan->heapRelation != NULL); + /* btree indexes are never lossy */ scan->xs_recheck = false; @@ -258,8 +261,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * just forget any excess entries. */ if (so->killedItems == NULL) - so->killedItems = (int *) - palloc(MaxTIDsPerBTreePage * sizeof(int)); + so->killedItems = palloc_array(int, MaxTIDsPerBTreePage); if (so->numKilled < MaxTIDsPerBTreePage) so->killedItems[so->numKilled++] = so->currPos.itemIndex; } @@ -274,7 +276,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) if (res) break; /* ... otherwise see if we need another primitive index scan */ - } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir)); + } while (so->numArrayKeys && _bt_start_prim_scan(scan)); return res; } @@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) int64 ntids = 0; ItemPointer heapTid; + Assert(scan->heapRelation == NULL); + /* Each loop iteration performs another primitive index scan */ do { @@ -320,7 +324,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) } } /* Now see if we need another primitive index scan */ - } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection)); + } while (so->numArrayKeys && _bt_start_prim_scan(scan)); return ntids; } @@ -341,7 +345,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) scan = RelationGetIndexScan(rel, nkeys, norderbys); /* allocate private workspace */ - so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); + so = palloc_object(BTScanOpaqueData); BTScanPosInvalidate(so->currPos); BTScanPosInvalidate(so->markPos); if (scan->numberOfKeys > 0) @@ -393,6 +397,34 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, BTScanPosInvalidate(so->currPos); } + /* + * We prefer to eagerly drop leaf page pins before btgettuple returns. + * This avoids making VACUUM wait to acquire a cleanup lock on the page. + * + * We cannot safely drop leaf page pins during index-only scans due to a + * race condition involving VACUUM setting pages all-visible in the VM. + * It's also unsafe for plain index scans that use a non-MVCC snapshot. + * + * When we drop pins eagerly, the mechanism that marks so->killedItems[] + * index tuples LP_DEAD has to deal with concurrent TID recycling races. + * The scheme used to detect unsafe TID recycling won't work when scanning + * unlogged relations (since it involves saving an affected page's LSN). 
+ * Opt out of eager pin dropping during unlogged relation scans for now + * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting). + * + * Also opt out of dropping leaf page pins eagerly during bitmap scans. + * Pins cannot be held for more than an instant during bitmap scans either + * way, so we might as well avoid wasting cycles on acquiring page LSNs. + * + * See nbtree/README section on making concurrent TID recycling safe. + * + * Note: so->dropPin should never change across rescans. + */ + so->dropPin = (!scan->xs_want_itup && + IsMVCCSnapshot(scan->xs_snapshot) && + RelationNeedsWAL(scan->indexRelation) && + scan->heapRelation != NULL); + so->markItemIndex = -1; so->needPrimScan = false; so->scanBehind = false; @@ -405,16 +437,6 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, * not already done in a previous rescan call. To save on palloc * overhead, both workspaces are allocated as one palloc block; only this * function and btendscan know that. - * - * NOTE: this data structure also makes it safe to return data from a - * "name" column, even though btree name_ops uses an underlying storage - * datatype of cstring. The risk there is that "name" is supposed to be - * padded to NAMEDATALEN, but the actual index tuple is probably shorter. - * However, since we only return data out of tuples sitting in the - * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some - * data out of the markTuples array --- running off the end of memory for - * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats - * adding special-case treatment for name_ops elsewhere. */ if (scan->xs_want_itup && so->currTuples == NULL) { @@ -622,6 +644,75 @@ btestimateparallelscan(Relation rel, int nkeys, int norderbys) return estnbtreeshared; } +/* + * _bt_start_prim_scan() -- start scheduled primitive index scan? + * + * Returns true if _bt_checkkeys scheduled another primitive index scan, just + * as the last one ended. Otherwise returns false, indicating that the array + * keys are now fully exhausted. + * + * Only call here during scans with one or more equality type array scan keys, + * after _bt_first or _bt_next return false. + */ +static bool +_bt_start_prim_scan(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + Assert(so->numArrayKeys); + + so->scanBehind = so->oppositeDirCheck = false; /* reset */ + + /* + * Array keys are advanced within _bt_checkkeys when the scan reaches the + * leaf level (more precisely, they're advanced when the scan reaches the + * end of each distinct set of array elements). This process avoids + * repeat access to leaf pages (across multiple primitive index scans) by + * advancing the scan's array keys when it allows the primitive index scan + * to find nearby matching tuples (or when it eliminates ranges of array + * key space that can't possibly be satisfied by any index tuple). + * + * _bt_checkkeys sets a simple flag variable to schedule another primitive + * index scan. The flag tells us what to do. + * + * We cannot rely on _bt_first always reaching _bt_checkkeys. There are + * various cases where that won't happen. For example, if the index is + * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys. + * We also don't expect a call to _bt_checkkeys during searches for a + * non-existent value that happens to be lower/higher than any existing + * value in the index. 
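+ * (Hypothetically, a forward scan whose current array keys sort above
+ * the index's highest key descends to a position beyond the last item
+ * on the rightmost leaf page, leaving no tuples for _bt_checkkeys.)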
+ * + * We don't require special handling for these cases -- we don't need to + * be explicitly instructed to _not_ perform another primitive index scan. + * It's up to code under the control of _bt_first to always set the flag + * when another primitive index scan will be required. + * + * This works correctly, even with the tricky cases listed above, which + * all involve access to leaf pages "near the boundaries of the key space" + * (whether it's from a leftmost/rightmost page, or an imaginary empty + * leaf root page). If _bt_checkkeys cannot be reached by a primitive + * index scan for one set of array keys, then it also won't be reached for + * any later set ("later" in terms of the direction that we scan the index + * and advance the arrays). The array keys won't have advanced in these + * cases, but that's the correct behavior (even _bt_advance_array_keys + * won't always advance the arrays at the point they become "exhausted"). + */ + if (so->needPrimScan) + { + /* + * Flag was set -- must call _bt_first again, which will reset the + * scan's needPrimScan flag + */ + return true; + } + + /* The top-level index scan ran out of tuples in this scan direction */ + if (scan->parallel_scan != NULL) + _bt_parallel_done(scan); + + return false; +} + /* * _bt_parallel_serialize_arrays() -- Serialize parallel array state. * @@ -1038,7 +1129,7 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* allocate stats if first time through, else re-use existing struct */ if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); /* Establish the vacuum cycle ID to use for this scan */ /* The ENSURE stuff ensures we clean up shared memory on failure */ @@ -1099,7 +1190,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) * We handle the problem by making num_index_tuples an estimate in * cleanup-only case. 
*/ - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); btvacuumscan(info, stats, NULL, NULL, 0); stats->estimated_count = true; } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index fe9a3886913d8..7a416d60cea39 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,23 +25,13 @@ #include "utils/rel.h" -static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); +static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so); static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); -static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, - OffsetNumber offnum, bool firstpage); -static void _bt_saveitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, IndexTuple itup); -static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, ItemPointer heapTid, - IndexTuple itup); -static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset); static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, @@ -57,24 +47,29 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); /* * _bt_drop_lock_and_maybe_pin() * - * Unlock the buffer; and if it is safe to release the pin, do that, too. - * This will prevent vacuum from stalling in a blocked state trying to read a - * page when a cursor is sitting on it. - * - * See nbtree/README section on making concurrent TID recycling safe. + * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too. + * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock. */ -static void -_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) +static inline void +_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so) { - _bt_unlockbuf(scan->indexRelation, sp->buf); - - if (IsMVCCSnapshot(scan->xs_snapshot) && - RelationNeedsWAL(scan->indexRelation) && - !scan->xs_want_itup) + if (!so->dropPin) { - ReleaseBuffer(sp->buf); - sp->buf = InvalidBuffer; + /* Just drop the lock (not the pin) */ + _bt_unlockbuf(rel, so->currPos.buf); + return; } + + /* + * Drop both the lock and the pin. + * + * Have to set so->currPos.lsn so that _bt_killitems has a way to detect + * when concurrent heap TID recycling by VACUUM might have taken place. + */ + Assert(RelationNeedsWAL(rel)); + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; } /* @@ -164,7 +159,7 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, * page one level down, it usually ends up inserting a new pivot * tuple/downlink immediately after the location recorded here. 
*/ - new_stack = (BTStack) palloc(sizeof(BTStackData)); + new_stack = (BTStack) palloc_object(BTStackData); new_stack->bts_blkno = BufferGetBlockNumber(*bufP); new_stack->bts_offset = offnum; new_stack->bts_parent = stack_in; @@ -866,8 +861,8 @@ _bt_compare(Relation rel, * if backwards scan, the last item) in the tree that satisfies the * qualifications in the scan key. On success exit, data about the * matching tuple(s) on the page has been loaded into so->currPos. We'll - * drop all locks and hold onto a pin on page's buffer, except when - * _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM. + * drop all locks and hold onto a pin on page's buffer, except during + * so->dropPin scans, when we drop both the lock and the pin. * _bt_returnitem sets the next item to return to scan on success exit. * * If there are no matching items in the index, we return false, with no @@ -887,9 +882,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) OffsetNumber offnum; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; - ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkey; int keysz = 0; - StrategyNumber strat_total; + StrategyNumber strat_total = InvalidStrategy; BlockNumber blkno = InvalidBlockNumber, lastcurrblkno; @@ -955,46 +950,51 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /*---------- * Examine the scan keys to discover where we need to start the scan. + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. The final + * startKeys[] entry's strategy is set in strat_total. (Actually, there + * are a couple of cases where we force a less/more restrictive strategy.) * - * We want to identify the keys that can be used as starting boundaries; - * these are =, >, or >= keys for a forward scan or =, <, <= keys for - * a backwards scan. We can use keys for multiple attributes so long as - * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept - * a > or < boundary or find an attribute with no boundary (which can be - * thought of as the same as "> -infinity"), we can't use keys for any - * attributes to its right, because it would break our simplistic notion - * of what initial positioning strategy to use. + * We must use the key that was marked required (in the direction opposite + * our own scan's) during preprocessing. Each index attribute can only + * have one such required key. In general, the keys that we use to find + * an initial position when scanning forwards are the same keys that end + * the scan on the leaf level when scanning backwards (and vice-versa). * * When the scan keys include cross-type operators, _bt_preprocess_keys - * may not be able to eliminate redundant keys; in such cases we will - * arbitrarily pick a usable one for each attribute. This is correct - * but possibly not optimal behavior. (For example, with keys like - * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when - * x=5 would be more efficient.) Since the situation only arises given - * a poorly-worded query plus an incomplete opfamily, live with it. + * may not be able to eliminate redundant keys; in such cases it will + * arbitrarily pick a usable key for each attribute (and scan direction), + * ensuring that there is no more than one key required in each direction. + * We stop considering further keys once we reach the first nonrequired + * key (which must come after all required keys), so this can't affect us. 
* - * When both equality and inequality keys appear for a single attribute - * (again, only possible when cross-type operators appear), we *must* - * select one of the equality keys for the starting point, because - * _bt_checkkeys() will stop the scan as soon as an equality qual fails. - * For example, if we have keys like "x >= 4 AND x = 10" and we elect to - * start at x=4, we will fail and stop before reaching x=10. If multiple - * equality quals survive preprocessing, however, it doesn't matter which - * one we use --- by definition, they are either redundant or - * contradictory. + * The required keys that we use as starting boundaries have to be =, >, + * or >= keys for a forward scan or =, <, <= keys for a backwards scan. + * We can use keys for multiple attributes so long as the prior attributes + * had only =, >= (resp. =, <=) keys. These rules are very similar to the + * rules that preprocessing used to determine which keys to mark required. + * We cannot always use every required key as a positioning key, though. + * Skip arrays necessitate independently applying our own rules here. + * Skip arrays are always generally considered = array keys, but we'll + * nevertheless treat them as inequalities at certain points of the scan. + * When that happens, it _might_ have implications for the number of + * required keys that we can safely use for initial positioning purposes. * - * In practice we rarely see any "attribute boundary key gaps" here. - * Preprocessing can usually backfill skip array keys for any attributes - * that were omitted from the original scan->keyData[] input keys. All - * array keys are always considered = keys, but we'll sometimes need to - * treat the current key value as if we were using an inequality strategy. - * This happens with range skip arrays, which store inequality keys in the - * array's low_compare/high_compare fields (used to find the first/last - * set of matches, when = key will lack a usable sk_argument value). - * These are always preferred over any redundant "standard" inequality - * keys on the same column (per the usual rule about preferring = keys). - * Note also that any column with an = skip array key can never have an - * additional, contradictory = key. + * For example, a forward scan with a skip array on its leading attribute + * (with no low_compare/high_compare) will have at least two required scan + * keys, but we won't use any of them as boundary keys during the scan's + * initial call here. Our positioning key during the first call here can + * be thought of as representing "> -infinity". Similarly, if such a skip + * array's low_compare is "a > 'foo'", then we position using "a > 'foo'" + * during the scan's initial call here; a lower-order key such as "b = 42" + * can't be used until the "a" array advances beyond MINVAL/low_compare. + * + * On the other hand, if such a skip array's low_compare was "a >= 'foo'", + * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here. + * A subsequent call here might have us use "a = 'fop' AND b = 42". Note + * that we treat = and >= as equivalent when scanning forwards (just as we + * treat = and <= as equivalent when scanning backwards). We effectively + * do the same thing (though with a distinct "a" element/value) each time. * * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP * array keys whose array is "null_elem=true") imply a NOT NULL qualifier. 
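[Reviewer note, not part of the patch: the boundary-key rules restated in the
comment above are easier to follow against a standalone model. The sketch
below mirrors only the strategy rules from the loop this patch goes on to
rewrite: =, >=, and > keys position forward scans, =, <=, and < keys position
backward scans, and a strict inequality is always the last key used. The
strategy-number constants match src/include/access/stratnum.h; everything
else (MockKey, the helper names, the example quals) is illustrative, and skip
arrays, row compares, and NOT NULL deduction are deliberately ignored.]

#include <stdbool.h>
#include <stdio.h>

#define BTLessStrategyNumber			1
#define BTLessEqualStrategyNumber		2
#define BTEqualStrategyNumber			3
#define BTGreaterEqualStrategyNumber	4
#define BTGreaterStrategyNumber			5

typedef struct MockKey
{
	int			attno;			/* index attribute, 1-based */
	int			strategy;		/* BT*StrategyNumber for this key */
} MockKey;

/*
 * Can 'key' position the start of a scan in the given direction?  =, >=,
 * and > work for forward scans; =, <=, and < work for backward scans.
 * (An inequality in the "wrong" direction can still imply a NOT NULL
 * constraint, which the real code tracks separately via impliesNN.)
 */
static bool
usable_boundary(const MockKey *key, bool forward)
{
	switch (key->strategy)
	{
		case BTEqualStrategyNumber:
			return true;
		case BTGreaterEqualStrategyNumber:
		case BTGreaterStrategyNumber:
			return forward;
		case BTLessEqualStrategyNumber:
		case BTLessStrategyNumber:
			return !forward;
	}
	return false;
}

/*
 * Count how many required keys (one per attribute, in attribute order) can
 * be used for initial positioning: keep adding keys while each is a usable
 * boundary, and stop just after adding a strict inequality (> or <), since
 * lower-order keys can't sharpen a strict bound.
 */
static int
count_positioning_keys(const MockKey *keys, int nkeys, bool forward)
{
	int			used = 0;

	for (int i = 0; i < nkeys; i++)
	{
		if (!usable_boundary(&keys[i], forward))
			break;				/* no usable boundary key for this attr */
		used++;
		if (keys[i].strategy == BTGreaterStrategyNumber ||
			keys[i].strategy == BTLessStrategyNumber)
			break;				/* strict inequality ends the list */
	}
	return used;
}

int
main(void)
{
	/* "WHERE a >= 1 AND b = 42", scanning forwards: both keys are used */
	MockKey		q1[] = {{1, BTGreaterEqualStrategyNumber}, {2, BTEqualStrategyNumber}};

	/* "WHERE a > 1 AND b = 42", scanning forwards: "b = 42" can't be used */
	MockKey		q2[] = {{1, BTGreaterStrategyNumber}, {2, BTEqualStrategyNumber}};

	printf("a >= 1 AND b = 42: %d positioning key(s)\n",
		   count_positioning_keys(q1, 2, true));
	printf("a > 1 AND b = 42:  %d positioning key(s)\n",
		   count_positioning_keys(q2, 2, true));
	return 0;
}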
@@ -1006,41 +1006,38 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * traversing a lot of null entries at the start of the scan. * * In this loop, row-comparison keys are treated the same as keys on their - * first (leftmost) columns. We'll add on lower-order columns of the row - * comparison below, if possible. - * - * The selected scan keys (at most one per index column) are remembered by - * storing their addresses into the local startKeys[] array. + * first (leftmost) columns. We'll add all lower-order columns of the row + * comparison that were marked required during preprocessing below. * - * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start - * the next primitive index scan (for scans with array keys) based in part - * on an understanding of how it'll enable us to reposition the scan. - * They're directly aware of how we'll sometimes cons up an explicit - * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a - * symmetric "deduce NOT NULL" rule of their own. This allows top-level - * scans to skip large groups of NULLs through repeated deductions about - * key strictness (for a required inequality key) and whether NULLs in the - * key's index column are stored last or first (relative to non-NULLs). + * _bt_advance_array_keys needs to know exactly how we'll reposition the + * scan (should it opt to schedule another primitive index scan). It is + * critical that primscans only be scheduled when they'll definitely make + * some useful progress. _bt_advance_array_keys does this by calling + * _bt_checkkeys routines that report whether a tuple is past the end of + * matches for the scan's keys (given the scan's current array elements). + * If the page's final tuple is "after the end of matches" for a scan that + * uses the *opposite* scan direction, then it must follow that it's also + * "before the start of matches" for the actual current scan direction. + * It is therefore essential that all of our initial positioning rules are + * symmetric with _bt_checkkeys's corresponding continuescan=false rule. * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might * need to be kept in sync. *---------- */ - strat_total = BTEqualStrategyNumber; if (so->numberOfKeys > 0) { AttrNumber curattr; - ScanKey chosen; + ScanKey bkey; ScanKey impliesNN; ScanKey cur; /* - * chosen is the so-far-chosen key for the current attribute, if any. - * We don't cast the decision in stone until we reach keys for the - * next attribute. + * bkey will be set to the key that preprocessing left behind as the + * boundary key for this attribute, in this scan direction (if any) */ cur = so->keyData; curattr = 1; - chosen = NULL; + bkey = NULL; /* Also remember any scankey that implies a NOT NULL constraint */ impliesNN = NULL; @@ -1053,23 +1050,29 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { if (i >= so->numberOfKeys || cur->sk_attno != curattr) { + /* Done looking for the curattr boundary key */ + Assert(bkey == NULL || + (bkey->sk_attno == curattr && + (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))); + Assert(impliesNN == NULL || + (impliesNN->sk_attno == curattr && + (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))); + /* - * Done looking at keys for curattr. - * * If this is a scan key for a skip array whose current * element is MINVAL, choose low_compare (when scanning * backwards it'll be MAXVAL, and we'll choose high_compare). 
* - * Note: if the array's low_compare key makes 'chosen' NULL, + * Note: if the array's low_compare key makes 'bkey' NULL, * then we behave as if the array's first element is -inf, * except when !array->null_elem implies a usable NOT NULL * constraint. */ - if (chosen != NULL && - (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) + if (bkey != NULL && + (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) { - int ikey = chosen - so->keyData; - ScanKey skipequalitykey = chosen; + int ikey = bkey - so->keyData; + ScanKey skipequalitykey = bkey; BTArrayKeyInfo *array = NULL; for (int arridx = 0; arridx < so->numArrayKeys; arridx++) @@ -1082,42 +1085,41 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (ScanDirectionIsForward(dir)) { Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL)); - chosen = array->low_compare; + bkey = array->low_compare; } else { Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL)); - chosen = array->high_compare; + bkey = array->high_compare; } - Assert(chosen == NULL || - chosen->sk_attno == skipequalitykey->sk_attno); + Assert(bkey == NULL || + bkey->sk_attno == skipequalitykey->sk_attno); if (!array->null_elem) impliesNN = skipequalitykey; else - Assert(chosen == NULL && impliesNN == NULL); + Assert(bkey == NULL && impliesNN == NULL); } /* * If we didn't find a usable boundary key, see if we can * deduce a NOT NULL key */ - if (chosen == NULL && impliesNN != NULL && + if (bkey == NULL && impliesNN != NULL && ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? ScanDirectionIsForward(dir) : ScanDirectionIsBackward(dir))) { - /* Yes, so build the key in notnullkeys[keysz] */ - chosen = ¬nullkeys[keysz]; - ScanKeyEntryInitialize(chosen, + /* Final startKeys[] entry will be deduced NOT NULL key */ + bkey = ¬nullkey; + ScanKeyEntryInitialize(bkey, (SK_SEARCHNOTNULL | SK_ISNULL | (impliesNN->sk_flags & (SK_BT_DESC | SK_BT_NULLS_FIRST))), curattr, - ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? - BTGreaterStrategyNumber : - BTLessStrategyNumber), + ScanDirectionIsForward(dir) ? + BTGreaterStrategyNumber : BTLessStrategyNumber, InvalidOid, InvalidOid, InvalidOid, @@ -1125,12 +1127,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* - * If we still didn't find a usable boundary key, quit; else - * save the boundary key pointer in startKeys. + * If preprocessing didn't leave a usable boundary key, quit; + * else save the boundary key pointer in startKeys[] */ - if (chosen == NULL) + if (bkey == NULL) break; - startKeys[keysz++] = chosen; + startKeys[keysz++] = bkey; /* * We can only consider adding more boundary keys when the one @@ -1138,7 +1140,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * (during backwards scans we can only do so when the key that * we just added to startKeys[] uses the = or <= strategy) */ - strat_total = chosen->sk_strategy; + strat_total = bkey->sk_strategy; if (strat_total == BTGreaterStrategyNumber || strat_total == BTLessStrategyNumber) break; @@ -1149,19 +1151,19 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * make strat_total > or < (and stop adding boundary keys). * This can only happen with opclasses that lack skip support. 
*/ - if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) + if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) { - Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(bkey->sk_flags & SK_BT_SKIP); Assert(strat_total == BTEqualStrategyNumber); if (ScanDirectionIsForward(dir)) { - Assert(!(chosen->sk_flags & SK_BT_PRIOR)); + Assert(!(bkey->sk_flags & SK_BT_PRIOR)); strat_total = BTGreaterStrategyNumber; } else { - Assert(!(chosen->sk_flags & SK_BT_NEXT)); + Assert(!(bkey->sk_flags & SK_BT_NEXT)); strat_total = BTLessStrategyNumber; } @@ -1175,24 +1177,30 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* * Done if that was the last scan key output by preprocessing. - * Also done if there is a gap index attribute that lacks a - * usable key (only possible when preprocessing was unable to - * generate a skip array key to "fill in the gap"). + * Also done if we've now examined all keys marked required. */ if (i >= so->numberOfKeys || - cur->sk_attno != curattr + 1) + !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) break; /* * Reset for next attr. */ + Assert(cur->sk_attno == curattr + 1); curattr = cur->sk_attno; - chosen = NULL; + bkey = NULL; impliesNN = NULL; } /* - * Can we use this key as a starting boundary for this attr? + * If we've located the starting boundary key for curattr, we have + * no interest in curattr's other required key + */ + if (bkey != NULL) + continue; + + /* + * Is this key the starting boundary key for curattr? * * If not, does it imply a NOT NULL constraint? (Because * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, @@ -1202,27 +1210,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { case BTLessStrategyNumber: case BTLessEqualStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsBackward(dir)) - chosen = cur; - else - impliesNN = cur; - } + if (ScanDirectionIsBackward(dir)) + bkey = cur; + else if (impliesNN == NULL) + impliesNN = cur; break; case BTEqualStrategyNumber: - /* override any non-equality choice */ - chosen = cur; + bkey = cur; break; case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsForward(dir)) - chosen = cur; - else - impliesNN = cur; - } + if (ScanDirectionIsForward(dir)) + bkey = cur; + else if (impliesNN == NULL) + impliesNN = cur; break; } } @@ -1248,16 +1249,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Assert(keysz <= INDEX_MAX_KEYS); for (int i = 0; i < keysz; i++) { - ScanKey cur = startKeys[i]; + ScanKey bkey = startKeys[i]; - Assert(cur->sk_attno == i + 1); + Assert(bkey->sk_attno == i + 1); - if (cur->sk_flags & SK_ROW_HEADER) + if (bkey->sk_flags & SK_ROW_HEADER) { /* * Row comparison header: look to the first row member instead */ - ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument); + bool loosen_strat = false, + tighten_strat = false; /* * Cannot be a NULL in the first row member: _bt_preprocess_keys @@ -1265,9 +1268,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * ever getting this far */ Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(subkey->sk_attno == cur->sk_attno); + Assert(subkey->sk_attno == bkey->sk_attno); Assert(!(subkey->sk_flags & SK_ISNULL)); + /* + * This is either a > or >= key (during backwards scans it is + * either < or <=) that was marked required during preprocessing. + * Later so->keyData[] keys can't have been marked required, so + * our row compare header key must be the final startKeys[] entry. 
+ */ + Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)); + Assert(subkey->sk_strategy == bkey->sk_strategy); + Assert(subkey->sk_strategy == strat_total); + Assert(i == keysz - 1); + /* * The member scankeys are already in insertion format (ie, they * have sk_func = 3-way-comparison function) @@ -1275,112 +1289,141 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); /* - * If the row comparison is the last positioning key we accepted, - * try to add additional keys from the lower-order row members. - * (If we accepted independent conditions on additional index - * columns, we use those instead --- doesn't seem worth trying to - * determine which is more restrictive.) Note that this is OK - * even if the row comparison is of ">" or "<" type, because the - * condition applied to all but the last row member is effectively - * ">=" or "<=", and so the extra keys don't break the positioning - * scheme. But, by the same token, if we aren't able to use all - * the row members, then the part of the row comparison that we - * did use has to be treated as just a ">=" or "<=" condition, and - * so we'd better adjust strat_total accordingly. + * Now look to later row compare members. + * + * If there's an "index attribute gap" between two row compare + * members, the second member won't have been marked required, and + * so can't be used as a starting boundary key here. The part of + * the row comparison that we do still use has to be treated as a + * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)" + * with an omitted intervening index attribute "b" will use an + * insertion scan key "a >= 1". Even the first "a = 1" tuple on + * the leaf level might satisfy the row compare qual. + * + * We're able to use a _more_ restrictive strategy when we reach a + * NULL row compare member, since they're always unsatisfiable. + * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an + * insertion scan key "a > 1". All tuples where "a = 1" cannot + * possibly satisfy the row compare qual, so this is safe. */ - if (i == keysz - 1) + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) { - bool used_all_subkeys = false; + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(!(subkey->sk_flags & SK_ROW_END)); - for (;;) + if (subkey->sk_flags & SK_ISNULL) { - subkey++; - Assert(subkey->sk_flags & SK_ROW_MEMBER); - if (subkey->sk_attno != keysz + 1) - break; /* out-of-sequence, can't use it */ - if (subkey->sk_strategy != cur->sk_strategy) - break; /* wrong direction, can't use it */ - if (subkey->sk_flags & SK_ISNULL) - break; /* can't use null keys */ - Assert(keysz < INDEX_MAX_KEYS); - memcpy(inskey.scankeys + keysz, subkey, - sizeof(ScanKeyData)); - keysz++; - if (subkey->sk_flags & SK_ROW_END) - { - used_all_subkeys = true; - break; - } + /* + * NULL member key, can only use earlier keys. + * + * We deliberately avoid checking if this key is marked + * required. All earlier keys are required, and this key + * is unsatisfiable either way, so we can't miss anything. 
+ */ + tighten_strat = true; + break; } - if (!used_all_subkeys) + + if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) { - switch (strat_total) - { - case BTLessStrategyNumber: - strat_total = BTLessEqualStrategyNumber; - break; - case BTGreaterStrategyNumber: - strat_total = BTGreaterEqualStrategyNumber; - break; - } + /* nonrequired member key, can only use earlier keys */ + loosen_strat = true; + break; } - break; /* done with outer loop */ + + Assert(subkey->sk_attno == keysz + 1); + Assert(subkey->sk_strategy == bkey->sk_strategy); + Assert(keysz < INDEX_MAX_KEYS); + + memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData)); + keysz++; + + if (subkey->sk_flags & SK_ROW_END) + break; } - } - else - { - /* - * Ordinary comparison key. Transform the search-style scan key - * to an insertion scan key by replacing the sk_func with the - * appropriate btree comparison function. - * - * If scankey operator is not a cross-type comparison, we can use - * the cached comparison function; otherwise gotta look it up in - * the catalogs. (That can't lead to infinite recursion, since no - * indexscan initiated by syscache lookup will use cross-data-type - * operators.) - * - * We support the convention that sk_subtype == InvalidOid means - * the opclass input type; this is a hack to simplify life for - * ScanKeyInit(). - */ - if (cur->sk_subtype == rel->rd_opcintype[i] || - cur->sk_subtype == InvalidOid) + Assert(!(loosen_strat && tighten_strat)); + if (loosen_strat) { - FmgrInfo *procinfo; - - procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); - ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - procinfo, - cur->sk_argument); + /* Use less restrictive strategy (and fewer member keys) */ + switch (strat_total) + { + case BTLessStrategyNumber: + strat_total = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + strat_total = BTGreaterEqualStrategyNumber; + break; + } } - else + if (tighten_strat) { - RegProcedure cmp_proc; - - cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], - rel->rd_opcintype[i], - cur->sk_subtype, - BTORDER_PROC); - if (!RegProcedureIsValid(cmp_proc)) - elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", - BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, - cur->sk_attno, RelationGetRelationName(rel)); - ScanKeyEntryInitialize(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - cmp_proc, - cur->sk_argument); + /* Use more restrictive strategy (and fewer member keys) */ + switch (strat_total) + { + case BTLessEqualStrategyNumber: + strat_total = BTLessStrategyNumber; + break; + case BTGreaterEqualStrategyNumber: + strat_total = BTGreaterStrategyNumber; + break; + } } + + /* Done (row compare header key is always last startKeys[] key) */ + break; + } + + /* + * Ordinary comparison key/search-style key. + * + * Transform the search-style scan key to an insertion scan key by + * replacing the sk_func with the appropriate btree 3-way-comparison + * function. + * + * If scankey operator is not a cross-type comparison, we can use the + * cached comparison function; otherwise gotta look it up in the + * catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) 
+ * + * We support the convention that sk_subtype == InvalidOid means the + * opclass input type; this hack simplifies life for ScanKeyInit(). + */ + if (bkey->sk_subtype == rel->rd_opcintype[i] || + bkey->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, + bkey->sk_flags, + bkey->sk_attno, + InvalidStrategy, + bkey->sk_subtype, + bkey->sk_collation, + procinfo, + bkey->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + bkey->sk_subtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype, + bkey->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey.scankeys + i, + bkey->sk_flags, + bkey->sk_attno, + InvalidStrategy, + bkey->sk_subtype, + bkey->sk_collation, + cmp_proc, + bkey->sk_argument); } } @@ -1469,6 +1512,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(so->currPos.buf)) { + Assert(!so->needPrimScan); + /* * We only get here if the index is completely empty. Lock relation * because nothing finer to lock exists. Without a buffer lock, it's @@ -1487,7 +1532,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(so->currPos.buf)) { - Assert(!so->needPrimScan); _bt_parallel_done(scan); return false; } @@ -1569,519 +1613,6 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) return true; } -/* - * _bt_readpage() -- Load data from current index page into so->currPos - * - * Caller must have pinned and read-locked so->currPos.buf; the buffer's state - * is not changed here. Also, currPos.moreLeft and moreRight must be valid; - * they are updated as appropriate. All other fields of so->currPos are - * initialized from scratch here. - * - * We scan the current page starting at offnum and moving in the indicated - * direction. All items matching the scan keys are loaded into currPos.items. - * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports - * that there can be no more matching tuples in the current scan direction - * (could just be for the current primitive index scan when scan has arrays). - * - * In the case of a parallel scan, caller must have called _bt_parallel_seize - * prior to calling this function; this function will invoke - * _bt_parallel_release before returning. - * - * Returns true if any matching items found on the page, false if none. 
- */ -static bool -_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, - bool firstpage) -{ - Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; - Page page; - BTPageOpaque opaque; - OffsetNumber minoff; - OffsetNumber maxoff; - BTReadPageState pstate; - bool arrayKeys; - int itemIndex, - indnatts; - - /* save the page/buffer block number, along with its sibling links */ - page = BufferGetPage(so->currPos.buf); - opaque = BTPageGetOpaque(page); - so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); - so->currPos.prevPage = opaque->btpo_prev; - so->currPos.nextPage = opaque->btpo_next; - - Assert(!P_IGNORE(opaque)); - Assert(BTScanPosIsPinned(so->currPos)); - Assert(!so->needPrimScan); - - if (scan->parallel_scan) - { - /* allow next/prev page to be read by other worker without delay */ - if (ScanDirectionIsForward(dir)) - _bt_parallel_release(scan, so->currPos.nextPage, - so->currPos.currPage); - else - _bt_parallel_release(scan, so->currPos.prevPage, - so->currPos.currPage); - } - - /* initialize remaining currPos fields related to current page */ - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); - so->currPos.dir = dir; - so->currPos.nextTupleOffset = 0; - /* either moreLeft or moreRight should be set now (may be unset later) */ - Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : - so->currPos.moreLeft); - - PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); - - /* initialize local variables */ - indnatts = IndexRelationGetNumberOfAttributes(rel); - arrayKeys = so->numArrayKeys != 0; - minoff = P_FIRSTDATAKEY(opaque); - maxoff = PageGetMaxOffsetNumber(page); - - /* initialize page-level state that we'll pass to _bt_checkkeys */ - pstate.minoff = minoff; - pstate.maxoff = maxoff; - pstate.finaltup = NULL; - pstate.page = page; - pstate.firstpage = firstpage; - pstate.forcenonrequired = false; - pstate.startikey = 0; - pstate.offnum = InvalidOffsetNumber; - pstate.skip = InvalidOffsetNumber; - pstate.continuescan = true; /* default assumption */ - pstate.rechecks = 0; - pstate.targetdistance = 0; - pstate.nskipadvances = 0; - - if (ScanDirectionIsForward(dir)) - { - /* SK_SEARCHARRAY forward scans must provide high key up front */ - if (arrayKeys) - { - if (!P_RIGHTMOST(opaque)) - { - ItemId iid = PageGetItemId(page, P_HIKEY); - - pstate.finaltup = (IndexTuple) PageGetItem(page, iid); - - if (so->scanBehind && - !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) - { - /* Schedule another primitive index scan after all */ - so->currPos.moreRight = false; - so->needPrimScan = true; - if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); - return false; - } - } - - so->scanBehind = so->oppositeDirCheck = false; /* reset */ - } - - /* - * Consider pstate.startikey optimization once the ongoing primitive - * index scan has already read at least one page - */ - if (!pstate.firstpage && minoff < maxoff) - _bt_set_startikey(scan, &pstate); - - /* load items[] in ascending order */ - itemIndex = 0; - - offnum = Max(offnum, minoff); - - while (offnum <= maxoff) - { - ItemId iid = PageGetItemId(page, offnum); - IndexTuple itup; - bool passes_quals; - - /* - * If the scan specifies not to return killed tuples, then we - * treat a killed tuple as not passing the qual - */ - if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) - { - offnum = OffsetNumberNext(offnum); - continue; - } - - itup = (IndexTuple) PageGetItem(page, iid); - Assert(!BTreeTupleIsPivot(itup)); - - 
pstate.offnum = offnum; - passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, - itup, indnatts); - - /* - * Check if we need to skip ahead to a later tuple (only possible - * when the scan uses array keys) - */ - if (arrayKeys && OffsetNumberIsValid(pstate.skip)) - { - Assert(!passes_quals && pstate.continuescan); - Assert(offnum < pstate.skip); - Assert(!pstate.forcenonrequired); - - offnum = pstate.skip; - pstate.skip = InvalidOffsetNumber; - continue; - } - - if (passes_quals) - { - /* tuple passes all scan key conditions */ - if (!BTreeTupleIsPosting(itup)) - { - /* Remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; - } - else - { - int tupleOffset; - - /* - * Set up state to return posting list, and remember first - * TID - */ - tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, 0), - itup); - itemIndex++; - /* Remember additional TIDs */ - for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) - { - _bt_savepostingitem(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, i), - tupleOffset); - itemIndex++; - } - } - } - /* When !continuescan, there can't be any more matches, so stop */ - if (!pstate.continuescan) - break; - - offnum = OffsetNumberNext(offnum); - } - - /* - * We don't need to visit page to the right when the high key - * indicates that no more matches will be found there. - * - * Checking the high key like this works out more often than you might - * think. Leaf page splits pick a split point between the two most - * dissimilar tuples (this is weighed against the need to evenly share - * free space). Leaf pages with high key attribute values that can - * only appear on non-pivot tuples on the right sibling page are - * common. - */ - if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque)) - { - ItemId iid = PageGetItemId(page, P_HIKEY); - IndexTuple itup = (IndexTuple) PageGetItem(page, iid); - int truncatt; - - /* Reset arrays, per _bt_set_startikey contract */ - if (pstate.forcenonrequired) - _bt_start_array_keys(scan, dir); - pstate.forcenonrequired = false; - pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */ - - truncatt = BTreeTupleGetNAtts(itup, rel); - _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); - } - - if (!pstate.continuescan) - so->currPos.moreRight = false; - - Assert(itemIndex <= MaxTIDsPerBTreePage); - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; - } - else - { - /* SK_SEARCHARRAY backward scans must provide final tuple up front */ - if (arrayKeys) - { - if (minoff <= maxoff && !P_LEFTMOST(opaque)) - { - ItemId iid = PageGetItemId(page, minoff); - - pstate.finaltup = (IndexTuple) PageGetItem(page, iid); - - if (so->scanBehind && - !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) - { - /* Schedule another primitive index scan after all */ - so->currPos.moreLeft = false; - so->needPrimScan = true; - if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); - return false; - } - } - - so->scanBehind = so->oppositeDirCheck = false; /* reset */ - } - - /* - * Consider pstate.startikey optimization once the ongoing primitive - * index scan has already read at least one page - */ - if (!pstate.firstpage && minoff < maxoff) - _bt_set_startikey(scan, &pstate); - - /* load items[] in descending order */ - itemIndex = MaxTIDsPerBTreePage; - - offnum = Min(offnum, maxoff); - - while (offnum >= minoff) - { - ItemId iid = PageGetItemId(page, offnum); - IndexTuple itup; - bool 
tuple_alive; - bool passes_quals; - - /* - * If the scan specifies not to return killed tuples, then we - * treat a killed tuple as not passing the qual. Most of the - * time, it's a win to not bother examining the tuple's index - * keys, but just skip to the next tuple (previous, actually, - * since we're scanning backwards). However, if this is the first - * tuple on the page, we do check the index keys, to prevent - * uselessly advancing to the page to the left. This is similar - * to the high key optimization used by forward scans. - */ - if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) - { - if (offnum > minoff) - { - offnum = OffsetNumberPrev(offnum); - continue; - } - - tuple_alive = false; - } - else - tuple_alive = true; - - itup = (IndexTuple) PageGetItem(page, iid); - Assert(!BTreeTupleIsPivot(itup)); - - pstate.offnum = offnum; - if (arrayKeys && offnum == minoff && pstate.forcenonrequired) - { - /* Reset arrays, per _bt_set_startikey contract */ - pstate.forcenonrequired = false; - pstate.startikey = 0; - _bt_start_array_keys(scan, dir); - } - passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, - itup, indnatts); - - if (arrayKeys && so->scanBehind) - { - /* - * Done scanning this page, but not done with the current - * primscan. - * - * Note: Forward scans don't check this explicitly, since they - * prefer to reuse pstate.skip for this instead. - */ - Assert(!passes_quals && pstate.continuescan); - Assert(!pstate.forcenonrequired); - - break; - } - - /* - * Check if we need to skip ahead to a later tuple (only possible - * when the scan uses array keys) - */ - if (arrayKeys && OffsetNumberIsValid(pstate.skip)) - { - Assert(!passes_quals && pstate.continuescan); - Assert(offnum > pstate.skip); - Assert(!pstate.forcenonrequired); - - offnum = pstate.skip; - pstate.skip = InvalidOffsetNumber; - continue; - } - - if (passes_quals && tuple_alive) - { - /* tuple passes all scan key conditions */ - if (!BTreeTupleIsPosting(itup)) - { - /* Remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); - } - else - { - int tupleOffset; - - /* - * Set up state to return posting list, and remember first - * TID. - * - * Note that we deliberately save/return items from - * posting lists in ascending heap TID order for backwards - * scans. This allows _bt_killitems() to make a - * consistent assumption about the order of items - * associated with the same posting list tuple. - */ - itemIndex--; - tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, 0), - itup); - /* Remember additional TIDs */ - for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) - { - itemIndex--; - _bt_savepostingitem(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, i), - tupleOffset); - } - } - } - /* When !continuescan, there can't be any more matches, so stop */ - if (!pstate.continuescan) - break; - - offnum = OffsetNumberPrev(offnum); - } - - /* - * We don't need to visit page to the left when no more matches will - * be found there - */ - if (!pstate.continuescan) - so->currPos.moreLeft = false; - - Assert(itemIndex >= 0); - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxTIDsPerBTreePage - 1; - so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; - } - - /* - * If _bt_set_startikey told us to temporarily treat the scan's keys as - * nonrequired (possible only during scans with array keys), there must be - * no lasting consequences for the scan's array keys. 
The scan's arrays - * should now have exactly the same elements as they would have had if the - * nonrequired behavior had never been used. (In general, a scan's arrays - * are expected to track its progress through the index's key space.) - * - * We are required (by _bt_set_startikey) to call _bt_checkkeys against - * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's - * arrays to recover. Assert that that step hasn't been missed. - */ - Assert(!pstate.forcenonrequired); - - return (so->currPos.firstItem <= so->currPos.lastItem); -} - -/* Save an index item into so->currPos.items[itemIndex] */ -static void -_bt_saveitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, IndexTuple itup) -{ - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; - - Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); - - currItem->heapTid = itup->t_tid; - currItem->indexOffset = offnum; - if (so->currTuples) - { - Size itupsz = IndexTupleSize(itup); - - currItem->tupleOffset = so->currPos.nextTupleOffset; - memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); - so->currPos.nextTupleOffset += MAXALIGN(itupsz); - } -} - -/* - * Setup state to save TIDs/items from a single posting list tuple. - * - * Saves an index item into so->currPos.items[itemIndex] for TID that is - * returned to scan first. Second or subsequent TIDs for posting list should - * be saved by calling _bt_savepostingitem(). - * - * Returns an offset into tuple storage space that main tuple is stored at if - * needed. - */ -static int -_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, IndexTuple itup) -{ - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; - - Assert(BTreeTupleIsPosting(itup)); - - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; - if (so->currTuples) - { - /* Save base IndexTuple (truncate posting list) */ - IndexTuple base; - Size itupsz = BTreeTupleGetPostingOffset(itup); - - itupsz = MAXALIGN(itupsz); - currItem->tupleOffset = so->currPos.nextTupleOffset; - base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); - memcpy(base, itup, itupsz); - /* Defensively reduce work area index tuple header size */ - base->t_info &= ~INDEX_SIZE_MASK; - base->t_info |= itupsz; - so->currPos.nextTupleOffset += itupsz; - - return currItem->tupleOffset; - } - - return 0; -} - -/* - * Save an index item into so->currPos.items[itemIndex] for current posting - * tuple. - * - * Assumes that _bt_setuppostingitems() has already been called for current - * posting list tuple. Caller passes its return value as tupleOffset. - */ -static inline void -_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset) -{ - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; - - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; - - /* - * Have index-only scans return the same base IndexTuple for every TID - * that originates from the same posting list - */ - if (so->currTuples) - currItem->tupleOffset = tupleOffset; -} - /* * Return the index item from so->currPos.items[so->currPos.itemIndex] to the * index scan by setting the relevant fields in caller's index scan descriptor @@ -2107,10 +1638,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) * * Wrapper on _bt_readnextpage that performs final steps for the current page. * - * On entry, if so->currPos.buf is valid the buffer is pinned but not locked. 
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped - * the pin eagerly earlier on. The scan must have so->currPos.currPage set to - * a valid block, in any case. + * On entry, so->currPos must be valid. Its buffer will be pinned, though + * never locked. (Actually, when so->dropPin there won't even be a pin held, + * though so->currPos.currPage must still be set to a valid block number.) */ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir) @@ -2197,12 +1727,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * * _bt_first caller passes us an offnum returned by _bt_binsrch, which might * be an out of bounds offnum such as "maxoff + 1" in certain corner cases. - * _bt_checkkeys will stop the scan as soon as an equality qual fails (when - * its scan key was marked required), so _bt_first _must_ pass us an offnum - * exactly at the beginning of where equal tuples are to be found. When we're - * passed an offnum past the end of the page, we might still manage to stop - * the scan on this page by calling _bt_checkkeys against the high key. See - * _bt_readpage for full details. + * When we're passed an offnum past the end of the page, we might still manage + * to stop the scan on this page by calling _bt_checkkeys against the high + * key. See _bt_readpage for full details. * * On entry, so->currPos must be pinned and locked (so offnum stays valid). * Parallel scan callers must have seized the scan before calling here. @@ -2251,12 +1778,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) */ if (_bt_readpage(scan, dir, offnum, true)) { + Relation rel = scan->indexRelation; + /* * _bt_readpage succeeded. Drop the lock (and maybe the pin) on * so->currPos.buf in preparation for btgettuple returning tuples. */ Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(rel, so); return true; } @@ -2278,9 +1807,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * previously-saved right link or left link. lastcurrblkno is the page that * was current at the point where the blkno link was saved, which we use to * reason about concurrent page splits/page deletions during backwards scans. + * In the common case where seized=false, blkno is either so->currPos.nextPage + * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage. * - * On entry, caller shouldn't hold any locks or pins on any page (we work - * directly off of blkno and lastcurrblkno instead). Parallel scan callers + * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must + * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno + * won't need to be read again in almost all cases). Parallel scan callers * that seized the scan before calling here should pass seized=true; such a * caller's blkno and lastcurrblkno arguments come from the seized scan. * seized=false callers just pass us the blkno/lastcurrblkno taken from their @@ -2294,11 +1826,11 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * * On success exit, so->currPos is updated to contain data from the next * interesting page, and we return true. We hold a pin on the buffer on - * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe - * to eagerly drop the pin (to avoid blocking VACUUM). + * success exit (except during so->dropPin index scans, when we drop the pin + * eagerly to avoid blocking VACUUM). 
 *
- * If there are no more matching records in the given direction, we drop all
- * locks and pins, invalidate so->currPos, and return false.
+ * If there are no more matching records in the given direction, we invalidate
+ * so->currPos (while ensuring it retains no locks or pins), and return false.
 *
 * We always release the scan for a parallel scan caller, regardless of
 * success or failure; we'll call _bt_parallel_release as soon as possible.
@@ -2413,7 +1945,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 		 */
 		Assert(so->currPos.currPage == blkno);
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		_bt_drop_lock_and_maybe_pin(rel, so);

 		return true;
 	}
@@ -2616,6 +2148,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
 		else
 			offnum = P_FIRSTDATAKEY(opaque);

+		if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page))
+			elog(ERROR, "offnum out of range");
+
 		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
 		blkno = BTreeTupleGetDownLink(itup);

diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 3794cc924ad46..d7695dc1108f2 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -44,6 +44,7 @@
 #include "access/parallel.h"
 #include "access/relscan.h"
 #include "access/table.h"
+#include "access/tableam.h"
 #include "access/xact.h"
 #include "catalog/index.h"
 #include "commands/progress.h"
@@ -105,7 +106,7 @@ typedef struct BTShared
 	int			scantuplesortstates;

 	/* Query ID, for report in worker processes */
-	uint64		queryid;
+	int64		queryid;

 	/*
 	 * workersdonecv is used to monitor the progress of workers.  All parallel
@@ -256,8 +257,8 @@ typedef struct BTWriteState
 static double _bt_spools_heapscan(Relation heap, Relation index,
								  BTBuildState *buildstate, IndexInfo *indexInfo);
 static void _bt_spooldestroy(BTSpool *btspool);
-static void _bt_spool(BTSpool *btspool, ItemPointer self,
-					  Datum *values, bool *isnull);
+static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
+					  const Datum *values, const bool *isnull);
 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
 static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
							   bool *isnull, bool tupleIsAlive, void *state);
@@ -265,7 +266,7 @@ static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
 static void _bt_slideleft(Page rightmostpage);
 static void _bt_sortaddtup(Page page, Size itemsize,
-						   IndexTuple itup, OffsetNumber itup_off,
+						   const IndexTupleData *itup, OffsetNumber itup_off,
						   bool newfirstdataitem);
 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
						 IndexTuple itup, Size truncextra);
@@ -334,7 +335,7 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	if (buildstate.btleader)
 		_bt_end_parallel(buildstate.btleader);

-	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+	result = palloc_object(IndexBuildResult);

 	result->heap_tuples = reltuples;
 	result->index_tuples = buildstate.indtuples;
@@ -365,7 +366,7 @@ static double
 _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
					IndexInfo *indexInfo)
 {
-	BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
+	BTSpool    *btspool = palloc0_object(BTSpool);
 	SortCoordinate coordinate = NULL;
 	double		reltuples = 0;

@@ -398,7 +399,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
 	 */
 	if (buildstate->btleader)
 	{
-		coordinate 
= (SortCoordinate) palloc0(sizeof(SortCoordinateData)); + coordinate = palloc0_object(SortCoordinateData); coordinate->isWorker = false; coordinate->nParticipants = buildstate->btleader->nparticipanttuplesorts; @@ -439,7 +440,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, */ if (indexInfo->ii_Unique) { - BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); + BTSpool *btspool2 = palloc0_object(BTSpool); SortCoordinate coordinate2 = NULL; /* Initialize secondary spool */ @@ -456,7 +457,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * tuplesort_begin_index_btree() about the basic high level * coordination of a parallel sort. */ - coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); + coordinate2 = palloc0_object(SortCoordinateData); coordinate2->isWorker = false; coordinate2->nParticipants = buildstate->btleader->nparticipanttuplesorts; @@ -524,7 +525,7 @@ _bt_spooldestroy(BTSpool *btspool) * spool an index entry into the sort file. */ static void -_bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull) +_bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull) { tuplesort_putindextuplevalues(btspool->sortstate, btspool->index, self, values, isnull); @@ -647,7 +648,7 @@ _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno) static BTPageState * _bt_pagestate(BTWriteState *wstate, uint32 level) { - BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); + BTPageState *state = palloc0_object(BTPageState); /* create initial page for level */ state->btps_buf = _bt_blnewpage(wstate, level); @@ -715,7 +716,7 @@ _bt_slideleft(Page rightmostpage) static void _bt_sortaddtup(Page page, Size itemsize, - IndexTuple itup, + const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem) { @@ -730,8 +731,7 @@ _bt_sortaddtup(Page page, itemsize = sizeof(IndexTupleData); } - if (PageAddItem(page, (Item) itup, itemsize, itup_off, - false, false) == InvalidOffsetNumber) + if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to the index page"); } @@ -933,8 +933,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Assert(IndexTupleSize(oitup) > last_truncextra); truncated = _bt_truncate(wstate->index, lastleft, oitup, wstate->inskey); - if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated, - IndexTupleSize(truncated))) + if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated))) elog(ERROR, "failed to add high key to the index page"); pfree(truncated); @@ -1003,7 +1002,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, if (last_off == P_HIKEY) { Assert(state->btps_lowkey == NULL); - state->btps_lowkey = palloc0(sizeof(IndexTupleData)); + state->btps_lowkey = palloc0_object(IndexTupleData); state->btps_lowkey->t_info = sizeof(IndexTupleData); BTreeTupleSetNAtts(state->btps_lowkey, 0, false); } @@ -1165,7 +1164,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) itup2 = tuplesort_getindextuple(btspool2->sortstate, true); /* Prepare SortSupport data for each column */ - sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); + sortKeys = palloc0_array(SortSupportData, keysz); for (i = 0; i < keysz; i++) { @@ -1267,7 +1266,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) /* merge is unnecessary, deduplicate into posting 
lists */ BTDedupState dstate; - dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); + dstate = palloc_object(BTDedupStateData); dstate->deduplicate = true; /* unused */ dstate->nmaxitems = 0; /* unused */ dstate->maxpostingsize = 0; /* set later */ @@ -1405,7 +1404,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) Sharedsort *sharedsort; Sharedsort *sharedsort2; BTSpool *btspool = buildstate->spool; - BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader)); + BTLeader *btleader = palloc0_object(BTLeader); WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; @@ -1694,7 +1693,7 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) int sortmem; /* Allocate memory and initialize private spool */ - leaderworker = (BTSpool *) palloc0(sizeof(BTSpool)); + leaderworker = palloc0_object(BTSpool); leaderworker->heap = buildstate->spool->heap; leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; @@ -1706,7 +1705,7 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) else { /* Allocate memory for worker's own private secondary spool */ - leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool)); + leaderworker2 = palloc0_object(BTSpool); /* Initialize worker's own secondary spool */ leaderworker2->heap = leaderworker->heap; @@ -1797,7 +1796,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) indexRel = index_open(btshared->indexrelid, indexLockmode); /* Initialize worker's own spool */ - btspool = (BTSpool *) palloc0(sizeof(BTSpool)); + btspool = palloc0_object(BTSpool); btspool->heap = heapRel; btspool->index = indexRel; btspool->isunique = btshared->isunique; @@ -1814,7 +1813,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) else { /* Allocate memory for worker's own private secondary spool */ - btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); + btspool2 = palloc0_object(BTSpool); /* Initialize worker's own secondary spool */ btspool2->heap = btspool->heap; @@ -1875,7 +1874,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, IndexInfo *indexInfo; /* Initialize local tuplesort coordination state */ - coordinate = palloc0(sizeof(SortCoordinateData)); + coordinate = palloc0_object(SortCoordinateData); coordinate->isWorker = true; coordinate->nParticipants = -1; coordinate->sharedsort = sharedsort; @@ -1902,7 +1901,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, * worker). Worker processes are generally permitted to allocate * work_mem independently. 
*/ - coordinate2 = palloc0(sizeof(SortCoordinateData)); + coordinate2 = palloc0_object(SortCoordinateData); coordinate2->isWorker = true; coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index e6c9aaa0454dd..651ab013025bb 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/nbtree.h" +#include "access/tableam.h" #include "common/int.h" typedef enum @@ -68,7 +69,7 @@ static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, static int _bt_splitcmp(const void *arg1, const void *arg2); static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult); -static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid); +static bool _bt_adjacenthtid(const ItemPointerData *lowhtid, const ItemPointerData *highhtid); static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy); static int _bt_defaultinterval(FindSplitData *state); @@ -196,7 +197,7 @@ _bt_findsplitloc(Relation rel, * between tuples will be legal). */ state.maxsplits = maxoff; - state.splits = palloc(sizeof(SplitPoint) * state.maxsplits); + state.splits = palloc_array(SplitPoint, state.maxsplits); state.nsplits = 0; /* @@ -746,7 +747,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, * transaction. */ static bool -_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid) +_bt_adjacenthtid(const ItemPointerData *lowhtid, const ItemPointerData *highhtid) { BlockNumber lowblk, highblk; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 1a15dfcb7d357..a451d48e11ed1 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -19,53 +19,17 @@ #include "access/nbtree.h" #include "access/reloptions.h" +#include "access/relscan.h" #include "commands/progress.h" +#include "common/int.h" +#include "lib/qunique.h" #include "miscadmin.h" #include "utils/datum.h" #include "utils/lsyscache.h" +#include "utils/rel.h" -#define LOOK_AHEAD_REQUIRED_RECHECKS 3 -#define LOOK_AHEAD_DEFAULT_DISTANCE 5 -#define NSKIPADVANCES_THRESHOLD 3 - -static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, - Datum tupdatum, bool tupnull, - Datum arrdatum, ScanKey cur); -static void _bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, - Datum tupdatum, bool tupnull, - BTArrayKeyInfo *array, ScanKey cur, - int32 *set_elem_result); -static void _bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, - int32 set_elem_result, Datum tupdatum, bool tupnull); -static void _bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); -static void _bt_array_set_low_or_high(Relation rel, ScanKey skey, - BTArrayKeyInfo *array, bool low_not_high); -static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); -static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); -static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, - bool *skip_array_set); -static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); -static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, - IndexTuple tuple, TupleDesc tupdesc, int tupnatts, - bool readpagetup, int sktrig, bool 
*scanBehind); -static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, - IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - int sktrig, bool sktrig_required); -#ifdef USE_ASSERT_CHECKING -static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir); -static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); -#endif -static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, - IndexTuple finaltup); -static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, - IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool forcenonrequired, - bool *continuescan, int *ikey); -static bool _bt_check_rowcompare(ScanKey skey, - IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - ScanDirection dir, bool forcenonrequired, bool *continuescan); -static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, - int tupnatts, TupleDesc tupdesc); + +static int _bt_compare_int(const void *va, const void *vb); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); @@ -94,3232 +58,118 @@ static int _bt_keep_natts(Relation rel, IndexTuple lastleft, BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup) { - BTScanInsert key; - ScanKey skey; - TupleDesc itupdesc; - int indnkeyatts; - int16 *indoption; - int tupnatts; - int i; - - itupdesc = RelationGetDescr(rel); - indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - indoption = rel->rd_indoption; - tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; - - Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); - - /* - * We'll execute search using scan key constructed on key columns. - * Truncated attributes and non-key attributes are omitted from the final - * scan key. - */ - key = palloc(offsetof(BTScanInsertData, scankeys) + - sizeof(ScanKeyData) * indnkeyatts); - if (itup) - _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); - else - { - /* Utility statement callers can set these fields themselves */ - key->heapkeyspace = true; - key->allequalimage = false; - } - key->anynullkeys = false; /* initial assumption */ - key->nextkey = false; /* usual case, required by btinsert */ - key->backward = false; /* usual case, required by btinsert */ - key->keysz = Min(indnkeyatts, tupnatts); - key->scantid = key->heapkeyspace && itup ? - BTreeTupleGetHeapTID(itup) : NULL; - skey = key->scankeys; - for (i = 0; i < indnkeyatts; i++) - { - FmgrInfo *procinfo; - Datum arg; - bool null; - int flags; - - /* - * We can use the cached (default) support procs since no cross-type - * comparison can be needed. - */ - procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); - - /* - * Key arguments built from truncated attributes (or when caller - * provides no tuple) are defensively represented as NULL values. They - * should never be used. - */ - if (i < tupnatts) - arg = index_getattr(itup, i + 1, itupdesc, &null); - else - { - arg = (Datum) 0; - null = true; - } - flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); - ScanKeyEntryInitializeWithInfo(&skey[i], - flags, - (AttrNumber) (i + 1), - InvalidStrategy, - InvalidOid, - rel->rd_indcollation[i], - procinfo, - arg); - /* Record if any key attribute is NULL (or truncated) */ - if (null) - key->anynullkeys = true; - } - - /* - * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so - * that full uniqueness check is done. 
- */ - if (rel->rd_index->indnullsnotdistinct) - key->anynullkeys = false; - - return key; -} - -/* - * free a retracement stack made by _bt_search. - */ -void -_bt_freestack(BTStack stack) -{ - BTStack ostack; - - while (stack != NULL) - { - ostack = stack; - stack = stack->bts_parent; - pfree(ostack); - } -} - -/* - * _bt_compare_array_skey() -- apply array comparison function - * - * Compares caller's tuple attribute value to a scan key/array element. - * Helper function used during binary searches of SK_SEARCHARRAY arrays. - * - * This routine returns: - * <0 if tupdatum < arrdatum; - * 0 if tupdatum == arrdatum; - * >0 if tupdatum > arrdatum. - * - * This is essentially the same interface as _bt_compare: both functions - * compare the value that they're searching for to a binary search pivot. - * However, unlike _bt_compare, this function's "tuple argument" comes first, - * while its "array/scankey argument" comes second. -*/ -static inline int32 -_bt_compare_array_skey(FmgrInfo *orderproc, - Datum tupdatum, bool tupnull, - Datum arrdatum, ScanKey cur) -{ - int32 result = 0; - - Assert(cur->sk_strategy == BTEqualStrategyNumber); - Assert(!(cur->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))); - - if (tupnull) /* NULL tupdatum */ - { - if (cur->sk_flags & SK_ISNULL) - result = 0; /* NULL "=" NULL */ - else if (cur->sk_flags & SK_BT_NULLS_FIRST) - result = -1; /* NULL "<" NOT_NULL */ - else - result = 1; /* NULL ">" NOT_NULL */ - } - else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */ - { - if (cur->sk_flags & SK_BT_NULLS_FIRST) - result = 1; /* NOT_NULL ">" NULL */ - else - result = -1; /* NOT_NULL "<" NULL */ - } - else - { - /* - * Like _bt_compare, we need to be careful of cross-type comparisons, - * so the left value has to be the value that came from an index tuple - */ - result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation, - tupdatum, arrdatum)); - - /* - * We flip the sign by following the obvious rule: flip whenever the - * column is a DESC column. - * - * _bt_compare does it the wrong way around (flip when *ASC*) in order - * to compensate for passing its orderproc arguments backwards. We - * don't need to play these games because we find it natural to pass - * tupdatum as the left value (and arrdatum as the right value). - */ - if (cur->sk_flags & SK_BT_DESC) - INVERT_COMPARE_RESULT(result); - } - - return result; -} - -/* - * _bt_binsrch_array_skey() -- Binary search for next matching array key - * - * Returns an index to the first array element >= caller's tupdatum argument. - * This convention is more natural for forwards scan callers, but that can't - * really matter to backwards scan callers. Both callers require handling for - * the case where the match we return is < tupdatum, and symmetric handling - * for the case where our best match is > tupdatum. - * - * Also sets *set_elem_result to the result _bt_compare_array_skey returned - * when we used it to compare the matching array element to tupdatum/tupnull. - * - * cur_elem_trig indicates if array advancement was triggered by this array's - * scan key, and that the array is for a required scan key. We can apply this - * information to find the next matching array element in the current scan - * direction using far fewer comparisons (fewer on average, compared to naive - * binary search). This scheme takes advantage of an important property of - * required arrays: required arrays always advance in lockstep with the index - * scan's progress through the index's key space. 
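
The removed _bt_compare_array_skey() packs three concerns into one 3-way result: NULL placement under NULLS FIRST/LAST, the ordinary datum comparison, and a sign flip for DESC columns. A standalone sketch of that decision table, with plain ints standing in for Datums and INVERT standing in for INVERT_COMPARE_RESULT():

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for INVERT_COMPARE_RESULT(): negate a 3-way result in place. */
#define INVERT(r) ((r) = -(r))

/* 3-way compare of a (possibly NULL) tuple value against a (possibly NULL)
 * search value, honoring NULLS FIRST/LAST placement and DESC inversion.
 * Mirrors the structure of the removed _bt_compare_array_skey(). */
static int
compare3way(int tupval, bool tupnull, int searchval, bool searchnull,
			bool nulls_first, bool desc)
{
	int			result;

	if (tupnull)
		result = searchnull ? 0 : (nulls_first ? -1 : 1);
	else if (searchnull)
		result = nulls_first ? 1 : -1;
	else
	{
		result = (tupval > searchval) - (tupval < searchval);
		if (desc)				/* DESC column: flip the sign */
			INVERT(result);
	}
	return result;
}

int
main(void)
{
	/* NULL sorts before every non-NULL value under NULLS FIRST */
	printf("%d\n", compare3way(0, true, 42, false, true, false));	/* -1 */
	/* 7 vs 5 on a DESC column: 7 sorts *before* 5, so report -1 */
	printf("%d\n", compare3way(7, false, 5, false, false, true));	/* -1 */
	return 0;
}
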
- */ -int -_bt_binsrch_array_skey(FmgrInfo *orderproc, - bool cur_elem_trig, ScanDirection dir, - Datum tupdatum, bool tupnull, - BTArrayKeyInfo *array, ScanKey cur, - int32 *set_elem_result) -{ - int low_elem = 0, - mid_elem = -1, - high_elem = array->num_elems - 1, - result = 0; - Datum arrdatum; - - Assert(cur->sk_flags & SK_SEARCHARRAY); - Assert(!(cur->sk_flags & SK_BT_SKIP)); - Assert(!(cur->sk_flags & SK_ISNULL)); /* SAOP arrays never have NULLs */ - Assert(cur->sk_strategy == BTEqualStrategyNumber); - - if (cur_elem_trig) - { - Assert(!ScanDirectionIsNoMovement(dir)); - Assert(cur->sk_flags & SK_BT_REQFWD); - - /* - * When the scan key that triggered array advancement is a required - * array scan key, it is now certain that the current array element - * (plus all prior elements relative to the current scan direction) - * cannot possibly be at or ahead of the corresponding tuple value. - * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which - * makes sure this is true as a condition of advancing the arrays.) - * - * This makes it safe to exclude array elements up to and including - * the former-current array element from our search. - * - * Separately, when array advancement was triggered by a required scan - * key, the array element immediately after the former-current element - * is often either an exact tupdatum match, or a "close by" near-match - * (a near-match tupdatum is one whose key space falls _between_ the - * former-current and new-current array elements). We'll detect both - * cases via an optimistic comparison of the new search lower bound - * (or new search upper bound in the case of backwards scans). - */ - if (ScanDirectionIsForward(dir)) - { - low_elem = array->cur_elem + 1; /* old cur_elem exhausted */ - - /* Compare prospective new cur_elem (also the new lower bound) */ - if (high_elem >= low_elem) - { - arrdatum = array->elem_values[low_elem]; - result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); - - if (result <= 0) - { - /* Optimistic comparison optimization worked out */ - *set_elem_result = result; - return low_elem; - } - mid_elem = low_elem; - low_elem++; /* this cur_elem exhausted, too */ - } - - if (high_elem < low_elem) - { - /* Caller needs to perform "beyond end" array advancement */ - *set_elem_result = 1; - return high_elem; - } - } - else - { - high_elem = array->cur_elem - 1; /* old cur_elem exhausted */ - - /* Compare prospective new cur_elem (also the new upper bound) */ - if (high_elem >= low_elem) - { - arrdatum = array->elem_values[high_elem]; - result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); - - if (result >= 0) - { - /* Optimistic comparison optimization worked out */ - *set_elem_result = result; - return high_elem; - } - mid_elem = high_elem; - high_elem--; /* this cur_elem exhausted, too */ - } - - if (high_elem < low_elem) - { - /* Caller needs to perform "beyond end" array advancement */ - *set_elem_result = -1; - return low_elem; - } - } - } - - while (high_elem > low_elem) - { - mid_elem = low_elem + ((high_elem - low_elem) / 2); - arrdatum = array->elem_values[mid_elem]; - - result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); - - if (result == 0) - { - /* - * It's safe to quit as soon as we see an equal array element. - * This often saves an extra comparison or two... 
- */ - low_elem = mid_elem; - break; - } - - if (result > 0) - low_elem = mid_elem + 1; - else - high_elem = mid_elem; - } - - /* - * ...but our caller also cares about how its searched-for tuple datum - * compares to the low_elem datum. Must always set *set_elem_result with - * the result of that comparison specifically. - */ - if (low_elem != mid_elem) - result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - array->elem_values[low_elem], cur); - - *set_elem_result = result; - - return low_elem; -} - -/* - * _bt_binsrch_skiparray_skey() -- "Binary search" within a skip array - * - * Does not return an index into the array, since skip arrays don't really - * contain elements (they generate their array elements procedurally instead). - * Our interface matches that of _bt_binsrch_array_skey in every other way. - * - * Sets *set_elem_result just like _bt_binsrch_array_skey would with a true - * array. The value 0 indicates that tupdatum/tupnull is within the range of - * the skip array. We return -1 when tupdatum/tupnull is lower that any value - * within the range of the array, and 1 when it is higher than every value. - * Caller should pass *set_elem_result to _bt_skiparray_set_element to advance - * the array. - * - * cur_elem_trig indicates if array advancement was triggered by this array's - * scan key. We use this to optimize-away comparisons that are known by our - * caller to be unnecessary from context, just like _bt_binsrch_array_skey. - */ -static void -_bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, - Datum tupdatum, bool tupnull, - BTArrayKeyInfo *array, ScanKey cur, - int32 *set_elem_result) -{ - Assert(cur->sk_flags & SK_BT_SKIP); - Assert(cur->sk_flags & SK_SEARCHARRAY); - Assert(cur->sk_flags & SK_BT_REQFWD); - Assert(array->num_elems == -1); - Assert(!ScanDirectionIsNoMovement(dir)); - - if (array->null_elem) - { - Assert(!array->low_compare && !array->high_compare); - - *set_elem_result = 0; - return; - } - - if (tupnull) /* NULL tupdatum */ - { - if (cur->sk_flags & SK_BT_NULLS_FIRST) - *set_elem_result = -1; /* NULL "<" NOT_NULL */ - else - *set_elem_result = 1; /* NULL ">" NOT_NULL */ - return; - } - - /* - * Array inequalities determine whether tupdatum is within the range of - * caller's skip array - */ - *set_elem_result = 0; - if (ScanDirectionIsForward(dir)) - { - /* - * Evaluate low_compare first (unless cur_elem_trig tells us that it - * cannot possibly fail to be satisfied), then evaluate high_compare - */ - if (!cur_elem_trig && array->low_compare && - !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, - array->low_compare->sk_collation, - tupdatum, - array->low_compare->sk_argument))) - *set_elem_result = -1; - else if (array->high_compare && - !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, - array->high_compare->sk_collation, - tupdatum, - array->high_compare->sk_argument))) - *set_elem_result = 1; - } - else - { - /* - * Evaluate high_compare first (unless cur_elem_trig tells us that it - * cannot possibly fail to be satisfied), then evaluate low_compare - */ - if (!cur_elem_trig && array->high_compare && - !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, - array->high_compare->sk_collation, - tupdatum, - array->high_compare->sk_argument))) - *set_elem_result = 1; - else if (array->low_compare && - !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, - array->low_compare->sk_collation, - tupdatum, - array->low_compare->sk_argument))) - *set_elem_result = -1; - } - - /* - * Assert 
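
The removed _bt_binsrch_array_skey() returns the index of the first element >= the probe value and, when this same key triggered array advancement, first tries one optimistic comparison at the new search bound before falling back to a plain binary search. A standalone sketch under those conventions (ascending ints stand in for the SAOP element list; forward-scan case only):

#include <stdio.h>

/* Return index of first a[i] >= needle, searching only [lo, n).  Also
 * report the 3-way result of comparing needle against the returned
 * element, which callers use to tell exact from inexact matches. */
static int
binsrch_geq(const int *a, int lo, int n, int needle, int *result)
{
	int			high = n - 1,
				mid = -1,
				low = lo,
				r = 1;

	/* Optimistic probe: the element at the new lower bound is often an
	 * exact or near match when this key triggered the advancement. */
	if (low <= high)
	{
		r = (needle > a[low]) - (needle < a[low]);
		if (r <= 0)
		{
			*result = r;
			return low;
		}
		mid = low;
		low++;
	}
	if (low > high)				/* needle is beyond the end of the array */
	{
		*result = 1;
		return high < lo ? lo : high;
	}
	while (high > low)
	{
		mid = low + (high - low) / 2;
		r = (needle > a[mid]) - (needle < a[mid]);
		if (r == 0)
		{
			low = mid;
			break;
		}
		if (r > 0)
			low = mid + 1;
		else
			high = mid;
	}
	if (low != mid)				/* must report the comparison at low */
		r = (needle > a[low]) - (needle < a[low]);
	*result = r;
	return low;
}

int
main(void)
{
	int			a[] = {10, 20, 40, 80};
	int			r;
	int			i = binsrch_geq(a, 0, 4, 30, &r);

	printf("idx=%d cmp=%d\n", i, r);	/* idx=2 cmp=-1: 30 < 40 */
	return 0;
}
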
that any keys that were assumed to be satisfied already (due to - * caller passing cur_elem_trig=true) really are satisfied as expected - */ -#ifdef USE_ASSERT_CHECKING - if (cur_elem_trig) - { - if (ScanDirectionIsForward(dir) && array->low_compare) - Assert(DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, - array->low_compare->sk_collation, - tupdatum, - array->low_compare->sk_argument))); - - if (ScanDirectionIsBackward(dir) && array->high_compare) - Assert(DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, - array->high_compare->sk_collation, - tupdatum, - array->high_compare->sk_argument))); - } -#endif -} - -/* - * _bt_skiparray_set_element() -- Set skip array scan key's sk_argument - * - * Caller passes set_elem_result returned by _bt_binsrch_skiparray_skey for - * caller's tupdatum/tupnull. - * - * We copy tupdatum/tupnull into skey's sk_argument iff set_elem_result == 0. - * Otherwise, we set skey to either the lowest or highest value that's within - * the range of caller's skip array (whichever is the best available match to - * tupdatum/tupnull that is still within the range of the skip array according - * to _bt_binsrch_skiparray_skey/set_elem_result). - */ -static void -_bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, - int32 set_elem_result, Datum tupdatum, bool tupnull) -{ - Assert(skey->sk_flags & SK_BT_SKIP); - Assert(skey->sk_flags & SK_SEARCHARRAY); - - if (set_elem_result) - { - /* tupdatum/tupnull is out of the range of the skip array */ - Assert(!array->null_elem); - - _bt_array_set_low_or_high(rel, skey, array, set_elem_result < 0); - return; - } - - /* Advance skip array to tupdatum (or tupnull) value */ - if (unlikely(tupnull)) - { - _bt_skiparray_set_isnull(rel, skey, array); - return; - } - - /* Free memory previously allocated for sk_argument if needed */ - if (!array->attbyval && skey->sk_argument) - pfree(DatumGetPointer(skey->sk_argument)); - - /* tupdatum becomes new sk_argument/new current element */ - skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | - SK_BT_MINVAL | SK_BT_MAXVAL | - SK_BT_NEXT | SK_BT_PRIOR); - skey->sk_argument = datumCopy(tupdatum, array->attbyval, array->attlen); -} - -/* - * _bt_skiparray_set_isnull() -- set skip array scan key to NULL - */ -static void -_bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) -{ - Assert(skey->sk_flags & SK_BT_SKIP); - Assert(skey->sk_flags & SK_SEARCHARRAY); - Assert(array->null_elem && !array->low_compare && !array->high_compare); - - /* Free memory previously allocated for sk_argument if needed */ - if (!array->attbyval && skey->sk_argument) - pfree(DatumGetPointer(skey->sk_argument)); - - /* NULL becomes new sk_argument/new current element */ - skey->sk_argument = (Datum) 0; - skey->sk_flags &= ~(SK_BT_MINVAL | SK_BT_MAXVAL | - SK_BT_NEXT | SK_BT_PRIOR); - skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); -} - -/* - * _bt_start_array_keys() -- Initialize array keys at start of a scan - * - * Set up the cur_elem counters and fill in the first sk_argument value for - * each array scankey. 
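
Both _bt_skiparray_set_element() and _bt_skiparray_set_isnull() above follow one ownership rule for by-reference values: free the copy the scan key currently owns, then install either a fresh copy (datumCopy) or a NULL marker. A standalone sketch of that swap-with-free pattern, with a heap string standing in for a by-reference Datum (KeyStandin and key_set_element are hypothetical names):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct KeyStandin
{
	char	   *argument;		/* owned copy of current element, or NULL */
	bool		isnull;			/* true when current element is NULL */
} KeyStandin;

/* Install a copy of 'newval' (or NULL) as the key's current element,
 * releasing whatever by-reference value the key owned before. */
static void
key_set_element(KeyStandin *key, const char *newval)
{
	free(key->argument);		/* free old sk_argument, if any */
	if (newval == NULL)
	{
		key->argument = NULL;
		key->isnull = true;		/* cf. setting SK_SEARCHNULL | SK_ISNULL */
	}
	else
	{
		key->argument = strdup(newval); /* cf. datumCopy() */
		key->isnull = false;
	}
}

int
main(void)
{
	KeyStandin	key = {NULL, true};

	key_set_element(&key, "apple");
	key_set_element(&key, "banana");	/* the "apple" copy is freed here */
	printf("%s\n", key.argument);
	key_set_element(&key, NULL);
	printf("isnull=%d\n", key.isnull);
	free(key.argument);
	return 0;
}
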
- */ -void -_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) -{ - Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - Assert(so->numArrayKeys); - Assert(so->qual_ok); - - for (int i = 0; i < so->numArrayKeys; i++) - { - BTArrayKeyInfo *array = &so->arrayKeys[i]; - ScanKey skey = &so->keyData[array->scan_key]; - - Assert(skey->sk_flags & SK_SEARCHARRAY); - - _bt_array_set_low_or_high(rel, skey, array, - ScanDirectionIsForward(dir)); - } - so->scanBehind = so->oppositeDirCheck = false; /* reset */ -} - -/* - * _bt_array_set_low_or_high() -- Set array scan key to lowest/highest element - * - * Caller also passes associated scan key, which will have its argument set to - * the lowest/highest array value in passing. - */ -static void -_bt_array_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, - bool low_not_high) -{ - Assert(skey->sk_flags & SK_SEARCHARRAY); - - if (array->num_elems != -1) - { - /* set low or high element for SAOP array */ - int set_elem = 0; - - Assert(!(skey->sk_flags & SK_BT_SKIP)); - - if (!low_not_high) - set_elem = array->num_elems - 1; - - /* - * Just copy over array datum (only skip arrays require freeing and - * allocating memory for sk_argument) - */ - array->cur_elem = set_elem; - skey->sk_argument = array->elem_values[set_elem]; - - return; - } - - /* set low or high element for skip array */ - Assert(skey->sk_flags & SK_BT_SKIP); - Assert(array->num_elems == -1); - - /* Free memory previously allocated for sk_argument if needed */ - if (!array->attbyval && skey->sk_argument) - pfree(DatumGetPointer(skey->sk_argument)); - - /* Reset flags */ - skey->sk_argument = (Datum) 0; - skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | - SK_BT_MINVAL | SK_BT_MAXVAL | - SK_BT_NEXT | SK_BT_PRIOR); - - if (array->null_elem && - (low_not_high == ((skey->sk_flags & SK_BT_NULLS_FIRST) != 0))) - { - /* Requested element (either lowest or highest) has the value NULL */ - skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); - } - else if (low_not_high) - { - /* Setting array to lowest element (according to low_compare) */ - skey->sk_flags |= SK_BT_MINVAL; - } - else - { - /* Setting array to highest element (according to high_compare) */ - skey->sk_flags |= SK_BT_MAXVAL; - } -} - -/* - * _bt_array_decrement() -- decrement array scan key's sk_argument - * - * Return value indicates whether caller's array was successfully decremented. - * Cannot decrement an array whose current element is already the first one. - */ -static bool -_bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) -{ - bool uflow = false; - Datum dec_sk_argument; - - Assert(skey->sk_flags & SK_SEARCHARRAY); - Assert(!(skey->sk_flags & (SK_BT_MAXVAL | SK_BT_NEXT | SK_BT_PRIOR))); - - /* SAOP array? 
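
In _bt_array_set_low_or_high() above, NULL occupies the low end of a skip array's range exactly when the column stores NULLs first, and the high end exactly when it stores NULLs last; otherwise the requested end is the -inf/+inf style MINVAL/MAXVAL sentinel. A condensed truth-table sketch of that test (names are stand-ins):

#include <stdbool.h>
#include <stdio.h>

typedef enum
{
	ELEM_NULL, ELEM_MINVAL, ELEM_MAXVAL
} SkipElem;

/* Which sentinel does the low (or high) end of a skip array's range get?
 * NULL wins the low end under NULLS FIRST and the high end under NULLS
 * LAST; otherwise fall back to the MINVAL/MAXVAL sentinel. */
static SkipElem
skip_array_end(bool low_not_high, bool null_elem, bool nulls_first)
{
	if (null_elem && low_not_high == nulls_first)
		return ELEM_NULL;
	return low_not_high ? ELEM_MINVAL : ELEM_MAXVAL;
}

int
main(void)
{
	/* low end, NULLS FIRST: NULL is the lowest element */
	printf("%d\n", skip_array_end(true, true, true) == ELEM_NULL);
	/* high end, NULLS LAST: NULL is the highest element */
	printf("%d\n", skip_array_end(false, true, false) == ELEM_NULL);
	/* low end, NULLS LAST: fall back to the MINVAL sentinel */
	printf("%d\n", skip_array_end(true, true, false) == ELEM_MINVAL);
	return 0;
}
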
*/ - if (array->num_elems != -1) - { - Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); - if (array->cur_elem > 0) - { - /* - * Just decrement current element, and assign its datum to skey - * (only skip arrays need us to free existing sk_argument memory) - */ - array->cur_elem--; - skey->sk_argument = array->elem_values[array->cur_elem]; - - /* Successfully decremented array */ - return true; - } - - /* Cannot decrement to before first array element */ - return false; - } - - /* Nope, this is a skip array */ - Assert(skey->sk_flags & SK_BT_SKIP); - - /* - * The sentinel value that represents the minimum value within the range - * of a skip array (often just -inf) is never decrementable - */ - if (skey->sk_flags & SK_BT_MINVAL) - return false; - - /* - * When the current array element is NULL, and the lowest sorting value in - * the index is also NULL, we cannot decrement before first array element - */ - if ((skey->sk_flags & SK_ISNULL) && (skey->sk_flags & SK_BT_NULLS_FIRST)) - return false; - - /* - * Opclasses without skip support "decrement" the scan key's current - * element by setting the PRIOR flag. The true prior value is determined - * by repositioning to the last index tuple < existing sk_argument/current - * array element. Note that this works in the usual way when the scan key - * is already marked ISNULL (i.e. when the current element is NULL). - */ - if (!array->sksup) - { - /* Successfully "decremented" array */ - skey->sk_flags |= SK_BT_PRIOR; - return true; - } - - /* - * Opclasses with skip support directly decrement sk_argument - */ - if (skey->sk_flags & SK_ISNULL) - { - Assert(!(skey->sk_flags & SK_BT_NULLS_FIRST)); - - /* - * Existing sk_argument/array element is NULL (for an IS NULL qual). - * - * "Decrement" from NULL to the high_elem value provided by opclass - * skip support routine. - */ - skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); - skey->sk_argument = datumCopy(array->sksup->high_elem, - array->attbyval, array->attlen); - return true; - } - - /* - * Ask opclass support routine to provide decremented copy of existing - * non-NULL sk_argument - */ - dec_sk_argument = array->sksup->decrement(rel, skey->sk_argument, &uflow); - if (unlikely(uflow)) - { - /* dec_sk_argument has undefined value (so no pfree) */ - if (array->null_elem && (skey->sk_flags & SK_BT_NULLS_FIRST)) - { - _bt_skiparray_set_isnull(rel, skey, array); - - /* Successfully "decremented" array to NULL */ - return true; - } - - /* Cannot decrement to before first array element */ - return false; - } - - /* - * Successfully decremented sk_argument to a non-NULL value. Make sure - * that the decremented value is still within the range of the array. - */ - if (array->low_compare && - !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, - array->low_compare->sk_collation, - dec_sk_argument, - array->low_compare->sk_argument))) - { - /* Keep existing sk_argument after all */ - if (!array->attbyval) - pfree(DatumGetPointer(dec_sk_argument)); - - /* Cannot decrement to before first array element */ - return false; - } - - /* Accept value returned by opclass decrement callback */ - if (!array->attbyval && skey->sk_argument) - pfree(DatumGetPointer(skey->sk_argument)); - skey->sk_argument = dec_sk_argument; - - /* Successfully decremented array */ - return true; -} - -/* - * _bt_array_increment() -- increment array scan key's sk_argument - * - * Return value indicates whether caller's array was successfully incremented. 
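
The skip-support decrement()/increment() callbacks used above report wraparound through an out-parameter rather than a sentinel return value, since every representable value is a legal Datum. A standalone sketch of that contract for a hypothetical bounded 16-bit key, mirroring the uflow handling in _bt_array_decrement():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Return value - 1, or set *underflow when no smaller value exists.  On
 * underflow the return value is undefined and must not be used (matching
 * the "dec_sk_argument has undefined value" convention above). */
static int16_t
int16_decrement(int16_t value, bool *underflow)
{
	if (value == INT16_MIN)
	{
		*underflow = true;
		return 0;				/* undefined; caller must ignore */
	}
	*underflow = false;
	return value - 1;
}

int
main(void)
{
	bool		uflow;
	int16_t		prior = int16_decrement(7, &uflow);

	printf("%d %d\n", prior, uflow);	/* 6 0 */
	(void) int16_decrement(INT16_MIN, &uflow);
	printf("%d\n", uflow);		/* 1: caller falls back to NULL, or gives
								 * up, exactly as in the code above */
	return 0;
}
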
- * Cannot increment an array whose current element is already the final one. - */ -static bool -_bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) -{ - bool oflow = false; - Datum inc_sk_argument; - - Assert(skey->sk_flags & SK_SEARCHARRAY); - Assert(!(skey->sk_flags & (SK_BT_MINVAL | SK_BT_NEXT | SK_BT_PRIOR))); - - /* SAOP array? */ - if (array->num_elems != -1) - { - Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); - if (array->cur_elem < array->num_elems - 1) - { - /* - * Just increment current element, and assign its datum to skey - * (only skip arrays need us to free existing sk_argument memory) - */ - array->cur_elem++; - skey->sk_argument = array->elem_values[array->cur_elem]; - - /* Successfully incremented array */ - return true; - } - - /* Cannot increment past final array element */ - return false; - } - - /* Nope, this is a skip array */ - Assert(skey->sk_flags & SK_BT_SKIP); - - /* - * The sentinel value that represents the maximum value within the range - * of a skip array (often just +inf) is never incrementable - */ - if (skey->sk_flags & SK_BT_MAXVAL) - return false; - - /* - * When the current array element is NULL, and the highest sorting value - * in the index is also NULL, we cannot increment past the final element - */ - if ((skey->sk_flags & SK_ISNULL) && !(skey->sk_flags & SK_BT_NULLS_FIRST)) - return false; - - /* - * Opclasses without skip support "increment" the scan key's current - * element by setting the NEXT flag. The true next value is determined by - * repositioning to the first index tuple > existing sk_argument/current - * array element. Note that this works in the usual way when the scan key - * is already marked ISNULL (i.e. when the current element is NULL). - */ - if (!array->sksup) - { - /* Successfully "incremented" array */ - skey->sk_flags |= SK_BT_NEXT; - return true; - } - - /* - * Opclasses with skip support directly increment sk_argument - */ - if (skey->sk_flags & SK_ISNULL) - { - Assert(skey->sk_flags & SK_BT_NULLS_FIRST); - - /* - * Existing sk_argument/array element is NULL (for an IS NULL qual). - * - * "Increment" from NULL to the low_elem value provided by opclass - * skip support routine. - */ - skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); - skey->sk_argument = datumCopy(array->sksup->low_elem, - array->attbyval, array->attlen); - return true; - } - - /* - * Ask opclass support routine to provide incremented copy of existing - * non-NULL sk_argument - */ - inc_sk_argument = array->sksup->increment(rel, skey->sk_argument, &oflow); - if (unlikely(oflow)) - { - /* inc_sk_argument has undefined value (so no pfree) */ - if (array->null_elem && !(skey->sk_flags & SK_BT_NULLS_FIRST)) - { - _bt_skiparray_set_isnull(rel, skey, array); - - /* Successfully "incremented" array to NULL */ - return true; - } - - /* Cannot increment past final array element */ - return false; - } - - /* - * Successfully incremented sk_argument to a non-NULL value. Make sure - * that the incremented value is still within the range of the array. 
- */ - if (array->high_compare && - !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, - array->high_compare->sk_collation, - inc_sk_argument, - array->high_compare->sk_argument))) - { - /* Keep existing sk_argument after all */ - if (!array->attbyval) - pfree(DatumGetPointer(inc_sk_argument)); - - /* Cannot increment past final array element */ - return false; - } - - /* Accept value returned by opclass increment callback */ - if (!array->attbyval && skey->sk_argument) - pfree(DatumGetPointer(skey->sk_argument)); - skey->sk_argument = inc_sk_argument; - - /* Successfully incremented array */ - return true; -} - -/* - * _bt_advance_array_keys_increment() -- Advance to next set of array elements - * - * Advances the array keys by a single increment in the current scan - * direction. When there are multiple array keys this can roll over from the - * lowest order array to higher order arrays. - * - * Returns true if there is another set of values to consider, false if not. - * On true result, the scankeys are initialized with the next set of values. - * On false result, the scankeys stay the same, and the array keys are not - * advanced (every array remains at its final element for scan direction). - */ -static bool -_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, - bool *skip_array_set) -{ - Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - /* - * We must advance the last array key most quickly, since it will - * correspond to the lowest-order index column among the available - * qualifications - */ - for (int i = so->numArrayKeys - 1; i >= 0; i--) - { - BTArrayKeyInfo *array = &so->arrayKeys[i]; - ScanKey skey = &so->keyData[array->scan_key]; - - if (array->num_elems == -1) - *skip_array_set = true; - - if (ScanDirectionIsForward(dir)) - { - if (_bt_array_increment(rel, skey, array)) - return true; - } - else - { - if (_bt_array_decrement(rel, skey, array)) - return true; - } - - /* - * Couldn't increment (or decrement) array. Handle array roll over. - * - * Start over at the array's lowest sorting value (or its highest - * value, for backward scans)... - */ - _bt_array_set_low_or_high(rel, skey, array, - ScanDirectionIsForward(dir)); - - /* ...then increment (or decrement) next most significant array */ - } - - /* - * The array keys are now exhausted. - * - * Restore the array keys to the state they were in immediately before we - * were called. This ensures that the arrays only ever ratchet in the - * current scan direction. - * - * Without this, scans could overlook matching tuples when the scan - * direction gets reversed just before btgettuple runs out of items to - * return, but just after _bt_readpage prepares all the items from the - * scan's final page in so->currPos. When we're on the final page it is - * typical for so->currPos to get invalidated once btgettuple finally - * returns false, which'll effectively invalidate the scan's array keys. - * That hasn't happened yet, though -- and in general it may never happen. - */ - _bt_start_array_keys(scan, -dir); - - return false; -} - -/* - * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required - * - * Called when _bt_advance_array_keys decides to start a new primitive index - * scan on the basis of the current scan position being before the position - * that _bt_first is capable of repositioning the scan to by applying an - * inequality operator required in the opposite-to-scan direction only. 
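
_bt_advance_array_keys_increment() above behaves like an odometer whose wheels are the scan's arrays: advance the least significant array first, and on rollover reset that wheel and carry into the next more significant one. A standalone mixed-radix sketch of that carry loop (forward direction only; the real code additionally restores the pre-call state on exhaustion, via _bt_start_array_keys(scan, -dir), rather than leaving the arrays reset):

#include <stdbool.h>
#include <stdio.h>

/* Advance a set of array cursors by one step, least significant (last)
 * array first.  On rollover, reset the wheel to its first element and
 * carry into the next more significant array.  Returns false when every
 * combination has been consumed. */
static bool
advance_increment(int *cur, const int *nelems, int narrays)
{
	for (int i = narrays - 1; i >= 0; i--)
	{
		if (cur[i] < nelems[i] - 1)
		{
			cur[i]++;
			return true;
		}
		cur[i] = 0;				/* roll over, then carry */
	}
	return false;				/* arrays exhausted */
}

int
main(void)
{
	int			cur[2] = {0, 0};
	const int	nelems[2] = {2, 3}; /* e.g. a IN (x, y) AND c IN (p, q, r) */

	do
	{
		printf("(%d,%d)\n", cur[0], cur[1]);
	} while (advance_increment(cur, nelems, 2));
	return 0;					/* prints all 6 combinations in order */
}
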
- * - * Although equality strategy scan keys (for both arrays and non-arrays alike) - * are either marked required in both directions or in neither direction, - * there is a sense in which non-required arrays behave like required arrays. - * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)", - * the scan key on "c" is non-required, but nevertheless enables positioning - * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the - * first descent of the tree by _bt_first. Later on, there could also be a - * second descent, that places the scan right before tuples >= "(200, 3, 5)". - * _bt_first must never be allowed to build an insertion scan key whose "c" - * entry is set to a value other than 5, the "c" array's first element/value. - * (Actually, it's the first in the current scan direction. This example uses - * a forward scan.) - * - * Calling here resets the array scan key elements for the scan's non-required - * arrays. This is strictly necessary for correctness in a subset of cases - * involving "required in opposite direction"-triggered primitive index scans. - * Not all callers are at risk of _bt_first using a non-required array like - * this, but advancement always resets the arrays when another primitive scan - * is scheduled, just to keep things simple. Array advancement even makes - * sure to reset non-required arrays during scans that have no inequalities. - * (Advancement still won't call here when there are no inequalities, though - * that's just because it's all handled indirectly instead.) - * - * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that - * everybody got this right. - * - * Note: In practice almost all SAOP arrays are marked required during - * preprocessing (if necessary by generating skip arrays). It is hardly ever - * truly necessary to call here, but consistently doing so is simpler. - */ -static void -_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) -{ - Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int arrayidx = 0; - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array = NULL; - - if (!(cur->sk_flags & SK_SEARCHARRAY) || - cur->sk_strategy != BTEqualStrategyNumber) - continue; - - array = &so->arrayKeys[arrayidx++]; - Assert(array->scan_key == ikey); - - if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) - continue; - - Assert(array->num_elems != -1); /* No non-required skip arrays */ - - _bt_array_set_low_or_high(rel, cur, array, - ScanDirectionIsForward(dir)); - } -} - -/* - * _bt_tuple_before_array_skeys() -- too early to advance required arrays? - * - * We always compare the tuple using the current array keys (which we assume - * are already set in so->keyData[]). readpagetup indicates if tuple is the - * scan's current _bt_readpage-wise tuple. - * - * readpagetup callers must only call here when _bt_check_compare already set - * continuescan=false. We help these callers deal with _bt_check_compare's - * inability to distinguish between the < and > cases (it uses equality - * operator scan keys, whereas we use 3-way ORDER procs). These callers pass - * a _bt_check_compare-set sktrig value that indicates which scan key - * triggered the call (!readpagetup callers just pass us sktrig=0 instead). - * This information allows us to avoid wastefully checking earlier scan keys - * that were already deemed to have been satisfied inside _bt_check_compare. 
- * - * Returns false when caller's tuple is >= the current required equality scan - * keys (or <=, in the case of backwards scans). This happens to readpagetup - * callers when the scan has reached the point of needing its array keys - * advanced; caller will need to advance required and non-required arrays at - * scan key offsets >= sktrig, plus scan keys < sktrig iff sktrig rolls over. - * (When we return false to readpagetup callers, tuple can only be == current - * required equality scan keys when caller's sktrig indicates that the arrays - * need to be advanced due to an unsatisfied required inequality key trigger.) - * - * Returns true when caller passes a tuple that is < the current set of - * equality keys for the most significant non-equal required scan key/column - * (or > the keys, during backwards scans). This happens to readpagetup - * callers when tuple is still before the start of matches for the scan's - * required equality strategy scan keys. (sktrig can't have indicated that an - * inequality strategy scan key wasn't satisfied in _bt_check_compare when we - * return true. In fact, we automatically return false when passed such an - * inequality sktrig by readpagetup callers -- _bt_check_compare's initial - * continuescan=false doesn't really need to be confirmed here by us.) - * - * !readpagetup callers optionally pass us *scanBehind, which tracks whether - * any missing truncated attributes might have affected array advancement - * (compared to what would happen if it was shown the first non-pivot tuple on - * the page to the right of caller's finaltup/high key tuple instead). It's - * only possible that we'll set *scanBehind to true when caller passes us a - * pivot tuple (with truncated -inf attributes) that we return false for. - */ -static bool -_bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, - IndexTuple tuple, TupleDesc tupdesc, int tupnatts, - bool readpagetup, int sktrig, bool *scanBehind) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - Assert(so->numArrayKeys); - Assert(so->numberOfKeys); - Assert(sktrig == 0 || readpagetup); - Assert(!readpagetup || scanBehind == NULL); - - if (scanBehind) - *scanBehind = false; - - for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - Datum tupdatum; - bool tupnull; - int32 result; - - /* readpagetup calls require one ORDER proc comparison (at most) */ - Assert(!readpagetup || ikey == sktrig); - - /* - * Once we reach a non-required scan key, we're completely done. - * - * Note: we deliberately don't consider the scan direction here. - * _bt_advance_array_keys caller requires that we track *scanBehind - * without concern for scan direction. - */ - if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0) - { - Assert(!readpagetup); - Assert(ikey > sktrig || ikey == 0); - return false; - } - - if (cur->sk_attno > tupnatts) - { - Assert(!readpagetup); - - /* - * When we reach a high key's truncated attribute, assume that the - * tuple attribute's value is >= the scan's equality constraint - * scan keys (but set *scanBehind to let interested callers know - * that a truncated attribute might have affected our answer). 
- */ - if (scanBehind) - *scanBehind = true; - - return false; - } - - /* - * Deal with inequality strategy scan keys that _bt_check_compare set - * continuescan=false for - */ - if (cur->sk_strategy != BTEqualStrategyNumber) - { - /* - * When _bt_check_compare indicated that a required inequality - * scan key wasn't satisfied, there's no need to verify anything; - * caller always calls _bt_advance_array_keys with this sktrig. - */ - if (readpagetup) - return false; - - /* - * Otherwise we can't give up, since we must check all required - * scan keys (required in either direction) in order to correctly - * track *scanBehind for caller - */ - continue; - } - - tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); - - if (likely(!(cur->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))) - { - /* Scankey has a valid/comparable sk_argument value */ - result = _bt_compare_array_skey(&so->orderProcs[ikey], - tupdatum, tupnull, - cur->sk_argument, cur); - - if (result == 0) - { - /* - * Interpret result in a way that takes NEXT/PRIOR into - * account - */ - if (cur->sk_flags & SK_BT_NEXT) - result = -1; - else if (cur->sk_flags & SK_BT_PRIOR) - result = 1; - - Assert(result == 0 || (cur->sk_flags & SK_BT_SKIP)); - } - } - else - { - BTArrayKeyInfo *array = NULL; - - /* - * Current array element/array = scan key value is a sentinel - * value that represents the lowest (or highest) possible value - * that's still within the range of the array. - * - * Like _bt_first, we only see MINVAL keys during forwards scans - * (and similarly only see MAXVAL keys during backwards scans). - * Even if the scan's direction changes, we'll stop at some higher - * order key before we can ever reach any MAXVAL (or MINVAL) keys. - * (However, unlike _bt_first we _can_ get to keys marked either - * NEXT or PRIOR, regardless of the scan's current direction.) - */ - Assert(ScanDirectionIsForward(dir) ? - !(cur->sk_flags & SK_BT_MAXVAL) : - !(cur->sk_flags & SK_BT_MINVAL)); - - /* - * There are no valid sk_argument values in MINVAL/MAXVAL keys. - * Check if tupdatum is within the range of skip array instead. - */ - for (int arrayidx = 0; arrayidx < so->numArrayKeys; arrayidx++) - { - array = &so->arrayKeys[arrayidx]; - if (array->scan_key == ikey) - break; - } - - _bt_binsrch_skiparray_skey(false, dir, tupdatum, tupnull, - array, cur, &result); - - if (result == 0) - { - /* - * tupdatum satisfies both low_compare and high_compare, so - * it's time to advance the array keys. - * - * Note: It's possible that the skip array will "advance" from - * its MINVAL (or MAXVAL) representation to an alternative, - * logically equivalent representation of the same value: a - * representation where the = key gets a valid datum in its - * sk_argument. This is only possible when low_compare uses - * the >= strategy (or high_compare uses the <= strategy). - */ - return false; - } - } - - /* - * Does this comparison indicate that caller must _not_ advance the - * scan's arrays just yet? - */ - if ((ScanDirectionIsForward(dir) && result < 0) || - (ScanDirectionIsBackward(dir) && result > 0)) - return true; - - /* - * Does this comparison indicate that caller should now advance the - * scan's arrays? (Must be if we get here during a readpagetup call.) - */ - if (readpagetup || result != 0) - { - Assert(result != 0); - return false; - } - - /* - * Inconclusive -- need to check later scan keys, too. - * - * This must be a finaltup precheck, or a call made from an assertion. 
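
The loop in _bt_tuple_before_array_skeys() above amounts to a lexicographic prefix comparison over the required equality keys: report "before" on the first column where the tuple sorts strictly ahead of the keys for the scan direction, "not before" on the first column strictly behind, and keep going on ties. A standalone sketch over int columns (tuple_before_keys is a hypothetical stand-in that ignores NULLs, inequalities, and truncated attributes):

#include <stdbool.h>
#include <stdio.h>

/* Is the tuple strictly before the current key values in scan order?
 * dir > 0 models a forward scan, dir < 0 a backward scan. */
static bool
tuple_before_keys(const int *tup, const int *keys, int nkeys, int dir)
{
	for (int i = 0; i < nkeys; i++)
	{
		int			r = (tup[i] > keys[i]) - (tup[i] < keys[i]);

		if ((dir > 0 && r < 0) || (dir < 0 && r > 0))
			return true;		/* too early to advance the arrays */
		if (r != 0)
			return false;		/* tuple is at or past the keys */
		/* equal on this column: inconclusive, check the next key */
	}
	return false;				/* exact match on every required key */
}

int
main(void)
{
	int			keys[] = {42, 5};
	int			early[] = {42, 3};
	int			late[] = {42, 7};

	printf("%d %d\n",
		   tuple_before_keys(early, keys, 2, 1),	/* 1 */
		   tuple_before_keys(late, keys, 2, 1));	/* 0 */
	return 0;
}
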
- */ - Assert(result == 0); - } - - Assert(!readpagetup); - - return false; -} - -/* - * _bt_start_prim_scan() -- start scheduled primitive index scan? - * - * Returns true if _bt_checkkeys scheduled another primitive index scan, just - * as the last one ended. Otherwise returns false, indicating that the array - * keys are now fully exhausted. - * - * Only call here during scans with one or more equality type array scan keys, - * after _bt_first or _bt_next return false. - */ -bool -_bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - Assert(so->numArrayKeys); - - so->scanBehind = so->oppositeDirCheck = false; /* reset */ - - /* - * Array keys are advanced within _bt_checkkeys when the scan reaches the - * leaf level (more precisely, they're advanced when the scan reaches the - * end of each distinct set of array elements). This process avoids - * repeat access to leaf pages (across multiple primitive index scans) by - * advancing the scan's array keys when it allows the primitive index scan - * to find nearby matching tuples (or when it eliminates ranges of array - * key space that can't possibly be satisfied by any index tuple). - * - * _bt_checkkeys sets a simple flag variable to schedule another primitive - * index scan. The flag tells us what to do. - * - * We cannot rely on _bt_first always reaching _bt_checkkeys. There are - * various cases where that won't happen. For example, if the index is - * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys. - * We also don't expect a call to _bt_checkkeys during searches for a - * non-existent value that happens to be lower/higher than any existing - * value in the index. - * - * We don't require special handling for these cases -- we don't need to - * be explicitly instructed to _not_ perform another primitive index scan. - * It's up to code under the control of _bt_first to always set the flag - * when another primitive index scan will be required. - * - * This works correctly, even with the tricky cases listed above, which - * all involve access to leaf pages "near the boundaries of the key space" - * (whether it's from a leftmost/rightmost page, or an imaginary empty - * leaf root page). If _bt_checkkeys cannot be reached by a primitive - * index scan for one set of array keys, then it also won't be reached for - * any later set ("later" in terms of the direction that we scan the index - * and advance the arrays). The array keys won't have advanced in these - * cases, but that's the correct behavior (even _bt_advance_array_keys - * won't always advance the arrays at the point they become "exhausted"). - */ - if (so->needPrimScan) - { - Assert(_bt_verify_arrays_bt_first(scan, dir)); - - /* - * Flag was set -- must call _bt_first again, which will reset the - * scan's needPrimScan flag - */ - return true; - } - - /* The top-level index scan ran out of tuples in this scan direction */ - if (scan->parallel_scan != NULL) - _bt_parallel_done(scan); - - return false; -} - -/* - * _bt_advance_array_keys() -- Advance array elements using a tuple - * - * The scan always gets a new qual as a consequence of calling here (except - * when we determine that the top-level scan has run out of matching tuples). - * All later _bt_check_compare calls also use the same new qual that was first - * used here (at least until the next call here advances the keys once again). 
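
_bt_start_prim_scan() above reduces primitive-scan scheduling to a single flag: code run under _bt_first sets it whenever another descent will pay off, and the top-level getter simply loops until the flag stays clear. A condensed, hypothetical driver sketch of that shape; all names are stand-ins (not the btgettuple code), and in the real scan the flag is reset by _bt_first rather than by the scheduling check:

#include <stdbool.h>
#include <stdio.h>

typedef struct ScanStandin
{
	bool		needPrimScan;	/* set by the checkkeys layer */
	int			nprimscans;		/* just for the demo */
} ScanStandin;

static bool
prim_scan(ScanStandin *scan)
{
	/* Pretend each of the first three descents schedules another one. */
	scan->needPrimScan = (++scan->nprimscans < 3);
	return false;				/* no more matches this descent */
}

static bool
start_prim_scan(ScanStandin *scan)
{
	if (scan->needPrimScan)
	{
		scan->needPrimScan = false; /* another descent will run */
		return true;
	}
	return false;				/* array keys fully exhausted */
}

int
main(void)
{
	ScanStandin scan = {false, 0};

	do
	{
		(void) prim_scan(&scan);	/* would return matching tuples here */
	} while (start_prim_scan(&scan));

	printf("primitive scans: %d\n", scan.nprimscans);	/* 3 */
	return 0;
}
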
- * It's convenient to structure _bt_check_compare rechecks of caller's tuple - * (using the new qual) as one the steps of advancing the scan's array keys, - * so this function works as a wrapper around _bt_check_compare. - * - * Like _bt_check_compare, we'll set pstate.continuescan on behalf of the - * caller, and return a boolean indicating if caller's tuple satisfies the - * scan's new qual. But unlike _bt_check_compare, we set so->needPrimScan - * when we set continuescan=false, indicating if a new primitive index scan - * has been scheduled (otherwise, the top-level scan has run out of tuples in - * the current scan direction). - * - * Caller must use _bt_tuple_before_array_skeys to determine if the current - * place in the scan is >= the current array keys _before_ calling here. - * We're responsible for ensuring that caller's tuple is <= the newly advanced - * required array keys once we return. We try to find an exact match, but - * failing that we'll advance the array keys to whatever set of array elements - * comes next in the key space for the current scan direction. Required array - * keys "ratchet forwards" (or backwards). They can only advance as the scan - * itself advances through the index/key space. - * - * (The rules are the same for backwards scans, except that the operators are - * flipped: just replace the precondition's >= operator with a <=, and the - * postcondition's <= operator with a >=. In other words, just swap the - * precondition with the postcondition.) - * - * We also deal with "advancing" non-required arrays here (or arrays that are - * treated as non-required for the duration of a _bt_readpage call). Callers - * whose sktrig scan key is non-required specify sktrig_required=false. These - * calls are the only exception to the general rule about always advancing the - * required array keys (the scan may not even have a required array). These - * callers should just pass a NULL pstate (since there is never any question - * of stopping the scan). No call to _bt_tuple_before_array_skeys is required - * ahead of these calls (it's already clear that any required scan keys must - * be satisfied by caller's tuple). - * - * Note that we deal with non-array required equality strategy scan keys as - * degenerate single element arrays here. Obviously, they can never really - * advance in the way that real arrays can, but they must still affect how we - * advance real array scan keys (exactly like true array equality scan keys). - * We have to keep around a 3-way ORDER proc for these (using the "=" operator - * won't do), since in general whether the tuple is < or > _any_ unsatisfied - * required equality key influences how the scan's real arrays must advance. - * - * Note also that we may sometimes need to advance the array keys when the - * existing required array keys (and other required equality keys) are already - * an exact match for every corresponding value from caller's tuple. We must - * do this for inequalities that _bt_check_compare set continuescan=false for. - * They'll advance the array keys here, just like any other scan key that - * _bt_check_compare stops on. (This can even happen _after_ we advance the - * array keys, in which case we'll advance the array keys a second time. That - * way _bt_checkkeys caller always has its required arrays advance to the - * maximum possible extent that its tuple will allow.) 
- */ -static bool -_bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, - IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - int sktrig, bool sktrig_required) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - Relation rel = scan->indexRelation; - ScanDirection dir = so->currPos.dir; - int arrayidx = 0; - bool beyond_end_advance = false, - skip_array_advanced = false, - has_required_opposite_direction_only = false, - all_required_satisfied = true, - all_satisfied = true; - - Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck); - Assert(_bt_verify_keys_with_arraykeys(scan)); - - if (sktrig_required) - { - /* - * Precondition array state assertion - */ - Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, - tupnatts, false, 0, NULL)); - - /* - * Once we return we'll have a new set of required array keys, so - * reset state used by "look ahead" optimization - */ - pstate->rechecks = 0; - pstate->targetdistance = 0; - } - else if (sktrig < so->numberOfKeys - 1 && - !(so->keyData[so->numberOfKeys - 1].sk_flags & SK_SEARCHARRAY)) - { - int least_sign_ikey = so->numberOfKeys - 1; - bool continuescan; - - /* - * Optimization: perform a precheck of the least significant key - * during !sktrig_required calls when it isn't already our sktrig - * (provided the precheck key is not itself an array). - * - * When the precheck works out we'll avoid an expensive binary search - * of sktrig's array (plus any other arrays before least_sign_ikey). - */ - Assert(so->keyData[sktrig].sk_flags & SK_SEARCHARRAY); - if (!_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, - false, &continuescan, - &least_sign_ikey)) - return false; - } - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array = NULL; - Datum tupdatum; - bool required = false, - required_opposite_direction_only = false, - tupnull; - int32 result; - int set_elem = 0; - - if (cur->sk_strategy == BTEqualStrategyNumber) - { - /* Manage array state */ - if (cur->sk_flags & SK_SEARCHARRAY) - { - array = &so->arrayKeys[arrayidx++]; - Assert(array->scan_key == ikey); - } - } - else - { - /* - * Are any inequalities required in the opposite direction only - * present here? - */ - if (((ScanDirectionIsForward(dir) && - (cur->sk_flags & (SK_BT_REQBKWD))) || - (ScanDirectionIsBackward(dir) && - (cur->sk_flags & (SK_BT_REQFWD))))) - has_required_opposite_direction_only = - required_opposite_direction_only = true; - } - - /* Optimization: skip over known-satisfied scan keys */ - if (ikey < sktrig) - continue; - - if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) - { - required = true; - - if (cur->sk_attno > tupnatts) - { - /* Set this just like _bt_tuple_before_array_skeys */ - Assert(sktrig < ikey); - so->scanBehind = true; - } - } - - /* - * Handle a required non-array scan key that the initial call to - * _bt_check_compare indicated triggered array advancement, if any. - * - * The non-array scan key's strategy will be <, <=, or = during a - * forwards scan (or any one of =, >=, or > during a backwards scan). - * It follows that the corresponding tuple attribute's value must now - * be either > or >= the scan key value (for backwards scans it must - * be either < or <= that value). - * - * If this is a required equality strategy scan key, this is just an - * optimization; _bt_tuple_before_array_skeys already confirmed that - * this scan key places us ahead of caller's tuple. There's no need - * to repeat that work now. 
(The same underlying principle also gets - * applied by the cur_elem_trig optimization used to speed up searches - * for the next array element.) - * - * If this is a required inequality strategy scan key, we _must_ rely - * on _bt_check_compare like this; we aren't capable of directly - * evaluating required inequality strategy scan keys here, on our own. - */ - if (ikey == sktrig && !array) - { - Assert(sktrig_required && required && all_required_satisfied); - - /* Use "beyond end" advancement. See below for an explanation. */ - beyond_end_advance = true; - all_satisfied = all_required_satisfied = false; - - continue; - } - - /* - * Nothing more for us to do with an inequality strategy scan key that - * wasn't the one that _bt_check_compare stopped on, though. - * - * Note: if our later call to _bt_check_compare (to recheck caller's - * tuple) sets continuescan=false due to finding this same inequality - * unsatisfied (possible when it's required in the scan direction), - * we'll deal with it via a recursive "second pass" call. - */ - else if (cur->sk_strategy != BTEqualStrategyNumber) - continue; - - /* - * Nothing for us to do with an equality strategy scan key that isn't - * marked required, either -- unless it's a non-required array - */ - else if (!required && !array) - continue; - - /* - * Here we perform steps for all array scan keys after a required - * array scan key whose binary search triggered "beyond end of array - * element" array advancement due to encountering a tuple attribute - * value > the closest matching array key (or < for backwards scans). - */ - if (beyond_end_advance) - { - if (array) - _bt_array_set_low_or_high(rel, cur, array, - ScanDirectionIsBackward(dir)); - - continue; - } - - /* - * Here we perform steps for all array scan keys after a required - * array scan key whose tuple attribute was < the closest matching - * array key when we dealt with it (or > for backwards scans). - * - * This earlier required array key already puts us ahead of caller's - * tuple in the key space (for the current scan direction). We must - * make sure that subsequent lower-order array keys do not put us too - * far ahead (ahead of tuples that have yet to be seen by our caller). - * For example, when a tuple "(a, b) = (42, 5)" advances the array - * keys on "a" from 40 to 45, we must also set "b" to whatever the - * first array element for "b" is. It would be wrong to allow "b" to - * be set based on the tuple value. - * - * Perform the same steps with truncated high key attributes. You can - * think of this as a "binary search" for the element closest to the - * value -inf. Again, the arrays must never get ahead of the scan. 
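
The "(a, b) = (42, 5)" example above is the crux of array advancement: when a more significant array advances inexactly, every less significant array must restart at its first element for the scan direction rather than being positioned from the tuple, or the arrays would get ahead of tuples the scan has yet to return. A standalone two-array sketch of that rule (beyond-end carry is not modeled here; see the odometer sketch earlier):

#include <stdbool.h>
#include <stdio.h>

/* Return index of first a[i] >= v; report whether the match was exact. */
static int
first_geq(const int *a, int n, int v, bool *exact)
{
	int			i = 0;

	while (i < n && a[i] < v)
		i++;
	*exact = (i < n && a[i] == v);
	return i < n ? i : n - 1;	/* beyond-end carry not modeled */
}

int
main(void)
{
	int			a_elems[] = {40, 45, 50};
	int			b_elems[] = {1, 5, 9};
	int			cur_a,
				cur_b;
	bool		exact;

	/* Tuple (42, 5): "a" advances inexactly from 40 toward 45 ... */
	cur_a = first_geq(a_elems, 3, 42, &exact);
	/* ... so "b" must restart at its first element, NOT be set from 5 */
	cur_b = exact ? first_geq(b_elems, 3, 5, &exact) : 0;

	printf("new keys: a=%d b=%d\n", a_elems[cur_a], b_elems[cur_b]);
	/* new keys: a=45 b=1 -- the scan continues to tuples >= (45, 1) */
	return 0;
}
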
- */ - if (!all_required_satisfied || cur->sk_attno > tupnatts) - { - if (array) - _bt_array_set_low_or_high(rel, cur, array, - ScanDirectionIsForward(dir)); - - continue; - } - - /* - * Search in scankey's array for the corresponding tuple attribute - * value from caller's tuple - */ - tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); - - if (array) - { - bool cur_elem_trig = (sktrig_required && ikey == sktrig); - - /* - * "Binary search" by checking if tupdatum/tupnull are within the - * range of the skip array - */ - if (array->num_elems == -1) - _bt_binsrch_skiparray_skey(cur_elem_trig, dir, - tupdatum, tupnull, array, cur, - &result); - - /* - * Binary search for the closest match from the SAOP array - */ - else - set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], - cur_elem_trig, dir, - tupdatum, tupnull, array, cur, - &result); - } - else - { - Assert(required); - - /* - * This is a required non-array equality strategy scan key, which - * we'll treat as a degenerate single element array. - * - * This scan key's imaginary "array" can't really advance, but it - * can still roll over like any other array. (Actually, this is - * no different to real single value arrays, which never advance - * without rolling over -- they can never truly advance, either.) - */ - result = _bt_compare_array_skey(&so->orderProcs[ikey], - tupdatum, tupnull, - cur->sk_argument, cur); - } - - /* - * Consider "beyond end of array element" array advancement. - * - * When the tuple attribute value is > the closest matching array key - * (or < in the backwards scan case), we need to ratchet this array - * forward (backward) by one increment, so that caller's tuple ends up - * being < final array value instead (or > final array value instead). - * This process has to work for all of the arrays, not just this one: - * it must "carry" to higher-order arrays when the set_elem that we - * just found happens to be the final one for the scan's direction. - * Incrementing (decrementing) set_elem itself isn't good enough. - * - * Our approach is to provisionally use set_elem as if it was an exact - * match now, then set each later/less significant array to whatever - * its final element is. Once outside the loop we'll then "increment - * this array's set_elem" by calling _bt_advance_array_keys_increment. - * That way the process rolls over to higher order arrays as needed. - * - * Under this scheme any required arrays only ever ratchet forwards - * (or backwards), and always do so to the maximum possible extent - * that we can know will be safe without seeing the scan's next tuple. - * We don't need any special handling for required scan keys that lack - * a real array to advance, nor for redundant scan keys that couldn't - * be eliminated by _bt_preprocess_keys. It won't matter if some of - * our "true" array scan keys (or even all of them) are non-required. - */ - if (sktrig_required && required && - ((ScanDirectionIsForward(dir) && result > 0) || - (ScanDirectionIsBackward(dir) && result < 0))) - beyond_end_advance = true; - - Assert(all_required_satisfied && all_satisfied); - if (result != 0) - { - /* - * Track whether caller's tuple satisfies our new post-advancement - * qual, for required scan keys, as well as for the entire set of - * interesting scan keys (all required scan keys plus non-required - * array scan keys are considered interesting.) 
- */ - all_satisfied = false; - if (sktrig_required && required) - all_required_satisfied = false; - else - { - /* - * There's no need to advance the arrays using the best - * available match for a non-required array. Give up now. - * (Though note that sktrig_required calls still have to do - * all the usual post-advancement steps, including the recheck - * call to _bt_check_compare.) - */ - break; - } - } - - /* Advance array keys, even when we don't have an exact match */ - if (array) - { - if (array->num_elems == -1) - { - /* Skip array's new element is tupdatum (or MINVAL/MAXVAL) */ - _bt_skiparray_set_element(rel, cur, array, result, - tupdatum, tupnull); - skip_array_advanced = true; - } - else if (array->cur_elem != set_elem) - { - /* SAOP array's new element is set_elem datum */ - array->cur_elem = set_elem; - cur->sk_argument = array->elem_values[set_elem]; - } - } - } - - /* - * Advance the array keys incrementally whenever "beyond end of array - * element" array advancement happens, so that advancement will carry to - * higher-order arrays (might exhaust all the scan's arrays instead, which - * ends the top-level scan). - */ - if (beyond_end_advance && - !_bt_advance_array_keys_increment(scan, dir, &skip_array_advanced)) - goto end_toplevel_scan; - - Assert(_bt_verify_keys_with_arraykeys(scan)); - - /* - * Maintain a page-level count of the number of times the scan's array - * keys advanced in a way that affected at least one skip array - */ - if (sktrig_required && skip_array_advanced) - pstate->nskipadvances++; - - /* - * Does tuple now satisfy our new qual? Recheck with _bt_check_compare. - * - * Calls triggered by an unsatisfied required scan key, whose tuple now - * satisfies all required scan keys, but not all nonrequired array keys, - * will still require a recheck call to _bt_check_compare. They'll still - * need its "second pass" handling of required inequality scan keys. - * (Might have missed a still-unsatisfied required inequality scan key - * that caller didn't detect as the sktrig scan key during its initial - * _bt_check_compare call that used the old/original qual.) - * - * Calls triggered by an unsatisfied nonrequired array scan key never need - * "second pass" handling of required inequalities (nor any other handling - * of any required scan key). All that matters is whether caller's tuple - * satisfies the new qual, so it's safe to just skip the _bt_check_compare - * recheck when we've already determined that it can only return 'false'. - * - * Note: In practice most scan keys are marked required by preprocessing, - * if necessary by generating a preceding skip array. We nevertheless - * often handle array keys marked required as if they were nonrequired. - * This behavior is requested by our _bt_check_compare caller, though only - * when it is passed "forcenonrequired=true" by _bt_checkkeys. - */ - if ((sktrig_required && all_required_satisfied) || - (!sktrig_required && all_satisfied)) - { - int nsktrig = sktrig + 1; - bool continuescan; - - Assert(all_required_satisfied); - - /* Recheck _bt_check_compare on behalf of caller */ - if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, - !sktrig_required, &continuescan, - &nsktrig) && - !so->scanBehind) - { - /* This tuple satisfies the new qual */ - Assert(all_satisfied && continuescan); - - if (pstate) - pstate->continuescan = true; - - return true; - } - - /* - * Consider "second pass" handling of required inequalities. 
- * - * It's possible that our _bt_check_compare call indicated that the - * scan should end due to some unsatisfied inequality that wasn't - * initially recognized as such by us. Handle this by calling - * ourselves recursively, this time indicating that the trigger is the - * inequality that we missed first time around (and using a set of - * required array/equality keys that are now exact matches for tuple). - * - * We make a strong, general guarantee that every _bt_checkkeys call - * here will advance the array keys to the maximum possible extent - * that we can know to be safe based on caller's tuple alone. If we - * didn't perform this step, then that guarantee wouldn't quite hold. - */ - if (unlikely(!continuescan)) - { - bool satisfied PG_USED_FOR_ASSERTS_ONLY; - - Assert(sktrig_required); - Assert(so->keyData[nsktrig].sk_strategy != BTEqualStrategyNumber); - - /* - * The tuple must use "beyond end" advancement during the - * recursive call, so we cannot possibly end up back here when - * recursing. We'll consume a small, fixed amount of stack space. - */ - Assert(!beyond_end_advance); - - /* Advance the array keys a second time using same tuple */ - satisfied = _bt_advance_array_keys(scan, pstate, tuple, tupnatts, - tupdesc, nsktrig, true); - - /* This tuple doesn't satisfy the inequality */ - Assert(!satisfied); - return false; - } - - /* - * Some non-required scan key (from new qual) still not satisfied. - * - * All scan keys required in the current scan direction must still be - * satisfied, though, so we can trust all_required_satisfied below. - */ - } - - /* - * When we were called just to deal with "advancing" non-required arrays, - * this is as far as we can go (cannot stop the scan for these callers) - */ - if (!sktrig_required) - { - /* Caller's tuple doesn't match any qual */ - return false; - } - - /* - * Postcondition array state assertion (for still-unsatisfied tuples). - * - * By here we have established that the scan's required arrays (scan must - * have at least one required array) advanced, without becoming exhausted. - * - * Caller's tuple is now < the newly advanced array keys (or > when this - * is a backwards scan), except in the case where we only got this far due - * to an unsatisfied non-required scan key. Verify that with an assert. - * - * Note: we don't just quit at this point when all required scan keys were - * found to be satisfied because we need to consider edge-cases involving - * scan keys required in the opposite direction only; those aren't tracked - * by all_required_satisfied. - */ - Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, - false, 0, NULL) == - !all_required_satisfied); - - /* - * We generally permit primitive index scans to continue onto the next - * sibling page when the page's finaltup satisfies all required scan keys - * at the point where we're between pages. - * - * If caller's tuple is also the page's finaltup, and we see that required - * scan keys still aren't satisfied, start a new primitive index scan. - */ - if (!all_required_satisfied && pstate->finaltup == tuple) - goto new_prim_scan; - - /* - * Proactively check finaltup (don't wait until finaltup is reached by the - * scan) when it might well turn out to not be satisfied later on. - * - * Note: if so->scanBehind hasn't already been set for finaltup by us, - * it'll be set during this call to _bt_tuple_before_array_skeys. Either - * way, it'll be set correctly (for the whole page) after this point. 
- */ - if (!all_required_satisfied && pstate->finaltup && - _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc, - BTreeTupleGetNAtts(pstate->finaltup, rel), - false, 0, &so->scanBehind)) - goto new_prim_scan; - - /* - * When we encounter a truncated finaltup high key attribute, we're - * optimistic about the chances of its corresponding required scan key - * being satisfied when we go on to recheck it against tuples from this - * page's right sibling leaf page. We consider truncated attributes to be - * satisfied by required scan keys, which allows the primitive index scan - * to continue to the next leaf page. We must set so->scanBehind to true - * to remember that the last page's finaltup had "satisfied" required scan - * keys for one or more truncated attribute values (scan keys required in - * _either_ scan direction). - * - * There is a chance that _bt_readpage (which checks so->scanBehind) will - * find that even the sibling leaf page's finaltup is < the new array - * keys. When that happens, our optimistic policy will have incurred a - * single extra leaf page access that could have been avoided. - * - * A pessimistic policy would give backward scans a gratuitous advantage - * over forward scans. We'd punish forward scans for applying more - * accurate information from the high key, rather than just using the - * final non-pivot tuple as finaltup, in the style of backward scans. - * Being pessimistic would also give some scans with non-required arrays a - * perverse advantage over similar scans that use required arrays instead. - * - * This is similar to our scan-level heuristics, below. They also set - * scanBehind to speculatively continue the primscan onto the next page. - */ - if (so->scanBehind) - { - /* Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled */ - } - - /* - * Handle inequalities marked required in the opposite scan direction. - * They can also signal that we should start a new primitive index scan. - * - * It's possible that the scan is now positioned where "matching" tuples - * begin, and that caller's tuple satisfies all scan keys required in the - * current scan direction. But if caller's tuple still doesn't satisfy - * other scan keys that are required in the opposite scan direction only - * (e.g., a required >= strategy scan key when scan direction is forward), - * it's still possible that there are many leaf pages before the page that - * _bt_first could skip straight to. Groveling through all those pages - * will always give correct answers, but it can be very inefficient. We - * must avoid needlessly scanning extra pages. - * - * Separately, it's possible that _bt_check_compare set continuescan=false - * for a scan key that's required in the opposite direction only. This is - * a special case, that happens only when _bt_check_compare sees that the - * inequality encountered a NULL value. This signals the end of non-NULL - * values in the current scan direction, which is reason enough to end the - * (primitive) scan. If this happens at the start of a large group of - * NULL values, then we shouldn't expect to be called again until after - * the scan has already read indefinitely-many leaf pages full of tuples - * with NULL suffix values. (_bt_first is expected to skip over the group - * of NULLs by applying a similar "deduce NOT NULL" rule of its own, which - * involves consing up an explicit SK_SEARCHNOTNULL key.) 
- * - * Apply a test against finaltup to detect and recover from the problem: - * if even finaltup doesn't satisfy such an inequality, we just skip by - * starting a new primitive index scan. When we skip, we know for sure - * that all of the tuples on the current page following caller's tuple are - * also before the _bt_first-wise start of tuples for our new qual. That - * at least suggests many more skippable pages beyond the current page. - * (when so->scanBehind and so->oppositeDirCheck are set, this'll happen - * when we test the next page's finaltup/high key instead.) - */ - else if (has_required_opposite_direction_only && pstate->finaltup && - unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup))) - { - /* - * Make sure that any SAOP arrays that were not marked required by - * preprocessing are reset to their first element for this direction - */ - _bt_rewind_nonrequired_arrays(scan, dir); - goto new_prim_scan; - } - -continue_scan: - - /* - * Stick with the ongoing primitive index scan for now. - * - * It's possible that later tuples will also turn out to have values that - * are still < the now-current array keys (or > the current array keys). - * Our caller will handle this by performing what amounts to a linear - * search of the page, implemented by calling _bt_check_compare and then - * _bt_tuple_before_array_skeys for each tuple. - * - * This approach has various advantages over a binary search of the page. - * Repeated binary searches of the page (one binary search for every array - * advancement) won't outperform a continuous linear search. While there - * are workloads that a naive linear search won't handle well, our caller - * has a "look ahead" fallback mechanism to deal with that problem. - */ - pstate->continuescan = true; /* Override _bt_check_compare */ - so->needPrimScan = false; /* _bt_readpage has more tuples to check */ - - if (so->scanBehind) - { - /* - * Remember if recheck needs to call _bt_oppodir_checkkeys for next - * page's finaltup (see above comments about "Handle inequalities - * marked required in the opposite scan direction" for why). - */ - so->oppositeDirCheck = has_required_opposite_direction_only; - - _bt_rewind_nonrequired_arrays(scan, dir); - - /* - * skip by setting "look ahead" mechanism's offnum for forwards scans - * (backwards scans check scanBehind flag directly instead) - */ - if (ScanDirectionIsForward(dir)) - pstate->skip = pstate->maxoff + 1; - } - - /* Caller's tuple doesn't match the new qual */ - return false; - -new_prim_scan: - - Assert(pstate->finaltup); /* not on rightmost/leftmost page */ - - /* - * Looks like another primitive index scan is required. But consider - * continuing the current primscan based on scan-level heuristics. - * - * Continue the ongoing primitive scan (and schedule a recheck for when - * the scan arrives on the next sibling leaf page) when it has already - * read at least one leaf page before the one we're reading now. This - * makes primscan scheduling more efficient when scanning subsets of an - * index with many distinct attribute values matching many array elements. - * It encourages fewer, larger primitive scans where that makes sense. - * This will in turn encourage _bt_readpage to apply the pstate.startikey - * optimization more often. 
- * - * Also continue the ongoing primitive index scan when it is still on the - * first page if there have been more than NSKIPADVANCES_THRESHOLD calls - * here that each advanced at least one of the scan's skip arrays - * (deliberately ignore advancements that only affected SAOP arrays here). - * A page that cycles through this many skip array elements is quite - * likely to neighbor similar pages, which we'll also need to read. - * - * Note: These heuristics aren't as aggressive as you might think. We're - * conservative about allowing a primitive scan to step from the first - * leaf page it reads to the page's sibling page (we only allow it on - * first pages whose finaltup strongly suggests that it'll work out, as - * well as first pages that have a large number of skip array advances). - * Clearing this first page finaltup hurdle is a strong signal in itself. - * - * Note: The NSKIPADVANCES_THRESHOLD heuristic exists only to avoid - * pathological cases. Specifically, cases where a skip scan should just - * behave like a traditional full index scan, but ends up "skipping" again - * and again, descending to the prior leaf page's direct sibling leaf page - * each time. This misbehavior would otherwise be possible during scans - * that never quite manage to "clear the first page finaltup hurdle". - */ - if (!pstate->firstpage || pstate->nskipadvances > NSKIPADVANCES_THRESHOLD) - { - /* Schedule a recheck once on the next (or previous) page */ - so->scanBehind = true; - - /* Continue the current primitive scan after all */ - goto continue_scan; - } - - /* - * End this primitive index scan, but schedule another. - * - * Note: We make a soft assumption that the current scan direction will - * also be used within _bt_next, when it is asked to step off this page. - * It is up to _bt_next to cancel this scheduled primitive index scan - * whenever it steps to a page in the direction opposite currPos.dir. - */ - pstate->continuescan = false; /* Tell _bt_readpage we're done... */ - so->needPrimScan = true; /* ...but call _bt_first again */ - - if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, so->currPos.currPage); - - /* Caller's tuple doesn't match the new qual */ - return false; - -end_toplevel_scan: - - /* - * End the current primitive index scan, but don't schedule another. - * - * This ends the entire top-level scan in the current scan direction. - * - * Note: The scan's arrays (including any non-required arrays) are now in - * their final positions for the current scan direction. If the scan - * direction happens to change, then the arrays will already be in their - * first positions for what will then be the current scan direction. - */ - pstate->continuescan = false; /* Tell _bt_readpage we're done... */ - so->needPrimScan = false; /* ...and don't call _bt_first again */ - - /* Caller's tuple doesn't match any qual */ - return false; -} - -#ifdef USE_ASSERT_CHECKING -/* - * Verify that the scan's qual state matches what we expect at the point that - * _bt_start_prim_scan is about to start a just-scheduled new primitive scan. - * - * We enforce a rule against non-required array scan keys: they must start out - * with whatever element is the first for the scan's current scan direction. - * See _bt_rewind_nonrequired_arrays comments for an explanation. 
- */ -static bool -_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int arrayidx = 0; - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array = NULL; - int first_elem_dir; - - if (!(cur->sk_flags & SK_SEARCHARRAY) || - cur->sk_strategy != BTEqualStrategyNumber) - continue; - - array = &so->arrayKeys[arrayidx++]; - - if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || - ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) - continue; - - if (ScanDirectionIsForward(dir)) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; - - if (array->cur_elem != first_elem_dir) - return false; - } - - return _bt_verify_keys_with_arraykeys(scan); -} - -/* - * Verify that the scan's "so->keyData[]" scan keys are in agreement with - * its array key state - */ -static bool -_bt_verify_keys_with_arraykeys(IndexScanDesc scan) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int last_sk_attno = InvalidAttrNumber, - arrayidx = 0; - - if (!so->qual_ok) - return false; - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array; - - if (cur->sk_strategy != BTEqualStrategyNumber || - !(cur->sk_flags & SK_SEARCHARRAY)) - continue; - - array = &so->arrayKeys[arrayidx++]; - if (array->scan_key != ikey) - return false; - - if (array->num_elems == 0 || array->num_elems < -1) - return false; - - if (array->num_elems != -1 && - cur->sk_argument != array->elem_values[array->cur_elem]) - return false; - if (last_sk_attno > cur->sk_attno) - return false; - last_sk_attno = cur->sk_attno; - } - - if (arrayidx != so->numArrayKeys) - return false; - - return true; -} -#endif - -/* - * Test whether an indextuple satisfies all the scankey conditions. - * - * Return true if so, false if not. If the tuple fails to pass the qual, - * we also determine whether there's any need to continue the scan beyond - * this tuple, and set pstate.continuescan accordingly. See comments for - * _bt_preprocess_keys() about how this is done. - * - * Forward scan callers can pass a high key tuple in the hopes of having - * us set *continuescan to false, and avoiding an unnecessary visit to - * the page to the right. - * - * Advances the scan's array keys when necessary for arrayKeys=true callers. - * Scans without any array keys must always pass arrayKeys=false. - * - * Also stops and starts primitive index scans for arrayKeys=true callers. - * Scans with array keys are required to set up page state that helps us with - * this. The page's finaltup tuple (the page high key for a forward scan, or - * the page's first non-pivot tuple for a backward scan) must be set in - * pstate.finaltup ahead of the first call here for the page. Set this to - * NULL for rightmost page (or the leftmost page for backwards scans). - * - * scan: index scan descriptor (containing a search-type scankey) - * pstate: page level input and output parameters - * arrayKeys: should we advance the scan's array keys if necessary? 
- * tuple: index tuple to test - * tupnatts: number of attributes in tuple (high key may be truncated) - */ -bool -_bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, - IndexTuple tuple, int tupnatts) -{ - TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); - BTScanOpaque so = (BTScanOpaque) scan->opaque; - ScanDirection dir = so->currPos.dir; - int ikey = pstate->startikey; - bool res; - - Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); - Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck); - Assert(arrayKeys || so->numArrayKeys == 0); - - res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, arrayKeys, - pstate->forcenonrequired, &pstate->continuescan, - &ikey); - - /* - * If _bt_check_compare relied on the pstate.startikey optimization, call - * again (in assert-enabled builds) to verify it didn't affect our answer. - * - * Note: we can't do this when pstate.forcenonrequired, since any arrays - * before pstate.startikey won't have advanced on this page at all. - */ - Assert(!pstate->forcenonrequired || arrayKeys); -#ifdef USE_ASSERT_CHECKING - if (pstate->startikey > 0 && !pstate->forcenonrequired) - { - bool dres, - dcontinuescan; - int dikey = 0; - - /* Pass arrayKeys=false to avoid array side-effects */ - dres = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, - pstate->forcenonrequired, &dcontinuescan, - &dikey); - Assert(res == dres); - Assert(pstate->continuescan == dcontinuescan); - - /* - * Should also get the same ikey result. We need a slightly weaker - * assertion during arrayKeys calls, since they might be using an - * array that couldn't be marked required during preprocessing. - */ - Assert(arrayKeys || ikey == dikey); - Assert(ikey <= dikey); - } -#endif - - /* - * Only one _bt_check_compare call is required in the common case where - * there are no equality strategy array scan keys. Otherwise we can only - * accept _bt_check_compare's answer unreservedly when it didn't set - * pstate.continuescan=false. - */ - if (!arrayKeys || pstate->continuescan) - return res; - - /* - * _bt_check_compare call set continuescan=false in the presence of - * equality-type array keys. This could mean that the tuple is just past - * the end of matches for the current array keys. - * - * It's also possible that the scan is still _before_ the _start_ of - * tuples matching the current set of array keys. Check for that first. - */ - Assert(!pstate->forcenonrequired); - if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true, - ikey, NULL)) - { - /* Override _bt_check_compare, continue primitive scan */ - pstate->continuescan = true; - - /* - * We will end up here repeatedly given a group of tuples > the - * previous array keys and < the now-current keys (for a backwards - * scan it's just the same, though the operators swap positions). - * - * We must avoid allowing this linear search process to scan very many - * tuples from well before the start of tuples matching the current - * array keys (or from well before the point where we'll once again - * have to advance the scan's array keys). - * - * We keep the overhead under control by speculatively "looking ahead" - * to later still-unscanned items from this same leaf page. We'll - * only attempt this once the number of tuples that the linear search - * process has examined starts to get out of hand. 
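The ramp-up/back-off policy that keeps this look ahead cheap is implemented by _bt_checkkeys_look_ahead, further down in this diff. As a distilled restatement (the helper name is hypothetical; the constants and the Min/Max macros are the real ones from the surrounding code):

static int
next_target_distance(int targetdistance, bool look_ahead_succeeded)
{
	if (targetdistance == 0)
		return LOOK_AHEAD_DEFAULT_DISTANCE;	/* start small */
	if (look_ahead_succeeded)
		/* ramp up; doubling is capped at MaxIndexTuplesPerPage / 2 */
		return Min(targetdistance * 2, MaxIndexTuplesPerPage / 2);
	/* we were too aggressive: back off much faster than we ramped up */
	return Max(targetdistance / 8, 1);
}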
- */ - pstate->rechecks++; - if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS) - { - /* See if we should skip ahead within the current leaf page */ - _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc); - - /* - * Might have set pstate.skip to a later page offset. When that - * happens then _bt_readpage caller will inexpensively skip ahead - * to a later tuple from the same page (the one just after the - * tuple we successfully "looked ahead" to). - */ - } - - /* This indextuple doesn't match the current qual, in any case */ - return false; - } - - /* - * Caller's tuple is >= the current set of array keys and other equality - * constraint scan keys (or <= if this is a backwards scan). It's now - * clear that we _must_ advance any required array keys in lockstep with - * the scan. - */ - return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc, - ikey, true); -} - -/* - * Test whether caller's finaltup tuple is still before the start of matches - * for the current array keys. - * - * Called at the start of reading a page during a scan with array keys, though - * only when the so->scanBehind flag was set on the scan's prior page. - * - * Returns false if the tuple is still before the start of matches. When that - * happens, caller should cut its losses and start a new primitive index scan. - * Otherwise returns true. - */ -bool -_bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, - IndexTuple finaltup) -{ - Relation rel = scan->indexRelation; - TupleDesc tupdesc = RelationGetDescr(rel); - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel); - bool scanBehind; - - Assert(so->numArrayKeys); - - if (_bt_tuple_before_array_skeys(scan, dir, finaltup, tupdesc, - nfinaltupatts, false, 0, &scanBehind)) - return false; - - /* - * If scanBehind was set, all of the untruncated attribute values from - * finaltup that correspond to an array match the array's current element, - * but there are other keys associated with truncated suffix attributes. - * Array advancement must have incremented the scan's arrays on the - * previous page, resulting in a set of array keys that happen to be an - * exact match for the current page high key's untruncated prefix values. - * - * This page definitely doesn't contain tuples that the scan will need to - * return. The next page may or may not contain relevant tuples. Handle - * this by cutting our losses and starting a new primscan. - */ - if (scanBehind) - return false; - - if (!so->oppositeDirCheck) - return true; - - return _bt_oppodir_checkkeys(scan, dir, finaltup); -} - -/* - * Test whether an indextuple fails to satisfy an inequality required in the - * opposite direction only. - * - * Caller's finaltup tuple is the page high key (for forwards scans), or the - * first non-pivot tuple (for backwards scans). Called during scans with - * required array keys and required opposite-direction inequalities. - * - * Returns false if an inequality scan key required in the opposite direction - * only isn't satisfied (and any earlier required scan keys are satisfied). - * Otherwise returns true. - * - * An unsatisfied inequality required in the opposite direction only might - * well enable skipping over many leaf pages, provided another _bt_first call - * takes place. This type of unsatisfied inequality won't usually cause - * _bt_checkkeys to stop the scan to consider array advancement/starting a new - * primitive index scan. 
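The function below reverses the scan direction with simple unary negation. That works because of ScanDirection's numeric encoding, reproduced here from src/include/access/sdir.h:

typedef enum ScanDirection
{
	BackwardScanDirection = -1,
	NoMovementScanDirection = 0,
	ForwardScanDirection = 1
} ScanDirection;

/* hence, for example: */
ScanDirection	flipped = -ForwardScanDirection;	/* BackwardScanDirection */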
- */ -static bool -_bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, - IndexTuple finaltup) -{ - Relation rel = scan->indexRelation; - TupleDesc tupdesc = RelationGetDescr(rel); - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel); - bool continuescan; - ScanDirection flipped = -dir; - int ikey = 0; - - Assert(so->numArrayKeys); - - _bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc, false, - false, &continuescan, - &ikey); - - if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber) - return false; - - return true; -} - -/* - * Determines an offset to the first scan key (an so->keyData[]-wise offset) - * that is _not_ guaranteed to be satisfied by every tuple from pstate.page, - * which is set in pstate.startikey for _bt_checkkeys calls for the page. - * This allows caller to save cycles on comparisons of a prefix of keys while - * reading pstate.page. - * - * Also determines if later calls to _bt_checkkeys (for pstate.page) should be - * forced to treat all required scan keys >= pstate.startikey as nonrequired - * (that is, if they're to be treated as if any SK_BT_REQFWD/SK_BT_REQBKWD - * markings that were set by preprocessing were not set at all, for the - * duration of _bt_checkkeys calls prior to the call for pstate.finaltup). - * This is indicated to caller by setting pstate.forcenonrequired. - * - * Call here at the start of reading a leaf page beyond the first one for the - * primitive index scan. We consider all non-pivot tuples, so it doesn't make - * sense to call here when only a subset of those tuples can ever be read. - * This is also a good idea on performance grounds; not calling here when on - * the first page (first for the current primitive scan) avoids wasting cycles - * during selective point queries. They typically don't stand to gain as much - * when we can set pstate.startikey, and are likely to notice the overhead of - * calling here. (Also, allowing pstate.forcenonrequired to be set on a - * primscan's first page would mislead _bt_advance_array_keys, which expects - * pstate.nskipadvances to be representative of every first page's key space.) - * - * Caller must call _bt_start_array_keys and reset startikey/forcenonrequired - * ahead of the finaltup _bt_checkkeys call when we set forcenonrequired=true. - * This will give _bt_checkkeys the opportunity to call _bt_advance_array_keys - * with sktrig_required=true, restoring the invariant that the scan's required - * arrays always track the scan's progress through the index's key space. - * Caller won't need to do this on the rightmost/leftmost page in the index - * (where pstate.finaltup isn't ever set), since forcenonrequired will never - * be set here in the first place. 
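To make the payoff concrete, here is a minimal sketch (hypothetical helper, not nbtree code) of what pstate.startikey buys a caller: per-tuple evaluation simply skips the prefix of keys already proven to hold for every tuple on pstate.page. The key_satisfied() callee is assumed, standing in for the per-key logic of _bt_check_compare.

static bool
tuple_passes_remaining_keys(BTScanOpaque so, BTReadPageState *pstate,
							IndexTuple tuple)
{
	for (int ikey = pstate->startikey; ikey < so->numberOfKeys; ikey++)
	{
		/* only keys at or after pstate.startikey can fail on this page */
		if (!key_satisfied(so->keyData + ikey, tuple))
			return false;
	}

	return true;
}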
- */ -void -_bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - Relation rel = scan->indexRelation; - TupleDesc tupdesc = RelationGetDescr(rel); - ItemId iid; - IndexTuple firsttup, - lasttup; - int startikey = 0, - arrayidx = 0, - firstchangingattnum; - bool start_past_saop_eq = false; - - Assert(!so->scanBehind); - Assert(pstate->minoff < pstate->maxoff); - Assert(!pstate->firstpage); - Assert(pstate->startikey == 0); - Assert(!so->numArrayKeys || pstate->finaltup || - P_RIGHTMOST(BTPageGetOpaque(pstate->page)) || - P_LEFTMOST(BTPageGetOpaque(pstate->page))); - - if (so->numberOfKeys == 0) - return; - - /* minoff is an offset to the lowest non-pivot tuple on the page */ - iid = PageGetItemId(pstate->page, pstate->minoff); - firsttup = (IndexTuple) PageGetItem(pstate->page, iid); - - /* maxoff is an offset to the highest non-pivot tuple on the page */ - iid = PageGetItemId(pstate->page, pstate->maxoff); - lasttup = (IndexTuple) PageGetItem(pstate->page, iid); - - /* Determine the first attribute whose values change on caller's page */ - firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); - - for (; startikey < so->numberOfKeys; startikey++) - { - ScanKey key = so->keyData + startikey; - BTArrayKeyInfo *array; - Datum firstdatum, - lastdatum; - bool firstnull, - lastnull; - int32 result; - - /* - * Determine if it's safe to set pstate.startikey to an offset to a - * key that comes after this key, by examining this key - */ - if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) - { - /* Scan key isn't marked required (corner case) */ - Assert(!(key->sk_flags & SK_ROW_HEADER)); - break; /* unsafe */ - } - if (key->sk_flags & SK_ROW_HEADER) - { - /* - * RowCompare inequality. - * - * Only the first subkey from a RowCompare can ever be marked - * required (that happens when the row header is marked required). - * There is no simple, general way for us to transitively deduce - * whether or not every tuple on the page satisfies a RowCompare - * key based only on firsttup and lasttup -- so we just give up. - */ - if (!start_past_saop_eq && !so->skipScan) - break; /* unsafe to go further */ - - /* - * We have to be even more careful with RowCompares that come - * after an array: we assume it's unsafe to even bypass the array. - * Calling _bt_start_array_keys to recover the scan's arrays - * following use of forcenonrequired mode isn't compatible with - * _bt_check_rowcompare's continuescan=false behavior with NULL - * row compare members. _bt_advance_array_keys must not make a - * decision on the basis of a key not being satisfied in the - * opposite-to-scan direction until the scan reaches a leaf page - * where the same key begins to be satisfied in scan direction. - * The _bt_first !used_all_subkeys behavior makes this limitation - * hard to work around some other way. - */ - return; /* completely unsafe to set pstate.startikey */ - } - if (key->sk_strategy != BTEqualStrategyNumber) - { - /* - * Scalar inequality key. - * - * It's definitely safe for _bt_checkkeys to avoid assessing this - * inequality when the page's first and last non-pivot tuples both - * satisfy the inequality (since the same must also be true of all - * the tuples in between these two). - * - * Unlike the "=" case, it doesn't matter if this attribute has - * more than one distinct value (though it _is_ necessary for any - * and all _prior_ attributes to contain no more than one distinct - * value amongst all of the tuples from pstate.page). 
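The per-key test that follows applies this transitivity argument. A minimal distillation, assuming a hypothetical helper (the IS NOT NULL variant handled by the real code is omitted here for brevity):

static bool
ineq_holds_for_whole_page(ScanKey key,
						  Datum firstdatum, bool firstnull,
						  Datum lastdatum, bool lastnull)
{
	if (firstnull || lastnull)
		return false;			/* NULL never satisfies a scalar inequality */

	/* holds for firsttup and lasttup => holds for every tuple in between */
	return DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation,
										  firstdatum, key->sk_argument)) &&
		DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation,
									   lastdatum, key->sk_argument));
}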
- */ - if (key->sk_attno > firstchangingattnum) /* >, not >= */ - break; /* unsafe, preceding attr has multiple - * distinct values */ - - firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull); - lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull); - - if (key->sk_flags & SK_ISNULL) - { - /* IS NOT NULL key */ - Assert(key->sk_flags & SK_SEARCHNOTNULL); - - if (firstnull || lastnull) - break; /* unsafe */ - - /* Safe, IS NOT NULL key satisfied by every tuple */ - continue; - } - - /* Test firsttup */ - if (firstnull || - !DatumGetBool(FunctionCall2Coll(&key->sk_func, - key->sk_collation, firstdatum, - key->sk_argument))) - break; /* unsafe */ - - /* Test lasttup */ - if (lastnull || - !DatumGetBool(FunctionCall2Coll(&key->sk_func, - key->sk_collation, lastdatum, - key->sk_argument))) - break; /* unsafe */ - - /* Safe, scalar inequality satisfied by every tuple */ - continue; - } - - /* Some = key (could be a scalar = key, could be an array = key) */ - Assert(key->sk_strategy == BTEqualStrategyNumber); - - if (!(key->sk_flags & SK_SEARCHARRAY)) - { - /* - * Scalar = key (possibly an IS NULL key). - * - * It is unsafe to set pstate.startikey to an ikey beyond this - * key, unless the = key is satisfied by every possible tuple on - * the page (possible only when attribute has just one distinct - * value among all tuples on the page). - */ - if (key->sk_attno >= firstchangingattnum) - break; /* unsafe, multiple distinct attr values */ - - firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, - &firstnull); - if (key->sk_flags & SK_ISNULL) - { - /* IS NULL key */ - Assert(key->sk_flags & SK_SEARCHNULL); - - if (!firstnull) - break; /* unsafe */ - - /* Safe, IS NULL key satisfied by every tuple */ - continue; - } - if (firstnull || - !DatumGetBool(FunctionCall2Coll(&key->sk_func, - key->sk_collation, firstdatum, - key->sk_argument))) - break; /* unsafe */ - - /* Safe, scalar = key satisfied by every tuple */ - continue; - } - - /* = array key (could be a SAOP array, could be a skip array) */ - array = &so->arrayKeys[arrayidx++]; - Assert(array->scan_key == startikey); - if (array->num_elems != -1) - { - /* - * SAOP array = key. - * - * Handle this like we handle scalar = keys (though binary search - * for a matching element, to avoid relying on key's sk_argument). - */ - if (key->sk_attno >= firstchangingattnum) - break; /* unsafe, multiple distinct attr values */ - - firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, - &firstnull); - _bt_binsrch_array_skey(&so->orderProcs[startikey], - false, NoMovementScanDirection, - firstdatum, firstnull, array, key, - &result); - if (result != 0) - break; /* unsafe */ - - /* Safe, SAOP = key satisfied by every tuple */ - start_past_saop_eq = true; - continue; - } - - /* - * Skip array = key - */ - Assert(key->sk_flags & SK_BT_SKIP); - if (array->null_elem) - { - /* - * Non-range skip array = key. - * - * Safe, non-range skip array "satisfied" by every tuple on page - * (safe even when "key->sk_attno > firstchangingattnum"). - */ - continue; - } + BTScanInsert key; + ScanKey skey; + TupleDesc itupdesc; + int indnkeyatts; + int16 *indoption; + int tupnatts; + int i; - /* - * Range skip array = key. - * - * Handle this like we handle scalar inequality keys (but avoid using - * key's sk_argument directly, as in the SAOP array case). 
- */ - if (key->sk_attno > firstchangingattnum) /* >, not >= */ - break; /* unsafe, preceding attr has multiple - * distinct values */ - - firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull); - lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull); - - /* Test firsttup */ - _bt_binsrch_skiparray_skey(false, ForwardScanDirection, - firstdatum, firstnull, array, key, - &result); - if (result != 0) - break; /* unsafe */ - - /* Test lasttup */ - _bt_binsrch_skiparray_skey(false, ForwardScanDirection, - lastdatum, lastnull, array, key, - &result); - if (result != 0) - break; /* unsafe */ - - /* Safe, range skip array satisfied by every tuple on page */ - } + itupdesc = RelationGetDescr(rel); + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + indoption = rel->rd_indoption; + tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; - /* - * Use of forcenonrequired is typically undesirable, since it'll force - * _bt_readpage caller to read every tuple on the page -- even though, in - * general, it might well be possible to end the scan on an earlier tuple. - * However, caller must use forcenonrequired when start_past_saop_eq=true, - * since the usual required array behavior might fail to roll over to the - * SAOP array. - * - * We always prefer forcenonrequired=true during scans with skip arrays - * (except on the first page of each primitive index scan), though -- even - * when "startikey == 0". That way, _bt_advance_array_keys's low-order - * key precheck optimization can always be used (unless on the first page - * of the scan). It seems slightly preferable to check more tuples when - * that allows us to do significantly less skip array maintenance. - */ - pstate->forcenonrequired = (start_past_saop_eq || so->skipScan); - pstate->startikey = startikey; + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); /* - * _bt_readpage caller is required to call _bt_checkkeys against page's - * finaltup with forcenonrequired=false whenever we initially set - * forcenonrequired=true. That way the scan's arrays will reliably track - * its progress through the index's key space. - * - * We don't expect this when _bt_readpage caller has no finaltup due to - * its page being the rightmost (or the leftmost, during backwards scans). - * When we see that _bt_readpage has no finaltup, back out of everything. + * We'll execute search using scan key constructed on key columns. + * Truncated attributes and non-key attributes are omitted from the final + * scan key. */ - Assert(!pstate->forcenonrequired || so->numArrayKeys); - if (pstate->forcenonrequired && !pstate->finaltup) + key = palloc(offsetof(BTScanInsertData, scankeys) + + sizeof(ScanKeyData) * indnkeyatts); + if (itup) + _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); + else { - pstate->forcenonrequired = false; - pstate->startikey = 0; + /* Utility statement callers can set these fields themselves */ + key->heapkeyspace = true; + key->allequalimage = false; } -} - -/* - * Test whether an indextuple satisfies current scan condition. - * - * Return true if so, false if not. If not, also sets *continuescan to false - * when it's also not possible for any later tuples to pass the current qual - * (with the scan's current set of array keys, in the current scan direction), - * in addition to setting *ikey to the so->keyData[] subscript/offset for the - * unsatisfied scan key (needed when caller must consider advancing the scan's - * array keys). - * - * This is a subroutine for _bt_checkkeys. 
We provisionally assume that - * reaching the end of the current set of required keys (in particular the - * current required array keys) ends the ongoing (primitive) index scan. - * Callers without array keys should just end the scan right away when they - * find that continuescan has been set to false here by us. Things are more - * complicated for callers with array keys. - * - * Callers with array keys must first consider advancing the arrays when - * continuescan has been set to false here by us. They must then consider if - * it really does make sense to end the current (primitive) index scan, in - * light of everything that is known at that point. (In general when we set - * continuescan=false for these callers it must be treated as provisional.) - * - * We deal with advancing unsatisfied non-required arrays directly, though. - * This is safe, since by definition non-required keys can't end the scan. - * This is just how we determine if non-required arrays are just unsatisfied - * by the current array key, or if they're truly unsatisfied (that is, if - * they're unsatisfied by every possible array key). - * - * Pass advancenonrequired=false to avoid all array related side effects. - * This allows _bt_advance_array_keys caller to avoid infinite recursion. - * - * Pass forcenonrequired=true to instruct us to treat all keys as nonrequired. - * This is used to make it safe to temporarily stop properly maintaining the - * scan's required arrays. _bt_checkkeys caller (_bt_readpage, actually) - * determines a prefix of keys that must satisfy every possible corresponding - * index attribute value from its page, which is passed to us via *ikey arg - * (this is the first key that might be unsatisfied by tuples on the page). - * Obviously, we won't maintain any array keys from before *ikey, so it's - * quite possible for such arrays to "fall behind" the index's keyspace. - * Caller will need to "catch up" by passing forcenonrequired=true (alongside - * an *ikey=0) once the page's finaltup is reached. - * - * Note: it's safe to pass an *ikey > 0 with forcenonrequired=false, but only - * when caller determines that it won't affect array maintenance. - */ -static bool -_bt_check_compare(IndexScanDesc scan, ScanDirection dir, - IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool forcenonrequired, - bool *continuescan, int *ikey) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - *continuescan = true; /* default assumption */ - - for (; *ikey < so->numberOfKeys; (*ikey)++) + key->anynullkeys = false; /* initial assumption */ + key->nextkey = false; /* usual case, required by btinsert */ + key->backward = false; /* usual case, required by btinsert */ + key->keysz = Min(indnkeyatts, tupnatts); + key->scantid = key->heapkeyspace && itup ? + BTreeTupleGetHeapTID(itup) : NULL; + skey = key->scankeys; + for (i = 0; i < indnkeyatts; i++) { - ScanKey key = so->keyData + *ikey; - Datum datum; - bool isNull; - bool requiredSameDir = false, - requiredOppositeDirOnly = false; + FmgrInfo *procinfo; + Datum arg; + bool null; + int flags; /* - * Check if the key is required in the current scan direction, in the - * opposite scan direction _only_, or in neither direction (except - * when we're forced to treat all scan keys as nonrequired) + * We can use the cached (default) support procs since no cross-type + * comparison can be needed. 
*/ - if (forcenonrequired) - { - /* treating scan's keys as non-required */ - } - else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || - ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) - requiredSameDir = true; - else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || - ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) - requiredOppositeDirOnly = true; - - if (key->sk_attno > tupnatts) - { - /* - * This attribute is truncated (must be high key). The value for - * this attribute in the first non-pivot tuple on the page to the - * right could be any possible value. Assume that truncated - * attribute passes the qual. - */ - Assert(BTreeTupleIsPivot(tuple)); - continue; - } + procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); /* - * A skip array scan key uses one of several sentinel values. We just - * fall back on _bt_tuple_before_array_skeys when we see such a value. + * Key arguments built from truncated attributes (or when caller + * provides no tuple) are defensively represented as NULL values. They + * should never be used. */ - if (key->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL | - SK_BT_NEXT | SK_BT_PRIOR)) - { - Assert(key->sk_flags & SK_SEARCHARRAY); - Assert(key->sk_flags & SK_BT_SKIP); - Assert(requiredSameDir || forcenonrequired); - - /* - * Cannot fall back on _bt_tuple_before_array_skeys when we're - * treating the scan's keys as nonrequired, though. Just handle - * this like any other non-required equality-type array key. - */ - if (forcenonrequired) - return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, - tupdesc, *ikey, false); - - *continuescan = false; - return false; - } - - /* row-comparison keys need special processing */ - if (key->sk_flags & SK_ROW_HEADER) - { - if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, - forcenonrequired, continuescan)) - continue; - return false; - } - - datum = index_getattr(tuple, - key->sk_attno, - tupdesc, - &isNull); - - if (key->sk_flags & SK_ISNULL) - { - /* Handle IS NULL/NOT NULL tests */ - if (key->sk_flags & SK_SEARCHNULL) - { - if (isNull) - continue; /* tuple satisfies this qual */ - } - else - { - Assert(key->sk_flags & SK_SEARCHNOTNULL); - Assert(!(key->sk_flags & SK_BT_SKIP)); - if (!isNull) - continue; /* tuple satisfies this qual */ - } - - /* - * Tuple fails this qual. If it's a required qual for the current - * scan direction, then we can conclude no further tuples will - * pass, either. - */ - if (requiredSameDir) - *continuescan = false; - else if (unlikely(key->sk_flags & SK_BT_SKIP)) - { - /* - * If we're treating scan keys as nonrequired, and encounter a - * skip array scan key whose current element is NULL, then it - * must be a non-range skip array. It must be satisfied, so - * there's no need to call _bt_advance_array_keys to check. - */ - Assert(forcenonrequired && *ikey > 0); - continue; - } - - /* - * This indextuple doesn't match the qual. - */ - return false; - } - - if (isNull) - { - /* - * Scalar scan key isn't satisfied by NULL tuple value. - * - * If we're treating scan keys as nonrequired, and key is for a - * skip array, then we must attempt to advance the array to NULL - * (if we're successful then the tuple might match the qual). 
- */ - if (unlikely(forcenonrequired && key->sk_flags & SK_BT_SKIP)) - return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, - tupdesc, *ikey, false); - - if (key->sk_flags & SK_BT_NULLS_FIRST) - { - /* - * Since NULLs are sorted before non-NULLs, we know we have - * reached the lower limit of the range of values for this - * index attr. On a backward scan, we can stop if this qual - * is one of the "must match" subset. We can stop regardless - * of whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a forward scan, however, we must keep going, because we may - * have initially positioned to the start of the index. - * (_bt_advance_array_keys also relies on this behavior during - * forward scans.) - */ - if ((requiredSameDir || requiredOppositeDirOnly) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - } - else - { - /* - * Since NULLs are sorted after non-NULLs, we know we have - * reached the upper limit of the range of values for this - * index attr. On a forward scan, we can stop if this qual is - * one of the "must match" subset. We can stop regardless of - * whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a backward scan, however, we must keep going, because we - * may have initially positioned to the end of the index. - * (_bt_advance_array_keys also relies on this behavior during - * backward scans.) - */ - if ((requiredSameDir || requiredOppositeDirOnly) && - ScanDirectionIsForward(dir)) - *continuescan = false; - } - - /* - * This indextuple doesn't match the qual. - */ - return false; - } - - if (!DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation, - datum, key->sk_argument))) + if (i < tupnatts) + arg = index_getattr(itup, i + 1, itupdesc, &null); + else { - /* - * Tuple fails this qual. If it's a required qual for the current - * scan direction, then we can conclude no further tuples will - * pass, either. - * - * Note: because we stop the scan as soon as any required equality - * qual fails, it is critical that equality quals be used for the - * initial positioning in _bt_first() when they are available. See - * comments in _bt_first(). - */ - if (requiredSameDir) - *continuescan = false; - - /* - * If this is a non-required equality-type array key, the tuple - * needs to be checked against every possible array key. Handle - * this by "advancing" the scan key's array to a matching value - * (if we're successful then the tuple might match the qual). - */ - else if (advancenonrequired && - key->sk_strategy == BTEqualStrategyNumber && - (key->sk_flags & SK_SEARCHARRAY)) - return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, - tupdesc, *ikey, false); - - /* - * This indextuple doesn't match the qual. - */ - return false; + arg = (Datum) 0; + null = true; } + flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); + ScanKeyEntryInitializeWithInfo(&skey[i], + flags, + (AttrNumber) (i + 1), + InvalidStrategy, + InvalidOid, + rel->rd_indcollation[i], + procinfo, + arg); + /* Record if any key attribute is NULL (or truncated) */ + if (null) + key->anynullkeys = true; } - /* If we get here, the tuple passes all index quals. */ - return true; + /* + * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so + * that full uniqueness check is done. 
+ */ + if (rel->rd_index->indnullsnotdistinct) + key->anynullkeys = false; + + return key; } /* - * Test whether an indextuple satisfies a row-comparison scan condition. - * - * Return true if so, false if not. If not, also clear *continuescan if - * it's not possible for any future tuples in the current scan direction - * to pass the qual. - * - * This is a subroutine for _bt_checkkeys/_bt_check_compare. + * free a retracement stack made by _bt_search. */ -static bool -_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, - TupleDesc tupdesc, ScanDirection dir, - bool forcenonrequired, bool *continuescan) +void +_bt_freestack(BTStack stack) { - ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); - int32 cmpresult = 0; - bool result; - - /* First subkey should be same as the header says */ - Assert(subkey->sk_attno == skey->sk_attno); - - /* Loop over columns of the row condition */ - for (;;) - { - Datum datum; - bool isNull; - - Assert(subkey->sk_flags & SK_ROW_MEMBER); - - if (subkey->sk_attno > tupnatts) - { - /* - * This attribute is truncated (must be high key). The value for - * this attribute in the first non-pivot tuple on the page to the - * right could be any possible value. Assume that truncated - * attribute passes the qual. - */ - Assert(BTreeTupleIsPivot(tuple)); - cmpresult = 0; - if (subkey->sk_flags & SK_ROW_END) - break; - subkey++; - continue; - } - - datum = index_getattr(tuple, - subkey->sk_attno, - tupdesc, - &isNull); - - if (isNull) - { - if (forcenonrequired) - { - /* treating scan's keys as non-required */ - } - else if (subkey->sk_flags & SK_BT_NULLS_FIRST) - { - /* - * Since NULLs are sorted before non-NULLs, we know we have - * reached the lower limit of the range of values for this - * index attr. On a backward scan, we can stop if this qual - * is one of the "must match" subset. We can stop regardless - * of whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a forward scan, however, we must keep going, because we may - * have initially positioned to the start of the index. - * (_bt_advance_array_keys also relies on this behavior during - * forward scans.) - */ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - } - else - { - /* - * Since NULLs are sorted after non-NULLs, we know we have - * reached the upper limit of the range of values for this - * index attr. On a forward scan, we can stop if this qual is - * one of the "must match" subset. We can stop regardless of - * whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a backward scan, however, we must keep going, because we - * may have initially positioned to the end of the index. - * (_bt_advance_array_keys also relies on this behavior during - * backward scans.) - */ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && - ScanDirectionIsForward(dir)) - *continuescan = false; - } - - /* - * In any case, this indextuple doesn't match the qual. - */ - return false; - } - - if (subkey->sk_flags & SK_ISNULL) - { - /* - * Unlike the simple-scankey case, this isn't a disallowed case - * (except when it's the first row element that has the NULL arg). - * But it can never match. If all the earlier row comparison - * columns are required for the scan direction, we can stop the - * scan, because there can't be another tuple that will succeed. 
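The row-comparison loop below computes a three-way comparison in ASC terms, then inverts it for DESC columns. INVERT_COMPARE_RESULT is the real macro from src/include/c.h; the sample value is invented:

int32		cmpresult = -1;		/* say: datum sorts before sk_argument, ASC-wise */

INVERT_COMPARE_RESULT(cmpresult);	/* DESC column: cmpresult is now 1 */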
- */ - Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); - subkey--; - if (forcenonrequired) - { - /* treating scan's keys as non-required */ - } - else if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) - *continuescan = false; - else if ((subkey->sk_flags & SK_BT_REQBKWD) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - return false; - } - - /* Perform the test --- three-way comparison not bool operator */ - cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, - subkey->sk_collation, - datum, - subkey->sk_argument)); - - if (subkey->sk_flags & SK_BT_DESC) - INVERT_COMPARE_RESULT(cmpresult); - - /* Done comparing if unequal, else advance to next column */ - if (cmpresult != 0) - break; - - if (subkey->sk_flags & SK_ROW_END) - break; - subkey++; - } - - /* - * At this point cmpresult indicates the overall result of the row - * comparison, and subkey points to the deciding column (or the last - * column if the result is "="). - */ - switch (subkey->sk_strategy) - { - /* EQ and NE cases aren't allowed here */ - case BTLessStrategyNumber: - result = (cmpresult < 0); - break; - case BTLessEqualStrategyNumber: - result = (cmpresult <= 0); - break; - case BTGreaterEqualStrategyNumber: - result = (cmpresult >= 0); - break; - case BTGreaterStrategyNumber: - result = (cmpresult > 0); - break; - default: - elog(ERROR, "unexpected strategy number %d", subkey->sk_strategy); - result = 0; /* keep compiler quiet */ - break; - } + BTStack ostack; - if (!result && !forcenonrequired) + while (stack != NULL) { - /* - * Tuple fails this qual. If it's a required qual for the current - * scan direction, then we can conclude no further tuples will pass, - * either. Note we have to look at the deciding column, not - * necessarily the first or last column of the row condition. - */ - if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) - *continuescan = false; - else if ((subkey->sk_flags & SK_BT_REQBKWD) && - ScanDirectionIsBackward(dir)) - *continuescan = false; + ostack = stack; + stack = stack->bts_parent; + pfree(ostack); } - - return result; } /* - * Determine if a scan with array keys should skip over uninteresting tuples. - * - * This is a subroutine for _bt_checkkeys. Called when _bt_readpage's linear - * search process (started after it finishes reading an initial group of - * matching tuples, used to locate the start of the next group of tuples - * matching the next set of required array keys) has already scanned an - * excessive number of tuples whose key space is "between arrays". - * - * When we perform look ahead successfully, we'll set pstate.skip, which - * instructs _bt_readpage to skip ahead to that tuple next (could be past the - * end of the scan's leaf page). Pages where the optimization is effective - * will generally still need to skip several times. Each call here performs - * only a single "look ahead" comparison of a later tuple, whose distance from - * the current tuple's offset number is determined by applying heuristics. 
+ * qsort comparison function for int arrays */ -static void -_bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, - int tupnatts, TupleDesc tupdesc) +static int +_bt_compare_int(const void *va, const void *vb) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; - ScanDirection dir = so->currPos.dir; - OffsetNumber aheadoffnum; - IndexTuple ahead; - - Assert(!pstate->forcenonrequired); - - /* Avoid looking ahead when comparing the page high key */ - if (pstate->offnum < pstate->minoff) - return; - - /* - * Don't look ahead when there aren't enough tuples remaining on the page - * (in the current scan direction) for it to be worth our while - */ - if (ScanDirectionIsForward(dir) && - pstate->offnum >= pstate->maxoff - LOOK_AHEAD_DEFAULT_DISTANCE) - return; - else if (ScanDirectionIsBackward(dir) && - pstate->offnum <= pstate->minoff + LOOK_AHEAD_DEFAULT_DISTANCE) - return; - - /* - * The look ahead distance starts small, and ramps up as each call here - * allows _bt_readpage to skip over more tuples - */ - if (!pstate->targetdistance) - pstate->targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE; - else if (pstate->targetdistance < MaxIndexTuplesPerPage / 2) - pstate->targetdistance *= 2; - - /* Don't read past the end (or before the start) of the page, though */ - if (ScanDirectionIsForward(dir)) - aheadoffnum = Min((int) pstate->maxoff, - (int) pstate->offnum + pstate->targetdistance); - else - aheadoffnum = Max((int) pstate->minoff, - (int) pstate->offnum - pstate->targetdistance); + int a = *((const int *) va); + int b = *((const int *) vb); - ahead = (IndexTuple) PageGetItem(pstate->page, - PageGetItemId(pstate->page, aheadoffnum)); - if (_bt_tuple_before_array_skeys(scan, dir, ahead, tupdesc, tupnatts, - false, 0, NULL)) - { - /* - * Success -- instruct _bt_readpage to skip ahead to very next tuple - * after the one we determined was still before the current array keys - */ - if (ScanDirectionIsForward(dir)) - pstate->skip = aheadoffnum + 1; - else - pstate->skip = aheadoffnum - 1; - } - else - { - /* - * Failure -- "ahead" tuple is too far ahead (we were too aggressive). - * - * Reset the number of rechecks, and aggressively reduce the target - * distance (we're much more aggressive here than we were when the - * distance was initially ramped up). - */ - pstate->rechecks = 0; - pstate->targetdistance = Max(pstate->targetdistance / 8, 1); - } + return pg_cmp_s32(a, b); } /* @@ -3330,87 +180,100 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, * current page and killed tuples thereon (generally, this should only be * called if so->numKilled > 0). * - * The caller does not have a lock on the page and may or may not have the - * page pinned in a buffer. Note that read-lock is sufficient for setting - * LP_DEAD status (which is only a hint). - * - * We match items by heap TID before assuming they are the right ones to - * delete. We cope with cases where items have moved right due to insertions. - * If an item has moved off the current page due to a split, we'll fail to - * find it and do nothing (this is not an error case --- we assume the item - * will eventually get marked in a future indexscan). - * - * Note that if we hold a pin on the target page continuously from initially - * reading the items until applying this function, VACUUM cannot have deleted - * any items from the page, and so there is no need to search left from the - * recorded offset. 
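A standalone illustration of the sort + unique-ify step that _bt_killitems now performs on so->killedItems[], shown below. qunique() is the real in-place helper from src/include/lib/qunique.h, and _bt_compare_int is the comparator added above; the sample array is invented:

static void
example_sort_and_uniqueify(void)
{
	int			items[] = {7, 3, 7, 1, 3};
	size_t		nitems = lengthof(items);

	qsort(items, nitems, sizeof(int), _bt_compare_int);
	nitems = qunique(items, nitems, sizeof(int), _bt_compare_int);

	/* items[] now starts {1, 3, 7}, and nitems == 3 */
}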
(This observation also guarantees that the item is still - * the right one to delete, which might otherwise be questionable since heap - * TIDs can get recycled.) This holds true even if the page has been modified - * by inserts and page splits, so there is no need to consult the LSN. - * - * If the pin was released after reading the page, then we re-read it. If it - * has been modified since we read it (as determined by the LSN), we dare not - * flag any entries because it is possible that the old entry was vacuumed - * away and the TID was re-used by a completely different heap tuple. + * Caller should not have a lock on the so->currPos page, but must hold a + * buffer pin when !so->dropPin. When we return, it still won't be locked. + * It'll continue to hold whatever pins were held before calling here. + * + * We match items by heap TID before assuming they are the right ones to set + * LP_DEAD. If the scan is one that holds a buffer pin on the target page + * continuously from initially reading the items until applying this function + * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the + * page, so the page's TIDs can't have been recycled by now. There's no risk + * that we'll confuse a new index tuple that happens to use a recycled TID + * with a now-removed tuple with the same TID (that used to be on this same + * page). We can't rely on that during scans that drop buffer pins eagerly + * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on + * the page LSN having not changed since back when _bt_readpage saw the page. + * We totally give up on setting LP_DEAD bits when the page LSN changed. + * + * We give up much less often during !so->dropPin scans, but it still happens. + * We cope with cases where items have moved right due to insertions. If an + * item has moved off the current page due to a split, we'll fail to find it + * and just give up on it. */ void _bt_killitems(IndexScanDesc scan) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; - int i; int numKilled = so->numKilled; bool killedsomething = false; - bool droppedpin PG_USED_FOR_ASSERTS_ONLY; + Buffer buf; + Assert(numKilled > 0); Assert(BTScanPosIsValid(so->currPos)); + Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */ + + /* Always invalidate so->killedItems[] before leaving so->currPos */ + so->numKilled = 0; /* - * Always reset the scan state, so we don't look for same items on other - * pages. + * We need to iterate through so->killedItems[] in leaf page order; the + * loop below expects this (when marking posting list tuples, at least). + * so->killedItems[] is now in whatever order the scan returned items in. + * Scrollable cursor scans might have even saved the same item/TID twice. + * + * Sort and unique-ify so->killedItems[] to deal with all this. */ - so->numKilled = 0; + if (numKilled > 1) + { + qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int); + numKilled = qunique(so->killedItems, numKilled, sizeof(int), + _bt_compare_int); + } - if (BTScanPosIsPinned(so->currPos)) + if (!so->dropPin) { /* * We have held the pin on this page since we read the index tuples, * so all we need to do is lock it. The pin will have prevented - * re-use of any TID on the page, so there is no need to check the - * LSN. + * concurrent VACUUMs from recycling any of the TIDs on the page. 
*/ - droppedpin = false; - _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ); - - page = BufferGetPage(so->currPos.buf); + Assert(BTScanPosIsPinned(so->currPos)); + buf = so->currPos.buf; + _bt_lockbuf(rel, buf, BT_READ); } else { - Buffer buf; + XLogRecPtr latestlsn; - droppedpin = true; - /* Attempt to re-read the buffer, getting pin and lock. */ - buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); + Assert(!BTScanPosIsPinned(so->currPos)); + Assert(RelationNeedsWAL(rel)); + buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); - page = BufferGetPage(buf); - if (BufferGetLSNAtomic(buf) == so->currPos.lsn) - so->currPos.buf = buf; - else + latestlsn = BufferGetLSNAtomic(buf); + Assert(so->currPos.lsn <= latestlsn); + if (so->currPos.lsn != latestlsn) { - /* Modified while not pinned means hinting is not safe. */ - _bt_relbuf(scan->indexRelation, buf); + /* Modified, give up on hinting */ + _bt_relbuf(rel, buf); return; } + + /* Unmodified, hinting is safe */ } + page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); - for (i = 0; i < numKilled; i++) + /* Iterate through so->killedItems[] in leaf page order */ + for (int i = 0; i < numKilled; i++) { int itemIndex = so->killedItems[i]; BTScanPosItem *kitem = &so->currPos.items[itemIndex]; @@ -3418,6 +281,9 @@ _bt_killitems(IndexScanDesc scan) Assert(itemIndex >= so->currPos.firstItem && itemIndex <= so->currPos.lastItem); + Assert(i == 0 || + offnum >= so->currPos.items[so->killedItems[i - 1]].indexOffset); + if (offnum < minoff) continue; /* pure paranoia */ while (offnum <= maxoff) @@ -3433,16 +299,8 @@ _bt_killitems(IndexScanDesc scan) int j; /* - * We rely on the convention that heap TIDs in the scanpos - * items array are stored in ascending heap TID order for a - * group of TIDs that originally came from a posting list - * tuple. This convention even applies during backwards - * scans, where returning the TIDs in descending order might - * seem more natural. This is about effectiveness, not - * correctness. - * * Note that the page may have been modified in almost any way - * since we first read it (in the !droppedpin case), so it's + * since we first read it (in the !so->dropPin case), so it's * possible that this posting list tuple wasn't a posting list * tuple when we first encountered its heap TIDs. */ @@ -3458,7 +316,7 @@ _bt_killitems(IndexScanDesc scan) * though only in the common case where the page can't * have been concurrently modified */ - Assert(kitem->indexOffset == offnum || !droppedpin); + Assert(kitem->indexOffset == offnum || !so->dropPin); /* * Read-ahead to later kitems here. 
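
Aside on the qsort()/qunique() step above: sorting so->killedItems[] is what entitles the loop that follows to assert that index offsets arrive in ascending leaf-page order, and unique-ifying covers scrollable cursors that saved the same item twice. Below is a minimal, self-contained sketch of that same sort-then-unique pattern in plain C. It compiles outside the tree: compare_int mirrors _bt_compare_int's contract (the patch uses pg_cmp_s32; the subtraction-free form here is the portable equivalent), and unique_ints is a hypothetical stand-in for qunique() from src/include/lib/qunique.h.

#include <stdio.h>
#include <stdlib.h>

/* Same contract as _bt_compare_int: return <0, 0, or >0, written so it
 * cannot overflow for any pair of ints */
static int
compare_int(const void *va, const void *vb)
{
	int			a = *(const int *) va;
	int			b = *(const int *) vb;

	return (a > b) - (a < b);
}

/* In-place dedup of a sorted array, returning the new element count, in
 * the same spirit as qunique() */
static int
unique_ints(int *a, int n)
{
	int			j = 0;

	if (n == 0)
		return 0;
	for (int i = 1; i < n; i++)
		if (a[i] != a[j])
			a[++j] = a[i];
	return j + 1;
}

int
main(void)
{
	/* item indexes as a scrollable cursor might have saved them: out of
	 * order, with one item saved twice */
	int			killed[] = {7, 3, 7, 1, 5};
	int			n = sizeof(killed) / sizeof(killed[0]);

	qsort(killed, n, sizeof(int), compare_int);
	n = unique_ints(killed, n);

	for (int i = 0; i < n; i++)
		printf("%d ", killed[i]);	/* prints: 1 3 5 7 */
	printf("\n");
	return 0;
}
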
@@ -3522,10 +380,13 @@ _bt_killitems(IndexScanDesc scan) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - MarkBufferDirtyHint(so->currPos.buf, true); + MarkBufferDirtyHint(buf, true); } - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!so->dropPin) + _bt_unlockbuf(rel, buf); + else + _bt_relbuf(rel, buf); } diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index d31dd56732d2f..dbe67c166fdff 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -38,7 +38,7 @@ _bt_restore_page(Page page, char *from, int len) IndexTupleData itupdata; Size itemsz; char *end = from + len; - Item items[MaxIndexTuplesPerPage]; + void *items[MaxIndexTuplesPerPage]; uint16 itemsizes[MaxIndexTuplesPerPage]; int i; int nitems; @@ -53,16 +53,15 @@ _bt_restore_page(Page page, char *from, int len) { /* * As we step through the items, 'from' won't always be properly - * aligned, so we need to use memcpy(). Further, we use Item (which - * is just a char*) here for our items array for the same reason; - * wouldn't want the compiler or anyone thinking that an item is - * aligned when it isn't. + * aligned, so we need to use memcpy(). Further, we use void * here + * for our items array for the same reason; wouldn't want the compiler + * or anyone thinking that an item is aligned when it isn't. */ memcpy(&itupdata, from, sizeof(IndexTupleData)); itemsz = IndexTupleSize(&itupdata); itemsz = MAXALIGN(itemsz); - items[i] = (Item) from; + items[i] = from; itemsizes[i] = itemsz; i++; @@ -72,8 +71,7 @@ _bt_restore_page(Page page, char *from, int len) for (i = nitems - 1; i >= 0; i--) { - if (PageAddItem(page, items[i], itemsizes[i], nitems - i, - false, false) == InvalidOffsetNumber) + if (PageAddItem(page, items[i], itemsizes[i], nitems - i, false, false) == InvalidOffsetNumber) elog(PANIC, "_bt_restore_page: cannot add item to page"); } } @@ -143,7 +141,7 @@ _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id) if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO) { - Page page = (Page) BufferGetPage(buf); + Page page = BufferGetPage(buf); BTPageOpaque pageop = BTPageGetOpaque(page); Assert(P_INCOMPLETE_SPLIT(pageop)); @@ -186,8 +184,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting, if (!posting) { /* Simple retail insertion */ - if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) + if (PageAddItem(page, datapos, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add new item"); } else @@ -225,8 +222,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting, /* Insert "final" new item (not orignewitem from WAL stream) */ Assert(IndexTupleSize(newitem) == datalen); - if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) + if (PageAddItem(page, newitem, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add posting split new item"); } @@ -287,7 +283,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* Reconstruct right (new) sibling page from scratch */ rbuf = XLogInitBufferForRedo(record, 1); datapos = XLogRecGetBlockData(record, 1, &datalen); - rpage = (Page) BufferGetPage(rbuf); + rpage = BufferGetPage(rbuf); _bt_pageinit(rpage, BufferGetPageSize(rbuf)); ropaque = BTPageGetOpaque(rpage); @@ -314,7 +310,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) * checking possible. 
See also _bt_restore_page(), which does the * same for the right page. */ - Page origpage = (Page) BufferGetPage(buf); + Page origpage = BufferGetPage(buf); BTPageOpaque oopaque = BTPageGetOpaque(origpage); OffsetNumber off; IndexTuple newitem = NULL, @@ -368,8 +364,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* Add high key tuple from WAL record to temp page */ leftoff = P_HIKEY; - if (PageAddItem(leftpage, (Item) left_hikey, left_hikeysz, P_HIKEY, - false, false) == InvalidOffsetNumber) + if (PageAddItem(leftpage, left_hikey, left_hikeysz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add high key to left page after split"); leftoff = OffsetNumberNext(leftoff); @@ -384,9 +379,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) { Assert(newitemonleft || xlrec->firstrightoff == xlrec->newitemoff); - if (PageAddItem(leftpage, (Item) nposting, - MAXALIGN(IndexTupleSize(nposting)), leftoff, - false, false) == InvalidOffsetNumber) + if (PageAddItem(leftpage, nposting, MAXALIGN(IndexTupleSize(nposting)), leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new posting list item to left page after split"); leftoff = OffsetNumberNext(leftoff); continue; /* don't insert oposting */ @@ -395,8 +388,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* add the new item if it was inserted on left page */ else if (newitemonleft && off == xlrec->newitemoff) { - if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff, - false, false) == InvalidOffsetNumber) + if (PageAddItem(leftpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } @@ -404,8 +396,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) itemid = PageGetItemId(origpage, off); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); - if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, - false, false) == InvalidOffsetNumber) + if (PageAddItem(leftpage, item, itemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add old item to left page after split"); leftoff = OffsetNumberNext(leftoff); } @@ -413,8 +404,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* cope with possibility that newitem goes at the end */ if (newitemonleft && off == xlrec->newitemoff) { - if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff, - false, false) == InvalidOffsetNumber) + if (PageAddItem(leftpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } @@ -439,7 +429,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO) { - Page spage = (Page) BufferGetPage(sbuf); + Page spage = BufferGetPage(sbuf); BTPageOpaque spageop = BTPageGetOpaque(spage); spageop->btpo_prev = rightpagenumber; @@ -470,7 +460,7 @@ btree_xlog_dedup(XLogReaderState *record) if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) { char *ptr = XLogRecGetBlockData(record, 0, NULL); - Page page = (Page) BufferGetPage(buf); + Page page = BufferGetPage(buf); BTPageOpaque opaque = BTPageGetOpaque(page); OffsetNumber offnum, minoff, @@ -479,7 +469,7 @@ btree_xlog_dedup(XLogReaderState *record) BTDedupInterval *intervals; Page newpage; - state = (BTDedupState) palloc(sizeof(BTDedupStateData)); 
+ state = palloc_object(BTDedupStateData); state->deduplicate = true; /* unused */ state->nmaxitems = 0; /* unused */ /* Conservatively use larger maxpostingsize than primary */ @@ -503,8 +493,7 @@ btree_xlog_dedup(XLogReaderState *record) Size itemsz = ItemIdGetLength(itemid); IndexTuple item = (IndexTuple) PageGetItem(page, itemid); - if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY, - false, false) == InvalidOffsetNumber) + if (PageAddItem(newpage, item, itemsz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "deduplication failed to add highkey"); } @@ -580,8 +569,7 @@ btree_xlog_updates(Page page, OffsetNumber *updatedoffsets, /* Overwrite updated version of tuple */ itemsz = MAXALIGN(IndexTupleSize(vacposting->itup)); - if (!PageIndexTupleOverwrite(page, updatedoffsets[i], - (Item) vacposting->itup, itemsz)) + if (!PageIndexTupleOverwrite(page, updatedoffsets[i], vacposting->itup, itemsz)) elog(PANIC, "failed to update partially dead item"); pfree(vacposting->itup); @@ -614,7 +602,7 @@ btree_xlog_vacuum(XLogReaderState *record) { char *ptr = XLogRecGetBlockData(record, 0, NULL); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (xlrec->nupdated > 0) { @@ -680,7 +668,7 @@ btree_xlog_delete(XLogReaderState *record) { char *ptr = XLogRecGetBlockData(record, 0, NULL); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (xlrec->nupdated > 0) { @@ -740,7 +728,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) OffsetNumber nextoffset; BlockNumber rightsib; - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); pageop = BTPageGetOpaque(page); poffset = xlrec->poffset; @@ -769,7 +757,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) /* Rewrite the leaf page as a halfdead page */ buffer = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop = BTPageGetOpaque(page); @@ -788,8 +776,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) trunctuple.t_info = sizeof(IndexTupleData); BTreeTupleSetTopParent(&trunctuple, xlrec->topparent); - if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, - false, false) == InvalidOffsetNumber) + if (PageAddItem(page, &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); @@ -836,7 +823,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) { if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(leftbuf); + page = BufferGetPage(leftbuf); pageop = BTPageGetOpaque(page); pageop->btpo_next = rightsib; @@ -849,7 +836,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) /* Rewrite target page as empty deleted page */ target = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(target); + page = BufferGetPage(target); _bt_pageinit(page, BufferGetPageSize(target)); pageop = BTPageGetOpaque(page); @@ -868,7 +855,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) /* Fix left-link of right sibling */ if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(rightbuf); + page = BufferGetPage(rightbuf); pageop = BTPageGetOpaque(page); pageop->btpo_prev = leftsib; @@ -907,7 +894,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) Assert(!isleaf); leafbuf = 
XLogInitBufferForRedo(record, 3); - page = (Page) BufferGetPage(leafbuf); + page = BufferGetPage(leafbuf); _bt_pageinit(page, BufferGetPageSize(leafbuf)); pageop = BTPageGetOpaque(page); @@ -923,8 +910,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) trunctuple.t_info = sizeof(IndexTupleData); BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent); - if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, - false, false) == InvalidOffsetNumber) + if (PageAddItem(page, &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); @@ -949,7 +935,7 @@ btree_xlog_newroot(XLogReaderState *record) Size len; buffer = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop = BTPageGetOpaque(page); diff --git a/src/backend/access/rmgrdesc/genericdesc.c b/src/backend/access/rmgrdesc/genericdesc.c index 75dc4108b9aa2..29a4c9e894bf5 100644 --- a/src/backend/access/rmgrdesc/genericdesc.c +++ b/src/backend/access/rmgrdesc/genericdesc.c @@ -23,8 +23,8 @@ void generic_desc(StringInfo buf, XLogReaderState *record) { - Pointer ptr = XLogRecGetData(record), - end = ptr + XLogRecGetDataLen(record); + const char *ptr = XLogRecGetData(record); + const char *end = ptr + XLogRecGetDataLen(record); while (ptr < end) { diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c index 723ff9499cf46..62e21f8c93578 100644 --- a/src/backend/access/rmgrdesc/gindesc.c +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -23,7 +23,7 @@ desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData) int i; char *walbuf = ((char *) insertData) + sizeof(ginxlogRecompressDataLeaf); - appendStringInfo(buf, " %d segments:", (int) insertData->nactions); + appendStringInfo(buf, " %d segments:", insertData->nactions); for (i = 0; i < insertData->nactions; i++) { @@ -99,14 +99,7 @@ gin_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, " children: %u/%u", leftChildBlkno, rightChildBlkno); } - if (XLogRecHasBlockImage(record, 0)) - { - if (XLogRecBlockImageApply(record, 0)) - appendStringInfoString(buf, " (full page image)"); - else - appendStringInfoString(buf, " (full page image, for WAL verification)"); - } - else + if (!XLogRecHasBlockImage(record, 0)) { char *payload = XLogRecGetBlockData(record, 0, NULL); @@ -137,6 +130,9 @@ gin_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, " isdata: %c isleaf: %c", (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F', (xlrec->flags & GIN_INSERT_ISLEAF) ? 
'T' : 'F'); + if (xlrec->leftChildBlkno != InvalidBlockNumber) + appendStringInfo(buf, " children: %u/%u", + xlrec->leftChildBlkno, xlrec->rightChildBlkno); } break; case XLOG_GIN_VACUUM_PAGE: @@ -144,14 +140,7 @@ gin_desc(StringInfo buf, XLogReaderState *record) break; case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: { - if (XLogRecHasBlockImage(record, 0)) - { - if (XLogRecBlockImageApply(record, 0)) - appendStringInfoString(buf, " (full page image)"); - else - appendStringInfoString(buf, " (full page image, for WAL verification)"); - } - else + if (!XLogRecHasBlockImage(record, 0)) { ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, NULL); @@ -164,10 +153,27 @@ gin_desc(StringInfo buf, XLogReaderState *record) /* no further information */ break; case XLOG_GIN_UPDATE_META_PAGE: - /* no further information */ + { + ginxlogUpdateMeta *xlrec = (ginxlogUpdateMeta *) rec; + + appendStringInfo(buf, "ntuples: %d", xlrec->ntuples); + if (xlrec->prevTail != InvalidBlockNumber) + appendStringInfo(buf, " prevTail: %u", + xlrec->prevTail); + if (xlrec->newRightlink != InvalidBlockNumber) + appendStringInfo(buf, " newRightLink: %u", + xlrec->newRightlink); + } break; case XLOG_GIN_INSERT_LISTPAGE: - /* no further information */ + { + ginxlogInsertListPage *xlrec = (ginxlogInsertListPage *) rec; + + appendStringInfo(buf, "ntuples: %d", xlrec->ntuples); + if (xlrec->rightlink != InvalidBlockNumber) + appendStringInfo(buf, " rightlink: %u", + xlrec->rightlink); + } break; case XLOG_GIN_DELETE_LISTPAGE: appendStringInfo(buf, "ndeleted: %d", diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c index 75f43a9152071..2ee5332452f39 100644 --- a/src/backend/access/rmgrdesc/hashdesc.c +++ b/src/backend/access/rmgrdesc/hashdesc.c @@ -28,8 +28,10 @@ hash_desc(StringInfo buf, XLogReaderState *record) { xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec; - appendStringInfo(buf, "num_tuples %g, fillfactor %d", - xlrec->num_tuples, xlrec->ffactor); + appendStringInfo(buf, "num_tuples %g, procid %u, fillfactor %d", + xlrec->num_tuples, + xlrec->procid, + xlrec->ffactor); break; } case XLOG_HASH_INIT_BITMAP_PAGE: @@ -58,8 +60,10 @@ hash_desc(StringInfo buf, XLogReaderState *record) { xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec; - appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c", + appendStringInfo(buf, "new_bucket %u, old_bucket_flag %u, new_bucket_flag %u, meta_page_masks_updated %c, issplitpoint_changed %c", xlrec->new_bucket, + xlrec->old_bucket_flag, + xlrec->new_bucket_flag, (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F', (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F'); break; @@ -85,11 +89,12 @@ hash_desc(StringInfo buf, XLogReaderState *record) { xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec; - appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c", + appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c, is_previous %c", xlrec->prevblkno, xlrec->nextblkno, xlrec->ntups, - xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F', + xlrec->is_prev_bucket_same_wrt ? 
'T' : 'F'); break; } case XLOG_HASH_DELETE: diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 82b62c95de574..ca26d1f0ed151 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -16,6 +16,7 @@ #include "access/heapam_xlog.h" #include "access/rmgrdesc_utils.h" +#include "access/visibilitymapdefs.h" #include "storage/standbydefs.h" /* @@ -102,7 +103,7 @@ plan_elem_desc(StringInfo buf, void *plan, void *data) * code, the latter of which is used in frontend (pg_waldump) code. */ void -heap_xlog_deserialize_prune_and_freeze(char *cursor, uint8 flags, +heap_xlog_deserialize_prune_and_freeze(char *cursor, uint16 flags, int *nplans, xlhp_freeze_plan **plans, OffsetNumber **frz_offsets, int *nredirected, OffsetNumber **redirected, @@ -286,6 +287,15 @@ heap2_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, ", isCatalogRel: %c", xlrec->flags & XLHP_IS_CATALOG_REL ? 'T' : 'F'); + if (xlrec->flags & XLHP_VM_ALL_VISIBLE) + { + uint8 vmflags = VISIBILITYMAP_ALL_VISIBLE; + + if (xlrec->flags & XLHP_VM_ALL_FROZEN) + vmflags |= VISIBILITYMAP_ALL_FROZEN; + appendStringInfo(buf, ", vm_flags: 0x%02X", vmflags); + } + if (XLogRecHasBlockData(record, 0)) { Size datalen; @@ -354,6 +364,11 @@ heap2_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "ntuples: %d, flags: 0x%02X", xlrec->ntuples, xlrec->flags); + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + appendStringInfo(buf, ", vm_flags: 0x%02X", + VISIBILITYMAP_ALL_VISIBLE | + VISIBILITYMAP_ALL_FROZEN); + if (XLogRecHasBlockData(record, 0) && !isinit) { appendStringInfoString(buf, ", offsets:"); diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index 3ca0582db3647..052dd0a4ce56d 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -65,7 +65,7 @@ multixact_desc(StringInfo buf, XLogReaderState *record) xl_multixact_create *xlrec = (xl_multixact_create *) rec; int i; - appendStringInfo(buf, "%u offset %u nmembers %d: ", xlrec->mid, + appendStringInfo(buf, "%u offset %" PRIu64 " nmembers %d: ", xlrec->mid, xlrec->moff, xlrec->nmembers); for (i = 0; i < xlrec->nmembers; i++) out_member(buf, &xlrec->members[i]); @@ -74,7 +74,7 @@ multixact_desc(StringInfo buf, XLogReaderState *record) { xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec; - appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)", + appendStringInfo(buf, "offsets [%u, %u), members [%" PRIu64 ", %" PRIu64 ")", xlrec->startTruncOff, xlrec->endTruncOff, xlrec->startTruncMemb, xlrec->endTruncMemb); } diff --git a/src/backend/access/rmgrdesc/replorigindesc.c b/src/backend/access/rmgrdesc/replorigindesc.c index 5dd742339969a..35e3af2903ed2 100644 --- a/src/backend/access/rmgrdesc/replorigindesc.c +++ b/src/backend/access/rmgrdesc/replorigindesc.c @@ -29,7 +29,7 @@ replorigin_desc(StringInfo buf, XLogReaderState *record) xlrec = (xl_replorigin_set *) rec; - appendStringInfo(buf, "set %u; lsn %X/%X; force: %d", + appendStringInfo(buf, "set %u; lsn %X/%08X; force: %d", xlrec->node_id, LSN_FORMAT_ARGS(xlrec->remote_lsn), xlrec->force); diff --git a/src/backend/access/rmgrdesc/seqdesc.c b/src/backend/access/rmgrdesc/seqdesc.c index 0d289d77fcf7a..a0edb78856bdf 100644 --- a/src/backend/access/rmgrdesc/seqdesc.c +++ b/src/backend/access/rmgrdesc/seqdesc.c @@ -14,7 +14,7 @@ */ #include "postgres.h" -#include "commands/sequence.h" +#include "commands/sequence_xlog.h" void diff --git 
a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 305598e2865c8..f0f696855b9af 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -359,7 +359,7 @@ xact_desc_commit(StringInfo buf, uint8 info, xl_xact_commit *xlrec, RepOriginId if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN) { - appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s", origin_id, LSN_FORMAT_ARGS(parsed.origin_lsn), timestamptz_to_str(parsed.origin_timestamp)); @@ -384,7 +384,7 @@ xact_desc_abort(StringInfo buf, uint8 info, xl_xact_abort *xlrec, RepOriginId or if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN) { - appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s", origin_id, LSN_FORMAT_ARGS(parsed.origin_lsn), timestamptz_to_str(parsed.origin_timestamp)); @@ -418,7 +418,7 @@ xact_desc_prepare(StringInfo buf, uint8 info, xl_xact_prepare *xlrec, RepOriginI * way as PrepareRedoAdd(). */ if (origin_id != InvalidRepOriginId) - appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s", origin_id, LSN_FORMAT_ARGS(parsed.origin_lsn), timestamptz_to_str(parsed.origin_timestamp)); diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 58040f28656fc..cd8edf5cc4902 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -65,8 +65,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) { CheckPoint *checkpoint = (CheckPoint *) rec; - appendStringInfo(buf, "redo %X/%X; " - "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; " + appendStringInfo(buf, "redo %X/%08X; " + "tli %u; prev tli %u; fpw %s; wal_level %s; logical decoding %s; xid %u:%u; oid %u; multi %u; offset %" PRIu64 "; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " "oldest running xid %u; %s", @@ -75,6 +75,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->PrevTimeLineID, checkpoint->fullPageWrites ? "true" : "false", get_wal_level_string(checkpoint->wal_level), + checkpoint->logicalDecodingEnabled ? "true" : "false", EpochFromFullTransactionId(checkpoint->nextXid), XidFromFullTransactionId(checkpoint->nextXid), checkpoint->nextOid, @@ -111,7 +112,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) XLogRecPtr startpoint; memcpy(&startpoint, rec, sizeof(XLogRecPtr)); - appendStringInfo(buf, "%X/%X", LSN_FORMAT_ARGS(startpoint)); + appendStringInfo(buf, "%X/%08X", LSN_FORMAT_ARGS(startpoint)); } else if (info == XLOG_PARAMETER_CHANGE) { @@ -156,7 +157,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xl_overwrite_contrecord xlrec; memcpy(&xlrec, rec, sizeof(xl_overwrite_contrecord)); - appendStringInfo(buf, "lsn %X/%X; time %s", + appendStringInfo(buf, "lsn %X/%08X; time %s", LSN_FORMAT_ARGS(xlrec.overwritten_lsn), timestamptz_to_str(xlrec.overwrite_time)); } @@ -167,6 +168,13 @@ xlog_desc(StringInfo buf, XLogReaderState *record) memcpy(&wal_level, rec, sizeof(int)); appendStringInfo(buf, "wal_level %s", get_wal_level_string(wal_level)); } + else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE) + { + bool enabled; + + memcpy(&enabled, rec, sizeof(bool)); + appendStringInfoString(buf, enabled ? 
"true" : "false"); + } } const char * @@ -218,6 +226,9 @@ xlog_identify(uint8 info) case XLOG_CHECKPOINT_REDO: id = "CHECKPOINT_REDO"; break; + case XLOG_LOGICAL_DECODING_STATUS_CHANGE: + id = "LOGICAL_DECODING_STATUS_CHANGE"; + break; } return id; diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c index af6b27b2135ac..1e1e61d20447a 100644 --- a/src/backend/access/spgist/spgdoinsert.c +++ b/src/backend/access/spgist/spgdoinsert.c @@ -89,7 +89,7 @@ addNode(SpGistState *state, SpGistInnerTuple tuple, Datum label, int offset) else if (offset > tuple->nNodes) elog(ERROR, "invalid offset for adding node to SPGiST inner tuple"); - nodes = palloc(sizeof(SpGistNodeTuple) * (tuple->nNodes + 1)); + nodes = palloc_array(SpGistNodeTuple, tuple->nNodes + 1); SGITITERATE(tuple, i, node) { if (i < offset) @@ -165,8 +165,7 @@ spgPageIndexMultiDelete(SpGistState *state, Page page, if (tuple == NULL || tuple->tupstate != tupstate) tuple = spgFormDeadTuple(state, tupstate, blkno, offnum); - if (PageAddItem(page, (Item) tuple, tuple->size, - itemno, false, false) != itemno) + if (PageAddItem(page, tuple, tuple->size, itemno, false, false) != itemno) elog(ERROR, "failed to add item of size %u to SPGiST index page", tuple->size); @@ -222,7 +221,7 @@ addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, /* Tuple is not part of a chain */ SGLT_SET_NEXTOFFSET(leafTuple, InvalidOffsetNumber); current->offnum = SpGistPageAddNewItem(state, current->page, - (Item) leafTuple, leafTuple->size, + leafTuple, leafTuple->size, NULL, false); xlrec.offnumLeaf = current->offnum; @@ -255,7 +254,7 @@ addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, { SGLT_SET_NEXTOFFSET(leafTuple, SGLT_GET_NEXTOFFSET(head)); offnum = SpGistPageAddNewItem(state, current->page, - (Item) leafTuple, leafTuple->size, + leafTuple, leafTuple->size, NULL, false); /* @@ -274,7 +273,7 @@ addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, SGLT_SET_NEXTOFFSET(leafTuple, InvalidOffsetNumber); PageIndexTupleDelete(current->page, current->offnum); if (PageAddItem(current->page, - (Item) leafTuple, leafTuple->size, + leafTuple, leafTuple->size, current->offnum, false, false) != current->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", leafTuple->size); @@ -410,8 +409,8 @@ moveLeafs(Relation index, SpGistState *state, /* Locate the tuples to be moved, and count up the space needed */ i = PageGetMaxOffsetNumber(current->page); - toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * i); - toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * (i + 1)); + toDelete = palloc_array(OffsetNumber, i); + toInsert = palloc_array(OffsetNumber, i + 1); size = newLeafTuple->size + sizeof(ItemIdData); @@ -478,8 +477,7 @@ moveLeafs(Relation index, SpGistState *state, */ SGLT_SET_NEXTOFFSET(it, r); - r = SpGistPageAddNewItem(state, npage, (Item) it, it->size, - &startOffset, false); + r = SpGistPageAddNewItem(state, npage, it, it->size, &startOffset, false); toInsert[nInsert] = r; nInsert++; @@ -492,9 +490,7 @@ moveLeafs(Relation index, SpGistState *state, /* add the new tuple as well */ SGLT_SET_NEXTOFFSET(newLeafTuple, r); - r = SpGistPageAddNewItem(state, npage, - (Item) newLeafTuple, newLeafTuple->size, - &startOffset, false); + r = SpGistPageAddNewItem(state, npage, newLeafTuple, newLeafTuple->size, &startOffset, false); toInsert[nInsert] = r; nInsert++; memcpy(leafptr, newLeafTuple, newLeafTuple->size); @@ -638,7 +634,7 @@ 
checkAllTheSame(spgPickSplitIn *in, spgPickSplitOut *out, bool tooBig, { Datum theLabel = out->nodeLabels[theNode]; - out->nodeLabels = (Datum *) palloc(sizeof(Datum) * out->nNodes); + out->nodeLabels = palloc_array(Datum, out->nNodes); for (i = 0; i < out->nNodes; i++) out->nodeLabels[i] = theLabel; } @@ -721,12 +717,12 @@ doPickSplit(Relation index, SpGistState *state, */ max = PageGetMaxOffsetNumber(current->page); n = max + 1; - in.datums = (Datum *) palloc(sizeof(Datum) * n); - toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n); - toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n); - oldLeafs = (SpGistLeafTuple *) palloc(sizeof(SpGistLeafTuple) * n); - newLeafs = (SpGistLeafTuple *) palloc(sizeof(SpGistLeafTuple) * n); - leafPageSelect = (uint8 *) palloc(sizeof(uint8) * n); + in.datums = palloc_array(Datum, n); + toDelete = palloc_array(OffsetNumber, n); + toInsert = palloc_array(OffsetNumber, n); + oldLeafs = palloc_array(SpGistLeafTuple, n); + newLeafs = palloc_array(SpGistLeafTuple, n); + leafPageSelect = palloc_array(uint8, n); STORE_STATE(state, xlrec.stateSrc); @@ -862,7 +858,7 @@ doPickSplit(Relation index, SpGistState *state, out.hasPrefix = false; out.nNodes = 1; out.nodeLabels = NULL; - out.mapTuplesToNodes = palloc0(sizeof(int) * in.nTuples); + out.mapTuplesToNodes = palloc0_array(int, in.nTuples); /* * Form new leaf tuples and count up the total space needed. @@ -918,8 +914,8 @@ doPickSplit(Relation index, SpGistState *state, * out.nNodes with a value larger than the number of tuples on the input * page, we can't allocate these arrays before here. */ - nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * out.nNodes); - leafSizes = (int *) palloc0(sizeof(int) * out.nNodes); + nodes = palloc_array(SpGistNodeTuple, out.nNodes); + leafSizes = palloc0_array(int, out.nNodes); /* * Form nodes of inner tuple and inner tuple itself @@ -1058,7 +1054,7 @@ doPickSplit(Relation index, SpGistState *state, * do so, even if totalLeafSizes is less than the available space, * because we can't split a group across pages. 
*/ - nodePageSelect = (uint8 *) palloc(sizeof(uint8) * out.nNodes); + nodePageSelect = palloc_array(uint8, out.nNodes); curspace = currentFreeSpace; newspace = PageGetExactFreeSpace(BufferGetPage(newLeafBuffer)); @@ -1226,7 +1222,7 @@ doPickSplit(Relation index, SpGistState *state, /* Insert it on page */ newoffset = SpGistPageAddNewItem(state, BufferGetPage(leafBuffer), - (Item) it, it->size, + it, it->size, &startOffsets[leafPageSelect[i]], false); toInsert[i] = newoffset; @@ -1268,7 +1264,7 @@ doPickSplit(Relation index, SpGistState *state, current->page = parent->page; xlrec.offnumInner = current->offnum = SpGistPageAddNewItem(state, current->page, - (Item) innerTuple, innerTuple->size, + innerTuple, innerTuple->size, NULL, false); /* @@ -1302,7 +1298,7 @@ doPickSplit(Relation index, SpGistState *state, current->page = BufferGetPage(current->buffer); xlrec.offnumInner = current->offnum = SpGistPageAddNewItem(state, current->page, - (Item) innerTuple, innerTuple->size, + innerTuple, innerTuple->size, NULL, false); /* Done modifying new current buffer, mark it dirty */ @@ -1340,7 +1336,7 @@ doPickSplit(Relation index, SpGistState *state, xlrec.innerIsParent = false; xlrec.offnumInner = current->offnum = - PageAddItem(current->page, (Item) innerTuple, innerTuple->size, + PageAddItem(current->page, innerTuple, innerTuple->size, InvalidOffsetNumber, false, false); if (current->offnum != FirstOffsetNumber) elog(ERROR, "failed to add item of size %u to SPGiST index page", @@ -1547,7 +1543,7 @@ spgAddNodeAction(Relation index, SpGistState *state, PageIndexTupleDelete(current->page, current->offnum); if (PageAddItem(current->page, - (Item) newInnerTuple, newInnerTuple->size, + newInnerTuple, newInnerTuple->size, current->offnum, false, false) != current->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", newInnerTuple->size); @@ -1631,7 +1627,7 @@ spgAddNodeAction(Relation index, SpGistState *state, /* insert new ... */ xlrec.offnumNew = current->offnum = SpGistPageAddNewItem(state, current->page, - (Item) newInnerTuple, newInnerTuple->size, + newInnerTuple, newInnerTuple->size, NULL, false); MarkBufferDirty(current->buffer); @@ -1654,7 +1650,7 @@ spgAddNodeAction(Relation index, SpGistState *state, current->blkno, current->offnum); PageIndexTupleDelete(saveCurrent.page, saveCurrent.offnum); - if (PageAddItem(saveCurrent.page, (Item) dt, dt->size, + if (PageAddItem(saveCurrent.page, dt, dt->size, saveCurrent.offnum, false, false) != saveCurrent.offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", @@ -1744,8 +1740,7 @@ spgSplitNodeAction(Relation index, SpGistState *state, * Construct new prefix tuple with requested number of nodes. We'll fill * in the childNodeN'th node's downlink below. */ - nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * - out->result.splitTuple.prefixNNodes); + nodes = palloc_array(SpGistNodeTuple, out->result.splitTuple.prefixNNodes); for (i = 0; i < out->result.splitTuple.prefixNNodes; i++) { @@ -1773,7 +1768,7 @@ spgSplitNodeAction(Relation index, SpGistState *state, * same node datums, but with the prefix specified by the picksplit * function. 
*/ - nodes = palloc(sizeof(SpGistNodeTuple) * innerTuple->nNodes); + nodes = palloc_array(SpGistNodeTuple, innerTuple->nNodes); SGITITERATE(innerTuple, i, node) { nodes[i] = node; @@ -1818,7 +1813,7 @@ spgSplitNodeAction(Relation index, SpGistState *state, */ PageIndexTupleDelete(current->page, current->offnum); xlrec.offnumPrefix = PageAddItem(current->page, - (Item) prefixTuple, prefixTuple->size, + prefixTuple, prefixTuple->size, current->offnum, false, false); if (xlrec.offnumPrefix != current->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", @@ -1832,7 +1827,7 @@ spgSplitNodeAction(Relation index, SpGistState *state, postfixBlkno = current->blkno; xlrec.offnumPostfix = postfixOffset = SpGistPageAddNewItem(state, current->page, - (Item) postfixTuple, postfixTuple->size, + postfixTuple, postfixTuple->size, NULL, false); xlrec.postfixBlkSame = true; } @@ -1841,7 +1836,7 @@ spgSplitNodeAction(Relation index, SpGistState *state, postfixBlkno = BufferGetBlockNumber(newBuffer); xlrec.offnumPostfix = postfixOffset = SpGistPageAddNewItem(state, BufferGetPage(newBuffer), - (Item) postfixTuple, postfixTuple->size, + postfixTuple, postfixTuple->size, NULL, false); MarkBufferDirty(newBuffer); xlrec.postfixBlkSame = false; @@ -1912,7 +1907,7 @@ spgSplitNodeAction(Relation index, SpGistState *state, */ bool spgdoinsert(Relation index, SpGistState *state, - ItemPointer heapPtr, Datum *datums, bool *isnulls) + const ItemPointerData *heapPtr, const Datum *datums, const bool *isnulls) { bool result = true; TupleDesc leafDescriptor = state->leafTupDesc; diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 6a61e093fa05a..dda99755f6662 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -140,7 +140,7 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) true); } - result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); + result = palloc0_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; diff --git a/src/backend/access/spgist/spgkdtreeproc.c b/src/backend/access/spgist/spgkdtreeproc.c index d6989759e5fd7..f0167d6ffa688 100644 --- a/src/backend/access/spgist/spgkdtreeproc.c +++ b/src/backend/access/spgist/spgkdtreeproc.c @@ -114,7 +114,7 @@ spg_kd_picksplit(PG_FUNCTION_ARGS) SortedPoint *sorted; double coord; - sorted = palloc(sizeof(*sorted) * in->nTuples); + sorted = palloc_array(SortedPoint, in->nTuples); for (i = 0; i < in->nTuples; i++) { sorted[i].p = DatumGetPointP(in->datums[i]); @@ -132,8 +132,8 @@ spg_kd_picksplit(PG_FUNCTION_ARGS) out->nNodes = 2; out->nodeLabels = NULL; /* we don't need node labels */ - out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples); - out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples); + out->mapTuplesToNodes = palloc_array(int, in->nTuples); + out->leafTupleDatums = palloc_array(Datum, in->nTuples); /* * Note: points that have coordinates exactly equal to coord may get @@ -259,7 +259,7 @@ spg_kd_inner_consistent(PG_FUNCTION_ARGS) if (!which) PG_RETURN_VOID(); - out->nodeNumbers = (int *) palloc(sizeof(int) * 2); + out->nodeNumbers = palloc_array(int, 2); /* * When ordering scan keys are specified, we've to calculate distance for @@ -273,8 +273,8 @@ spg_kd_inner_consistent(PG_FUNCTION_ARGS) BOX infArea; BOX *area; - out->distances = (double **) palloc(sizeof(double *) * in->nNodes); - out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes); + out->distances = 
palloc_array(double *, in->nNodes); + out->traversalValues = palloc_array(void *, in->nNodes); if (in->level == 0) { @@ -335,7 +335,7 @@ spg_kd_inner_consistent(PG_FUNCTION_ARGS) } /* Set up level increments, too */ - out->levelAdds = (int *) palloc(sizeof(int) * 2); + out->levelAdds = palloc_array(int, 2); out->levelAdds[0] = 1; out->levelAdds[1] = 1; diff --git a/src/backend/access/spgist/spgproc.c b/src/backend/access/spgist/spgproc.c index 660009291da4d..e169c60a46340 100644 --- a/src/backend/access/spgist/spgproc.c +++ b/src/backend/access/spgist/spgproc.c @@ -51,7 +51,7 @@ point_box_distance(Point *point, BOX *box) else dy = 0.0; - return HYPOT(dx, dy); + return hypot(dx, dy); } /* @@ -64,7 +64,7 @@ spg_key_orderbys_distances(Datum key, bool isLeaf, ScanKey orderbys, int norderbys) { int sk_num; - double *distances = (double *) palloc(norderbys * sizeof(double)), + double *distances = palloc_array(double, norderbys), *distance = distances; for (sk_num = 0; sk_num < norderbys; ++sk_num, ++orderbys, ++distance) @@ -81,7 +81,7 @@ spg_key_orderbys_distances(Datum key, bool isLeaf, BOX * box_copy(BOX *orig) { - BOX *result = palloc(sizeof(BOX)); + BOX *result = palloc_object(BOX); *result = *orig; return result; diff --git a/src/backend/access/spgist/spgquadtreeproc.c b/src/backend/access/spgist/spgquadtreeproc.c index 3e8cfa1709af3..75ffb09ca5aaa 100644 --- a/src/backend/access/spgist/spgquadtreeproc.c +++ b/src/backend/access/spgist/spgquadtreeproc.c @@ -82,7 +82,7 @@ getQuadrant(Point *centroid, Point *tst) static BOX * getQuadrantArea(BOX *bbox, Point *centroid, int quadrant) { - BOX *result = (BOX *) palloc(sizeof(BOX)); + BOX *result = palloc_object(BOX); switch (quadrant) { @@ -177,11 +177,11 @@ spg_quad_picksplit(PG_FUNCTION_ARGS) /* Use the median values of x and y as the centroid point */ Point **sorted; - sorted = palloc(sizeof(*sorted) * in->nTuples); + sorted = palloc_array(Point *, in->nTuples); for (i = 0; i < in->nTuples; i++) sorted[i] = DatumGetPointP(in->datums[i]); - centroid = palloc(sizeof(*centroid)); + centroid = palloc_object(Point); qsort(sorted, in->nTuples, sizeof(*sorted), x_cmp); centroid->x = sorted[in->nTuples >> 1]->x; @@ -189,7 +189,7 @@ spg_quad_picksplit(PG_FUNCTION_ARGS) centroid->y = sorted[in->nTuples >> 1]->y; #else /* Use the average values of x and y as the centroid point */ - centroid = palloc0(sizeof(*centroid)); + centroid = palloc0_object(Point); for (i = 0; i < in->nTuples; i++) { @@ -207,8 +207,8 @@ spg_quad_picksplit(PG_FUNCTION_ARGS) out->nNodes = 4; out->nodeLabels = NULL; /* we don't need node labels */ - out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples); - out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples); + out->mapTuplesToNodes = palloc_array(int, in->nTuples); + out->leafTupleDatums = palloc_array(Datum, in->nTuples); for (i = 0; i < in->nTuples; i++) { @@ -246,8 +246,8 @@ spg_quad_inner_consistent(PG_FUNCTION_ARGS) */ if (in->norderbys > 0) { - out->distances = (double **) palloc(sizeof(double *) * in->nNodes); - out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes); + out->distances = palloc_array(double *, in->nNodes); + out->traversalValues = palloc_array(void *, in->nNodes); if (in->level == 0) { @@ -270,7 +270,7 @@ spg_quad_inner_consistent(PG_FUNCTION_ARGS) { /* Report that all nodes should be visited */ out->nNodes = in->nNodes; - out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); + out->nodeNumbers = palloc_array(int, in->nNodes); for (i = 0; i < in->nNodes; i++) { 
out->nodeNumbers[i] = i; @@ -368,12 +368,12 @@ spg_quad_inner_consistent(PG_FUNCTION_ARGS) break; /* no need to consider remaining conditions */ } - out->levelAdds = palloc(sizeof(int) * 4); + out->levelAdds = palloc_array(int, 4); for (i = 0; i < 4; ++i) out->levelAdds[i] = 1; /* We must descend into the quadrant(s) identified by which */ - out->nodeNumbers = (int *) palloc(sizeof(int) * 4); + out->nodeNumbers = palloc_array(int, 4); out->nNodes = 0; for (i = 1; i <= 4; i++) diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 25893050c5876..946772f39576b 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -309,9 +309,9 @@ spgbeginscan(Relation rel, int keysz, int orderbysz) scan = RelationGetIndexScan(rel, keysz, orderbysz); - so = (SpGistScanOpaque) palloc0(sizeof(SpGistScanOpaqueData)); + so = palloc0_object(SpGistScanOpaqueData); if (keysz > 0) - so->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * keysz); + so->keyData = palloc_array(ScanKeyData, keysz); else so->keyData = NULL; initSpGistState(&so->state, scan->indexRelation); @@ -336,16 +336,12 @@ spgbeginscan(Relation rel, int keysz, int orderbysz) if (scan->numberOfOrderBys > 0) { /* This will be filled in spgrescan, but allocate the space here */ - so->orderByTypes = (Oid *) - palloc(sizeof(Oid) * scan->numberOfOrderBys); - so->nonNullOrderByOffsets = (int *) - palloc(sizeof(int) * scan->numberOfOrderBys); + so->orderByTypes = palloc_array(Oid, scan->numberOfOrderBys); + so->nonNullOrderByOffsets = palloc_array(int, scan->numberOfOrderBys); /* These arrays have constant contents, so we can fill them now */ - so->zeroDistances = (double *) - palloc(sizeof(double) * scan->numberOfOrderBys); - so->infDistances = (double *) - palloc(sizeof(double) * scan->numberOfOrderBys); + so->zeroDistances = palloc_array(double, scan->numberOfOrderBys); + so->infDistances = palloc_array(double, scan->numberOfOrderBys); for (i = 0; i < scan->numberOfOrderBys; i++) { @@ -353,10 +349,8 @@ spgbeginscan(Relation rel, int keysz, int orderbysz) so->infDistances[i] = get_float8_infinity(); } - scan->xs_orderbyvals = (Datum *) - palloc0(sizeof(Datum) * scan->numberOfOrderBys); - scan->xs_orderbynulls = (bool *) - palloc(sizeof(bool) * scan->numberOfOrderBys); + scan->xs_orderbyvals = palloc0_array(Datum, scan->numberOfOrderBys); + scan->xs_orderbynulls = palloc_array(bool, scan->numberOfOrderBys); memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys); } @@ -690,7 +684,7 @@ spgInnerTest(SpGistScanOpaque so, SpGistSearchItem *item, { /* force all children to be visited */ out.nNodes = nNodes; - out.nodeNumbers = (int *) palloc(sizeof(int) * nNodes); + out.nodeNumbers = palloc_array(int, nNodes); for (i = 0; i < nNodes; i++) out.nodeNumbers[i] = i; } @@ -703,7 +697,7 @@ spgInnerTest(SpGistScanOpaque so, SpGistSearchItem *item, { /* collect node pointers */ SpGistNodeTuple node; - SpGistNodeTuple *nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * nNodes); + SpGistNodeTuple *nodes = palloc_array(SpGistNodeTuple, nNodes); SGITITERATE(innerTuple, i, node) { @@ -972,8 +966,8 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, so->distances[so->nPtrs] = NULL; else { - IndexOrderByDistance *distances = - palloc(sizeof(distances[0]) * so->numberOfOrderBys); + IndexOrderByDistance *distances = palloc_array(IndexOrderByDistance, + so->numberOfOrderBys); int i; for (i = 0; i < so->numberOfOrderBys; i++) diff --git 
a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c index 73842655f086c..09be7ffe47eb6 100644 --- a/src/backend/access/spgist/spgtextproc.c +++ b/src/backend/access/spgist/spgtextproc.c @@ -155,7 +155,7 @@ commonPrefix(const char *a, const char *b, int lena, int lenb) * On success, *i gets the match location; on failure, it gets where to insert */ static bool -searchChar(Datum *nodeLabels, int nNodes, int16 c, int *i) +searchChar(const Datum *nodeLabels, int nNodes, int16 c, int *i) { int StopLow = 0, StopHigh = nNodes; @@ -230,8 +230,7 @@ spg_text_choose(PG_FUNCTION_ARGS) formTextDatum(prefixStr, commonLen); } out->result.splitTuple.prefixNNodes = 1; - out->result.splitTuple.prefixNodeLabels = - (Datum *) palloc(sizeof(Datum)); + out->result.splitTuple.prefixNodeLabels = palloc_object(Datum); out->result.splitTuple.prefixNodeLabels[0] = Int16GetDatum(*(unsigned char *) (prefixStr + commonLen)); @@ -303,7 +302,7 @@ spg_text_choose(PG_FUNCTION_ARGS) out->result.splitTuple.prefixHasPrefix = in->hasPrefix; out->result.splitTuple.prefixPrefixDatum = in->prefixDatum; out->result.splitTuple.prefixNNodes = 1; - out->result.splitTuple.prefixNodeLabels = (Datum *) palloc(sizeof(Datum)); + out->result.splitTuple.prefixNodeLabels = palloc_object(Datum); out->result.splitTuple.prefixNodeLabels[0] = Int16GetDatum(-2); out->result.splitTuple.childNodeN = 0; out->result.splitTuple.postfixHasPrefix = false; @@ -371,7 +370,7 @@ spg_text_picksplit(PG_FUNCTION_ARGS) } /* Extract the node label (first non-common byte) from each value */ - nodes = (spgNodePtr *) palloc(sizeof(spgNodePtr) * in->nTuples); + nodes = palloc_array(spgNodePtr, in->nTuples); for (i = 0; i < in->nTuples; i++) { @@ -394,9 +393,9 @@ spg_text_picksplit(PG_FUNCTION_ARGS) /* And emit results */ out->nNodes = 0; - out->nodeLabels = (Datum *) palloc(sizeof(Datum) * in->nTuples); - out->mapTuplesToNodes = (int *) palloc(sizeof(int) * in->nTuples); - out->leafTupleDatums = (Datum *) palloc(sizeof(Datum) * in->nTuples); + out->nodeLabels = palloc_array(Datum, in->nTuples); + out->mapTuplesToNodes = palloc_array(int, in->nTuples); + out->leafTupleDatums = palloc_array(Datum, in->nTuples); for (i = 0; i < in->nTuples; i++) { @@ -476,9 +475,9 @@ spg_text_inner_consistent(PG_FUNCTION_ARGS) * and see if it's consistent with the query. If so, emit an entry into * the output arrays. */ - out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); - out->levelAdds = (int *) palloc(sizeof(int) * in->nNodes); - out->reconstructedValues = (Datum *) palloc(sizeof(Datum) * in->nNodes); + out->nodeNumbers = palloc_array(int, in->nNodes); + out->levelAdds = palloc_array(int, in->nNodes); + out->reconstructedValues = palloc_array(Datum, in->nNodes); out->nNodes = 0; for (i = 0; i < in->nNodes; i++) diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 95fea74e296f8..a60ec85e8bedc 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -785,7 +785,7 @@ SpGistGetInnerTypeSize(SpGistTypeDesc *att, Datum datum) else if (att->attlen > 0) size = att->attlen; else - size = VARSIZE_ANY(datum); + size = VARSIZE_ANY(DatumGetPointer(datum)); return MAXALIGN(size); } @@ -804,7 +804,7 @@ memcpyInnerDatum(void *target, SpGistTypeDesc *att, Datum datum) } else { - size = (att->attlen > 0) ? att->attlen : VARSIZE_ANY(datum); + size = (att->attlen > 0) ? 
att->attlen : VARSIZE_ANY(DatumGetPointer(datum)); memcpy(target, DatumGetPointer(datum), size); } } @@ -868,7 +868,7 @@ SpGistGetLeafTupleSize(TupleDesc tupleDescriptor, * Construct a leaf tuple containing the given heap TID and datum values */ SpGistLeafTuple -spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr, +spgFormLeafTuple(SpGistState *state, const ItemPointerData *heapPtr, const Datum *datums, const bool *isnulls) { SpGistLeafTuple tup; @@ -1177,7 +1177,7 @@ spgExtractNodeLabels(SpGistState *state, SpGistInnerTuple innerTuple) } else { - nodeLabels = (Datum *) palloc(sizeof(Datum) * innerTuple->nNodes); + nodeLabels = palloc_array(Datum, innerTuple->nNodes); SGITITERATE(innerTuple, i, node) { if (IndexTupleHasNulls(node)) @@ -1200,7 +1200,7 @@ spgExtractNodeLabels(SpGistState *state, SpGistInnerTuple innerTuple) * rather than returning InvalidOffsetNumber. */ OffsetNumber -SpGistPageAddNewItem(SpGistState *state, Page page, Item item, Size size, +SpGistPageAddNewItem(SpGistState *state, Page page, const void *item, Size size, OffsetNumber *startOffset, bool errorOK) { SpGistPageOpaque opaque = SpGistPageGetOpaque(page); diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 2678f7ab7829a..cb5671c1a4e95 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -61,7 +61,7 @@ typedef struct spgBulkDeleteState * ensures that scans of the list don't miss items added during the scan. */ static void -spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid) +spgAddPendingTID(spgBulkDeleteState *bds, const ItemPointerData *tid) { spgVacPendingItem *pitem; spgVacPendingItem **listLink; @@ -76,7 +76,7 @@ spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid) listLink = &pitem->next; } /* not there, so append new entry */ - pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem)); + pitem = palloc_object(spgVacPendingItem); pitem->tid = *tid; pitem->done = false; pitem->next = NULL; @@ -626,7 +626,7 @@ spgvacuumpage(spgBulkDeleteState *bds, Buffer buffer) Page page; LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (PageIsNew(page)) { @@ -707,7 +707,7 @@ spgprocesspending(spgBulkDeleteState *bds) buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, bds->info->strategy); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); if (PageIsNew(page) || SpGistPageIsDeleted(page)) { @@ -954,7 +954,7 @@ spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* allocate stats if first time through, else re-use existing struct */ if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); bds.info = info; bds.stats = stats; bds.callback = callback; @@ -994,7 +994,7 @@ spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) */ if (stats == NULL) { - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats = palloc0_object(IndexBulkDeleteResult); bds.info = info; bds.stats = stats; bds.callback = dummy_callback; diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index b7986e6f7131e..f635be0698fbb 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -47,7 +47,7 @@ fillFakeState(SpGistState *state, spgxlogState stateSrc) * existing tuple, it had better be a 
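
Aside on the allocation-macro conversions running through these SP-GiST changes: palloc(sizeof(type)) becomes palloc_object(type), palloc(sizeof(type) * n) becomes palloc_array(type, n), and likewise for the zeroing variants. The sketch below shows why that shape is safer, using malloc-based stand-ins rather than the real macros from src/include/utils/palloc.h (which allocate from the current memory context and may differ in detail); the stand-in names are illustrative only.

#include <stdlib.h>

/* Illustrative stand-ins for palloc_object()/palloc_array(): the element
 * type is written exactly once, so the sizeof can never drift out of sync
 * with the result cast */
#define alloc_object(type)			((type *) malloc(sizeof(type)))
#define alloc0_object(type)			((type *) calloc(1, sizeof(type)))
#define alloc_array(type, count)	((type *) malloc(sizeof(type) * (count)))

typedef struct Point
{
	double		x;
	double		y;
} Point;

int
main(void)
{
	/* Before: p = (Point *) palloc(sizeof(Point)); -- type named twice.
	 * After:  p = palloc_object(Point);            -- type named once. */
	Point	   *centroid = alloc0_object(Point);
	double	   *distances = alloc_array(double, 4);

	distances[0] = centroid->x; /* 0.0: the zeroing variant cleared it */

	free(distances);
	free(centroid);
	return 0;
}
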
placeholder tuple. */ static void -addOrReplaceTuple(Page page, Item tuple, int size, OffsetNumber offset) +addOrReplaceTuple(Page page, const void *tuple, int size, OffsetNumber offset) { if (offset <= PageGetMaxOffsetNumber(page)) { @@ -110,8 +110,7 @@ spgRedoAddLeaf(XLogReaderState *record) if (xldata->offnumLeaf != xldata->offnumHeadLeaf) { /* normal cases, tuple was added by SpGistPageAddNewItem */ - addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size, - xldata->offnumLeaf); + addOrReplaceTuple(page, leafTuple, leafTupleHdr.size, xldata->offnumLeaf); /* update head tuple's chain link if needed */ if (xldata->offnumHeadLeaf != InvalidOffsetNumber) @@ -129,7 +128,7 @@ spgRedoAddLeaf(XLogReaderState *record) /* replacing a DEAD tuple */ PageIndexTupleDelete(page, xldata->offnumLeaf); if (PageAddItem(page, - (Item) leafTuple, leafTupleHdr.size, + leafTuple, leafTupleHdr.size, xldata->offnumLeaf, false, false) != xldata->offnumLeaf) elog(ERROR, "failed to add item of size %u to SPGiST index page", leafTupleHdr.size); @@ -232,8 +231,7 @@ spgRedoMoveLeafs(XLogReaderState *record) memcpy(&leafTupleHdr, leafTuple, sizeof(SpGistLeafTupleData)); - addOrReplaceTuple(page, (Item) leafTuple, - leafTupleHdr.size, toInsert[i]); + addOrReplaceTuple(page, leafTuple, leafTupleHdr.size, toInsert[i]); ptr += leafTupleHdr.size; } @@ -309,7 +307,7 @@ spgRedoAddNode(XLogReaderState *record) page = BufferGetPage(buffer); PageIndexTupleDelete(page, xldata->offnum); - if (PageAddItem(page, (Item) innerTuple, innerTupleHdr.size, + if (PageAddItem(page, innerTuple, innerTupleHdr.size, xldata->offnum, false, false) != xldata->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", @@ -351,8 +349,7 @@ spgRedoAddNode(XLogReaderState *record) { page = BufferGetPage(buffer); - addOrReplaceTuple(page, (Item) innerTuple, - innerTupleHdr.size, xldata->offnumNew); + addOrReplaceTuple(page, innerTuple, innerTupleHdr.size, xldata->offnumNew); /* * If parent is in this same page, update it now. 
@@ -390,7 +387,7 @@ spgRedoAddNode(XLogReaderState *record) xldata->offnumNew); PageIndexTupleDelete(page, xldata->offnum); - if (PageAddItem(page, (Item) dt, dt->size, + if (PageAddItem(page, dt, dt->size, xldata->offnum, false, false) != xldata->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", @@ -492,8 +489,7 @@ spgRedoSplitTuple(XLogReaderState *record) { page = BufferGetPage(buffer); - addOrReplaceTuple(page, (Item) postfixTuple, - postfixTupleHdr.size, xldata->offnumPostfix); + addOrReplaceTuple(page, postfixTuple, postfixTupleHdr.size, xldata->offnumPostfix); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -508,15 +504,13 @@ spgRedoSplitTuple(XLogReaderState *record) page = BufferGetPage(buffer); PageIndexTupleDelete(page, xldata->offnumPrefix); - if (PageAddItem(page, (Item) prefixTuple, prefixTupleHdr.size, + if (PageAddItem(page, prefixTuple, prefixTupleHdr.size, xldata->offnumPrefix, false, false) != xldata->offnumPrefix) elog(ERROR, "failed to add item of size %u to SPGiST index page", prefixTupleHdr.size); if (xldata->postfixBlkSame) - addOrReplaceTuple(page, (Item) postfixTuple, - postfixTupleHdr.size, - xldata->offnumPostfix); + addOrReplaceTuple(page, postfixTuple, postfixTupleHdr.size, xldata->offnumPostfix); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -576,7 +570,7 @@ spgRedoPickSplit(XLogReaderState *record) { /* just re-init the source page */ srcBuffer = XLogInitBufferForRedo(record, 0); - srcPage = (Page) BufferGetPage(srcBuffer); + srcPage = BufferGetPage(srcBuffer); SpGistInitBuffer(srcBuffer, SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); @@ -629,7 +623,7 @@ spgRedoPickSplit(XLogReaderState *record) { /* just re-init the dest page */ destBuffer = XLogInitBufferForRedo(record, 1); - destPage = (Page) BufferGetPage(destBuffer); + destPage = BufferGetPage(destBuffer); SpGistInitBuffer(destBuffer, SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); @@ -642,7 +636,7 @@ spgRedoPickSplit(XLogReaderState *record) * full-page-image case, but for safety let's hold it till later. 
*/ if (XLogReadBufferForRedo(record, 1, &destBuffer) == BLK_NEEDS_REDO) - destPage = (Page) BufferGetPage(destBuffer); + destPage = BufferGetPage(destBuffer); else destPage = NULL; /* don't do any page updates */ } @@ -662,8 +656,7 @@ spgRedoPickSplit(XLogReaderState *record) if (page == NULL) continue; /* no need to touch this page */ - addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size, - toInsert[i]); + addOrReplaceTuple(page, leafTuple, leafTupleHdr.size, toInsert[i]); } /* Now update src and dest page LSNs if needed */ @@ -692,8 +685,7 @@ spgRedoPickSplit(XLogReaderState *record) { page = BufferGetPage(innerBuffer); - addOrReplaceTuple(page, (Item) innerTuple, innerTupleHdr.size, - xldata->offnumInner); + addOrReplaceTuple(page, innerTuple, innerTupleHdr.size, xldata->offnumInner); /* if inner is also parent, update link while we're here */ if (xldata->innerIsParent) @@ -909,7 +901,7 @@ spgRedoVacuumRedirect(XLogReaderState *record) int max = PageGetMaxOffsetNumber(page); OffsetNumber *toDelete; - toDelete = palloc(sizeof(OffsetNumber) * max); + toDelete = palloc_array(OffsetNumber, max); for (i = xldata->firstPlaceholder; i <= max; i++) toDelete[i - xldata->firstPlaceholder] = i; diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index a56c5eceb14ad..73ebc01a08fb3 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -110,7 +110,7 @@ table_slot_create(Relation relation, List **reglist) */ TableScanDesc -table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key) +table_beginscan_catalog(Relation relation, int nkeys, ScanKeyData *key) { uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT; @@ -188,6 +188,37 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) pscan, flags); } +TableScanDesc +table_beginscan_parallel_tidrange(Relation relation, + ParallelTableScanDesc pscan) +{ + Snapshot snapshot; + uint32 flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE; + TableScanDesc sscan; + + Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); + + /* disable syncscan in parallel tid range scan. */ + pscan->phs_syncscan = false; + + if (!pscan->phs_snapshot_any) + { + /* Snapshot was serialized -- restore it */ + snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); + RegisterSnapshot(snapshot); + flags |= SO_TEMP_SNAPSHOT; + } + else + { + /* SnapshotAny passed by caller (not serialized) */ + snapshot = SnapshotAny; + } + + sscan = relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL, + pscan, flags); + return sscan; +} + /* ---------------------------------------------------------------------------- * Index scan related functions. @@ -398,6 +429,7 @@ table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan) bpscan->phs_nblocks > NBuffers / 4; SpinLockInit(&bpscan->phs_mutex); bpscan->phs_startblock = InvalidBlockNumber; + bpscan->phs_numblock = InvalidBlockNumber; pg_atomic_init_u64(&bpscan->phs_nallocated, 0); return sizeof(ParallelBlockTableScanDescData); @@ -416,57 +448,59 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) * * Determine where the parallel seq scan should start. This function may be * called many times, once by each parallel worker. We must be careful only - * to set the startblock once. + * to set the phs_startblock and phs_numblock fields once. 
+ *
+ * Callers may optionally specify a non-InvalidBlockNumber value for
+ * 'startblock' to force the scan to start at the given page. Likewise,
+ * 'numblocks' can be set to a non-InvalidBlockNumber value to limit the
+ * scan to that many blocks.
 */
void
table_block_parallelscan_startblock_init(Relation rel,
										 ParallelBlockTableScanWorker pbscanwork,
-										 ParallelBlockTableScanDesc pbscan)
+										 ParallelBlockTableScanDesc pbscan,
+										 BlockNumber startblock,
+										 BlockNumber numblocks)
 {
+	StaticAssertDecl(MaxBlockNumber <= 0xFFFFFFFE,
+					 "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
+
 	BlockNumber sync_startpage = InvalidBlockNumber;
+	BlockNumber scan_nblocks;
 
 	/* Reset the state we use for controlling allocation size. */
 	memset(pbscanwork, 0, sizeof(*pbscanwork));
 
-	StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
-					 "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
-
-	/*
-	 * We determine the chunk size based on the size of the relation. First we
-	 * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then
-	 * take the next highest power of 2 number of the chunk size. This means
-	 * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
-	 * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
-	 */
-	pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
-													   PARALLEL_SEQSCAN_NCHUNKS, 1));
-
-	/*
-	 * Ensure we don't go over the maximum chunk size with larger tables. This
-	 * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
-	 * tables. Too large a chunk size has been shown to be detrimental to
-	 * synchronous scan performance.
-	 */
-	pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
-									  PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
-
 retry:
 	/* Grab the spinlock. */
 	SpinLockAcquire(&pbscan->phs_mutex);
 
 	/*
-	 * If the scan's startblock has not yet been initialized, we must do so
-	 * now. If this is not a synchronized scan, we just start at block 0, but
-	 * if it is a synchronized scan, we must get the starting position from
-	 * the synchronized scan machinery. We can't hold the spinlock while
-	 * doing that, though, so release the spinlock, get the information we
-	 * need, and retry. If nobody else has initialized the scan in the
-	 * meantime, we'll fill in the value we fetched on the second time
-	 * through.
+	 * When the caller has specified a limit on the number of blocks to scan,
+	 * set that in the ParallelBlockTableScanDesc, if another worker hasn't
+	 * done so already.
+	 */
+	if (numblocks != InvalidBlockNumber &&
+		pbscan->phs_numblock == InvalidBlockNumber)
+	{
+		pbscan->phs_numblock = numblocks;
+	}
+
+	/*
+	 * If the scan's phs_startblock has not yet been initialized, we must do
+	 * so now. If a startblock was specified, start there; otherwise, if this
+	 * is not a synchronized scan, we just start at block 0, but if it is a
+	 * synchronized scan, we must get the starting position from the
+	 * synchronized scan machinery. We can't hold the spinlock while doing
+	 * that, though, so release the spinlock, get the information we need, and
+	 * retry. If nobody else has initialized the scan in the meantime, we'll
+	 * fill in the value we fetched on the second time through.
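+	 *
+	 * (We must release the spinlock before consulting the sync-scan
+	 * machinery: spinlocks may only be held for a few straight-line
+	 * instructions, never across calls into other subsystems.)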
 	 */
 	if (pbscan->phs_startblock == InvalidBlockNumber)
 	{
-		if (!pbscan->base.phs_syncscan)
+		if (startblock != InvalidBlockNumber)
+			pbscan->phs_startblock = startblock;
+		else if (!pbscan->base.phs_syncscan)
 			pbscan->phs_startblock = 0;
 		else if (sync_startpage != InvalidBlockNumber)
 			pbscan->phs_startblock = sync_startpage;
@@ -478,6 +512,34 @@ table_block_parallelscan_startblock_init(Relation rel,
 		}
 	}
 	SpinLockRelease(&pbscan->phs_mutex);
+
+	/*
+	 * Figure out how many blocks we're going to scan; either all of them, or
+	 * just phs_numblock's worth, if a limit has been imposed.
+	 */
+	if (pbscan->phs_numblock == InvalidBlockNumber)
+		scan_nblocks = pbscan->phs_nblocks;
+	else
+		scan_nblocks = pbscan->phs_numblock;
+
+	/*
+	 * We determine the chunk size based on scan_nblocks. First we split
+	 * scan_nblocks into PARALLEL_SEQSCAN_NCHUNKS chunks, then we round the
+	 * result up to the next highest power of 2. This means we split the
+	 * blocks we're scanning into somewhere between PARALLEL_SEQSCAN_NCHUNKS
+	 * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
+	 */
+	pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(scan_nblocks /
+													   PARALLEL_SEQSCAN_NCHUNKS, 1));
+
+	/*
+	 * Ensure we don't go over the maximum chunk size with larger tables. This
+	 * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
+	 * tables. Too large a chunk size has been shown to be detrimental to
+	 * sequential scan performance.
+	 */
+	pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
+									  PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
 }
 
 /*
@@ -493,6 +555,7 @@ table_block_parallelscan_nextpage(Relation rel,
 								  ParallelBlockTableScanWorker pbscanwork,
 								  ParallelBlockTableScanDesc pbscan)
 {
+	BlockNumber scan_nblocks;
 	BlockNumber page;
 	uint64		nallocated;
 
@@ -513,7 +576,7 @@ table_block_parallelscan_nextpage(Relation rel,
 	 *
 	 * Here we name these ranges of blocks "chunks". The initial size of
 	 * these chunks is determined in table_block_parallelscan_startblock_init
-	 * based on the size of the relation. Towards the end of the scan, we
+	 * based on the number of blocks to scan. Towards the end of the scan, we
 	 * start making reductions in the size of the chunks in order to attempt
 	 * to divide the remaining work over all the workers as evenly as
 	 * possible.
@@ -530,17 +593,23 @@ table_block_parallelscan_nextpage(Relation rel,
 	 * phs_nallocated counter will exceed rs_nblocks, because workers will
 	 * still increment the value, when they try to allocate the next block but
 	 * all blocks have been allocated already. The counter must be 64 bits
-	 * wide because of that, to avoid wrapping around when rs_nblocks is close
-	 * to 2^32.
+	 * wide because of that, to avoid wrapping around when scan_nblocks is
+	 * close to 2^32.
 	 *
 	 * The actual block to return is calculated by adding the counter to the
-	 * starting block number, modulo nblocks.
+	 * starting block number, modulo phs_nblocks.
 	 */
 
+	/* First, figure out how many blocks we're planning on scanning */
+	if (pbscan->phs_numblock == InvalidBlockNumber)
+		scan_nblocks = pbscan->phs_nblocks;
+	else
+		scan_nblocks = pbscan->phs_numblock;
+
 	/*
-	 * First check if we have any remaining blocks in a previous chunk for
-	 * this worker. We must consume all of the blocks from that before we
-	 * allocate a new chunk to the worker.
+	 * Now check if we have any remaining blocks in a previous chunk for this
+	 * worker. We must consume all of the blocks from that before we allocate
+	 * a new chunk to the worker.
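+	 *
+	 * (As an illustration: with scan_nblocks = 100000 and the usual
+	 * PARALLEL_SEQSCAN_NCHUNKS of 2048, startblock_init picked
+	 * pg_nextpower2_32(Max(100000 / 2048, 1)) = pg_nextpower2_32(48) = 64,
+	 * so each worker claims 64 consecutive blocks at a time until the
+	 * ramp-down phase near the end of the scan.)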
*/ if (pbscanwork->phsw_chunk_remaining > 0) { @@ -562,7 +631,7 @@ table_block_parallelscan_nextpage(Relation rel, * chunk size set to 1. */ if (pbscanwork->phsw_chunk_size > 1 && - pbscanwork->phsw_nallocated > pbscan->phs_nblocks - + pbscanwork->phsw_nallocated > scan_nblocks - (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS)) pbscanwork->phsw_chunk_size >>= 1; @@ -577,7 +646,8 @@ table_block_parallelscan_nextpage(Relation rel, pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1; } - if (nallocated >= pbscan->phs_nblocks) + /* Check if we've run out of blocks to scan */ + if (nallocated >= scan_nblocks) page = InvalidBlockNumber; /* all blocks have been allocated */ else page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks; diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c index b60fab0a4d294..11f97d65367d5 100644 --- a/src/backend/access/table/toast_helper.c +++ b/src/backend/access/table/toast_helper.c @@ -330,7 +330,7 @@ toast_delete_external(Relation rel, const Datum *values, const bool *isnull, if (isnull[i]) continue; - else if (VARATT_IS_EXTERNAL_ONDISK(value)) + else if (VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(value))) toast_delete_datum(rel, value, is_speculative); } } diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c index 8db813b89fc64..c6740dbdd1b75 100644 --- a/src/backend/access/tablesample/system.c +++ b/src/backend/access/tablesample/system.c @@ -129,7 +129,7 @@ system_samplescangetsamplesize(PlannerInfo *root, static void system_initsamplescan(SampleScanState *node, int eflags) { - node->tsm_state = palloc0(sizeof(SystemSamplerData)); + node->tsm_state = palloc0_object(SystemSamplerData); } /* diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 661c55a9db789..a32f473e0a22b 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -36,7 +36,8 @@ OBJS = \ xlogreader.o \ xlogrecovery.o \ xlogstats.o \ - xlogutils.o + xlogutils.o \ + xlogwait.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 48f10bec91e12..ea43b432dafc7 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -110,9 +110,7 @@ static SlruCtlData XactCtlData; #define XactCtl (&XactCtlData) -static int ZeroCLOGPage(int64 pageno, bool writeXlog); static bool CLOGPagePrecedes(int64 page1, int64 page2); -static void WriteZeroPageXlogRec(int64 pageno); static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact, Oid oldestXactDb); static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, @@ -383,7 +381,8 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, * write-busy, since we don't care if the update reaches disk sooner than * we think. */ - slotno = SimpleLruReadPage(XactCtl, pageno, XLogRecPtrIsInvalid(lsn), xid); + slotno = SimpleLruReadPage(XactCtl, pageno, !XLogRecPtrIsValid(lsn), + xid); /* * Set the main transaction id, if any. @@ -707,7 +706,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i * recovery. After recovery completes the next clog change will set the * LSN correctly. 
*/ - if (!XLogRecPtrIsInvalid(lsn)) + if (XLogRecPtrIsValid(lsn)) { int lsnindex = GetLSNIndex(slotno, xid); @@ -832,41 +831,8 @@ check_transaction_buffers(int *newval, void **extra, GucSource source) void BootStrapCLOG(void) { - int slotno; - LWLock *lock = SimpleLruGetBankLock(XactCtl, 0); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the commit log */ - slotno = ZeroCLOGPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of CLOG to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroCLOGPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(XactCtl, pageno); - - if (writeXlog) - WriteZeroPageXlogRec(pageno); - - return slotno; + /* Zero the initial page and flush it to disk */ + SimpleLruZeroAndWritePage(XactCtl, 0); } /* @@ -974,8 +940,9 @@ ExtendCLOG(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCLOGPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(XactCtl, pageno); + XLogSimpleInsertInt64(RM_CLOG_ID, CLOG_ZEROPAGE, pageno); LWLockRelease(lock); } @@ -1067,17 +1034,6 @@ CLOGPagePrecedes(int64 page1, int64 page2) } -/* - * Write a ZEROPAGE xlog record - */ -static void -WriteZeroPageXlogRec(int64 pageno) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); -} - /* * Write a TRUNCATE xlog record * @@ -1114,19 +1070,9 @@ clog_redo(XLogReaderState *record) if (info == CLOG_ZEROPAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(XactCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroCLOGPage(pageno, false); - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(XactCtl, pageno); } else if (info == CLOG_TRUNCATE) { diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 113fae1437ad8..370b38e048b91 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -114,11 +114,9 @@ static void SetXidCommitTsInPage(TransactionId xid, int nsubxids, static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, RepOriginId nodeid, int slotno); static void error_commit_ts_disabled(void); -static int ZeroCommitTsPage(int64 pageno, bool writeXlog); static bool CommitTsPagePrecedes(int64 page1, int64 page2); static void ActivateCommitTs(void); static void DeactivateCommitTs(void); -static void WriteZeroPageXlogRec(int64 pageno); static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid); /* @@ -602,28 +600,6 @@ BootStrapCommitTs(void) */ } -/* - * Initialize (or reinitialize) a page of CommitTs to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. 
- */ -static int -ZeroCommitTsPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(CommitTsCtl, pageno); - - if (writeXlog) - WriteZeroPageXlogRec(pageno); - - return slotno; -} - /* * This must be called ONCE during postmaster or standalone-backend startup, * after StartupXLOG has initialized TransamVariables->nextXid. @@ -707,6 +683,13 @@ ActivateCommitTs(void) TransactionId xid; int64 pageno; + /* + * During bootstrap, we should not register commit timestamps so skip the + * activation in this case. + */ + if (IsBootstrapProcessingMode()) + return; + /* If we've done this already, there's nothing to do */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); if (commitTsShared->commitTsActive) @@ -747,16 +730,7 @@ ActivateCommitTs(void) /* Create the current segment file, if necessary */ if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) - { - LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - int slotno; - - LWLockAcquire(lock, LW_EXCLUSIVE); - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - LWLockRelease(lock); - } + SimpleLruZeroAndWritePage(CommitTsCtl, pageno); /* Change the activation status in shared memory. */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); @@ -867,8 +841,12 @@ ExtendCommitTs(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCommitTsPage(pageno, !InRecovery); + /* Zero the page ... */ + SimpleLruZeroPage(CommitTsCtl, pageno); + + /* and make a WAL entry about that, unless we're in REDO */ + if (!InRecovery) + XLogSimpleInsertInt64(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE, pageno); LWLockRelease(lock); } @@ -982,17 +960,6 @@ CommitTsPagePrecedes(int64 page1, int64 page2) } -/* - * Write a ZEROPAGE xlog record - */ -static void -WriteZeroPageXlogRec(int64 pageno) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); -} - /* * Write a TRUNCATE xlog record */ @@ -1023,19 +990,9 @@ commit_ts_redo(XLogReaderState *record) if (info == COMMIT_TS_ZEROPAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(CommitTsCtl, pageno); } else if (info == COMMIT_TS_TRUNCATE) { diff --git a/src/backend/access/transam/meson.build b/src/backend/access/transam/meson.build index e8ae9b13c8e49..74a62ab3eab6b 100644 --- a/src/backend/access/transam/meson.build +++ b/src/backend/access/transam/meson.build @@ -24,6 +24,7 @@ backend_sources += files( 'xlogrecovery.c', 'xlogstats.c', 'xlogutils.c', + 'xlogwait.c', ) # used by frontend programs to build a frontend xlogreader diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 3c06ac45532f8..34956a5a6634a 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -69,17 +69,13 @@ #include "postgres.h" #include "access/multixact.h" +#include "access/multixact_internal.h" #include "access/slru.h" -#include "access/transam.h" #include "access/twophase.h" #include "access/twophase_rmgr.h" -#include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" 
#include "access/xlogutils.h" -#include "commands/dbcommands.h" -#include "funcapi.h" -#include "lib/ilist.h" #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" @@ -87,136 +83,27 @@ #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procarray.h" -#include "utils/fmgrprotos.h" #include "utils/guc_hooks.h" #include "utils/injection_point.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" /* - * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is - * used everywhere else in Postgres. - * - * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, - * MultiXact page numbering also wraps around at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need - * take no explicit notice of that fact in this module, except when comparing - * segment and page numbers in TruncateMultiXact (see - * MultiXactOffsetPagePrecedes). + * Thresholds used to keep members disk usage in check when multixids have a + * lot of members. When MULTIXACT_MEMBER_LOW_THRESHOLD is reached, vacuum + * starts freezing multixids more aggressively, even if the normal multixid + * age limits haven't been reached yet. */ +#define MULTIXACT_MEMBER_LOW_THRESHOLD UINT64CONST(2000000000) +#define MULTIXACT_MEMBER_HIGH_THRESHOLD UINT64CONST(4000000000) -/* We need four bytes per offset */ -#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) - -static inline int64 -MultiXactIdToOffsetPage(MultiXactId multi) -{ - return multi / MULTIXACT_OFFSETS_PER_PAGE; -} - -static inline int -MultiXactIdToOffsetEntry(MultiXactId multi) -{ - return multi % MULTIXACT_OFFSETS_PER_PAGE; -} - -static inline int64 -MultiXactIdToOffsetSegment(MultiXactId multi) -{ - return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT; -} - -/* - * The situation for members is a bit more complex: we store one byte of - * additional flag bits for each TransactionId. To do this without getting - * into alignment issues, we store four bytes of flags, and then the - * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and - * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups - * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and - * performance) trumps space efficiency here. - * - * Note that the "offset" macros work with byte offset, not array indexes, so - * arithmetic must be done using "char *" pointers. - */ -/* We need eight bits per xact, so one xact fits in a byte */ -#define MXACT_MEMBER_BITS_PER_XACT 8 -#define MXACT_MEMBER_FLAGS_PER_BYTE 1 -#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) - -/* how many full bytes of flags are there in a group? 
*/ -#define MULTIXACT_FLAGBYTES_PER_GROUP 4 -#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ - (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) -/* size in bytes of a complete group */ -#define MULTIXACT_MEMBERGROUP_SIZE \ - (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) -#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) -#define MULTIXACT_MEMBERS_PER_PAGE \ - (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) - -/* - * Because the number of items per page is not a divisor of the last item - * number (member 0xFFFFFFFF), the last segment does not use the maximum number - * of pages, and moreover the last used page therein does not use the same - * number of items as previous pages. (Another way to say it is that the - * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page - * has some empty space after that item.) - * - * This constant is the number of members in the last page of the last segment. - */ -#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ - ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) - -/* page in which a member is to be found */ -static inline int64 -MXOffsetToMemberPage(MultiXactOffset offset) -{ - return offset / MULTIXACT_MEMBERS_PER_PAGE; -} - -static inline int64 -MXOffsetToMemberSegment(MultiXactOffset offset) -{ - return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT; -} - -/* Location (byte offset within page) of flag word for a given member */ -static inline int -MXOffsetToFlagsOffset(MultiXactOffset offset) -{ - MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; - int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; - int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; - - return byteoff; -} - -static inline int -MXOffsetToFlagsBitShift(MultiXactOffset offset) -{ - int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; - int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; - - return bshift; -} - -/* Location (byte offset within page) of TransactionId of given member */ -static inline int -MXOffsetToMemberOffset(MultiXactOffset offset) +static inline MultiXactId +NextMultiXactId(MultiXactId multi) { - int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; - - return MXOffsetToFlagsOffset(offset) + - MULTIXACT_FLAGBYTES_PER_GROUP + - member_in_group * sizeof(TransactionId); + return multi == MaxMultiXactId ? FirstMultiXactId : multi + 1; } -/* Multixact members wraparound thresholds. */ -#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2) -#define MULTIXACT_MEMBER_DANGER_THRESHOLD \ - (MaxMultiXactOffset - MaxMultiXactOffset / 4) - static inline MultiXactId PreviousMultiXactId(MultiXactId multi) { @@ -260,11 +147,9 @@ typedef struct MultiXactStateData /* * Oldest multixact offset that is potentially referenced by a multixact - * referenced by a relation. We don't always know this value, so there's - * a flag here to indicate whether or not we currently do. + * referenced by a relation. */ MultiXactOffset oldestOffset; - bool oldestOffsetKnown; /* support for anti-wraparound measures */ MultiXactId multiVacLimit; @@ -272,15 +157,6 @@ typedef struct MultiXactStateData MultiXactId multiStopLimit; MultiXactId multiWrapLimit; - /* support for members anti-wraparound measures */ - MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - - /* - * This is used to sleep until a multixact offset is written when we want - * to create the next one. 
- */ - ConditionVariable nextoff_cv; - /* * Per-backend data starts here. We have two arrays stored in the area * immediately following the MultiXactStateData struct. Each is indexed by @@ -398,22 +274,13 @@ static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members); static void mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members); -static char *mxstatus_to_string(MultiXactStatus status); - /* management of SLRU infrastructure */ -static int ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog); -static int ZeroMultiXactMemberPage(int64 pageno, bool writeXlog); static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2); static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2); -static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, - MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, - MultiXactOffset start, uint32 distance); -static bool SetOffsetVacuumLimit(bool is_startup); +static void SetOldestOffset(void); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); -static void WriteMZeroPageXlogRec(int64 pageno, uint8 info); static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, MultiXactId endTruncOff, @@ -558,8 +425,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) * Note we have the same race condition here as above: j could be 0 at the * end of the loop. */ - newMembers = (MultiXactMember *) - palloc(sizeof(MultiXactMember) * (nmembers + 1)); + newMembers = palloc_array(MultiXactMember, nmembers + 1); for (i = 0, j = 0; i < nmembers; i++) { @@ -692,14 +558,7 @@ MultiXactIdSetOldestMember(void) */ LWLockAcquire(MultiXactGenLock, LW_SHARED); - /* - * We have to beware of the possibility that nextMXact is in the - * wrapped-around state. We don't fix the counter itself here, but we - * must be sure to store a valid value in our array entry. - */ nextMXact = MultiXactState->nextMXact; - if (nextMXact < FirstMultiXactId) - nextMXact = FirstMultiXactId; OldestMemberMXactId[MyProcNumber] = nextMXact; @@ -736,15 +595,7 @@ MultiXactIdSetOldestVisible(void) LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - /* - * We have to beware of the possibility that nextMXact is in the - * wrapped-around state. We don't fix the counter itself here, but we - * must be sure to store a valid value in our array entry. 
-	 */
 	oldestMXact = MultiXactState->nextMXact;
-
 	for (i = 0; i < MaxOldestSlot; i++)
 	{
 		MultiXactId thisoldest = OldestMemberMXactId[i];
@@ -777,9 +628,6 @@ ReadNextMultiXactId(void)
 	mxid = MultiXactState->nextMXact;
 	LWLockRelease(MultiXactGenLock);
 
-	if (mxid < FirstMultiXactId)
-		mxid = FirstMultiXactId;
-
 	return mxid;
 }
 
@@ -794,11 +642,6 @@ ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
 	*oldest = MultiXactState->oldestMultiXactId;
 	*next = MultiXactState->nextMXact;
 	LWLockRelease(MultiXactGenLock);
-
-	if (*oldest < FirstMultiXactId)
-		*oldest = FirstMultiXactId;
-	if (*next < FirstMultiXactId)
-		*next = FirstMultiXactId;
 }
 
@@ -921,13 +764,32 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	int			entryno;
 	int			slotno;
 	MultiXactOffset *offptr;
-	int			i;
+	MultiXactId next;
+	int64		next_pageno;
+	int			next_entryno;
+	MultiXactOffset *next_offptr;
+	MultiXactOffset next_offset;
 	LWLock	   *lock;
 	LWLock	   *prevlock = NULL;
 
+	/* position of this multixid in the offsets SLRU area */
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
+	/* position of the next multixid */
+	next = NextMultiXactId(multi);
+	next_pageno = MultiXactIdToOffsetPage(next);
+	next_entryno = MultiXactIdToOffsetEntry(next);
+
+	/*
+	 * Set the starting offset of this multixid's members.
+	 *
+	 * In the common case, it was already set by the RecordNewMultiXact call
+	 * for the previous multixid, since this multixid was that call's "next"
+	 * multixid. But if multiple backends are generating multixids
+	 * concurrently, we might race ahead and get called before the call for
+	 * the previous multixid.
+	 */
 	lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 	LWLockAcquire(lock, LW_EXCLUSIVE);
@@ -942,22 +804,54 @@
 	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 	offptr += entryno;
 
-	*offptr = offset;
+	if (*offptr != offset)
+	{
+		/* should already be set to the correct value, or not at all */
+		Assert(*offptr == 0);
+		*offptr = offset;
+		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+	}
 
-	MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+	/*
+	 * Set the next multixid's offset to the end of this multixid's members.
+	 */
+	if (next_pageno == pageno)
+	{
+		next_offptr = offptr + 1;
+	}
+	else
+	{
+		/* must be the first entry on the page */
+		Assert(next_entryno == 0 || next == FirstMultiXactId);
+
+		/* Swap the lock for a lock on the next page */
+		LWLockRelease(lock);
+		lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+
+		slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
+		next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+		next_offptr += next_entryno;
+	}
+
+	/* Like in GetNewMultiXactId(), skip over offset 0 */
+	next_offset = offset + nmembers;
+	if (next_offset == 0)
+		next_offset = 1;
+	if (*next_offptr != next_offset)
+	{
+		/* should already be set to the correct value, or not at all */
+		Assert(*next_offptr == 0);
+		*next_offptr = next_offset;
+		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+	}
 
 	/* Release MultiXactOffset SLRU lock. */
 	LWLockRelease(lock);
 
-	/*
-	 * If anybody was waiting to know the offset of this multixact ID we just
-	 * wrote, they can read it now, so wake them up.
- */ - ConditionVariableBroadcast(&MultiXactState->nextoff_cv); - prev_pageno = -1; - for (i = 0; i < nmembers; i++, offset++) + for (int i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr; uint32 *flagsptr; @@ -1042,10 +936,6 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - /* Handle wraparound of the nextMXact counter */ - if (MultiXactState->nextMXact < FirstMultiXactId) - MultiXactState->nextMXact = FirstMultiXactId; - /* Assign the MXID */ result = MultiXactState->nextMXact; @@ -1112,7 +1002,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) * request only once per 64K multis generated. This still gives * plenty of chances before we get into real trouble. */ - if (IsUnderPostmaster && (result % 65536) == 0) + if (IsUnderPostmaster && ((result % 65536) == 0 || result == FirstMultiXactId)) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); if (!MultiXactIdPrecedes(result, multiWarnLimit)) @@ -1143,98 +1033,31 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) /* Re-acquire lock and start over */ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); result = MultiXactState->nextMXact; - if (result < FirstMultiXactId) - result = FirstMultiXactId; } - /* Make sure there is room for the MXID in the file. */ - ExtendMultiXactOffset(result); + /* + * Make sure there is room for the next MXID in the file. Assigning this + * MXID sets the next MXID's offset already. + */ + ExtendMultiXactOffset(NextMultiXactId(result)); /* - * Reserve the members space, similarly to above. Also, be careful not to - * return zero as the starting offset for any multixact. See - * GetMultiXactIdMembers() for motivation. + * Reserve the members space, similarly to above. */ nextOffset = MultiXactState->nextOffset; - if (nextOffset == 0) - { - *offset = 1; - nmembers++; /* allocate member slot 0 too */ - } - else - *offset = nextOffset; - /*---------- - * Protect against overrun of the members space as well, with the - * following rules: - * - * If we're past offsetStopLimit, refuse to generate more multis. - * If we're close to offsetStopLimit, emit a warning. - * - * Arbitrarily, we start emitting warnings when we're 20 segments or less - * from offsetStopLimit. - * - * Note we haven't updated the shared state yet, so if we fail at this - * point, the multixact ID we grabbed can still be used by the next guy. - * - * Note that there is no point in forcing autovacuum runs here: the - * multixact freeze settings would have to be reduced for that to have any - * effect. - *---------- + /* + * Offsets are 64-bit integers and will never wrap around. Firstly, it + * would take an unrealistic amount of time and resources to consume 2^64 + * offsets. Secondly, multixid creation is WAL-logged, so you would run + * out of LSNs before reaching offset wraparound. Nevertheless, check for + * wraparound as a sanity check. 
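+	 *
+	 * (For scale: even at a billion members per second, consuming 2^64
+	 * offsets would take more than five hundred years.)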
*/ -#define OFFSET_WARN_SEGMENTS 20 - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, - nmembers)) - { - /* see comment in the corresponding offsets wraparound case */ - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - + if (nextOffset + nmembers < nextOffset) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("multixact \"members\" limit exceeded"), - errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", - "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", - MultiXactState->offsetStopLimit - nextOffset - 1, - nmembers, - MultiXactState->offsetStopLimit - nextOffset - 1), - errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.", - MultiXactState->oldestMultiXactDB))); - } - - /* - * Check whether we should kick autovacuum into action, to prevent members - * wraparound. NB we use a much larger window to trigger autovacuum than - * just the warning limit. The warning is just a measure of last resort - - * this is in line with GetNewTransactionId's behaviour. - */ - if (!MultiXactState->oldestOffsetKnown || - (MultiXactState->nextOffset - MultiXactState->oldestOffset - > MULTIXACT_MEMBER_SAFE_THRESHOLD)) - { - /* - * To avoid swamping the postmaster with signals, we issue the autovac - * request only when crossing a segment boundary. With default - * compilation settings that's roughly after 50k members. This still - * gives plenty of chances before we get into real trouble. - */ - if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != - (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - } - - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, - nextOffset, - nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) - ereport(WARNING, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", - "database with OID %u must be vacuumed before %d more multixact members are used", - MultiXactState->offsetStopLimit - nextOffset + nmembers, - MultiXactState->oldestMultiXactDB, - MultiXactState->offsetStopLimit - nextOffset + nmembers), - errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings."))); + errmsg("MultiXact members would wrap around"))); + *offset = nextOffset; ExtendMultiXactMember(nextOffset, nmembers); @@ -1250,21 +1073,14 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) /* * Advance counters. As in GetNewTransactionId(), this must not happen * until after file extension has succeeded! - * - * We don't care about MultiXactId wraparound here; it will be handled by - * the next iteration. But note that nextMXact may be InvalidMultiXactId - * or the first value on a segment-beginning page after this routine - * exits, so anyone else looking at the variable must be prepared to deal - * with either case. Similarly, nextOffset may be zero, but we won't use - * that as the actual start offset of the next multixact. 
*/ - (MultiXactState->nextMXact)++; - + MultiXactState->nextMXact = NextMultiXactId(result); MultiXactState->nextOffset += nmembers; LWLockRelease(MultiXactGenLock); - debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset); + debug_elog4(DEBUG2, "GetNew: returning %u offset %" PRIu64, + result, *offset); return result; } @@ -1305,15 +1121,12 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, int slotno; MultiXactOffset *offptr; MultiXactOffset offset; + MultiXactOffset nextMXOffset; int length; - int truelength; MultiXactId oldestMXact; MultiXactId nextMXact; - MultiXactId tmpMXact; - MultiXactOffset nextOffset; MultiXactMember *ptr; LWLock *lock; - bool slept = false; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); @@ -1360,14 +1173,12 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * error. * * Shared lock is enough here since we aren't modifying any global state. - * Acquire it just long enough to grab the current counter values. We may - * need both nextMXact and nextOffset; see below. + * Acquire it just long enough to grab the current counter values. */ LWLockAcquire(MultiXactGenLock, LW_SHARED); oldestMXact = MultiXactState->oldestMultiXactId; nextMXact = MultiXactState->nextMXact; - nextOffset = MultiXactState->nextOffset; LWLockRelease(MultiXactGenLock); @@ -1387,38 +1198,8 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * Find out the offset at which we need to start reading MultiXactMembers * and the number of members in the multixact. We determine the latter as * the difference between this multixact's starting offset and the next - * one's. However, there are some corner cases to worry about: - * - * 1. This multixact may be the latest one created, in which case there is - * no next one to look at. In this case the nextOffset value we just - * saved is the correct endpoint. - * - * 2. The next multixact may still be in process of being filled in: that - * is, another process may have done GetNewMultiXactId but not yet written - * the offset entry for that ID. In that scenario, it is guaranteed that - * the offset entry for that multixact exists (because GetNewMultiXactId - * won't release MultiXactGenLock until it does) but contains zero - * (because we are careful to pre-zero offset pages). Because - * GetNewMultiXactId will never return zero as the starting offset for a - * multixact, when we read zero as the next multixact's offset, we know we - * have this case. We handle this by sleeping on the condition variable - * we have just for this; the process in charge will signal the CV as soon - * as it has finished writing the multixact offset. - * - * 3. Because GetNewMultiXactId increments offset zero to offset one to - * handle case #2, there is an ambiguity near the point of offset - * wraparound. If we see next multixact's offset is one, is that our - * multixact's actual endpoint, or did it end at zero with a subsequent - * increment? We handle this using the knowledge that if the zero'th - * member slot wasn't filled, it'll contain zero, and zero isn't a valid - * transaction ID so it can't be a multixact member. Therefore, if we - * read a zero from the members array, just ignore it. - * - * This is all pretty messy, but the mess occurs only in infrequent corner - * cases, so it seems better than holding the MultiXactGenLock for a long - * time on every multixact creation. + * one's. 
*/ -retry: pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); @@ -1426,31 +1207,23 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); LWLockAcquire(lock, LW_EXCLUSIVE); + /* read this multi's offset */ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; offset = *offptr; - Assert(offset != 0); - - /* - * Use the same increment rule as GetNewMultiXactId(), that is, don't - * handle wraparound explicitly until needed. - */ - tmpMXact = multi + 1; + if (offset == 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has invalid offset", multi))); - if (nextMXact == tmpMXact) + /* read next multi's offset */ { - /* Corner case 1: there is no next multixact */ - length = nextOffset - offset; - } - else - { - MultiXactOffset nextMXOffset; + MultiXactId tmpMXact; /* handle wraparound if needed */ - if (tmpMXact < FirstMultiXactId) - tmpMXact = FirstMultiXactId; + tmpMXact = NextMultiXactId(multi); prev_pageno = pageno; @@ -1479,36 +1252,35 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; nextMXOffset = *offptr; - - if (nextMXOffset == 0) - { - /* Corner case 2: next multixact is still being filled in */ - LWLockRelease(lock); - CHECK_FOR_INTERRUPTS(); - - INJECTION_POINT("multixact-get-members-cv-sleep", NULL); - - ConditionVariableSleep(&MultiXactState->nextoff_cv, - WAIT_EVENT_MULTIXACT_CREATION); - slept = true; - goto retry; - } - - length = nextMXOffset - offset; } LWLockRelease(lock); lock = NULL; - /* - * If we slept above, clean up state; it's no longer needed. 
- */ - if (slept) - ConditionVariableCancelSleep(); + /* Sanity check the next offset */ + if (nextMXOffset == 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has invalid next offset", multi))); + if (nextMXOffset == offset) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u with offset (%" PRIu64 ") has zero members", + multi, offset))); + if (nextMXOffset < offset) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has offset (%" PRIu64 ") greater than its next offset (%" PRIu64 ")", + multi, offset, nextMXOffset))); + if (nextMXOffset - offset > INT32_MAX) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has too many members (%" PRIu64 ")", + multi, nextMXOffset - offset))); + length = nextMXOffset - offset; + /* read the members */ ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); - - truelength = 0; prev_pageno = -1; for (int i = 0; i < length; i++, offset++) { @@ -1545,37 +1317,27 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, xactptr = (TransactionId *) (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); - - if (!TransactionIdIsValid(*xactptr)) - { - /* Corner case 3: we must be looking at unused slot zero */ - Assert(offset == 0); - continue; - } + Assert(TransactionIdIsValid(*xactptr)); flagsoff = MXOffsetToFlagsOffset(offset); bshift = MXOffsetToFlagsBitShift(offset); flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); - ptr[truelength].xid = *xactptr; - ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; - truelength++; + ptr[i].xid = *xactptr; + ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; } LWLockRelease(lock); - /* A multixid with zero members should not happen */ - Assert(truelength > 0); - /* * Copy the result into the local cache. 
*/ - mXactCachePut(multi, truelength, ptr); + mXactCachePut(multi, length, ptr); debug_elog3(DEBUG2, "GetMembers: no cache for %s", - mxid_to_string(multi, truelength, ptr)); + mxid_to_string(multi, length, ptr)); *members = ptr; - return truelength; + return length; } /* @@ -1750,7 +1512,7 @@ mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) } } -static char * +char * mxstatus_to_string(MultiXactStatus status) { switch (status) @@ -1847,7 +1609,7 @@ AtPrepare_MultiXact(void) * Clean up after successful PREPARE TRANSACTION */ void -PostPrepare_MultiXact(TransactionId xid) +PostPrepare_MultiXact(FullTransactionId fxid) { MultiXactId myOldestMember; @@ -1858,7 +1620,7 @@ PostPrepare_MultiXact(TransactionId xid) myOldestMember = OldestMemberMXactId[MyProcNumber]; if (MultiXactIdIsValid(myOldestMember)) { - ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false); + ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false); /* * Even though storing MultiXactId is atomic, acquire lock to make @@ -1896,10 +1658,10 @@ PostPrepare_MultiXact(TransactionId xid) * Recover the state of a prepared transaction at startup */ void -multixact_twophase_recover(TransactionId xid, uint16 info, +multixact_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false); + ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false); MultiXactId oldestMember; /* @@ -1917,10 +1679,10 @@ multixact_twophase_recover(TransactionId xid, uint16 info, * Similar to AtEOXact_MultiXact but for COMMIT PREPARED */ void -multixact_twophase_postcommit(TransactionId xid, uint16 info, +multixact_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, true); + ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true); Assert(len == sizeof(MultiXactId)); @@ -1932,10 +1694,10 @@ multixact_twophase_postcommit(TransactionId xid, uint16 info, * This is actually just the same as the COMMIT case. 
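+ * (Either way we simply reset the dummy PGPROC's OldestMemberMXactId entry.)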
*/ void -multixact_twophase_postabort(TransactionId xid, uint16 info, +multixact_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - multixact_twophase_postcommit(xid, info, recdata, len); + multixact_twophase_postcommit(fxid, info, recdata, len); } /* @@ -1982,7 +1744,7 @@ MultiXactShmemInit(void) "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER, LWTRANCHE_MULTIXACTMEMBER_SLRU, SYNC_HANDLER_MULTIXACT_MEMBER, - false); + true); /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ /* Initialize our shared state struct */ @@ -1995,7 +1757,6 @@ MultiXactShmemInit(void) /* Make sure we zero out the per-backend state */ MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); - ConditionVariableInit(&MultiXactState->nextoff_cv); } else Assert(found); @@ -2033,112 +1794,9 @@ check_multixact_member_buffers(int *newval, void **extra, GucSource source) void BootStrapMultiXact(void) { - int slotno; - LWLock *lock; - - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, 0); - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the offsets log */ - slotno = ZeroMultiXactOffsetPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); - - lock = SimpleLruGetBankLock(MultiXactMemberCtl, 0); - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the members log */ - slotno = ZeroMultiXactMemberPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); - - if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); - - return slotno; -} - -/* - * Ditto, for MultiXactMember - */ -static int -ZeroMultiXactMemberPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); - - if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); - - return slotno; -} - -/* - * MaybeExtendOffsetSlru - * Extend the offsets SLRU area, if necessary - * - * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might - * contain files that are shorter than necessary; this would occur if the old - * installation had used multixacts beyond the first page (files cannot be - * copied, because the on-disk representation is different). pg_upgrade would - * update pg_control to set the next offset value to be at that position, so - * that tuples marked as locked by such MultiXacts would be seen as visible - * without having to consult multixact. However, trying to create and use a - * new MultiXactId would result in an error because the page on which the new - * value would reside does not exist. This routine is in charge of creating - * such pages. 
-	 */
-static void
-MaybeExtendOffsetSlru(void)
-{
-	int64		pageno;
-	LWLock	   *lock;
-
-	pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
-	lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
-
-	LWLockAcquire(lock, LW_EXCLUSIVE);
-
-	if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
-	{
-		int			slotno;
-
-		/*
-		 * Fortunately for us, SimpleLruWritePage is already prepared to deal
-		 * with creating a new segment file even if the page we're writing is
-		 * not the first in it, so this is enough.
-		 */
-		slotno = ZeroMultiXactOffsetPage(pageno, false);
-		SimpleLruWritePage(MultiXactOffsetCtl, slotno);
-	}
-
-	LWLockRelease(lock);
+	/* Zero the initial pages and flush them to disk */
+	SimpleLruZeroAndWritePage(MultiXactOffsetCtl, 0);
+	SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0);
 }
 
 /*
@@ -2202,26 +1860,34 @@ TrimMultiXact(void)
 							pageno);
 
 	/*
-	 * Zero out the remainder of the current offsets page. See notes in
-	 * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
-	 * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
-	 * rule "write xlog before data," nextMXact successors may carry obsolete,
-	 * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
-	 * operates normally.
+	 * Set the offset of nextMXact on the offsets page. This is normally done
+	 * by RecordNewMultiXact() for the previous multixact, but make sure the
+	 * page exists anyway, e.g. in case nextMXact was reset with pg_resetwal.
+	 *
+	 * Zero out the remainder of the page. See notes in TrimCLOG() for
+	 * background. Unlike CLOG, some WAL record covers every pg_multixact
+	 * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
+	 * xlog before data," nextMXact successors may carry obsolete, nonzero
	 * offset values.
 	 */
 	entryno = MultiXactIdToOffsetEntry(nextMXact);
-	if (entryno != 0)
 	{
 		int			slotno;
 		MultiXactOffset *offptr;
 		LWLock	   *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 
 		LWLockAcquire(lock, LW_EXCLUSIVE);
-		slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
+		if (entryno == 0 || nextMXact == FirstMultiXactId)
+			slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+		else
+			slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
 		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 		offptr += entryno;
-		MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+		*offptr = offset;
+		if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
+			MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
 
 		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
 		LWLockRelease(lock);
@@ -2271,8 +1937,8 @@ TrimMultiXact(void)
 	MultiXactState->finishedStartup = true;
 	LWLockRelease(MultiXactGenLock);
 
-	/* Now compute how far away the next members wraparound is. */
-	SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
+	/* Now compute how far away the next multixid wraparound is.
*/ + SetMultiXactIdLimit(oldestMXact, oldestMXactDB); } /* @@ -2293,7 +1959,7 @@ MultiXactGetCheckptMulti(bool is_shutdown, LWLockRelease(MultiXactGenLock); debug_elog6(DEBUG2, - "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", + "MultiXact: checkpoint is nextMulti %u, nextOffset %" PRIu64 ", oldestMulti %u in DB %u", *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); } @@ -2328,26 +1994,14 @@ void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset) { - debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", + Assert(MultiXactIdIsValid(nextMulti)); + debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %" PRIu64, nextMulti, nextMultiOffset); + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->nextMXact = nextMulti; MultiXactState->nextOffset = nextMultiOffset; LWLockRelease(MultiXactGenLock); - - /* - * During a binary upgrade, make sure that the offsets SLRU is large - * enough to contain the next value that would be created. - * - * We need to do this pretty early during the first startup in binary - * upgrade mode: before StartupMultiXact() in fact, because this routine - * is called even before that by StartupXLOG(). And we can't do it - * earlier than at this point, because during that first call of this - * routine we determine the MultiXactState->nextMXact value that - * MaybeExtendOffsetSlru needs. - */ - if (IsBinaryUpgrade) - MaybeExtendOffsetSlru(); } /* @@ -2355,28 +2009,24 @@ MultiXactSetNextMXact(MultiXactId nextMulti, * datminmxid (ie, the oldest MultiXactId that might exist in any database * of our cluster), and the OID of the (or a) database with that value. * - * is_startup is true when we are just starting the cluster, false when we - * are updating state in a running cluster. This only affects log messages. + * This also updates MultiXactState->oldestOffset, by looking up the offset of + * MultiXactState->oldestMultiXactId. */ void -SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, - bool is_startup) +SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) { MultiXactId multiVacLimit; MultiXactId multiWarnLimit; MultiXactId multiStopLimit; MultiXactId multiWrapLimit; MultiXactId curMulti; - bool needs_offset_vacuum; Assert(MultiXactIdIsValid(oldest_datminmxid)); /* * We pretend that a wrap will happen halfway through the multixact ID * space, but that's not really true, because multixacts wrap differently - * from transaction IDs. Note that, separately from any concern about - * multixact IDs wrapping, we must ensure that multixact members do not - * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. + * from transaction IDs. */ multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); if (multiWrapLimit < FirstMultiXactId) @@ -2444,8 +2094,14 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, Assert(!InRecovery); - /* Set limits for offset vacuum. */ - needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); + /* + * Offsets are 64-bits wide and never wrap around, so we don't need to + * consider them for emergency autovacuum purposes. But now that we're in + * a consistent state, determine MultiXactState->oldestOffset. It will be + * used to adjust the freezing cutoff, to keep the offsets disk usage in + * check. 
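+	 *
+	 * (The relevant thresholds are MULTIXACT_MEMBER_LOW_THRESHOLD and
+	 * MULTIXACT_MEMBER_HIGH_THRESHOLD, defined near the top of this file.)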
+ */ + SetOldestOffset(); /* * If past the autovacuum force point, immediately signal an autovac @@ -2454,8 +2110,7 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, * database, it'll call here, and we'll signal the postmaster to start * another iteration immediately if there are still any old databases. */ - if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || - needs_offset_vacuum) && IsUnderPostmaster) + if (MultiXactIdPrecedes(multiVacLimit, curMulti) && IsUnderPostmaster) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); /* Give an immediate warning if past the wrap warn point */ @@ -2511,15 +2166,17 @@ void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset) { + Assert(MultiXactIdIsValid(minMulti)); + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti)) { debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti); MultiXactState->nextMXact = minMulti; } - if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) + if (MultiXactState->nextOffset < minMultiOffset) { - debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", + debug_elog3(DEBUG2, "MultiXact: setting next offset to %" PRIu64, minMultiOffset); MultiXactState->nextOffset = minMultiOffset; } @@ -2538,7 +2195,7 @@ MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) Assert(InRecovery); if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) - SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); + SetMultiXactIdLimit(oldestMulti, oldestMultiDB); } /* @@ -2568,8 +2225,10 @@ ExtendMultiXactOffset(MultiXactId multi) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactOffsetPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + XLogSimpleInsertInt64(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_OFF_PAGE, + pageno); LWLockRelease(lock); } @@ -2611,33 +2270,19 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactMemberPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(MultiXactMemberCtl, pageno); + XLogSimpleInsertInt64(RM_MULTIXACT_ID, + XLOG_MULTIXACT_ZERO_MEM_PAGE, pageno); LWLockRelease(lock); } - /* - * Compute the number of items till end of current page. Careful: if - * addition of unsigned ints wraps around, we're at the last page of - * the last segment; since that page holds a different number of items - * than other pages, we need to do it differently. - */ - if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) - { - /* - * This is the last page of the last segment; we can compute the - * number of items left to allocate in it without modulo - * arithmetic. - */ - difference = MaxMultiXactOffset - offset + 1; - } - else - difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + /* Compute the number of items till end of current page. */ + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; /* - * Advance to next page, taking care to properly handle the wraparound - * case. OK if nmembers goes negative. + * Advance to next page. OK if nmembers goes negative. 
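+		 * (difference may exceed the remaining nmembers when the current
+		 * page has room for all of them; the enclosing loop then exits with
+		 * nmembers at or below zero.)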
*/ nmembers -= difference; offset += difference; @@ -2660,7 +2305,6 @@ MultiXactId GetOldestMultiXactId(void) { MultiXactId oldestMXact; - MultiXactId nextMXact; int i; /* @@ -2668,17 +2312,7 @@ GetOldestMultiXactId(void) * OldestVisibleMXactId[] entries, or nextMXact if none are valid. */ LWLockAcquire(MultiXactGenLock, LW_SHARED); - - /* - * We have to beware of the possibility that nextMXact is in the - * wrapped-around state. We don't fix the counter itself here, but we - * must be sure to use a valid value in our calculation. - */ - nextMXact = MultiXactState->nextMXact; - if (nextMXact < FirstMultiXactId) - nextMXact = FirstMultiXactId; - - oldestMXact = nextMXact; + oldestMXact = MultiXactState->nextMXact; for (i = 0; i < MaxOldestSlot; i++) { MultiXactId thisoldest; @@ -2699,28 +2333,17 @@ GetOldestMultiXactId(void) } /* - * Determine how aggressively we need to vacuum in order to prevent member - * wraparound. - * - * To do so determine what's the oldest member offset and install the limit - * info in MultiXactState, where it can be used to prevent overrun of old data - * in the members SLRU area. - * - * The return value is true if emergency autovacuum is required and false - * otherwise. + * Calculate the oldest member offset and install it in MultiXactState, where + * it can be used to adjust multixid freezing cutoffs. */ -static bool -SetOffsetVacuumLimit(bool is_startup) +static void +SetOldestOffset(void) { MultiXactId oldestMultiXactId; MultiXactId nextMXact; MultiXactOffset oldestOffset = 0; /* placate compiler */ - MultiXactOffset prevOldestOffset; MultiXactOffset nextOffset; bool oldestOffsetKnown = false; - bool prevOldestOffsetKnown; - MultiXactOffset offsetStopLimit = 0; - MultiXactOffset prevOffsetStopLimit; /* * NB: Have to prevent concurrent truncation, we might otherwise try to @@ -2733,9 +2356,6 @@ SetOffsetVacuumLimit(bool is_startup) oldestMultiXactId = MultiXactState->oldestMultiXactId; nextMXact = MultiXactState->nextMXact; nextOffset = MultiXactState->nextOffset; - prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; - prevOldestOffset = MultiXactState->oldestOffset; - prevOffsetStopLimit = MultiXactState->offsetStopLimit; Assert(MultiXactState->finishedStartup); LWLockRelease(MultiXactGenLock); @@ -2758,121 +2378,39 @@ SetOffsetVacuumLimit(bool is_startup) else { /* - * Figure out where the oldest existing multixact's offsets are - * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, - * the supposedly-earliest multixact might not really exist. We are - * careful not to fail in that case. + * Look up the offset at which the oldest existing multixact's members + * are stored. If we cannot find it, be careful not to fail, and + * leave oldestOffset unchanged. oldestOffset is initialized to zero + * at system startup, which prevents truncating members until a proper + * value is calculated. + * + * (We had bugs in early releases of PostgreSQL 9.3.X and 9.4.X where + * the supposedly-earliest multixact might not really exist. Those + * should be long gone by now, so this should not fail, but let's + * still be defensive.) 
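+ * + * (find_multixact_start() checks that the offsets SLRU page the multixact + * maps to physically exists on disk before reading it, which is what lets + * us report "not found" here instead of raising an error.)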
*/ oldestOffsetKnown = find_multixact_start(oldestMultiXactId, &oldestOffset); if (oldestOffsetKnown) ereport(DEBUG1, - (errmsg_internal("oldest MultiXactId member is at offset %u", + (errmsg_internal("oldest MultiXactId member is at offset %" PRIu64, oldestOffset))); else ereport(LOG, - (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", + (errmsg("MultiXact member truncation is disabled because oldest checkpointed MultiXact %u does not exist on disk", oldestMultiXactId))); } LWLockRelease(MultiXactTruncationLock); - /* - * If we can, compute limits (and install them MultiXactState) to prevent - * overrun of old data in the members SLRU area. We can only do so if the - * oldest offset is known though. - */ + /* Install the computed value */ if (oldestOffsetKnown) { - /* move back to start of the corresponding segment */ - offsetStopLimit = oldestOffset - (oldestOffset % - (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); - - /* always leave one segment before the wraparound point */ - offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); - - if (!prevOldestOffsetKnown && !is_startup) - ereport(LOG, - (errmsg("MultiXact member wraparound protections are now enabled"))); - - ereport(DEBUG1, - (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", - offsetStopLimit, oldestMultiXactId))); - } - else if (prevOldestOffsetKnown) - { - /* - * If we failed to get the oldest offset this time, but we have a - * value from a previous pass through this function, use the old - * values rather than automatically forcing an emergency autovacuum - * cycle again. - */ - oldestOffset = prevOldestOffset; - oldestOffsetKnown = true; - offsetStopLimit = prevOffsetStopLimit; + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestOffset = oldestOffset; + LWLockRelease(MultiXactGenLock); } - - /* Install the computed values */ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - MultiXactState->oldestOffset = oldestOffset; - MultiXactState->oldestOffsetKnown = oldestOffsetKnown; - MultiXactState->offsetStopLimit = offsetStopLimit; - LWLockRelease(MultiXactGenLock); - - /* - * Do we need an emergency autovacuum? If we're not sure, assume yes. - */ - return !oldestOffsetKnown || - (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); -} - -/* - * Return whether adding "distance" to "start" would move past "boundary". - * - * We use this to determine whether the addition is "wrapping around" the - * boundary point, hence the name. The reason we don't want to use the regular - * 2^31-modulo arithmetic here is that we want to be able to use the whole of - * the 2^32-1 space here, allowing for more multixacts than would fit - * otherwise. - */ -static bool -MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, - uint32 distance) -{ - MultiXactOffset finish; - - /* - * Note that offset number 0 is not used (see GetMultiXactIdMembers), so - * if the addition wraps around the UINT_MAX boundary, skip that value. 
- */ - finish = start + distance; - if (finish < start) - finish++; - /*----------------------------------------------------------------------- - * When the boundary is numerically greater than the starting point, any - * value numerically between the two is not wrapped: - * - * <----S----B----> - * [---) = F wrapped past B (and UINT_MAX) - * [---) = F not wrapped - * [----] = F wrapped past B - * - * When the boundary is numerically less than the starting point (i.e. the - * UINT_MAX wraparound occurs somewhere in between) then all values in - * between are wrapped: - * - * <----B----S----> - * [---) = F not wrapped past B (but wrapped past UINT_MAX) - * [---) = F wrapped past B (and UINT_MAX) - * [----] = F not wrapped -*----------------------------------------------------------------------- - */ - if (start < boundary) - return finish >= boundary || finish < start; - else - return finish >= boundary && finish < start; } /* @@ -2919,32 +2457,30 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) } /* - * Determine how many multixacts, and how many multixact members, currently - * exist. Return false if unable to determine. + * GetMultiXactInfo + * + * Returns information about the current MultiXact state, as follows: + * multixacts: Number of MultiXacts (nextMultiXactId - oldestMultiXactId) + * members: Number of member entries (nextOffset - oldestOffset) + * oldestMultiXactId: Oldest MultiXact ID still in use + * oldestOffset: Oldest offset still in use */ -static bool -ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) +void +GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, + MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset) { MultiXactOffset nextOffset; - MultiXactOffset oldestOffset; - MultiXactId oldestMultiXactId; MultiXactId nextMultiXactId; - bool oldestOffsetKnown; LWLockAcquire(MultiXactGenLock, LW_SHARED); nextOffset = MultiXactState->nextOffset; - oldestMultiXactId = MultiXactState->oldestMultiXactId; + *oldestMultiXactId = MultiXactState->oldestMultiXactId; nextMultiXactId = MultiXactState->nextMXact; - oldestOffset = MultiXactState->oldestOffset; - oldestOffsetKnown = MultiXactState->oldestOffsetKnown; + *oldestOffset = MultiXactState->oldestOffset; LWLockRelease(MultiXactGenLock); - if (!oldestOffsetKnown) - return false; - - *members = nextOffset - oldestOffset; - *multixacts = nextMultiXactId - oldestMultiXactId; - return true; + *members = nextOffset - *oldestOffset; + *multixacts = nextMultiXactId - *oldestMultiXactId; } /* @@ -2953,26 +2489,27 @@ ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) * vacuum_multixact_freeze_table_age work together to make sure we never have * too many multixacts; we hope that, at least under normal circumstances, * this will also be sufficient to keep us from using too many offsets. - * However, if the average multixact has many members, we might exhaust the - * members space while still using few enough members that these limits fail - * to trigger relminmxid advancement by VACUUM. At that point, we'd have no - * choice but to start failing multixact-creating operations with an error. - * - * To prevent that, if more than a threshold portion of the members space is - * used, we effectively reduce autovacuum_multixact_freeze_max_age and - * to a value just less than the number of multixacts in use.
We hope that - * this will quickly trigger autovacuuming on the table or tables with the - * oldest relminmxid, thus allowing datminmxid values to advance and removing - * some members. - * - * As the fraction of the member space currently in use grows, we become - * more aggressive in clamping this value. That not only causes autovacuum - * to ramp up, but also makes any manual vacuums the user issues more - * aggressive. This happens because vacuum_get_cutoffs() will clamp the - * freeze table and the minimum freeze age cutoffs based on the effective - * autovacuum_multixact_freeze_max_age this function returns. In the worst - * case, we'll claim the freeze_max_age to zero, and every vacuum of any - * table will freeze every multixact. + * However, if the average multixact has many members, we might accumulate a + * large number of members, consuming disk space, while still using few enough + * multixids that the multixid limits fail to trigger relminmxid advancement + * by VACUUM. + * + * To prevent that, if the members space usage exceeds a threshold + * (MULTIXACT_MEMBER_LOW_THRESHOLD), we effectively reduce + * autovacuum_multixact_freeze_max_age to a value just less than the number of + * multixacts in use. We hope that this will quickly trigger autovacuuming on + * the table or tables with the oldest relminmxid, thus allowing datminmxid + * values to advance and removing some members. + * + * As the amount of member space in use grows, we become more aggressive + * in clamping this value. That not only causes autovacuum to ramp up, but + * also makes any manual vacuums the user issues more aggressive. This + * happens because vacuum_get_cutoffs() will clamp the freeze table and the + * minimum freeze age cutoffs based on the effective + * autovacuum_multixact_freeze_max_age this function returns. At the extreme, + * when the members usage reaches MULTIXACT_MEMBER_HIGH_THRESHOLD, we clamp + * freeze_max_age to zero, and every vacuum of any table will freeze every + * multixact. */ int MultiXactMemberFreezeThreshold(void) @@ -2982,27 +2519,36 @@ MultiXactMemberFreezeThreshold(void) uint32 victim_multixacts; double fraction; int result; + MultiXactId oldestMultiXactId; + MultiXactOffset oldestOffset; - /* If we can't determine member space utilization, assume the worst. */ - if (!ReadMultiXactCounts(&multixacts, &members)) - return 0; + /* Read the current offsets and members usage. */ + GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset); /* If member space utilization is low, no special action is required. */ - if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) + if (members <= MULTIXACT_MEMBER_LOW_THRESHOLD) return autovacuum_multixact_freeze_max_age; /* * Compute a target for relminmxid advancement. The number of multixacts * we try to eliminate from the system is based on how far we are past - * MULTIXACT_MEMBER_SAFE_THRESHOLD. + * MULTIXACT_MEMBER_LOW_THRESHOLD. + * + * The way this formula works is that when members is exactly at the low + * threshold, fraction = 0.0, and we set freeze_max_age equal to + * mxid_age(oldestMultiXactId). As members grows further, towards the + * high threshold, fraction grows linearly from 0.0 to 1.0, and the result + * shrinks from mxid_age(oldestMultiXactId) to 0. Beyond the high + * threshold, fraction > 1.0 and the result is clamped to 0.
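+ * + * For illustration, with made-up numbers: if the low and high thresholds + * were 1 and 2 billion members and we had 1.5 billion members and 10 million + * multixacts in use, then fraction = (1.5e9 - 1e9) / (2e9 - 1e9) = 0.5, + * victim_multixacts = 5 million, and the effective freeze_max_age becomes + * 10 million - 5 million = 5 million, half the current multixact age.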
*/ - fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / - (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); - victim_multixacts = multixacts * fraction; + fraction = (double) (members - MULTIXACT_MEMBER_LOW_THRESHOLD) / + (MULTIXACT_MEMBER_HIGH_THRESHOLD - MULTIXACT_MEMBER_LOW_THRESHOLD); /* fraction could be > 1.0, but lowest possible freeze age is zero */ - if (victim_multixacts > multixacts) + if (fraction >= 1.0) return 0; + + victim_multixacts = multixacts * fraction; result = multixacts - victim_multixacts; /* @@ -3038,36 +2584,12 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data /* * Delete members segments [oldest, newOldest) - * - * The members SLRU can, in contrast to the offsets one, be filled to almost - * the full range at once. This means SimpleLruTruncate() can't trivially be - * used - instead the to-be-deleted range is computed using the offsets - * SLRU. C.f. TruncateMultiXact(). */ static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) { - const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); - int64 startsegment = MXOffsetToMemberSegment(oldestOffset); - int64 endsegment = MXOffsetToMemberSegment(newOldestOffset); - int64 segment = startsegment; - - /* - * Delete all the segments but the last one. The last segment can still - * contain, possibly partially, valid data. - */ - while (segment != endsegment) - { - elog(DEBUG2, "truncating multixact members segment %" PRIx64, - segment); - SlruDeleteSegment(MultiXactMemberCtl, segment); - - /* move to next segment, handling wraparound correctly */ - if (segment == maxsegment) - segment = 0; - else - segment += 1; - } + SimpleLruTruncate(MultiXactMemberCtl, + MXOffsetToMemberPage(newOldestOffset)); } /* @@ -3111,6 +2633,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) Assert(!RecoveryInProgress()); Assert(MultiXactState->finishedStartup); + Assert(MultiXactIdIsValid(newOldestMulti)); /* * We can only allow one truncation to happen at once. Otherwise parts of @@ -3125,7 +2648,6 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) nextOffset = MultiXactState->nextOffset; oldestMulti = MultiXactState->oldestMultiXactId; LWLockRelease(MultiXactGenLock); - Assert(MultiXactIdIsValid(oldestMulti)); /* * Make sure to only attempt truncation if there's values to truncate @@ -3211,7 +2733,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) elog(DEBUG1, "performing multixact truncation: " "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " - "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", + "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")", oldestMulti, newOldestMulti, MultiXactIdToOffsetSegment(oldestMulti), MultiXactIdToOffsetSegment(newOldestMulti), @@ -3252,6 +2774,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->oldestMultiXactId = newOldestMulti; MultiXactState->oldestMultiXactDB = newOldestMultiDB; + MultiXactState->oldestOffset = newOldestOffset; LWLockRelease(MultiXactGenLock); /* First truncate members */ @@ -3291,20 +2814,13 @@ MultiXactOffsetPagePrecedes(int64 page1, int64 page2) /* * Decide whether a MultiXactMember page number is "older" for truncation - * purposes. There is no "invalid offset number" so use the numbers verbatim. + * purposes. 
There is no "invalid offset number" and members never wrap + * around, so use the numbers verbatim. */ static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2) { - MultiXactOffset offset1; - MultiXactOffset offset2; - - offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; - offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; - - return (MultiXactOffsetPrecedes(offset1, offset2) && - MultiXactOffsetPrecedes(offset1, - offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1)); + return page1 < page2; } /* @@ -3336,29 +2852,6 @@ MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) } -/* - * Decide which of two offsets is earlier. - */ -static bool -MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) -{ - int32 diff = (int32) (offset1 - offset2); - - return (diff < 0); -} - -/* - * Write an xlog record reflecting the zeroing of either a MEMBERs or - * OFFSETs page (info shows which) - */ -static void -WriteMZeroPageXlogRec(int64 pageno, uint8 info) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_MULTIXACT_ID, info); -} - /* * Write a TRUNCATE xlog record * @@ -3401,36 +2894,16 @@ multixact_redo(XLogReaderState *record) if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(MultiXactOffsetCtl, pageno); } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactMemberPage(pageno, false); - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(MultiXactMemberCtl, pageno); } else if (info == XLOG_MULTIXACT_CREATE_ID) { @@ -3444,7 +2917,7 @@ multixact_redo(XLogReaderState *record) xlrec->members); /* Make sure nextMXact/nextOffset are beyond what this record has */ - MultiXactAdvanceNextMXact(xlrec->mid + 1, + MultiXactAdvanceNextMXact(NextMultiXactId(xlrec->mid), xlrec->moff + xlrec->nmembers); /* @@ -3471,7 +2944,7 @@ multixact_redo(XLogReaderState *record) elog(DEBUG1, "replaying multixact truncation: " "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " - "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", + "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")", xlrec.startTruncOff, xlrec.endTruncOff, MultiXactIdToOffsetSegment(xlrec.startTruncOff), MultiXactIdToOffsetSegment(xlrec.endTruncOff), @@ -3486,7 +2959,7 @@ multixact_redo(XLogReaderState *record) * Advance the horizon values, so they're current at the end of * recovery. 
*/ - SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); + SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB); PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); @@ -3506,68 +2979,6 @@ multixact_redo(XLogReaderState *record) elog(PANIC, "multixact_redo: unknown op code %u", info); } -Datum -pg_get_multixact_members(PG_FUNCTION_ARGS) -{ - typedef struct - { - MultiXactMember *members; - int nmembers; - int iter; - } mxact; - MultiXactId mxid = PG_GETARG_TRANSACTIONID(0); - mxact *multi; - FuncCallContext *funccxt; - - if (mxid < FirstMultiXactId) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid MultiXactId: %u", mxid))); - - if (SRF_IS_FIRSTCALL()) - { - MemoryContext oldcxt; - TupleDesc tupdesc; - - funccxt = SRF_FIRSTCALL_INIT(); - oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); - - multi = palloc(sizeof(mxact)); - /* no need to allow for old values here */ - multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false, - false); - multi->iter = 0; - - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - funccxt->tuple_desc = tupdesc; - funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc); - funccxt->user_fctx = multi; - - MemoryContextSwitchTo(oldcxt); - } - - funccxt = SRF_PERCALL_SETUP(); - multi = (mxact *) funccxt->user_fctx; - - while (multi->iter < multi->nmembers) - { - HeapTuple tuple; - char *values[2]; - - values[0] = psprintf("%u", multi->members[multi->iter].xid); - values[1] = mxstatus_to_string(multi->members[multi->iter].status); - - tuple = BuildTupleFromCStrings(funccxt->attinmeta, values); - - multi->iter++; - pfree(values[0]); - SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple)); - } - - SRF_RETURN_DONE(funccxt); -} - /* * Entrypoint for sync.c to sync offsets files. */ diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 94db1ec30126a..642c61fc55ce2 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -186,7 +186,7 @@ CreateParallelContext(const char *library_name, const char *function_name, oldcontext = MemoryContextSwitchTo(TopTransactionContext); /* Initialize a new ParallelContext. */ - pcxt = palloc0(sizeof(ParallelContext)); + pcxt = palloc0_object(ParallelContext); pcxt->subid = GetCurrentSubTransactionId(); pcxt->nworkers = nworkers; pcxt->nworkers_to_launch = nworkers; @@ -266,6 +266,10 @@ InitializeParallelDSM(ParallelContext *pcxt) if (pcxt->nworkers > 0) { + StaticAssertDecl(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) == + PARALLEL_ERROR_QUEUE_SIZE, + "parallel error queue size not buffer-aligned"); + /* Estimate space for various kinds of state sharing. */ library_len = EstimateLibraryStateSpace(); shm_toc_estimate_chunk(&pcxt->estimator, library_len); @@ -297,9 +301,6 @@ InitializeParallelDSM(ParallelContext *pcxt) shm_toc_estimate_keys(&pcxt->estimator, 12); /* Estimate space need for error queues. */ - StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) == - PARALLEL_ERROR_QUEUE_SIZE, - "parallel error queue size not buffer-aligned"); shm_toc_estimate_chunk(&pcxt->estimator, mul_size(PARALLEL_ERROR_QUEUE_SIZE, pcxt->nworkers)); @@ -453,7 +454,7 @@ InitializeParallelDSM(ParallelContext *pcxt) clientconninfospace); /* Allocate space for worker information. 
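+ * (palloc0_object(type) and palloc0_array(type, n), used throughout this + * patch in place of bare palloc0() calls, are type-safe wrappers expanding + * to (type *) palloc0(sizeof(type)) and (type *) palloc0(sizeof(type) * n) + * respectively.)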
*/ - pcxt->worker = palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers); + pcxt->worker = palloc0_array(ParallelWorkerInfo, pcxt->nworkers); /* * Establish error queues in dynamic shared memory. @@ -507,8 +508,12 @@ InitializeParallelDSM(ParallelContext *pcxt) void ReinitializeParallelDSM(ParallelContext *pcxt) { + MemoryContext oldcontext; FixedParallelState *fps; + /* We might be running in a very short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + /* Wait for any old workers to exit. */ if (pcxt->nworkers_launched > 0) { @@ -546,6 +551,9 @@ ReinitializeParallelDSM(ParallelContext *pcxt) pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); } } + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); } /* @@ -648,8 +656,7 @@ LaunchParallelWorkers(ParallelContext *pcxt) */ if (pcxt->nworkers_launched > 0) { - pcxt->known_attached_workers = - palloc0(sizeof(bool) * pcxt->nworkers_launched); + pcxt->known_attached_workers = palloc0_array(bool, pcxt->nworkers_launched); pcxt->nknown_attached_workers = 0; } diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 1b7499726eb02..4fda03a3cfcc6 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -33,7 +33,7 @@ #include "access/xact.h" #include "catalog/storage_xlog.h" #include "commands/dbcommands_xlog.h" -#include "commands/sequence.h" +#include "commands/sequence_xlog.h" #include "commands/tablespace.h" #include "replication/decode.h" #include "replication/message.h" diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index fe56286d9a972..77676d6d0359e 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -246,6 +246,7 @@ SimpleLruAutotuneBuffers(int divisor, int max) * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks. * bank_tranche_id: tranche ID to use for the bank LWLocks. * sync_handler: which set of functions to use to handle sync requests + * long_segment_names: use short or long segment names */ void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, @@ -400,15 +401,15 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno) /* * Assume this page is now the latest active page. * - * Note that because both this routine and SlruSelectLRUPage run with - * ControlLock held, it is not possible for this to be zeroing a page that - * SlruSelectLRUPage is going to evict simultaneously. Therefore, there's - * no memory barrier here. + * Note that because both this routine and SlruSelectLRUPage run with a + * SLRU bank lock held, it is not possible for this to be zeroing a page + * that SlruSelectLRUPage is going to evict simultaneously. Therefore, + * there's no memory barrier here. */ pg_atomic_write_u64(&shared->latest_page_number, pageno); /* update the stats counter of zeroed pages */ - pgstat_count_slru_page_zeroed(shared->slru_stats_idx); + pgstat_count_slru_blocks_zeroed(shared->slru_stats_idx); return slotno; } @@ -433,6 +434,31 @@ SimpleLruZeroLSNs(SlruCtl ctl, int slotno) shared->lsn_groups_per_page * sizeof(XLogRecPtr)); } +/* + * This is a convenience wrapper for the common case of zeroing a page and + * immediately flushing it to disk. + * + * SLRU bank lock is acquired and released here. 
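+ * + * Used for bootstrapping an SLRU (see BootStrapSUBTRANS()) and when + * replaying page-initialization WAL records (see multixact_redo()).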
+ */ +void +SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno) +{ + int slotno; + LWLock *lock; + + lock = SimpleLruGetBankLock(ctl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* Create and zero the page */ + slotno = SimpleLruZeroPage(ctl, pageno); + + /* Make sure it's written out */ + SimpleLruWritePage(ctl, slotno); + Assert(!ctl->shared->page_dirty[slotno]); + + LWLockRelease(lock); +} + /* * Wait for any active I/O on a page slot to finish. (This does not * guarantee that new I/O hasn't been started before we return, though. @@ -535,7 +561,7 @@ SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, SlruRecentlyUsed(shared, slotno); /* update the stats counter of pages found in the SLRU */ - pgstat_count_slru_page_hit(shared->slru_stats_idx); + pgstat_count_slru_blocks_hit(shared->slru_stats_idx); return slotno; } @@ -580,7 +606,7 @@ SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, SlruRecentlyUsed(shared, slotno); /* update the stats counter of pages not found in SLRU */ - pgstat_count_slru_page_read(shared->slru_stats_idx); + pgstat_count_slru_blocks_read(shared->slru_stats_idx); return slotno; } @@ -619,11 +645,11 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid) shared->page_number[slotno] == pageno && shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) { - /* See comments for SlruRecentlyUsed macro */ + /* See comments for SlruRecentlyUsed() */ SlruRecentlyUsed(shared, slotno); /* update the stats counter of pages found in the SLRU */ - pgstat_count_slru_page_hit(shared->slru_stats_idx); + pgstat_count_slru_blocks_hit(shared->slru_stats_idx); return slotno; } @@ -753,7 +779,7 @@ SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno) off_t endpos; /* update the stats counter of checked pages */ - pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx); + pgstat_count_slru_blocks_exists(ctl->shared->slru_stats_idx); SlruFileName(ctl, path, segno); @@ -882,7 +908,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata) int fd = -1; /* update the stats counter of written pages */ - pgstat_count_slru_page_written(shared->slru_stats_idx); + pgstat_count_slru_blocks_written(shared->slru_stats_idx); /* * Honor the write-WAL-before-data rule, if appropriate, so that we do not @@ -911,7 +937,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata) max_lsn = this_lsn; } - if (!XLogRecPtrIsInvalid(max_lsn)) + if (XLogRecPtrIsValid(max_lsn)) { /* * As noted above, elog(ERROR) is not acceptable here, so if diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 15153618fad16..09aace9e09f0e 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -74,7 +74,6 @@ static SlruCtlData SubTransCtlData; #define SubTransCtl (&SubTransCtlData) -static int ZeroSUBTRANSPage(int64 pageno); static bool SubTransPagePrecedes(int64 page1, int64 page2); @@ -269,33 +268,8 @@ check_subtrans_buffers(int *newval, void **extra, GucSource source) void BootStrapSUBTRANS(void) { - int slotno; - LWLock *lock = SimpleLruGetBankLock(SubTransCtl, 0); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the subtrans log */ - slotno = ZeroSUBTRANSPage(0); - - /* Make sure it's written out */ - SimpleLruWritePage(SubTransCtl, slotno); - Assert(!SubTransCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of SUBTRANS to zeroes. 
- * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroSUBTRANSPage(int64 pageno) -{ - return SimpleLruZeroPage(SubTransCtl, pageno); + /* Zero the initial page and flush it to disk */ + SimpleLruZeroAndWritePage(SubTransCtl, 0); } /* @@ -335,7 +309,7 @@ StartupSUBTRANS(TransactionId oldestActiveXID) prevlock = lock; } - (void) ZeroSUBTRANSPage(startPage); + (void) SimpleLruZeroPage(SubTransCtl, startPage); if (startPage == endPage) break; @@ -395,7 +369,7 @@ ExtendSUBTRANS(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); /* Zero the page */ - ZeroSUBTRANSPage(pageno); + SimpleLruZeroPage(SubTransCtl, pageno); LWLockRelease(lock); } diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index a27f27cc037d1..b8af25e82d1ac 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -87,7 +87,7 @@ readTimeLineHistory(TimeLineID targetTLI) /* Timeline 1 does not have a history file, so no need to check */ if (targetTLI == 1) { - entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry = palloc_object(TimeLineHistoryEntry); entry->tli = targetTLI; entry->begin = entry->end = InvalidXLogRecPtr; return list_make1(entry); @@ -110,7 +110,7 @@ readTimeLineHistory(TimeLineID targetTLI) (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* Not there, so assume no parents */ - entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry = palloc_object(TimeLineHistoryEntry); entry->tli = targetTLI; entry->begin = entry->end = InvalidXLogRecPtr; return list_make1(entry); @@ -154,7 +154,7 @@ readTimeLineHistory(TimeLineID targetTLI) if (*ptr == '\0' || *ptr == '#') continue; - nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo); + nfields = sscanf(fline, "%u\t%X/%08X", &tli, &switchpoint_hi, &switchpoint_lo); if (nfields < 1) { @@ -175,7 +175,7 @@ readTimeLineHistory(TimeLineID targetTLI) lasttli = tli; - entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry = palloc_object(TimeLineHistoryEntry); entry->tli = tli; entry->begin = prevend; entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo; @@ -198,7 +198,7 @@ readTimeLineHistory(TimeLineID targetTLI) * Create one more entry for the "tip" of the timeline, which has no entry * in the history file. */ - entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry = palloc_object(TimeLineHistoryEntry); entry->tli = targetTLI; entry->begin = prevend; entry->end = InvalidXLogRecPtr; @@ -399,7 +399,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, * parent file failed to end with one. */ snprintf(buffer, sizeof(buffer), - "%s%u\t%X/%X\t%s\n", + "%s%u\t%X/%08X\t%s\n", (srcfd < 0) ? 
"" : "\n", parentTLI, LSN_FORMAT_ARGS(switchpoint), @@ -549,8 +549,8 @@ tliOfPointInHistory(XLogRecPtr ptr, List *history) { TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell); - if ((XLogRecPtrIsInvalid(tle->begin) || tle->begin <= ptr) && - (XLogRecPtrIsInvalid(tle->end) || ptr < tle->end)) + if ((!XLogRecPtrIsValid(tle->begin) || tle->begin <= ptr) && + (!XLogRecPtrIsValid(tle->end) || ptr < tle->end)) { /* found it */ return tle->tli; diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index 9a39451a29a96..553d6756cb382 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -273,70 +273,6 @@ TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids) TRANSACTION_STATUS_ABORTED, InvalidXLogRecPtr); } -/* - * TransactionIdPrecedes --- is id1 logically < id2? - */ -bool -TransactionIdPrecedes(TransactionId id1, TransactionId id2) -{ - /* - * If either ID is a permanent XID then we can just do unsigned - * comparison. If both are normal, do a modulo-2^32 comparison. - */ - int32 diff; - - if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) - return (id1 < id2); - - diff = (int32) (id1 - id2); - return (diff < 0); -} - -/* - * TransactionIdPrecedesOrEquals --- is id1 logically <= id2? - */ -bool -TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2) -{ - int32 diff; - - if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) - return (id1 <= id2); - - diff = (int32) (id1 - id2); - return (diff <= 0); -} - -/* - * TransactionIdFollows --- is id1 logically > id2? - */ -bool -TransactionIdFollows(TransactionId id1, TransactionId id2) -{ - int32 diff; - - if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) - return (id1 > id2); - - diff = (int32) (id1 - id2); - return (diff > 0); -} - -/* - * TransactionIdFollowsOrEquals --- is id1 logically >= id2? - */ -bool -TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2) -{ - int32 diff; - - if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) - return (id1 >= id2); - - diff = (int32) (id1 - id2); - return (diff >= 0); -} - /* * TransactionIdLatest --- get latest XID among a main xact and its children diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 73a80559194e7..3bc8598682988 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -103,6 +103,7 @@ #include "storage/proc.h" #include "storage/procarray.h" #include "utils/builtins.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/timestamp.h" @@ -159,7 +160,7 @@ typedef struct GlobalTransactionData */ XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */ XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */ - TransactionId xid; /* The GXACT id */ + FullTransactionId fxid; /* The GXACT full xid */ Oid owner; /* ID of user that executed the xact */ ProcNumber locking_backend; /* backend currently working on the xact */ @@ -167,7 +168,7 @@ typedef struct GlobalTransactionData bool ondisk; /* true if prepare state file is on disk */ bool inredo; /* true if entry was added via xlog_redo */ char gid[GIDSIZE]; /* The GID assigned to the prepared xact */ -} GlobalTransactionData; +} GlobalTransactionData; /* * Two Phase Commit shared state. 
Access to this struct is protected @@ -197,6 +198,7 @@ static GlobalTransaction MyLockedGxact = NULL; static bool twophaseExitRegistered = false; +static void PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning); static void RecordTransactionCommitPrepared(TransactionId xid, int nchildren, TransactionId *children, @@ -216,19 +218,19 @@ static void RecordTransactionAbortPrepared(TransactionId xid, int nstats, xl_xact_stats_item *stats, const char *gid); -static void ProcessRecords(char *bufptr, TransactionId xid, +static void ProcessRecords(char *bufptr, FullTransactionId fxid, const TwoPhaseCallback callbacks[]); static void RemoveGXact(GlobalTransaction gxact); static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len); -static char *ProcessTwoPhaseBuffer(TransactionId xid, +static char *ProcessTwoPhaseBuffer(FullTransactionId fxid, XLogRecPtr prepare_start_lsn, bool fromdisk, bool setParent, bool setNextXid); -static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, +static void MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid); -static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning); -static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); +static void RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning); +static void RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len); /* * Initialization of shared memory @@ -356,7 +358,7 @@ PostPrepare_Twophase(void) * Reserve the GID for the given transaction. */ GlobalTransaction -MarkAsPreparing(TransactionId xid, const char *gid, +MarkAsPreparing(FullTransactionId fxid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid) { GlobalTransaction gxact; @@ -407,7 +409,7 @@ MarkAsPreparing(TransactionId xid, const char *gid, gxact = TwoPhaseState->freeGXacts; TwoPhaseState->freeGXacts = gxact->next; - MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid); + MarkAsPreparingGuts(gxact, fxid, gid, prepared_at, owner, databaseid); gxact->ondisk = false; @@ -430,11 +432,13 @@ MarkAsPreparing(TransactionId xid, const char *gid, * Note: This function should be called with appropriate locks held. */ static void -MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, - TimestampTz prepared_at, Oid owner, Oid databaseid) +MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, + const char *gid, TimestampTz prepared_at, Oid owner, + Oid databaseid) { PGPROC *proc; int i; + TransactionId xid = XidFromFullTransactionId(fxid); Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); @@ -479,7 +483,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->subxidStatus.count = 0; gxact->prepared_at = prepared_at; - gxact->xid = xid; + gxact->fxid = fxid; gxact->owner = owner; gxact->locking_backend = MyProcNumber; gxact->valid = false; @@ -680,7 +684,7 @@ GetPreparedTransactionList(GlobalTransaction *gxacts) } num = TwoPhaseState->numPrepXacts; - array = (GlobalTransaction) palloc(sizeof(GlobalTransactionData) * num); + array = palloc_array(GlobalTransactionData, num); *gxacts = array; for (i = 0; i < num; i++) memcpy(array + i, TwoPhaseState->prepXacts[i], @@ -746,7 +750,7 @@ pg_prepared_xact(PG_FUNCTION_ARGS) * Collect all the 2PC status information that we will format and send * out as a result set. 
*/ - status = (Working_State *) palloc(sizeof(Working_State)); + status = palloc_object(Working_State); funcctx->user_fctx = status; status->ngxacts = GetPreparedTransactionList(&status->array); @@ -797,12 +801,12 @@ pg_prepared_xact(PG_FUNCTION_ARGS) * caller had better hold it. */ static GlobalTransaction -TwoPhaseGetGXact(TransactionId xid, bool lock_held) +TwoPhaseGetGXact(FullTransactionId fxid, bool lock_held) { GlobalTransaction result = NULL; int i; - static TransactionId cached_xid = InvalidTransactionId; + static FullTransactionId cached_fxid = {InvalidTransactionId}; static GlobalTransaction cached_gxact = NULL; Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock)); @@ -811,7 +815,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called * repeatedly for the same XID. We can save work with a simple cache. */ - if (xid == cached_xid) + if (FullTransactionIdEquals(fxid, cached_fxid)) return cached_gxact; if (!lock_held) @@ -821,7 +825,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) { GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; - if (gxact->xid == xid) + if (FullTransactionIdEquals(gxact->fxid, fxid)) { result = gxact; break; @@ -832,9 +836,10 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) LWLockRelease(TwoPhaseStateLock); if (result == NULL) /* should not happen */ - elog(ERROR, "failed to find GlobalTransaction for xid %u", xid); + elog(ERROR, "failed to find GlobalTransaction for xid %u", + XidFromFullTransactionId(fxid)); - cached_xid = xid; + cached_fxid = fxid; cached_gxact = result; return result; @@ -881,7 +886,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, *have_more = true; break; } - result = gxact->xid; + result = XidFromFullTransactionId(gxact->fxid); } } @@ -892,7 +897,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, /* * TwoPhaseGetDummyProcNumber - * Get the dummy proc number for prepared transaction specified by XID + * Get the dummy proc number for prepared transaction * * Dummy proc numbers are similar to proc numbers of real backends. They * start at MaxBackends, and are unique across all currently active real @@ -900,24 +905,24 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, * TwoPhaseStateLock will not be taken, so the caller had better hold it. */ ProcNumber -TwoPhaseGetDummyProcNumber(TransactionId xid, bool lock_held) +TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held) { - GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held); return gxact->pgprocno; } /* * TwoPhaseGetDummyProc - * Get the PGPROC that represents a prepared transaction specified by XID + * Get the PGPROC that represents a prepared transaction * * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the * caller had better hold it. 
*/ PGPROC * -TwoPhaseGetDummyProc(TransactionId xid, bool lock_held) +TwoPhaseGetDummyProc(FullTransactionId fxid, bool lock_held) { - GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held); return GetPGProcByNumber(gxact->pgprocno); } @@ -942,10 +947,8 @@ AdjustToFullTransactionId(TransactionId xid) } static inline int -TwoPhaseFilePath(char *path, TransactionId xid) +TwoPhaseFilePath(char *path, FullTransactionId fxid) { - FullTransactionId fxid = AdjustToFullTransactionId(xid); - return snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X%08X", EpochFromFullTransactionId(fxid), XidFromFullTransactionId(fxid)); @@ -1024,7 +1027,7 @@ save_state_data(const void *data, uint32 len) if (padlen > records.bytes_free) { - records.tail->next = palloc0(sizeof(StateFileChunk)); + records.tail->next = palloc0_object(StateFileChunk); records.tail = records.tail->next; records.tail->len = 0; records.tail->next = NULL; @@ -1034,7 +1037,7 @@ save_state_data(const void *data, uint32 len) records.tail->data = palloc(records.bytes_free); } - memcpy(((char *) records.tail->data) + records.tail->len, data, len); + memcpy(records.tail->data + records.tail->len, data, len); records.tail->len += padlen; records.bytes_free -= padlen; records.total_len += padlen; @@ -1049,7 +1052,7 @@ void StartPrepare(GlobalTransaction gxact) { PGPROC *proc = GetPGProcByNumber(gxact->pgprocno); - TransactionId xid = gxact->xid; + TransactionId xid = XidFromFullTransactionId(gxact->fxid); TwoPhaseFileHeader hdr; TransactionId *children; RelFileLocator *commitrels; @@ -1059,7 +1062,7 @@ StartPrepare(GlobalTransaction gxact) SharedInvalidationMessage *invalmsgs; /* Initialize linked list */ - records.head = palloc0(sizeof(StateFileChunk)); + records.head = palloc0_object(StateFileChunk); records.head->len = 0; records.head->next = NULL; @@ -1181,7 +1184,11 @@ EndPrepare(GlobalTransaction gxact) * starting immediately after the WAL record is inserted could complete * without fsync'ing our state file. (This is essentially the same kind * of race condition as the COMMIT-to-clog-write case that - * RecordTransactionCommit uses DELAY_CHKPT_START for; see notes there.) + * RecordTransactionCommit uses DELAY_CHKPT_IN_COMMIT for; see notes + * there.) Note that DELAY_CHKPT_IN_COMMIT is used to find transactions in + * the critical commit section. We need to know about such transactions + * for conflict detection in logical replication. See + * GetOldestActiveTransactionId(true, false) and its use. * * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. @@ -1281,10 +1288,11 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, * If it looks OK (has a valid magic number and CRC), return the palloc'd * contents of the file, issuing an error when finding corrupted data. If * missing_ok is true, which indicates that missing files can be safely - * ignored, then return NULL. This state can be reached when doing recovery. + * ignored, then return NULL. This state can be reached when doing recovery + * after discarding two-phase files from frozen epochs. 
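+ * + * (State file names are built by TwoPhaseFilePath() above from the epoch + * and the xid, each printed as eight hex digits; for example, a transaction + * with epoch 5 and xid 2249 (0x8C9) would be stored as + * pg_twophase/00000005000008C9.)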
*/ static char * -ReadTwoPhaseFile(TransactionId xid, bool missing_ok) +ReadTwoPhaseFile(FullTransactionId fxid, bool missing_ok) { char path[MAXPGPATH]; char *buf; @@ -1296,7 +1304,7 @@ ReadTwoPhaseFile(TransactionId xid, bool missing_ok) file_crc; int r; - TwoPhaseFilePath(path, xid); + TwoPhaseFilePath(path, fxid); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); if (fd < 0) @@ -1426,12 +1434,12 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) if (errormsg) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read two-phase state from WAL at %X/%X: %s", + errmsg("could not read two-phase state from WAL at %X/%08X: %s", LSN_FORMAT_ARGS(lsn), errormsg))); else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read two-phase state from WAL at %X/%X", + errmsg("could not read two-phase state from WAL at %X/%08X", LSN_FORMAT_ARGS(lsn)))); } @@ -1439,13 +1447,13 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) (XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE) ereport(ERROR, (errcode_for_file_access(), - errmsg("expected two-phase state data is not present in WAL at %X/%X", + errmsg("expected two-phase state data is not present in WAL at %X/%08X", LSN_FORMAT_ARGS(lsn)))); if (len != NULL) *len = XLogRecGetDataLen(xlogreader); - *buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader)); + *buf = palloc_array(char, XLogRecGetDataLen(xlogreader)); memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader)); XLogReaderFree(xlogreader); @@ -1461,6 +1469,7 @@ StandbyTransactionIdIsPrepared(TransactionId xid) char *buf; TwoPhaseFileHeader *hdr; bool result; + FullTransactionId fxid; Assert(TransactionIdIsValid(xid)); @@ -1468,7 +1477,8 @@ StandbyTransactionIdIsPrepared(TransactionId xid) return false; /* nothing to do */ /* Read and validate file */ - buf = ReadTwoPhaseFile(xid, true); + fxid = AdjustToFullTransactionId(xid); + buf = ReadTwoPhaseFile(fxid, true); if (buf == NULL) return false; @@ -1488,6 +1498,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) { GlobalTransaction gxact; PGPROC *proc; + FullTransactionId fxid; TransactionId xid; bool ondisk; char *buf; @@ -1509,7 +1520,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit) */ gxact = LockGXact(gid, GetUserId()); proc = GetPGProcByNumber(gxact->pgprocno); - xid = gxact->xid; + fxid = gxact->fxid; + xid = XidFromFullTransactionId(fxid); /* * Read and validate 2PC state data. State data will typically be stored @@ -1517,7 +1529,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * to disk if for some reason they have lived for a long time. */ if (gxact->ondisk) - buf = ReadTwoPhaseFile(xid, false); + buf = ReadTwoPhaseFile(fxid, false); else XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); @@ -1636,11 +1648,11 @@ FinishPreparedTransaction(const char *gid, bool isCommit) /* And now do the callbacks */ if (isCommit) - ProcessRecords(bufptr, xid, twophase_postcommit_callbacks); + ProcessRecords(bufptr, fxid, twophase_postcommit_callbacks); else - ProcessRecords(bufptr, xid, twophase_postabort_callbacks); + ProcessRecords(bufptr, fxid, twophase_postabort_callbacks); - PredicateLockTwoPhaseFinish(xid, isCommit); + PredicateLockTwoPhaseFinish(fxid, isCommit); /* * Read this value while holding the two-phase lock, as the on-disk 2PC @@ -1664,7 +1676,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * And now we can clean up any files we may have left. 
*/ if (ondisk) - RemoveTwoPhaseFile(xid, true); + RemoveTwoPhaseFile(fxid, true); MyLockedGxact = NULL; @@ -1677,7 +1689,7 @@ * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record. */ static void -ProcessRecords(char *bufptr, TransactionId xid, +ProcessRecords(char *bufptr, FullTransactionId fxid, const TwoPhaseCallback callbacks[]) { for (;;) @@ -1691,24 +1703,28 @@ ProcessRecords(char *bufptr, TransactionId xid, bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk)); if (callbacks[record->rmid] != NULL) - callbacks[record->rmid] (xid, record->info, bufptr, record->len); + callbacks[record->rmid] (fxid, record->info, bufptr, record->len); bufptr += MAXALIGN(record->len); } } /* - * Remove the 2PC file for the specified XID. + * Remove the 2PC file. * * If giveWarning is false, do not complain about file-not-present; * this is an expected case during WAL replay. + * + * This routine is used at early stages of recovery, where both future and + * past orphaned files are checked; hence the FullTransactionId, which is + * needed to build the complete file name for removal. */ static void -RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) +RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning) { char path[MAXPGPATH]; - TwoPhaseFilePath(path, xid); + TwoPhaseFilePath(path, fxid); if (unlink(path)) if (errno != ENOENT || giveWarning) ereport(WARNING, @@ -1723,7 +1739,7 @@ RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) * Note: content and len don't include CRC. */ static void -RecreateTwoPhaseFile(TransactionId xid, void *content, int len) +RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len) { char path[MAXPGPATH]; pg_crc32c statefile_crc; @@ -1734,7 +1750,7 @@ COMP_CRC32C(statefile_crc, content, len); FIN_CRC32C(statefile_crc); - TwoPhaseFilePath(path, xid); + TwoPhaseFilePath(path, fxid); fd = OpenTransientFile(path, O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); @@ -1846,7 +1862,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) int len; XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); - RecreateTwoPhaseFile(gxact->xid, buf, len); + RecreateTwoPhaseFile(gxact->fxid, buf, len); gxact->ondisk = true; gxact->prepare_start_lsn = InvalidXLogRecPtr; gxact->prepare_end_lsn = InvalidXLogRecPtr; @@ -1897,19 +1913,17 @@ restoreTwoPhaseData(void) if (strlen(clde->d_name) == 16 && strspn(clde->d_name, "0123456789ABCDEF") == 16) { - TransactionId xid; FullTransactionId fxid; char *buf; fxid = FullTransactionIdFromU64(strtou64(clde->d_name, NULL, 16)); - xid = XidFromFullTransactionId(fxid); - buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr, + buf = ProcessTwoPhaseBuffer(fxid, InvalidXLogRecPtr, true, false, false); if (buf == NULL) continue; - PrepareRedoAdd(buf, InvalidXLogRecPtr, + PrepareRedoAdd(fxid, buf, InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidRepOriginId); } } @@ -1968,9 +1982,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) Assert(gxact->inredo); - xid = gxact->xid; - - buf = ProcessTwoPhaseBuffer(xid, + buf = ProcessTwoPhaseBuffer(gxact->fxid, gxact->prepare_start_lsn, gxact->ondisk, false, true); @@ -1981,6 +1993,7 @@ * OK, we think this file is valid. Incorporate xid into the * running-minimum result.
*/ + xid = XidFromFullTransactionId(gxact->fxid); if (TransactionIdPrecedes(xid, result)) result = xid; @@ -2036,15 +2049,12 @@ StandbyRecoverPreparedTransactions(void) LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { - TransactionId xid; char *buf; GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; Assert(gxact->inredo); - xid = gxact->xid; - - buf = ProcessTwoPhaseBuffer(xid, + buf = ProcessTwoPhaseBuffer(gxact->fxid, gxact->prepare_start_lsn, gxact->ondisk, true, false); if (buf != NULL) @@ -2077,16 +2087,14 @@ RecoverPreparedTransactions(void) LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { - TransactionId xid; char *buf; GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + FullTransactionId fxid = gxact->fxid; char *bufptr; TwoPhaseFileHeader *hdr; TransactionId *subxids; const char *gid; - xid = gxact->xid; - /* * Reconstruct subtrans state for the transaction --- needed because * pg_subtrans is not preserved over a restart. Note that we are @@ -2096,17 +2104,20 @@ RecoverPreparedTransactions(void) * SubTransSetParent has been set before, if the prepared transaction * generated xid assignment records. */ - buf = ProcessTwoPhaseBuffer(xid, + buf = ProcessTwoPhaseBuffer(gxact->fxid, gxact->prepare_start_lsn, gxact->ondisk, true, false); if (buf == NULL) continue; ereport(LOG, - (errmsg("recovering prepared transaction %u from shared memory", xid))); + (errmsg("recovering prepared transaction %u of epoch %u from shared memory", + XidFromFullTransactionId(gxact->fxid), + EpochFromFullTransactionId(gxact->fxid)))); hdr = (TwoPhaseFileHeader *) buf; - Assert(TransactionIdEquals(hdr->xid, xid)); + Assert(TransactionIdEquals(hdr->xid, + XidFromFullTransactionId(gxact->fxid))); bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); gid = (const char *) bufptr; bufptr += MAXALIGN(hdr->gidlen); @@ -2122,7 +2133,7 @@ RecoverPreparedTransactions(void) * Recreate its GXACT and dummy PGPROC. But, check whether it was * added in redo and already has a shmem entry for it. */ - MarkAsPreparingGuts(gxact, xid, gid, + MarkAsPreparingGuts(gxact, gxact->fxid, gid, hdr->prepared_at, hdr->owner, hdr->database); @@ -2137,7 +2148,7 @@ RecoverPreparedTransactions(void) /* * Recover other state (notably locks) using resource managers. */ - ProcessRecords(bufptr, xid, twophase_recover_callbacks); + ProcessRecords(bufptr, fxid, twophase_recover_callbacks); /* * Release locks held by the standby process after we process each @@ -2145,7 +2156,7 @@ RecoverPreparedTransactions(void) * additional locks at any one time. */ if (InHotStandby) - StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids); + StandbyReleaseLockTree(hdr->xid, hdr->nsubxacts, subxids); /* * We're done with recovering this transaction. Clear MyLockedGxact, @@ -2164,7 +2175,7 @@ RecoverPreparedTransactions(void) /* * ProcessTwoPhaseBuffer * - * Given a transaction id, read it either from disk or read it directly + * Given a FullTransactionId, read it either from disk or read it directly * via shmem xlog record pointer using the provided "prepare_start_lsn". * * If setParent is true, set up subtransaction parent linkages. @@ -2173,13 +2184,12 @@ RecoverPreparedTransactions(void) * value scanned. 
*/ static char * -ProcessTwoPhaseBuffer(TransactionId xid, +ProcessTwoPhaseBuffer(FullTransactionId fxid, XLogRecPtr prepare_start_lsn, bool fromdisk, bool setParent, bool setNextXid) { FullTransactionId nextXid = TransamVariables->nextXid; - TransactionId origNextXid = XidFromFullTransactionId(nextXid); TransactionId *subxids; char *buf; TwoPhaseFileHeader *hdr; @@ -2188,44 +2198,49 @@ ProcessTwoPhaseBuffer(TransactionId xid, Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); if (!fromdisk) - Assert(prepare_start_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(prepare_start_lsn)); /* Already processed? */ - if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + if (TransactionIdDidCommit(XidFromFullTransactionId(fxid)) || + TransactionIdDidAbort(XidFromFullTransactionId(fxid))) { if (fromdisk) { ereport(WARNING, - (errmsg("removing stale two-phase state file for transaction %u", - xid))); - RemoveTwoPhaseFile(xid, true); + (errmsg("removing stale two-phase state file for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + RemoveTwoPhaseFile(fxid, true); } else { ereport(WARNING, - (errmsg("removing stale two-phase state from memory for transaction %u", - xid))); - PrepareRedoRemove(xid, true); + (errmsg("removing stale two-phase state from memory for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + PrepareRedoRemoveFull(fxid, true); } return NULL; } /* Reject XID if too new */ - if (TransactionIdFollowsOrEquals(xid, origNextXid)) + if (FullTransactionIdFollowsOrEquals(fxid, nextXid)) { if (fromdisk) { ereport(WARNING, - (errmsg("removing future two-phase state file for transaction %u", - xid))); - RemoveTwoPhaseFile(xid, true); + (errmsg("removing future two-phase state file for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + RemoveTwoPhaseFile(fxid, true); } else { ereport(WARNING, - (errmsg("removing future two-phase state from memory for transaction %u", - xid))); - PrepareRedoRemove(xid, true); + (errmsg("removing future two-phase state from memory for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + PrepareRedoRemoveFull(fxid, true); } return NULL; } @@ -2233,7 +2248,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, if (fromdisk) { /* Read and validate file */ - buf = ReadTwoPhaseFile(xid, false); + buf = ReadTwoPhaseFile(fxid, false); } else { @@ -2243,18 +2258,20 @@ ProcessTwoPhaseBuffer(TransactionId xid, /* Deconstruct header */ hdr = (TwoPhaseFileHeader *) buf; - if (!TransactionIdEquals(hdr->xid, xid)) + if (!TransactionIdEquals(hdr->xid, XidFromFullTransactionId(fxid))) { if (fromdisk) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted two-phase state file for transaction %u", - xid))); + errmsg("corrupted two-phase state file for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); else ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted two-phase state in memory for transaction %u", - xid))); + errmsg("corrupted two-phase state in memory for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); } /* @@ -2268,14 +2285,14 @@ ProcessTwoPhaseBuffer(TransactionId xid, { TransactionId subxid = subxids[i]; - Assert(TransactionIdFollows(subxid, xid)); + Assert(TransactionIdFollows(subxid, 
XidFromFullTransactionId(fxid))); /* update nextXid if needed */ if (setNextXid) AdvanceNextFullTransactionIdPastXid(subxid); if (setParent) - SubTransSetParent(subxid, xid); + SubTransSetParent(subxid, XidFromFullTransactionId(fxid)); } return buf; @@ -2286,7 +2303,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, * RecordTransactionCommitPrepared * * This is basically the same as RecordTransactionCommit (q.v. if you change - * this function): in particular, we must set DELAY_CHKPT_START to avoid a + * this function): in particular, we must set DELAY_CHKPT_IN_COMMIT to avoid a * race condition. * * We know the transaction made at least one XLOG entry (its PREPARE), @@ -2306,7 +2323,7 @@ RecordTransactionCommitPrepared(TransactionId xid, const char *gid) { XLogRecPtr recptr; - TimestampTz committs = GetCurrentTimestamp(); + TimestampTz committs; bool replorigin; /* @@ -2316,11 +2333,32 @@ RecordTransactionCommitPrepared(TransactionId xid, replorigin = (replorigin_session_origin != InvalidRepOriginId && replorigin_session_origin != DoNotReplicateId); + /* Load the injection point before entering the critical section */ + INJECTION_POINT_LOAD("commit-after-delay-checkpoint"); + START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + INJECTION_POINT_CACHED("commit-after-delay-checkpoint", NULL); + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible before + * commit time is written. + */ + pg_write_barrier(); + + /* + * Note it is important to set committs value after marking ourselves as + * in the commit critical section (DELAY_CHKPT_IN_COMMIT). This is because + * we want to ensure all transactions that have acquired commit timestamp + * are finished before we allow the logical replication client to advance + * its xid which is used to hold back dead rows for conflict detection. + * See comments atop worker.c. + */ + committs = GetCurrentTimestamp(); /* * Emit the XLOG commit record. Note that we mark 2PC commits as @@ -2369,7 +2407,7 @@ RecordTransactionCommitPrepared(TransactionId xid, TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); @@ -2466,8 +2504,9 @@ RecordTransactionAbortPrepared(TransactionId xid, * data, the entry is marked as located on disk. */ void -PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, - XLogRecPtr end_lsn, RepOriginId origin_id) +PrepareRedoAdd(FullTransactionId fxid, char *buf, + XLogRecPtr start_lsn, XLogRecPtr end_lsn, + RepOriginId origin_id) { TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf; char *bufptr; @@ -2477,6 +2516,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); Assert(RecoveryInProgress()); + if (!FullTransactionIdIsValid(fxid)) + { + Assert(InRecovery); + fxid = FullTransactionIdFromAllowableAt(TransamVariables->nextXid, + hdr->xid); + } + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); gid = (const char *) bufptr; @@ -2501,18 +2547,19 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, * the record is added to TwoPhaseState and it should have no * corresponding file in pg_twophase. 
*/ - if (!XLogRecPtrIsInvalid(start_lsn)) + if (XLogRecPtrIsValid(start_lsn)) { char path[MAXPGPATH]; - TwoPhaseFilePath(path, hdr->xid); + Assert(InRecovery); + TwoPhaseFilePath(path, fxid); if (access(path, F_OK) == 0) { ereport(reachedConsistency ? ERROR : WARNING, (errmsg("could not recover two-phase state file for transaction %u", hdr->xid), - errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.", + errdetail("Two-phase state file has been found in WAL record %X/%08X, but this transaction has already been restored from disk.", LSN_FORMAT_ARGS(start_lsn)))); return; } @@ -2536,11 +2583,11 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, gxact->prepared_at = hdr->prepared_at; gxact->prepare_start_lsn = start_lsn; gxact->prepare_end_lsn = end_lsn; - gxact->xid = hdr->xid; + gxact->fxid = fxid; gxact->owner = hdr->owner; gxact->locking_backend = INVALID_PROC_NUMBER; gxact->valid = false; - gxact->ondisk = XLogRecPtrIsInvalid(start_lsn); + gxact->ondisk = !XLogRecPtrIsValid(start_lsn); gxact->inredo = true; /* yes, added in redo */ strcpy(gxact->gid, gid); @@ -2555,11 +2602,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, false /* backward */ , false /* WAL */ ); } - elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid); + elog(DEBUG2, "added 2PC data in shared memory for transaction %u of epoch %u", + XidFromFullTransactionId(gxact->fxid), + EpochFromFullTransactionId(gxact->fxid)); } /* - * PrepareRedoRemove + * PrepareRedoRemoveFull * * Remove the corresponding gxact entry from TwoPhaseState. Also remove * the 2PC file if a prepared transaction was saved via an earlier checkpoint. @@ -2567,8 +2616,8 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState * is updated. */ -void -PrepareRedoRemove(TransactionId xid, bool giveWarning) +static void +PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning) { GlobalTransaction gxact = NULL; int i; @@ -2581,7 +2630,7 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) { gxact = TwoPhaseState->prepXacts[i]; - if (gxact->xid == xid) + if (FullTransactionIdEquals(gxact->fxid, fxid)) { Assert(gxact->inredo); found = true; @@ -2598,12 +2647,28 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) /* * And now we can clean up any files we may have left. */ - elog(DEBUG2, "removing 2PC data for transaction %u", xid); + elog(DEBUG2, "removing 2PC data for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)); + if (gxact->ondisk) - RemoveTwoPhaseFile(xid, giveWarning); + RemoveTwoPhaseFile(fxid, giveWarning); + RemoveGXact(gxact); } +/* + * Wrapper around PrepareRedoRemoveFull(), for TransactionIds. + */ +void +PrepareRedoRemove(TransactionId xid, bool giveWarning) +{ + FullTransactionId fxid = + FullTransactionIdFromAllowableAt(TransamVariables->nextXid, xid); + + PrepareRedoRemoveFull(fxid, giveWarning); +} + /* * LookupGXact * Check if the prepared transaction with the given GID, lsn and timestamp @@ -2648,7 +2713,7 @@ LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn, * between publisher and subscriber.
*/ if (gxact->ondisk) - buf = ReadTwoPhaseFile(gxact->xid, false); + buf = ReadTwoPhaseFile(gxact->fxid, false); else { Assert(gxact->prepare_start_lsn); @@ -2750,3 +2815,58 @@ LookupGXactBySubid(Oid subid) return found; } + +/* + * TwoPhaseGetOldestXidInCommit + * Return the oldest transaction ID from prepared transactions that are + * currently in the commit critical section. + * + * This function only considers transactions in the currently connected + * database. If no matching transactions are found, it returns + * InvalidTransactionId. + */ +TransactionId +TwoPhaseGetOldestXidInCommit(void) +{ + TransactionId oldestRunningXid = InvalidTransactionId; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (int i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *commitproc; + TransactionId xid; + + if (!gxact->valid) + continue; + + if (gxact->locking_backend == INVALID_PROC_NUMBER) + continue; + + /* + * Get the backend that is handling the transaction. It's safe to + * access this backend while holding TwoPhaseStateLock, as the backend + * can only be destroyed after either removing or unlocking the + * current global transaction, both of which require an exclusive + * TwoPhaseStateLock. + */ + commitproc = GetPGProcByNumber(gxact->locking_backend); + + if (MyDatabaseId != commitproc->databaseId) + continue; + + if ((commitproc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0) + continue; + + xid = XidFromFullTransactionId(gxact->fxid); + + if (!TransactionIdIsValid(oldestRunningXid) || + TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + } + + LWLockRelease(TwoPhaseStateLock); + + return oldestRunningXid; +} diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index fe895787cb72d..f8c4dada7c93a 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -19,11 +19,11 @@ #include "access/transam.h" #include "access/xact.h" #include "access/xlogutils.h" -#include "commands/dbcommands.h" #include "miscadmin.h" #include "postmaster/autovacuum.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "utils/lsyscache.h" #include "utils/syscache.h" diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index b885513f76541..1b5c1f6b7637c 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -31,6 +31,7 @@ #include "access/xloginsert.h" #include "access/xlogrecovery.h" #include "access/xlogutils.h" +#include "access/xlogwait.h" #include "catalog/index.h" #include "catalog/namespace.h" #include "catalog/pg_enum.h" @@ -551,9 +552,9 @@ MarkCurrentTransactionIdLoggedIfAny(void) * operation in a subtransaction. We require that for logical decoding, see * LogicalDecodingProcessRecord. * - * This returns true if wal_level >= logical and we are inside a valid - * subtransaction, for which the assignment was not yet written to any WAL - * record. + * This returns true if effective_wal_level is logical and we are inside + * a valid subtransaction, for which the assignment was not yet written to + * any WAL record. 
*/ bool IsSubxactTopXidLogPending(void) @@ -562,7 +563,7 @@ IsSubxactTopXidLogPending(void) if (CurrentTransactionState->topXidLogged) return false; - /* wal_level has to be logical */ + /* effective_wal_level has to be logical */ if (!XLogLogicalInfoActive()) return false; @@ -663,7 +664,7 @@ AssignTransactionId(TransactionState s) TransactionState *parents; size_t parentOffset = 0; - parents = palloc(sizeof(TransactionState) * s->nestingLevel); + parents = palloc_array(TransactionState, s->nestingLevel); while (p != NULL && !FullTransactionIdIsValid(p->fullTransactionId)) { parents[parentOffset++] = p; @@ -681,14 +682,14 @@ AssignTransactionId(TransactionState s) } /* - * When wal_level=logical, guarantee that a subtransaction's xid can only - * be seen in the WAL stream if its toplevel xid has been logged before. - * If necessary we log an xact_assignment record with fewer than - * PGPROC_MAX_CACHED_SUBXIDS. Note that it is fine if didLogXid isn't set - * for a transaction even though it appears in a WAL record, we just might - * superfluously log something. That can happen when an xid is included - * somewhere inside a wal record, but not in XLogRecord->xl_xid, like in - * xl_standby_locks. + * When effective_wal_level is logical, guarantee that a subtransaction's + * xid can only be seen in the WAL stream if its toplevel xid has been + * logged before. If necessary we log an xact_assignment record with fewer + * than PGPROC_MAX_CACHED_SUBXIDS. Note that it is fine if didLogXid isn't + * set for a transaction even though it appears in a WAL record, we just + * might superfluously log something. That can happen when an xid is + * included somewhere inside a wal record, but not in XLogRecord->xl_xid, + * like in xl_standby_locks. */ if (isSubXact && XLogLogicalInfoActive() && !TopTransactionStateData.didLogXid) @@ -1431,10 +1432,22 @@ RecordTransactionCommit(void) * without holding the ProcArrayLock, since we're the only one * modifying it. This makes checkpoint's determination of which xacts * are delaying the checkpoint a bit fuzzy, but it doesn't matter. + * + * Note, it is important to get the commit timestamp after marking the + * transaction in the commit critical section. See + * RecordTransactionCommitPrepared. */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); START_CRIT_SECTION(); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + Assert(xactStopTimestamp == 0); + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible + * before commit time is written. + */ + pg_write_barrier(); /* * Insert the commit XLOG record. @@ -1537,7 +1550,7 @@ RecordTransactionCommit(void) */ if (markXidCommitted) { - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); } @@ -2476,6 +2489,7 @@ CommitTransaction(void) AtEOXact_Snapshot(true, false); AtEOXact_ApplyLauncher(true); AtEOXact_LogicalRepWorkers(true); + AtEOXact_LogicalCtl(); pgstat_report_xact_timestamp(0); ResourceOwnerDelete(TopTransactionResourceOwner); @@ -2515,7 +2529,7 @@ static void PrepareTransaction(void) { TransactionState s = CurrentTransactionState; - TransactionId xid = GetCurrentTransactionId(); + FullTransactionId fxid = GetCurrentFullTransactionId(); GlobalTransaction gxact; TimestampTz prepared_at; @@ -2644,7 +2658,7 @@ PrepareTransaction(void) * Reserve the GID for this transaction. 
This could fail if the requested * GID is invalid or already in use. */ - gxact = MarkAsPreparing(xid, prepareGID, prepared_at, + gxact = MarkAsPreparing(fxid, prepareGID, prepared_at, GetUserId(), MyDatabaseId); prepareGID = NULL; @@ -2694,7 +2708,7 @@ PrepareTransaction(void) * ProcArrayClearTransaction(). Otherwise, a GetLockConflicts() would * conclude "xact already committed or aborted" for our locks. */ - PostPrepare_Locks(xid); + PostPrepare_Locks(fxid); /* * Let others know about no transaction in progress by me. This has to be @@ -2738,9 +2752,9 @@ PrepareTransaction(void) PostPrepare_smgr(); - PostPrepare_MultiXact(xid); + PostPrepare_MultiXact(fxid); - PostPrepare_PredicateLocks(xid); + PostPrepare_PredicateLocks(fxid); ResourceOwnerRelease(TopTransactionResourceOwner, RESOURCE_RELEASE_LOCKS, @@ -2771,6 +2785,7 @@ PrepareTransaction(void) /* we treat PREPARE as ROLLBACK so far as waking workers goes */ AtEOXact_ApplyLauncher(false); AtEOXact_LogicalRepWorkers(false); + AtEOXact_LogicalCtl(); pgstat_report_xact_timestamp(0); CurrentResourceOwner = NULL; @@ -2831,6 +2846,11 @@ AbortTransaction(void) */ LWLockReleaseAll(); + /* + * Cleanup waiting for LSN if any. + */ + WaitLSNCleanup(); + /* Clear wait information and command progress indicator */ pgstat_report_wait_end(); pgstat_progress_end_command(); @@ -2993,6 +3013,7 @@ AbortTransaction(void) AtEOXact_PgStat(false, is_parallel_worker); AtEOXact_ApplyLauncher(false); AtEOXact_LogicalRepWorkers(false); + AtEOXact_LogicalCtl(); pgstat_report_xact_timestamp(0); } @@ -4523,13 +4544,13 @@ ReleaseSavepoint(const char *name) break; } - for (target = s; PointerIsValid(target); target = target->parent) + for (target = s; target; target = target->parent) { - if (PointerIsValid(target->name) && strcmp(target->name, name) == 0) + if (target->name && strcmp(target->name, name) == 0) break; } - if (!PointerIsValid(target)) + if (!target) ereport(ERROR, (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), errmsg("savepoint \"%s\" does not exist", name))); @@ -4553,7 +4574,7 @@ ReleaseSavepoint(const char *name) if (xact == target) break; xact = xact->parent; - Assert(PointerIsValid(xact)); + Assert(xact); } } @@ -4632,13 +4653,13 @@ RollbackToSavepoint(const char *name) break; } - for (target = s; PointerIsValid(target); target = target->parent) + for (target = s; target; target = target->parent) { - if (PointerIsValid(target->name) && strcmp(target->name, name) == 0) + if (target->name && strcmp(target->name, name) == 0) break; } - if (!PointerIsValid(target)) + if (!target) ereport(ERROR, (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), errmsg("savepoint \"%s\" does not exist", name))); @@ -4667,7 +4688,7 @@ RollbackToSavepoint(const char *name) elog(FATAL, "RollbackToSavepoint: unexpected state %s", BlockStateAsString(xact->blockState)); xact = xact->parent; - Assert(PointerIsValid(xact)); + Assert(xact); } /* And mark the target as "restart pending" */ @@ -5688,12 +5709,12 @@ ShowTransactionStateRec(const char *str, TransactionState s) ereport(DEBUG5, (errmsg_internal("%s(%d) name: %s; blockState: %s; state: %s, xid/subid/cid: %u/%u/%u%s%s", str, s->nestingLevel, - PointerIsValid(s->name) ? s->name : "unnamed", + s->name ? 
s->name : "unnamed", BlockStateAsString(s->blockState), TransStateAsString(s->state), - (unsigned int) XidFromFullTransactionId(s->fullTransactionId), - (unsigned int) s->subTransactionId, - (unsigned int) currentCommandId, + XidFromFullTransactionId(s->fullTransactionId), + s->subTransactionId, + currentCommandId, currentCommandIdUsed ? " (used)" : "", buf.data))); pfree(buf.data); @@ -6420,7 +6441,8 @@ xact_redo(XLogReaderState *record) * gxact entry. */ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); - PrepareRedoAdd(XLogRecGetData(record), + PrepareRedoAdd(InvalidFullTransactionId, + XLogRecGetData(record), record->ReadRecPtr, record->EndRecPtr, XLogRecGetOrigin(record)); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1914859b2eed7..1b7ef589fc097 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -62,6 +62,7 @@ #include "access/xlogreader.h" #include "access/xlogrecovery.h" #include "access/xlogutils.h" +#include "access/xlogwait.h" #include "backup/basebackup.h" #include "catalog/catversion.h" #include "catalog/pg_control.h" @@ -79,6 +80,7 @@ #include "postmaster/walwriter.h" #include "replication/origin.h" #include "replication/slot.h" +#include "replication/slotsync.h" #include "replication/snapbuild.h" #include "replication/walreceiver.h" #include "replication/walsender.h" @@ -96,6 +98,7 @@ #include "utils/guc_hooks.h" #include "utils/guc_tables.h" #include "utils/injection_point.h" +#include "utils/pgstat_internal.h" #include "utils/ps_status.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" @@ -302,6 +305,11 @@ static bool doPageWrites; * so it's a plain spinlock. The other locks are held longer (potentially * over I/O operations), so we use LWLocks for them. These locks are: * + * WALBufMappingLock: must be held to replace a page in the WAL buffer cache. + * It is only held while initializing and changing the mapping. If the + * contents of the buffer being replaced haven't been written yet, the mapping + * lock is released while the write is done, and reacquired afterwards. + * * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or * XLogFlush). * @@ -449,7 +457,6 @@ typedef struct XLogCtlData /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ - FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ @@ -468,37 +475,21 @@ typedef struct XLogCtlData pg_atomic_uint64 logFlushResult; /* last byte + 1 flushed */ /* - * First initialized page in the cache (first byte position). - */ - XLogRecPtr InitializedFrom; - - /* - * Latest reserved for initialization page in the cache (last byte - * position + 1). + * Latest initialized page in the cache (last byte position + 1). * - * To change the identity of a buffer, you need to advance - * InitializeReserved first. To change the identity of a buffer that's + * To change the identity of a buffer (and InitializedUpTo), you need to + * hold WALBufMappingLock. To change the identity of a buffer that's * still dirty, the old page needs to be written out first, and for that * you need WALWriteLock, and you need to ensure that there are no * in-progress insertions to the page by calling * WaitXLogInsertionsToFinish(). */ - pg_atomic_uint64 InitializeReserved; - - /* - * Latest initialized page in the cache (last byte position + 1). 
- * - * InitializedUpTo is updated after the buffer initialization. After - * update, waiters got notification using InitializedUpToCondVar. - */ - pg_atomic_uint64 InitializedUpTo; - ConditionVariable InitializedUpToCondVar; + XLogRecPtr InitializedUpTo; /* * These values do not change after startup, although the pointed-to pages - * and xlblocks values certainly do. xlblocks values are changed - * lock-free according to the check for the xlog write position and are - * accompanied by changes of InitializeReserved and InitializedUpTo. + * and xlblocks values certainly do. xlblocks values are protected by + * WALBufMappingLock. */ char *pages; /* buffers for unwritten XLOG pages */ pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */ @@ -703,7 +694,7 @@ static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version); static void WriteControlFile(void); static void ReadControlFile(void); static void UpdateControlFile(void); -static char *str_time(pg_time_t tnow); +static char *str_time(pg_time_t tnow, char *buf, size_t bufsize); static int get_sync_bit(int method); @@ -760,6 +751,7 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn, uint8 flags, int num_fpi, + uint64 fpi_bytes, bool topxid_included) { XLogCtlInsert *Insert = &XLogCtl->Insert; @@ -821,9 +813,9 @@ XLogInsertRecord(XLogRecData *rdata, * fullPageWrites from changing until the insertion is finished. * * Step 2 can usually be done completely in parallel. If the required WAL - * page is not initialized yet, you have to go through AdvanceXLInsertBuffer, - * which will ensure it is initialized. But the WAL writer tries to do that - * ahead of insertions to avoid that from happening in the critical path. + * page is not initialized yet, you have to grab WALBufMappingLock to + * initialize it, but the WAL writer tries to do that ahead of insertions + * to avoid that from happening in the critical path. * *---------- */ @@ -858,7 +850,7 @@ XLogInsertRecord(XLogRecData *rdata, if (doPageWrites && (!prevDoPageWrites || - (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr))) + (XLogRecPtrIsValid(fpw_lsn) && fpw_lsn <= RedoRecPtr))) { /* * Oops, some buffer now needs to be backed up that the caller @@ -892,7 +884,7 @@ XLogInsertRecord(XLogRecData *rdata, * Those checks are only needed for records that can contain buffer * references, and an XLOG_SWITCH record never does. */ - Assert(fpw_lsn == InvalidXLogRecPtr); + Assert(!XLogRecPtrIsValid(fpw_lsn)); WALInsertLockAcquireExclusive(); inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev); } @@ -907,7 +899,7 @@ XLogInsertRecord(XLogRecData *rdata, * not check RedoRecPtr before inserting the record; we just need to * update it afterwards. 
*/ - Assert(fpw_lsn == InvalidXLogRecPtr); + Assert(!XLogRecPtrIsValid(fpw_lsn)); WALInsertLockAcquireExclusive(); ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos, &rechdr->xl_prev); @@ -1028,7 +1020,7 @@ XLogInsertRecord(XLogRecData *rdata, oldCxt = MemoryContextSwitchTo(walDebugCxt); initStringInfo(&buf); - appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos)); + appendStringInfo(&buf, "INSERT @ %X/%08X: ", LSN_FORMAT_ARGS(EndPos)); /* * We have to piece together the WAL record data from the XLogRecData @@ -1092,6 +1084,10 @@ XLogInsertRecord(XLogRecData *rdata, pgWalUsage.wal_bytes += rechdr->xl_tot_len; pgWalUsage.wal_records++; pgWalUsage.wal_fpi += num_fpi; + pgWalUsage.wal_fpi_bytes += fpi_bytes; + + /* Required for the flush of pending stats WAL data */ + pgstat_report_fixed = true; } return EndPos; @@ -1549,8 +1545,8 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto) if (upto > reservedUpto) { ereport(LOG, - (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X", - LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)))); + errmsg("request to flush past end of generated WAL; request %X/%08X, current position %X/%08X", + LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))); upto = reservedUpto; } @@ -1608,7 +1604,7 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto) */ } while (insertingat < upto); - if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto) + if (XLogRecPtrIsValid(insertingat) && insertingat < finishedUpto) finishedUpto = insertingat; } @@ -1716,7 +1712,7 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]); if (expectedEndPtr != endptr) - elog(PANIC, "could not find WAL buffer for %X/%X", + elog(PANIC, "could not find WAL buffer for %X/%08X", LSN_FORMAT_ARGS(ptr)); } else @@ -1767,7 +1763,7 @@ WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count, if (RecoveryInProgress() || tli != GetWALInsertionTimeLine()) return 0; - Assert(!XLogRecPtrIsInvalid(startptr)); + Assert(XLogRecPtrIsValid(startptr)); /* * Caller should ensure that the requested data has been inserted into WAL @@ -1776,7 +1772,7 @@ WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count, inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult); if (startptr + count > inserted) ereport(ERROR, - errmsg("cannot read past end of generated WAL: requested %X/%X, current position %X/%X", + errmsg("cannot read past end of generated WAL: requested %X/%08X, current position %X/%08X", LSN_FORMAT_ARGS(startptr + count), LSN_FORMAT_ARGS(inserted))); @@ -2002,79 +1998,32 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr; XLogRecPtr NewPageBeginPtr; XLogPageHeader NewPage; - XLogRecPtr ReservedPtr; int npages pg_attribute_unused() = 0; - /* - * We must run the loop below inside the critical section as we expect - * XLogCtl->InitializedUpTo to eventually keep up. The most of callers - * already run inside the critical section. Except for WAL writer, which - * passed 'opportunistic == true', and therefore we don't perform - * operations that could error out. - * - * Start an explicit critical section anyway though. - */ - Assert(CritSectionCount > 0 || opportunistic); - START_CRIT_SECTION(); + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); - /*-- - * Loop till we get all the pages in WAL buffer before 'upto' reserved for - * initialization. 
Multiple process can initialize different buffers with - * this loop in parallel as following. - * - * 1. Reserve page for initialization using XLogCtl->InitializeReserved. - * 2. Initialize the reserved page. - * 3. Attempt to advance XLogCtl->InitializedUpTo, + /* + * Now that we have the lock, check if someone initialized the page + * already. */ - ReservedPtr = pg_atomic_read_u64(&XLogCtl->InitializeReserved); - while (upto >= ReservedPtr || opportunistic) + while (upto >= XLogCtl->InitializedUpTo || opportunistic) { - Assert(ReservedPtr % XLOG_BLCKSZ == 0); + nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo); /* - * Get ending-offset of the buffer page we need to replace. - * - * We don't lookup into xlblocks, but rather calculate position we - * must wait to be written. If it was written, xlblocks will have this - * position (or uninitialized) + * Get ending-offset of the buffer page we need to replace (this may + * be zero if the buffer hasn't been used yet). Fall through if it's + * already written out. */ - if (ReservedPtr + XLOG_BLCKSZ > XLogCtl->InitializedFrom + XLOG_BLCKSZ * XLOGbuffers) - OldPageRqstPtr = ReservedPtr + XLOG_BLCKSZ - (XLogRecPtr) XLOG_BLCKSZ * XLOGbuffers; - else - OldPageRqstPtr = InvalidXLogRecPtr; - - if (LogwrtResult.Write < OldPageRqstPtr && opportunistic) + OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]); + if (LogwrtResult.Write < OldPageRqstPtr) { /* - * If we just want to pre-initialize as much as we can without - * flushing, give up now. + * Nope, got work to do. If we just want to pre-initialize as much + * as we can without flushing, give up now. */ - upto = ReservedPtr - 1; - break; - } - - /* - * Attempt to reserve the page for initialization. Failure means that - * this page got reserved by another process. - */ - if (!pg_atomic_compare_exchange_u64(&XLogCtl->InitializeReserved, - &ReservedPtr, - ReservedPtr + XLOG_BLCKSZ)) - continue; - - /* - * Wait till page gets correctly initialized up to OldPageRqstPtr. - */ - nextidx = XLogRecPtrToBufIdx(ReservedPtr); - while (pg_atomic_read_u64(&XLogCtl->InitializedUpTo) < OldPageRqstPtr) - ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT); - ConditionVariableCancelSleep(); - Assert(pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) == OldPageRqstPtr); - - /* Fall through if it's already written out. */ - if (LogwrtResult.Write < OldPageRqstPtr) - { - /* Nope, got work to do. */ + if (opportunistic) + break; /* Advance shared memory write request position */ SpinLockAcquire(&XLogCtl->info_lck); @@ -2089,6 +2038,14 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) RefreshXLogWriteResult(LogwrtResult); if (LogwrtResult.Write < OldPageRqstPtr) { + /* + * Must acquire write lock. Release WALBufMappingLock first, + * to make sure that all insertions that we need to wait for + * can finish (up to this same position). Otherwise we risk + * deadlock. + */ + LWLockRelease(WALBufMappingLock); + WaitXLogInsertionsToFinish(OldPageRqstPtr); LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); @@ -2109,7 +2066,16 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) LWLockRelease(WALWriteLock); pgWalUsage.wal_buffers_full++; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + + /* + * Required for the flush of pending stats WAL data, per + * update of pgWalUsage. 
+ */ + pgstat_report_fixed = true; } + /* Re-acquire WALBufMappingLock and retry */ + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + continue; } } @@ -2117,9 +2083,11 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) * Now the next buffer slot is free and we can set it up to be the * next output page. */ - NewPageBeginPtr = ReservedPtr; + NewPageBeginPtr = XLogCtl->InitializedUpTo; NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ; + Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx); + NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ); /* @@ -2183,105 +2151,17 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) */ pg_write_barrier(); - /*----- - * Update the value of XLogCtl->xlblocks[nextidx] and try to advance - * XLogCtl->InitializedUpTo in a lock-less manner. - * - * First, let's provide a formal proof of the algorithm. Let it be 'n' - * process with the following variables in shared memory: - * f - an array of 'n' boolean flags, - * v - atomic integer variable. - * - * Also, let - * i - a number of a process, - * j - local integer variable, - * CAS(var, oldval, newval) - compare-and-swap atomic operation - * returning true on success, - * write_barrier()/read_barrier() - memory barriers. - * - * The pseudocode for each process is the following. - * - * j := i - * f[i] := true - * write_barrier() - * while CAS(v, j, j + 1): - * j := j + 1 - * read_barrier() - * if not f[j]: - * break - * - * Let's prove that v eventually reaches the value of n. - * 1. Prove by contradiction. Assume v doesn't reach n and stucks - * on k, where k < n. - * 2. Process k attempts CAS(v, k, k + 1). 1). If, as we assumed, v - * gets stuck at k, then this CAS operation must fail. Therefore, - * v < k when process k attempts CAS(v, k, k + 1). - * 3. If, as we assumed, v gets stuck at k, then the value k of v - * must be achieved by some process m, where m < k. The process - * m must observe f[k] == false. Otherwise, it will later attempt - * CAS(v, k, k + 1) with success. - * 4. Therefore, corresponding read_barrier() (while j == k) on - * process m reached before write_barrier() of process k. But then - * process k attempts CAS(v, k, k + 1) after process m successfully - * incremented v to k, and that CAS operation must succeed. - * That leads to a contradiction. So, there is no such k (k < n) - * where v gets stuck. Q.E.D. - * - * To apply this proof to the code below, we assume - * XLogCtl->InitializedUpTo will play the role of v with XLOG_BLCKSZ - * granularity. We also assume setting XLogCtl->xlblocks[nextidx] to - * NewPageEndPtr to play the role of setting f[i] to true. Also, note - * that processes can't concurrently map different xlog locations to - * the same nextidx because we previously requested that - * XLogCtl->InitializedUpTo >= OldPageRqstPtr. So, a xlog buffer can - * be taken for initialization only once the previous initialization - * takes effect on XLogCtl->InitializedUpTo. - */ - pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr); - - pg_write_barrier(); - - while (pg_atomic_compare_exchange_u64(&XLogCtl->InitializedUpTo, &NewPageBeginPtr, NewPageEndPtr)) - { - NewPageBeginPtr = NewPageEndPtr; - NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ; - nextidx = XLogRecPtrToBufIdx(NewPageBeginPtr); - - pg_read_barrier(); - - if (pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) != NewPageEndPtr) - { - /* - * Page at nextidx wasn't initialized yet, so we can't move - * InitializedUpto further. 
It will be moved by backend which - * will initialize nextidx. - */ - ConditionVariableBroadcast(&XLogCtl->InitializedUpToCondVar); - break; - } - } + XLogCtl->InitializedUpTo = NewPageEndPtr; npages++; } - - END_CRIT_SECTION(); - - /* - * All the pages in WAL buffer before 'upto' were reserved for - * initialization. However, some pages might be reserved by concurrent - * processes. Wait till they finish initialization. - */ - while (upto >= pg_atomic_read_u64(&XLogCtl->InitializedUpTo)) - ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT); - ConditionVariableCancelSleep(); - - pg_read_barrier(); + LWLockRelease(WALBufMappingLock); #ifdef WAL_DEBUG if (XLOG_DEBUG && npages > 0) { - elog(DEBUG1, "initialized %d pages, up to %X/%X", + elog(DEBUG1, "initialized %d pages, up to %X/%08X", npages, LSN_FORMAT_ARGS(NewPageEndPtr)); } #endif @@ -2346,25 +2226,6 @@ check_wal_segment_size(int *newval, void **extra, GucSource source) return true; } -/* - * GUC check_hook for max_slot_wal_keep_size - * - * We don't allow the value of max_slot_wal_keep_size other than -1 during the - * binary upgrade. See start_postmaster() in pg_upgrade for more details. - */ -bool -check_max_slot_wal_keep_size(int *newval, void **extra, GucSource source) -{ - if (IsBinaryUpgrade && *newval != -1) - { - GUC_check_errdetail("\"%s\" must be set to -1 during binary upgrade mode.", - "max_slot_wal_keep_size"); - return false; - } - - return true; -} - /* * At a checkpoint, how many WAL segments to recycle as preallocated future * XLOG segments? Returns the highest segment that should be preallocated. @@ -2492,7 +2353,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) XLogRecPtr EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]); if (LogwrtResult.Write >= EndPtr) - elog(PANIC, "xlog write request %X/%X is past end of log %X/%X", + elog(PANIC, "xlog write request %X/%08X is past end of log %X/%08X", LSN_FORMAT_ARGS(LogwrtResult.Write), LSN_FORMAT_ARGS(EndPtr)); @@ -2857,7 +2718,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) * available is replayed in this case. This also saves from extra locks * taken on the control file from the startup process. 
*/ - if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery) + if (!XLogRecPtrIsValid(LocalMinRecoveryPoint) && InRecovery) { updateMinRecoveryPoint = false; return; @@ -2869,7 +2730,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; - if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint)) + if (!XLogRecPtrIsValid(LocalMinRecoveryPoint)) updateMinRecoveryPoint = false; else if (force || LocalMinRecoveryPoint < lsn) { @@ -2892,7 +2753,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI); if (!force && newMinRecoveryPoint < lsn) elog(WARNING, - "xlog min recovery request %X/%X is past current point %X/%X", + "xlog min recovery request %X/%08X is past current point %X/%08X", LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint)); /* update control file */ @@ -2905,9 +2766,9 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) LocalMinRecoveryPointTLI = newMinRecoveryPointTLI; ereport(DEBUG2, - (errmsg_internal("updated min recovery point to %X/%X on timeline %u", - LSN_FORMAT_ARGS(newMinRecoveryPoint), - newMinRecoveryPointTLI))); + errmsg_internal("updated min recovery point to %X/%08X on timeline %u", + LSN_FORMAT_ARGS(newMinRecoveryPoint), + newMinRecoveryPointTLI)); } } LWLockRelease(ControlFileLock); @@ -2945,7 +2806,7 @@ XLogFlush(XLogRecPtr record) #ifdef WAL_DEBUG if (XLOG_DEBUG) - elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X", + elog(LOG, "xlog flush request %X/%08X; write %X/%08X; flush %X/%08X", LSN_FORMAT_ARGS(record), LSN_FORMAT_ARGS(LogwrtResult.Write), LSN_FORMAT_ARGS(LogwrtResult.Flush)); @@ -3024,7 +2885,9 @@ XLogFlush(XLogRecPtr record) if (CommitDelay > 0 && enableFsync && MinimumActiveBackends(CommitSiblings)) { + pgstat_report_wait_start(WAIT_EVENT_COMMIT_DELAY); pg_usleep(CommitDelay); + pgstat_report_wait_end(); /* * Re-check how far we can now flush the WAL. It's generally not @@ -3078,9 +2941,16 @@ XLogFlush(XLogRecPtr record) */ if (LogwrtResult.Flush < record) elog(ERROR, - "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", + "xlog flush request %X/%08X is not satisfied --- flushed only to %X/%08X", LSN_FORMAT_ARGS(record), LSN_FORMAT_ARGS(LogwrtResult.Flush)); + + /* + * Cross-check XLogNeedsFlush(). Some of the checks of XLogFlush() and + * XLogNeedsFlush() are duplicated, and this assertion ensures that these + * remain consistent. + */ + Assert(!XLogNeedsFlush(record)); } /* @@ -3205,7 +3075,7 @@ XLogBackgroundFlush(void) #ifdef WAL_DEBUG if (XLOG_DEBUG) - elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X", + elog(LOG, "xlog bg flush request write %X/%08X; flush: %X/%08X, current is write %X/%08X; flush %X/%08X", LSN_FORMAT_ARGS(WriteRqst.Write), LSN_FORMAT_ARGS(WriteRqst.Flush), LSN_FORMAT_ARGS(LogwrtResult.Write), @@ -3245,10 +3115,16 @@ XLogBackgroundFlush(void) } /* - * Test whether XLOG data has been flushed up to (at least) the given position. + * Test whether XLOG data has been flushed up to (at least) the given + * position, or whether the minimum recovery point has been updated past + * the given position. + * + * Returns true if a flush is still needed, or if the minimum recovery point + * must be updated. * - * Returns true if a flush is still needed. (It may be that someone else - * is already in process of flushing that far, however.) 
+ * It is possible that someone else is already in the process of flushing + * that far, or has updated the minimum recovery point up to the given + * position. */ bool XLogNeedsFlush(XLogRecPtr record) @@ -3257,9 +3133,17 @@ XLogNeedsFlush(XLogRecPtr record) * During recovery, we don't flush WAL but update minRecoveryPoint * instead. So "needs flush" is taken to mean whether minRecoveryPoint * would need to be updated. + * + * Using XLogInsertAllowed() rather than RecoveryInProgress() matters for + * the case of an end-of-recovery checkpoint, where WAL data is flushed. + * This check should be consistent with the one in XLogFlush(). */ - if (RecoveryInProgress()) + if (!XLogInsertAllowed()) { + /* Quick exit if already known to be updated or cannot be updated */ + if (!updateMinRecoveryPoint || record <= LocalMinRecoveryPoint) + return false; + /* * An invalid minRecoveryPoint means that we need to recover all the * WAL, i.e., we're doing crash recovery. We never modify the control @@ -3268,12 +3152,11 @@ XLogNeedsFlush(XLogRecPtr record) * which cannot update its local copy of minRecoveryPoint as long as * it has not replayed all WAL available when doing crash recovery. */ - if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery) + if (!XLogRecPtrIsValid(LocalMinRecoveryPoint) && InRecovery) + { updateMinRecoveryPoint = false; - - /* Quick exit if already known to be updated or cannot be updated */ - if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint) return false; + } /* * Update local copy of minRecoveryPoint. But if the lock is busy, @@ -3290,7 +3173,7 @@ XLogNeedsFlush(XLogRecPtr record) * process doing crash recovery, which should not update the control * file value if crash recovery is still running. */ - if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint)) + if (!XLogRecPtrIsValid(LocalMinRecoveryPoint)) updateMinRecoveryPoint = false; /* check again */ @@ -4391,6 +4274,7 @@ WriteControlFile(void) ControlFile->blcksz = BLCKSZ; ControlFile->relseg_size = RELSEG_SIZE; + ControlFile->slru_pages_per_segment = SLRU_PAGES_PER_SEGMENT; ControlFile->xlog_blcksz = XLOG_BLCKSZ; ControlFile->xlog_seg_size = wal_segment_size; @@ -4400,7 +4284,7 @@ WriteControlFile(void) ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE; ControlFile->loblksize = LOBLKSIZE; - ControlFile->float8ByVal = FLOAT8PASSBYVAL; + ControlFile->float8ByVal = true; /* vestigial */ /* * Initialize the default 'char' signedness. 
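The reordered recovery branch of XLogNeedsFlush() above is easier to follow in isolation. Below is a minimal sketch, not the real function: the names are simplified stand-ins, locking and the cache refresh are omitted, and InvalidXLogRecPtr is assumed to be zero. It shows the point of the change: the cheap "already updated or cannot be updated" test now runs before the crash-recovery special case, so the common path returns immediately.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;
#define InvalidXLogRecPtr ((XLogRecPtr) 0)

/* Backend-local cache, mirroring updateMinRecoveryPoint and
 * LocalMinRecoveryPoint from xlog.c (values here are made up). */
static bool updateMinRecoveryPoint = true;
static XLogRecPtr LocalMinRecoveryPoint = 0x1000;
static bool InRecovery = true;

static bool
needs_min_recovery_update(XLogRecPtr record)
{
	/* Quick exit if already known to be updated or cannot be updated. */
	if (!updateMinRecoveryPoint || record <= LocalMinRecoveryPoint)
		return false;

	/* Crash recovery: never update the control file's copy. */
	if (LocalMinRecoveryPoint == InvalidXLogRecPtr && InRecovery)
	{
		updateMinRecoveryPoint = false;
		return false;
	}

	return true;		/* the real code refreshes the cache and re-checks */
}

int
main(void)
{
	printf("%d %d\n",
		   needs_min_recovery_update((XLogRecPtr) 0x0800),
		   needs_min_recovery_update((XLogRecPtr) 0x2000));
	return 0;
}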
@@ -4610,6 +4494,16 @@ ReadControlFile(void) "RELSEG_SIZE", ControlFile->relseg_size, "RELSEG_SIZE", RELSEG_SIZE), errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->slru_pages_per_segment != SLRU_PAGES_PER_SEGMENT) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("database files are incompatible with server"), + /* translator: %s is a variable name and %d is its value */ + errdetail("The database cluster was initialized with %s %d," + " but the server was compiled with %s %d.", + "SLRU_PAGES_PER_SEGMENT", ControlFile->slru_pages_per_segment, + "SLRU_PAGES_PER_SEGMENT", SLRU_PAGES_PER_SEGMENT), + errhint("It looks like you need to recompile or initdb."))); if (ControlFile->xlog_blcksz != XLOG_BLCKSZ) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -4661,23 +4555,7 @@ ReadControlFile(void) "LOBLKSIZE", (int) LOBLKSIZE), errhint("It looks like you need to recompile or initdb."))); -#ifdef USE_FLOAT8_BYVAL - if (ControlFile->float8ByVal != true) - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("database files are incompatible with server"), - errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL" - " but the server was compiled with USE_FLOAT8_BYVAL."), - errhint("It looks like you need to recompile or initdb."))); -#else - if (ControlFile->float8ByVal != false) - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("database files are incompatible with server"), - errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL" - " but the server was compiled without USE_FLOAT8_BYVAL."), - errhint("It looks like you need to recompile or initdb."))); -#endif + Assert(ControlFile->float8ByVal); /* vestigial, not worth an error msg */ wal_segment_size = ControlFile->xlog_seg_size; @@ -5011,6 +4889,25 @@ show_in_hot_standby(void) return RecoveryInProgress() ? "on" : "off"; } +/* + * GUC show_hook for effective_wal_level + */ +const char * +show_effective_wal_level(void) +{ + if (wal_level == WAL_LEVEL_MINIMAL) + return "minimal"; + + /* + * During recovery, effective_wal_level reflects the primary's + * configuration rather than the local wal_level value. + */ + if (RecoveryInProgress()) + return IsXLogLogicalInfoEnabled() ? "logical" : "replica"; + + return XLogLogicalInfoActive() ? "logical" : "replica"; +} + /* * Read the control file, set respective GUCs. 
* @@ -5027,7 +4924,7 @@ void LocalProcessControlFile(bool reset) { Assert(reset || ControlFile == NULL); - ControlFile = palloc(sizeof(ControlFileData)); + ControlFile = palloc_object(ControlFileData); ReadControlFile(); } @@ -5204,10 +5101,6 @@ XLOGShmemInit(void) pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr); pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr); pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr); - - pg_atomic_init_u64(&XLogCtl->InitializeReserved, InvalidXLogRecPtr); - pg_atomic_init_u64(&XLogCtl->InitializedUpTo, InvalidXLogRecPtr); - ConditionVariableInit(&XLogCtl->InitializedUpToCondVar); } /* @@ -5218,7 +5111,7 @@ void BootStrapXLOG(uint32 data_checksum_version) { CheckPoint checkPoint; - char *buffer; + PGAlignedXLogBlock buffer; XLogPageHeader page; XLogLongPageHeader longpage; XLogRecord *record; @@ -5247,10 +5140,8 @@ BootStrapXLOG(uint32 data_checksum_version) sysidentifier |= ((uint64) tv.tv_usec) << 12; sysidentifier |= getpid() & 0xFFF; - /* page buffer must be aligned suitably for O_DIRECT */ - buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ); - page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer); - memset(page, 0, XLOG_BLCKSZ); + memset(&buffer, 0, sizeof buffer); + page = (XLogPageHeader) &buffer; /* * Set up information for the initial checkpoint record @@ -5263,12 +5154,13 @@ BootStrapXLOG(uint32 data_checksum_version) checkPoint.ThisTimeLineID = BootstrapTimeLineID; checkPoint.PrevTimeLineID = BootstrapTimeLineID; checkPoint.fullPageWrites = fullPageWrites; + checkPoint.logicalDecodingEnabled = (wal_level == WAL_LEVEL_LOGICAL); checkPoint.wal_level = wal_level; checkPoint.nextXid = FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstGenbkiObjectId; checkPoint.nextMulti = FirstMultiXactId; - checkPoint.nextMultiOffset = 0; + checkPoint.nextMultiOffset = 1; checkPoint.oldestXid = FirstNormalTransactionId; checkPoint.oldestXidDB = Template1DbOid; checkPoint.oldestMulti = FirstMultiXactId; @@ -5284,7 +5176,7 @@ BootStrapXLOG(uint32 data_checksum_version) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); /* Set up the XLOG page header */ @@ -5331,7 +5223,7 @@ BootStrapXLOG(uint32 data_checksum_version) /* Write the first page with the initial record */ errno = 0; pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE); - if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ) + if (write(openLogFile, &buffer, XLOG_BLCKSZ) != XLOG_BLCKSZ) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) @@ -5371,8 +5263,6 @@ BootStrapXLOG(uint32 data_checksum_version) BootStrapSUBTRANS(); BootStrapMultiXact(); - pfree(buffer); - /* * Force control file to be read - in contrast to normal processing we'd * otherwise never run the checks and GUC related initializations therein. 
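BootStrapXLOG() above now declares a PGAlignedXLogBlock on the stack instead of palloc'ing twice the block size and rounding the pointer up with TYPEALIGN. The sketch below is a hypothetical stand-in for that union (the real definition lives in c.h and uses pg_attribute_aligned where available): an alignment specifier on the data member makes every instance of the type suitably aligned, so the over-allocate-and-round dance disappears.

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

#define XLOG_BLCKSZ 8192
#define IO_ALIGN_SIZE 4096		/* assumed stand-in for PG_IO_ALIGN_SIZE */

/* Hypothetical stand-in for PGAlignedXLogBlock: the alignment specifier
 * on the data member forces the whole union, stack or static, onto a
 * boundary suitable for direct I/O. */
typedef union
{
	alignas(IO_ALIGN_SIZE) char data[XLOG_BLCKSZ];
	double		force_align_d;
	int64_t		force_align_i64;
} AlignedXLogBlock;

int
main(void)
{
	AlignedXLogBlock buffer = {0};

	printf("misalignment = %zu, size = %zu\n",
		   (size_t) ((uintptr_t) buffer.data % IO_ALIGN_SIZE),
		   sizeof(buffer));
	return 0;
}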
@@ -5381,11 +5271,9 @@ BootStrapXLOG(uint32 data_checksum_version) } static char * -str_time(pg_time_t tnow) +str_time(pg_time_t tnow, char *buf, size_t bufsize) { - char *buf = palloc(128); - - pg_strftime(buf, 128, + pg_strftime(buf, bufsize, "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&tnow, log_timezone)); @@ -5628,6 +5516,7 @@ StartupXLOG(void) XLogRecPtr missingContrecPtr; TransactionId oldestActiveXID; bool promoted = false; + char timebuf[128]; /* * We should have an aux process resource owner to use, and we should not @@ -5656,25 +5545,29 @@ StartupXLOG(void) */ ereport(IsPostmasterEnvironment ? LOG : NOTICE, (errmsg("database system was shut down at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_SHUTDOWNED_IN_RECOVERY: ereport(LOG, (errmsg("database system was shut down in recovery at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_SHUTDOWNING: ereport(LOG, (errmsg("database system shutdown was interrupted; last known up at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_IN_CRASH_RECOVERY: ereport(LOG, (errmsg("database system was interrupted while in recovery at %s", - str_time(ControlFile->time)), + str_time(ControlFile->time, + timebuf, sizeof(timebuf))), errhint("This probably means that some data is corrupted and" " you will have to use the last backup for recovery."))); break; @@ -5682,7 +5575,8 @@ StartupXLOG(void) case DB_IN_ARCHIVE_RECOVERY: ereport(LOG, (errmsg("database system was interrupted while in recovery at log time %s", - str_time(ControlFile->checkPointCopy.time)), + str_time(ControlFile->checkPointCopy.time, + timebuf, sizeof(timebuf))), errhint("If this has occurred more than once some data might be corrupted" " and you might need to choose an earlier recovery target."))); break; @@ -5690,7 +5584,8 @@ StartupXLOG(void) case DB_IN_PRODUCTION: ereport(LOG, (errmsg("database system was interrupted; last known up at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; default: @@ -5760,10 +5655,9 @@ StartupXLOG(void) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); SetCommitTsLimit(checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid); - XLogCtl->ckptFullXid = checkPoint.nextXid; /* * Clear out any old relcache cache files. This is *necessary* if we do @@ -5785,6 +5679,12 @@ StartupXLOG(void) */ StartupReplicationSlots(); + /* + * Startup the logical decoding status with the last status stored in the + * checkpoint record. + */ + StartupLogicalDecodingStatus(checkPoint.logicalDecodingEnabled); + /* * Startup logical state, needs to be setup now so we have proper data * during crash recovery. 
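The str_time() rework above replaces a 128-byte palloc per call with a caller-supplied buffer; the result is consumed immediately inside the ereport() argument lists in StartupXLOG(), so the allocation bought nothing. A minimal sketch of the same contract, using the portable strftime() and localtime() in place of pg_strftime() and log_timezone:

#include <stdio.h>
#include <time.h>

/* Caller supplies the buffer; returning buf keeps the function usable
 * directly inside a printf-style argument list without allocating. */
static char *
fmt_time(time_t tnow, char *buf, size_t bufsize)
{
	strftime(buf, bufsize, "%Y-%m-%d %H:%M:%S %Z", localtime(&tnow));
	return buf;
}

int
main(void)
{
	char timebuf[128];

	printf("database system was shut down at %s\n",
		   fmt_time(time(NULL), timebuf, sizeof(timebuf)));
	return 0;
}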
@@ -6071,7 +5971,7 @@ StartupXLOG(void) */ if (InRecovery && (EndOfLog < LocalMinRecoveryPoint || - !XLogRecPtrIsInvalid(ControlFile->backupStartPoint))) + XLogRecPtrIsValid(ControlFile->backupStartPoint))) { /* * Ran off end of WAL before reaching end-of-backup WAL record, or @@ -6081,7 +5981,7 @@ StartupXLOG(void) */ if (ArchiveRecoveryRequested || ControlFile->backupEndRequired) { - if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired) + if (XLogRecPtrIsValid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("WAL ends before end of online backup"), @@ -6183,7 +6083,7 @@ StartupXLOG(void) * (It's critical to first write an OVERWRITE_CONTRECORD message, which * we'll do as soon as we're open for writing new WAL.) */ - if (!XLogRecPtrIsInvalid(missingContrecPtr)) + if (XLogRecPtrIsValid(missingContrecPtr)) { /* * We should only have a missingContrecPtr if we're not switching to a @@ -6193,7 +6093,7 @@ StartupXLOG(void) * disregard. */ Assert(newTLI == endOfRecoveryInfo->lastRecTLI); - Assert(!XLogRecPtrIsInvalid(abortedRecPtr)); + Assert(XLogRecPtrIsValid(abortedRecPtr)); EndOfLog = missingContrecPtr; } @@ -6227,8 +6127,7 @@ StartupXLOG(void) memset(page + len, 0, XLOG_BLCKSZ - len); pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ); - pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ); - XLogCtl->InitializedFrom = endOfRecoveryInfo->lastPageBeginPtr; + XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ; } else { @@ -6237,10 +6136,8 @@ StartupXLOG(void) * let the first attempt to insert a log record to initialize the next * buffer. */ - pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog); - XLogCtl->InitializedFrom = EndOfLog; + XLogCtl->InitializedUpTo = EndOfLog; } - pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo)); /* * Update local and shared status. This is OK to do without any locks @@ -6300,9 +6197,9 @@ StartupXLOG(void) LocalSetXLogInsertAllowed(); /* If necessary, write overwrite-contrecord before doing anything else */ - if (!XLogRecPtrIsInvalid(abortedRecPtr)) + if (XLogRecPtrIsValid(abortedRecPtr)) { - Assert(!XLogRecPtrIsInvalid(missingContrecPtr)); + Assert(XLogRecPtrIsValid(missingContrecPtr)); CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI); } @@ -6336,6 +6233,18 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + /* + * Update logical decoding status in shared memory and write an + * XLOG_LOGICAL_DECODING_STATUS_CHANGE, if necessary. + */ + UpdateLogicalDecodingStatusEndOfRecovery(); + + /* Clean up EndOfWalRecoveryInfo data to appease Valgrind leak checking */ + if (endOfRecoveryInfo->lastPage) + pfree(endOfRecoveryInfo->lastPage); + pfree(endOfRecoveryInfo->recoveryStopReason); + pfree(endOfRecoveryInfo); + /* * All done with end-of-recovery actions. * @@ -6361,6 +6270,18 @@ StartupXLOG(void) UpdateControlFile(); LWLockRelease(ControlFileLock); + /* + * Wake up the checkpointer process as there might be a request to disable + * logical decoding by a concurrent slot drop. + */ + WakeupCheckpointer(); + + /* + * Wake up all waiters for replay LSN. They need to report an error that + * recovery ended before reaching the target LSN. + */ + WaitLSNWakeup(WAIT_LSN_TYPE_REPLAY, InvalidXLogRecPtr); + /* * Shutdown the recovery environment. 
This must occur after * RecoverPreparedTransactions() (see notes in lock_twophase_recover()) @@ -6505,7 +6426,7 @@ PerformRecoveryXLogAction(void) else { RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | - CHECKPOINT_IMMEDIATE | + CHECKPOINT_FAST | CHECKPOINT_WAIT); } @@ -6814,7 +6735,7 @@ ShutdownXLOG(int code, Datum arg) WalSndWaitStopping(); if (RecoveryInProgress()) - CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST); else { /* @@ -6826,7 +6747,7 @@ ShutdownXLOG(int code, Datum arg) if (XLogArchivingActive()) RequestXLogSwitch(false); - CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST); } } @@ -6842,24 +6763,24 @@ LogCheckpointStart(int flags, bool restartpoint) (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", - (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FAST) ? " fast" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", (flags & CHECKPOINT_WAIT) ? " wait" : "", (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", - (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); + (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : ""))); else ereport(LOG, /* translator: the placeholders show checkpoint options */ (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", - (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FAST) ? " fast" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", (flags & CHECKPOINT_WAIT) ? " wait" : "", (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", - (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); + (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : ""))); } /* @@ -6921,7 +6842,7 @@ LogCheckpointEnd(bool restartpoint) "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " - "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X", + "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_slru_written, @@ -6945,7 +6866,7 @@ LogCheckpointEnd(bool restartpoint) "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " - "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X", + "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_slru_written, @@ -7042,12 +6963,12 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset) * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. - * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, - * ignoring checkpoint_completion_target parameter. + * CHECKPOINT_FAST: finish the checkpoint ASAP, ignoring + * checkpoint_completion_target parameter. 
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or * CHECKPOINT_END_OF_RECOVERY). - * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables. + * CHECKPOINT_FLUSH_UNLOGGED: also flush buffers of unlogged tables. * * Note: flags contains other bits, of interest here only for logging purposes. * In particular note that this routine is synchronous and does not pay @@ -7119,6 +7040,10 @@ CreateCheckPoint(int flags) */ SyncPreCheckpoint(); + /* Run these points outside the critical section. */ + INJECTION_POINT("create-checkpoint-initial", NULL); + INJECTION_POINT_LOAD("create-checkpoint-run"); + /* * Use a critical section to force system panic if we have trouble. */ @@ -7142,7 +7067,7 @@ CreateCheckPoint(int flags) * starting snapshot of locks and transactions. */ if (!shutdown && XLogStandbyInfoActive()) - checkPoint.oldestActiveXid = GetOldestActiveTransactionId(); + checkPoint.oldestActiveXid = GetOldestActiveTransactionId(false, true); else checkPoint.oldestActiveXid = InvalidTransactionId; @@ -7269,6 +7194,8 @@ CreateCheckPoint(int flags) if (log_checkpoints) LogCheckpointStart(flags, false); + INJECTION_POINT_CACHED("create-checkpoint-run", NULL); + /* Update the process title */ update_checkpoint_display(flags, false, false); @@ -7299,6 +7226,8 @@ CreateCheckPoint(int flags) checkPoint.nextOid += TransamVariables->oidCount; LWLockRelease(OidGenLock); + checkPoint.logicalDecodingEnabled = IsLogicalDecodingEnabled(); + MultiXactGetCheckptMulti(shutdown, &checkPoint.nextMulti, &checkPoint.nextMultiOffset, @@ -7456,11 +7385,6 @@ CreateCheckPoint(int flags) UpdateControlFile(); LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextXid; - SpinLockRelease(&XLogCtl->info_lck); - /* * We are now done with critical updates; no need for system panic if we * have trouble while fooling with old log segments. @@ -7495,9 +7419,11 @@ CreateCheckPoint(int flags) * Update the average distance between checkpoints if the prior checkpoint * exists. */ - if (PriorRedoPtr != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(PriorRedoPtr)) UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); + INJECTION_POINT("checkpoint-before-old-wal-removal", NULL); + /* * Delete old log files, those no longer needed for last checkpoint to * prevent the disk holding the xlog from growing full. 
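The hunks above rename CHECKPOINT_IMMEDIATE to CHECKPOINT_FAST and CHECKPOINT_FLUSH_ALL to CHECKPOINT_FLUSH_UNLOGGED, and they also widen LSN output from %X/%X to %X/%08X in LogCheckpointEnd(). A minimal standalone sketch (not part of the patch) of what the wider format changes, assuming LSN_FORMAT_ARGS splits the 64-bit pointer into its 32-bit halves as in access/xlogdefs.h:

/*
 * Sketch only: zero-padding the low half to eight hex digits gives every
 * LSN a fixed-width rendering, so log columns stay aligned and LSN
 * strings sort correctly as text.
 */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;

#define LSN_FORMAT_ARGS(lsn) ((uint32_t) ((lsn) >> 32)), ((uint32_t) (lsn))

int
main(void)
{
	XLogRecPtr	lsn = ((XLogRecPtr) 0x1 << 32) | 0xABC;

	printf("%X/%X\n", LSN_FORMAT_ARGS(lsn));	/* old style: 1/ABC */
	printf("%X/%08X\n", LSN_FORMAT_ARGS(lsn));	/* new style: 1/00000ABC */
	return 0;
}

This is why the hunks below touch so many elog/ereport format strings: every %X/%X that renders an LSN becomes %X/%08X, while the LSN_FORMAT_ARGS call sites stay unchanged.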
@@ -7637,7 +7563,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, if (!RecoveryInProgress()) elog(ERROR, "can only be used at end of recovery"); if (pagePtr % XLOG_BLCKSZ != 0) - elog(ERROR, "invalid position for missing continuation record %X/%X", + elog(ERROR, "invalid position for missing continuation record %X/%08X", LSN_FORMAT_ARGS(pagePtr)); /* The current WAL insert position should be right after the page header */ @@ -7648,7 +7574,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, startPos += SizeOfXLogShortPHD; recptr = GetXLogInsertRecPtr(); if (recptr != startPos) - elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD", + elog(ERROR, "invalid WAL insert position %X/%08X for OVERWRITE_CONTRECORD", LSN_FORMAT_ARGS(recptr)); START_CRIT_SECTION(); @@ -7678,7 +7604,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, /* check that the record was inserted to the right place */ if (ProcLastRecPtr != startPos) - elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X", + elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%08X", LSN_FORMAT_ARGS(ProcLastRecPtr)); XLogFlush(recptr); @@ -7747,8 +7673,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record) if (XLogHaveInvalidPages()) { elog(DEBUG2, - "could not record restart point at %X/%X because there " - "are unresolved references to invalid pages", + "could not record restart point at %X/%08X because there are unresolved references to invalid pages", LSN_FORMAT_ARGS(checkPoint->redo)); return; } @@ -7824,12 +7749,12 @@ CreateRestartPoint(int flags) * restartpoint. It's assumed that flushing the buffers will do that as a * side-effect. */ - if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) || + if (!XLogRecPtrIsValid(lastCheckPointRecPtr) || lastCheckPoint.redo <= ControlFile->checkPointCopy.redo) { ereport(DEBUG2, - (errmsg_internal("skipping restartpoint, already performed at %X/%X", - LSN_FORMAT_ARGS(lastCheckPoint.redo)))); + errmsg_internal("skipping restartpoint, already performed at %X/%08X", + LSN_FORMAT_ARGS(lastCheckPoint.redo))); UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); if (flags & CHECKPOINT_IS_SHUTDOWN) @@ -7942,7 +7867,7 @@ CreateRestartPoint(int flags) * Update the average distance between checkpoints/restartpoints if the * prior checkpoint exists. */ - if (PriorRedoPtr != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(PriorRedoPtr)) UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); /* @@ -8013,10 +7938,10 @@ CreateRestartPoint(int flags) xtime = GetLatestXTime(); ereport((log_checkpoints ? LOG : DEBUG2), - (errmsg("recovery restart point at %X/%X", - LSN_FORMAT_ARGS(lastCheckPoint.redo)), - xtime ? errdetail("Last completed transaction was at log time %s.", - timestamptz_to_str(xtime)) : 0)); + errmsg("recovery restart point at %X/%08X", + LSN_FORMAT_ARGS(lastCheckPoint.redo)), + xtime ? errdetail("Last completed transaction was at log time %s.", + timestamptz_to_str(xtime)) : 0); /* * Finally, execute archive_cleanup_command, if any. @@ -8067,7 +7992,7 @@ GetWALAvailability(XLogRecPtr targetLSN) /* * slot does not reserve WAL. 
Either deactivated, or has never been active */ - if (XLogRecPtrIsInvalid(targetLSN)) + if (!XLogRecPtrIsValid(targetLSN)) return WALAVAIL_INVALID_LSN; /* @@ -8147,17 +8072,19 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) XLByteToSeg(recptr, currSegNo, wal_segment_size); segno = currSegNo; - /* - * Calculate how many segments are kept by slots first, adjusting for - * max_slot_wal_keep_size. - */ + /* Calculate how many segments are kept by slots. */ keep = XLogGetReplicationSlotMinimumLSN(); - if (keep != InvalidXLogRecPtr && keep < recptr) + if (XLogRecPtrIsValid(keep) && keep < recptr) { XLByteToSeg(keep, segno, wal_segment_size); - /* Cap by max_slot_wal_keep_size ... */ - if (max_slot_wal_keep_size_mb >= 0) + /* + * Account for max_slot_wal_keep_size to avoid keeping more than + * configured. However, don't do that during a binary upgrade: if + * slots were to be invalidated because of this, it would not be + * possible to preserve logical ones during the upgrade. + */ + if (max_slot_wal_keep_size_mb >= 0 && !IsBinaryUpgrade) { uint64 slot_keep_segs; @@ -8174,7 +8101,7 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) * summarized. */ keep = GetOldestUnsummarizedLSN(NULL, NULL); - if (keep != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(keep)) { XLogSegNo unsummarized_segno; @@ -8277,8 +8204,8 @@ XLogRestorePoint(const char *rpName) RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT); ereport(LOG, - (errmsg("restore point \"%s\" created at %X/%X", - rpName, LSN_FORMAT_ARGS(RecPtr)))); + errmsg("restore point \"%s\" created at %X/%08X", + rpName, LSN_FORMAT_ARGS(RecPtr))); return RecPtr; } @@ -8481,8 +8408,8 @@ xlog_redo(XLogReaderState *record) * never arrive. */ if (ArchiveRecoveryRequested && - !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) && - XLogRecPtrIsInvalid(ControlFile->backupEndPoint)) + XLogRecPtrIsValid(ControlFile->backupStartPoint) && + !XLogRecPtrIsValid(ControlFile->backupEndPoint)) ereport(PANIC, (errmsg("online backup was canceled, recovery cannot continue"))); @@ -8530,11 +8457,6 @@ xlog_redo(XLogReaderState *record) ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextXid; - SpinLockRelease(&XLogCtl->info_lck); - /* * We should've already switched to the new TLI before replaying this * record. @@ -8546,6 +8468,14 @@ xlog_redo(XLogReaderState *record) checkPoint.ThisTimeLineID, replayTLI))); RecoveryRestartPoint(&checkPoint, record); + + /* + * After replaying a checkpoint record, free all smgr objects. + * Otherwise we would never do so for dropped relations, as the + * startup does not process shared invalidation messages or call + * AtEOXact_SMgr(). 
+ */ + smgrdestroyall(); } else if (info == XLOG_CHECKPOINT_ONLINE) { @@ -8591,11 +8521,6 @@ xlog_redo(XLogReaderState *record) ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextXid; - SpinLockRelease(&XLogCtl->info_lck); - /* TLI should not change in an on-line checkpoint */ (void) GetCurrentReplayRecPtr(&replayTLI); if (checkPoint.ThisTimeLineID != replayTLI) @@ -8604,6 +8529,14 @@ xlog_redo(XLogReaderState *record) checkPoint.ThisTimeLineID, replayTLI))); RecoveryRestartPoint(&checkPoint, record); + + /* + * After replaying a checkpoint record, free all smgr objects. + * Otherwise we would never do so for dropped relations, as the + * startup does not process shared invalidation messages or call + * AtEOXact_SMgr(). + */ + smgrdestroyall(); } else if (info == XLOG_OVERWRITE_CONTRECORD) { @@ -8689,21 +8622,6 @@ xlog_redo(XLogReaderState *record) /* Update our copy of the parameters in pg_control */ memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change)); - /* - * Invalidate logical slots if we are in hot standby and the primary - * does not have a WAL level sufficient for logical decoding. No need - * to search for potentially conflicting logically slots if standby is - * running with wal_level lower than logical, because in that case, we - * would have either disallowed creation of logical slots or - * invalidated existing ones. - */ - if (InRecovery && InHotStandby && - xlrec.wal_level < WAL_LEVEL_LOGICAL && - wal_level >= WAL_LEVEL_LOGICAL) - InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL, - 0, InvalidOid, - InvalidTransactionId); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->MaxConnections = xlrec.MaxConnections; ControlFile->max_worker_processes = xlrec.max_worker_processes; @@ -8726,7 +8644,7 @@ xlog_redo(XLogReaderState *record) LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; } - if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn) + if (XLogRecPtrIsValid(LocalMinRecoveryPoint) && LocalMinRecoveryPoint < lsn) { TimeLineID replayTLI; @@ -8771,6 +8689,55 @@ xlog_redo(XLogReaderState *record) { /* nothing to do here, just for informational purposes */ } + else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE) + { + bool status; + + memcpy(&status, XLogRecGetData(record), sizeof(bool)); + + /* + * We need to toggle the logical decoding status and update the + * XLogLogicalInfo cache of processes synchronously because + * XLogLogicalInfoActive() is used even during read-only queries + * (e.g., via RelationIsAccessibleInLogicalDecoding()). In the + * 'disable' case, it is safe to invalidate existing slots after + * disabling logical decoding because logical decoding cannot process + * subsequent WAL records, which may not contain logical information. + */ + if (status) + EnableLogicalDecoding(); + else + DisableLogicalDecoding(); + + elog(DEBUG1, "update logical decoding status to %d during recovery", + status); + + if (InRecovery && InHotStandby) + { + if (!status) + { + /* + * Invalidate logical slots if we are in hot standby and the + * primary disabled logical decoding. + */ + InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL, + 0, InvalidOid, + InvalidTransactionId); + } + else if (sync_replication_slots) + { + /* + * Signal the postmaster to launch the slotsync worker. 
+ * + * XXX: For simplicity, we keep the slotsync worker running + * even after logical decoding is disabled. A future + * improvement can consider starting and stopping the worker + * based on logical decoding status change. + */ + kill(PostmasterPid, SIGUSR1); + } + } + } } /* @@ -8943,9 +8910,8 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) * backup state and tablespace map. * * Input parameters are "state" (the backup state), "fast" (if true, we do - * the checkpoint in immediate mode to make it faster), and "tablespaces" - * (if non-NULL, indicates a list of tablespaceinfo structs describing the - * cluster's tablespaces.). + * the checkpoint in fast mode), and "tablespaces" (if non-NULL, indicates a + * list of tablespaceinfo structs describing the cluster's tablespaces.). * * The tablespace map contents are appended to passed-in parameter * tablespace_map and the caller is responsible for including it in the backup @@ -9022,7 +8988,7 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces, * work correctly, it is critical that sessionBackupState is only updated * after this block is over. */ - PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true)); + PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true)); { bool gotUniqueStartpoint = false; DIR *tblspcdir; @@ -9073,11 +9039,11 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces, * during recovery means that checkpointer is running, we can use * RequestCheckpoint() to establish a restartpoint. * - * We use CHECKPOINT_IMMEDIATE only if requested by user (via - * passing fast = true). Otherwise this can take awhile. + * We use CHECKPOINT_FAST only if requested by user (via passing + * fast = true). Otherwise this can take awhile. */ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | - (fast ? CHECKPOINT_IMMEDIATE : 0)); + (fast ? CHECKPOINT_FAST : 0)); /* * Now we need to fetch the checkpoint record location, and also @@ -9248,7 +9214,7 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces, continue; } - ti = palloc(sizeof(tablespaceinfo)); + ti = palloc_object(tablespaceinfo); ti->oid = tsoid; ti->path = pstrdup(linkpath); ti->rpath = relpath; @@ -9261,7 +9227,7 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces, state->starttime = (pg_time_t) time(NULL); } - PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true)); + PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true)); state->started_in_recovery = backup_started_in_recovery; @@ -9601,7 +9567,7 @@ register_persistent_abort_backup_handler(void) if (already_done) return; - before_shmem_exit(do_pg_abort_backup, DatumGetBool(false)); + before_shmem_exit(do_pg_abort_backup, BoolGetDatum(false)); already_done = true; } @@ -9649,11 +9615,10 @@ GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) void XLogShutdownWalRcv(void) { - ShutdownWalRcv(); + Assert(AmStartupProcess() || !IsUnderPostmaster); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - XLogCtl->InstallXLogFileSegmentActive = false; - LWLockRelease(ControlFileLock); + ShutdownWalRcv(); + ResetInstallXLogFileSegmentActive(); } /* Enable WAL file recycling and preallocation. */ @@ -9665,6 +9630,15 @@ SetInstallXLogFileSegmentActive(void) LWLockRelease(ControlFileLock); } +/* Disable WAL file recycling and preallocation. 
*/ +void +ResetInstallXLogFileSegmentActive(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = false; + LWLockRelease(ControlFileLock); +} + bool IsInstallXLogFileSegmentActive(void) { diff --git a/src/backend/access/transam/xlogbackup.c b/src/backend/access/transam/xlogbackup.c index 342590e0a46d3..e88df68293265 100644 --- a/src/backend/access/transam/xlogbackup.c +++ b/src/backend/access/transam/xlogbackup.c @@ -31,18 +31,19 @@ build_backup_content(BackupState *state, bool ishistoryfile) char startstrbuf[128]; char startxlogfile[MAXFNAMELEN]; /* backup start WAL file */ XLogSegNo startsegno; - StringInfo result = makeStringInfo(); - char *data; + StringInfoData result; Assert(state != NULL); + initStringInfo(&result); + /* Use the log timezone here, not the session timezone */ pg_strftime(startstrbuf, sizeof(startstrbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&state->starttime, log_timezone)); XLByteToSeg(state->startpoint, startsegno, wal_segment_size); XLogFileName(startxlogfile, state->starttli, startsegno, wal_segment_size); - appendStringInfo(result, "START WAL LOCATION: %X/%X (file %s)\n", + appendStringInfo(&result, "START WAL LOCATION: %X/%08X (file %s)\n", LSN_FORMAT_ARGS(state->startpoint), startxlogfile); if (ishistoryfile) @@ -52,18 +53,18 @@ build_backup_content(BackupState *state, bool ishistoryfile) XLByteToSeg(state->stoppoint, stopsegno, wal_segment_size); XLogFileName(stopxlogfile, state->stoptli, stopsegno, wal_segment_size); - appendStringInfo(result, "STOP WAL LOCATION: %X/%X (file %s)\n", + appendStringInfo(&result, "STOP WAL LOCATION: %X/%08X (file %s)\n", LSN_FORMAT_ARGS(state->stoppoint), stopxlogfile); } - appendStringInfo(result, "CHECKPOINT LOCATION: %X/%X\n", + appendStringInfo(&result, "CHECKPOINT LOCATION: %X/%08X\n", LSN_FORMAT_ARGS(state->checkpointloc)); - appendStringInfoString(result, "BACKUP METHOD: streamed\n"); - appendStringInfo(result, "BACKUP FROM: %s\n", + appendStringInfoString(&result, "BACKUP METHOD: streamed\n"); + appendStringInfo(&result, "BACKUP FROM: %s\n", state->started_in_recovery ? 
"standby" : "primary"); - appendStringInfo(result, "START TIME: %s\n", startstrbuf); - appendStringInfo(result, "LABEL: %s\n", state->name); - appendStringInfo(result, "START TIMELINE: %u\n", state->starttli); + appendStringInfo(&result, "START TIME: %s\n", startstrbuf); + appendStringInfo(&result, "LABEL: %s\n", state->name); + appendStringInfo(&result, "START TIMELINE: %u\n", state->starttli); if (ishistoryfile) { @@ -73,22 +74,19 @@ build_backup_content(BackupState *state, bool ishistoryfile) pg_strftime(stopstrfbuf, sizeof(stopstrfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&state->stoptime, log_timezone)); - appendStringInfo(result, "STOP TIME: %s\n", stopstrfbuf); - appendStringInfo(result, "STOP TIMELINE: %u\n", state->stoptli); + appendStringInfo(&result, "STOP TIME: %s\n", stopstrfbuf); + appendStringInfo(&result, "STOP TIMELINE: %u\n", state->stoptli); } /* either both istartpoint and istarttli should be set, or neither */ - Assert(XLogRecPtrIsInvalid(state->istartpoint) == (state->istarttli == 0)); - if (!XLogRecPtrIsInvalid(state->istartpoint)) + Assert(XLogRecPtrIsValid(state->istartpoint) == (state->istarttli != 0)); + if (XLogRecPtrIsValid(state->istartpoint)) { - appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%X\n", + appendStringInfo(&result, "INCREMENTAL FROM LSN: %X/%08X\n", LSN_FORMAT_ARGS(state->istartpoint)); - appendStringInfo(result, "INCREMENTAL FROM TLI: %u\n", + appendStringInfo(&result, "INCREMENTAL FROM TLI: %u\n", state->istarttli); } - data = result->data; - pfree(result); - - return data; + return result.data; } diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 8c3090165f001..339cb75c3ad97 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -90,7 +90,7 @@ pg_backup_start(PG_FUNCTION_ARGS) } oldcontext = MemoryContextSwitchTo(backupcontext); - backup_state = (BackupState *) palloc0(sizeof(BackupState)); + backup_state = palloc0_object(BackupState); tablespace_map = makeStringInfo(); MemoryContextSwitchTo(oldcontext); @@ -341,7 +341,7 @@ pg_last_wal_receive_lsn(PG_FUNCTION_ARGS) recptr = GetWalRcvFlushRecPtr(NULL, NULL); - if (recptr == 0) + if (!XLogRecPtrIsValid(recptr)) PG_RETURN_NULL(); PG_RETURN_LSN(recptr); @@ -360,7 +360,7 @@ pg_last_wal_replay_lsn(PG_FUNCTION_ARGS) recptr = GetXLogReplayRecPtr(NULL); - if (recptr == 0) + if (!XLogRecPtrIsValid(recptr)) PG_RETURN_NULL(); PG_RETURN_LSN(recptr); @@ -479,7 +479,7 @@ pg_split_walfile_name(PG_FUNCTION_ARGS) /* Capitalize WAL file name. 
*/ for (p = fname_upper; *p; p++) - *p = pg_toupper((unsigned char) *p); + *p = pg_ascii_toupper((unsigned char) *p); if (!IsXLogFileName(fname_upper)) ereport(ERROR, diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5ee9d0b028eae..a56d5a5528263 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -33,12 +33,14 @@ #include "access/xloginsert.h" #include "catalog/pg_control.h" #include "common/pg_lzcompress.h" +#include "executor/instrument.h" #include "miscadmin.h" #include "pg_trace.h" #include "replication/origin.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" +#include "utils/pgstat_internal.h" /* * Guess the maximum buffer size required to store a compressed version of @@ -137,6 +139,7 @@ static MemoryContext xloginsert_cxt; static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, XLogRecPtr *fpw_lsn, int *num_fpi, + uint64 *fpi_bytes, bool *topxid_included); static bool XLogCompressBackupBlock(const PageData *page, uint16 hole_offset, uint16 hole_length, void *dest, uint16 *dlen); @@ -258,7 +261,8 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) */ #ifdef USE_ASSERT_CHECKING if (!(flags & REGBUF_NO_CHANGE)) - Assert(BufferIsExclusiveLocked(buffer) && BufferIsDirty(buffer)); + Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE) && + BufferIsDirty(buffer)); #endif if (block_id >= max_registered_block_id) @@ -509,6 +513,7 @@ XLogInsert(RmgrId rmid, uint8 info) XLogRecPtr fpw_lsn; XLogRecData *rdt; int num_fpi = 0; + uint64 fpi_bytes = 0; /* * Get values needed to decide whether to do full-page writes. Since @@ -518,17 +523,30 @@ XLogInsert(RmgrId rmid, uint8 info) GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites, - &fpw_lsn, &num_fpi, &topxid_included); + &fpw_lsn, &num_fpi, &fpi_bytes, + &topxid_included); EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi, - topxid_included); - } while (EndPos == InvalidXLogRecPtr); + fpi_bytes, topxid_included); + } while (!XLogRecPtrIsValid(EndPos)); XLogResetInsertion(); return EndPos; } +/* + * Simple wrapper to XLogInsert to insert a WAL record with elementary + * contents (only an int64 is supported as value currently). + */ +XLogRecPtr +XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value) +{ + XLogBeginInsert(); + XLogRegisterData(&value, sizeof(value)); + return XLogInsert(rmid, info); +} + /* * Assemble a WAL record from the registered data and buffers into an * XLogRecData chain, ready for insertion with XLogInsertRecord(). 
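The new XLogSimpleInsertInt64() above wraps the usual begin/register/insert sequence for records whose whole payload is one int64. A hypothetical caller, for illustration only (RM_XLOG_ID is a real resource manager ID, but XLOG_EXAMPLE_INFO is an invented info code standing in for a record type the caller's resource manager would define):

/* Illustrative sketch, not from the patch. */
static XLogRecPtr
LogExampleValue(int64 value)
{
	/*
	 * One call replaces the open-coded sequence:
	 *
	 *     XLogBeginInsert();
	 *     XLogRegisterData(&value, sizeof(value));
	 *     recptr = XLogInsert(RM_XLOG_ID, XLOG_EXAMPLE_INFO);
	 */
	return XLogSimpleInsertInt64(RM_XLOG_ID, XLOG_EXAMPLE_INFO, value);
}

The WAL record produced is identical to the open-coded form; the wrapper only removes boilerplate at call sites that log a single 64-bit value.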
@@ -547,7 +565,8 @@ XLogInsert(RmgrId rmid, uint8 info) static XLogRecData * XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, - XLogRecPtr *fpw_lsn, int *num_fpi, bool *topxid_included) + XLogRecPtr *fpw_lsn, int *num_fpi, uint64 *fpi_bytes, + bool *topxid_included) { XLogRecData *rdt; uint64 total_len = 0; @@ -620,7 +639,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, needs_backup = (page_lsn <= RedoRecPtr); if (!needs_backup) { - if (*fpw_lsn == InvalidXLogRecPtr || page_lsn < *fpw_lsn) + if (!XLogRecPtrIsValid(*fpw_lsn) || page_lsn < *fpw_lsn) *fpw_lsn = page_lsn; } } @@ -783,6 +802,9 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, } total_len += bimg.length; + + /* Track the WAL full page images in bytes */ + *fpi_bytes += bimg.length; } if (needs_data) diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 7735562db01d1..ccc29192c5a03 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -364,7 +364,7 @@ XLogPrefetcherAllocate(XLogReaderState *reader) XLogPrefetcher *prefetcher; HASHCTL ctl; - prefetcher = palloc0(sizeof(XLogPrefetcher)); + prefetcher = palloc0_object(XLogPrefetcher); prefetcher->reader = reader; ctl.keysize = sizeof(RelFileLocator); @@ -546,7 +546,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing all readahead until %X/%X is replayed due to possible TLI change", + "suppressing all readahead until %X/%08X is replayed due to possible TLI change", LSN_FORMAT_ARGS(record->lsn)); #endif @@ -579,7 +579,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in database %u until %X/%X is replayed due to raw file copy", + "suppressing prefetch in database %u until %X/%08X is replayed due to raw file copy", rlocator.dbOid, LSN_FORMAT_ARGS(record->lsn)); #endif @@ -607,7 +607,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation", + "suppressing prefetch in relation %u/%u/%u until %X/%08X is replayed, which creates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -630,7 +630,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation", + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, which truncates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -729,7 +729,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk", + "suppressing all prefetch in relation %u/%u/%u until %X/%08X is replayed, because the relation does not exist on disk", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -750,7 +750,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef 
XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small", + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, because the relation is too small", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -928,7 +928,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)", + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (blocks >= %u filtered)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed), filter->filter_from_block); @@ -944,7 +944,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)", + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (whole database)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed)); #endif diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 2790ade1f91e8..5e5001b2101ac 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -231,7 +231,7 @@ WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) { - Assert(!XLogRecPtrIsInvalid(RecPtr)); + Assert(XLogRecPtrIsValid(RecPtr)); ResetDecoder(state); @@ -343,7 +343,7 @@ XLogNextRecord(XLogReaderState *state, char **errormsg) * XLogBeginRead() or XLogNextRecord(), and is the location of the * error. 
*/ - Assert(!XLogRecPtrIsInvalid(state->EndRecPtr)); + Assert(XLogRecPtrIsValid(state->EndRecPtr)); return NULL; } @@ -558,7 +558,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) RecPtr = state->NextRecPtr; - if (state->DecodeRecPtr != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(state->DecodeRecPtr)) { /* read the record after the one we just read */ @@ -617,7 +617,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u", + report_invalid_record(state, "invalid record offset at %X/%08X: expected at least %u, got %u", LSN_FORMAT_ARGS(RecPtr), pageHeaderSize, targetRecOff); goto err; @@ -626,7 +626,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", + report_invalid_record(state, "contrecord is requested by %X/%08X", LSN_FORMAT_ARGS(RecPtr)); goto err; } @@ -667,7 +667,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (total_len < SizeOfXLogRecord) { report_invalid_record(state, - "invalid record length at %X/%X: expected at least %u, got %u", + "invalid record length at %X/%08X: expected at least %u, got %u", LSN_FORMAT_ARGS(RecPtr), (uint32) SizeOfXLogRecord, total_len); goto err; @@ -723,11 +723,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Calculate pointer to beginning of next page */ targetPagePtr += XLOG_BLCKSZ; - /* Wait for the next page to become available */ - readOff = ReadPageInternal(state, targetPagePtr, - Min(total_len - gotlen + SizeOfXLogShortPHD, - XLOG_BLCKSZ)); - + /* + * Read the page header before processing the record data, so we + * can handle the case where the previous record ended as being a + * partial one. 
+ */ + readOff = ReadPageInternal(state, targetPagePtr, SizeOfXLogShortPHD); if (readOff == XLREAD_WOULDBLOCK) return XLREAD_WOULDBLOCK; else if (readOff < 0) @@ -756,7 +757,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { report_invalid_record(state, - "there is no contrecord flag at %X/%X", + "there is no contrecord flag at %X/%08X", LSN_FORMAT_ARGS(RecPtr)); goto err; } @@ -769,13 +770,22 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) total_len != (pageHeader->xlp_rem_len + gotlen)) { report_invalid_record(state, - "invalid contrecord length %u (expected %lld) at %X/%X", + "invalid contrecord length %u (expected %lld) at %X/%08X", pageHeader->xlp_rem_len, ((long long) total_len) - gotlen, LSN_FORMAT_ARGS(RecPtr)); goto err; } + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + if (readOff == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readOff < 0) + goto err; + /* Append the continuation from this page to the buffer */ pageHeaderSize = XLogPageHeaderSize(pageHeader); @@ -1132,7 +1142,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (record->xl_tot_len < SizeOfXLogRecord) { report_invalid_record(state, - "invalid record length at %X/%X: expected at least %u, got %u", + "invalid record length at %X/%08X: expected at least %u, got %u", LSN_FORMAT_ARGS(RecPtr), (uint32) SizeOfXLogRecord, record->xl_tot_len); return false; @@ -1140,7 +1150,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (!RmgrIdIsValid(record->xl_rmid)) { report_invalid_record(state, - "invalid resource manager ID %u at %X/%X", + "invalid resource manager ID %u at %X/%08X", record->xl_rmid, LSN_FORMAT_ARGS(RecPtr)); return false; } @@ -1153,7 +1163,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (!(record->xl_prev < RecPtr)) { report_invalid_record(state, - "record with incorrect prev-link %X/%X at %X/%X", + "record with incorrect prev-link %X/%08X at %X/%08X", LSN_FORMAT_ARGS(record->xl_prev), LSN_FORMAT_ARGS(RecPtr)); return false; @@ -1169,7 +1179,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (record->xl_prev != PrevRecPtr) { report_invalid_record(state, - "record with incorrect prev-link %X/%X at %X/%X", + "record with incorrect prev-link %X/%08X at %X/%08X", LSN_FORMAT_ARGS(record->xl_prev), LSN_FORMAT_ARGS(RecPtr)); return false; @@ -1207,7 +1217,7 @@ ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr) if (!EQ_CRC32C(record->xl_crc, crc)) { report_invalid_record(state, - "incorrect resource manager data checksum in record at %X/%X", + "incorrect resource manager data checksum in record at %X/%08X", LSN_FORMAT_ARGS(recptr)); return false; } @@ -1241,7 +1251,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "invalid magic number %04X in WAL segment %s, LSN %X/%X, offset %u", + "invalid magic number %04X in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_magic, fname, LSN_FORMAT_ARGS(recptr), @@ -1256,7 +1266,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "invalid info bits %04X in WAL segment %s, LSN %X/%X, 
offset %u", + "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_info, fname, LSN_FORMAT_ARGS(recptr), @@ -1298,7 +1308,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, /* hmm, first page of file doesn't have a long header? */ report_invalid_record(state, - "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u", + "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_info, fname, LSN_FORMAT_ARGS(recptr), @@ -1318,7 +1328,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "unexpected pageaddr %X/%X in WAL segment %s, LSN %X/%X, offset %u", + "unexpected pageaddr %X/%08X in WAL segment %s, LSN %X/%08X, offset %u", LSN_FORMAT_ARGS(hdr->xlp_pageaddr), fname, LSN_FORMAT_ARGS(recptr), @@ -1344,7 +1354,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%X, offset %u", + "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_tli, state->latestPageTLI, fname, @@ -1388,7 +1398,7 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) XLogPageHeader header; char *errormsg; - Assert(!XLogRecPtrIsInvalid(RecPtr)); + Assert(XLogRecPtrIsValid(RecPtr)); /* Make sure ReadPageInternal() can't return XLREAD_WOULDBLOCK. */ state->nonblocking = false; @@ -1564,7 +1574,7 @@ WALRead(XLogReaderState *state, /* Reset errno first; eases reporting non-errno-affecting errors */ errno = 0; - readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (pgoff_t) startoff); #ifndef FRONTEND pgstat_report_wait_end(); @@ -1756,7 +1766,7 @@ DecodeXLogRecord(XLogReaderState *state, if (block_id <= decoded->max_block_id) { report_invalid_record(state, - "out-of-order block_id %u at %X/%X", + "out-of-order block_id %u at %X/%08X", block_id, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; @@ -1780,15 +1790,15 @@ DecodeXLogRecord(XLogReaderState *state, if (blk->has_data && blk->data_len == 0) { report_invalid_record(state, - "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", + "BKPBLOCK_HAS_DATA set, but no data included at %X/%08X", LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } if (!blk->has_data && blk->data_len != 0) { report_invalid_record(state, - "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X", - (unsigned int) blk->data_len, + "BKPBLOCK_HAS_DATA not set, but data length is %d at %X/%08X", + blk->data_len, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1823,10 +1833,10 @@ DecodeXLogRecord(XLogReaderState *state, blk->bimg_len == BLCKSZ)) { report_invalid_record(state, - "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", - (unsigned int) blk->hole_offset, - (unsigned int) blk->hole_length, - (unsigned int) blk->bimg_len, + "BKPIMAGE_HAS_HOLE set, but hole offset %d length %d block image length %d at %X/%08X", + blk->hole_offset, + blk->hole_length, + blk->bimg_len, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1839,9 +1849,9 @@ DecodeXLogRecord(XLogReaderState *state, (blk->hole_offset != 0 || blk->hole_length != 0)) { report_invalid_record(state, - "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", - (unsigned int) 
blk->hole_offset, - (unsigned int) blk->hole_length, + "BKPIMAGE_HAS_HOLE not set, but hole offset %d length %d at %X/%08X", + blk->hole_offset, + blk->hole_length, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1853,8 +1863,8 @@ DecodeXLogRecord(XLogReaderState *state, blk->bimg_len == BLCKSZ) { report_invalid_record(state, - "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%X", - (unsigned int) blk->bimg_len, + "BKPIMAGE_COMPRESSED set, but block image length %d at %X/%08X", + blk->bimg_len, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1868,8 +1878,8 @@ DecodeXLogRecord(XLogReaderState *state, blk->bimg_len != BLCKSZ) { report_invalid_record(state, - "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%X", - (unsigned int) blk->data_len, + "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %d at %X/%08X", + blk->data_len, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1884,7 +1894,7 @@ DecodeXLogRecord(XLogReaderState *state, if (rlocator == NULL) { report_invalid_record(state, - "BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + "BKPBLOCK_SAME_REL set but no previous rel at %X/%08X", LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1896,7 +1906,7 @@ DecodeXLogRecord(XLogReaderState *state, else { report_invalid_record(state, - "invalid block_id %u at %X/%X", + "invalid block_id %u at %X/%08X", block_id, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1963,7 +1973,7 @@ DecodeXLogRecord(XLogReaderState *state, shortdata_err: report_invalid_record(state, - "record with invalid length at %X/%X", + "record with invalid length at %X/%08X", LSN_FORMAT_ARGS(state->ReadRecPtr)); err: *errormsg = state->errormsg_buf; @@ -2073,14 +2083,14 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) !record->record->blocks[block_id].in_use) { report_invalid_record(record, - "could not restore image at %X/%X with invalid block %d specified", + "could not restore image at %X/%08X with invalid block %d specified", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; } if (!record->record->blocks[block_id].has_image) { - report_invalid_record(record, "could not restore image at %X/%X with invalid state, block %d", + report_invalid_record(record, "could not restore image at %X/%08X with invalid state, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; @@ -2107,7 +2117,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) bkpb->bimg_len, BLCKSZ - bkpb->hole_length) <= 0) decomp_success = false; #else - report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), "LZ4", block_id); @@ -2124,7 +2134,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) if (ZSTD_isError(decomp_result)) decomp_success = false; #else - report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), "zstd", block_id); @@ -2133,7 +2143,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) } else { - report_invalid_record(record, "could not restore image at %X/%X compressed with unknown 
method, block %d", + report_invalid_record(record, "could not restore image at %X/%08X compressed with unknown method, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; @@ -2141,7 +2151,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) if (!decomp_success) { - report_invalid_record(record, "could not decompress image at %X/%X, block %d", + report_invalid_record(record, "could not decompress image at %X/%08X, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 6ce979f2d8bc4..38b594d217092 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -40,11 +40,13 @@ #include "access/xlogreader.h" #include "access/xlogrecovery.h" #include "access/xlogutils.h" +#include "access/xlogwait.h" #include "backup/basebackup.h" #include "catalog/pg_control.h" #include "commands/tablespace.h" #include "common/file_utils.h" #include "miscadmin.h" +#include "nodes/miscnodes.h" #include "pgstat.h" #include "postmaster/bgwriter.h" #include "postmaster/startup.h" @@ -557,7 +559,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * Set the WAL reading processor now, as it will be needed when reading * the checkpoint record required (backup_label or not). */ - private = palloc0(sizeof(XLogPageReadPrivate)); + private = palloc0_object(XLogPageReadPrivate); xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.page_read = &XLogPageRead, @@ -620,10 +622,10 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * than ControlFile->checkPoint is used. */ ereport(LOG, - (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u", - LSN_FORMAT_ARGS(RedoStartLSN), - LSN_FORMAT_ARGS(CheckPointLoc), - CheckPointTLI))); + errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u", + LSN_FORMAT_ARGS(RedoStartLSN), + LSN_FORMAT_ARGS(CheckPointLoc), + CheckPointTLI)); /* * When a backup_label file is present, we want to roll forward from @@ -636,8 +638,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); ereport(DEBUG1, - (errmsg_internal("checkpoint record is at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)))); + errmsg_internal("checkpoint record is at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc))); InRecovery = true; /* force recovery even if SHUTDOWNED */ /* @@ -652,23 +654,23 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID)) ereport(FATAL, - (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X", - LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir, DataDir))); + errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)), + 
errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir, DataDir)); } } else { ereport(FATAL, - (errmsg("could not locate required checkpoint record at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir, DataDir))); + errmsg("could not locate required checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc)), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir, DataDir)); wasShutdown = false; /* keep compiler quiet */ } @@ -756,9 +758,9 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * end-of-backup record), and we can enter archive recovery directly. */ if (ArchiveRecoveryRequested && - (ControlFile->minRecoveryPoint != InvalidXLogRecPtr || + (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) || ControlFile->backupEndRequired || - ControlFile->backupEndPoint != InvalidXLogRecPtr || + XLogRecPtrIsValid(ControlFile->backupEndPoint) || ControlFile->state == DB_SHUTDOWNED)) { InArchiveRecovery = true; @@ -771,10 +773,10 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * emit a log message when we continue initializing from a base * backup. */ - if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) + if (XLogRecPtrIsValid(ControlFile->backupStartPoint)) ereport(LOG, - (errmsg("restarting backup recovery with redo LSN %X/%X", - LSN_FORMAT_ARGS(ControlFile->backupStartPoint)))); + errmsg("restarting backup recovery with redo LSN %X/%08X", + LSN_FORMAT_ARGS(ControlFile->backupStartPoint))); /* Get the last valid checkpoint record. */ CheckPointLoc = ControlFile->checkPoint; @@ -786,8 +788,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, if (record != NULL) { ereport(DEBUG1, - (errmsg_internal("checkpoint record is at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)))); + errmsg_internal("checkpoint record is at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc))); } else { @@ -798,11 +800,21 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * simplify processing around checkpoints. */ ereport(PANIC, - (errmsg("could not locate a valid checkpoint record at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)))); + errmsg("could not locate a valid checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc))); } memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + + /* Make sure that REDO location exists. 
*/ + if (checkPoint.redo < CheckPointLoc) + { + XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo); + if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID)) + ereport(FATAL, + errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc))); + } } if (ArchiveRecoveryRequested) @@ -824,8 +836,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, recoveryTargetName))); else if (recoveryTarget == RECOVERY_TARGET_LSN) ereport(LOG, - (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryTargetLSN)))); + errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"", + LSN_FORMAT_ARGS(recoveryTargetLSN))); else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); @@ -855,7 +867,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, (errmsg("requested timeline %u is not a child of this server's history", recoveryTargetTLI), /* translator: %s is a backup_label file or a pg_control file */ - errdetail("Latest checkpoint in file \"%s\" is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.", + errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.", haveBackupLabel ? "backup_label" : "pg_control", LSN_FORMAT_ARGS(CheckPointLoc), CheckPointTLI, @@ -866,25 +878,25 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * The min recovery point should be part of the requested timeline's * history, too. */ - if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && + if (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) && tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != ControlFile->minRecoveryPointTLI) ereport(FATAL, - (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", - recoveryTargetTLI, - LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), - ControlFile->minRecoveryPointTLI))); + errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u", + recoveryTargetTLI, + LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), + ControlFile->minRecoveryPointTLI)); ereport(DEBUG1, - (errmsg_internal("redo record is at %X/%X; shutdown %s", - LSN_FORMAT_ARGS(checkPoint.redo), - wasShutdown ? "true" : "false"))); + errmsg_internal("redo record is at %X/%08X; shutdown %s", + LSN_FORMAT_ARGS(checkPoint.redo), + wasShutdown ? "true" : "false")); ereport(DEBUG1, (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", U64FromFullTransactionId(checkPoint.nextXid), checkPoint.nextOid))); ereport(DEBUG1, - (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", + (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64, checkPoint.nextMulti, checkPoint.nextMultiOffset))); ereport(DEBUG1, (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", @@ -1253,14 +1265,14 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, * is pretty crude, but we are not expecting any variability in the file * format). 
*/ - if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", + if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c", &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); RedoStartLSN = ((uint64) hi) << 32 | lo; RedoStartTLI = tli_from_walseg; - if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c", + if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c", &hi, &lo, &ch) != 3 || ch != '\n') ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -1332,7 +1344,7 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, tli_from_file, BACKUP_LABEL_FILE))); } - if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0) + if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("this is an incremental backup, not a data directory"), @@ -1414,7 +1426,7 @@ read_tablespace_map(List **tablespaces) errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); str[n++] = '\0'; - ti = palloc0(sizeof(tablespaceinfo)); + ti = palloc0_object(tablespaceinfo); errno = 0; ti->oid = strtoul(str, &endp, 10); if (*endp != '\0' || errno == EINVAL || errno == ERANGE) @@ -1465,7 +1477,7 @@ read_tablespace_map(List **tablespaces) EndOfWalRecoveryInfo * FinishWalRecovery(void) { - EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo)); + EndOfWalRecoveryInfo *result = palloc_object(EndOfWalRecoveryInfo); XLogRecPtr lastRec; TimeLineID lastRecTLI; XLogRecPtr endOfLog; @@ -1626,6 +1638,7 @@ ShutdownWalRecovery(void) close(readFile); readFile = -1; } + pfree(xlogreader->private_data); XLogReaderFree(xlogreader); XLogPrefetcherFree(xlogprefetcher); @@ -1722,8 +1735,8 @@ PerformWalRecovery(void) if (record->xl_rmid != RM_XLOG_ID || (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO) ereport(FATAL, - (errmsg("unexpected record type found at redo point %X/%X", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); + errmsg("unexpected record type found at redo point %X/%08X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))); } else { @@ -1745,8 +1758,8 @@ PerformWalRecovery(void) RmgrStartup(); ereport(LOG, - (errmsg("redo starts at %X/%X", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); + errmsg("redo starts at %X/%08X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))); /* Prepare to report progress of the redo phase. */ if (!StandbyMode) @@ -1758,7 +1771,7 @@ PerformWalRecovery(void) do { if (!StandbyMode) - ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X", + ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X", LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)); #ifdef WAL_DEBUG @@ -1767,7 +1780,7 @@ PerformWalRecovery(void) StringInfoData buf; initStringInfo(&buf); - appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", + appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ", LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); xlog_outrec(&buf, xlogreader); @@ -1836,6 +1849,16 @@ PerformWalRecovery(void) break; } + /* + * If we replayed an LSN that someone was waiting for then walk + * over the shared memory array and set latches to notify the + * waiters. 
+ */ + if (waitLSNState && + (XLogRecoveryCtl->lastReplayedEndRecPtr >= + pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_REPLAY]))) + WaitLSNWakeup(WAIT_LSN_TYPE_REPLAY, XLogRecoveryCtl->lastReplayedEndRecPtr); + /* Else, try to fetch the next WAL record */ record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); } while (record != NULL); @@ -1880,9 +1903,9 @@ PerformWalRecovery(void) RmgrCleanup(); ereport(LOG, - (errmsg("redo done at %X/%X system usage: %s", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), - pg_rusage_show(&ru0)))); + errmsg("redo done at %X/%08X system usage: %s", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + pg_rusage_show(&ru0))); xtime = GetLatestXTime(); if (xtime) ereport(LOG, @@ -2092,7 +2115,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); if (xlrec.overwritten_lsn != record->overwrittenRecPtr) - elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X", + elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X", LSN_FORMAT_ARGS(xlrec.overwritten_lsn), LSN_FORMAT_ARGS(record->overwrittenRecPtr)); @@ -2101,9 +2124,9 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) missingContrecPtr = InvalidXLogRecPtr; ereport(LOG, - (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s", - LSN_FORMAT_ARGS(xlrec.overwritten_lsn), - timestamptz_to_str(xlrec.overwrite_time)))); + errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + timestamptz_to_str(xlrec.overwrite_time))); /* Verifying the record should only happen once */ record->overwrittenRecPtr = InvalidXLogRecPtr; @@ -2129,7 +2152,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) backupEndPoint = lsn; } else - elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X", + elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X", LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint)); } } @@ -2191,7 +2214,7 @@ CheckRecoveryConsistency(void) * During crash recovery, we don't reach a consistent state until we've * replayed all the WAL. */ - if (XLogRecPtrIsInvalid(minRecoveryPoint)) + if (!XLogRecPtrIsValid(minRecoveryPoint)) return; Assert(InArchiveRecovery); @@ -2206,7 +2229,7 @@ CheckRecoveryConsistency(void) /* * Have we reached the point where our base backup was completed? 
*/ - if (!XLogRecPtrIsInvalid(backupEndPoint) && + if (XLogRecPtrIsValid(backupEndPoint) && backupEndPoint <= lastReplayedEndRecPtr) { XLogRecPtr saveBackupStartPoint = backupStartPoint; @@ -2224,9 +2247,9 @@ CheckRecoveryConsistency(void) backupEndRequired = false; ereport(LOG, - (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X", - LSN_FORMAT_ARGS(saveBackupStartPoint), - LSN_FORMAT_ARGS(saveBackupEndPoint)))); + errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X", + LSN_FORMAT_ARGS(saveBackupStartPoint), + LSN_FORMAT_ARGS(saveBackupEndPoint))); } /* @@ -2255,8 +2278,8 @@ CheckRecoveryConsistency(void) reachedConsistency = true; SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); ereport(LOG, - (errmsg("consistent recovery state reached at %X/%X", - LSN_FORMAT_ARGS(lastReplayedEndRecPtr)))); + errmsg("consistent recovery state reached at %X/%08X", + LSN_FORMAT_ARGS(lastReplayedEndRecPtr))); } /* @@ -2293,7 +2316,7 @@ rm_redo_error_callback(void *arg) xlog_block_info(&buf, record); /* translator: %s is a WAL record description */ - errcontext("WAL redo at %X/%X for %s", + errcontext("WAL redo at %X/%08X for %s", LSN_FORMAT_ARGS(record->ReadRecPtr), buf.data); @@ -2328,7 +2351,7 @@ xlog_outdesc(StringInfo buf, XLogReaderState *record) static void xlog_outrec(StringInfo buf, XLogReaderState *record) { - appendStringInfo(buf, "prev %X/%X; xid %u", + appendStringInfo(buf, "prev %X/%08X; xid %u", LSN_FORMAT_ARGS(XLogRecGetPrev(record)), XLogRecGetXid(record)); @@ -2412,14 +2435,14 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, * branched before the timeline the min recovery point is on, and you * attempt to do PITR to the new timeline. */ - if (!XLogRecPtrIsInvalid(minRecoveryPoint) && + if (XLogRecPtrIsValid(minRecoveryPoint) && lsn < minRecoveryPoint && newTLI > minRecoveryPointTLI) ereport(PANIC, - (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", - newTLI, - LSN_FORMAT_ARGS(minRecoveryPoint), - minRecoveryPointTLI))); + errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u", + newTLI, + LSN_FORMAT_ARGS(minRecoveryPoint), + minRecoveryPointTLI)); /* Looks good */ } @@ -2621,8 +2644,8 @@ recoveryStopsBefore(XLogReaderState *record) recoveryStopTime = 0; recoveryStopName[0] = '\0'; ereport(LOG, - (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryStopLSN)))); + errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"", + LSN_FORMAT_ARGS(recoveryStopLSN))); return true; } @@ -2789,8 +2812,8 @@ recoveryStopsAfter(XLogReaderState *record) recoveryStopTime = 0; recoveryStopName[0] = '\0'; ereport(LOG, - (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryStopLSN)))); + errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"", + LSN_FORMAT_ARGS(recoveryStopLSN))); return true; } @@ -2910,7 +2933,7 @@ getRecoveryStopReason(void) timestamptz_to_str(recoveryStopTime)); else if (recoveryTarget == RECOVERY_TARGET_LSN) snprintf(reason, sizeof(reason), - "%s LSN %X/%X\n", + "%s LSN %X/%08X\n", recoveryStopAfter ? 
"after" : "before", LSN_FORMAT_ARGS(recoveryStopLSN)); else if (recoveryTarget == RECOVERY_TARGET_NAME) @@ -3146,10 +3169,12 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher); XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; + Assert(AmStartupProcess() || !IsUnderPostmaster); + /* Pass through parameters to XLogPageRead */ private->fetching_ckpt = fetching_ckpt; private->emode = emode; - private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr); + private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr); private->replayTLI = replayTLI; /* This is the first attempt to read this page. */ @@ -3175,7 +3200,7 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, * overwrite contrecord in the wrong place, breaking everything. */ if (!ArchiveRecoveryRequested && - !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr)) + XLogRecPtrIsValid(xlogreader->abortedRecPtr)) { abortedRecPtr = xlogreader->abortedRecPtr; missingContrecPtr = xlogreader->missingContrecPtr; @@ -3213,11 +3238,11 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, XLogFileName(fname, xlogreader->seg.ws_tli, segno, wal_segment_size); ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), - (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u", - xlogreader->latestPageTLI, - fname, - LSN_FORMAT_ARGS(xlogreader->latestPagePtr), - offset))); + errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u", + xlogreader->latestPageTLI, + fname, + LSN_FORMAT_ARGS(xlogreader->latestPagePtr), + offset)); record = NULL; } @@ -3317,6 +3342,8 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, int r; instr_time io_start; + Assert(AmStartupProcess() || !IsUnderPostmaster); + XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); @@ -3412,7 +3439,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, io_start = pgstat_prepare_io_time(track_wal_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_READ); - r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); + r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff); if (r != XLOG_BLCKSZ) { char fname[MAXFNAMELEN]; @@ -3429,14 +3456,14 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, errno = save_errno; ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), (errcode_for_file_access(), - errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m", + errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m", fname, LSN_FORMAT_ARGS(targetPagePtr), readOff))); } else ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu", + errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu", fname, LSN_FORMAT_ARGS(targetPagePtr), readOff, r, (Size) XLOG_BLCKSZ))); goto next_record_is_invalid; @@ -3685,8 +3712,27 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * Before we leave XLOG_FROM_STREAM state, make sure that * walreceiver is not active, so that it won't overwrite * WAL that we restore from archive. + * + * If walreceiver is actively streaming (or attempting to + * connect), we must shut it down. 
However, if it's + * already in WAITING state (e.g., due to timeline + * divergence), we only need to reset the install flag to + * allow archive restoration. */ - XLogShutdownWalRcv(); + if (WalRcvStreaming()) + XLogShutdownWalRcv(); + else + { + /* + * WALRCV_STOPPING state is a transient state while + * the startup process is in ShutdownWalRcv(). It + * should never appear here since we would be waiting + * for the walreceiver to reach WALRCV_STOPPED in that + * case. + */ + Assert(WalRcvGetState() != WALRCV_STOPPING); + ResetInstallXLogFileSegmentActive(); + } /* * Before we sleep, re-scan for possible new timelines if @@ -3718,7 +3764,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, wait_time = wal_retrieve_retry_interval - TimestampDifferenceMilliseconds(last_fail_time, now); - elog(LOG, "waiting for WAL to become available at %X/%X", + elog(LOG, "waiting for WAL to become available at %X/%08X", LSN_FORMAT_ARGS(RecPtr)); /* Do background tasks that might benefit us later. */ @@ -3864,7 +3910,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); if (curFileTLI > 0 && tli < curFileTLI) - elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", + elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u", LSN_FORMAT_ARGS(tliRecPtr), tli, curFileTLI); } @@ -4177,10 +4223,10 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN) if (currentTle->end < replayLSN) { ereport(LOG, - (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", - newtarget, - replayTLI, - LSN_FORMAT_ARGS(replayLSN)))); + errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X", + newtarget, + replayTLI, + LSN_FORMAT_ARGS(replayLSN))); return false; } @@ -4334,7 +4380,7 @@ XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source) * Skip scanning the timeline ID that the logfile segment to read * doesn't belong to */ - if (hent->begin != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(hent->begin)) { XLogSegNo beginseg = 0; @@ -4759,9 +4805,20 @@ RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue bool check_primary_slot_name(char **newval, void **extra, GucSource source) { + int err_code; + char *err_msg = NULL; + char *err_hint = NULL; + if (*newval && strcmp(*newval, "") != 0 && - !ReplicationSlotValidateName(*newval, WARNING)) + !ReplicationSlotValidateNameInternal(*newval, false, &err_code, + &err_msg, &err_hint)) + { + GUC_check_errcode(err_code); + GUC_check_errdetail("%s", err_msg); + if (err_hint != NULL) + GUC_check_errhint("%s", err_hint); return false; + } return true; } @@ -4833,10 +4890,10 @@ check_recovery_target_lsn(char **newval, void **extra, GucSource source) { XLogRecPtr lsn; XLogRecPtr *myextra; - bool have_error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - lsn = pg_lsn_in_internal(*newval, &have_error); - if (have_error) + lsn = pg_lsn_in_safe(*newval, (Node *) &escontext); + if (escontext.error_occurred) return false; myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr)); @@ -4994,13 +5051,25 @@ check_recovery_target_timeline(char **newval, void **extra, GucSource source) rttg = RECOVERY_TARGET_TIMELINE_LATEST; else { + char *endp; + uint64 timeline; + rttg = 
RECOVERY_TARGET_TIMELINE_NUMERIC; errno = 0; - strtoul(*newval, NULL, 0); - if (errno == EINVAL || errno == ERANGE) + timeline = strtou64(*newval, &endp, 0); + + if (*endp != '\0' || errno == EINVAL || errno == ERANGE) + { + GUC_check_errdetail("\"%s\" is not a valid number.", + "recovery_target_timeline"); + return false; + } + + if (timeline < 1 || timeline > PG_UINT32_MAX) { - GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number."); + GUC_check_errdetail("\"%s\" must be between %u and %u.", + "recovery_target_timeline", 1, UINT_MAX); return false; } } diff --git a/src/backend/access/transam/xlogstats.c b/src/backend/access/transam/xlogstats.c index f92d9e13b174e..85963a6ac29e7 100644 --- a/src/backend/access/transam/xlogstats.c +++ b/src/backend/access/transam/xlogstats.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * xlogstats.c - * Functions for WAL Statitstics + * Functions for WAL Statistics * * Copyright (c) 2022-2025, PostgreSQL Global Development Group * diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index c389b27f77d47..db55c0c5bd231 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -523,7 +523,7 @@ XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, if (mode == RBM_NORMAL) { /* check that page has been initialized */ - Page page = (Page) BufferGetPage(buffer); + Page page = BufferGetPage(buffer); /* * We assume that PageIsNew is safe without a lock. During recovery, @@ -574,7 +574,7 @@ CreateFakeRelcacheEntry(RelFileLocator rlocator) Relation rel; /* Allocate the Relation struct and all related space in one block. */ - fakeentry = palloc0(sizeof(FakeRelCacheEntryData)); + fakeentry = palloc0_object(FakeRelCacheEntryData); rel = (Relation) fakeentry; rel->rd_rel = &fakeentry->pgc; @@ -710,7 +710,7 @@ XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, const XLogRecPtr lastReadPage = (state->seg.ws_segno * state->segcxt.ws_segsize + state->segoff); - Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); + Assert(XLogRecPtrIsValid(wantPage) && wantPage % XLOG_BLCKSZ == 0); Assert(wantLength <= XLOG_BLCKSZ); Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ); Assert(currTLI != 0); @@ -741,7 +741,7 @@ XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, */ if (state->currTLI == currTLI && wantPage >= lastReadPage) { - Assert(state->currTLIValidUntil == InvalidXLogRecPtr); + Assert(!XLogRecPtrIsValid(state->currTLIValidUntil)); return; } @@ -750,7 +750,7 @@ XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, * timeline and the timeline we're reading from is valid until the end of * the current segment we can just keep reading. 
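
The reworked check_recovery_target_timeline() above replaces a bare strtoul() call with parsing that rejects trailing junk and enforces the valid timeline range. A standalone sketch of the same validation pattern, using strtoull() in place of strtou64() (the helper name is made up):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static bool
parse_timeline(const char *s, uint32_t *tli_out)
{
	char	   *endp;
	unsigned long long v;

	errno = 0;
	v = strtoull(s, &endp, 0);

	if (endp == s || *endp != '\0' || errno == ERANGE)
		return false;			/* not a (complete) number */
	if (v < 1 || v > UINT32_MAX)
		return false;			/* outside the valid timeline range */

	*tli_out = (uint32_t) v;
	return true;
}

int
main(void)
{
	uint32_t	tli;

	printf("%d\n", parse_timeline("3", &tli));			/* 1: ok */
	printf("%d\n", parse_timeline("3abc", &tli));		/* 0: trailing junk */
	printf("%d\n", parse_timeline("0", &tli));			/* 0: below range */
	printf("%d\n", parse_timeline("5000000000", &tli));	/* 0: above range */
	return 0;
}
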
*/
- if (state->currTLIValidUntil != InvalidXLogRecPtr &&
+ if (XLogRecPtrIsValid(state->currTLIValidUntil) &&
 state->currTLI != currTLI &&
 state->currTLI != 0 &&
 ((wantPage + wantLength) / state->segcxt.ws_segsize) <
@@ -790,12 +790,12 @@ XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage,
 state->currTLIValidUntil = tliSwitchPoint(state->currTLI,
 timelineHistory,
 &state->nextTLI);
- Assert(state->currTLIValidUntil == InvalidXLogRecPtr ||
+ Assert(!XLogRecPtrIsValid(state->currTLIValidUntil) ||
 wantPage + wantLength < state->currTLIValidUntil);
 list_free_deep(timelineHistory);
- elog(DEBUG3, "switched to timeline %u valid until %X/%X",
+ elog(DEBUG3, "switched to timeline %u valid until %X/%08X",
 state->currTLI,
 LSN_FORMAT_ARGS(state->currTLIValidUntil));
 }
diff --git a/src/backend/access/transam/xlogwait.c b/src/backend/access/transam/xlogwait.c
new file mode 100644
index 0000000000000..6109381c0f0a8
--- /dev/null
+++ b/src/backend/access/transam/xlogwait.c
@@ -0,0 +1,409 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogwait.c
+ * Implements waiting for WAL operations to reach specific LSNs.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/xlogwait.c
+ *
+ * NOTES
+ * This file implements waiting for WAL operations to reach specific LSNs
+ * on both physical standby and primary servers. The core idea is simple:
+ * every process that wants to wait publishes the LSN it is waiting for in
+ * shared memory, and the appropriate process (startup on standby, or
+ * WAL writer/backend on primary) wakes it once that LSN has been reached.
+ *
+ * The shared memory used by this module comprises procInfos, a
+ * per-backend array holding the awaited LSN for each of the backend
+ * processes. The elements of that array are organized into a pairing
+ * heap waitersHeap, which allows the least-awaited LSN to be found
+ * very quickly.
+ *
+ * In addition, the least-awaited LSN is cached as minWaitedLSN. The
+ * waiter process publishes information about itself to the shared
+ * memory and waits on the latch until it is woken up by the appropriate
+ * process, the standby is promoted, or the postmaster dies. Afterwards,
+ * it clears its information from the shared memory.
+ *
+ * On standby servers: After replaying a WAL record, the startup process
+ * first performs a fast-path check (minWaitedLSN > replayLSN). If that
+ * check fails, it walks waitersHeap and wakes up the backends whose
+ * awaited LSNs have been reached.
+ *
+ * On primary servers: After flushing WAL, the WAL writer or backend
+ * process performs a similar check against the flush LSN and wakes up
+ * waiters whose target flush LSNs have been reached.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <float.h>
+#include <math.h>
+
+#include "access/xlog.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogwait.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/latch.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/fmgrprotos.h"
+#include "utils/pg_lsn.h"
+#include "utils/snapmgr.h"
+
+
+static int waitlsn_cmp(const pairingheap_node *a, const pairingheap_node *b,
+ void *arg);
+
+struct WaitLSNState *waitLSNState = NULL;
+
+/* Report the amount of shared memory space needed for WaitLSNState.
*/ +Size +WaitLSNShmemSize(void) +{ + Size size; + + size = offsetof(WaitLSNState, procInfos); + size = add_size(size, mul_size(MaxBackends + NUM_AUXILIARY_PROCS, sizeof(WaitLSNProcInfo))); + return size; +} + +/* Initialize the WaitLSNState in the shared memory. */ +void +WaitLSNShmemInit(void) +{ + bool found; + + waitLSNState = (WaitLSNState *) ShmemInitStruct("WaitLSNState", + WaitLSNShmemSize(), + &found); + if (!found) + { + int i; + + /* Initialize heaps and tracking */ + for (i = 0; i < WAIT_LSN_TYPE_COUNT; i++) + { + pg_atomic_init_u64(&waitLSNState->minWaitedLSN[i], PG_UINT64_MAX); + pairingheap_initialize(&waitLSNState->waitersHeap[i], waitlsn_cmp, NULL); + } + + /* Initialize process info array */ + memset(&waitLSNState->procInfos, 0, + (MaxBackends + NUM_AUXILIARY_PROCS) * sizeof(WaitLSNProcInfo)); + } +} + +/* + * Comparison function for LSN waiters heaps. Waiting processes are ordered by + * LSN, so that the waiter with smallest LSN is at the top. + */ +static int +waitlsn_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) +{ + const WaitLSNProcInfo *aproc = pairingheap_const_container(WaitLSNProcInfo, heapNode, a); + const WaitLSNProcInfo *bproc = pairingheap_const_container(WaitLSNProcInfo, heapNode, b); + + if (aproc->waitLSN < bproc->waitLSN) + return 1; + else if (aproc->waitLSN > bproc->waitLSN) + return -1; + else + return 0; +} + +/* + * Update minimum waited LSN for the specified LSN type + */ +static void +updateMinWaitedLSN(WaitLSNType lsnType) +{ + XLogRecPtr minWaitedLSN = PG_UINT64_MAX; + int i = (int) lsnType; + + Assert(i >= 0 && i < WAIT_LSN_TYPE_COUNT); + + if (!pairingheap_is_empty(&waitLSNState->waitersHeap[i])) + { + pairingheap_node *node = pairingheap_first(&waitLSNState->waitersHeap[i]); + WaitLSNProcInfo *procInfo = pairingheap_container(WaitLSNProcInfo, heapNode, node); + + minWaitedLSN = procInfo->waitLSN; + } + pg_atomic_write_u64(&waitLSNState->minWaitedLSN[i], minWaitedLSN); +} + +/* + * Add current process to appropriate waiters heap based on LSN type + */ +static void +addLSNWaiter(XLogRecPtr lsn, WaitLSNType lsnType) +{ + WaitLSNProcInfo *procInfo = &waitLSNState->procInfos[MyProcNumber]; + int i = (int) lsnType; + + Assert(i >= 0 && i < WAIT_LSN_TYPE_COUNT); + + LWLockAcquire(WaitLSNLock, LW_EXCLUSIVE); + + procInfo->procno = MyProcNumber; + procInfo->waitLSN = lsn; + procInfo->lsnType = lsnType; + + Assert(!procInfo->inHeap); + pairingheap_add(&waitLSNState->waitersHeap[i], &procInfo->heapNode); + procInfo->inHeap = true; + updateMinWaitedLSN(lsnType); + + LWLockRelease(WaitLSNLock); +} + +/* + * Remove current process from appropriate waiters heap based on LSN type + */ +static void +deleteLSNWaiter(WaitLSNType lsnType) +{ + WaitLSNProcInfo *procInfo = &waitLSNState->procInfos[MyProcNumber]; + int i = (int) lsnType; + + Assert(i >= 0 && i < WAIT_LSN_TYPE_COUNT); + + LWLockAcquire(WaitLSNLock, LW_EXCLUSIVE); + + Assert(procInfo->lsnType == lsnType); + + if (procInfo->inHeap) + { + pairingheap_remove(&waitLSNState->waitersHeap[i], &procInfo->heapNode); + procInfo->inHeap = false; + updateMinWaitedLSN(lsnType); + } + + LWLockRelease(WaitLSNLock); +} + +/* + * Size of a static array of procs to wakeup by WaitLSNWakeup() allocated + * on the stack. It should be enough to take single iteration for most cases. + */ +#define WAKEUP_PROC_STATIC_ARRAY_SIZE (16) + +/* + * Remove waiters whose LSN has been reached from the heap and set their + * latches. 
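
addLSNWaiter()/deleteLSNWaiter() above both finish by calling updateMinWaitedLSN(), keeping the cached minimum in sync with the heap so readers can test it with a single atomic load. A reduced standalone model of that idea, with a linear scan standing in for peeking at the top of the pairing heap (names invented for illustration):

#include <stdint.h>
#include <stdio.h>

#define NWAITERS 4
#define NO_WAIT  UINT64_MAX

static uint64_t waited[NWAITERS] = {NO_WAIT, NO_WAIT, NO_WAIT, NO_WAIT};
static uint64_t min_waited = NO_WAIT;	/* an atomic in the real code */

static void
update_min_waited(void)
{
	uint64_t	min = NO_WAIT;

	for (int i = 0; i < NWAITERS; i++)
		if (waited[i] < min)
			min = waited[i];
	min_waited = min;
}

int
main(void)
{
	waited[1] = 500;			/* addLSNWaiter() analogue */
	update_min_waited();
	printf("min = %llu\n", (unsigned long long) min_waited);	/* 500 */

	waited[1] = NO_WAIT;		/* deleteLSNWaiter() analogue */
	update_min_waited();
	printf("empty = %d\n", min_waited == NO_WAIT);				/* 1 */
	return 0;
}
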
If InvalidXLogRecPtr is given, remove all waiters from the heap
+ * and set latches for all waiters.
+ *
+ * This function first accumulates the waiters to wake up into an array, then
+ * wakes them up without holding WaitLSNLock. The array size is static and
+ * equal to WAKEUP_PROC_STATIC_ARRAY_SIZE. That should be more than enough
+ * to wake up all the waiters at once in the vast majority of cases. However,
+ * if there are more waiters, this function will loop to process them in
+ * multiple chunks.
+ */
+static void
+wakeupWaiters(WaitLSNType lsnType, XLogRecPtr currentLSN)
+{
+ ProcNumber wakeUpProcs[WAKEUP_PROC_STATIC_ARRAY_SIZE];
+ int numWakeUpProcs;
+ int i = (int) lsnType;
+
+ Assert(i >= 0 && i < WAIT_LSN_TYPE_COUNT);
+
+ do
+ {
+ numWakeUpProcs = 0;
+ LWLockAcquire(WaitLSNLock, LW_EXCLUSIVE);
+
+ /*
+ * Iterate the waiters heap until we find an LSN not yet reached. Record
+ * process numbers to wake up, but send wakeups after releasing the lock.
+ */
+ while (!pairingheap_is_empty(&waitLSNState->waitersHeap[i]))
+ {
+ pairingheap_node *node = pairingheap_first(&waitLSNState->waitersHeap[i]);
+ WaitLSNProcInfo *procInfo;
+
+ /* Get procInfo using appropriate heap node */
+ procInfo = pairingheap_container(WaitLSNProcInfo, heapNode, node);
+
+ if (XLogRecPtrIsValid(currentLSN) && procInfo->waitLSN > currentLSN)
+ break;
+
+ Assert(numWakeUpProcs < WAKEUP_PROC_STATIC_ARRAY_SIZE);
+ wakeUpProcs[numWakeUpProcs++] = procInfo->procno;
+ (void) pairingheap_remove_first(&waitLSNState->waitersHeap[i]);
+
+ /* Update appropriate flag */
+ procInfo->inHeap = false;
+
+ if (numWakeUpProcs == WAKEUP_PROC_STATIC_ARRAY_SIZE)
+ break;
+ }
+
+ updateMinWaitedLSN(lsnType);
+ LWLockRelease(WaitLSNLock);
+
+ /*
+ * Set latches for processes whose waited LSNs have been reached.
+ * Since SetLatch() is a time-consuming operation, we do this outside
+ * of WaitLSNLock. This is safe because procLatch is never freed, so
+ * at worst we may set a latch for the wrong process or for no process
+ * at all, which is harmless.
+ *
+ * Note: use a separate loop variable here; reusing 'i' would clobber
+ * the heap index for the next iteration of the outer loop.
+ */
+ for (int j = 0; j < numWakeUpProcs; j++)
+ SetLatch(&GetPGProcByNumber(wakeUpProcs[j])->procLatch);
+
+ } while (numWakeUpProcs == WAKEUP_PROC_STATIC_ARRAY_SIZE);
+}
+
+/*
+ * Wake up processes waiting for LSN to reach currentLSN
+ */
+void
+WaitLSNWakeup(WaitLSNType lsnType, XLogRecPtr currentLSN)
+{
+ int i = (int) lsnType;
+
+ Assert(i >= 0 && i < WAIT_LSN_TYPE_COUNT);
+
+ /*
+ * Fast-path check. Note that we don't take the fast path if currentLSN
+ * is InvalidXLogRecPtr, which means "wake all waiters" (e.g., during
+ * promotion when recovery ends).
+ */
+ if (XLogRecPtrIsValid(currentLSN) &&
+ pg_atomic_read_u64(&waitLSNState->minWaitedLSN[i]) > currentLSN)
+ return;
+
+ wakeupWaiters(lsnType, currentLSN);
+}
+
+/*
+ * Clean up LSN waiters for the exiting process
+ */
+void
+WaitLSNCleanup(void)
+{
+ if (waitLSNState)
+ {
+ /*
+ * We do a fast-path check of the inHeap flag without the lock. This
+ * flag is set to true only by the process itself. So, it's only
+ * possible to get a false positive. But that will be eliminated by a
+ * recheck inside deleteLSNWaiter().
+ */
+ if (waitLSNState->procInfos[MyProcNumber].inHeap)
+ deleteLSNWaiter(waitLSNState->procInfos[MyProcNumber].lsnType);
+ }
+}
+
+/*
+ * Wait using MyLatch until the given LSN is reached, the replica gets
+ * promoted, or the postmaster dies.
+ *
+ * Returns WAIT_LSN_RESULT_SUCCESS if the target LSN was reached.
+ * Returns WAIT_LSN_RESULT_NOT_IN_RECOVERY if not running in recovery,
+ * or if the replica was promoted before the target LSN was reached.
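
wakeupWaiters() above drains the heap in fixed-size batches so that the SetLatch() calls happen outside the lock. A reduced standalone model of that collect-under-lock, notify-outside-lock pattern (pthread primitives stand in for LWLocks and latches; all names are illustrative):

#include <pthread.h>
#include <stdio.h>

#define BATCH_SIZE 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int	ready[11];
static int	nready = 11;		/* pretend 11 waiters are ready */

static void
wakeup_all(void)
{
	int			batch[BATCH_SIZE];
	int			n;

	do
	{
		n = 0;
		pthread_mutex_lock(&lock);
		while (nready > 0 && n < BATCH_SIZE)
			batch[n++] = ready[--nready];
		pthread_mutex_unlock(&lock);

		/* Deliver notifications outside the lock (SetLatch analogue). */
		for (int i = 0; i < n; i++)
			printf("wake proc %d\n", batch[i]);
	} while (n == BATCH_SIZE);	/* a full batch may mean more remain */
}

int
main(void)
{
	for (int i = 0; i < nready; i++)
		ready[i] = i;
	wakeup_all();				/* drains in batches of 4, 4, 3 */
	return 0;
}
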
+ */
+WaitLSNResult
+WaitForLSN(WaitLSNType lsnType, XLogRecPtr targetLSN, int64 timeout)
+{
+ XLogRecPtr currentLSN;
+ TimestampTz endtime = 0;
+ int wake_events = WL_LATCH_SET | WL_POSTMASTER_DEATH;
+
+ /* Shouldn't be called when shmem isn't initialized */
+ Assert(waitLSNState);
+
+ /* Should have a valid proc number */
+ Assert(MyProcNumber >= 0 && MyProcNumber < MaxBackends + NUM_AUXILIARY_PROCS);
+
+ if (timeout > 0)
+ {
+ endtime = TimestampTzPlusMilliseconds(GetCurrentTimestamp(), timeout);
+ wake_events |= WL_TIMEOUT;
+ }
+
+ /*
+ * Add our process to the waiters heap. It might happen that the target
+ * LSN gets reached before we add ourselves. The check at the beginning
+ * of the loop below prevents the race condition.
+ */
+ addLSNWaiter(targetLSN, lsnType);
+
+ for (;;)
+ {
+ int rc;
+ long delay_ms = -1;
+
+ if (lsnType == WAIT_LSN_TYPE_REPLAY)
+ currentLSN = GetXLogReplayRecPtr(NULL);
+ else
+ currentLSN = GetFlushRecPtr(NULL);
+
+ /* Check that recovery is still in progress */
+ if (lsnType == WAIT_LSN_TYPE_REPLAY && !RecoveryInProgress())
+ {
+ /*
+ * Recovery has ended, but check whether the target LSN was
+ * already reached.
+ */
+ deleteLSNWaiter(lsnType);
+
+ if (PromoteIsTriggered() && targetLSN <= currentLSN)
+ return WAIT_LSN_RESULT_SUCCESS;
+ return WAIT_LSN_RESULT_NOT_IN_RECOVERY;
+ }
+ else
+ {
+ /* Check if the waited LSN has been reached */
+ if (targetLSN <= currentLSN)
+ break;
+ }
+
+ if (timeout > 0)
+ {
+ delay_ms = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), endtime);
+ if (delay_ms <= 0)
+ break;
+ }
+
+ CHECK_FOR_INTERRUPTS();
+
+ rc = WaitLatch(MyLatch, wake_events, delay_ms,
+ (lsnType == WAIT_LSN_TYPE_REPLAY) ? WAIT_EVENT_WAIT_FOR_WAL_REPLAY : WAIT_EVENT_WAIT_FOR_WAL_FLUSH);
+
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ */
+ if (rc & WL_POSTMASTER_DEATH)
+ ereport(FATAL,
+ errcode(ERRCODE_ADMIN_SHUTDOWN),
+ errmsg("terminating connection due to unexpected postmaster exit"),
+ errcontext("while waiting for LSN"));
+
+ if (rc & WL_LATCH_SET)
+ ResetLatch(MyLatch);
+ }
+
+ /*
+ * Delete our process from the shared memory heap. We might already be
+ * deleted by the startup process. The 'inHeap' flag prevents double
+ * deletion.
+ */
+ deleteLSNWaiter(lsnType);
+
+ /*
+ * If we didn't reach the target LSN, we must have exited because of
+ * timeout.
+ */
+ if (targetLSN > currentLSN)
+ return WAIT_LSN_RESULT_TIMEOUT;
+
+ return WAIT_LSN_RESULT_SUCCESS;
+}
diff --git a/src/backend/backup/backup_manifest.c b/src/backend/backup/backup_manifest.c
index 22e2be37c95c3..dd76c9b0b6383 100644
--- a/src/backend/backup/backup_manifest.c
+++ b/src/backend/backup/backup_manifest.c
@@ -242,7 +242,7 @@ AddWALInfoToBackupManifest(backup_manifest_info *manifest, XLogRecPtr startptr,
 * entry->end is InvalidXLogRecPtr, it means that the timeline has not
 * yet ended.)
 */
- if (!XLogRecPtrIsInvalid(entry->end) && entry->end < startptr)
+ if (XLogRecPtrIsValid(entry->end) && entry->end < startptr)
 continue;
 
 /*
@@ -274,14 +274,14 @@ AddWALInfoToBackupManifest(backup_manifest_info *manifest, XLogRecPtr startptr,
 * better have arrived at the expected starting TLI. If not,
 * something's gone horribly wrong.
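
WaitForLSN() above computes an absolute deadline once and rederives the remaining delay on each pass, so a spurious latch wakeup shortens the next sleep instead of restarting the full timeout. A standalone sketch of that structure (nanosleep() stands in for WaitLatch(); names are illustrative):

#include <stdio.h>
#include <time.h>

static long long
now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

int
main(void)
{
	long long	deadline = now_ms() + 50;	/* 50 ms total budget */
	int			wakeups = 0;

	for (;;)
	{
		long long	delay = deadline - now_ms();

		if (delay <= 0)
			break;				/* WAIT_LSN_RESULT_TIMEOUT analogue */

		/* Pretend the latch is set spuriously after ~10 ms. */
		struct timespec nap = {0, 10 * 1000 * 1000};

		nanosleep(&nap, NULL);
		wakeups++;
	}
	printf("timed out after %d wakeups\n", wakeups);
	return 0;
}
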
*/ - if (XLogRecPtrIsInvalid(entry->begin)) + if (!XLogRecPtrIsValid(entry->begin)) ereport(ERROR, errmsg("expected start timeline %u but found timeline %u", starttli, entry->tli)); } AppendToManifest(manifest, - "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }", + "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%08X\", \"End-LSN\": \"%X/%08X\" }", first_wal_range ? "" : ",\n", entry->tli, LSN_FORMAT_ARGS(tl_beginptr), diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index f0f88838dc21a..719a758624dea 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -239,7 +239,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, TimeLineID endtli; backup_manifest_info manifest; BackupState *backup_state; - StringInfo tablespace_map; + StringInfoData tablespace_map; /* Initial backup state, insofar as we know it now. */ state.tablespaces = NIL; @@ -262,12 +262,12 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, total_checksum_failures = 0; /* Allocate backup related variables. */ - backup_state = (BackupState *) palloc0(sizeof(BackupState)); - tablespace_map = makeStringInfo(); + backup_state = palloc0_object(BackupState); + initStringInfo(&tablespace_map); basebackup_progress_wait_checkpoint(); do_pg_backup_start(opt->label, opt->fastcheckpoint, &state.tablespaces, - backup_state, tablespace_map); + backup_state, &tablespace_map); state.startptr = backup_state->startpoint; state.starttli = backup_state->starttli; @@ -289,7 +289,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, PrepareForIncrementalBackup(ib, backup_state); /* Add a node for the base directory at the end */ - newti = palloc0(sizeof(tablespaceinfo)); + newti = palloc0_object(tablespaceinfo); newti->size = -1; state.tablespaces = lappend(state.tablespaces, newti); @@ -342,7 +342,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, if (opt->sendtblspcmapfile) { sendFileWithContent(sink, TABLESPACE_MAP, - tablespace_map->data, -1, &manifest); + tablespace_map.data, -1, &manifest); sendtblspclinks = false; } @@ -399,7 +399,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, endtli = backup_state->stoptli; /* Deallocate backup-related variables. */ - destroyStringInfo(tablespace_map); + pfree(tablespace_map.data); pfree(backup_state); } PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false)); @@ -808,8 +808,8 @@ parse_basebackup_options(List *options, basebackup_options *opt) if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)", - (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER))); + errmsg("%" PRId64 " is outside the valid range for parameter \"%s\" (%d .. %d)", + maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER))); opt->maxrate = (uint32) maxrate; o_maxrate = true; @@ -1048,7 +1048,7 @@ SendBaseBackup(BaseBackupCmd *cmd, IncrementalBackupInfo *ib) sink = bbsink_zstd_new(sink, &opt.compression_specification); /* Set up progress reporting. */ - sink = bbsink_progress_new(sink, opt.progress); + sink = bbsink_progress_new(sink, opt.progress, opt.incremental); /* * Perform the base backup, but make sure we clean up the bbsink even if @@ -1206,7 +1206,7 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, * But we don't need it at all if this is not an incremental backup. 
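
The perform_base_backup() hunk above switches tablespace_map from a palloc'd StringInfo to a stack-allocated StringInfoData, so only the character buffer lives on the heap and cleanup is a single pfree of the buffer. A reduced standalone model of the resulting memory layout (the StrBuf type and helpers are invented for illustration; the real API grows with palloc/repalloc):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct { char *data; size_t len, maxlen; } StrBuf;

static void
strbuf_init(StrBuf *b)			/* initStringInfo() analogue */
{
	b->maxlen = 64;
	b->len = 0;
	b->data = malloc(b->maxlen);
	b->data[0] = '\0';
}

static void
strbuf_append(StrBuf *b, const char *s)
{
	size_t		n = strlen(s);

	while (b->len + n + 1 > b->maxlen)
		b->data = realloc(b->data, b->maxlen *= 2);
	memcpy(b->data + b->len, s, n + 1);
	b->len += n;
}

int
main(void)
{
	StrBuf		buf;			/* struct on the stack, as in the patch */

	strbuf_init(&buf);
	strbuf_append(&buf, "16385 /mnt/ts1\n");
	fputs(buf.data, stdout);
	free(buf.data);				/* pfree(tablespace_map.data) analogue */
	return 0;
}
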
*/ if (ib != NULL) - relative_block_numbers = palloc(sizeof(BlockNumber) * RELSEG_SIZE); + relative_block_numbers = palloc_array(BlockNumber, RELSEG_SIZE); /* * Determine if the current path is a database directory that can contain diff --git a/src/backend/backup/basebackup_copy.c b/src/backend/backup/basebackup_copy.c index a284ce318ff7d..8bb8d3939fe22 100644 --- a/src/backend/backup/basebackup_copy.c +++ b/src/backend/backup/basebackup_copy.c @@ -107,7 +107,7 @@ static const bbsink_ops bbsink_copystream_ops = { bbsink * bbsink_copystream_new(bool send_to_client) { - bbsink_copystream *sink = palloc0(sizeof(bbsink_copystream)); + bbsink_copystream *sink = palloc0_object(bbsink_copystream); *((const bbsink_ops **) &sink->base.bbs_ops) = &bbsink_copystream_ops; sink->send_to_client = send_to_client; @@ -143,7 +143,7 @@ bbsink_copystream_begin_backup(bbsink *sink) buf = palloc(mysink->base.bbs_buffer_length + MAXIMUM_ALIGNOF); mysink->msgbuffer = buf + (MAXIMUM_ALIGNOF - 1); mysink->base.bbs_buffer = buf + MAXIMUM_ALIGNOF; - mysink->msgbuffer[0] = 'd'; /* archive or manifest data */ + mysink->msgbuffer[0] = PqMsg_CopyData; /* archive or manifest data */ /* Tell client the backup start location. */ SendXlogRecPtrResult(state->startptr, state->starttli); @@ -170,7 +170,7 @@ bbsink_copystream_begin_archive(bbsink *sink, const char *archive_name) ti = list_nth(state->tablespaces, state->tablespace_num); pq_beginmessage(&buf, PqMsg_CopyData); - pq_sendbyte(&buf, 'n'); /* New archive */ + pq_sendbyte(&buf, PqBackupMsg_NewArchive); pq_sendstring(&buf, archive_name); pq_sendstring(&buf, ti->path == NULL ? "" : ti->path); pq_endmessage(&buf); @@ -191,7 +191,7 @@ bbsink_copystream_archive_contents(bbsink *sink, size_t len) if (mysink->send_to_client) { /* Add one because we're also sending a leading type byte. */ - pq_putmessage('d', mysink->msgbuffer, len + 1); + pq_putmessage(PqMsg_CopyData, mysink->msgbuffer, len + 1); } /* Consider whether to send a progress report to the client. */ @@ -221,7 +221,7 @@ bbsink_copystream_archive_contents(bbsink *sink, size_t len) mysink->last_progress_report_time = now; pq_beginmessage(&buf, PqMsg_CopyData); - pq_sendbyte(&buf, 'p'); /* Progress report */ + pq_sendbyte(&buf, PqBackupMsg_ProgressReport); pq_sendint64(&buf, state->bytes_done); pq_endmessage(&buf); pq_flush_if_writable(); @@ -247,7 +247,7 @@ bbsink_copystream_end_archive(bbsink *sink) mysink->bytes_done_at_last_time_check = state->bytes_done; mysink->last_progress_report_time = GetCurrentTimestamp(); pq_beginmessage(&buf, PqMsg_CopyData); - pq_sendbyte(&buf, 'p'); /* Progress report */ + pq_sendbyte(&buf, PqBackupMsg_ProgressReport); pq_sendint64(&buf, state->bytes_done); pq_endmessage(&buf); pq_flush_if_writable(); @@ -262,7 +262,7 @@ bbsink_copystream_begin_manifest(bbsink *sink) StringInfoData buf; pq_beginmessage(&buf, PqMsg_CopyData); - pq_sendbyte(&buf, 'm'); /* Manifest */ + pq_sendbyte(&buf, PqBackupMsg_Manifest); pq_endmessage(&buf); } @@ -277,7 +277,7 @@ bbsink_copystream_manifest_contents(bbsink *sink, size_t len) if (mysink->send_to_client) { /* Add one because we're also sending a leading type byte. 
*/ - pq_putmessage('d', mysink->msgbuffer, len + 1); + pq_putmessage(PqMsg_CopyData, mysink->msgbuffer, len + 1); } } @@ -361,7 +361,7 @@ SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli) tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); /* Data row */ - values[0] = CStringGetTextDatum(psprintf("%X/%X", LSN_FORMAT_ARGS(ptr))); + values[0] = CStringGetTextDatum(psprintf("%X/%08X", LSN_FORMAT_ARGS(ptr))); values[1] = Int64GetDatum(tli); do_tup_output(tstate, values, nulls); diff --git a/src/backend/backup/basebackup_gzip.c b/src/backend/backup/basebackup_gzip.c index c4cbb5f527644..aaad834291afc 100644 --- a/src/backend/backup/basebackup_gzip.c +++ b/src/backend/backup/basebackup_gzip.c @@ -76,7 +76,7 @@ bbsink_gzip_new(bbsink *next, pg_compress_specification *compress) Assert((compresslevel >= 1 && compresslevel <= 9) || compresslevel == Z_DEFAULT_COMPRESSION); - sink = palloc0(sizeof(bbsink_gzip)); + sink = palloc0_object(bbsink_gzip); *((const bbsink_ops **) &sink->base.bbs_ops) = &bbsink_gzip_ops; sink->base.bbs_next = next; sink->compresslevel = compresslevel; diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c index 28491b1e0ab08..7678e4f6ec33f 100644 --- a/src/backend/backup/basebackup_incremental.c +++ b/src/backend/backup/basebackup_incremental.c @@ -157,7 +157,7 @@ CreateIncrementalBackupInfo(MemoryContext mcxt) oldcontext = MemoryContextSwitchTo(mcxt); - ib = palloc0(sizeof(IncrementalBackupInfo)); + ib = palloc0_object(IncrementalBackupInfo); ib->mcxt = mcxt; initStringInfo(&ib->buf); @@ -169,7 +169,7 @@ CreateIncrementalBackupInfo(MemoryContext mcxt) */ ib->manifest_files = backup_file_create(mcxt, 10000, NULL); - context = palloc0(sizeof(JsonManifestParseContext)); + context = palloc0_object(JsonManifestParseContext); /* Parse the manifest. 
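
The basebackup_copy.c hunks above replace bare character literals with named message-type constants. An illustrative standalone sketch of the readability win; the constant names mirror the ones used in the hunks, and their byte values are taken from the literals they replace:

#include <stdio.h>

#define PqMsg_CopyData             'd'
#define PqBackupMsg_NewArchive     'n'
#define PqBackupMsg_ProgressReport 'p'
#define PqBackupMsg_Manifest       'm'

static void
describe(char msgtype)
{
	switch (msgtype)
	{
		case PqMsg_CopyData:
			puts("archive or manifest data");
			break;
		case PqBackupMsg_NewArchive:
			puts("new archive starts");
			break;
		case PqBackupMsg_ProgressReport:
			puts("progress report");
			break;
		case PqBackupMsg_Manifest:
			puts("manifest follows");
			break;
		default:
			puts("unknown message");
			break;
	}
}

int
main(void)
{
	describe(PqBackupMsg_ProgressReport);
	return 0;
}
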
*/ context->private_data = ib; context->version_cb = manifest_process_version; @@ -409,7 +409,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->start_lsn < tlep[i]->begin) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X", + errmsg("manifest requires WAL from initial timeline %u starting at %X/%08X, but that timeline begins at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->start_lsn), LSN_FORMAT_ARGS(tlep[i]->begin)))); @@ -419,7 +419,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->start_lsn != tlep[i]->begin) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X", + errmsg("manifest requires WAL from continuation timeline %u starting at %X/%08X, but that timeline begins at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->start_lsn), LSN_FORMAT_ARGS(tlep[i]->begin)))); @@ -430,7 +430,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->end_lsn > backup_state->startpoint) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X", + errmsg("manifest requires WAL from final timeline %u ending at %X/%08X, but this backup starts at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->end_lsn), LSN_FORMAT_ARGS(backup_state->startpoint)), @@ -441,7 +441,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->end_lsn != tlep[i]->end) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X", + errmsg("manifest requires WAL from non-final timeline %u ending at %X/%08X, but this server switched timelines at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->end_lsn), LSN_FORMAT_ARGS(tlep[i]->end)))); @@ -519,21 +519,21 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn, &tli_missing_lsn)) { - if (XLogRecPtrIsInvalid(tli_missing_lsn)) + if (!XLogRecPtrIsValid(tli_missing_lsn)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist", + errmsg("WAL summaries are required on timeline %u from %X/%08X to %X/%08X, but no summaries for that timeline and LSN range exist", tle->tli, LSN_FORMAT_ARGS(tli_start_lsn), LSN_FORMAT_ARGS(tli_end_lsn)))); else ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete", + errmsg("WAL summaries are required on timeline %u from %X/%08X to %X/%08X, but the summaries for that timeline and LSN range are incomplete", tle->tli, LSN_FORMAT_ARGS(tli_start_lsn), LSN_FORMAT_ARGS(tli_end_lsn)), - errdetail("The first unsummarized LSN in this range is %X/%X.", + errdetail("The first unsummarized LSN in this range is %X/%08X.", LSN_FORMAT_ARGS(tli_missing_lsn)))); } @@ -993,7 +993,7 @@ manifest_process_wal_range(JsonManifestParseContext *context, XLogRecPtr end_lsn) { IncrementalBackupInfo *ib = context->private_data; - backup_wal_range *range = palloc(sizeof(backup_wal_range)); + 
backup_wal_range *range = palloc_object(backup_wal_range); range->tli = tli; range->start_lsn = start_lsn; diff --git a/src/backend/backup/basebackup_lz4.c b/src/backend/backup/basebackup_lz4.c index c5ceccb846f57..bd5704520fb7d 100644 --- a/src/backend/backup/basebackup_lz4.c +++ b/src/backend/backup/basebackup_lz4.c @@ -75,7 +75,7 @@ bbsink_lz4_new(bbsink *next, pg_compress_specification *compress) compresslevel = compress->level; Assert(compresslevel >= 0 && compresslevel <= 12); - sink = palloc0(sizeof(bbsink_lz4)); + sink = palloc0_object(bbsink_lz4); *((const bbsink_ops **) &sink->base.bbs_ops) = &bbsink_lz4_ops; sink->base.bbs_next = next; sink->compresslevel = compresslevel; diff --git a/src/backend/backup/basebackup_progress.c b/src/backend/backup/basebackup_progress.c index 1d22b541f89af..d783c9f573875 100644 --- a/src/backend/backup/basebackup_progress.c +++ b/src/backend/backup/basebackup_progress.c @@ -56,23 +56,28 @@ static const bbsink_ops bbsink_progress_ops = { * forwards data to a successor sink. */ bbsink * -bbsink_progress_new(bbsink *next, bool estimate_backup_size) +bbsink_progress_new(bbsink *next, bool estimate_backup_size, bool incremental) { bbsink *sink; Assert(next != NULL); - sink = palloc0(sizeof(bbsink)); + sink = palloc0_object(bbsink); *((const bbsink_ops **) &sink->bbs_ops) = &bbsink_progress_ops; sink->bbs_next = next; /* * Report that a base backup is in progress, and set the total size of the * backup to -1, which will get translated to NULL. If we're estimating - * the backup size, we'll insert the real estimate when we have it. + * the backup size, we'll insert the real estimate when we have it. Also, + * the backup type is set. */ pgstat_progress_start_command(PROGRESS_COMMAND_BASEBACKUP, InvalidOid); pgstat_progress_update_param(PROGRESS_BASEBACKUP_BACKUP_TOTAL, -1); + pgstat_progress_update_param(PROGRESS_BASEBACKUP_BACKUP_TYPE, + incremental + ? 
PROGRESS_BASEBACKUP_BACKUP_TYPE_INCREMENTAL + : PROGRESS_BASEBACKUP_BACKUP_TYPE_FULL); return sink; } diff --git a/src/backend/backup/basebackup_server.c b/src/backend/backup/basebackup_server.c index f5c0c61640a94..0d44a148f017c 100644 --- a/src/backend/backup/basebackup_server.c +++ b/src/backend/backup/basebackup_server.c @@ -59,7 +59,7 @@ static const bbsink_ops bbsink_server_ops = { bbsink * bbsink_server_new(bbsink *next, char *pathname) { - bbsink_server *sink = palloc0(sizeof(bbsink_server)); + bbsink_server *sink = palloc0_object(bbsink_server); *((const bbsink_ops **) &sink->base.bbs_ops) = &bbsink_server_ops; sink->pathname = pathname; @@ -176,9 +176,9 @@ bbsink_server_archive_contents(bbsink *sink, size_t len) /* short write: complain appropriately */ ereport(ERROR, (errcode(ERRCODE_DISK_FULL), - errmsg("could not write file \"%s\": wrote only %d of %d bytes at offset %u", + errmsg("could not write file \"%s\": wrote only %d of %zu bytes at offset %u", FilePathName(mysink->file), - nbytes, (int) len, (unsigned) mysink->filepos), + nbytes, len, (unsigned) mysink->filepos), errhint("Check free disk space."))); } @@ -269,9 +269,9 @@ bbsink_server_manifest_contents(bbsink *sink, size_t len) /* short write: complain appropriately */ ereport(ERROR, (errcode(ERRCODE_DISK_FULL), - errmsg("could not write file \"%s\": wrote only %d of %d bytes at offset %u", + errmsg("could not write file \"%s\": wrote only %d of %zu bytes at offset %u", FilePathName(mysink->file), - nbytes, (int) len, (unsigned) mysink->filepos), + nbytes, len, (unsigned) mysink->filepos), errhint("Check free disk space."))); } diff --git a/src/backend/backup/basebackup_target.c b/src/backend/backup/basebackup_target.c index 84b1309d3bdc8..8b74828aed6d3 100644 --- a/src/backend/backup/basebackup_target.c +++ b/src/backend/backup/basebackup_target.c @@ -96,7 +96,7 @@ BaseBackupAddTarget(char *name, * name into a newly-allocated chunk of memory. */ oldcontext = MemoryContextSwitchTo(TopMemoryContext); - newtype = palloc(sizeof(BaseBackupTargetType)); + newtype = palloc_object(BaseBackupTargetType); newtype->name = pstrdup(name); newtype->check_detail = check_detail; newtype->get_sink = get_sink; @@ -132,7 +132,7 @@ BaseBackupGetTargetHandle(char *target, char *target_detail) BaseBackupTargetHandle *handle; /* Found the target. 
*/ - handle = palloc(sizeof(BaseBackupTargetHandle)); + handle = palloc_object(BaseBackupTargetHandle); handle->type = ttype; handle->detail_arg = ttype->check_detail(target, target_detail); diff --git a/src/backend/backup/basebackup_throttle.c b/src/backend/backup/basebackup_throttle.c index b2b743238f9d0..95746c3ea4093 100644 --- a/src/backend/backup/basebackup_throttle.c +++ b/src/backend/backup/basebackup_throttle.c @@ -72,7 +72,7 @@ bbsink_throttle_new(bbsink *next, uint32 maxrate) Assert(next != NULL); Assert(maxrate > 0); - sink = palloc0(sizeof(bbsink_throttle)); + sink = palloc0_object(bbsink_throttle); *((const bbsink_ops **) &sink->base.bbs_ops) = &bbsink_throttle_ops; sink->base.bbs_next = next; diff --git a/src/backend/backup/basebackup_zstd.c b/src/backend/backup/basebackup_zstd.c index 18b2e8fb0b3b6..647ee0eb97831 100644 --- a/src/backend/backup/basebackup_zstd.c +++ b/src/backend/backup/basebackup_zstd.c @@ -70,7 +70,7 @@ bbsink_zstd_new(bbsink *next, pg_compress_specification *compress) Assert(next != NULL); - sink = palloc0(sizeof(bbsink_zstd)); + sink = palloc0_object(bbsink_zstd); *((const bbsink_ops **) &sink->base.bbs_ops) = &bbsink_zstd_ops; sink->base.bbs_next = next; sink->compress = compress; diff --git a/src/backend/backup/walsummary.c b/src/backend/backup/walsummary.c index c7a2c65cc6a7a..a843876337bf8 100644 --- a/src/backend/backup/walsummary.c +++ b/src/backend/backup/walsummary.c @@ -67,13 +67,13 @@ GetWalSummaries(TimeLineID tli, XLogRecPtr start_lsn, XLogRecPtr end_lsn) /* Skip if it doesn't match the filter criteria. */ if (tli != 0 && tli != file_tli) continue; - if (!XLogRecPtrIsInvalid(start_lsn) && start_lsn >= file_end_lsn) + if (XLogRecPtrIsValid(start_lsn) && start_lsn >= file_end_lsn) continue; - if (!XLogRecPtrIsInvalid(end_lsn) && end_lsn <= file_start_lsn) + if (XLogRecPtrIsValid(end_lsn) && end_lsn <= file_start_lsn) continue; /* Add it to the list. */ - ws = palloc(sizeof(WalSummaryFile)); + ws = palloc_object(WalSummaryFile); ws->tli = file_tli; ws->start_lsn = file_start_lsn; ws->end_lsn = file_end_lsn; @@ -111,9 +111,9 @@ FilterWalSummaries(List *wslist, TimeLineID tli, /* Skip if it doesn't match the filter criteria. */ if (tli != 0 && tli != ws->tli) continue; - if (!XLogRecPtrIsInvalid(start_lsn) && start_lsn > ws->end_lsn) + if (XLogRecPtrIsValid(start_lsn) && start_lsn > ws->end_lsn) continue; - if (!XLogRecPtrIsInvalid(end_lsn) && end_lsn < ws->start_lsn) + if (XLogRecPtrIsValid(end_lsn) && end_lsn < ws->start_lsn) continue; /* Add it to the result list. 
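
Many hunks in this patch convert palloc(sizeof(T)) to palloc_object(T) and palloc(sizeof(T) * n) to palloc_array(T, n), deriving the allocation size from the type instead of a repeated, error-prone sizeof expression. A standalone sketch of the idea (malloc stands in for palloc; the alloc_* macros are reduced versions of the real ones):

#include <stdio.h>
#include <stdlib.h>

#define alloc_object(type)       ((type *) malloc(sizeof(type)))
#define alloc_array(type, count) ((type *) malloc(sizeof(type) * (count)))

typedef struct { int tli; long start_lsn, end_lsn; } WalSummaryFile;

int
main(void)
{
	/* Before: ws = palloc(sizeof(WalSummaryFile)); */
	WalSummaryFile *ws = alloc_object(WalSummaryFile);

	/* Before: arr = palloc(sizeof(WalSummaryFile) * 4); */
	WalSummaryFile *arr = alloc_array(WalSummaryFile, 4);

	ws->tli = 1;
	arr[3] = *ws;
	printf("%d\n", arr[3].tli);
	free(ws);
	free(arr);
	return 0;
}
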
*/ diff --git a/src/backend/backup/walsummaryfuncs.c b/src/backend/backup/walsummaryfuncs.c index d6dd131da145b..29e2cb83ff449 100644 --- a/src/backend/backup/walsummaryfuncs.c +++ b/src/backend/backup/walsummaryfuncs.c @@ -12,6 +12,7 @@ #include "postgres.h" +#include "access/htup_details.h" #include "backup/walsummary.h" #include "common/blkreftable.h" #include "funcapi.h" diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 6db864892d0dd..4986b1ea7ed01 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -109,6 +109,8 @@ static const struct typinfo TypInfo[] = { F_REGROLEIN, F_REGROLEOUT}, {"regnamespace", REGNAMESPACEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, F_REGNAMESPACEIN, F_REGNAMESPACEOUT}, + {"regdatabase", REGDATABASEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, + F_REGDATABASEIN, F_REGDATABASEOUT}, {"text", TEXTOID, 0, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID, F_TEXTIN, F_TEXTOUT}, {"oid", OIDOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, @@ -740,7 +742,7 @@ populate_typ_list(void) Form_pg_type typForm = (Form_pg_type) GETSTRUCT(tup); struct typmap *newtyp; - newtyp = (struct typmap *) palloc(sizeof(struct typmap)); + newtyp = palloc_object(struct typmap); Typ = lappend(Typ, newtyp); newtyp->am_oid = typForm->oid; @@ -949,10 +951,10 @@ index_register(Oid heap, oldcxt = MemoryContextSwitchTo(nogc); - newind = (IndexList *) palloc(sizeof(IndexList)); + newind = palloc_object(IndexList); newind->il_heap = heap; newind->il_ind = ind; - newind->il_info = (IndexInfo *) palloc(sizeof(IndexInfo)); + newind->il_info = palloc_object(IndexInfo); memcpy(newind->il_info, indexInfo, sizeof(IndexInfo)); /* expressions will likely be null, but may as well copy it */ diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index c090094ed08d5..8e40e1b8189f8 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -44,6 +44,7 @@ OBJS = \ pg_range.o \ pg_shdepend.o \ pg_subscription.o \ + pg_tablespace.o \ pg_type.o \ storage.o \ toasting.o diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index 9ca8a88dc9104..5b410ff14c933 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -64,7 +64,6 @@ #include "catalog/pg_proc.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" -#include "commands/dbcommands.h" #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/extension.h" @@ -581,7 +580,7 @@ ExecuteGrantStmt(GrantStmt *stmt) elog(ERROR, "AccessPriv node must specify privilege or columns"); priv = string_to_privilege(privnode->priv_name); - if (priv & ~((AclMode) all_privileges)) + if (priv & ~all_privileges) ereport(ERROR, (errcode(ERRCODE_INVALID_GRANT_OPERATION), errmsg(errormsg, privilege_to_string(priv)))); @@ -659,6 +658,20 @@ ExecGrantStmt_oids(InternalGrant *istmt) * objectNamesToOids * * Turn a list of object names of a given type into an Oid list. + * + * XXX This function intentionally takes only an AccessShareLock. In the face + * of concurrent DDL, we might easily latch onto an old version of an object, + * causing the GRANT or REVOKE statement to fail. But it does prevent the + * object from disappearing altogether. To do better, we would need to use a + * self-exclusive lock, perhaps ShareUpdateExclusiveLock, here and before + * *every* CatalogTupleUpdate() of a row that GRANT/REVOKE can affect. 
+ * Besides that additional work, this could have operational costs. For + * example, it would make GRANT ALL TABLES IN SCHEMA terminate every + * autovacuum running in the schema and consume a shared lock table entry per + * table in the schema. The user-visible benefit of that additional work is + * just changing "ERROR: tuple concurrently updated" to blocking. That's not + * nothing, but it might not outweigh autovacuum termination and lock table + * consumption spikes. */ static List * objectNamesToOids(ObjectType objtype, List *objnames, bool is_grant) @@ -1046,7 +1059,7 @@ ExecAlterDefaultPrivilegesStmt(ParseState *pstate, AlterDefaultPrivilegesStmt *s elog(ERROR, "AccessPriv node must specify privilege"); priv = string_to_privilege(privnode->priv_name); - if (priv & ~((AclMode) all_privileges)) + if (priv & ~all_privileges) ereport(ERROR, (errcode(ERRCODE_INVALID_GRANT_OPERATION), errmsg(errormsg, privilege_to_string(priv)))); @@ -1194,7 +1207,8 @@ SetDefaultACL(InternalDefaultACL *iacls) if (OidIsValid(iacls->nspid)) ereport(ERROR, (errcode(ERRCODE_INVALID_GRANT_OPERATION), - errmsg("cannot use IN SCHEMA clause when using GRANT/REVOKE ON SCHEMAS"))); + errmsg("cannot use IN SCHEMA clause when using %s", + "GRANT/REVOKE ON SCHEMAS"))); objtype = DEFACLOBJ_NAMESPACE; if (iacls->all_privs && this_privileges == ACL_NO_RIGHTS) this_privileges = ACL_ALL_RIGHTS_SCHEMA; @@ -1204,7 +1218,8 @@ SetDefaultACL(InternalDefaultACL *iacls) if (OidIsValid(iacls->nspid)) ereport(ERROR, (errcode(ERRCODE_INVALID_GRANT_OPERATION), - errmsg("cannot use IN SCHEMA clause when using GRANT/REVOKE ON LARGE OBJECTS"))); + errmsg("cannot use IN SCHEMA clause when using %s", + "GRANT/REVOKE ON LARGE OBJECTS"))); objtype = DEFACLOBJ_LARGEOBJECT; if (iacls->all_privs && this_privileges == ACL_NO_RIGHTS) this_privileges = ACL_ALL_RIGHTS_LARGEOBJECT; @@ -3112,7 +3127,7 @@ object_aclmask_ext(Oid classid, Oid objectid, Oid roleid, result = aclmask(acl, roleid, ownerId, mask, how); /* if we have a detoasted copy, free it */ - if (acl && (Pointer) acl != DatumGetPointer(aclDatum)) + if (acl && acl != DatumGetPointer(aclDatum)) pfree(acl); ReleaseSysCache(tuple); @@ -3242,7 +3257,7 @@ pg_attribute_aclmask_ext(Oid table_oid, AttrNumber attnum, Oid roleid, result = aclmask(acl, roleid, ownerId, mask, how); /* if we have a detoasted copy, free it */ - if (acl && (Pointer) acl != DatumGetPointer(aclDatum)) + if (acl && acl != DatumGetPointer(aclDatum)) pfree(acl); ReleaseSysCache(attTuple); @@ -3349,7 +3364,7 @@ pg_class_aclmask_ext(Oid table_oid, Oid roleid, AclMode mask, result = aclmask(acl, roleid, ownerId, mask, how); /* if we have a detoasted copy, free it */ - if (acl && (Pointer) acl != DatumGetPointer(aclDatum)) + if (acl && acl != DatumGetPointer(aclDatum)) pfree(acl); ReleaseSysCache(tuple); @@ -3441,7 +3456,7 @@ pg_parameter_aclmask(const char *name, Oid roleid, AclMode mask, AclMaskHow how) result = aclmask(acl, roleid, BOOTSTRAP_SUPERUSERID, mask, how); /* if we have a detoasted copy, free it */ - if (acl && (Pointer) acl != DatumGetPointer(aclDatum)) + if (acl && acl != DatumGetPointer(aclDatum)) pfree(acl); ReleaseSysCache(tuple); @@ -3496,7 +3511,7 @@ pg_parameter_acl_aclmask(Oid acl_oid, Oid roleid, AclMode mask, AclMaskHow how) result = aclmask(acl, roleid, BOOTSTRAP_SUPERUSERID, mask, how); /* if we have a detoasted copy, free it */ - if (acl && (Pointer) acl != DatumGetPointer(aclDatum)) + if (acl && acl != DatumGetPointer(aclDatum)) pfree(acl); ReleaseSysCache(tuple); @@ -3576,7 +3591,7 @@ 
pg_largeobject_aclmask_snapshot(Oid lobj_oid, Oid roleid, result = aclmask(acl, roleid, ownerId, mask, how); /* if we have a detoasted copy, free it */ - if (acl && (Pointer) acl != DatumGetPointer(aclDatum)) + if (acl && acl != DatumGetPointer(aclDatum)) pfree(acl); systable_endscan(scan); @@ -3670,7 +3685,7 @@ pg_namespace_aclmask_ext(Oid nsp_oid, Oid roleid, result = aclmask(acl, roleid, ownerId, mask, how); /* if we have a detoasted copy, free it */ - if (acl && (Pointer) acl != DatumGetPointer(aclDatum)) + if (acl && acl != DatumGetPointer(aclDatum)) pfree(acl); ReleaseSysCache(tuple); @@ -3806,7 +3821,7 @@ pg_type_aclmask_ext(Oid type_oid, Oid roleid, AclMode mask, AclMaskHow how, result = aclmask(acl, roleid, ownerId, mask, how); /* if we have a detoasted copy, free it */ - if (acl && (Pointer) acl != DatumGetPointer(aclDatum)) + if (acl && acl != DatumGetPointer(aclDatum)) pfree(acl); ReleaseSysCache(tuple); @@ -3990,7 +4005,7 @@ pg_attribute_aclcheck_all_ext(Oid table_oid, Oid roleid, attmask = aclmask(acl, roleid, ownerId, mode, ACLMASK_ANY); /* if we have a detoasted copy, free it */ - if ((Pointer) acl != DatumGetPointer(aclDatum)) + if (acl != DatumGetPointer(aclDatum)) pfree(acl); } diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 18316a3968bcf..7489bbd5fb34f 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -22,6 +22,7 @@ #include "catalog/dependency.h" #include "catalog/heap.h" #include "catalog/index.h" +#include "catalog/namespace.h" #include "catalog/objectaccess.h" #include "catalog/pg_am.h" #include "catalog/pg_amop.h" @@ -320,13 +321,63 @@ performDeletion(const ObjectAddress *object, } /* - * performMultipleDeletions: Similar to performDeletion, but act on multiple + * performDeletionCheck: Check whether a specific object can be safely deleted. + * This function does not perform any deletion; instead, it raises an error + * if the object cannot be deleted due to existing dependencies. + * + * It can be useful when you need to delete some objects later. See comments + * in performDeletion too. + * The behavior must be specified as DROP_RESTRICT. + */ +void +performDeletionCheck(const ObjectAddress *object, + DropBehavior behavior, int flags) +{ + Relation depRel; + ObjectAddresses *targetObjects; + + Assert(behavior == DROP_RESTRICT); + + depRel = table_open(DependRelationId, RowExclusiveLock); + + AcquireDeletionLock(object, 0); + + /* + * Construct a list of objects we want to delete later (ie, the given + * object plus everything directly or indirectly dependent on it). + */ + targetObjects = new_object_addresses(); + + findDependentObjects(object, + DEPFLAG_ORIGINAL, + flags, + NULL, /* empty stack */ + targetObjects, + NULL, /* no pendingObjects */ + &depRel); + + /* + * Check if deletion is allowed. + */ + reportDependentObjects(targetObjects, + behavior, + flags, + object); + + /* And clean up */ + free_object_addresses(targetObjects); + + table_close(depRel, RowExclusiveLock); +} + +/* + * performMultipleDeletions: Similar to performDeletion, but acts on multiple * objects at once. * * The main difference from issuing multiple performDeletion calls is that the * list of objects that would be implicitly dropped, for each object to be * dropped, is the union of the implicit-object list for all objects. This - * makes each check be more relaxed. + * makes each check more relaxed. 
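
The aclchk.c hunks above drop now-unneeded Pointer casts in the idiom that frees a detoasted ACL only when detoasting actually produced a copy. A reduced standalone model of that conditional-free pattern (maybe_detoast is invented for illustration; the real code compares against DatumGetPointer(aclDatum)):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pretend detoasting: copy only when the "compressed" flag is set. */
static char *
maybe_detoast(char *datum, int compressed)
{
	if (!compressed)
		return datum;			/* no copy was needed */
	return strdup(datum);		/* caller may have to free this */
}

static void
use_acl(char *datum, int compressed)
{
	char	   *acl = maybe_detoast(datum, compressed);

	printf("acl: %s\n", acl);

	/* if we have a detoasted copy, free it */
	if (acl && acl != datum)
		free(acl);
}

int
main(void)
{
	char		stored[] = "ro=r/owner";

	use_acl(stored, 0);			/* no copy: nothing to free */
	use_acl(stored, 1);			/* copy made: freed after use */
	return 0;
}
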
*/ void performMultipleDeletions(const ObjectAddresses *objects, @@ -800,8 +851,7 @@ findDependentObjects(const ObjectAddress *object, * regression testing.) */ maxDependentObjects = 128; /* arbitrary initial allocation */ - dependentObjects = (ObjectAddressAndFlags *) - palloc(maxDependentObjects * sizeof(ObjectAddressAndFlags)); + dependentObjects = palloc_array(ObjectAddressAndFlags, maxDependentObjects); numDependentObjects = 0; ScanKeyInit(&key[0], @@ -1554,25 +1604,57 @@ recordDependencyOnExpr(const ObjectAddress *depender, Node *expr, List *rtable, DependencyType behavior) { - find_expr_references_context context; - - context.addrs = new_object_addresses(); + ObjectAddresses *addrs; - /* Set up interpretation for Vars at varlevelsup = 0 */ - context.rtables = list_make1(rtable); + addrs = new_object_addresses(); - /* Scan the expression tree for referenceable objects */ - find_expr_references_walker(expr, &context); + /* Collect all dependencies from the expression */ + collectDependenciesOfExpr(addrs, expr, rtable); - /* Remove any duplicates */ - eliminate_duplicate_dependencies(context.addrs); + /* Remove duplicates */ + eliminate_duplicate_dependencies(addrs); /* And record 'em */ recordMultipleDependencies(depender, - context.addrs->refs, context.addrs->numrefs, + addrs->refs, addrs->numrefs, behavior); - free_object_addresses(context.addrs); + free_object_addresses(addrs); +} + +/* + * collectDependenciesOfExpr - collect expression dependencies + * + * This function analyzes an expression or query in node-tree form to + * find all the objects it refers to (tables, columns, operators, + * functions, etc.) and adds them to the provided ObjectAddresses + * structure. Unlike recordDependencyOnExpr, this function does not + * immediately record the dependencies, allowing the caller to add to, + * filter, or modify the collected dependencies before recording them. + * + * rtable is the rangetable to be used to interpret Vars with varlevelsup=0. + * It can be NIL if no such variables are expected. + * + * Note: the returned list may well contain duplicates. The caller should + * de-duplicate before recording the dependencies. Within this file, callers + * must call eliminate_duplicate_dependencies(). External callers typically + * go through record_object_address_dependencies() which will see to that. + * This choice allows collecting dependencies from multiple sources without + * redundant de-duplication work. + */ +void +collectDependenciesOfExpr(ObjectAddresses *addrs, + Node *expr, List *rtable) +{ + find_expr_references_context context; + + context.addrs = addrs; + + /* Set up interpretation for Vars at varlevelsup = 0 */ + context.rtables = list_make1(rtable); + + /* Scan the expression tree for referenceable objects */ + find_expr_references_walker(expr, &context); } /* @@ -1850,6 +1932,17 @@ find_expr_references_walker(Node *node, errmsg("constant of the type %s cannot be used here", "regrole"))); break; + + /* + * Dependencies for regdatabase should be shared among all + * databases, so explicitly inhibit to have dependencies. 
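
find_temp_object() above scans the collected references and reports the first one living in a temporary namespace, optionally ignoring our own temp schema. A reduced standalone model, with hard-coded namespace OIDs standing in for isAnyTempNamespace()/isTempNamespace() (all values hypothetical):

#include <stdbool.h>
#include <stdio.h>

typedef struct { unsigned oid; unsigned nspoid; } ObjRef;

#define MY_TEMP_NS    99U		/* our own pg_temp_N */
#define OTHER_TEMP_NS 77U		/* someone else's temp schema */

static bool
is_any_temp(unsigned nspoid)
{
	return nspoid == MY_TEMP_NS || nspoid == OTHER_TEMP_NS;
}

/* Return the first temp object, skipping our own temp schema if allowed. */
static bool
find_temp_object(const ObjRef *refs, int n, bool local_temp_okay,
				 ObjRef *found)
{
	for (int i = 0; i < n; i++)
	{
		if (is_any_temp(refs[i].nspoid) &&
			!(local_temp_okay && refs[i].nspoid == MY_TEMP_NS))
		{
			*found = refs[i];
			return true;
		}
	}
	return false;
}

int
main(void)
{
	ObjRef		refs[] = {{1001, 11}, {1002, MY_TEMP_NS}, {1003, OTHER_TEMP_NS}};
	ObjRef		hit;

	if (find_temp_object(refs, 3, true, &hit))
		printf("temp object %u in namespace %u\n", hit.oid, hit.nspoid);
	return 0;
}
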
+ */ + case REGDATABASEOID: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constant of the type %s cannot be used here", + "regdatabase"))); + break; } } return false; @@ -2391,6 +2484,75 @@ process_function_rte_ref(RangeTblEntry *rte, AttrNumber attnum, attnum, rte->eref->aliasname))); } +/* + * find_temp_object - search an array of dependency references for temp objects + * + * Scan an ObjectAddresses array for references to temporary objects (objects + * in temporary namespaces), ignoring those in our own temp namespace if + * local_temp_okay is true. If one is found, return true after storing its + * address in *foundobj. + * + * Current callers only use this to deliver helpful notices, so reporting + * one such object seems sufficient. We return the first one, which should + * be a stable result for a given query since it depends only on the order + * in which this module searches query trees. (However, it's important to + * call this before de-duplicating the objects, else OID order would affect + * the result.) + */ +bool +find_temp_object(const ObjectAddresses *addrs, bool local_temp_okay, + ObjectAddress *foundobj) +{ + for (int i = 0; i < addrs->numrefs; i++) + { + const ObjectAddress *thisobj = addrs->refs + i; + Oid objnamespace; + + /* + * Use get_object_namespace() to see if this object belongs to a + * schema. If not, we can skip it. + */ + objnamespace = get_object_namespace(thisobj); + + /* + * If the object is in a temporary namespace, complain, except if + * local_temp_okay and it's our own temp namespace. + */ + if (OidIsValid(objnamespace) && isAnyTempNamespace(objnamespace) && + !(local_temp_okay && isTempNamespace(objnamespace))) + { + *foundobj = *thisobj; + return true; + } + } + return false; +} + +/* + * query_uses_temp_object - convenience wrapper for find_temp_object + * + * If the Query includes any use of a temporary object, fill *temp_object + * with the address of one such object and return true. + */ +bool +query_uses_temp_object(Query *query, ObjectAddress *temp_object) +{ + bool result; + ObjectAddresses *addrs; + + addrs = new_object_addresses(); + + /* Collect all dependencies from the Query */ + collectDependenciesOfExpr(addrs, (Node *) query, NIL); + + /* Look for one that is temp */ + result = find_temp_object(addrs, false, temp_object); + + free_object_addresses(addrs); + + return result; +} + /* * Given an array of dependency references, eliminate any duplicates. */ @@ -2503,12 +2665,11 @@ new_object_addresses(void) { ObjectAddresses *addrs; - addrs = palloc(sizeof(ObjectAddresses)); + addrs = palloc_object(ObjectAddresses); addrs->numrefs = 0; addrs->maxrefs = 32; - addrs->refs = (ObjectAddress *) - palloc(addrs->maxrefs * sizeof(ObjectAddress)); + addrs->refs = palloc_array(ObjectAddress, addrs->maxrefs); addrs->extras = NULL; /* until/unless needed */ return addrs; diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index df3231fcd41c2..6c02aee726754 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -1054,8 +1054,7 @@ sub morph_row_for_schemapg } # Expand booleans from 'f'/'t' to 'false'/'true'. - # Some values might be other macros (eg FLOAT8PASSBYVAL), - # don't change. + # Some values might be other macros, if so don't change. 
elsif ($atttype eq 'bool') { $row->{$attname} = 'true' if $row->{$attname} eq 't'; diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index fbaed5359ad7c..265cc3e5fbf45 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -664,6 +664,15 @@ CheckAttributeType(const char *attname, flags); } + /* + * For consistency with check_virtual_generated_security(). + */ + if ((flags & CHKATYPE_IS_VIRTUAL) && atttypid >= FirstUnpinnedObjectId) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("virtual generated column \"%s\" cannot have a user-defined type", attname), + errdetail("Virtual generated columns that make use of user-defined types are not yet supported.")); + /* * This might not be strictly invalid per SQL standard, but it is pretty * useless, and it cannot be dumped, so we must disallow it. @@ -723,7 +732,7 @@ InsertPgAttributeTuples(Relation pg_attribute_rel, /* Initialize the number of slots to use */ nslots = Min(tupdesc->natts, (MAX_CATALOG_MULTI_INSERT_BYTES / sizeof(FormData_pg_attribute))); - slot = palloc(sizeof(TupleTableSlot *) * nslots); + slot = palloc_array(TupleTableSlot *, nslots); for (int i = 0; i < nslots; i++) slot[i] = MakeSingleTupleTableSlot(td, &TTSOpsHeapTuple); @@ -1100,6 +1109,7 @@ AddNewRelationType(const char *typeName, * if false, relacl is always set NULL * allow_system_table_mods: true to allow creation in system namespaces * is_internal: is this a system-generated catalog? + * relrewrite: link to original relation during a table rewrite * * Output parameters: * typaddress: if not null, gets the object address of the new pg_type entry @@ -2449,7 +2459,7 @@ AddRelationNewConstraints(Relation rel, defOid = StoreAttrDefault(rel, colDef->attnum, expr, is_internal); - cooked = (CookedConstraint *) palloc(sizeof(CookedConstraint)); + cooked = palloc_object(CookedConstraint); cooked->contype = CONSTR_DEFAULT; cooked->conoid = defOid; cooked->name = NULL; @@ -2583,7 +2593,7 @@ AddRelationNewConstraints(Relation rel, numchecks++; - cooked = (CookedConstraint *) palloc(sizeof(CookedConstraint)); + cooked = palloc_object(CookedConstraint); cooked->contype = CONSTR_CHECK; cooked->conoid = constrOid; cooked->name = ccname; @@ -2659,7 +2669,7 @@ AddRelationNewConstraints(Relation rel, inhcount, cdef->is_no_inherit); - nncooked = (CookedConstraint *) palloc(sizeof(CookedConstraint)); + nncooked = palloc_object(CookedConstraint); nncooked->contype = CONSTR_NOTNULL; nncooked->conoid = constrOid; nncooked->name = nnname; @@ -2996,7 +3006,7 @@ AddRelationNotNullConstraints(Relation rel, List *constraints, if (constr->is_no_inherit) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("cannot define not-null constraint on column \"%s\" with NO INHERIT", + errmsg("cannot define not-null constraint with NO INHERIT on column \"%s\"", strVal(linitial(constr->keys))), errdetail("The column has an inherited not-null constraint."))); @@ -3214,6 +3224,86 @@ check_nested_generated(ParseState *pstate, Node *node) check_nested_generated_walker(node, pstate); } +/* + * Check security of virtual generated column expression. + * + * Just like selecting from a view is exploitable (CVE-2024-7348), selecting + * from a table with virtual generated columns is exploitable. Users who are + * concerned about this can avoid selecting from views, but telling them to + * avoid selecting from tables is less practical. 
+ * + * To address this, generation expressions for virtual generated columns are + * restricted to using built-in functions and types. We + * assume that built-in functions and types cannot be exploited for this + * purpose. Note that overall security also requires that all functions in use + * are immutable. (For example, there are some built-in non-immutable functions + * that can run arbitrary SQL.) The immutability is checked elsewhere, since + * that is a property that needs to hold independent of security + * considerations. + * + * In the future, this could be expanded by some new mechanism to declare + * other functions and types as safe or trusted for this purpose, but that is + * to be designed. + */ + +/* + * Callback for check_functions_in_node() that determines whether a function + * is user-defined. + */ +static bool +contains_user_functions_checker(Oid func_id, void *context) +{ + return (func_id >= FirstUnpinnedObjectId); +} + +/* + * Checks for all the things we don't want in the generation expressions of + * virtual generated columns for security reasons. Errors out if it finds + * one. + */ +static bool +check_virtual_generated_security_walker(Node *node, void *context) +{ + ParseState *pstate = context; + + if (node == NULL) + return false; + + if (!IsA(node, List)) + { + if (check_functions_in_node(node, contains_user_functions_checker, NULL)) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("generation expression uses user-defined function"), + errdetail("Virtual generated columns that make use of user-defined functions are not yet supported."), + parser_errposition(pstate, exprLocation(node))); + + /* + * check_functions_in_node() doesn't check some node types (see + * comment there). We handle CoerceToDomain and MinMaxExpr by + * checking for built-in types. The other listed node types cannot + * call user-definable SQL-visible functions. + * + * We furthermore need this type check to handle built-in, immutable + * polymorphic functions such as array_eq(). + */ + if (exprType(node) >= FirstUnpinnedObjectId) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("generation expression uses user-defined type"), + errdetail("Virtual generated columns that make use of user-defined types are not yet supported."), + parser_errposition(pstate, exprLocation(node))); + } + + return expression_tree_walker(node, check_virtual_generated_security_walker, context); +} + +static void +check_virtual_generated_security(ParseState *pstate, Node *node) +{ + check_virtual_generated_security_walker(node, pstate); +} + /* * Take a raw default and convert it to a cooked format ready for * storage.
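(For reference: check_virtual_generated_security_walker() above follows the standard expression-walker recursion pattern from nodeFuncs.h. A minimal sketch of that pattern, detached from this patch; the walker name and the checks are illustrative, not actual code from the tree:

static bool
my_check_walker(Node *node, void *context)
{
	if (node == NULL)
		return false;

	/* examine 'node' here and ereport(ERROR, ...) on anything disallowed */

	/* then recurse into the child nodes */
	return expression_tree_walker(node, my_check_walker, context);
}

Returning false means "keep walking"; any node type not handled explicitly is descended into by expression_tree_walker().)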
@@ -3253,6 +3343,10 @@ cookDefault(ParseState *pstate, ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("generation expression is not immutable"))); + + /* Check security of expressions for virtual generated column */ + if (attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + check_virtual_generated_security(pstate, expr); } else { diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 739a92bdcc1ca..8dea58ad96bf0 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -800,11 +800,11 @@ index_create(Relation heapRelation, errmsg("user-defined indexes on system catalog tables are not supported"))); /* - * Btree text_pattern_ops uses text_eq as the equality operator, which is - * fine as long as the collation is deterministic; text_eq then reduces to + * Btree text_pattern_ops uses texteq as the equality operator, which is + * fine as long as the collation is deterministic; texteq then reduces to * bitwise equality and so it is semantically compatible with the other * operators and functions in that opclass. But with a nondeterministic - * collation, text_eq could yield results that are incompatible with the + * collation, texteq could yield results that are incompatible with the * actual behavior of the index (which is determined by the opclass's * comparison function). We prevent such problems by refusing creation of * an index with that opclass and a nondeterministic collation. @@ -814,7 +814,7 @@ index_create(Relation heapRelation, * opclasses as incompatible with nondeterminism; but for now, this small * hack suffices. * - * Another solution is to use a special operator, not text_eq, as the + * Another solution is to use a special operator, not texteq, as the * equality opclass member; but that is undesirable because it would * prevent index usage in many queries that work fine today. */ @@ -1414,7 +1414,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, } /* Extract opclass options for each attribute */ - opclassOptions = palloc0(sizeof(Datum) * newInfo->ii_NumIndexAttrs); + opclassOptions = palloc0_array(Datum, newInfo->ii_NumIndexAttrs); for (int i = 0; i < newInfo->ii_NumIndexAttrs; i++) opclassOptions[i] = get_attoptions(oldIndexId, i + 1); @@ -2678,9 +2678,9 @@ BuildSpeculativeIndexInfo(Relation index, IndexInfo *ii) */ Assert(ii->ii_Unique); - ii->ii_UniqueOps = (Oid *) palloc(sizeof(Oid) * indnkeyatts); - ii->ii_UniqueProcs = (Oid *) palloc(sizeof(Oid) * indnkeyatts); - ii->ii_UniqueStrats = (uint16 *) palloc(sizeof(uint16) * indnkeyatts); + ii->ii_UniqueOps = palloc_array(Oid, indnkeyatts); + ii->ii_UniqueProcs = palloc_array(Oid, indnkeyatts); + ii->ii_UniqueStrats = palloc_array(uint16, indnkeyatts); /* * We have to look up the operator's strategy number. This provides a @@ -3014,13 +3014,13 @@ index_build(Relation heapRelation, * sanity checks */ Assert(RelationIsValid(indexRelation)); - Assert(PointerIsValid(indexRelation->rd_indam)); - Assert(PointerIsValid(indexRelation->rd_indam->ambuild)); - Assert(PointerIsValid(indexRelation->rd_indam->ambuildempty)); + Assert(indexRelation->rd_indam); + Assert(indexRelation->rd_indam->ambuild); + Assert(indexRelation->rd_indam->ambuildempty); /* * Determine worker process details for parallel CREATE INDEX. Currently, - * only btree and BRIN have support for parallel builds. + * only btree, GIN, and BRIN have support for parallel builds. * * Note that planner considers parallel safety for us. 
*/ @@ -3077,7 +3077,7 @@ index_build(Relation heapRelation, */ stats = indexRelation->rd_indam->ambuild(heapRelation, indexRelation, indexInfo); - Assert(PointerIsValid(stats)); + Assert(stats); /* * If this is an unlogged index, we may need to write out an init fork for diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index 25c4b6bdc87f0..004c5121000fe 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -310,7 +310,7 @@ CatalogTuplesMultiInsertWithInfo(Relation heapRel, TupleTableSlot **slot, * (Use CatalogTupleUpdateWithInfo in such cases.) */ void -CatalogTupleUpdate(Relation heapRel, ItemPointer otid, HeapTuple tup) +CatalogTupleUpdate(Relation heapRel, const ItemPointerData *otid, HeapTuple tup) { CatalogIndexState indstate; TU_UpdateIndexes updateIndexes = TU_All; @@ -334,7 +334,7 @@ CatalogTupleUpdate(Relation heapRel, ItemPointer otid, HeapTuple tup) * so that callers needn't trouble over this ... but we don't do so today. */ void -CatalogTupleUpdateWithInfo(Relation heapRel, ItemPointer otid, HeapTuple tup, +CatalogTupleUpdateWithInfo(Relation heapRel, const ItemPointerData *otid, HeapTuple tup, CatalogIndexState indstate) { TU_UpdateIndexes updateIndexes = TU_All; @@ -362,7 +362,7 @@ CatalogTupleUpdateWithInfo(Relation heapRel, ItemPointer otid, HeapTuple tup, * it might be better to do something about caching CatalogIndexState. */ void -CatalogTupleDelete(Relation heapRel, ItemPointer tid) +CatalogTupleDelete(Relation heapRel, const ItemPointerData *tid) { simple_heap_delete(heapRel, tid); } diff --git a/src/backend/catalog/meson.build b/src/backend/catalog/meson.build index 1958ea9238a76..58674ffeee642 100644 --- a/src/backend/catalog/meson.build +++ b/src/backend/catalog/meson.build @@ -31,6 +31,7 @@ backend_sources += files( 'pg_range.c', 'pg_shdepend.c', 'pg_subscription.c', + 'pg_tablespace.c', 'pg_type.c', 'storage.c', 'toasting.c', diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index d97d632a7ef55..c94089caa5820 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -41,7 +41,6 @@ #include "catalog/pg_ts_parser.h" #include "catalog/pg_ts_template.h" #include "catalog/pg_type.h" -#include "commands/dbcommands.h" #include "common/hashfn_unstable.h" #include "funcapi.h" #include "mb/pg_wchar.h" @@ -233,7 +232,7 @@ static void RemoveTempRelationsCallback(int code, Datum arg); static void InvalidationCallback(Datum arg, int cacheid, uint32 hashvalue); static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames, bool include_out_arguments, int pronargs, - int **argnumbers); + int **argnumbers, int *fgc_flags); /* * Recomputing the namespace path can be costly when done frequently, such as @@ -1118,15 +1117,15 @@ TypeIsVisibleExt(Oid typid, bool *is_missing) /* * FuncnameGetCandidates - * Given a possibly-qualified function name and argument count, + * Given a possibly-qualified routine name, argument count, and arg names, * retrieve a list of the possible matches. * - * If nargs is -1, we return all functions matching the given name, + * If nargs is -1, we return all routines matching the given name, * regardless of argument count. (argnames must be NIL, and expand_variadic * and expand_defaults must be false, in this case.) * * If argnames isn't NIL, we are considering a named- or mixed-notation call, - * and only functions having all the listed argument names will be returned. 
+ * and only routines having all the listed argument names will be returned. * (We assume that length(argnames) <= nargs and all the passed-in names are * distinct.) The returned structs will include an argnumbers array showing * the actual argument index for each logical argument position. @@ -1184,14 +1183,21 @@ TypeIsVisibleExt(Oid typid, bool *is_missing) * The caller might end up discarding such an entry anyway, but if it selects * such an entry it should react as though the call were ambiguous. * - * If missing_ok is true, an empty list (NULL) is returned if the name was - * schema-qualified with a schema that does not exist. Likewise if no - * candidate is found for other reasons. + * We return an empty list (NULL) if no suitable matches can be found. + * If the function name was schema-qualified with a schema that does not + * exist, then we return an empty list if missing_ok is true and otherwise + * throw an error. (missing_ok does not affect the behavior otherwise.) + * + * The output argument *fgc_flags is filled with a bitmask indicating how + * far we were able to match the supplied information. This is not of much + * interest if any candidates were found, but if not, it can help callers + * produce an on-point error message. */ FuncCandidateList FuncnameGetCandidates(List *names, int nargs, List *argnames, bool expand_variadic, bool expand_defaults, - bool include_out_arguments, bool missing_ok) + bool include_out_arguments, bool missing_ok, + int *fgc_flags) { FuncCandidateList resultList = NULL; bool any_special = false; @@ -1204,15 +1210,20 @@ FuncnameGetCandidates(List *names, int nargs, List *argnames, /* check for caller error */ Assert(nargs >= 0 || !(expand_variadic | expand_defaults)); + /* initialize output fgc_flags to empty */ + *fgc_flags = 0; + /* deconstruct the name list */ DeconstructQualifiedName(names, &schemaname, &funcname); if (schemaname) { /* use exact schema given */ + *fgc_flags |= FGC_SCHEMA_GIVEN; /* report that a schema is given */ namespaceId = LookupExplicitNamespace(schemaname, missing_ok); if (!OidIsValid(namespaceId)) return NULL; + *fgc_flags |= FGC_SCHEMA_EXISTS; /* report that the schema exists */ } else { @@ -1238,6 +1249,8 @@ FuncnameGetCandidates(List *names, int nargs, List *argnames, int *argnumbers = NULL; FuncCandidateList newResult; + *fgc_flags |= FGC_NAME_EXISTS; /* the name is present in pg_proc */ + if (OidIsValid(namespaceId)) { /* Consider only procs in specified namespace */ @@ -1263,6 +1276,8 @@ FuncnameGetCandidates(List *names, int nargs, List *argnames, continue; /* proc is not in search path */ } + *fgc_flags |= FGC_NAME_VISIBLE; /* routine is in the right schema */ + /* * If we are asked to match to OUT arguments, then use the * proallargtypes array (which includes those); otherwise use @@ -1297,16 +1312,6 @@ FuncnameGetCandidates(List *names, int nargs, List *argnames, /* * Call uses named or mixed notation * - * Named or mixed notation can match a variadic function only if - * expand_variadic is off; otherwise there is no way to match the - * presumed-nameless parameters expanded from the variadic array. - */ - if (OidIsValid(procform->provariadic) && expand_variadic) - continue; - va_elem_type = InvalidOid; - variadic = false; - - /* * Check argument count. 
*/ Assert(nargs >= 0); /* -1 not supported with argnames */ @@ -1325,12 +1330,33 @@ FuncnameGetCandidates(List *names, int nargs, List *argnames, if (pronargs != nargs && !use_defaults) continue; + /* We found a routine with a suitable number of arguments */ + *fgc_flags |= FGC_ARGCOUNT_MATCH; + /* Check for argument name match, generate positional mapping */ if (!MatchNamedCall(proctup, nargs, argnames, include_out_arguments, pronargs, - &argnumbers)) + &argnumbers, fgc_flags)) continue; + /* + * Named or mixed notation can match a variadic function only if + * expand_variadic is off; otherwise there is no way to match the + * presumed-nameless parameters expanded from the variadic array. + * However, we postpone the check until here because we want to + * perform argument name matching anyway (using the variadic array + * argument's name). This allows us to give an on-point error + * message if the user forgets to say VARIADIC in what would have + * been a valid call with it. + */ + if (OidIsValid(procform->provariadic) && expand_variadic) + continue; + va_elem_type = InvalidOid; + variadic = false; + + /* We found a fully-valid call using argument names */ + *fgc_flags |= FGC_ARGNAMES_VALID; + /* Named argument matching is always "special" */ any_special = true; } @@ -1372,6 +1398,9 @@ FuncnameGetCandidates(List *names, int nargs, List *argnames, /* Ignore if it doesn't match requested argument count */ if (nargs >= 0 && pronargs != nargs && !variadic && !use_defaults) continue; + + /* We found a routine with a suitable number of arguments */ + *fgc_flags |= FGC_ARGCOUNT_MATCH; } /* @@ -1580,11 +1609,13 @@ FuncnameGetCandidates(List *names, int nargs, List *argnames, * the mapping from call argument positions to actual function argument * numbers. Defaulted arguments are included in this map, at positions * after the last supplied argument. + * + * We also add flag bits to *fgc_flags reporting on how far the match got. */ static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames, bool include_out_arguments, int pronargs, - int **argnumbers) + int **argnumbers, int *fgc_flags) { Form_pg_proc procform = (Form_pg_proc) GETSTRUCT(proctup); int numposargs = nargs - list_length(argnames); @@ -1593,6 +1624,7 @@ MatchNamedCall(HeapTuple proctup, int nargs, List *argnames, char **p_argnames; char *p_argmodes; bool arggiven[FUNC_MAX_ARGS]; + bool arg_filled_twice = false; bool isnull; int ap; /* call args position */ int pp; /* proargs position */ @@ -1646,9 +1678,9 @@ MatchNamedCall(HeapTuple proctup, int nargs, List *argnames, continue; if (p_argnames[i] && strcmp(p_argnames[i], argname) == 0) { - /* fail if argname matches a positional argument */ + /* note if argname matches a positional argument */ if (arggiven[pp]) - return false; + arg_filled_twice = true; arggiven[pp] = true; (*argnumbers)[ap] = pp; found = true; @@ -1665,6 +1697,16 @@ MatchNamedCall(HeapTuple proctup, int nargs, List *argnames, Assert(ap == nargs); /* processed all actual parameters */ + /* If we get here, the function did match all the supplied argnames */ + *fgc_flags |= FGC_ARGNAMES_MATCH; + + /* ... 
however, some of them might have been placed wrong */ + if (arg_filled_twice) + return false; /* some argname matched a positional argument */ + + /* If we get here, the call doesn't have invalid mixed notation */ + *fgc_flags |= FGC_ARGNAMES_NONDUP; + /* Check for default arguments */ if (nargs < pronargs) { @@ -1683,6 +1725,9 @@ MatchNamedCall(HeapTuple proctup, int nargs, List *argnames, Assert(ap == pronargs); /* processed all function parameters */ + /* If we get here, the call supplies all the required arguments */ + *fgc_flags |= FGC_ARGNAMES_ALL; + return true; } @@ -1746,11 +1791,13 @@ FunctionIsVisibleExt(Oid funcid, bool *is_missing) char *proname = NameStr(procform->proname); int nargs = procform->pronargs; FuncCandidateList clist; + int fgc_flags; visible = false; clist = FuncnameGetCandidates(list_make1(makeString(proname)), - nargs, NIL, false, false, false, false); + nargs, NIL, false, false, false, false, + &fgc_flags); for (; clist; clist = clist->next) { @@ -1883,9 +1930,20 @@ OpernameGetOprid(List *names, Oid oprleft, Oid oprright) * * The returned items always have two args[] entries --- the first will be * InvalidOid for a prefix oprkind. nargs is always 2, too. + * + * We return an empty list (NULL) if no suitable matches can be found. If the + * operator name was schema-qualified with a schema that does not exist, then + * we return an empty list if missing_schema_ok is true and otherwise throw an + * error. (missing_schema_ok does not affect the behavior otherwise.) + * + * The output argument *fgc_flags is filled with a bitmask indicating how + * far we were able to match the supplied information. This is not of much + * interest if any candidates were found, but if not, it can help callers + * produce an on-point error message. 
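 *
 * As an illustration only (this exact caller does not appear in the patch),
 * the flags let a caller distinguish failure modes:
 *
 *		int			fgc_flags;
 *		FuncCandidateList clist;
 *
 *		clist = OpernameGetCandidates(names, 'b', false, &fgc_flags);
 *		if (clist == NULL && !(fgc_flags & FGC_NAME_EXISTS))
 *			... report that no operator of this name exists at all ...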
*/ FuncCandidateList -OpernameGetCandidates(List *names, char oprkind, bool missing_schema_ok) +OpernameGetCandidates(List *names, char oprkind, bool missing_schema_ok, + int *fgc_flags) { FuncCandidateList resultList = NULL; char *resultSpace = NULL; @@ -1896,15 +1954,20 @@ OpernameGetCandidates(List *names, char oprkind, bool missing_schema_ok) CatCList *catlist; int i; + /* initialize output fgc_flags to empty */ + *fgc_flags = 0; + /* deconstruct the name list */ DeconstructQualifiedName(names, &schemaname, &opername); if (schemaname) { /* use exact schema given */ + *fgc_flags |= FGC_SCHEMA_GIVEN; /* report that a schema is given */ namespaceId = LookupExplicitNamespace(schemaname, missing_schema_ok); - if (missing_schema_ok && !OidIsValid(namespaceId)) + if (!OidIsValid(namespaceId)) return NULL; + *fgc_flags |= FGC_SCHEMA_EXISTS; /* report that the schema exists */ } else { @@ -1942,6 +2005,8 @@ OpernameGetCandidates(List *names, char oprkind, bool missing_schema_ok) if (oprkind && operform->oprkind != oprkind) continue; + *fgc_flags |= FGC_NAME_EXISTS; /* the name is present in pg_operator */ + if (OidIsValid(namespaceId)) { /* Consider only opers in specified namespace */ @@ -2015,6 +2080,8 @@ OpernameGetCandidates(List *names, char oprkind, bool missing_schema_ok) } } + *fgc_flags |= FGC_NAME_VISIBLE; /* operator is in the right schema */ + /* * Okay to add it to result list */ @@ -2686,6 +2753,9 @@ StatisticsObjIsVisibleExt(Oid stxid, bool *is_missing) { Oid namespaceId = lfirst_oid(l); + if (namespaceId == myTempNamespace) + continue; /* do not look in temp namespace */ + if (namespaceId == stxnamespace) { /* Found it first in path */ @@ -3859,7 +3929,7 @@ GetSearchPathMatcher(MemoryContext context) oldcxt = MemoryContextSwitchTo(context); - result = (SearchPathMatcher *) palloc0(sizeof(SearchPathMatcher)); + result = palloc0_object(SearchPathMatcher); schemas = list_copy(activeSearchPath); while (schemas && linitial_oid(schemas) != activeCreationNamespace) { @@ -3890,7 +3960,7 @@ CopySearchPathMatcher(SearchPathMatcher *path) { SearchPathMatcher *result; - result = (SearchPathMatcher *) palloc(sizeof(SearchPathMatcher)); + result = palloc_object(SearchPathMatcher); result->schemas = list_copy(path->schemas); result->addCatalog = path->addCatalog; result->addTemp = path->addTemp; diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index b63fd57dc04bb..fa6c6df598ae8 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -62,7 +62,6 @@ #include "catalog/pg_ts_template.h" #include "catalog/pg_type.h" #include "catalog/pg_user_mapping.h" -#include "commands/dbcommands.h" #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/extension.h" @@ -4283,8 +4282,8 @@ pg_identify_object(PG_FUNCTION_ARGS) nspAttnum = get_object_attnum_namespace(address.classId); if (nspAttnum != InvalidAttrNumber) { - schema_oid = heap_getattr(objtup, nspAttnum, - RelationGetDescr(catalog), &isnull); + schema_oid = DatumGetObjectId(heap_getattr(objtup, nspAttnum, + RelationGetDescr(catalog), &isnull)); if (isnull) elog(ERROR, "invalid null namespace in object %u/%u/%d", address.classId, address.objectId, address.objectSubId); @@ -4850,7 +4849,7 @@ getObjectIdentityParts(const ObjectAddress *object, * will be initialized in all cases inside the switch; but we do it anyway * so that we can test below that no branch leaves it unset. 
*/ - Assert(PointerIsValid(objname) == PointerIsValid(objargs)); + Assert((objname != NULL) == (objargs != NULL)); if (objname) { *objname = NIL; @@ -6145,8 +6144,8 @@ strlist_to_textarray(List *list) ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(memcxt); - datums = (Datum *) palloc(sizeof(Datum) * list_length(list)); - nulls = palloc(sizeof(bool) * list_length(list)); + datums = palloc_array(Datum, list_length(list)); + nulls = palloc_array(bool, list_length(list)); foreach(cell, list) { diff --git a/src/backend/catalog/pg_aggregate.c b/src/backend/catalog/pg_aggregate.c index a05f8a87c1f83..a1cb5719a0c6d 100644 --- a/src/backend/catalog/pg_aggregate.c +++ b/src/backend/catalog/pg_aggregate.c @@ -654,7 +654,7 @@ AggregateCreate(const char *aggName, for (i = 0; i < Natts_pg_aggregate; i++) { nulls[i] = false; - values[i] = (Datum) NULL; + values[i] = (Datum) 0; replaces[i] = true; } values[Anum_pg_aggregate_aggfnoid - 1] = ObjectIdGetDatum(procOid); @@ -836,6 +836,7 @@ lookup_agg_function(List *fnName, Oid vatype; Oid *true_oid_array; FuncDetailCode fdresult; + int fgc_flags; AclResult aclresult; int i; @@ -848,6 +849,7 @@ lookup_agg_function(List *fnName, */ fdresult = func_get_detail(fnName, NIL, NIL, nargs, input_types, false, false, false, + &fgc_flags, &fnOid, rettype, &retset, &nvargs, &vatype, &true_oid_array, NULL); diff --git a/src/backend/catalog/pg_attrdef.c b/src/backend/catalog/pg_attrdef.c index 1b6270b121324..29f5691bee9e0 100644 --- a/src/backend/catalog/pg_attrdef.c +++ b/src/backend/catalog/pg_attrdef.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/htup_details.h" #include "access/relation.h" #include "access/table.h" #include "catalog/dependency.h" diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index 2d5ac1ea8138b..672b188930f39 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -179,7 +179,7 @@ CreateConstraintEntry(const char *constraintName, for (i = 0; i < Natts_pg_constraint; i++) { nulls[i] = false; - values[i] = (Datum) NULL; + values[i] = (Datum) 0; } conOid = GetNewOidWithIndex(conDesc, ConstraintOidIndexId, @@ -846,7 +846,7 @@ RelationGetNotNullConstraints(Oid relid, bool cooked, bool include_noinh) { CookedConstraint *cooked; - cooked = (CookedConstraint *) palloc(sizeof(CookedConstraint)); + cooked = palloc_object(CookedConstraint); cooked->contype = CONSTR_NOTNULL; cooked->conoid = conForm->oid; @@ -875,7 +875,7 @@ RelationGetNotNullConstraints(Oid relid, bool cooked, bool include_noinh) false))); constr->is_enforced = true; constr->skip_validation = !conForm->convalidated; - constr->initially_valid = true; + constr->initially_valid = conForm->convalidated; constr->is_no_inherit = conForm->connoinherit; notnulls = lappend(notnulls, constr); } @@ -937,10 +937,12 @@ RemoveConstraintById(Oid conId) con->conrelid); classForm = (Form_pg_class) GETSTRUCT(relTup); - if (classForm->relchecks == 0) /* should not happen */ - elog(ERROR, "relation \"%s\" has relchecks = 0", - RelationGetRelationName(rel)); - classForm->relchecks--; + if (classForm->relchecks > 0) + classForm->relchecks--; + else + /* should not happen */ + elog(WARNING, "relation \"%s\" has relchecks = %d", + RelationGetRelationName(rel), classForm->relchecks); CatalogTupleUpdate(pgrel, &relTup->t_self, relTup); @@ -1542,7 +1544,7 @@ DeconstructFkConstraintRow(HeapTuple tuple, int *numfks, if (numkeys <= 0 || numkeys > INDEX_MAX_KEYS) elog(ERROR, "foreign key constraint cannot have %d columns", 
numkeys); memcpy(conkey, ARR_DATA_PTR(arr), numkeys * sizeof(int16)); - if ((Pointer) arr != DatumGetPointer(adatum)) + if (arr != DatumGetPointer(adatum)) pfree(arr); /* free de-toasted copy, if any */ adatum = SysCacheGetAttrNotNull(CONSTROID, tuple, @@ -1554,7 +1556,7 @@ DeconstructFkConstraintRow(HeapTuple tuple, int *numfks, ARR_ELEMTYPE(arr) != INT2OID) elog(ERROR, "confkey is not a 1-D smallint array"); memcpy(confkey, ARR_DATA_PTR(arr), numkeys * sizeof(int16)); - if ((Pointer) arr != DatumGetPointer(adatum)) + if (arr != DatumGetPointer(adatum)) pfree(arr); /* free de-toasted copy, if any */ if (pf_eq_oprs) @@ -1569,7 +1571,7 @@ DeconstructFkConstraintRow(HeapTuple tuple, int *numfks, ARR_ELEMTYPE(arr) != OIDOID) elog(ERROR, "conpfeqop is not a 1-D Oid array"); memcpy(pf_eq_oprs, ARR_DATA_PTR(arr), numkeys * sizeof(Oid)); - if ((Pointer) arr != DatumGetPointer(adatum)) + if (arr != DatumGetPointer(adatum)) pfree(arr); /* free de-toasted copy, if any */ } @@ -1584,7 +1586,7 @@ DeconstructFkConstraintRow(HeapTuple tuple, int *numfks, ARR_ELEMTYPE(arr) != OIDOID) elog(ERROR, "conppeqop is not a 1-D Oid array"); memcpy(pp_eq_oprs, ARR_DATA_PTR(arr), numkeys * sizeof(Oid)); - if ((Pointer) arr != DatumGetPointer(adatum)) + if (arr != DatumGetPointer(adatum)) pfree(arr); /* free de-toasted copy, if any */ } @@ -1599,7 +1601,7 @@ DeconstructFkConstraintRow(HeapTuple tuple, int *numfks, ARR_ELEMTYPE(arr) != OIDOID) elog(ERROR, "conffeqop is not a 1-D Oid array"); memcpy(ff_eq_oprs, ARR_DATA_PTR(arr), numkeys * sizeof(Oid)); - if ((Pointer) arr != DatumGetPointer(adatum)) + if (arr != DatumGetPointer(adatum)) pfree(arr); /* free de-toasted copy, if any */ } @@ -1622,7 +1624,7 @@ DeconstructFkConstraintRow(HeapTuple tuple, int *numfks, elog(ERROR, "confdelsetcols is not a 1-D smallint array"); num_delete_cols = ARR_DIMS(arr)[0]; memcpy(fk_del_set_cols, ARR_DATA_PTR(arr), num_delete_cols * sizeof(int16)); - if ((Pointer) arr != DatumGetPointer(adatum)) + if (arr != DatumGetPointer(adatum)) pfree(arr); /* free de-toasted copy, if any */ *num_fk_del_set_cols = num_delete_cols; diff --git a/src/backend/catalog/pg_conversion.c b/src/backend/catalog/pg_conversion.c index 04cc375caea8c..090f680d1908f 100644 --- a/src/backend/catalog/pg_conversion.c +++ b/src/backend/catalog/pg_conversion.c @@ -87,7 +87,7 @@ ConversionCreate(const char *conname, Oid connamespace, for (i = 0; i < Natts_pg_conversion; i++) { nulls[i] = false; - values[i] = (Datum) NULL; + values[i] = (Datum) 0; } /* form a tuple */ diff --git a/src/backend/catalog/pg_db_role_setting.c b/src/backend/catalog/pg_db_role_setting.c index 090fc07c28acb..832e49a34bea5 100644 --- a/src/backend/catalog/pg_db_role_setting.c +++ b/src/backend/catalog/pg_db_role_setting.c @@ -151,6 +151,15 @@ AlterSetting(Oid databaseid, Oid roleid, VariableSetStmt *setstmt) CatalogTupleInsert(rel, newtuple); } + else + { + /* + * RESET doesn't need to change any state if there's no pre-existing + * pg_db_role_setting entry, but for consistency we should still check + * that the option is valid and we're allowed to set it. 
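	 * For example, an ALTER DATABASE ... RESET naming a misspelled or
	 * unrecognized parameter should presumably still fail, even though
	 * there is no stored setting to remove.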
+ */ + (void) GUCArrayDelete(NULL, setstmt->name); + } InvokeObjectPostAlterHookArg(DbRoleSettingRelationId, databaseid, 0, roleid, false); diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c index c8b11f887e274..a6f63e107bb69 100644 --- a/src/backend/catalog/pg_depend.c +++ b/src/backend/catalog/pg_depend.c @@ -88,7 +88,7 @@ recordMultipleDependencies(const ObjectAddress *depender, */ max_slots = Min(nreferenced, MAX_CATALOG_MULTI_INSERT_BYTES / sizeof(FormData_pg_depend)); - slot = palloc(sizeof(TupleTableSlot *) * max_slots); + slot = palloc_array(TupleTableSlot *, max_slots); /* Don't open indexes unless we need to make an update */ indstate = NULL; diff --git a/src/backend/catalog/pg_enum.c b/src/backend/catalog/pg_enum.c index a1634e58eecdd..07998839affdc 100644 --- a/src/backend/catalog/pg_enum.c +++ b/src/backend/catalog/pg_enum.c @@ -110,12 +110,6 @@ EnumValuesCreate(Oid enumTypeOid, List *vals) num_elems = list_length(vals); - /* - * We do not bother to check the list of values for duplicates --- if you - * have any, you'll get a less-than-friendly unique-index violation. It is - * probably not worth trying harder. - */ - pg_enum = table_open(EnumRelationId, RowExclusiveLock); /* @@ -126,7 +120,7 @@ EnumValuesCreate(Oid enumTypeOid, List *vals) * allocating the next), trouble could only occur if the OID counter wraps * all the way around before we finish. Which seems unlikely. */ - oids = (Oid *) palloc(num_elems * sizeof(Oid)); + oids = palloc_array(Oid, num_elems); for (elemno = 0; elemno < num_elems; elemno++) { @@ -154,7 +148,7 @@ EnumValuesCreate(Oid enumTypeOid, List *vals) /* allocate the slots to use and initialize them */ nslots = Min(num_elems, MAX_CATALOG_MULTI_INSERT_BYTES / sizeof(FormData_pg_enum)); - slot = palloc(sizeof(TupleTableSlot *) * nslots); + slot = palloc_array(TupleTableSlot *, nslots); for (int i = 0; i < nslots; i++) slot[i] = MakeSingleTupleTableSlot(RelationGetDescr(pg_enum), &TTSOpsHeapTuple); @@ -164,6 +158,7 @@ EnumValuesCreate(Oid enumTypeOid, List *vals) { char *lab = strVal(lfirst(lc)); Name enumlabel = palloc0(NAMEDATALEN); + ListCell *lc2; /* * labels are stored in a name field, for easier syscache lookup, so @@ -176,6 +171,24 @@ EnumValuesCreate(Oid enumTypeOid, List *vals) errdetail("Labels must be %d bytes or less.", NAMEDATALEN - 1))); + /* + * Check for duplicate labels. The unique index on pg_enum would catch + * that anyway, but we prefer a friendlier error message. 
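	 * For example, CREATE TYPE mood AS ENUM ('happy', 'sad', 'happy')
	 * should now fail with 'enum label "happy" used more than once'
	 * instead. (The O(n^2) pairwise scan below seems acceptable, since
	 * enum types rarely have more than a handful of labels.)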
+ */ + foreach(lc2, vals) + { + /* Only need to compare lc to earlier entries */ + if (lc2 == lc) + break; + + if (strcmp(lab, strVal(lfirst(lc2))) == 0) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("enum label \"%s\" used more than once", + lab))); + } + + /* OK, construct a tuple for this label */ ExecClearTuple(slot[slotCount]); memset(slot[slotCount]->tts_isnull, false, @@ -362,7 +375,7 @@ AddEnumLabel(Oid enumTypeOid, nelems = list->n_members; /* Sort the existing members by enumsortorder */ - existing = (HeapTuple *) palloc(nelems * sizeof(HeapTuple)); + existing = palloc_array(HeapTuple, nelems); for (i = 0; i < nelems; i++) existing[i] = &(list->members[i]->tuple); diff --git a/src/backend/catalog/pg_largeobject.c b/src/backend/catalog/pg_largeobject.c index 89fc810215099..33e8fa96a65a8 100644 --- a/src/backend/catalog/pg_largeobject.c +++ b/src/backend/catalog/pg_largeobject.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/htup_details.h" #include "access/table.h" #include "catalog/catalog.h" #include "catalog/indexing.h" diff --git a/src/backend/catalog/pg_namespace.c b/src/backend/catalog/pg_namespace.c index 6f5634a4de69b..616bcc7852113 100644 --- a/src/backend/catalog/pg_namespace.c +++ b/src/backend/catalog/pg_namespace.c @@ -76,7 +76,7 @@ NamespaceCreate(const char *nspName, Oid ownerId, bool isTemp) for (i = 0; i < Natts_pg_namespace; i++) { nulls[i] = false; - values[i] = (Datum) NULL; + values[i] = (Datum) 0; } nspoid = GetNewOidWithIndex(nspdesc, NamespaceOidIndexId, diff --git a/src/backend/catalog/pg_operator.c b/src/backend/catalog/pg_operator.c index bfcfa643464ac..44d2ccb6788e9 100644 --- a/src/backend/catalog/pg_operator.c +++ b/src/backend/catalog/pg_operator.c @@ -225,7 +225,7 @@ OperatorShellMake(const char *operatorName, for (i = 0; i < Natts_pg_operator; ++i) { nulls[i] = false; - values[i] = (Datum) NULL; /* redundant, but safe */ + values[i] = (Datum) 0; /* redundant, but safe */ } /* @@ -453,7 +453,7 @@ OperatorCreate(const char *operatorName, for (i = 0; i < Natts_pg_operator; ++i) { - values[i] = (Datum) NULL; + values[i] = (Datum) 0; replaces[i] = true; nulls[i] = false; } diff --git a/src/backend/catalog/pg_parameter_acl.c b/src/backend/catalog/pg_parameter_acl.c index 62a05783eb333..dcdf49ea408d6 100644 --- a/src/backend/catalog/pg_parameter_acl.c +++ b/src/backend/catalog/pg_parameter_acl.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/htup_details.h" #include "access/table.h" #include "catalog/catalog.h" #include "catalog/indexing.h" diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c index 5fdcf24d5f8de..d608f37d36188 100644 --- a/src/backend/catalog/pg_proc.c +++ b/src/backend/catalog/pg_proc.c @@ -20,6 +20,7 @@ #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/indexing.h" +#include "catalog/namespace.h" #include "catalog/objectaccess.h" #include "catalog/pg_language.h" #include "catalog/pg_namespace.h" @@ -141,7 +142,8 @@ ProcedureCreate(const char *procedureName, TupleDesc tupDesc; bool is_update; ObjectAddress myself, - referenced; + referenced, + temp_object; char *detailmsg; int i; ObjectAddresses *addrs; @@ -149,7 +151,7 @@ ProcedureCreate(const char *procedureName, /* * sanity checks */ - Assert(PointerIsValid(prosrc)); + Assert(prosrc); parameterCount = parameterTypes->dim1; if (parameterCount < 0 || parameterCount > FUNC_MAX_ARGS) @@ -658,17 +660,40 @@ ProcedureCreate(const char *procedureName, add_exact_object_address(&referenced, addrs); } 
- record_object_address_dependencies(&myself, addrs, DEPENDENCY_NORMAL); - free_object_addresses(addrs); - - /* dependency on SQL routine body */ + /* dependencies appearing in new-style SQL routine body */ if (languageObjectId == SQLlanguageId && prosqlbody) - recordDependencyOnExpr(&myself, prosqlbody, NIL, DEPENDENCY_NORMAL); + collectDependenciesOfExpr(addrs, prosqlbody, NIL); /* dependency on parameter default expressions */ if (parameterDefaults) - recordDependencyOnExpr(&myself, (Node *) parameterDefaults, - NIL, DEPENDENCY_NORMAL); + collectDependenciesOfExpr(addrs, (Node *) parameterDefaults, NIL); + + /* + * Now that we have all the normal dependencies, thumb through them and + * warn if any are to temporary objects. This informs the user if their + * supposedly non-temp function will silently go away at session exit, due + * to a dependency on a temp object. However, do not complain when a + * function created in our own pg_temp namespace refers to other objects + * in that namespace, since then they'll have similar lifespans anyway. + */ + if (find_temp_object(addrs, isTempNamespace(procNamespace), &temp_object)) + ereport(NOTICE, + (errmsg("function \"%s\" will be effectively temporary", + procedureName), + errdetail("It depends on temporary %s.", + getObjectDescription(&temp_object, false)))); + + /* + * Now record all normal dependencies at once. This will also remove any + * duplicates in the list. (Role and extension dependencies are handled + * separately below. Role dependencies would have to be separate anyway + * since they are shared dependencies. An extension dependency could be + * folded into the addrs list, but pg_depend.c doesn't make that easy, and + * it won't duplicate anything we've collected so far anyway.) + */ + record_object_address_dependencies(&myself, addrs, DEPENDENCY_NORMAL); + + free_object_addresses(addrs); /* dependency on owner */ if (!is_update) @@ -1212,6 +1237,6 @@ oid_array_to_list(Datum datum) deconstruct_array_builtin(array, OIDOID, &values, NULL, &nelems); for (i = 0; i < nelems; i++) - result = lappend_oid(result, values[i]); + result = lappend_oid(result, DatumGetObjectId(values[i])); return result; } diff --git a/src/backend/catalog/pg_publication.c b/src/backend/catalog/pg_publication.c index d6f94db5d999b..7aa3f17992405 100644 --- a/src/backend/catalog/pg_publication.c +++ b/src/backend/catalog/pg_publication.c @@ -115,8 +115,10 @@ check_publication_add_schema(Oid schemaid) * Returns if relation represented by oid and Form_pg_class entry * is publishable. * - * Does same checks as check_publication_add_relation() above, but does not - * need relation to be opened and also does not throw errors. + * Does the same checks as check_publication_add_relation() above except for + * RELKIND_SEQUENCE, but does not need the relation to be opened and does + * not throw errors. The additional relkind check here is what supports + * ALL SEQUENCES publications. * * XXX This also excludes all tables with relid < FirstNormalObjectId, * ie all tables created during initdb.
This mainly affects the preinstalled @@ -134,7 +136,8 @@ static bool is_publishable_class(Oid relid, Form_pg_class reltuple) { return (reltuple->relkind == RELKIND_RELATION || - reltuple->relkind == RELKIND_PARTITIONED_TABLE) && + reltuple->relkind == RELKIND_PARTITIONED_TABLE || + reltuple->relkind == RELKIND_SEQUENCE) && !IsCatalogRelationOid(relid) && reltuple->relpersistence == RELPERSISTENCE_PERMANENT && relid >= FirstNormalObjectId; @@ -773,8 +776,8 @@ GetRelationPublications(Oid relid) /* * Gets list of relation oids for a publication. * - * This should only be used FOR TABLE publications, the FOR ALL TABLES - * should use GetAllTablesPublicationRelations(). + * This should only be used for FOR TABLE publications; FOR ALL TABLES/SEQUENCES + * publications should use GetAllPublicationRelations(). */ List * GetPublicationRelations(Oid pubid, PublicationPartOpt pub_partopt) @@ -785,7 +788,7 @@ GetPublicationRelations(Oid pubid, PublicationPartOpt pub_partopt) SysScanDesc scan; HeapTuple tup; - /* Find all publications associated with the relation. */ + /* Find all relations associated with the publication. */ pubrelsrel = table_open(PublicationRelRelationId, AccessShareLock); ScanKeyInit(&scankey, @@ -854,14 +857,16 @@ GetAllTablesPublications(void) } /* - * Gets list of all relation published by FOR ALL TABLES publication(s). + * Gets list of all relations published by FOR ALL TABLES/SEQUENCES + * publication(s). * * If the publication publishes partition changes via their respective root * partitioned tables, we must exclude partitions in favor of including the - * root partitioned tables. + * root partitioned tables. This is not applicable to FOR ALL SEQUENCES + * publications. */ List * -GetAllTablesPublicationRelations(bool pubviaroot) +GetAllPublicationRelations(char relkind, bool pubviaroot) { Relation classRel; ScanKeyData key[1]; @@ -869,12 +874,14 @@ HeapTuple tuple; List *result = NIL; + Assert(!(relkind == RELKIND_SEQUENCE && pubviaroot)); + classRel = table_open(RelationRelationId, AccessShareLock); ScanKeyInit(&key[0], Anum_pg_class_relkind, BTEqualStrategyNumber, F_CHAREQ, - CharGetDatum(RELKIND_RELATION)); + CharGetDatum(relkind)); scan = table_beginscan_catalog(classRel, 1, key); @@ -1001,7 +1008,7 @@ GetSchemaPublicationRelations(Oid schemaid, PublicationPartOpt pub_partopt) ScanKeyInit(&key[0], Anum_pg_class_relnamespace, BTEqualStrategyNumber, F_OIDEQ, - schemaid); + ObjectIdGetDatum(schemaid)); /* get all the relations present in the specified schema */ scan = table_beginscan_catalog(classRel, 1, key); @@ -1079,10 +1086,11 @@ GetPublication(Oid pubid) pubform = (Form_pg_publication) GETSTRUCT(tup); - pub = (Publication *) palloc(sizeof(Publication)); + pub = palloc_object(Publication); pub->oid = pubid; pub->name = pstrdup(NameStr(pubform->pubname)); pub->alltables = pubform->puballtables; + pub->allsequences = pubform->puballsequences; pub->pubactions.pubinsert = pubform->pubinsert; pub->pubactions.pubupdate = pubform->pubupdate; pub->pubactions.pubdelete = pubform->pubdelete; @@ -1160,7 +1168,8 @@ pg_get_publication_tables(PG_FUNCTION_ARGS) * those. Otherwise, get the partitioned table itself.
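 * (For instance, with publish_via_partition_root = true, changes to a leaf
 * partition are published using the identity and schema of its root
 * partitioned table, so only the root belongs in this list.)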
*/ if (pub_elem->alltables) - pub_elem_tables = GetAllTablesPublicationRelations(pub_elem->pubviaroot); + pub_elem_tables = GetAllPublicationRelations(RELKIND_RELATION, + pub_elem->pubviaroot); else { List *relids, @@ -1187,7 +1196,7 @@ pg_get_publication_tables(PG_FUNCTION_ARGS) */ foreach(lc, pub_elem_tables) { - published_rel *table_info = (published_rel *) palloc(sizeof(published_rel)); + published_rel *table_info = palloc_object(published_rel); table_info->relid = lfirst_oid(lc); table_info->pubid = pub_elem->oid; @@ -1290,7 +1299,7 @@ pg_get_publication_tables(PG_FUNCTION_ARGS) TupleDesc desc = RelationGetDescr(rel); int i; - attnums = (int16 *) palloc(desc->natts * sizeof(int16)); + attnums = palloc_array(int16, desc->natts); for (i = 0; i < desc->natts; i++) { @@ -1332,3 +1341,49 @@ pg_get_publication_tables(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } + +/* + * Returns Oids of sequences in a publication. + */ +Datum +pg_get_publication_sequences(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + List *sequences = NIL; + + /* stuff done only on the first call of the function */ + if (SRF_IS_FIRSTCALL()) + { + char *pubname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + Publication *publication; + MemoryContext oldcontext; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* switch to memory context appropriate for multiple function calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + publication = GetPublicationByName(pubname, false); + + if (publication->allsequences) + sequences = GetAllPublicationRelations(RELKIND_SEQUENCE, false); + + funcctx->user_fctx = sequences; + + MemoryContextSwitchTo(oldcontext); + } + + /* stuff done on every call of the function */ + funcctx = SRF_PERCALL_SETUP(); + sequences = (List *) funcctx->user_fctx; + + if (funcctx->call_cntr < list_length(sequences)) + { + Oid relid = list_nth_oid(sequences, funcctx->call_cntr); + + SRF_RETURN_NEXT(funcctx, ObjectIdGetDatum(relid)); + } + + SRF_RETURN_DONE(funcctx); +} diff --git a/src/backend/catalog/pg_shdepend.c b/src/backend/catalog/pg_shdepend.c index 536191284e803..fe4e453667621 100644 --- a/src/backend/catalog/pg_shdepend.c +++ b/src/backend/catalog/pg_shdepend.c @@ -47,7 +47,6 @@ #include "catalog/pg_type.h" #include "catalog/pg_user_mapping.h" #include "commands/alter.h" -#include "commands/dbcommands.h" #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/policy.h" @@ -61,6 +60,7 @@ #include "storage/lmgr.h" #include "utils/acl.h" #include "utils/fmgroids.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" @@ -791,7 +791,7 @@ checkSharedDependencies(Oid classId, Oid objectId, } if (!stored) { - dep = (remoteDep *) palloc(sizeof(remoteDep)); + dep = palloc_object(remoteDep); dep->dbOid = sdepForm->dbid; dep->count = 1; remDeps = lappend(remDeps, dep); @@ -913,7 +913,7 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId) * know that they will be used. 
*/ max_slots = MAX_CATALOG_MULTI_INSERT_BYTES / sizeof(FormData_pg_shdepend); - slot = palloc(sizeof(TupleTableSlot *) * max_slots); + slot = palloc_array(TupleTableSlot *, max_slots); indstate = CatalogOpenIndexes(sdepRel); @@ -956,12 +956,12 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId) shdep = (Form_pg_shdepend) GETSTRUCT(tup); slot[slot_stored_count]->tts_values[Anum_pg_shdepend_dbid - 1] = ObjectIdGetDatum(newDbId); - slot[slot_stored_count]->tts_values[Anum_pg_shdepend_classid - 1] = shdep->classid; - slot[slot_stored_count]->tts_values[Anum_pg_shdepend_objid - 1] = shdep->objid; - slot[slot_stored_count]->tts_values[Anum_pg_shdepend_objsubid - 1] = shdep->objsubid; - slot[slot_stored_count]->tts_values[Anum_pg_shdepend_refclassid - 1] = shdep->refclassid; - slot[slot_stored_count]->tts_values[Anum_pg_shdepend_refobjid - 1] = shdep->refobjid; - slot[slot_stored_count]->tts_values[Anum_pg_shdepend_deptype - 1] = shdep->deptype; + slot[slot_stored_count]->tts_values[Anum_pg_shdepend_classid - 1] = ObjectIdGetDatum(shdep->classid); + slot[slot_stored_count]->tts_values[Anum_pg_shdepend_objid - 1] = ObjectIdGetDatum(shdep->objid); + slot[slot_stored_count]->tts_values[Anum_pg_shdepend_objsubid - 1] = Int32GetDatum(shdep->objsubid); + slot[slot_stored_count]->tts_values[Anum_pg_shdepend_refclassid - 1] = ObjectIdGetDatum(shdep->refclassid); + slot[slot_stored_count]->tts_values[Anum_pg_shdepend_refobjid - 1] = ObjectIdGetDatum(shdep->refobjid); + slot[slot_stored_count]->tts_values[Anum_pg_shdepend_deptype - 1] = CharGetDatum(shdep->deptype); ExecStoreVirtualTuple(slot[slot_stored_count]); slot_stored_count++; diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c index 1395032413e3d..ad6fbd77ffdd3 100644 --- a/src/backend/catalog/pg_subscription.c +++ b/src/backend/catalog/pg_subscription.c @@ -89,7 +89,7 @@ GetSubscription(Oid subid, bool missing_ok) subform = (Form_pg_subscription) GETSTRUCT(tup); - sub = (Subscription *) palloc(sizeof(Subscription)); + sub = palloc_object(Subscription); sub->oid = subid; sub->dbid = subform->subdbid; sub->skiplsn = subform->subskiplsn; @@ -103,6 +103,9 @@ GetSubscription(Oid subid, bool missing_ok) sub->passwordrequired = subform->subpasswordrequired; sub->runasowner = subform->subrunasowner; sub->failover = subform->subfailover; + sub->retaindeadtuples = subform->subretaindeadtuples; + sub->maxretention = subform->submaxretention; + sub->retentionactive = subform->subretentionactive; /* Get conninfo */ datum = SysCacheGetAttrNotNull(SUBSCRIPTIONOID, @@ -281,7 +284,7 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state, ObjectIdGetDatum(relid), ObjectIdGetDatum(subid)); if (HeapTupleIsValid(tup)) - elog(ERROR, "subscription table %u in subscription %u already exists", + elog(ERROR, "subscription relation %u in subscription %u already exists", relid, subid); /* Form the tuple. 
*/ @@ -290,7 +293,7 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state, values[Anum_pg_subscription_rel_srsubid - 1] = ObjectIdGetDatum(subid); values[Anum_pg_subscription_rel_srrelid - 1] = ObjectIdGetDatum(relid); values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); - if (sublsn != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(sublsn)) values[Anum_pg_subscription_rel_srsublsn - 1] = LSNGetDatum(sublsn); else nulls[Anum_pg_subscription_rel_srsublsn - 1] = true; @@ -319,7 +322,7 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state, */ void UpdateSubscriptionRelState(Oid subid, Oid relid, char state, - XLogRecPtr sublsn) + XLogRecPtr sublsn, bool already_locked) { Relation rel; HeapTuple tup; @@ -327,16 +330,31 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state, Datum values[Natts_pg_subscription_rel]; bool replaces[Natts_pg_subscription_rel]; - LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + if (already_locked) + { +#ifdef USE_ASSERT_CHECKING + LOCKTAG tag; - rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId, + RowExclusiveLock, true)); + SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0); + Assert(LockHeldByMe(&tag, AccessShareLock, true)); +#endif + + rel = table_open(SubscriptionRelRelationId, NoLock); + } + else + { + LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + } /* Try finding existing mapping. */ tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP, ObjectIdGetDatum(relid), ObjectIdGetDatum(subid)); if (!HeapTupleIsValid(tup)) - elog(ERROR, "subscription table %u in subscription %u does not exist", + elog(ERROR, "subscription relation %u in subscription %u does not exist", relid, subid); /* Update the tuple. */ @@ -348,7 +366,7 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state, values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); replaces[Anum_pg_subscription_rel_srsublsn - 1] = true; - if (sublsn != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(sublsn)) values[Anum_pg_subscription_rel_srsublsn - 1] = LSNGetDatum(sublsn); else nulls[Anum_pg_subscription_rel_srsublsn - 1] = true; @@ -460,9 +478,13 @@ RemoveSubscriptionRel(Oid subid, Oid relid) * synchronization is in progress unless the caller updates the * corresponding subscription as well. This is to ensure that we don't * leave tablesync slots or origins in the system when the - * corresponding table is dropped. + * corresponding table is dropped. For sequences, however, it's ok to + * drop them since no separate slots or origins are created during + * synchronization. */ - if (!OidIsValid(subid) && subrel->srsubstate != SUBREL_STATE_READY) + if (!OidIsValid(subid) && + subrel->srsubstate != SUBREL_STATE_READY && + get_rel_relkind(subrel->srrelid) != RELKIND_SEQUENCE) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -488,18 +510,19 @@ RemoveSubscriptionRel(Oid subid, Oid relid) } /* - * Does the subscription have any relations? + * Does the subscription have any tables? * * Use this function only to know true/false, and when you have no need for the * List returned by GetSubscriptionRelations. 
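 *
 * A typical use is a cheap existence test, e.g. (illustrative only):
 *
 *		if (HasSubscriptionTables(subid))
 *			... the subscription contains at least one table ...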
*/ bool -HasSubscriptionRelations(Oid subid) +HasSubscriptionTables(Oid subid) { Relation rel; ScanKeyData skey[1]; SysScanDesc scan; - bool has_subrels; + HeapTuple tup; + bool has_subtables = false; rel = table_open(SubscriptionRelRelationId, AccessShareLock); @@ -511,14 +534,27 @@ scan = systable_beginscan(rel, InvalidOid, false, NULL, 1, skey); - /* If even a single tuple exists then the subscription has tables. */ - has_subrels = HeapTupleIsValid(systable_getnext(scan)); + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_subscription_rel subrel; + char relkind; + + subrel = (Form_pg_subscription_rel) GETSTRUCT(tup); + relkind = get_rel_relkind(subrel->srrelid); + + if (relkind == RELKIND_RELATION || + relkind == RELKIND_PARTITIONED_TABLE) + { + has_subtables = true; + break; + } + } /* Cleanup */ systable_endscan(scan); table_close(rel, AccessShareLock); - return has_subrels; + return has_subtables; } /* @@ -529,7 +565,8 @@ HasSubscriptionRelations(Oid subid) * returned list is palloc'ed in the current memory context. */ List * -GetSubscriptionRelations(Oid subid, bool not_ready) +GetSubscriptionRelations(Oid subid, bool tables, bool sequences, + bool not_ready) { List *res = NIL; Relation rel; @@ -538,6 +575,9 @@ ScanKeyData skey[2]; SysScanDesc scan; + /* One or both of 'tables' and 'sequences' must be true. */ + Assert(tables || sequences); + rel = table_open(SubscriptionRelRelationId, AccessShareLock); ScanKeyInit(&skey[nkeys++], @@ -560,10 +600,25 @@ SubscriptionRelState *relstate; Datum d; bool isnull; + char relkind; subrel = (Form_pg_subscription_rel) GETSTRUCT(tup); - relstate = (SubscriptionRelState *) palloc(sizeof(SubscriptionRelState)); + /* Relation is either a sequence or a table */ + relkind = get_rel_relkind(subrel->srrelid); + Assert(relkind == RELKIND_SEQUENCE || relkind == RELKIND_RELATION || + relkind == RELKIND_PARTITIONED_TABLE); + + /* Skip sequences if they were not requested */ + if ((relkind == RELKIND_SEQUENCE) && !sequences) + continue; + + /* Skip tables if they were not requested */ + if ((relkind == RELKIND_RELATION || + relkind == RELKIND_PARTITIONED_TABLE) && !tables) + continue; + + relstate = palloc_object(SubscriptionRelState); relstate->relid = subrel->srrelid; relstate->state = subrel->srsubstate; d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, @@ -582,3 +637,42 @@ return res; } + +/* + * Update the dead tuple retention status for the given subscription. + */ +void +UpdateDeadTupleRetentionStatus(Oid subid, bool active) +{ + Relation rel; + bool nulls[Natts_pg_subscription]; + bool replaces[Natts_pg_subscription]; + Datum values[Natts_pg_subscription]; + HeapTuple tup; + + /* Look up the subscription in the catalog */ + rel = table_open(SubscriptionRelationId, RowExclusiveLock); + tup = SearchSysCacheCopy1(SUBSCRIPTIONOID, ObjectIdGetDatum(subid)); + + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for subscription %u", subid); + + LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + + /* Form a new tuple. */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replaces, false, sizeof(replaces)); + + /* Set the new retention status.
*/ + values[Anum_pg_subscription_subretentionactive - 1] = active; + replaces[Anum_pg_subscription_subretentionactive - 1] = true; + + /* Update the catalog */ + tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, + replaces); + CatalogTupleUpdate(rel, &tup->t_self, tup); + heap_freetuple(tup); + + table_close(rel, NoLock); +} diff --git a/src/backend/catalog/pg_tablespace.c b/src/backend/catalog/pg_tablespace.c new file mode 100644 index 0000000000000..6aca24c231e17 --- /dev/null +++ b/src/backend/catalog/pg_tablespace.c @@ -0,0 +1,90 @@ +/*------------------------------------------------------------------------- + * + * pg_tablespace.c + * routines to support manipulation of the pg_tablespace relation + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/catalog/pg_tablespace.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#include "catalog/pg_tablespace.h" +#include "commands/tablespace.h" +#include "miscadmin.h" + + +/* + * get_tablespace_location + * Get a tablespace's location as a C-string, by its OID + */ +char * +get_tablespace_location(Oid tablespaceOid) +{ + char sourcepath[MAXPGPATH]; + char targetpath[MAXPGPATH]; + int rllen; + struct stat st; + + /* + * It's useful to apply this to pg_class.reltablespace, wherein zero means + * "the database's default tablespace". So, rather than throwing an error + * for zero, we choose to assume that's what is meant. + */ + if (tablespaceOid == InvalidOid) + tablespaceOid = MyDatabaseTableSpace; + + /* + * Return empty string for the cluster's default tablespaces + */ + if (tablespaceOid == DEFAULTTABLESPACE_OID || + tablespaceOid == GLOBALTABLESPACE_OID) + return pstrdup(""); + + /* + * Find the location of the tablespace by reading the symbolic link that + * is in pg_tblspc/. + */ + snprintf(sourcepath, sizeof(sourcepath), "%s/%u", PG_TBLSPC_DIR, tablespaceOid); + + /* + * Before reading the link, check if the source path is a link or a + * junction point. Note that a directory is possible for a tablespace + * created with allow_in_place_tablespaces enabled. If a directory is + * found, a relative path to the data directory is returned. + */ + if (lstat(sourcepath, &st) < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + sourcepath)); + + if (!S_ISLNK(st.st_mode)) + return pstrdup(sourcepath); + + /* + * In presence of a link or a junction point, return the path pointed to. 
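For reference, the same resolution is visible from SQL through the existing pg_tablespace_location() function, presumably the caller this helper was factored out for: an empty string comes back for pg_default and pg_global, and a relative pg_tblspc path for in-place tablespaces:

    SELECT spcname, pg_tablespace_location(oid) AS location
      FROM pg_tablespace;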
+ */ + rllen = readlink(sourcepath, targetpath, sizeof(targetpath)); + if (rllen < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not read symbolic link \"%s\": %m", + sourcepath)); + if (rllen >= sizeof(targetpath)) + ereport(ERROR, + errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("symbolic link \"%s\" target is too long", + sourcepath)); + targetpath[rllen] = '\0'; + + return pstrdup(targetpath); +} diff --git a/src/backend/catalog/pg_type.c b/src/backend/catalog/pg_type.c index b36f81afb9d3f..4a69a81b9fbaf 100644 --- a/src/backend/catalog/pg_type.c +++ b/src/backend/catalog/pg_type.c @@ -66,7 +66,7 @@ TypeShellMake(const char *typeName, Oid typeNamespace, Oid ownerId) NameData name; ObjectAddress address; - Assert(PointerIsValid(typeName)); + Assert(typeName); /* * open pg_type @@ -80,7 +80,7 @@ TypeShellMake(const char *typeName, Oid typeNamespace, Oid ownerId) for (i = 0; i < Natts_pg_type; ++i) { nulls[i] = false; - values[i] = (Datum) NULL; /* redundant, but safe */ + values[i] = (Datum) 0; /* redundant, but safe */ } /* @@ -285,8 +285,7 @@ TypeCreate(Oid newTypeOid, errmsg("alignment \"%c\" is invalid for passed-by-value type of size %d", alignment, internalSize))); } -#if SIZEOF_DATUM == 8 - else if (internalSize == (int16) sizeof(Datum)) + else if (internalSize == (int16) sizeof(int64)) { if (alignment != TYPALIGN_DOUBLE) ereport(ERROR, @@ -294,7 +293,6 @@ TypeCreate(Oid newTypeOid, errmsg("alignment \"%c\" is invalid for passed-by-value type of size %d", alignment, internalSize))); } -#endif else ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -950,7 +948,7 @@ char * makeMultirangeTypeName(const char *rangeTypeName, Oid typeNamespace) { char *buf; - char *rangestr; + const char *rangestr; /* * If the range type name contains "range" then change that to diff --git a/src/backend/catalog/sql_features.txt b/src/backend/catalog/sql_features.txt index ebe85337c2877..3a8ad201607f6 100644 --- a/src/backend/catalog/sql_features.txt +++ b/src/backend/catalog/sql_features.txt @@ -518,7 +518,7 @@ T612 Advanced OLAP operations YES T613 Sampling YES T614 NTILE function YES T615 LEAD and LAG functions YES -T616 Null treatment option for LEAD and LAG functions NO +T616 Null treatment option for LEAD and LAG functions YES T617 FIRST_VALUE and LAST_VALUE functions YES T618 NTH_VALUE function NO function exists, but some options missing T619 Nested window functions NO diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 227df90f89c97..d06c1c68174ef 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -546,7 +546,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("invalid page in block %u of relation %s", + errmsg("invalid page in block %u of relation \"%s\"", blkno, relpath.str))); } @@ -586,7 +586,7 @@ RelFileLocatorSkippingWAL(RelFileLocator rlocator) Size EstimatePendingSyncsSpace(void) { - long entries; + int64 entries; entries = pendingSyncHash ? 
hash_get_num_entries(pendingSyncHash) : 0; return mul_size(1 + entries, sizeof(RelFileLocator)); @@ -707,12 +707,12 @@ smgrDoPendingDeletes(bool isCommit) if (maxrels == 0) { maxrels = 8; - srels = palloc(sizeof(SMgrRelation) * maxrels); + srels = palloc_array(SMgrRelation, maxrels); } else if (maxrels <= nrels) { maxrels *= 2; - srels = repalloc(srels, sizeof(SMgrRelation) * maxrels); + srels = repalloc_array(srels, SMgrRelation, maxrels); } srels[nrels++] = srel; @@ -829,12 +829,12 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) if (maxrels == 0) { maxrels = 8; - srels = palloc(sizeof(SMgrRelation) * maxrels); + srels = palloc_array(SMgrRelation, maxrels); } else if (maxrels <= nrels) { maxrels *= 2; - srels = repalloc(srels, sizeof(SMgrRelation) * maxrels); + srels = repalloc_array(srels, SMgrRelation, maxrels); } srels[nrels++] = srel; @@ -909,7 +909,7 @@ smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr) *ptr = NULL; return 0; } - rptr = (RelFileLocator *) palloc(nrels * sizeof(RelFileLocator)); + rptr = palloc_array(RelFileLocator, nrels); *ptr = rptr; for (pending = pendingDeletes; pending != NULL; pending = pending->next) { diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql index 566f308e4439d..2d946d6d9e9bc 100644 --- a/src/backend/catalog/system_functions.sql +++ b/src/backend/catalog/system_functions.sql @@ -650,6 +650,13 @@ LANGUAGE INTERNAL CALLED ON NULL INPUT VOLATILE PARALLEL SAFE AS 'pg_stat_reset_slru'; +CREATE OR REPLACE FUNCTION + pg_replication_origin_session_setup(node_name text, pid integer DEFAULT 0) +RETURNS void +LANGUAGE INTERNAL +STRICT VOLATILE PARALLEL UNSAFE +AS 'pg_replication_origin_session_setup'; + -- -- The default permissions for functions mean that anyone can execute them. 
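A usage sketch for the redefinition above: the new pid argument defaults to 0, so existing single-argument callers keep working, while a cooperating process can name the PID that is allowed to share the origin (origin name and PID below are illustrative):

    SELECT pg_replication_origin_create('test_origin');
    -- unchanged single-argument form:
    SELECT pg_replication_origin_session_setup('test_origin');
    -- or the new two-argument form, e.g. from a worker cooperating with PID 12345:
    SELECT pg_replication_origin_session_setup('test_origin', 12345);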
-- A number of functions shouldn't be executable by just anyone, but rather @@ -751,7 +758,7 @@ REVOKE EXECUTE ON FUNCTION pg_replication_origin_session_progress(boolean) FROM REVOKE EXECUTE ON FUNCTION pg_replication_origin_session_reset() FROM public; -REVOKE EXECUTE ON FUNCTION pg_replication_origin_session_setup(text) FROM public; +REVOKE EXECUTE ON FUNCTION pg_replication_origin_session_setup(text, integer) FROM public; REVOKE EXECUTE ON FUNCTION pg_replication_origin_xact_reset() FROM public; diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 08f780a2e6382..0a0f95f6bb9fe 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -394,6 +394,16 @@ CREATE VIEW pg_publication_tables AS pg_class C JOIN pg_namespace N ON (N.oid = C.relnamespace) WHERE C.oid = GPT.relid; +CREATE VIEW pg_publication_sequences AS + SELECT + P.pubname AS pubname, + N.nspname AS schemaname, + C.relname AS sequencename + FROM pg_publication P, + LATERAL pg_get_publication_sequences(P.pubname) GPS, + pg_class C JOIN pg_namespace N ON (N.oid = C.relnamespace) + WHERE C.oid = GPS.relid; + CREATE VIEW pg_locks AS SELECT * FROM pg_lock_status() AS L; @@ -666,6 +676,14 @@ GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats; +CREATE VIEW pg_dsm_registry_allocations AS + SELECT * FROM pg_get_dsm_registry_allocations(); + +REVOKE ALL ON pg_dsm_registry_allocations FROM PUBLIC; +GRANT SELECT ON pg_dsm_registry_allocations TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_dsm_registry_allocations() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_dsm_registry_allocations() TO pg_read_all_stats; + CREATE VIEW pg_backend_memory_contexts AS SELECT * FROM pg_get_backend_memory_contexts(); @@ -708,7 +726,8 @@ CREATE VIEW pg_stat_all_tables AS pg_stat_get_total_vacuum_time(C.oid) AS total_vacuum_time, pg_stat_get_total_autovacuum_time(C.oid) AS total_autovacuum_time, pg_stat_get_total_analyze_time(C.oid) AS total_analyze_time, - pg_stat_get_total_autoanalyze_time(C.oid) AS total_autoanalyze_time + pg_stat_get_total_autoanalyze_time(C.oid) AS total_autoanalyze_time, + pg_stat_get_stat_reset_time(C.oid) AS stats_reset FROM pg_class C LEFT JOIN pg_index I ON C.oid = I.indrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) @@ -770,7 +789,8 @@ CREATE VIEW pg_statio_all_tables AS pg_stat_get_blocks_hit(T.oid) AS toast_blks_read, pg_stat_get_blocks_hit(T.oid) AS toast_blks_hit, X.idx_blks_read AS tidx_blks_read, - X.idx_blks_hit AS tidx_blks_hit + X.idx_blks_hit AS tidx_blks_hit, + pg_stat_get_stat_reset_time(C.oid) AS stats_reset FROM pg_class C LEFT JOIN pg_class T ON C.reltoastrelid = T.oid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) @@ -810,7 +830,8 @@ CREATE VIEW pg_stat_all_indexes AS pg_stat_get_numscans(I.oid) AS idx_scan, pg_stat_get_lastscan(I.oid) AS last_idx_scan, pg_stat_get_tuples_returned(I.oid) AS idx_tup_read, - pg_stat_get_tuples_fetched(I.oid) AS idx_tup_fetch + pg_stat_get_tuples_fetched(I.oid) AS idx_tup_fetch, + pg_stat_get_stat_reset_time(I.oid) AS stats_reset FROM pg_class C JOIN pg_index X ON C.oid = X.indrelid JOIN pg_class I ON I.oid = X.indexrelid @@ -836,7 +857,8 @@ CREATE VIEW pg_statio_all_indexes AS I.relname AS indexrelname, pg_stat_get_blocks_fetched(I.oid) - pg_stat_get_blocks_hit(I.oid) AS idx_blks_read, - pg_stat_get_blocks_hit(I.oid) AS 
idx_blks_hit + pg_stat_get_blocks_hit(I.oid) AS idx_blks_hit, + pg_stat_get_stat_reset_time(I.oid) AS stats_reset FROM pg_class C JOIN pg_index X ON C.oid = X.indrelid JOIN pg_class I ON I.oid = X.indexrelid @@ -895,7 +917,7 @@ CREATE VIEW pg_stat_activity AS S.wait_event, S.state, S.backend_xid, - s.backend_xmin, + S.backend_xmin, S.query_id, S.query, S.backend_type @@ -1038,7 +1060,8 @@ CREATE VIEW pg_replication_slots AS L.conflicting, L.invalidation_reason, L.failover, - L.synced + L.synced, + L.slotsync_skip_reason FROM pg_get_replication_slots() AS L LEFT JOIN pg_database D ON (L.datoid = D.oid); @@ -1051,8 +1074,11 @@ CREATE VIEW pg_stat_replication_slots AS s.stream_txns, s.stream_count, s.stream_bytes, + s.mem_exceeded_count, s.total_txns, s.total_bytes, + s.slotsync_skip_count, + s.slotsync_last_skip, s.stats_reset FROM pg_replication_slots as r, LATERAL pg_stat_get_replication_slot(slot_name) as s @@ -1119,7 +1145,8 @@ CREATE VIEW pg_stat_user_functions AS P.proname AS funcname, pg_stat_get_function_calls(P.oid) AS calls, pg_stat_get_function_total_time(P.oid) AS total_time, - pg_stat_get_function_self_time(P.oid) AS self_time + pg_stat_get_function_self_time(P.oid) AS self_time, + pg_stat_get_function_stat_reset_time(P.oid) AS stats_reset FROM pg_proc P LEFT JOIN pg_namespace N ON (N.oid = P.pronamespace) WHERE P.prolang != 12 -- fast check to eliminate built-in functions AND pg_stat_get_function_calls(P.oid) IS NOT NULL; @@ -1197,6 +1224,7 @@ CREATE VIEW pg_stat_wal AS w.wal_records, w.wal_fpi, w.wal_bytes, + w.wal_fpi_bytes, w.wal_buffers_full, w.stats_reset FROM pg_stat_get_wal() w; @@ -1219,7 +1247,10 @@ CREATE VIEW pg_stat_progress_analyze AS S.param6 AS child_tables_total, S.param7 AS child_tables_done, CAST(S.param8 AS oid) AS current_child_table_relid, - S.param9 / 1000000::double precision AS delay_time + S.param9 / 1000000::double precision AS delay_time, + CASE S.param10 WHEN 1 THEN 'manual' + WHEN 2 THEN 'autovacuum' + ELSE NULL END AS started_by FROM pg_stat_get_progress_info('ANALYZE') AS S LEFT JOIN pg_database D ON S.datid = D.oid; @@ -1240,7 +1271,15 @@ CREATE VIEW pg_stat_progress_vacuum AS S.param6 AS max_dead_tuple_bytes, S.param7 AS dead_tuple_bytes, S.param8 AS num_dead_item_ids, S.param9 AS indexes_total, S.param10 AS indexes_processed, - S.param11 / 1000000::double precision AS delay_time + S.param11 / 1000000::double precision AS delay_time, + CASE S.param12 WHEN 1 THEN 'normal' + WHEN 2 THEN 'aggressive' + WHEN 3 THEN 'failsafe' + ELSE NULL END AS mode, + CASE S.param13 WHEN 1 THEN 'manual' + WHEN 2 THEN 'autovacuum' + WHEN 3 THEN 'autovacuum_wraparound' + ELSE NULL END AS started_by FROM pg_stat_get_progress_info('VACUUM') AS S LEFT JOIN pg_database D ON S.datid = D.oid; @@ -1319,7 +1358,10 @@ CREATE VIEW pg_stat_progress_basebackup AS CASE S.param2 WHEN -1 THEN NULL ELSE S.param2 END AS backup_total, S.param3 AS backup_streamed, S.param4 AS tablespaces_total, - S.param5 AS tablespaces_streamed + S.param5 AS tablespaces_streamed, + CASE S.param6 WHEN 1 THEN 'full' + WHEN 2 THEN 'incremental' + END AS backup_type FROM pg_stat_get_progress_info('BASEBACKUP') AS S; @@ -1378,6 +1420,7 @@ REVOKE ALL ON pg_subscription FROM public; GRANT SELECT (oid, subdbid, subskiplsn, subname, subowner, subenabled, subbinary, substream, subtwophasestate, subdisableonerr, subpasswordrequired, subrunasowner, subfailover, + subretaindeadtuples, submaxretention, subretentionactive, subslotname, subsynccommit, subpublications, suborigin) ON pg_subscription TO public; @@ 
-1386,10 +1429,12 @@ CREATE VIEW pg_stat_subscription_stats AS ss.subid, s.subname, ss.apply_error_count, - ss.sync_error_count, + ss.sync_seq_error_count, + ss.sync_table_error_count, ss.confl_insert_exists, ss.confl_update_origin_differs, ss.confl_update_exists, + ss.confl_update_deleted, ss.confl_update_missing, ss.confl_delete_origin_differs, ss.confl_delete_missing, diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile index cb2fbdc7c6018..64cb6278409ff 100644 --- a/src/backend/commands/Makefile +++ b/src/backend/commands/Makefile @@ -53,6 +53,7 @@ OBJS = \ schemacmds.o \ seclabel.o \ sequence.o \ + sequence_xlog.o \ statscmds.o \ subscriptioncmds.o \ tablecmds.o \ @@ -64,6 +65,7 @@ OBJS = \ vacuum.o \ vacuumparallel.o \ variable.o \ - view.o + view.o \ + wait.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/commands/alter.c b/src/backend/commands/alter.c index c801c869c1cfc..cb75e11fced62 100644 --- a/src/backend/commands/alter.c +++ b/src/backend/commands/alter.c @@ -220,7 +220,7 @@ AlterObjectRename_internal(Relation rel, Oid objectId, const char *new_name) Assert(!isnull); ownerId = DatumGetObjectId(datum); - if (!has_privs_of_role(GetUserId(), DatumGetObjectId(ownerId))) + if (!has_privs_of_role(GetUserId(), ownerId)) aclcheck_error(ACLCHECK_NOT_OWNER, get_object_type(classId, objectId), old_name); diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 4fffb76e55735..5e2a7a8234ec8 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -29,7 +29,6 @@ #include "catalog/index.h" #include "catalog/indexing.h" #include "catalog/pg_inherits.h" -#include "commands/dbcommands.h" #include "commands/progress.h" #include "commands/tablecmds.h" #include "commands/vacuum.h" @@ -76,7 +75,7 @@ static BufferAccessStrategy vac_strategy; static void do_analyze_rel(Relation onerel, - VacuumParams *params, List *va_cols, + const VacuumParams params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, bool inh, bool in_outer_xact, int elevel); static void compute_index_stats(Relation onerel, double totalrows, @@ -107,7 +106,7 @@ static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); */ void analyze_rel(Oid relid, RangeVar *relation, - VacuumParams *params, List *va_cols, bool in_outer_xact, + const VacuumParams params, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy) { Relation onerel; @@ -116,7 +115,7 @@ analyze_rel(Oid relid, RangeVar *relation, BlockNumber relpages = 0; /* Select logging level */ - if (params->options & VACOPT_VERBOSE) + if (params.options & VACOPT_VERBOSE) elevel = INFO; else elevel = DEBUG2; @@ -138,8 +137,8 @@ analyze_rel(Oid relid, RangeVar *relation, * * Make sure to generate only logs for ANALYZE in this case. 
*/ - onerel = vacuum_open_relation(relid, relation, params->options & ~(VACOPT_VACUUM), - params->log_min_duration >= 0, + onerel = vacuum_open_relation(relid, relation, params.options & ~(VACOPT_VACUUM), + params.log_analyze_min_duration >= 0, ShareUpdateExclusiveLock); /* leave if relation could not be opened or locked */ @@ -155,7 +154,7 @@ analyze_rel(Oid relid, RangeVar *relation, */ if (!vacuum_is_permitted_for_relation(RelationGetRelid(onerel), onerel->rd_rel, - params->options & ~VACOPT_VACUUM)) + params.options & ~VACOPT_VACUUM)) { relation_close(onerel, ShareUpdateExclusiveLock); return; @@ -227,7 +226,7 @@ analyze_rel(Oid relid, RangeVar *relation, else { /* No need for a WARNING if we already complained during VACUUM */ - if (!(params->options & VACOPT_VACUUM)) + if (!(params.options & VACOPT_VACUUM)) ereport(WARNING, (errmsg("skipping \"%s\" --- cannot analyze non-tables or special system tables", RelationGetRelationName(onerel)))); @@ -240,6 +239,12 @@ analyze_rel(Oid relid, RangeVar *relation, */ pgstat_progress_start_command(PROGRESS_COMMAND_ANALYZE, RelationGetRelid(onerel)); + if (AmAutoVacuumWorkerProcess()) + pgstat_progress_update_param(PROGRESS_ANALYZE_STARTED_BY, + PROGRESS_ANALYZE_STARTED_BY_AUTOVACUUM); + else + pgstat_progress_update_param(PROGRESS_ANALYZE_STARTED_BY, + PROGRESS_ANALYZE_STARTED_BY_MANUAL); /* * Do the normal non-recursive ANALYZE. We can skip this for partitioned @@ -275,7 +280,7 @@ analyze_rel(Oid relid, RangeVar *relation, * appropriate acquirefunc for each child table. */ static void -do_analyze_rel(Relation onerel, VacuumParams *params, +do_analyze_rel(Relation onerel, const VacuumParams params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, bool inh, bool in_outer_xact, int elevel) @@ -309,9 +314,9 @@ do_analyze_rel(Relation onerel, VacuumParams *params, PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; - verbose = (params->options & VACOPT_VERBOSE) != 0; + verbose = (params.options & VACOPT_VERBOSE) != 0; instrument = (verbose || (AmAutoVacuumWorkerProcess() && - params->log_min_duration >= 0)); + params.log_analyze_min_duration >= 0)); if (inh) ereport(elevel, (errmsg("analyzing \"%s.%s\" inheritance tree", @@ -690,8 +695,8 @@ do_analyze_rel(Relation onerel, VacuumParams *params, * only do it for inherited stats. (We're never called for not-inherited * stats on partitioned tables anyway.) * - * Reset the changes_since_analyze counter only if we analyzed all - * columns; otherwise, there is still work for auto-analyze to do. + * Reset the mod_since_analyze counter only if we analyzed all columns; + * otherwise, there is still work for auto-analyze to do. */ if (!inh) pgstat_report_analyze(onerel, totalrows, totaldeadrows, @@ -706,7 +711,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params, * amvacuumcleanup() when called in ANALYZE-only mode. The only exception * among core index AMs is GIN/ginvacuumcleanup(). 
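The started_by value reported here surfaces in the pg_stat_progress_analyze view extended earlier in this patch, so a monitoring session can tell manual ANALYZE runs apart from autovacuum-driven ones:

    SELECT pid, relid::regclass AS relation, phase, started_by
      FROM pg_stat_progress_analyze;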
*/ - if (!(params->options & VACOPT_VACUUM)) + if (!(params.options & VACOPT_VACUUM)) { for (ind = 0; ind < nindexes; ind++) { @@ -736,9 +741,9 @@ do_analyze_rel(Relation onerel, VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); - if (verbose || params->log_min_duration == 0 || + if (verbose || params.log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, - params->log_min_duration)) + params.log_analyze_min_duration)) { long delay_in_ms; WalUsage walusage; @@ -832,10 +837,11 @@ do_analyze_rel(Relation onerel, VacuumParams *params, total_blks_read, total_blks_dirtied); appendStringInfo(&buf, - _("WAL usage: %" PRId64 " records, %" PRId64 " full page images, %" PRIu64 " bytes, %" PRId64 " buffers full\n"), + _("WAL usage: %" PRId64 " records, %" PRId64 " full page images, %" PRIu64 " bytes, %" PRIu64 " full page image bytes, %" PRId64 " buffers full\n"), walusage.wal_records, walusage.wal_fpi, walusage.wal_bytes, + walusage.wal_fpi_bytes, walusage.wal_buffers_full); appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0)); @@ -1073,7 +1079,7 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr) /* * Create the VacAttrStats struct. */ - stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats)); + stats = palloc0_object(VacAttrStats); stats->attstattarget = attstattarget; /* @@ -1712,10 +1718,9 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats) i = Anum_pg_statistic_stanumbers1 - 1; for (k = 0; k < STATISTIC_NUM_SLOTS; k++) { - int nnum = stats->numnumbers[k]; - - if (nnum > 0) + if (stats->stanumbers[k] != NULL) { + int nnum = stats->numnumbers[k]; Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum)); ArrayType *arry; @@ -1733,7 +1738,7 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats) i = Anum_pg_statistic_stavalues1 - 1; for (k = 0; k < STATISTIC_NUM_SLOTS; k++) { - if (stats->numvalues[k] > 0) + if (stats->stavalues[k] != NULL) { ArrayType *arry; @@ -1905,7 +1910,7 @@ std_typanalyze(VacAttrStats *stats) NULL); /* Save the operator info for compute_stats routines */ - mystats = (StdAnalyzeData *) palloc(sizeof(StdAnalyzeData)); + mystats = palloc_object(StdAnalyzeData); mystats->eqopr = eqopr; mystats->eqfunc = OidIsValid(eqopr) ? 
get_opcode(eqopr) : InvalidOid; mystats->ltopr = ltopr; @@ -2860,7 +2865,7 @@ compute_scalar_stats(VacAttrStatsP stats, /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); - corrs = (float4 *) palloc(sizeof(float4)); + corrs = palloc_object(float4); MemoryContextSwitchTo(old_context); /*---------- diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 4bd37d5beb559..eb86402cae43c 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -446,9 +446,8 @@ static double asyncQueueUsage(void); static void asyncQueueFillWarning(void); static void SignalBackends(void); static void asyncQueueReadAllNotifications(void); -static bool asyncQueueProcessPageEntries(volatile QueuePosition *current, +static bool asyncQueueProcessPageEntries(QueuePosition *current, QueuePosition stop, - char *page_buffer, Snapshot snapshot); static void asyncQueueAdvanceTail(void); static void ProcessIncomingNotify(bool flush); @@ -1419,6 +1418,7 @@ asyncQueueAddEntries(ListCell *nextNotify) */ qe.length = QUEUE_PAGESIZE - offset; qe.dboid = InvalidOid; + qe.xid = InvalidTransactionId; qe.data[0] = '\0'; /* empty channel */ qe.data[1] = '\0'; /* empty payload */ } @@ -1850,17 +1850,10 @@ ProcessNotifyInterrupt(bool flush) static void asyncQueueReadAllNotifications(void) { - volatile QueuePosition pos; + QueuePosition pos; QueuePosition head; Snapshot snapshot; - /* page_buffer must be adequately aligned, so use a union */ - union - { - char buf[QUEUE_PAGESIZE]; - AsyncQueueEntry align; - } page_buffer; - /* Fetch current state */ LWLockAcquire(NotifyQueueLock, LW_SHARED); /* Assert checks that we have a valid state entry */ @@ -1920,49 +1913,27 @@ asyncQueueReadAllNotifications(void) * It is possible that we fail while trying to send a message to our * frontend (for example, because of encoding conversion failure). If * that happens it is critical that we not try to send the same message - * over and over again. Therefore, we place a PG_TRY block here that will - * forcibly advance our queue position before we lose control to an error. - * (We could alternatively retake NotifyQueueLock and move the position - * before handling each individual message, but that seems like too much - * lock traffic.) + * over and over again. Therefore, we set ExitOnAnyError to upgrade any + * ERRORs to FATAL, causing the client connection to be closed on error. + * + * We used to only skip over the offending message and try to soldier on, + * but it was somewhat questionable to lose a notification and give the + * client an ERROR instead. A client application is not prepared for + * that and can't tell that a notification was missed. It was also not + * very useful in practice because notifications are often processed while + * a connection is idle and reading a message from the client, and in that + * state, any error is upgraded to FATAL anyway. Closing the connection + * is a clear signal to the application that it might have missed + * notifications. */ - PG_TRY(); { + bool save_ExitOnAnyError = ExitOnAnyError; bool reachedStop; + ExitOnAnyError = true; + do { - int64 curpage = QUEUE_POS_PAGE(pos); - int curoffset = QUEUE_POS_OFFSET(pos); - int slotno; - int copysize; - - /* - * We copy the data from SLRU into a local buffer, so as to avoid - * holding the SLRU lock while we are examining the entries and - * possibly transmitting them to our frontend. Copy only the part - * of the page we will actually inspect. 
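For orientation, the queue entries processed here are produced and consumed with the standard primitives; with the rework above, a delivery failure now closes the connection instead of silently skipping the entry (channel and payload are illustrative):

    LISTEN my_channel;
    NOTIFY my_channel, 'hello';
    -- equivalent to NOTIFY, but usable inside functions:
    SELECT pg_notify('my_channel', 'hello');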
- */ - slotno = SimpleLruReadPage_ReadOnly(NotifyCtl, curpage, - InvalidTransactionId); - if (curpage == QUEUE_POS_PAGE(head)) - { - /* we only want to read as far as head */ - copysize = QUEUE_POS_OFFSET(head) - curoffset; - if (copysize < 0) - copysize = 0; /* just for safety */ - } - else - { - /* fetch all the rest of the page */ - copysize = QUEUE_PAGESIZE - curoffset; - } - memcpy(page_buffer.buf + curoffset, - NotifyCtl->shared->page_buffer[slotno] + curoffset, - copysize); - /* Release lock that we got from SimpleLruReadPage_ReadOnly() */ - LWLockRelease(SimpleLruGetBankLock(NotifyCtl, curpage)); - /* * Process messages up to the stop position, end of page, or an * uncommitted message. @@ -1978,19 +1949,16 @@ asyncQueueReadAllNotifications(void) * rewrite pages under us. Especially we don't want to hold a lock * while sending the notifications to the frontend. */ - reachedStop = asyncQueueProcessPageEntries(&pos, head, - page_buffer.buf, - snapshot); + reachedStop = asyncQueueProcessPageEntries(&pos, head, snapshot); } while (!reachedStop); - } - PG_FINALLY(); - { + /* Update shared state */ LWLockAcquire(NotifyQueueLock, LW_SHARED); QUEUE_BACKEND_POS(MyProcNumber) = pos; LWLockRelease(NotifyQueueLock); + + ExitOnAnyError = save_ExitOnAnyError; } - PG_END_TRY(); /* Done with snapshot */ UnregisterSnapshot(snapshot); @@ -2000,31 +1968,38 @@ asyncQueueReadAllNotifications(void) * Fetch notifications from the shared queue, beginning at position current, * and deliver relevant ones to my frontend. * - * The current page must have been fetched into page_buffer from shared - * memory. (We could access the page right in shared memory, but that - * would imply holding the SLRU bank lock throughout this routine.) - * - * We stop if we reach the "stop" position, or reach a notification from an - * uncommitted transaction, or reach the end of the page. - * * The function returns true once we have reached the stop position or an * uncommitted notification, and false if we have finished with the page. * In other words: once it returns true there is no need to look further. * The QueuePosition *current is advanced past all processed messages. */ static bool -asyncQueueProcessPageEntries(volatile QueuePosition *current, +asyncQueueProcessPageEntries(QueuePosition *current, QueuePosition stop, - char *page_buffer, Snapshot snapshot) { + int64 curpage = QUEUE_POS_PAGE(*current); + int slotno; + char *page_buffer; bool reachedStop = false; bool reachedEndOfPage; - AsyncQueueEntry *qe; + + /* + * We copy the entries into a local buffer to avoid holding the SLRU lock + * while we transmit them to our frontend. The local buffer must be + * adequately aligned. + */ + alignas(AsyncQueueEntry) char local_buf[QUEUE_PAGESIZE]; + char *local_buf_end = local_buf; + + slotno = SimpleLruReadPage_ReadOnly(NotifyCtl, curpage, + InvalidTransactionId); + page_buffer = NotifyCtl->shared->page_buffer[slotno]; do { QueuePosition thisentry = *current; + AsyncQueueEntry *qe; if (QUEUE_POS_EQUAL(thisentry, stop)) break; @@ -2066,18 +2041,23 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current, reachedStop = true; break; } - else if (TransactionIdDidCommit(qe->xid)) - { - /* qe->data is the null-terminated channel name */ - char *channel = qe->data; - if (IsListeningOn(channel)) - { - /* payload follows channel name */ - char *payload = qe->data + strlen(channel) + 1; + /* + * Quick check for the case that we're not listening on any + * channels, before calling TransactionIdDidCommit(). 
This makes + that case a little faster, but more importantly, it ensures + that if there's a bad entry in the queue for which + TransactionIdDidCommit() fails for some reason, we can skip + over it on the first LISTEN in a session, and not get stuck on + it indefinitely. + */ + if (listenChannels == NIL) + continue; - NotifyMyFrontEnd(channel, payload, qe->srcPid); - } + if (TransactionIdDidCommit(qe->xid)) + { + memcpy(local_buf_end, qe, qe->length); + local_buf_end += qe->length; } else { @@ -2091,6 +2071,32 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current, /* Loop back if we're not at end of page */ } while (!reachedEndOfPage); + /* Release lock that we got from SimpleLruReadPage_ReadOnly() */ + LWLockRelease(SimpleLruGetBankLock(NotifyCtl, curpage)); + + /* + * Now that we have let go of the SLRU bank lock, send the notifications + * to our frontend + */ + Assert(local_buf_end - local_buf <= BLCKSZ); + for (char *p = local_buf; p < local_buf_end;) + { + AsyncQueueEntry *qe = (AsyncQueueEntry *) p; + + /* qe->data is the null-terminated channel name */ + char *channel = qe->data; + + if (IsListeningOn(channel)) + { + /* payload follows channel name */ + char *payload = qe->data + strlen(channel) + 1; + + NotifyMyFrontEnd(channel, payload, qe->srcPid); + } + + p += qe->length; + } + if (QUEUE_POS_EQUAL(*current, stop)) reachedStop = true; @@ -2168,6 +2174,120 @@ asyncQueueAdvanceTail(void) LWLockRelease(NotifyQueueTailLock); } +/* + * AsyncNotifyFreezeXids + * + * Prepare the async notification queue for CLOG truncation by freezing + * transaction IDs that are about to become inaccessible. + * + * This function is called by VACUUM before advancing datfrozenxid. It scans + * the notification queue and replaces XIDs that would become inaccessible + * after CLOG truncation with special markers: + * - Committed transactions are set to FrozenTransactionId + * - Aborted/crashed transactions are set to InvalidTransactionId + * + * Only XIDs < newFrozenXid are processed, as those are the ones whose CLOG + * pages will be truncated. If XID < newFrozenXid, it cannot still be running + * (or it would have held back newFrozenXid through ProcArray). + * Therefore, if TransactionIdDidCommit returns false, we know the transaction + * either aborted explicitly or crashed, and we can safely mark it invalid. + */ +void +AsyncNotifyFreezeXids(TransactionId newFrozenXid) +{ + QueuePosition pos; + QueuePosition head; + int64 curpage = -1; + int slotno = -1; + char *page_buffer = NULL; + bool page_dirty = false; + + /* + * Acquire locks in the correct order to avoid deadlocks. As per the + * locking protocol: NotifyQueueTailLock, then NotifyQueueLock, then SLRU + * bank locks. + * + * We only need SHARED mode since we're just reading the head/tail + * positions, not modifying them. + */ + LWLockAcquire(NotifyQueueTailLock, LW_SHARED); + LWLockAcquire(NotifyQueueLock, LW_SHARED); + + pos = QUEUE_TAIL; + head = QUEUE_HEAD; + + /* Release NotifyQueueLock early, we only needed to read the positions */ + LWLockRelease(NotifyQueueLock); + + /* + * Scan the queue from tail to head, freezing XIDs as needed. We hold + * NotifyQueueTailLock throughout to ensure the tail doesn't move while + * we're working. 
+ */ + while (!QUEUE_POS_EQUAL(pos, head)) + { + AsyncQueueEntry *qe; + TransactionId xid; + int64 pageno = QUEUE_POS_PAGE(pos); + int offset = QUEUE_POS_OFFSET(pos); + + /* If we need a different page, release old lock and get new one */ + if (pageno != curpage) + { + LWLock *lock; + + /* Release previous page if any */ + if (slotno >= 0) + { + if (page_dirty) + { + NotifyCtl->shared->page_dirty[slotno] = true; + page_dirty = false; + } + LWLockRelease(SimpleLruGetBankLock(NotifyCtl, curpage)); + } + + lock = SimpleLruGetBankLock(NotifyCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + slotno = SimpleLruReadPage(NotifyCtl, pageno, true, + InvalidTransactionId); + page_buffer = NotifyCtl->shared->page_buffer[slotno]; + curpage = pageno; + } + + qe = (AsyncQueueEntry *) (page_buffer + offset); + xid = qe->xid; + + if (TransactionIdIsNormal(xid) && + TransactionIdPrecedes(xid, newFrozenXid)) + { + if (TransactionIdDidCommit(xid)) + { + qe->xid = FrozenTransactionId; + page_dirty = true; + } + else + { + qe->xid = InvalidTransactionId; + page_dirty = true; + } + } + + /* Advance to next entry */ + asyncQueueAdvance(&pos, qe->length); + } + + /* Release final page lock if we acquired one */ + if (slotno >= 0) + { + if (page_dirty) + NotifyCtl->shared->page_dirty[slotno] = true; + LWLockRelease(SimpleLruGetBankLock(NotifyCtl, curpage)); + } + + LWLockRelease(NotifyQueueTailLock); +} + /* * ProcessIncomingNotify * diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 54a08e4102e14..2120c85ccb4cb 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -124,8 +124,8 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized CLUSTER option \"%s\"", - opt->defname), + errmsg("unrecognized %s option \"%s\"", + "CLUSTER", opt->defname), parser_errposition(pstate, opt->location))); } @@ -917,7 +917,7 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb * not to be aggressive about this. 
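The shared message format introduced above applies to the parenthesized CLUSTER syntax; a sketch with hypothetical table and index names:

    -- accepted: VERBOSE is a recognized option
    CLUSTER (VERBOSE) my_table USING my_table_pkey;
    -- fails with: unrecognized CLUSTER option "bogus"
    CLUSTER (BOGUS) my_table USING my_table_pkey;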
*/ memset(&params, 0, sizeof(VacuumParams)); - vacuum_get_cutoffs(OldHeap, &params, &cutoffs); + vacuum_get_cutoffs(OldHeap, params, &cutoffs); /* * FreezeXid will become the table's new relfrozenxid, and that mustn't go @@ -1672,7 +1672,7 @@ get_tables_to_cluster(MemoryContext cluster_context) /* Use a permanent memory context for the result list */ old_context = MemoryContextSwitchTo(cluster_context); - rtc = (RelToCluster *) palloc(sizeof(RelToCluster)); + rtc = palloc_object(RelToCluster); rtc->tableOid = index->indrelid; rtc->indexOid = index->indexrelid; rtcs = lappend(rtcs, rtc); @@ -1726,7 +1726,7 @@ get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid) /* Use a permanent memory context for the result list */ old_context = MemoryContextSwitchTo(cluster_context); - rtc = (RelToCluster *) palloc(sizeof(RelToCluster)); + rtc = palloc_object(RelToCluster); rtc->tableOid = relid; rtc->indexOid = indexrelid; rtcs = lappend(rtcs, rtc); diff --git a/src/backend/commands/comment.c b/src/backend/commands/comment.c index f67a8b95d29de..5c783cc61f1d7 100644 --- a/src/backend/commands/comment.c +++ b/src/backend/commands/comment.c @@ -20,10 +20,10 @@ #include "access/table.h" #include "catalog/indexing.h" #include "catalog/objectaddress.h" +#include "catalog/pg_database.h" #include "catalog/pg_description.h" #include "catalog/pg_shdescription.h" #include "commands/comment.h" -#include "commands/dbcommands.h" #include "miscadmin.h" #include "utils/builtins.h" #include "utils/fmgroids.h" diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 74ae42b19a710..6454a39a01f0c 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -133,6 +133,9 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, if (stmt->whereClause) { + Bitmapset *expr_attrs = NULL; + int i; + /* add nsitem to query namespace */ addNSItemToQuery(pstate, nsitem, false, true, true); @@ -145,6 +148,42 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, /* we have to fix its collations too */ assign_expr_collations(pstate, whereClause); + /* + * Examine all the columns in the WHERE clause expression. When + * the whole-row reference is present, examine all the columns of + * the table. + */ + pull_varattnos(whereClause, 1, &expr_attrs); + if (bms_is_member(0 - FirstLowInvalidHeapAttributeNumber, expr_attrs)) + { + expr_attrs = bms_add_range(expr_attrs, + 1 - FirstLowInvalidHeapAttributeNumber, + RelationGetNumberOfAttributes(rel) - FirstLowInvalidHeapAttributeNumber); + expr_attrs = bms_del_member(expr_attrs, 0 - FirstLowInvalidHeapAttributeNumber); + } + + i = -1; + while ((i = bms_next_member(expr_attrs, i)) >= 0) + { + AttrNumber attno = i + FirstLowInvalidHeapAttributeNumber; + + Assert(attno != 0); + + /* + * Prohibit generated columns in the WHERE clause. Stored + * generated columns are not yet computed when the filtering + * happens. Virtual generated columns could probably work (we + * would need to expand them somewhere around here), but for + * now we keep them consistent with the stored variant. 
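A minimal reproduction of the restriction described above (and enforced by the check that follows), with a hypothetical table; the WHERE clause is rejected because b is not yet computed when rows are filtered:

    CREATE TABLE t (a int, b int GENERATED ALWAYS AS (a * 2) STORED);
    -- fails: generated columns are not supported in COPY FROM WHERE conditions
    COPY t FROM STDIN WHERE b > 10;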
+ */ + if (TupleDescAttr(RelationGetDescr(rel), attno - 1)->attgenerated) + ereport(ERROR, + errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("generated columns are not supported in COPY FROM WHERE conditions"), + errdetail("Column \"%s\" is a generated column.", + get_attname(RelationGetRelid(rel), attno, false))); + } + whereClause = eval_const_expressions(NULL, whereClause); whereClause = (Node *) canonicalize_qual((Expr *) whereClause, false); @@ -251,11 +290,15 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, * relation which we have opened and locked. Use "ONLY" so that * COPY retrieves rows from only the target table not any * inheritance children, the same as when RLS doesn't apply. + * + * However, when copying data from a partitioned table, we don't + * use "ONLY", since we need to retrieve rows from its descendant + * tables too. */ from = makeRangeVar(get_namespace_name(RelationGetNamespace(rel)), pstrdup(RelationGetRelationName(rel)), -1); - from->inh = false; /* apply ONLY */ + from->inh = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); /* Build query */ select = makeNode(SelectStmt); @@ -322,11 +365,13 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, } /* - * Extract a CopyHeaderChoice value from a DefElem. This is like - * defGetBoolean() but also accepts the special value "match". + * Extract the CopyFormatOptions.header_line value from a DefElem. + * + * Parses the HEADER option for COPY, which can be a boolean, a non-negative + * integer (number of lines to skip), or the special value "match". */ -static CopyHeaderChoice -defGetCopyHeaderChoice(DefElem *def, bool is_from) +static int +defGetCopyHeaderOption(DefElem *def, bool is_from) { /* * If no parameter value given, assume "true" is meant. @@ -335,20 +380,27 @@ defGetCopyHeaderChoice(DefElem *def, bool is_from) return COPY_HEADER_TRUE; /* - * Allow 0, 1, "true", "false", "on", "off", or "match". + * Allow 0, 1, "true", "false", "on", "off", a non-negative integer, or + * "match". 
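With the relaxed parsing above, HEADER now accepts a line count on input; a sketch of both directions with hypothetical paths:

    -- skip a two-line header while loading
    COPY t FROM '/tmp/input.csv' WITH (FORMAT csv, HEADER 2);
    -- COPY TO still emits at most one header line; HEADER 2 here would be rejected
    COPY t TO '/tmp/output.csv' WITH (FORMAT csv, HEADER true);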
*/ switch (nodeTag(def->arg)) { case T_Integer: - switch (intVal(def->arg)) { - case 0: - return COPY_HEADER_FALSE; - case 1: - return COPY_HEADER_TRUE; - default: - /* otherwise, error out below */ - break; + int ival = intVal(def->arg); + + if (ival < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("a negative integer value cannot be " + "specified for %s", def->defname))); + + if (!is_from && ival > 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use multi-line header in COPY TO"))); + + return ival; } break; default: @@ -381,7 +433,8 @@ defGetCopyHeaderChoice(DefElem *def, bool is_from) } ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("%s requires a Boolean value or \"match\"", + errmsg("%s requires a Boolean value, a non-negative integer, " + "or the string \"match\"", def->defname))); return COPY_HEADER_FALSE; /* keep compiler quiet */ } @@ -508,7 +561,7 @@ ProcessCopyOptions(ParseState *pstate, /* Support external use for option sanity checking */ if (opts_out == NULL) - opts_out = (CopyFormatOptions *) palloc0(sizeof(CopyFormatOptions)); + opts_out = palloc0_object(CopyFormatOptions); opts_out->file_encoding = -1; @@ -566,7 +619,7 @@ ProcessCopyOptions(ParseState *pstate, if (header_specified) errorConflictingDefElem(defel, pstate); header_specified = true; - opts_out->header_line = defGetCopyHeaderChoice(defel, is_from); + opts_out->header_line = defGetCopyHeaderOption(defel, is_from); } else if (strcmp(defel->defname, "quote") == 0) { @@ -769,7 +822,7 @@ ProcessCopyOptions(ParseState *pstate, errmsg("COPY delimiter cannot be \"%s\"", opts_out->delim))); /* Check header */ - if (opts_out->binary && opts_out->header_line) + if (opts_out->binary && opts_out->header_line != COPY_HEADER_FALSE) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), /*- translator: %s is the name of a COPY option, e.g. ON_ERROR */ diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index fbbbc09a97b17..2ae3d2ba86e76 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -364,7 +364,7 @@ CopyMultiInsertBufferInit(ResultRelInfo *rri) { CopyMultiInsertBuffer *buffer; - buffer = (CopyMultiInsertBuffer *) palloc(sizeof(CopyMultiInsertBuffer)); + buffer = palloc_object(CopyMultiInsertBuffer); memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES); buffer->resultRelInfo = rri; buffer->bistate = (rri->ri_FdwRoutine == NULL) ? GetBulkInsertState() : NULL; @@ -919,7 +919,7 @@ CopyFrom(CopyFromState cstate) ExecInitResultRelation(estate, resultRelInfo, 1); /* Verify the named relation is a valid target for INSERT */ - CheckValidResultRel(resultRelInfo, CMD_INSERT, NIL); + CheckValidResultRel(resultRelInfo, CMD_INSERT, ONCONFLICT_NONE, NIL); ExecOpenIndices(resultRelInfo, false); @@ -1558,7 +1558,7 @@ BeginCopyFrom(ParseState *pstate, }; /* Allocate workspace and zero all fields */ - cstate = (CopyFromStateData *) palloc0(sizeof(CopyFromStateData)); + cstate = palloc0_object(CopyFromStateData); /* * We allocate everything used by a cstate in a new memory context. 
This diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index f5fc346e2013b..62afcd8fad114 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -335,7 +335,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) if (avail > maxread) avail = maxread; pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail); - databuf = (void *) ((char *) databuf + avail); + databuf = (char *) databuf + avail; maxread -= avail; bytesread += avail; } @@ -771,21 +771,30 @@ static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv) { int fldct; - bool done; + bool done = false; /* only available for text or csv input */ Assert(!cstate->opts.binary); /* on input check that the header line is correct if needed */ - if (cstate->cur_lineno == 0 && cstate->opts.header_line) + if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE) { ListCell *cur; TupleDesc tupDesc; + int lines_to_skip = cstate->opts.header_line; + + /* If set to "match", one header line is skipped */ + if (cstate->opts.header_line == COPY_HEADER_MATCH) + lines_to_skip = 1; tupDesc = RelationGetDescr(cstate->rel); - cstate->cur_lineno++; - done = CopyReadLine(cstate, is_csv); + for (int i = 0; i < lines_to_skip; i++) + { + cstate->cur_lineno++; + if ((done = CopyReadLine(cstate, is_csv))) + break; + } if (cstate->opts.header_line == COPY_HEADER_MATCH) { @@ -1127,7 +1136,7 @@ CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), errmsg("row field count is %d, expected %d", - (int) fld_count, attr_count))); + fld_count, attr_count))); foreach(cur, cstate->attnumlist) { @@ -1538,7 +1547,7 @@ GetDecimalFromHex(char hex) if (isdigit((unsigned char) hex)) return hex - '0'; else - return tolower((unsigned char) hex) - 'a' + 10; + return pg_ascii_tolower((unsigned char) hex) - 'a' + 10; } /* diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c index ea6f18f2c8008..dae91630ac3ea 100644 --- a/src/backend/commands/copyto.c +++ b/src/backend/commands/copyto.c @@ -18,7 +18,9 @@ #include <ctype.h> #include <unistd.h> +#include "access/table.h" #include "access/tableam.h" +#include "catalog/pg_inherits.h" #include "commands/copyapi.h" #include "commands/progress.h" #include "executor/execdesc.h" @@ -86,6 +88,7 @@ typedef struct CopyToStateData CopyFormatOptions opts; Node *whereClause; /* WHERE condition (or NULL) */ + List *partitions; /* OID list of partitions to copy data from */ /* * Working state @@ -116,6 +119,8 @@ static void CopyOneRowTo(CopyToState cstate, TupleTableSlot *slot); static void CopyAttributeOutText(CopyToState cstate, const char *string); static void CopyAttributeOutCSV(CopyToState cstate, const char *string, bool use_quote); +static void CopyRelationTo(CopyToState cstate, Relation rel, Relation root_rel, + uint64 *processed); /* built-in format-specific routines */ static void CopyToTextLikeStart(CopyToState cstate, TupleDesc tupDesc); @@ -199,7 +204,7 @@ CopyToTextLikeStart(CopyToState cstate, TupleDesc tupDesc) cstate->file_encoding); /* if a header has been requested send the line */ - if (cstate->opts.header_line) + if (cstate->opts.header_line == COPY_HEADER_TRUE) { ListCell *cur; bool hdr_delim = false; @@ -581,7 +586,7 @@ ClosePipeToProgram(CopyToState cstate) } /* - * Release resources allocated in a cstate for COPY TO/FROM. 
+ * Release resources allocated in a cstate for COPY TO. */ static void EndCopy(CopyToState cstate) @@ -602,6 +607,10 @@ EndCopy(CopyToState cstate) pgstat_progress_end_command(); MemoryContextDelete(cstate->copycontext); + + if (cstate->partitions) + list_free(cstate->partitions); + pfree(cstate); } @@ -643,6 +652,7 @@ BeginCopyTo(ParseState *pstate, PROGRESS_COPY_COMMAND_TO, 0 }; + List *children = NIL; if (rel != NULL && rel->rd_rel->relkind != RELKIND_RELATION) { @@ -673,11 +683,34 @@ BeginCopyTo(ParseState *pstate, errmsg("cannot copy from sequence \"%s\"", RelationGetRelationName(rel)))); else if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot copy from partitioned table \"%s\"", - RelationGetRelationName(rel)), - errhint("Try the COPY (SELECT ...) TO variant."))); + { + /* + * Collect the OIDs of the relations containing data, so that + * DoCopyTo can later copy the data from them. + */ + children = find_all_inheritors(RelationGetRelid(rel), AccessShareLock, NULL); + + foreach_oid(child, children) + { + char relkind = get_rel_relkind(child); + + if (relkind == RELKIND_FOREIGN_TABLE) + { + char *relation_name = get_rel_name(child); + + ereport(ERROR, + errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot copy from foreign table \"%s\"", relation_name), + errdetail("Partition \"%s\" is a foreign table in partitioned table \"%s\".", + relation_name, RelationGetRelationName(rel)), + errhint("Try the COPY (SELECT ...) TO variant.")); + } + + /* Exclude partitioned tables, which have no data of their own */ + if (RELKIND_HAS_PARTITIONS(relkind)) + children = foreach_delete_current(children, child); + } + } else ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -687,7 +720,7 @@ BeginCopyTo(ParseState *pstate, /* Allocate workspace and zero all fields */ - cstate = (CopyToStateData *) palloc0(sizeof(CopyToStateData)); + cstate = palloc0_object(CopyToStateData); /* * We allocate everything used by a cstate in a new memory context. This @@ -713,6 +746,7 @@ BeginCopyTo(ParseState *pstate, cstate->rel = rel; tupDesc = RelationGetDescr(cstate->rel); + cstate->partitions = children; } else { @@ -722,6 +756,7 @@ BeginCopyTo(ParseState *pstate, DestReceiver *dest; cstate->rel = NULL; + cstate->partitions = NIL; /* * Run parse analysis and rewrite. Note this also acquires sufficient @@ -796,7 +831,7 @@ BeginCopyTo(ParseState *pstate, /* plan the query */ plan = pg_plan_query(query, pstate->p_sourcetext, - CURSOR_OPT_PARALLEL_OK, NULL); + CURSOR_OPT_PARALLEL_OK, NULL, NULL); /* * With row-level security and a user using "COPY relation TO", we @@ -1030,7 +1065,7 @@ DoCopyTo(CopyToState cstate) TupleDesc tupDesc; int num_phys_attrs; ListCell *cur; - uint64 processed; + uint64 processed = 0; if (fe_copy) SendCopyBegin(cstate); @@ -1070,33 +1105,24 @@ DoCopyTo(CopyToState cstate) if (cstate->rel) { - TupleTableSlot *slot; - TableScanDesc scandesc; - - scandesc = table_beginscan(cstate->rel, GetActiveSnapshot(), 0, NULL); - slot = table_slot_create(cstate->rel, NULL); - - processed = 0; - while (table_scan_getnextslot(scandesc, ForwardScanDirection, slot)) + /* + * If the COPY TO source is a partitioned table, open and process + * each of its partitions in turn. + */ + if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { - CHECK_FOR_INTERRUPTS(); - - /* Deconstruct the tuple ... 
*/ - slot_getallattrs(slot); - - /* Format and send the data */ - CopyOneRowTo(cstate, slot); + foreach_oid(child, cstate->partitions) + { + Relation scan_rel; - /* - * Increment the number of processed tuples, and report the - * progress. - */ - pgstat_progress_update_param(PROGRESS_COPY_TUPLES_PROCESSED, - ++processed); + /* We already got the lock in BeginCopyTo */ + scan_rel = table_open(child, NoLock); + CopyRelationTo(cstate, scan_rel, cstate->rel, &processed); + table_close(scan_rel, NoLock); + } } - - ExecDropSingleTupleTableSlot(slot); - table_endscan(scandesc); + else + CopyRelationTo(cstate, cstate->rel, NULL, &processed); } else { @@ -1115,6 +1141,73 @@ DoCopyTo(CopyToState cstate) return processed; } +/* + * Scans a single table and exports its rows to the COPY destination. + * + * root_rel can be set to the root table of rel if rel is a partition, + * so that we can send tuples in root_rel's rowtype, which might differ + * from that of the individual partitions. + */ +static void +CopyRelationTo(CopyToState cstate, Relation rel, Relation root_rel, uint64 *processed) +{ + TupleTableSlot *slot; + TableScanDesc scandesc; + AttrMap *map = NULL; + TupleTableSlot *root_slot = NULL; + + scandesc = table_beginscan(rel, GetActiveSnapshot(), 0, NULL); + slot = table_slot_create(rel, NULL); + + /* + * If we are exporting partition data here, check whether tuples need to + * be converted to the root table's rowtype, because a partition might + * have a different column order than its root table. + */ + if (root_rel != NULL) + { + root_slot = table_slot_create(root_rel, NULL); + map = build_attrmap_by_name_if_req(RelationGetDescr(root_rel), + RelationGetDescr(rel), + false); + } + + while (table_scan_getnextslot(scandesc, ForwardScanDirection, slot)) + { + TupleTableSlot *copyslot; + + CHECK_FOR_INTERRUPTS(); + + if (map != NULL) + copyslot = execute_attr_map_slot(map, slot, root_slot); + else + { + /* Deconstruct the tuple */ + slot_getallattrs(slot); + copyslot = slot; + } + + /* Format and send the data */ + CopyOneRowTo(cstate, copyslot); + + /* + * Increment the number of processed tuples, and report the progress. + */ + pgstat_progress_update_param(PROGRESS_COPY_TUPLES_PROCESSED, + ++(*processed)); + } + + ExecDropSingleTupleTableSlot(slot); + + if (root_slot != NULL) + ExecDropSingleTupleTableSlot(root_slot); + + if (map != NULL) + free_attrmap(map); + + table_endscan(scandesc); +} /* * Emit one row during DoCopyTo(). 
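Taken together, BeginCopyTo and CopyRelationTo make direct COPY of a partitioned table work, where it previously failed with "cannot copy from partitioned table". A sketch with hypothetical tables; rows from every partition come out in the root rowtype:

    CREATE TABLE measurements (city text, reading int) PARTITION BY LIST (city);
    CREATE TABLE measurements_east PARTITION OF measurements FOR VALUES IN ('east');
    CREATE TABLE measurements_west PARTITION OF measurements FOR VALUES IN ('west');
    INSERT INTO measurements VALUES ('east', 1), ('west', 2);
    COPY measurements TO STDOUT;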
*/ @@ -1434,7 +1527,7 @@ copy_dest_destroy(DestReceiver *self) DestReceiver * CreateCopyDestReceiver(void) { - DR_copy *self = (DR_copy *) palloc(sizeof(DR_copy)); + DR_copy *self = palloc_object(DR_copy); self->pub.receiveSlot = copy_dest_receive; self->pub.rStartup = copy_dest_startup; diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index dfd2ab8e8628c..ddc45e3aa0d3f 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -321,7 +321,7 @@ ExecCreateTableAs(ParseState *pstate, CreateTableAsStmt *stmt, /* plan the query */ plan = pg_plan_query(query, pstate->p_sourcetext, - CURSOR_OPT_PARALLEL_OK, params); + CURSOR_OPT_PARALLEL_OK, params, NULL); /* * Use a snapshot with an updated command ID to ensure this query sees @@ -439,7 +439,7 @@ CreateTableAsRelExists(CreateTableAsStmt *ctas) DestReceiver * CreateIntoRelDestReceiver(IntoClause *intoClause) { - DR_intorel *self = (DR_intorel *) palloc0(sizeof(DR_intorel)); + DR_intorel *self = palloc0_object(DR_intorel); self->pub.receiveSlot = intorel_receive; self->pub.rStartup = intorel_startup; diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 5fbbcdaabb1d2..d1f3be89b35f0 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -64,6 +64,7 @@ #include "utils/acl.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/lsyscache.h" #include "utils/pg_locale.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" @@ -430,7 +431,7 @@ ScanSourceDatabasePgClassTuple(HeapTupleData *tuple, Oid tbid, Oid dbid, classForm->oid); /* Prepare a rel info element and add it to the list. */ - relinfo = (CreateDBRelInfo *) palloc(sizeof(CreateDBRelInfo)); + relinfo = palloc_object(CreateDBRelInfo); if (OidIsValid(classForm->reltablespace)) relinfo->rlocator.spcOid = classForm->reltablespace; else @@ -570,8 +571,8 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, * any CREATE DATABASE commands. */ if (!IsBinaryUpgrade) - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | - CHECKPOINT_WAIT | CHECKPOINT_FLUSH_ALL); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | + CHECKPOINT_WAIT | CHECKPOINT_FLUSH_UNLOGGED); /* * Iterate through all tablespaces of the template database, and copy each @@ -673,7 +674,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, * strategy that avoids these problems. 
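The checkpoint-heavy path above is specific to the file-copy strategy; the default WAL-logged strategy avoids it. Assuming the standard CREATE DATABASE syntax, the slow path can be requested explicitly (database name hypothetical):

    CREATE DATABASE clone_db TEMPLATE template0 STRATEGY file_copy;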
*/ if (!IsBinaryUpgrade) - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); } @@ -1052,7 +1053,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbctype = src_ctype; if (dblocprovider == '\0') dblocprovider = src_locprovider; - if (dblocale == NULL) + if (dblocale == NULL && dblocprovider == src_locprovider) dblocale = src_locale; if (dbicurules == NULL) dbicurules = src_icurules; @@ -1065,16 +1066,41 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) /* Check that the chosen locales are valid, and get canonical spellings */ if (!check_locale(LC_COLLATE, dbcollate, &canonname)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate), - errhint("If the locale name is specific to ICU, use ICU_LOCALE."))); + { + if (dblocprovider == COLLPROVIDER_BUILTIN) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate), + errhint("If the locale name is specific to the builtin provider, use BUILTIN_LOCALE."))); + else if (dblocprovider == COLLPROVIDER_ICU) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate), + errhint("If the locale name is specific to the ICU provider, use ICU_LOCALE."))); + else + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate))); + } dbcollate = canonname; if (!check_locale(LC_CTYPE, dbctype, &canonname)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype), - errhint("If the locale name is specific to ICU, use ICU_LOCALE."))); + { + if (dblocprovider == COLLPROVIDER_BUILTIN) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype), + errhint("If the locale name is specific to the builtin provider, use BUILTIN_LOCALE."))); + else if (dblocprovider == COLLPROVIDER_ICU) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype), + errhint("If the locale name is specific to the ICU provider, use ICU_LOCALE."))); + else + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype))); + } + dbctype = canonname; check_encoding_locale_matches(encoding, dbcollate, dbctype); @@ -1845,7 +1871,7 @@ dropdb(const char *dbname, bool missing_ok, bool force) * Force a checkpoint to make sure the checkpointer has received the * message sent by ForgetDatabaseSyncRequests. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); /* Close all smgr fds in all backends. */ WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); @@ -2095,8 +2121,8 @@ movedb(const char *dbname, const char *tblspcname) * On Windows, this also ensures that background procs don't hold any open * files, which would cause rmdir() to fail. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT - | CHECKPOINT_FLUSH_ALL); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT + | CHECKPOINT_FLUSH_UNLOGGED); /* Close all smgr fds in all backends. 
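The expanded hints steer users toward the provider-specific options; roughly, valid forms for each provider look like this (database names hypothetical; the ICU form assumes an ICU-enabled build):

    -- builtin provider: provider-specific locale names go in BUILTIN_LOCALE
    CREATE DATABASE d1 TEMPLATE template0 ENCODING 'UTF8'
        LOCALE_PROVIDER builtin BUILTIN_LOCALE 'C.UTF-8';
    -- ICU provider: provider-specific locale names go in ICU_LOCALE
    CREATE DATABASE d2 TEMPLATE template0 ENCODING 'UTF8'
        LOCALE_PROVIDER icu ICU_LOCALE 'en-US';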
*/ WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); @@ -2227,7 +2253,7 @@ movedb(const char *dbname, const char *tblspcname) * any unlogged operations done in the new DB tablespace before the * next checkpoint. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); /* * Force synchronous commit, thus minimizing the window between @@ -2328,7 +2354,8 @@ DropDatabase(ParseState *pstate, DropdbStmt *stmt) else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized DROP DATABASE option \"%s\"", opt->defname), + errmsg("unrecognized %s option \"%s\"", + "DROP DATABASE", opt->defname), parser_errposition(pstate, opt->location))); } @@ -3179,30 +3206,6 @@ get_database_oid(const char *dbname, bool missing_ok) } -/* - * get_database_name - given a database OID, look up the name - * - * Returns a palloc'd string, or NULL if no such database. - */ -char * -get_database_name(Oid dbid) -{ - HeapTuple dbtuple; - char *result; - - dbtuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(dbid)); - if (HeapTupleIsValid(dbtuple)) - { - result = pstrdup(NameStr(((Form_pg_database) GETSTRUCT(dbtuple))->datname)); - ReleaseSysCache(dbtuple); - } - else - result = NULL; - - return result; -} - - /* * While dropping a database the pg_database row is marked invalid, but the * catalog contents still exist. Connections to such a database are not @@ -3373,6 +3376,7 @@ dbase_redo(XLogReaderState *record) parent_path = pstrdup(dbpath); get_parent_directory(parent_path); recovery_create_dbdir(parent_path, true); + pfree(parent_path); /* Create the database directory with the version file. */ CreateDirAndVersionFile(dbpath, xlrec->db_id, xlrec->tablespace_id, diff --git a/src/backend/commands/define.c b/src/backend/commands/define.c index 5e1b867e6f733..3e238c414f7ef 100644 --- a/src/backend/commands/define.c +++ b/src/backend/commands/define.c @@ -42,7 +42,7 @@ defGetString(DefElem *def) switch (nodeTag(def->arg)) { case T_Integer: - return psprintf("%ld", (long) intVal(def->arg)); + return psprintf("%d", intVal(def->arg)); case T_Float: return castNode(Float, def->arg)->fval; case T_Boolean: diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index edc2c988e2934..e718230813018 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -21,6 +21,7 @@ #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/objectaccess.h" +#include "catalog/pg_attrdef.h" #include "catalog/pg_authid.h" #include "catalog/pg_auth_members.h" #include "catalog/pg_database.h" @@ -29,6 +30,7 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_parameter_acl.h" +#include "catalog/pg_policy.h" #include "catalog/pg_proc.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_trigger.h" @@ -109,6 +111,8 @@ static Oid insert_event_trigger_tuple(const char *trigname, const char *eventnam static void validate_ddl_tags(const char *filtervar, List *taglist); static void validate_table_rewrite_tags(const char *filtervar, List *taglist); static void EventTriggerInvoke(List *fn_oid_list, EventTriggerData *trigdata); +static bool obtain_object_name_namespace(const ObjectAddress *object, + SQLDropObject *obj); static const char *stringify_grant_objtype(ObjectType objtype); static const char *stringify_adefprivs_objtype(ObjectType objtype); static void 
SetDatabaseHasLoginEventTriggers(void); @@ -360,7 +364,7 @@ filter_list_to_array(List *filterlist) int i = 0, l = list_length(filterlist); - data = (Datum *) palloc(l * sizeof(Datum)); + data = palloc_array(Datum, l); foreach(lc, filterlist) { @@ -1280,34 +1284,179 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no Assert(EventTriggerSupportsObject(object)); - /* don't report temp schemas except my own */ - if (object->classId == NamespaceRelationId && - (isAnyTempNamespace(object->objectId) && - !isTempNamespace(object->objectId))) - return; - oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); - obj = palloc0(sizeof(SQLDropObject)); + obj = palloc0_object(SQLDropObject); obj->address = *object; obj->original = original; obj->normal = normal; + if (object->classId == NamespaceRelationId) + { + /* Special handling is needed for temp namespaces */ + if (isTempNamespace(object->objectId)) + obj->istemp = true; + else if (isAnyTempNamespace(object->objectId)) + { + /* don't report temp schemas except my own */ + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + obj->objname = get_namespace_name(object->objectId); + } + else if (object->classId == AttrDefaultRelationId) + { + /* We treat a column default as temp if its table is temp */ + ObjectAddress colobject; + + colobject = GetAttrDefaultColumnAddress(object->objectId); + if (OidIsValid(colobject.objectId)) + { + if (!obtain_object_name_namespace(&colobject, obj)) + { + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + } + } + else if (object->classId == TriggerRelationId) + { + /* Similarly, a trigger is temp if its table is temp */ + /* Sadly, there's no lsyscache.c support for trigger objects */ + Relation pg_trigger_rel; + ScanKeyData skey[1]; + SysScanDesc sscan; + HeapTuple tuple; + Oid relid; + + /* Fetch the trigger's table OID the hard way */ + pg_trigger_rel = table_open(TriggerRelationId, AccessShareLock); + ScanKeyInit(&skey[0], + Anum_pg_trigger_oid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(object->objectId)); + sscan = systable_beginscan(pg_trigger_rel, TriggerOidIndexId, true, + NULL, 1, skey); + tuple = systable_getnext(sscan); + if (HeapTupleIsValid(tuple)) + relid = ((Form_pg_trigger) GETSTRUCT(tuple))->tgrelid; + else + relid = InvalidOid; /* shouldn't happen */ + systable_endscan(sscan); + table_close(pg_trigger_rel, AccessShareLock); + /* Do nothing if we didn't find the trigger */ + if (OidIsValid(relid)) + { + ObjectAddress relobject; + + relobject.classId = RelationRelationId; + relobject.objectId = relid; + /* Arbitrarily set objectSubId nonzero so as not to fill objname */ + relobject.objectSubId = 1; + if (!obtain_object_name_namespace(&relobject, obj)) + { + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + } + } + else if (object->classId == PolicyRelationId) + { + /* Similarly, a policy is temp if its table is temp */ + /* Sadly, there's no lsyscache.c support for policy objects */ + Relation pg_policy_rel; + ScanKeyData skey[1]; + SysScanDesc sscan; + HeapTuple tuple; + Oid relid; + + /* Fetch the policy's table OID the hard way */ + pg_policy_rel = table_open(PolicyRelationId, AccessShareLock); + ScanKeyInit(&skey[0], + Anum_pg_policy_oid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(object->objectId)); + sscan = systable_beginscan(pg_policy_rel, PolicyOidIndexId, true, + NULL, 1, skey); + tuple = systable_getnext(sscan); + if (HeapTupleIsValid(tuple)) + relid = ((Form_pg_policy) GETSTRUCT(tuple))->polrelid; + 
else + relid = InvalidOid; /* shouldn't happen */ + systable_endscan(sscan); + table_close(pg_policy_rel, AccessShareLock); + /* Do nothing if we didn't find the policy */ + if (OidIsValid(relid)) + { + ObjectAddress relobject; + + relobject.classId = RelationRelationId; + relobject.objectId = relid; + /* Arbitrarily set objectSubId nonzero so as not to fill objname */ + relobject.objectSubId = 1; + if (!obtain_object_name_namespace(&relobject, obj)) + { + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + } + } + else + { + /* Generic handling for all other object classes */ + if (!obtain_object_name_namespace(object, obj)) + { + /* don't report temp objects except my own */ + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + } + + /* object identity, objname and objargs */ + obj->objidentity = + getObjectIdentityParts(&obj->address, &obj->addrnames, &obj->addrargs, + false); + + /* object type */ + obj->objecttype = getObjectTypeDescription(&obj->address, false); + + slist_push_head(&(currentEventTriggerState->SQLDropList), &obj->next); + + MemoryContextSwitchTo(oldcxt); +} + +/* + * Fill obj->objname, obj->schemaname, and obj->istemp based on object. + * + * Returns true if this object should be reported, false if it should + * be ignored because it is a temporary object of another session. + */ +static bool +obtain_object_name_namespace(const ObjectAddress *object, SQLDropObject *obj) +{ /* * Obtain schema names from the object's catalog tuple, if one exists; * this lets us skip objects in temp schemas. We trust that * ObjectProperty contains all object classes that can be * schema-qualified. + * + * Currently, this function does nothing for object classes that are not + * in ObjectProperty, but we might sometime add special cases for that. 
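+ *
+ * Illustrative call pattern (a sketch mirroring the call sites above):
+ * a false return means the object is another session's temp object, so
+ * the caller discards it without reporting it:
+ *
+ *     if (!obtain_object_name_namespace(object, obj))
+ *     {
+ *         pfree(obj);
+ *         MemoryContextSwitchTo(oldcxt);
+ *         return;
+ *     }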
*/ if (is_objectclass_supported(object->classId)) { Relation catalog; HeapTuple tuple; - catalog = table_open(obj->address.classId, AccessShareLock); + catalog = table_open(object->classId, AccessShareLock); tuple = get_catalog_object_by_oid(catalog, get_object_attnum_oid(object->classId), - obj->address.objectId); + object->objectId); if (tuple) { @@ -1315,7 +1464,7 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no Datum datum; bool isnull; - attnum = get_object_attnum_namespace(obj->address.classId); + attnum = get_object_attnum_namespace(object->classId); if (attnum != InvalidAttrNumber) { datum = heap_getattr(tuple, attnum, @@ -1333,10 +1482,9 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no } else if (isAnyTempNamespace(namespaceId)) { - pfree(obj); + /* no need to fill any fields of *obj */ table_close(catalog, AccessShareLock); - MemoryContextSwitchTo(oldcxt); - return; + return false; } else { @@ -1346,10 +1494,10 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no } } - if (get_object_namensp_unique(obj->address.classId) && - obj->address.objectSubId == 0) + if (get_object_namensp_unique(object->classId) && + object->objectSubId == 0) { - attnum = get_object_attnum_name(obj->address.classId); + attnum = get_object_attnum_name(object->classId); if (attnum != InvalidAttrNumber) { datum = heap_getattr(tuple, attnum, @@ -1362,24 +1510,8 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no table_close(catalog, AccessShareLock); } - else - { - if (object->classId == NamespaceRelationId && - isTempNamespace(object->objectId)) - obj->istemp = true; - } - - /* object identity, objname and objargs */ - obj->objidentity = - getObjectIdentityParts(&obj->address, &obj->addrnames, &obj->addrargs, - false); - - /* object type */ - obj->objecttype = getObjectTypeDescription(&obj->address, false); - slist_push_head(&(currentEventTriggerState->SQLDropList), &obj->next); - - MemoryContextSwitchTo(oldcxt); + return true; } /* @@ -1594,7 +1726,7 @@ EventTriggerCollectSimpleCommand(ObjectAddress address, oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); - command = palloc(sizeof(CollectedCommand)); + command = palloc_object(CollectedCommand); command->type = SCT_Simple; command->in_extension = creating_extension; @@ -1630,7 +1762,7 @@ EventTriggerAlterTableStart(Node *parsetree) oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); - command = palloc(sizeof(CollectedCommand)); + command = palloc_object(CollectedCommand); command->type = SCT_AlterTable; command->in_extension = creating_extension; @@ -1686,7 +1818,7 @@ EventTriggerCollectAlterTableSubcmd(Node *subcmd, ObjectAddress address) oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); - newsub = palloc(sizeof(CollectedATSubcmd)); + newsub = palloc_object(CollectedATSubcmd); newsub->address = address; newsub->parsetree = copyObject(subcmd); @@ -1760,7 +1892,7 @@ EventTriggerCollectGrant(InternalGrant *istmt) /* * This is tedious, but necessary. 
*/ - icopy = palloc(sizeof(InternalGrant)); + icopy = palloc_object(InternalGrant); memcpy(icopy, istmt, sizeof(InternalGrant)); icopy->objects = list_copy(istmt->objects); icopy->grantees = list_copy(istmt->grantees); @@ -1769,7 +1901,7 @@ EventTriggerCollectGrant(InternalGrant *istmt) icopy->col_privs = lappend(icopy->col_privs, copyObject(lfirst(cell))); /* Now collect it, using the copied InternalGrant */ - command = palloc(sizeof(CollectedCommand)); + command = palloc_object(CollectedCommand); command->type = SCT_Grant; command->in_extension = creating_extension; command->d.grant.istmt = icopy; @@ -1800,7 +1932,7 @@ EventTriggerCollectAlterOpFam(AlterOpFamilyStmt *stmt, Oid opfamoid, oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); - command = palloc(sizeof(CollectedCommand)); + command = palloc_object(CollectedCommand); command->type = SCT_AlterOpFamily; command->in_extension = creating_extension; ObjectAddressSet(command->d.opfam.address, @@ -1833,7 +1965,7 @@ EventTriggerCollectCreateOpClass(CreateOpClassStmt *stmt, Oid opcoid, oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); - command = palloc0(sizeof(CollectedCommand)); + command = palloc0_object(CollectedCommand); command->type = SCT_CreateOpClass; command->in_extension = creating_extension; ObjectAddressSet(command->d.createopc.address, @@ -1867,12 +1999,12 @@ EventTriggerCollectAlterTSConfig(AlterTSConfigurationStmt *stmt, Oid cfgId, oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); - command = palloc0(sizeof(CollectedCommand)); + command = palloc0_object(CollectedCommand); command->type = SCT_AlterTSConfig; command->in_extension = creating_extension; ObjectAddressSet(command->d.atscfg.address, TSConfigRelationId, cfgId); - command->d.atscfg.dictIds = palloc(sizeof(Oid) * ndicts); + command->d.atscfg.dictIds = palloc_array(Oid, ndicts); memcpy(command->d.atscfg.dictIds, dictIds, sizeof(Oid) * ndicts); command->d.atscfg.ndicts = ndicts; command->parsetree = (Node *) copyObject(stmt); @@ -1901,7 +2033,7 @@ EventTriggerCollectAlterDefPrivs(AlterDefaultPrivilegesStmt *stmt) oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); - command = palloc0(sizeof(CollectedCommand)); + command = palloc0_object(CollectedCommand); command->type = SCT_AlterDefaultPrivileges; command->d.defprivs.objtype = stmt->action->objtype; command->in_extension = creating_extension; @@ -2021,8 +2153,8 @@ pg_event_trigger_ddl_commands(PG_FUNCTION_ARGS) elog(ERROR, "cache lookup failed for object %u/%u", addr.classId, addr.objectId); schema_oid = - heap_getattr(objtup, nspAttnum, - RelationGetDescr(catalog), &isnull); + DatumGetObjectId(heap_getattr(objtup, nspAttnum, + RelationGetDescr(catalog), &isnull)); if (isnull) elog(ERROR, "invalid null namespace in object %u/%u/%d", diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index bfa83fbc3fec8..5a6390631eba9 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -147,6 +147,7 @@ static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); +static void show_result_replacement_info(Result *result, ExplainState *es); static void ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir, ExplainState *es); static void ExplainScanTarget(Scan *plan, ExplainState *es); @@ -350,7 +351,7 @@ standard_ExplainOneQuery(Query *query, int 
cursorOptions, INSTR_TIME_SET_CURRENT(planstart); /* plan the query */ - plan = pg_plan_query(query, queryString, cursorOptions, params); + plan = pg_plan_query(query, queryString, cursorOptions, params, es); INSTR_TIME_SET_CURRENT(planduration); INSTR_TIME_SUBTRACT(planduration, planstart); @@ -811,14 +812,10 @@ ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc) * the queryid in any of the EXPLAIN plans to keep stable the results * generated by regression test suites. */ - if (es->verbose && queryDesc->plannedstmt->queryId != UINT64CONST(0) && + if (es->verbose && queryDesc->plannedstmt->queryId != INT64CONST(0) && compute_query_id != COMPUTE_QUERY_ID_REGRESS) { - /* - * Output the queryid as an int64 rather than a uint64 so we match - * what would be seen in the BIGINT pg_stat_statements.queryid column. - */ - ExplainPropertyInteger("Query Identifier", NULL, (int64) + ExplainPropertyInteger("Query Identifier", NULL, queryDesc->plannedstmt->queryId, es); } } @@ -1233,6 +1230,10 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) *rels_used = bms_add_members(*rels_used, ((MergeAppend *) plan)->apprelids); break; + case T_Result: + *rels_used = bms_add_members(*rels_used, + ((Result *) plan)->relids); + break; default: break; } @@ -2236,6 +2237,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ancestors, es); break; case T_Result: + show_result_replacement_info(castNode(Result, plan), es); show_upper_qual((List *) ((Result *) plan)->resconstantqual, "One-Time Filter", planstate, ancestors, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); @@ -3586,6 +3588,7 @@ static void show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) { Plan *plan = ((PlanState *) mstate)->plan; + Memoize *mplan = (Memoize *) plan; ListCell *lc; List *context; StringInfoData keystr; @@ -3606,7 +3609,7 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) plan, ancestors); - foreach(lc, ((Memoize *) plan)->param_exprs) + foreach(lc, mplan->param_exprs) { Node *expr = (Node *) lfirst(lc); @@ -3622,6 +3625,24 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) pfree(keystr.data); + if (es->costs) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Estimates: capacity=%u distinct keys=%.0f lookups=%.0f hit percent=%.2f%%\n", + mplan->est_entries, mplan->est_unique_keys, + mplan->est_calls, mplan->est_hit_ratio * 100.0); + } + else + { + ExplainPropertyUInteger("Estimated Capacity", NULL, mplan->est_entries, es); + ExplainPropertyFloat("Estimated Distinct Lookup Keys", NULL, mplan->est_unique_keys, 0, es); + ExplainPropertyFloat("Estimated Lookups", NULL, mplan->est_calls, 0, es); + ExplainPropertyFloat("Estimated Hit Percent", NULL, mplan->est_hit_ratio * 100.0, 2, es); + } + } + if (!es->analyze) return; @@ -4262,7 +4283,8 @@ show_wal_usage(ExplainState *es, const WalUsage *usage) { /* Show only positive counter values. 
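 * For illustration, with the wal_fpi_bytes counter added by this patch a
 * text-format line can come out as (values invented):
 *
 *     WAL: records=5 fpi=2 bytes=1024 fpi bytes=16384 buffers full=1
 *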
*/ if ((usage->wal_records > 0) || (usage->wal_fpi > 0) || - (usage->wal_bytes > 0) || (usage->wal_buffers_full > 0)) + (usage->wal_bytes > 0) || (usage->wal_buffers_full > 0) || + (usage->wal_fpi_bytes > 0)) { ExplainIndentText(es); appendStringInfoString(es->str, "WAL:"); @@ -4276,6 +4298,9 @@ show_wal_usage(ExplainState *es, const WalUsage *usage) if (usage->wal_bytes > 0) appendStringInfo(es->str, " bytes=%" PRIu64, usage->wal_bytes); + if (usage->wal_fpi_bytes > 0) + appendStringInfo(es->str, " fpi bytes=%" PRIu64, + usage->wal_fpi_bytes); if (usage->wal_buffers_full > 0) appendStringInfo(es->str, " buffers full=%" PRId64, usage->wal_buffers_full); @@ -4290,6 +4315,8 @@ show_wal_usage(ExplainState *es, const WalUsage *usage) usage->wal_fpi, es); ExplainPropertyUInteger("WAL Bytes", NULL, usage->wal_bytes, es); + ExplainPropertyUInteger("WAL FPI Bytes", NULL, + usage->wal_fpi_bytes, es); ExplainPropertyInteger("WAL Buffers Full", NULL, usage->wal_buffers_full, es); } @@ -4735,6 +4762,102 @@ show_modifytable_info(ModifyTableState *mtstate, List *ancestors, ExplainCloseGroup("Target Tables", "Target Tables", false, es); } +/* + * Explain what a "Result" node replaced. + */ +static void +show_result_replacement_info(Result *result, ExplainState *es) +{ + StringInfoData buf; + int nrels = 0; + int rti = -1; + bool found_non_result = false; + char *replacement_type = "???"; + + /* If the Result node has a subplan, it didn't replace anything. */ + if (result->plan.lefttree != NULL) + return; + + /* Gating result nodes should have a subplan, and we don't. */ + Assert(result->result_type != RESULT_TYPE_GATING); + + switch (result->result_type) + { + case RESULT_TYPE_GATING: + replacement_type = "Gating"; + break; + case RESULT_TYPE_SCAN: + replacement_type = "Scan"; + break; + case RESULT_TYPE_JOIN: + replacement_type = "Join"; + break; + case RESULT_TYPE_UPPER: + /* a small white lie */ + replacement_type = "Aggregate"; + break; + case RESULT_TYPE_MINMAX: + replacement_type = "MinMaxAggregate"; + break; + } + + /* + * Build up a comma-separated list of user-facing names for the range + * table entries in the relids set. + */ + initStringInfo(&buf); + while ((rti = bms_next_member(result->relids, rti)) >= 0) + { + RangeTblEntry *rte = rt_fetch(rti, es->rtable); + char *refname; + + /* + * add_outer_joins_to_relids will add join RTIs to the relids set of a + * join; if that join is then replaced with a Result node, we may see + * such RTIs here. But we want to completely ignore those here, + * because "a LEFT JOIN b ON whatever" is a join between a and b, not + * a join between a, b, and an unnamed join. + */ + if (rte->rtekind == RTE_JOIN) + continue; + + /* Count the number of rels that aren't ignored completely. */ + ++nrels; + + /* Work out what reference name to use and add it to the string. */ + refname = (char *) list_nth(es->rtable_names, rti - 1); + if (refname == NULL) + refname = rte->eref->aliasname; + if (buf.len > 0) + appendStringInfoString(&buf, ", "); + appendStringInfoString(&buf, refname); + + /* Keep track of whether we see anything other than RTE_RESULT. */ + if (rte->rtekind != RTE_RESULT) + found_non_result = true; + } + + /* + * If this Result node is because of a single RTE that is RTE_RESULT, it + * is not really replacing anything at all, because there's no other + * method for implementing a scan of such an RTE, so we don't display the + * Replaces line in such cases. 
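+ *
+ * For illustration (names and values invented): a Result standing in
+ * for a join of t1 and t2 that was proven empty is annotated as
+ *
+ *     Replaces: Join on t1, t2
+ *
+ * while a Result that replaced a degenerate aggregation path shows just
+ *
+ *     Replaces: Aggregate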
+ */ + if (nrels <= 1 && !found_non_result && + result->result_type == RESULT_TYPE_SCAN) + return; + + /* Say what we replaced, with list of rels if available. */ + if (buf.len == 0) + ExplainPropertyText("Replaces", replacement_type, es); + else + { + char *s = psprintf("%s on %s", replacement_type, buf.data); + + ExplainPropertyText("Replaces", s, es); + } +} + /* * Explain the constituent plans of an Append, MergeAppend, * BitmapAnd, or BitmapOr node. @@ -4784,6 +4907,7 @@ ExplainSubPlans(List *plans, List *ancestors, { SubPlanState *sps = (SubPlanState *) lfirst(lst); SubPlan *sp = sps->subplan; + char *cooked_plan_name; /* * There can be multiple SubPlan nodes referencing the same physical @@ -4807,8 +4931,20 @@ ExplainSubPlans(List *plans, List *ancestors, */ ancestors = lcons(sp, ancestors); + /* + * The plan has a name like exists_1 or rowcompare_2, but here we want + * to prefix that with CTE, InitPlan, or SubPlan, as appropriate, for + * display purposes. + */ + if (sp->subLinkType == CTE_SUBLINK) + cooked_plan_name = psprintf("CTE %s", sp->plan_name); + else if (sp->isInitPlan) + cooked_plan_name = psprintf("InitPlan %s", sp->plan_name); + else + cooked_plan_name = psprintf("SubPlan %s", sp->plan_name); + ExplainNode(sps->planstate, ancestors, - relationship, sp->plan_name, es); + relationship, cooked_plan_name, es); ancestors = list_delete_first(ancestors); } @@ -4844,7 +4980,7 @@ ExplainCreateWorkersState(int num_workers) { ExplainWorkersState *wstate; - wstate = (ExplainWorkersState *) palloc(sizeof(ExplainWorkersState)); + wstate = palloc_object(ExplainWorkersState); wstate->num_workers = num_workers; wstate->worker_inited = (bool *) palloc0(num_workers * sizeof(bool)); wstate->worker_str = (StringInfoData *) diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index 5715546cf437b..5833aa80fb5b2 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994-5, Regents of the University of California * * IDENTIFICATION - * src/backend/commands/explain.c + * src/backend/commands/explain_dr.c * *------------------------------------------------------------------------- */ @@ -19,6 +19,7 @@ #include "libpq/pqformat.h" #include "libpq/protocol.h" #include "utils/lsyscache.h" +#include "varatt.h" /* * DestReceiver functions for SERIALIZE option @@ -275,7 +276,7 @@ CreateExplainSerializeDestReceiver(ExplainState *es) { SerializeDestReceiver *self; - self = (SerializeDestReceiver *) palloc0(sizeof(SerializeDestReceiver)); + self = palloc0_object(SerializeDestReceiver); self->pub.receiveSlot = serializeAnalyzeReceive; self->pub.rStartup = serializeAnalyzeStartup; diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c index 60d98d63a62e2..a6623f8fa5295 100644 --- a/src/backend/commands/explain_state.c +++ b/src/backend/commands/explain_state.c @@ -60,7 +60,7 @@ static int ExplainExtensionOptionsAllocated = 0; ExplainState * NewExplainState(void) { - ExplainState *es = (ExplainState *) palloc0(sizeof(ExplainState)); + ExplainState *es = palloc0_object(ExplainState); /* Set default options (most fields can be left as zeroes). 
*/ es->costs = true; @@ -130,8 +130,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate) else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized value for EXPLAIN option \"%s\": \"%s\"", - opt->defname, p), + errmsg("unrecognized value for %s option \"%s\": \"%s\"", + "EXPLAIN", opt->defname, p), parser_errposition(pstate, opt->location))); } else @@ -155,15 +155,15 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate) else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized value for EXPLAIN option \"%s\": \"%s\"", - opt->defname, p), + errmsg("unrecognized value for %s option \"%s\": \"%s\"", + "EXPLAIN", opt->defname, p), parser_errposition(pstate, opt->location))); } else if (!ApplyExtensionExplainOption(es, opt, pstate)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized EXPLAIN option \"%s\"", - opt->defname), + errmsg("unrecognized %s option \"%s\"", + "EXPLAIN", opt->defname), parser_errposition(pstate, opt->location))); } @@ -195,7 +195,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate) if (es->generic && es->analyze) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("EXPLAIN options ANALYZE and GENERIC_PLAN cannot be used together"))); + errmsg("%s options %s and %s cannot be used together", + "EXPLAIN", "ANALYZE", "GENERIC_PLAN"))); /* if the summary was not set explicitly, set default value */ es->summary = (summary_set) ? es->summary : es->analyze; @@ -281,7 +282,8 @@ SetExplainExtensionState(ExplainState *es, int extension_id, void *opaque) /* If there is no array yet, create one. */ if (es->extension_state == NULL) { - es->extension_state_allocated = 16; + es->extension_state_allocated = + Max(16, pg_nextpower2_32(extension_id + 1)); es->extension_state = palloc0(es->extension_state_allocated * sizeof(void *)); } @@ -291,11 +293,8 @@ SetExplainExtensionState(ExplainState *es, int extension_id, void *opaque) { int i; - i = pg_nextpower2_32(es->extension_state_allocated + 1); - es->extension_state = (void **) - repalloc0(es->extension_state, - es->extension_state_allocated * sizeof(void *), - i * sizeof(void *)); + i = pg_nextpower2_32(extension_id + 1); + es->extension_state = repalloc0_array(es->extension_state, void *, es->extension_state_allocated, i); es->extension_state_allocated = i; } diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index e6f9ab6dfd66b..c43b74e319e80 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -724,7 +724,7 @@ read_extension_aux_control_file(const ExtensionControlFile *pcontrol, /* * Flat-copy the struct. Pointer fields share values with original. 
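 * (Illustrative note, not in the original comment: after the flat copy
 * below, pointer members alias the original's strings, e.g.
 *
 *     Assert(acontrol->name == pcontrol->name);
 *
 * would hold, so the copy must not outlive the original's allocations.)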
*/ - acontrol = (ExtensionControlFile *) palloc(sizeof(ExtensionControlFile)); + acontrol = palloc_object(ExtensionControlFile); memcpy(acontrol, pcontrol, sizeof(ExtensionControlFile)); /* @@ -931,7 +931,7 @@ execute_sql_string(const char *sql, const char *filename) callback_arg.stmt_len = -1; scripterrcontext.callback = script_error_callback; - scripterrcontext.arg = (void *) &callback_arg; + scripterrcontext.arg = &callback_arg; scripterrcontext.previous = error_context_stack; error_context_stack = &scripterrcontext; @@ -1349,7 +1349,7 @@ get_ext_ver_info(const char *versionname, List **evi_list) return evi; } - evi = (ExtensionVersionInfo *) palloc(sizeof(ExtensionVersionInfo)); + evi = palloc_object(ExtensionVersionInfo); evi->name = pstrdup(versionname); evi->reachable = NIL; evi->installable = false; @@ -2208,6 +2208,7 @@ pg_available_extensions(PG_FUNCTION_ARGS) List *locations; DIR *dir; struct dirent *de; + List *found_ext = NIL; /* Build tuplestore to hold the result rows */ InitMaterializedSRF(fcinfo, 0); @@ -2232,6 +2233,7 @@ pg_available_extensions(PG_FUNCTION_ARGS) { ExtensionControlFile *control; char *extname; + String *extname_str; Datum values[3]; bool nulls[3]; @@ -2246,6 +2248,16 @@ pg_available_extensions(PG_FUNCTION_ARGS) if (strstr(extname, "--")) continue; + /* + * Ignore already-found names. They are not reachable by the + * path search, so don't show them. + */ + extname_str = makeString(extname); + if (list_member(found_ext, extname_str)) + continue; + else + found_ext = lappend(found_ext, extname_str); + control = new_ExtensionControlFile(extname); control->control_dir = pstrdup(location); parse_extension_control_file(control, NULL); @@ -2294,6 +2306,7 @@ pg_available_extension_versions(PG_FUNCTION_ARGS) List *locations; DIR *dir; struct dirent *de; + List *found_ext = NIL; /* Build tuplestore to hold the result rows */ InitMaterializedSRF(fcinfo, 0); @@ -2318,6 +2331,7 @@ pg_available_extension_versions(PG_FUNCTION_ARGS) { ExtensionControlFile *control; char *extname; + String *extname_str; if (!is_extension_control_filename(de->d_name)) continue; @@ -2330,6 +2344,16 @@ pg_available_extension_versions(PG_FUNCTION_ARGS) if (strstr(extname, "--")) continue; + /* + * Ignore already-found names. They are not reachable by the + * path search, so don't show them.
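+ *
+ * For example (hypothetical paths): if two directories on the search
+ * path both provide hstore.control,
+ *
+ *     /usr/local/share/ext/hstore.control    first hit, reported
+ *     /usr/share/ext/hstore.control          shadowed, skipped
+ *
+ * then only the first copy is listed, matching the one that a path
+ * lookup would actually find.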
+ */ + extname_str = makeString(extname); + if (list_member(found_ext, extname_str)) + continue; + else + found_ext = lappend(found_ext, extname_str); + /* read the control file */ control = new_ExtensionControlFile(extname); control->control_dir = pstrdup(location); diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index c14e038d54f14..536065dc515bc 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -71,15 +71,26 @@ optionListToArray(List *options) foreach(cell, options) { DefElem *def = lfirst(cell); + const char *name; const char *value; Size len; text *t; + name = def->defname; value = defGetString(def); - len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value); + + /* Insist that name not contain "=", else "a=b=c" is ambiguous */ + if (strchr(name, '=') != NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid option name \"%s\": must not contain \"=\"", + name))); + + len = VARHDRSZ + strlen(name) + 1 + strlen(value); + /* +1 leaves room for sprintf's trailing null */ t = palloc(len + 1); SET_VARSIZE(t, len); - sprintf(VARDATA(t), "%s=%s", def->defname, value); + sprintf(VARDATA(t), "%s=%s", name, value); astate = accumArrayResult(astate, PointerGetDatum(t), false, TEXTOID, @@ -621,7 +632,7 @@ CreateForeignDataWrapper(ParseState *pstate, CreateFdwStmt *stmt) stmt->options, fdwvalidator); - if (PointerIsValid(DatumGetPointer(fdwoptions))) + if (DatumGetPointer(fdwoptions) != NULL) values[Anum_pg_foreign_data_wrapper_fdwoptions - 1] = fdwoptions; else nulls[Anum_pg_foreign_data_wrapper_fdwoptions - 1] = true; @@ -772,7 +783,7 @@ AlterForeignDataWrapper(ParseState *pstate, AlterFdwStmt *stmt) stmt->options, fdwvalidator); - if (PointerIsValid(DatumGetPointer(datum))) + if (DatumGetPointer(datum) != NULL) repl_val[Anum_pg_foreign_data_wrapper_fdwoptions - 1] = datum; else repl_null[Anum_pg_foreign_data_wrapper_fdwoptions - 1] = true; @@ -932,7 +943,7 @@ CreateForeignServer(CreateForeignServerStmt *stmt) stmt->options, fdw->fdwvalidator); - if (PointerIsValid(DatumGetPointer(srvoptions))) + if (DatumGetPointer(srvoptions) != NULL) values[Anum_pg_foreign_server_srvoptions - 1] = srvoptions; else nulls[Anum_pg_foreign_server_srvoptions - 1] = true; @@ -1040,7 +1051,7 @@ AlterForeignServer(AlterForeignServerStmt *stmt) stmt->options, fdw->fdwvalidator); - if (PointerIsValid(DatumGetPointer(datum))) + if (DatumGetPointer(datum) != NULL) repl_val[Anum_pg_foreign_server_srvoptions - 1] = datum; else repl_null[Anum_pg_foreign_server_srvoptions - 1] = true; @@ -1176,7 +1187,7 @@ CreateUserMapping(CreateUserMappingStmt *stmt) stmt->options, fdw->fdwvalidator); - if (PointerIsValid(DatumGetPointer(useoptions))) + if (DatumGetPointer(useoptions) != NULL) values[Anum_pg_user_mapping_umoptions - 1] = useoptions; else nulls[Anum_pg_user_mapping_umoptions - 1] = true; @@ -1290,7 +1301,7 @@ AlterUserMapping(AlterUserMappingStmt *stmt) stmt->options, fdw->fdwvalidator); - if (PointerIsValid(DatumGetPointer(datum))) + if (DatumGetPointer(datum) != NULL) repl_val[Anum_pg_user_mapping_umoptions - 1] = datum; else repl_null[Anum_pg_user_mapping_umoptions - 1] = true; @@ -1453,7 +1464,7 @@ CreateForeignTable(CreateForeignTableStmt *stmt, Oid relid) stmt->options, fdw->fdwvalidator); - if (PointerIsValid(DatumGetPointer(ftoptions))) + if (DatumGetPointer(ftoptions) != NULL) values[Anum_pg_foreign_table_ftoptions - 1] = ftoptions; else nulls[Anum_pg_foreign_table_ftoptions - 1] = true; @@ -1577,6 +1588,7 
@@ ImportForeignSchema(ImportForeignSchemaStmt *stmt) pstmt->utilityStmt = (Node *) cstmt; pstmt->stmt_location = rs->stmt_location; pstmt->stmt_len = rs->stmt_len; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* Execute statement */ ProcessUtility(pstmt, cmd, false, diff --git a/src/backend/commands/functioncmds.c b/src/backend/commands/functioncmds.c index 0335e982b318b..8a435cd93dbf7 100644 --- a/src/backend/commands/functioncmds.c +++ b/src/backend/commands/functioncmds.c @@ -153,6 +153,8 @@ compute_return_type(TypeName *returnType, Oid languageOid, address = TypeShellMake(typname, namespaceId, GetUserId()); rettype = address.objectId; Assert(OidIsValid(rettype)); + /* Ensure the new shell type is visible to ProcedureCreate */ + CommandCounterIncrement(); } aclresult = object_aclcheck(TypeRelationId, rettype, GetUserId(), ACL_USAGE); @@ -911,7 +913,7 @@ interpret_AS_clause(Oid languageOid, const char *languageName, { SQLFunctionParseInfoPtr pinfo; - pinfo = (SQLFunctionParseInfoPtr) palloc0(sizeof(SQLFunctionParseInfo)); + pinfo = palloc0_object(SQLFunctionParseInfo); pinfo->fname = funcname; pinfo->nargs = list_length(parameterTypes); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index d962fe392cd27..d9cccb6ac1885 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -38,7 +38,6 @@ #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" #include "commands/comment.h" -#include "commands/dbcommands.h" #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/progress.h" @@ -1790,6 +1789,7 @@ DefineIndex(Oid tableId, * before the reference snap was taken, we have to wait out any * transactions that might have older snapshots. */ + INJECTION_POINT("define-index-before-set-valid", NULL); pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_3); WaitForOlderSnapshots(limitXmin, true); @@ -2469,8 +2469,8 @@ GetOperatorFromCompareType(Oid opclass, Oid rhstype, CompareType cmptype, cmptype == COMPARE_EQ ? errmsg("could not identify an equality operator for type %s", format_type_be(opcintype)) : cmptype == COMPARE_OVERLAP ? errmsg("could not identify an overlaps operator for type %s", format_type_be(opcintype)) : cmptype == COMPARE_CONTAINED_BY ? errmsg("could not identify a contained-by operator for type %s", format_type_be(opcintype)) : 0, - errdetail("Could not translate compare type %d for operator family \"%s\", input type %s, access method \"%s\".", - cmptype, get_opfamily_name(opfamily, false), format_type_be(opcintype), get_am_name(amid))); + errdetail("Could not translate compare type %d for operator family \"%s\" of access method \"%s\".", + cmptype, get_opfamily_name(opfamily, false), get_am_name(amid))); /* * We parameterize rhstype so foreign keys can ask for a <@ operator @@ -2592,7 +2592,9 @@ makeObjectName(const char *name1, const char *name2, const char *label) * constraint names.) * * Note: it is theoretically possible to get a collision anyway, if someone - * else chooses the same name concurrently. This is fairly unlikely to be + * else chooses the same name concurrently. We shorten the race condition + * window by checking for conflicting relations using SnapshotDirty, but + * that doesn't close the window entirely. This is fairly unlikely to be * a problem in practice, especially if one is holding an exclusive lock on * the relation identified by name1. 
However, if choosing multiple names * within a single command, you'd better create the new object and do @@ -2608,15 +2610,45 @@ ChooseRelationName(const char *name1, const char *name2, int pass = 0; char *relname = NULL; char modlabel[NAMEDATALEN]; + SnapshotData SnapshotDirty; + Relation pgclassrel; + + /* prepare to search pg_class with a dirty snapshot */ + InitDirtySnapshot(SnapshotDirty); + pgclassrel = table_open(RelationRelationId, AccessShareLock); /* try the unmodified label first */ strlcpy(modlabel, label, sizeof(modlabel)); for (;;) { + ScanKeyData key[2]; + SysScanDesc scan; + bool collides; + relname = makeObjectName(name1, name2, modlabel); - if (!OidIsValid(get_relname_relid(relname, namespaceid))) + /* is there any conflicting relation name? */ + ScanKeyInit(&key[0], + Anum_pg_class_relname, + BTEqualStrategyNumber, F_NAMEEQ, + CStringGetDatum(relname)); + ScanKeyInit(&key[1], + Anum_pg_class_relnamespace, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(namespaceid)); + + scan = systable_beginscan(pgclassrel, ClassNameNspIndexId, + true /* indexOK */ , + &SnapshotDirty, + 2, key); + + collides = HeapTupleIsValid(systable_getnext(scan)); + + systable_endscan(scan); + + /* break out of loop if no conflict */ + if (!collides) { if (!isconstraint || !ConstraintNameExists(relname, namespaceid)) @@ -2628,6 +2660,8 @@ ChooseRelationName(const char *name1, const char *name2, snprintf(modlabel, sizeof(modlabel), "%s%d", label, ++pass); } + table_close(pgclassrel, AccessShareLock); + return relname; } @@ -2809,8 +2843,8 @@ ExecReindex(ParseState *pstate, const ReindexStmt *stmt, bool isTopLevel) else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized REINDEX option \"%s\"", - opt->defname), + errmsg("unrecognized %s option \"%s\"", + "REINDEX", opt->defname), parser_errposition(pstate, opt->location))); } @@ -2943,6 +2977,7 @@ RangeVarCallbackForReindexIndex(const RangeVar *relation, struct ReindexIndexCallbackState *state = arg; LOCKMODE table_lockmode; Oid table_oid; + AclResult aclresult; /* * Lock level here should match table lock in reindex_index() for @@ -2967,43 +3002,42 @@ RangeVarCallbackForReindexIndex(const RangeVar *relation, if (!OidIsValid(relId)) return; - /* - * If the relation does exist, check whether it's an index. But note that - * the relation might have been dropped between the time we did the name - * lookup and now. In that case, there's nothing to do. - */ + /* If the relation does exist, check whether it's an index. */ relkind = get_rel_relkind(relId); - if (!relkind) - return; if (relkind != RELKIND_INDEX && relkind != RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not an index", relation->relname))); - /* Check permissions */ - table_oid = IndexGetRelation(relId, true); - if (OidIsValid(table_oid)) - { - AclResult aclresult; + /* Look up the index's table. */ + table_oid = IndexGetRelation(relId, false); - aclresult = pg_class_aclcheck(table_oid, GetUserId(), ACL_MAINTAIN); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, OBJECT_INDEX, relation->relname); - } + /* + * In the unlikely event that, upon retry, we get the same index OID with + * a different table OID, fail. RangeVarGetRelidExtended() will have + * already locked the index in this case, and it won't retry again, so we + * can't lock the newly discovered table OID without risking deadlock. 
+ * Also, while this corner case is indeed possible, it is extremely + * unlikely to happen in practice, so it's probably not worth any more + * effort than this. + */ + if (relId == oldRelId && table_oid != state->locked_table_oid) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("index \"%s\" was concurrently dropped", + relation->relname))); + + /* Check permissions. */ + aclresult = pg_class_aclcheck(table_oid, GetUserId(), ACL_MAINTAIN); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_INDEX, relation->relname); /* Lock heap before index to avoid deadlock. */ if (relId != oldRelId) { - /* - * If the OID isn't valid, it means the index was concurrently - * dropped, which is not a problem for us; just return normally. - */ - if (OidIsValid(table_oid)) - { - LockRelationOid(table_oid, table_lockmode); - state->locked_table_oid = table_oid; - } + LockRelationOid(table_oid, table_lockmode); + state->locked_table_oid = table_oid; } } @@ -4196,6 +4230,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein * indexes with the correct names. */ + INJECTION_POINT("reindex-relation-concurrently-before-swap", NULL); StartTransactionCommand(); /* @@ -4226,7 +4261,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein false); /* - * Updating pg_index might involve TOAST table access, so ensure we + * Swapping the indexes might involve TOAST table access, so ensure we * have a valid snapshot. */ PushActiveSnapshot(GetTransactionSnapshot()); @@ -4274,6 +4309,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein * index_drop() for more details. */ + INJECTION_POINT("reindex-relation-concurrently-before-set-dead", NULL); pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_4); WaitForLockersMultiple(lockTags, AccessExclusiveLock, true); diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 27c2cb26ef5f3..a1fd4cab35b68 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -61,7 +61,6 @@ static void transientrel_shutdown(DestReceiver *self); static void transientrel_destroy(DestReceiver *self); static uint64 refresh_matview_datafill(DestReceiver *dest, Query *query, const char *queryString, bool is_create); -static char *make_temptable_name_n(char *tempname, int n); static void refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, int save_sec_context); static void refresh_by_heap_swap(Oid matviewOid, Oid OIDNewHeap, char relpersistence); @@ -211,8 +210,8 @@ RefreshMatViewByOid(Oid matviewOid, bool is_create, bool skipData, if (concurrent && skipData) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("%s and %s options cannot be used together", - "CONCURRENTLY", "WITH NO DATA"))); + errmsg("%s options %s and %s cannot be used together", + "REFRESH", "CONCURRENTLY", "WITH NO DATA"))); /* * Check that everything is correct for a refresh. Problems at this point @@ -426,7 +425,7 @@ refresh_matview_datafill(DestReceiver *dest, Query *query, CHECK_FOR_INTERRUPTS(); /* Plan the query which will generate data for the refresh. 
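 * (Illustrative note: this patch widens pg_plan_query() with a fifth
 * parameter, an optional ExplainState; judging by the call sites in
 * this patch, the amended declaration is roughly
 *
 *     PlannedStmt *pg_plan_query(Query *querytree,
 *                                const char *query_string,
 *                                int cursorOptions,
 *                                ParamListInfo boundParams,
 *                                ExplainState *es);
 *
 * and callers outside EXPLAIN, like this one, pass NULL for es.)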
*/ - plan = pg_plan_query(query, queryString, CURSOR_OPT_PARALLEL_OK, NULL); + plan = pg_plan_query(query, queryString, CURSOR_OPT_PARALLEL_OK, NULL, NULL); /* * Use a snapshot with an updated command ID to ensure this query sees @@ -464,7 +463,7 @@ refresh_matview_datafill(DestReceiver *dest, Query *query, DestReceiver * CreateTransientRelDestReceiver(Oid transientoid) { - DR_transientrel *self = (DR_transientrel *) palloc0(sizeof(DR_transientrel)); + DR_transientrel *self = palloc0_object(DR_transientrel); self->pub.receiveSlot = transientrel_receive; self->pub.rStartup = transientrel_startup; @@ -556,28 +555,6 @@ transientrel_destroy(DestReceiver *self) pfree(self); } - -/* - * Given a qualified temporary table name, append an underscore followed by - * the given integer, to make a new table name based on the old one. - * The result is a palloc'd string. - * - * As coded, this would fail to make a valid SQL name if the given name were, - * say, "FOO"."BAR". Currently, the table name portion of the input will - * never be double-quoted because it's of the form "pg_temp_NNN", cf - * make_new_heap(). But we might have to work harder someday. - */ -static char * -make_temptable_name_n(char *tempname, int n) -{ - StringInfoData namebuf; - - initStringInfo(&namebuf); - appendStringInfoString(&namebuf, tempname); - appendStringInfo(&namebuf, "_%d", n); - return namebuf.data; -} - /* * refresh_by_match_merge * @@ -620,6 +597,9 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, char *matviewname; char *tempname; char *diffname; + char *temprelname; + char *diffrelname; + char *nsp; TupleDesc tupdesc; bool foundUniqueIndex; List *indexoidlist; @@ -632,9 +612,17 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, matviewname = quote_qualified_identifier(get_namespace_name(RelationGetNamespace(matviewRel)), RelationGetRelationName(matviewRel)); tempRel = table_open(tempOid, NoLock); - tempname = quote_qualified_identifier(get_namespace_name(RelationGetNamespace(tempRel)), - RelationGetRelationName(tempRel)); - diffname = make_temptable_name_n(tempname, 2); + + /* + * Build qualified names of the temporary table and the diff table. The + * only difference between them is the "_2" suffix on the diff table name. + */ + nsp = get_namespace_name(RelationGetNamespace(tempRel)); + temprelname = RelationGetRelationName(tempRel); + diffrelname = psprintf("%s_2", temprelname); + + tempname = quote_qualified_identifier(nsp, temprelname); + diffname = quote_qualified_identifier(nsp, diffrelname); relnatts = RelationGetNumberOfAttributes(matviewRel); @@ -725,7 +713,7 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, * include all rows. 
*/ tupdesc = matviewRel->rd_att; - opUsedForQual = (Oid *) palloc0(sizeof(Oid) * relnatts); + opUsedForQual = palloc0_array(Oid, relnatts); foundUniqueIndex = false; indexoidlist = RelationGetIndexList(matviewRel); @@ -835,7 +823,8 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, if (!foundUniqueIndex) ereport(ERROR, errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not find suitable unique index on materialized view")); + errmsg("could not find suitable unique index on materialized view \"%s\"", + RelationGetRelationName(matviewRel))); appendStringInfoString(&querybuf, " AND newdata.* OPERATOR(pg_catalog.*=) mv.*) " diff --git a/src/backend/commands/meson.build b/src/backend/commands/meson.build index dd4cde41d32cc..5fc35826b1cc3 100644 --- a/src/backend/commands/meson.build +++ b/src/backend/commands/meson.build @@ -41,6 +41,7 @@ backend_sources += files( 'schemacmds.c', 'seclabel.c', 'sequence.c', + 'sequence_xlog.c', 'statscmds.c', 'subscriptioncmds.c', 'tablecmds.c', @@ -53,4 +54,5 @@ backend_sources += files( 'vacuumparallel.c', 'variable.c', 'view.c', + 'wait.c', ) diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index a6dd8eab5186b..992ae789b0062 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -523,7 +523,7 @@ DefineOpClass(CreateOpClassStmt *stmt) #endif /* Save the info */ - member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember)); + member = palloc0_object(OpFamilyMember); member->is_func = false; member->object = operOid; member->number = item->number; @@ -547,7 +547,7 @@ DefineOpClass(CreateOpClassStmt *stmt) get_func_name(funcOid)); #endif /* Save the info */ - member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember)); + member = palloc0_object(OpFamilyMember); member->is_func = true; member->object = funcOid; member->number = item->number; @@ -940,7 +940,7 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid, #endif /* Save the info */ - member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember)); + member = palloc0_object(OpFamilyMember); member->is_func = false; member->object = operOid; member->number = item->number; @@ -970,7 +970,7 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid, #endif /* Save the info */ - member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember)); + member = palloc0_object(OpFamilyMember); member->is_func = true; member->object = funcOid; member->number = item->number; @@ -1058,7 +1058,7 @@ AlterOpFamilyDrop(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid, item->number, maxOpNumber))); processTypesSpec(item->class_args, &lefttype, &righttype); /* Save the info */ - member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember)); + member = palloc0_object(OpFamilyMember); member->is_func = false; member->number = item->number; member->lefttype = lefttype; @@ -1074,7 +1074,7 @@ AlterOpFamilyDrop(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid, item->number, maxProcNumber))); processTypesSpec(item->class_args, &lefttype, &righttype); /* Save the info */ - member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember)); + member = palloc0_object(OpFamilyMember); member->is_func = true; member->number = item->number; member->lefttype = lefttype; diff --git a/src/backend/commands/policy.c b/src/backend/commands/policy.c index 83056960fe47e..5bd5f8c9968f7 100644 --- a/src/backend/commands/policy.c +++ b/src/backend/commands/policy.c @@ -144,7 +144,7 @@ policy_role_list_to_array(List *roles, int 
*num_roles) if (roles == NIL) { *num_roles = 1; - role_oids = (Datum *) palloc(*num_roles * sizeof(Datum)); + role_oids = palloc_array(Datum, *num_roles); role_oids[0] = ObjectIdGetDatum(ACL_ID_PUBLIC); return role_oids; @@ -471,7 +471,7 @@ RemoveRoleFromObjectPolicy(Oid roleid, Oid classid, Oid policy_id) * Ordinarily there'd be exactly one, but we must cope with duplicate * mentions, since CREATE/ALTER POLICY historically have allowed that. */ - role_oids = (Datum *) palloc(num_roles * sizeof(Datum)); + role_oids = palloc_array(Datum, num_roles); for (i = 0, j = 0; i < num_roles; i++) { if (roles[i] != roleid) @@ -945,7 +945,7 @@ AlterPolicy(AlterPolicyStmt *stmt) nitems = ARR_DIMS(policy_roles)[0]; - role_oids = (Datum *) palloc(nitems * sizeof(Datum)); + role_oids = palloc_array(Datum, nitems); for (i = 0; i < nitems; i++) role_oids[i] = ObjectIdGetDatum(roles[i]); diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index e7c8171c10207..ec96c2efcd36a 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -99,7 +99,8 @@ PerformCursorOpen(ParseState *pstate, DeclareCursorStmt *cstmt, ParamListInfo pa elog(ERROR, "non-SELECT statement in DECLARE CURSOR"); /* Plan the query, applying the specified options */ - plan = pg_plan_query(query, pstate->p_sourcetext, cstmt->options, params); + plan = pg_plan_query(query, pstate->p_sourcetext, cstmt->options, params, + NULL); /* * Create a portal and copy the plan and query string into its memory. diff --git a/src/backend/commands/proclang.c b/src/backend/commands/proclang.c index 5036ac03639d6..d75e2fa74b297 100644 --- a/src/backend/commands/proclang.c +++ b/src/backend/commands/proclang.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/htup_details.h" #include "access/table.h" #include "catalog/catalog.h" #include "catalog/dependency.h" diff --git a/src/backend/commands/publicationcmds.c b/src/backend/commands/publicationcmds.c index 0b23d94c38e20..40a4efd7390c0 100644 --- a/src/backend/commands/publicationcmds.c +++ b/src/backend/commands/publicationcmds.c @@ -29,7 +29,6 @@ #include "catalog/pg_publication.h" #include "catalog/pg_publication_namespace.h" #include "catalog/pg_publication_rel.h" -#include "commands/dbcommands.h" #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/publicationcmds.h" @@ -848,11 +847,14 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt) aclcheck_error(aclresult, OBJECT_DATABASE, get_database_name(MyDatabaseId)); - /* FOR ALL TABLES requires superuser */ - if (stmt->for_all_tables && !superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to create FOR ALL TABLES publication"))); + /* FOR ALL TABLES and FOR ALL SEQUENCES require superuser */ + if (!superuser()) + { + if (stmt->for_all_tables || stmt->for_all_sequences) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to create a FOR ALL TABLES or ALL SEQUENCES publication")); + } rel = table_open(PublicationRelationId, RowExclusiveLock); @@ -881,11 +883,20 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt) &publish_generated_columns_given, &publish_generated_columns); + if (stmt->for_all_sequences && + (publish_given || publish_via_partition_root_given || + publish_generated_columns_given)) + ereport(NOTICE, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publication parameters are not applicable to sequence
synchronization and will be ignored for sequences")); + puboid = GetNewOidWithIndex(rel, PublicationObjectIndexId, Anum_pg_publication_oid); values[Anum_pg_publication_oid - 1] = ObjectIdGetDatum(puboid); values[Anum_pg_publication_puballtables - 1] = BoolGetDatum(stmt->for_all_tables); + values[Anum_pg_publication_puballsequences - 1] = + BoolGetDatum(stmt->for_all_sequences); values[Anum_pg_publication_pubinsert - 1] = BoolGetDatum(pubactions.pubinsert); values[Anum_pg_publication_pubupdate - 1] = @@ -915,10 +926,14 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt) /* Associate objects with the publication. */ if (stmt->for_all_tables) { - /* Invalidate relcache so that publication info is rebuilt. */ + /* + * Invalidate relcache so that publication info is rebuilt. Sequence + * publications don't require invalidation, as replica identity + * checks don't apply to sequences. + */ CacheInvalidateRelcacheAll(); } - else + else if (!stmt->for_all_sequences) { ObjectsInPublicationToOids(stmt->pubobjects, pstate, &relations, &schemaidlist); @@ -960,11 +975,16 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt) InvokeObjectPostCreateHook(PublicationRelationId, puboid, 0); - if (wal_level != WAL_LEVEL_LOGICAL) + /* + * We don't need this warning message when wal_level >= 'replica', since + * logical decoding is automatically enabled upon logical slot + * creation. + */ + if (wal_level < WAL_LEVEL_REPLICA) ereport(WARNING, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("\"wal_level\" is insufficient to publish logical changes"), - errhint("Set \"wal_level\" to \"logical\" before creating subscriptions."))); + errmsg("logical decoding must be enabled to publish logical changes"), + errhint("Before creating subscriptions, ensure that \"wal_level\" is set to \"replica\" or higher."))); return myself; } @@ -990,6 +1010,8 @@ AlterPublicationOptions(ParseState *pstate, AlterPublicationStmt *stmt, List *root_relids = NIL; ListCell *lc; + pubform = (Form_pg_publication) GETSTRUCT(tup); + parse_publication_options(pstate, stmt->options, &publish_given, &pubactions, @@ -998,7 +1020,12 @@ AlterPublicationOptions(ParseState *pstate, AlterPublicationStmt *stmt, &publish_generated_columns_given, &publish_generated_columns); - pubform = (Form_pg_publication) GETSTRUCT(tup); + if (pubform->puballsequences && + (publish_given || publish_via_partition_root_given || + publish_generated_columns_given)) + ereport(NOTICE, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publication parameters are not applicable to sequence synchronization and will be ignored for sequences")); /* * If the publication doesn't publish changes via the root partitioned @@ -1323,7 +1350,7 @@ AlterPublicationTables(AlterPublicationStmt *stmt, HeapTuple tup, */ if (!found) { - oldrel = palloc(sizeof(PublicationRelInfo)); + oldrel = palloc_object(PublicationRelInfo); oldrel->whereClause = NULL; oldrel->columns = NIL; oldrel->relation = table_open(oldrelid, @@ -1452,20 +1479,50 @@ CheckAlterPublication(AlterPublicationStmt *stmt, HeapTuple tup, * Check that user is allowed to manipulate the publication tables in * schema */ - if (schemaidlist && pubform->puballtables) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("publication \"%s\" is defined as FOR ALL TABLES", - NameStr(pubform->pubname)), - errdetail("Schemas cannot be added to or dropped from FOR ALL TABLES publications."))); + if (schemaidlist && (pubform->puballtables ||
pubform->puballsequences)) + { + if (pubform->puballtables && pubform->puballsequences) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publication \"%s\" is defined as FOR ALL TABLES, ALL SEQUENCES", + NameStr(pubform->pubname)), + errdetail("Schemas cannot be added to or dropped from FOR ALL TABLES, ALL SEQUENCES publications.")); + else if (pubform->puballtables) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publication \"%s\" is defined as FOR ALL TABLES", + NameStr(pubform->pubname)), + errdetail("Schemas cannot be added to or dropped from FOR ALL TABLES publications.")); + else + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publication \"%s\" is defined as FOR ALL SEQUENCES", + NameStr(pubform->pubname)), + errdetail("Schemas cannot be added to or dropped from FOR ALL SEQUENCES publications.")); + } /* Check that user is allowed to manipulate the publication tables. */ - if (tables && pubform->puballtables) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("publication \"%s\" is defined as FOR ALL TABLES", - NameStr(pubform->pubname)), - errdetail("Tables cannot be added to or dropped from FOR ALL TABLES publications."))); + if (tables && (pubform->puballtables || pubform->puballsequences)) + { + if (pubform->puballtables && pubform->puballsequences) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publication \"%s\" is defined as FOR ALL TABLES, ALL SEQUENCES", + NameStr(pubform->pubname)), + errdetail("Tables or sequences cannot be added to or dropped from FOR ALL TABLES, ALL SEQUENCES publications.")); + else if (pubform->puballtables) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publication \"%s\" is defined as FOR ALL TABLES", + NameStr(pubform->pubname)), + errdetail("Tables or sequences cannot be added to or dropped from FOR ALL TABLES publications.")); + else + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publication \"%s\" is defined as FOR ALL SEQUENCES", + NameStr(pubform->pubname)), + errdetail("Tables or sequences cannot be added to or dropped from FOR ALL SEQUENCES publications.")); + } } /* @@ -1705,7 +1762,7 @@ OpenTableList(List *tables) continue; } - pub_rel = palloc(sizeof(PublicationRelInfo)); + pub_rel = palloc_object(PublicationRelInfo); pub_rel->relation = rel; pub_rel->whereClause = t->whereClause; pub_rel->columns = t->columns; @@ -1774,7 +1831,7 @@ OpenTableList(List *tables) /* find_all_inheritors already got lock */ rel = table_open(childrelid, NoLock); - pub_rel = palloc(sizeof(PublicationRelInfo)); + pub_rel = palloc_object(PublicationRelInfo); pub_rel->relation = rel; /* child inherits WHERE clause from parent */ pub_rel->whereClause = t->whereClause; @@ -1856,8 +1913,6 @@ PublicationAddTables(Oid pubid, List *rels, bool if_not_exists, { ListCell *lc; - Assert(!stmt || !stmt->for_all_tables); - foreach(lc, rels) { PublicationRelInfo *pub_rel = (PublicationRelInfo *) lfirst(lc); @@ -1935,8 +1990,6 @@ PublicationAddSchemas(Oid pubid, List *schemas, bool if_not_exists, { ListCell *lc; - Assert(!stmt || !stmt->for_all_tables); - foreach(lc, schemas) { Oid schemaid = lfirst_oid(lc); @@ -2019,19 +2072,16 @@ AlterPublicationOwner_internal(Relation rel, HeapTuple tup, Oid newOwnerId) aclcheck_error(aclresult, OBJECT_DATABASE, get_database_name(MyDatabaseId)); - if (form->puballtables && !superuser_arg(newOwnerId)) - ereport(ERROR, - 
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("permission denied to change owner of publication \"%s\"", - NameStr(form->pubname)), - errhint("The owner of a FOR ALL TABLES publication must be a superuser."))); - - if (!superuser_arg(newOwnerId) && is_schema_publication(form->oid)) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("permission denied to change owner of publication \"%s\"", - NameStr(form->pubname)), - errhint("The owner of a FOR TABLES IN SCHEMA publication must be a superuser."))); + if (!superuser_arg(newOwnerId)) + { + if (form->puballtables || form->puballsequences || + is_schema_publication(form->oid)) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to change owner of publication \"%s\"", + NameStr(form->pubname)), + errhint("The owner of a FOR ALL TABLES or ALL SEQUENCES or TABLES IN SCHEMA publication must be a superuser.")); + } } form->pubowner = newOwnerId; @@ -2113,25 +2163,25 @@ AlterPublicationOwner_oid(Oid pubid, Oid newOwnerId) static char defGetGeneratedColsOption(DefElem *def) { - char *sval; + char *sval = ""; /* - * If no parameter value given, assume "stored" is meant. + * A parameter value is required. */ - if (!def->arg) - return PUBLISH_GENCOLS_STORED; - - sval = defGetString(def); + if (def->arg) + { + sval = defGetString(def); - if (pg_strcasecmp(sval, "none") == 0) - return PUBLISH_GENCOLS_NONE; - if (pg_strcasecmp(sval, "stored") == 0) - return PUBLISH_GENCOLS_STORED; + if (pg_strcasecmp(sval, "none") == 0) + return PUBLISH_GENCOLS_NONE; + if (pg_strcasecmp(sval, "stored") == 0) + return PUBLISH_GENCOLS_STORED; + } ereport(ERROR, errcode(ERRCODE_SYNTAX_ERROR), - errmsg("%s requires a \"none\" or \"stored\" value", - def->defname)); + errmsg("invalid value for publication parameter \"%s\": \"%s\"", def->defname, sval), + errdetail("Valid values are \"%s\" and \"%s\".", "none", "stored")); return PUBLISH_GENCOLS_NONE; /* keep compiler quiet */ } diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 546160f09410e..3cc1472103a7a 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -25,7 +25,6 @@ #include "catalog/pg_authid.h" #include "catalog/pg_database.h" #include "catalog/pg_namespace.h" -#include "commands/dbcommands.h" #include "commands/event_trigger.h" #include "commands/schemacmds.h" #include "miscadmin.h" @@ -34,6 +33,7 @@ #include "tcop/utility.h" #include "utils/acl.h" #include "utils/builtins.h" +#include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/syscache.h" @@ -215,6 +215,7 @@ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString, wrapper->utilityStmt = stmt; wrapper->stmt_location = stmt_location; wrapper->stmt_len = stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; /* do this step */ ProcessUtility(wrapper, diff --git a/src/backend/commands/seclabel.c b/src/backend/commands/seclabel.c index cee5d7bbb9c7e..07bed6e1487eb 100644 --- a/src/backend/commands/seclabel.c +++ b/src/backend/commands/seclabel.c @@ -573,7 +573,7 @@ register_label_provider(const char *provider_name, check_object_relabel_type hoo MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(TopMemoryContext); - provider = palloc(sizeof(LabelProvider)); + provider = palloc_object(LabelProvider); provider->provider_name = pstrdup(provider_name); provider->hook = hook; label_provider_list = lappend(label_provider_list, provider); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c 
index 451ae6f7f6940..51567994126f4 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -14,7 +14,6 @@ */ #include "postgres.h" -#include "access/bufmask.h" #include "access/htup_details.h" #include "access/multixact.h" #include "access/relation.h" @@ -22,9 +21,7 @@ #include "access/table.h" #include "access/transam.h" #include "access/xact.h" -#include "access/xlog.h" #include "access/xloginsert.h" -#include "access/xlogutils.h" #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/namespace.h" @@ -34,17 +31,20 @@ #include "catalog/storage_xlog.h" #include "commands/defrem.h" #include "commands/sequence.h" +#include "commands/sequence_xlog.h" #include "commands/tablecmds.h" #include "funcapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" #include "parser/parse_type.h" +#include "storage/bufmgr.h" #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/lsyscache.h" +#include "utils/pg_lsn.h" #include "utils/resowner.h" #include "utils/syscache.h" #include "utils/varlena.h" @@ -57,16 +57,6 @@ */ #define SEQ_LOG_VALS 32 -/* - * The "special area" of a sequence's buffer page looks like this. - */ -#define SEQ_MAGIC 0x1717 - -typedef struct sequence_magic -{ - uint32 magic; -} sequence_magic; - /* * We store a SeqTable item for every sequence we have touched in the current * session. This is needed to hold onto nextval/currval state. (We can't @@ -106,10 +96,11 @@ static Form_pg_sequence_data read_seq_tuple(Relation rel, static void init_params(ParseState *pstate, List *options, bool for_identity, bool isInit, Form_pg_sequence seqform, - Form_pg_sequence_data seqdataform, + int64 *last_value, + bool *reset_state, + bool *is_called, bool *need_seq_rewrite, List **owned_by); -static void do_setval(Oid relid, int64 next, bool iscalled); static void process_owned_by(Relation seqrel, List *owned_by, bool for_identity); @@ -121,7 +112,9 @@ ObjectAddress DefineSequence(ParseState *pstate, CreateSeqStmt *seq) { FormData_pg_sequence seqform; - FormData_pg_sequence_data seqdataform; + int64 last_value; + bool reset_state; + bool is_called; bool need_seq_rewrite; List *owned_by; CreateStmt *stmt = makeNode(CreateStmt); @@ -164,7 +157,7 @@ DefineSequence(ParseState *pstate, CreateSeqStmt *seq) /* Check and set all option values */ init_params(pstate, seq->options, seq->for_identity, true, - &seqform, &seqdataform, + &seqform, &last_value, &reset_state, &is_called, &need_seq_rewrite, &owned_by); /* @@ -179,7 +172,7 @@ DefineSequence(ParseState *pstate, CreateSeqStmt *seq) { case SEQ_COL_LASTVAL: coldef = makeColumnDef("last_value", INT8OID, -1, InvalidOid); - value[i - 1] = Int64GetDatumFast(seqdataform.last_value); + value[i - 1] = Int64GetDatumFast(last_value); break; case SEQ_COL_LOG: coldef = makeColumnDef("log_cnt", INT8OID, -1, InvalidOid); @@ -399,8 +392,7 @@ fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum) MarkBufferDirty(buf); - offnum = PageAddItem(page, (Item) tuple->t_data, tuple->t_len, - InvalidOffsetNumber, false, false); + offnum = PageAddItem(page, tuple->t_data, tuple->t_len, InvalidOffsetNumber, false, false); if (offnum != FirstOffsetNumber) elog(ERROR, "failed to add sequence tuple to page"); @@ -448,6 +440,9 @@ AlterSequence(ParseState *pstate, AlterSeqStmt *stmt) ObjectAddress address; Relation rel; HeapTuple seqtuple; + bool reset_state = false; + bool is_called; + int64 last_value; HeapTuple 
newdatatuple; /* Open and lock sequence, and check for ownership along the way. */ @@ -481,12 +476,14 @@ AlterSequence(ParseState *pstate, AlterSeqStmt *stmt) /* copy the existing sequence data tuple, so it can be modified locally */ newdatatuple = heap_copytuple(&datatuple); newdataform = (Form_pg_sequence_data) GETSTRUCT(newdatatuple); + last_value = newdataform->last_value; + is_called = newdataform->is_called; UnlockReleaseBuffer(buf); /* Check and set new values */ init_params(pstate, stmt->options, stmt->for_identity, false, - seqform, newdataform, + seqform, &last_value, &reset_state, &is_called, &need_seq_rewrite, &owned_by); /* If needed, rewrite the sequence relation itself */ @@ -513,6 +510,10 @@ AlterSequence(ParseState *pstate, AlterSeqStmt *stmt) /* * Insert the modified tuple into the new storage file. */ + newdataform->last_value = last_value; + newdataform->is_called = is_called; + if (reset_state) + newdataform->log_cnt = 0; fill_seq_with_data(seqrel, newdatatuple); } @@ -941,8 +942,8 @@ lastval(PG_FUNCTION_ARGS) * it is the only way to clear the is_called flag in an existing * sequence. */ -static void -do_setval(Oid relid, int64 next, bool iscalled) +void +SetSequence(Oid relid, int64 next, bool iscalled) { SeqTable elm; Relation seqrel; @@ -1043,7 +1044,7 @@ do_setval(Oid relid, int64 next, bool iscalled) /* * Implement the 2 arg setval procedure. - * See do_setval for discussion. + * See SetSequence for discussion. */ Datum setval_oid(PG_FUNCTION_ARGS) @@ -1051,14 +1052,14 @@ setval_oid(PG_FUNCTION_ARGS) Oid relid = PG_GETARG_OID(0); int64 next = PG_GETARG_INT64(1); - do_setval(relid, next, true); + SetSequence(relid, next, true); PG_RETURN_INT64(next); } /* * Implement the 3 arg setval procedure. - * See do_setval for discussion. + * See SetSequence for discussion. */ Datum setval3_oid(PG_FUNCTION_ARGS) @@ -1067,7 +1068,7 @@ setval3_oid(PG_FUNCTION_ARGS) int64 next = PG_GETARG_INT64(1); bool iscalled = PG_GETARG_BOOL(2); - do_setval(relid, next, iscalled); + SetSequence(relid, next, iscalled); PG_RETURN_INT64(next); } @@ -1236,17 +1237,19 @@ read_seq_tuple(Relation rel, Buffer *buf, HeapTuple seqdatatuple) /* * init_params: process the options list of CREATE or ALTER SEQUENCE, and * store the values into appropriate fields of seqform, for changes that go - * into the pg_sequence catalog, and fields of seqdataform for changes to the - * sequence relation itself. Set *need_seq_rewrite to true if we changed any - * parameters that require rewriting the sequence's relation (interesting for - * ALTER SEQUENCE). Also set *owned_by to any OWNED BY option, or to NIL if - * there is none. + * into the pg_sequence catalog, and fields for changes to the sequence + * relation itself (*is_called, *last_value and *reset_state). Set + * *need_seq_rewrite to true if we changed any parameters that require + * rewriting the sequence's relation (interesting for ALTER SEQUENCE). Also + * set *owned_by to any OWNED BY option, or to NIL if there is none. Set + * *reset_state to true if the internal state of the sequence needs to be + * reset, affecting future nextval() calls, for example with WAL logging. * * If isInit is true, fill any unspecified options with default values; * otherwise, do not change existing options that aren't explicitly overridden. 
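+ *
+ * For example, a RESTART clause sets *last_value and *reset_state and
+ * clears *is_called, whereas OWNED BY alone changes none of them.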
* * Note: we force a sequence rewrite whenever we change parameters that affect - * generation of future sequence values, even if the seqdataform per se is not + * generation of future sequence values, even if the metadata per se is not * changed. This allows ALTER SEQUENCE to behave transactionally. Currently, * the only option that doesn't cause that is OWNED BY. It's *necessary* for * ALTER SEQUENCE OWNED BY to not rewrite the sequence, because that would @@ -1257,7 +1260,9 @@ static void init_params(ParseState *pstate, List *options, bool for_identity, bool isInit, Form_pg_sequence seqform, - Form_pg_sequence_data seqdataform, + int64 *last_value, + bool *reset_state, + bool *is_called, bool *need_seq_rewrite, List **owned_by) { @@ -1363,11 +1368,11 @@ init_params(ParseState *pstate, List *options, bool for_identity, } /* - * We must reset log_cnt when isInit or when changing any parameters that - * would affect future nextval allocations. + * We must reset the state of the sequence when isInit or when changing + * any parameters that would affect future nextval allocations. */ if (isInit) - seqdataform->log_cnt = 0; + *reset_state = true; /* AS type */ if (as_type != NULL) @@ -1416,7 +1421,7 @@ init_params(ParseState *pstate, List *options, bool for_identity, ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("INCREMENT must not be zero"))); - seqdataform->log_cnt = 0; + *reset_state = true; } else if (isInit) { @@ -1428,7 +1433,7 @@ init_params(ParseState *pstate, List *options, bool for_identity, { seqform->seqcycle = boolVal(is_cycled->arg); Assert(BoolIsValid(seqform->seqcycle)); - seqdataform->log_cnt = 0; + *reset_state = true; } else if (isInit) { @@ -1439,7 +1444,7 @@ init_params(ParseState *pstate, List *options, bool for_identity, if (max_value != NULL && max_value->arg) { seqform->seqmax = defGetInt64(max_value); - seqdataform->log_cnt = 0; + *reset_state = true; } else if (isInit || max_value != NULL || reset_max_value) { @@ -1455,7 +1460,7 @@ init_params(ParseState *pstate, List *options, bool for_identity, } else seqform->seqmax = -1; /* descending seq */ - seqdataform->log_cnt = 0; + *reset_state = true; } /* Validate maximum value. No need to check INT8 as seqmax is an int64 */ @@ -1471,7 +1476,7 @@ init_params(ParseState *pstate, List *options, bool for_identity, if (min_value != NULL && min_value->arg) { seqform->seqmin = defGetInt64(min_value); - seqdataform->log_cnt = 0; + *reset_state = true; } else if (isInit || min_value != NULL || reset_min_value) { @@ -1487,7 +1492,7 @@ init_params(ParseState *pstate, List *options, bool for_identity, } else seqform->seqmin = 1; /* ascending seq */ - seqdataform->log_cnt = 0; + *reset_state = true; } /* Validate minimum value. 
No need to check INT8 as seqmin is an int64 */ @@ -1538,30 +1543,30 @@ init_params(ParseState *pstate, List *options, bool for_identity, if (restart_value != NULL) { if (restart_value->arg != NULL) - seqdataform->last_value = defGetInt64(restart_value); + *last_value = defGetInt64(restart_value); else - seqdataform->last_value = seqform->seqstart; - seqdataform->is_called = false; - seqdataform->log_cnt = 0; + *last_value = seqform->seqstart; + *is_called = false; + *reset_state = true; } else if (isInit) { - seqdataform->last_value = seqform->seqstart; - seqdataform->is_called = false; + *last_value = seqform->seqstart; + *is_called = false; } /* crosscheck RESTART (or current value, if changing MIN/MAX) */ - if (seqdataform->last_value < seqform->seqmin) + if (*last_value < seqform->seqmin) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("RESTART value (%" PRId64 ") cannot be less than MINVALUE (%" PRId64 ")", - seqdataform->last_value, + *last_value, seqform->seqmin))); - if (seqdataform->last_value > seqform->seqmax) + if (*last_value > seqform->seqmax) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("RESTART value (%" PRId64 ") cannot be greater than MAXVALUE (%" PRId64 ")", - seqdataform->last_value, + *last_value, seqform->seqmax))); /* CACHE */ @@ -1573,7 +1578,7 @@ init_params(ParseState *pstate, List *options, bool for_identity, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("CACHE (%" PRId64 ") must be greater than zero", seqform->seqcache))); - seqdataform->log_cnt = 0; + *reset_state = true; } else if (isInit) { @@ -1778,15 +1783,16 @@ pg_sequence_parameters(PG_FUNCTION_ARGS) /* - * Return the sequence tuple. + * Return the sequence tuple along with its page LSN. * - * This is primarily intended for use by pg_dump to gather sequence data - * without needing to individually query each sequence relation. + * This is primarily used by pg_dump to efficiently collect sequence data + * without querying each sequence individually, and is also leveraged by + * logical replication while synchronizing sequences. 
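+ *
+ * A usage sketch from SQL (sequence name and output values hypothetical,
+ * assuming the function's SQL-level signature takes a regclass):
+ *
+ *	SELECT * FROM pg_get_sequence_data('myseq'::regclass);
+ *	 last_value | is_called | page_lsn
+ *	------------+-----------+-----------
+ *	          1 | f         | 0/15D3450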
 */
Datum
pg_get_sequence_data(PG_FUNCTION_ARGS)
{
-#define PG_GET_SEQUENCE_DATA_COLS	2
+#define PG_GET_SEQUENCE_DATA_COLS	3
	Oid			relid = PG_GETARG_OID(0);
	SeqTable	elm;
	Relation	seqrel;
@@ -1801,6 +1807,8 @@ pg_get_sequence_data(PG_FUNCTION_ARGS)
					   INT8OID, -1, 0);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "is_called",
					   BOOLOID, -1, 0);
+	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 3, "page_lsn",
+					   LSNOID, -1, 0);
	resultTupleDesc = BlessTupleDesc(resultTupleDesc);
	init_sequence(relid, &elm, &seqrel);
@@ -1816,11 +1824,14 @@ pg_get_sequence_data(PG_FUNCTION_ARGS)
		Buffer		buf;
		HeapTupleData seqtuple;
		Form_pg_sequence_data seq;
+		Page		page;
		seq = read_seq_tuple(seqrel, &buf, &seqtuple);
+		page = BufferGetPage(buf);
		values[0] = Int64GetDatum(seq->last_value);
		values[1] = BoolGetDatum(seq->is_called);
+		values[2] = LSNGetDatum(PageGetLSN(page));
		UnlockReleaseBuffer(buf);
	}
@@ -1885,57 +1896,6 @@ pg_sequence_last_value(PG_FUNCTION_ARGS)
	PG_RETURN_NULL();
}
-
-void
-seq_redo(XLogReaderState *record)
-{
-	XLogRecPtr	lsn = record->EndRecPtr;
-	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
-	Buffer		buffer;
-	Page		page;
-	Page		localpage;
-	char	   *item;
-	Size		itemsz;
-	xl_seq_rec *xlrec = (xl_seq_rec *) XLogRecGetData(record);
-	sequence_magic *sm;
-
-	if (info != XLOG_SEQ_LOG)
-		elog(PANIC, "seq_redo: unknown op code %u", info);
-
-	buffer = XLogInitBufferForRedo(record, 0);
-	page = (Page) BufferGetPage(buffer);
-
-	/*
-	 * We always reinit the page. However, since this WAL record type is also
-	 * used for updating sequences, it's possible that a hot-standby backend
-	 * is examining the page concurrently; so we mustn't transiently trash the
-	 * buffer. The solution is to build the correct new page contents in
-	 * local workspace and then memcpy into the buffer. Then only bytes that
-	 * are supposed to change will change, even transiently. We must palloc
-	 * the local page for alignment reasons.
-	 */
-	localpage = (Page) palloc(BufferGetPageSize(buffer));
-
-	PageInit(localpage, BufferGetPageSize(buffer), sizeof(sequence_magic));
-	sm = (sequence_magic *) PageGetSpecialPointer(localpage);
-	sm->magic = SEQ_MAGIC;
-
-	item = (char *) xlrec + sizeof(xl_seq_rec);
-	itemsz = XLogRecGetDataLen(record) - sizeof(xl_seq_rec);
-
-	if (PageAddItem(localpage, (Item) item, itemsz,
-					FirstOffsetNumber, false, false) == InvalidOffsetNumber)
-		elog(PANIC, "seq_redo: failed to add item to page");
-
-	PageSetLSN(localpage, lsn);
-
-	memcpy(page, localpage, BufferGetPageSize(buffer));
-	MarkBufferDirty(buffer);
-	UnlockReleaseBuffer(buffer);
-
-	pfree(localpage);
-}
-
 /*
  * Flush cached sequence information.
  */
@@ -1950,14 +1910,3 @@ ResetSequenceCaches(void)
	last_used_seq = NULL;
}
-
-/*
- * Mask a Sequence page before performing consistency checks on it.
- */
-void
-seq_mask(char *page, BlockNumber blkno)
-{
-	mask_page_lsn_and_checksum(page);
-
-	mask_unused_space(page);
-}
diff --git a/src/backend/commands/sequence_xlog.c b/src/backend/commands/sequence_xlog.c
new file mode 100644
index 0000000000000..ffbd9820416ae
--- /dev/null
+++ b/src/backend/commands/sequence_xlog.c
@@ -0,0 +1,80 @@
+/*-------------------------------------------------------------------------
+ *
+ * sequence_xlog.c
+ *	  RMGR WAL routines for sequences.
+ * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/commands/sequence_xlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/xlogutils.h" +#include "commands/sequence_xlog.h" +#include "storage/bufmgr.h" + +void +seq_redo(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + Buffer buffer; + Page page; + Page localpage; + char *item; + Size itemsz; + xl_seq_rec *xlrec = (xl_seq_rec *) XLogRecGetData(record); + sequence_magic *sm; + + if (info != XLOG_SEQ_LOG) + elog(PANIC, "seq_redo: unknown op code %u", info); + + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + + /* + * We always reinit the page. However, since this WAL record type is also + * used for updating sequences, it's possible that a hot-standby backend + * is examining the page concurrently; so we mustn't transiently trash the + * buffer. The solution is to build the correct new page contents in + * local workspace and then memcpy into the buffer. Then only bytes that + * are supposed to change will change, even transiently. We must palloc + * the local page for alignment reasons. + */ + localpage = (Page) palloc(BufferGetPageSize(buffer)); + + PageInit(localpage, BufferGetPageSize(buffer), sizeof(sequence_magic)); + sm = (sequence_magic *) PageGetSpecialPointer(localpage); + sm->magic = SEQ_MAGIC; + + item = (char *) xlrec + sizeof(xl_seq_rec); + itemsz = XLogRecGetDataLen(record) - sizeof(xl_seq_rec); + + if (PageAddItem(localpage, item, itemsz, FirstOffsetNumber, false, false) == InvalidOffsetNumber) + elog(PANIC, "seq_redo: failed to add item to page"); + + PageSetLSN(localpage, lsn); + + memcpy(page, localpage, BufferGetPageSize(buffer)); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + + pfree(localpage); +} + +/* + * Mask a Sequence page before performing consistency checks on it. + */ +void +seq_mask(char *page, BlockNumber blkno) +{ + mask_page_lsn_and_checksum(page); + + mask_unused_space(page); +} diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index e24d540cd45ba..77b1a6e2dc51f 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/htup_details.h" #include "access/relation.h" #include "access/table.h" #include "catalog/catalog.h" @@ -59,7 +60,7 @@ compare_int16(const void *a, const void *b) * CREATE STATISTICS */ ObjectAddress -CreateStatistics(CreateStatsStmt *stmt) +CreateStatistics(CreateStatsStmt *stmt, bool check_rights) { int16 attnums[STATS_MAX_DIMENSIONS]; int nattnums = 0; @@ -134,7 +135,13 @@ CreateStatistics(CreateStatsStmt *stmt) RelationGetRelationName(rel)), errdetail_relkind_not_supported(rel->rd_rel->relkind))); - /* You must own the relation to create stats on it */ + /* + * You must own the relation to create stats on it. + * + * NB: Concurrent changes could cause this function's lookup to find a + * different relation than a previous lookup by the caller, so we must + * perform this check even when check_rights == false. 
+ */ if (!object_ownercheck(RelationRelationId, RelationGetRelid(rel), stxowner)) aclcheck_error(ACLCHECK_NOT_OWNER, get_relkind_objtype(rel->rd_rel->relkind), RelationGetRelationName(rel)); @@ -169,6 +176,21 @@ CreateStatistics(CreateStatsStmt *stmt) } namestrcpy(&stxname, namestr); + /* + * Check we have creation rights in target namespace. Skip check if + * caller doesn't want it. + */ + if (check_rights) + { + AclResult aclresult; + + aclresult = object_aclcheck(NamespaceRelationId, namespaceId, + GetUserId(), ACL_CREATE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_SCHEMA, + get_namespace_name(namespaceId)); + } + /* * Deal with the possibility that the statistics object already exists. */ diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 4aec73bcc6bbc..4efd4685abcab 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "access/commit_ts.h" #include "access/htup_details.h" #include "access/table.h" #include "access/twophase.h" @@ -29,7 +30,6 @@ #include "catalog/pg_subscription.h" #include "catalog/pg_subscription_rel.h" #include "catalog/pg_type.h" -#include "commands/dbcommands.h" #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/subscriptioncmds.h" @@ -71,8 +71,10 @@ #define SUBOPT_PASSWORD_REQUIRED 0x00000800 #define SUBOPT_RUN_AS_OWNER 0x00001000 #define SUBOPT_FAILOVER 0x00002000 -#define SUBOPT_LSN 0x00004000 -#define SUBOPT_ORIGIN 0x00008000 +#define SUBOPT_RETAIN_DEAD_TUPLES 0x00004000 +#define SUBOPT_MAX_RETENTION_DURATION 0x00008000 +#define SUBOPT_LSN 0x00010000 +#define SUBOPT_ORIGIN 0x00020000 /* check if the 'val' has 'bits' set */ #define IsSet(val, bits) (((val) & (bits)) == (bits)) @@ -98,15 +100,36 @@ typedef struct SubOpts bool passwordrequired; bool runasowner; bool failover; + bool retaindeadtuples; + int32 maxretention; char *origin; XLogRecPtr lsn; } SubOpts; -static List *fetch_table_list(WalReceiverConn *wrconn, List *publications); -static void check_publications_origin(WalReceiverConn *wrconn, - List *publications, bool copydata, - char *origin, Oid *subrel_local_oids, - int subrel_count, char *subname); +/* + * PublicationRelKind represents a relation included in a publication. + * It stores the schema-qualified relation name (rv) and its kind (relkind). 
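+ *
+ * For example (hypothetical relation), a table "public.t" would carry
+ * rv = makeRangeVar("public", "t", -1) and relkind = RELKIND_RELATION.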
+ */ +typedef struct PublicationRelKind +{ + RangeVar *rv; + char relkind; +} PublicationRelKind; + +static List *fetch_relation_list(WalReceiverConn *wrconn, List *publications); +static void check_publications_origin_tables(WalReceiverConn *wrconn, + List *publications, bool copydata, + bool retain_dead_tuples, + char *origin, + Oid *subrel_local_oids, + int subrel_count, char *subname); +static void check_publications_origin_sequences(WalReceiverConn *wrconn, + List *publications, + bool copydata, char *origin, + Oid *subrel_local_oids, + int subrel_count, + char *subname); +static void check_pub_dead_tuple_retention(WalReceiverConn *wrconn); static void check_duplicates_in_publist(List *publist, Datum *datums); static List *merge_publications(List *oldpublist, List *newpublist, bool addpub, const char *subname); static void ReportSlotConnectionError(List *rstates, Oid subid, char *slotname, char *err); @@ -162,6 +185,10 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->runasowner = false; if (IsSet(supported_opts, SUBOPT_FAILOVER)) opts->failover = false; + if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + opts->retaindeadtuples = false; + if (IsSet(supported_opts, SUBOPT_MAX_RETENTION_DURATION)) + opts->maxretention = 0; if (IsSet(supported_opts, SUBOPT_ORIGIN)) opts->origin = pstrdup(LOGICALREP_ORIGIN_ANY); @@ -210,7 +237,7 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, if (strcmp(opts->slot_name, "none") == 0) opts->slot_name = NULL; else - ReplicationSlotValidateName(opts->slot_name, ERROR); + ReplicationSlotValidateName(opts->slot_name, false, ERROR); } else if (IsSet(supported_opts, SUBOPT_COPY_DATA) && strcmp(defel->defname, "copy_data") == 0) @@ -307,6 +334,24 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->specified_opts |= SUBOPT_FAILOVER; opts->failover = defGetBoolean(defel); } + else if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES) && + strcmp(defel->defname, "retain_dead_tuples") == 0) + { + if (IsSet(opts->specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + errorConflictingDefElem(defel, pstate); + + opts->specified_opts |= SUBOPT_RETAIN_DEAD_TUPLES; + opts->retaindeadtuples = defGetBoolean(defel); + } + else if (IsSet(supported_opts, SUBOPT_MAX_RETENTION_DURATION) && + strcmp(defel->defname, "max_retention_duration") == 0) + { + if (IsSet(opts->specified_opts, SUBOPT_MAX_RETENTION_DURATION)) + errorConflictingDefElem(defel, pstate); + + opts->specified_opts |= SUBOPT_MAX_RETENTION_DURATION; + opts->maxretention = defGetInt32(defel); + } else if (IsSet(supported_opts, SUBOPT_ORIGIN) && strcmp(defel->defname, "origin") == 0) { @@ -348,7 +393,7 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, lsn = DatumGetLSN(DirectFunctionCall1(pg_lsn_in, CStringGetDatum(lsn_str))); - if (XLogRecPtrIsInvalid(lsn)) + if (!XLogRecPtrIsValid(lsn)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid WAL location (LSN): %s", lsn_str))); @@ -446,20 +491,20 @@ static void check_publications(WalReceiverConn *wrconn, List *publications) { WalRcvExecResult *res; - StringInfo cmd; + StringInfoData cmd; TupleTableSlot *slot; List *publicationsCopy = NIL; Oid tableRow[1] = {TEXTOID}; - cmd = makeStringInfo(); - appendStringInfoString(cmd, "SELECT t.pubname FROM\n" + initStringInfo(&cmd); + appendStringInfoString(&cmd, "SELECT t.pubname FROM\n" " pg_catalog.pg_publication t WHERE\n" " t.pubname IN ("); - GetPublicationsStr(publications, cmd, true); - appendStringInfoChar(cmd, 
')'); + GetPublicationsStr(publications, &cmd, true); + appendStringInfoChar(&cmd, ')'); - res = walrcv_exec(wrconn, cmd->data, 1, tableRow); - destroyStringInfo(cmd); + res = walrcv_exec(wrconn, cmd.data, 1, tableRow); + pfree(cmd.data); if (res->status != WALRCV_OK_TUPLES) ereport(ERROR, @@ -490,15 +535,17 @@ check_publications(WalReceiverConn *wrconn, List *publications) if (list_length(publicationsCopy)) { /* Prepare the list of non-existent publication(s) for error message. */ - StringInfo pubnames = makeStringInfo(); + StringInfoData pubnames; + + initStringInfo(&pubnames); - GetPublicationsStr(publicationsCopy, pubnames, false); + GetPublicationsStr(publicationsCopy, &pubnames, false); ereport(WARNING, errcode(ERRCODE_UNDEFINED_OBJECT), errmsg_plural("publication %s does not exist on the publisher", "publications %s do not exist on the publisher", list_length(publicationsCopy), - pubnames->data)); + pubnames.data)); } } @@ -519,7 +566,7 @@ publicationListToArray(List *publist) ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(memcxt); - datums = (Datum *) palloc(sizeof(Datum) * list_length(publist)); + datums = palloc_array(Datum, list_length(publist)); check_duplicates_in_publist(publist, datums); @@ -563,7 +610,9 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, SUBOPT_SYNCHRONOUS_COMMIT | SUBOPT_BINARY | SUBOPT_STREAMING | SUBOPT_TWOPHASE_COMMIT | SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | - SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | SUBOPT_ORIGIN); + SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | + SUBOPT_RETAIN_DEAD_TUPLES | + SUBOPT_MAX_RETENTION_DURATION | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); /* @@ -621,7 +670,7 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, /* Check if name is used */ subid = GetSysCacheOid2(SUBSCRIPTIONNAME, Anum_pg_subscription_oid, - MyDatabaseId, CStringGetDatum(stmt->subname)); + ObjectIdGetDatum(MyDatabaseId), CStringGetDatum(stmt->subname)); if (OidIsValid(subid)) { ereport(ERROR, @@ -630,6 +679,14 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, stmt->subname))); } + /* + * Ensure that system configuration parameters are set appropriately to + * support retain_dead_tuples and max_retention_duration. 
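+	 * (One such parameter is track_commit_timestamp; see the comments atop
+	 * CheckSubDeadTupleRetention() for the full set of requirements.)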
+	 */
+	CheckSubDeadTupleRetention(true, !opts.enabled, WARNING,
+							   opts.retaindeadtuples, opts.retaindeadtuples,
+							   (opts.maxretention > 0));
+
 	if (!IsSet(opts.specified_opts, SUBOPT_SLOT_NAME) &&
 		opts.slot_name == NULL)
 		opts.slot_name = stmt->subname;
@@ -670,6 +727,12 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt,
 	values[Anum_pg_subscription_subpasswordrequired - 1] = BoolGetDatum(opts.passwordrequired);
 	values[Anum_pg_subscription_subrunasowner - 1] = BoolGetDatum(opts.runasowner);
 	values[Anum_pg_subscription_subfailover - 1] = BoolGetDatum(opts.failover);
+	values[Anum_pg_subscription_subretaindeadtuples - 1] =
+		BoolGetDatum(opts.retaindeadtuples);
+	values[Anum_pg_subscription_submaxretention - 1] =
+		Int32GetDatum(opts.maxretention);
+	values[Anum_pg_subscription_subretentionactive - 1] =
+		BoolGetDatum(opts.retaindeadtuples);
 	values[Anum_pg_subscription_subconninfo - 1] =
 		CStringGetTextDatum(conninfo);
 	if (opts.slot_name)
@@ -692,20 +755,27 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt,
 
 	recordDependencyOnOwner(SubscriptionRelationId, subid, owner);
 
+	/*
+	 * A replication origin is currently created for all subscriptions,
+	 * including those that only contain sequences or are otherwise empty.
+	 *
+	 * XXX: While this is technically unnecessary, optimizing it would require
+	 * additional logic to skip origin creation during DDL operations and
+	 * apply worker initialization, and to handle origin creation dynamically
+	 * when tables are added to the subscription. It is not clear whether
+	 * preventing creation of origins is worth the additional complexity.
+	 */
 	ReplicationOriginNameForLogicalRep(subid, InvalidOid, originname,
 									   sizeof(originname));
 	replorigin_create(originname);
 
 	/*
 	 * Connect to remote side to execute requested commands and fetch table
-	 * info.
+	 * and sequence info.
 	 */
 	if (opts.connect)
 	{
 		char	   *err;
 		WalReceiverConn *wrconn;
-		List	   *tables;
-		ListCell   *lc;
-		char		table_state;
 		bool		must_use_password;
 
 		/* Try to connect to the publisher. */
@@ -720,33 +790,48 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt,
 
 		PG_TRY();
 		{
+			bool		has_tables = false;
+			List	   *pubrels;
+			char		relation_state;
+
 			check_publications(wrconn, publications);
-			check_publications_origin(wrconn, publications, opts.copy_data,
-									  opts.origin, NULL, 0, stmt->subname);
+			check_publications_origin_tables(wrconn, publications,
+											 opts.copy_data,
+											 opts.retaindeadtuples, opts.origin,
+											 NULL, 0, stmt->subname);
+			check_publications_origin_sequences(wrconn, publications,
+												opts.copy_data, opts.origin,
+												NULL, 0, stmt->subname);
+
+			if (opts.retaindeadtuples)
+				check_pub_dead_tuple_retention(wrconn);
 
 			/*
 			 * Set sync state based on if we were asked to do data copy or
 			 * not.
 			 */
-			table_state = opts.copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY;
+			relation_state = opts.copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY;
 
 			/*
-			 * Get the table list from publisher and build local table status
-			 * info.
+			 * Build local relation status info. Relations are for both tables
+			 * and sequences from the publisher.
 			 */
-			tables = fetch_table_list(wrconn, publications);
-			foreach(lc, tables)
+			pubrels = fetch_relation_list(wrconn, publications);
+
+			foreach_ptr(PublicationRelKind, pubrelinfo, pubrels)
 			{
-				RangeVar   *rv = (RangeVar *) lfirst(lc);
 				Oid			relid;
+				char		relkind;
+				RangeVar   *rv = pubrelinfo->rv;
 
 				relid = RangeVarGetRelid(rv, AccessShareLock, false);
+				relkind = get_rel_relkind(relid);
 
 				/* Check for supported relkind.
*/ - CheckSubscriptionRelkind(get_rel_relkind(relid), + CheckSubscriptionRelkind(relkind, pubrelinfo->relkind, rv->schemaname, rv->relname); - - AddSubscriptionRelState(subid, relid, table_state, + has_tables |= (relkind != RELKIND_SEQUENCE); + AddSubscriptionRelState(subid, relid, relation_state, InvalidXLogRecPtr, true); } @@ -754,6 +839,10 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, * If requested, create permanent slot for the subscription. We * won't use the initial snapshot for anything, so no need to * export it. + * + * XXX: Similar to origins, it is not clear whether preventing the + * slot creation for empty and sequence-only subscriptions is + * worth additional complexity. */ if (opts.create_slot) { @@ -777,7 +866,7 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, * PENDING, to allow ALTER SUBSCRIPTION ... REFRESH * PUBLICATION to work. */ - if (opts.twophase && !opts.copy_data && tables != NIL) + if (opts.twophase && !opts.copy_data && has_tables) twophase_enabled = true; walrcv_create_slot(wrconn, opts.slot_name, false, twophase_enabled, @@ -800,13 +889,23 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, else ereport(WARNING, (errmsg("subscription was created, but is not connected"), - errhint("To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription."))); + errhint("To initiate replication, you must manually create the replication slot, enable the subscription, and alter the subscription to refresh publications."))); table_close(rel, RowExclusiveLock); pgstat_create_subscription(subid); - if (opts.enabled) + /* + * Notify the launcher to start the apply worker if the subscription is + * enabled, or to create the conflict detection slot if retain_dead_tuples + * is enabled. + * + * Creating the conflict detection slot is essential even when the + * subscription is not enabled. This ensures that dead tuples are + * retained, which is necessary for accurately identifying the type of + * conflict during replication. + */ + if (opts.enabled || opts.retaindeadtuples) ApplyLauncherWakeupAtCommit(); ObjectAddressSet(myself, SubscriptionRelationId, subid); @@ -821,21 +920,24 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, List *validate_publications) { char *err; - List *pubrel_names; + List *pubrels = NIL; + Oid *pubrel_local_oids; List *subrel_states; + List *sub_remove_rels = NIL; Oid *subrel_local_oids; - Oid *pubrel_local_oids; + Oid *subseq_local_oids; + int subrel_count; ListCell *lc; int off; - int remove_rel_len; - int subrel_count; + int tbl_count = 0; + int seq_count = 0; Relation rel = NULL; typedef struct SubRemoveRels { Oid relid; char state; } SubRemoveRels; - SubRemoveRels *sub_remove_rels; + WalReceiverConn *wrconn; bool must_use_password; @@ -857,71 +959,84 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, if (validate_publications) check_publications(wrconn, validate_publications); - /* Get the table list from publisher. */ - pubrel_names = fetch_table_list(wrconn, sub->publications); + /* Get the relation list from publisher. */ + pubrels = fetch_relation_list(wrconn, sub->publications); - /* Get local table list. */ - subrel_states = GetSubscriptionRelations(sub->oid, false); + /* Get local relation list. */ + subrel_states = GetSubscriptionRelations(sub->oid, true, true, false); subrel_count = list_length(subrel_states); /* - * Build qsorted array of local table oids for faster lookup. 
This can - * potentially contain all tables in the database so speed of lookup - * is important. + * Build qsorted arrays of local table oids and sequence oids for + * faster lookup. This can potentially contain all tables and + * sequences in the database so speed of lookup is important. + * + * We do not yet know the exact count of tables and sequences, so we + * allocate separate arrays for table OIDs and sequence OIDs based on + * the total number of relations (subrel_count). */ subrel_local_oids = palloc(subrel_count * sizeof(Oid)); - off = 0; + subseq_local_oids = palloc(subrel_count * sizeof(Oid)); foreach(lc, subrel_states) { SubscriptionRelState *relstate = (SubscriptionRelState *) lfirst(lc); - subrel_local_oids[off++] = relstate->relid; + if (get_rel_relkind(relstate->relid) == RELKIND_SEQUENCE) + subseq_local_oids[seq_count++] = relstate->relid; + else + subrel_local_oids[tbl_count++] = relstate->relid; } - qsort(subrel_local_oids, subrel_count, - sizeof(Oid), oid_cmp); - check_publications_origin(wrconn, sub->publications, copy_data, - sub->origin, subrel_local_oids, - subrel_count, sub->name); + qsort(subrel_local_oids, tbl_count, sizeof(Oid), oid_cmp); + check_publications_origin_tables(wrconn, sub->publications, copy_data, + sub->retaindeadtuples, sub->origin, + subrel_local_oids, tbl_count, + sub->name); - /* - * Rels that we want to remove from subscription and drop any slots - * and origins corresponding to them. - */ - sub_remove_rels = palloc(subrel_count * sizeof(SubRemoveRels)); + qsort(subseq_local_oids, seq_count, sizeof(Oid), oid_cmp); + check_publications_origin_sequences(wrconn, sub->publications, + copy_data, sub->origin, + subseq_local_oids, seq_count, + sub->name); /* - * Walk over the remote tables and try to match them to locally known - * tables. If the table is not known locally create a new state for - * it. + * Walk over the remote relations and try to match them to locally + * known relations. If the relation is not known locally create a new + * state for it. * - * Also builds array of local oids of remote tables for the next step. + * Also builds array of local oids of remote relations for the next + * step. */ off = 0; - pubrel_local_oids = palloc(list_length(pubrel_names) * sizeof(Oid)); + pubrel_local_oids = palloc(list_length(pubrels) * sizeof(Oid)); - foreach(lc, pubrel_names) + foreach_ptr(PublicationRelKind, pubrelinfo, pubrels) { - RangeVar *rv = (RangeVar *) lfirst(lc); + RangeVar *rv = pubrelinfo->rv; Oid relid; + char relkind; relid = RangeVarGetRelid(rv, AccessShareLock, false); + relkind = get_rel_relkind(relid); /* Check for supported relkind. */ - CheckSubscriptionRelkind(get_rel_relkind(relid), + CheckSubscriptionRelkind(relkind, pubrelinfo->relkind, rv->schemaname, rv->relname); pubrel_local_oids[off++] = relid; if (!bsearch(&relid, subrel_local_oids, - subrel_count, sizeof(Oid), oid_cmp)) + tbl_count, sizeof(Oid), oid_cmp) && + !bsearch(&relid, subseq_local_oids, + seq_count, sizeof(Oid), oid_cmp)) { AddSubscriptionRelState(sub->oid, relid, copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY, InvalidXLogRecPtr, true); ereport(DEBUG1, - (errmsg_internal("table \"%s.%s\" added to subscription \"%s\"", - rv->schemaname, rv->relname, sub->name))); + errmsg_internal("%s \"%s.%s\" added to subscription \"%s\"", + relkind == RELKIND_SEQUENCE ? 
"sequence" : "table", + rv->schemaname, rv->relname, sub->name)); } } @@ -929,19 +1044,18 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, * Next remove state for tables we should not care about anymore using * the data we collected above */ - qsort(pubrel_local_oids, list_length(pubrel_names), - sizeof(Oid), oid_cmp); + qsort(pubrel_local_oids, list_length(pubrels), sizeof(Oid), oid_cmp); - remove_rel_len = 0; - for (off = 0; off < subrel_count; off++) + for (off = 0; off < tbl_count; off++) { Oid relid = subrel_local_oids[off]; if (!bsearch(&relid, pubrel_local_oids, - list_length(pubrel_names), sizeof(Oid), oid_cmp)) + list_length(pubrels), sizeof(Oid), oid_cmp)) { char state; XLogRecPtr statelsn; + SubRemoveRels *remove_rel = palloc_object(SubRemoveRels); /* * Lock pg_subscription_rel with AccessExclusiveLock to @@ -963,12 +1077,14 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, /* Last known rel state. */ state = GetSubscriptionRelState(sub->oid, relid, &statelsn); - sub_remove_rels[remove_rel_len].relid = relid; - sub_remove_rels[remove_rel_len++].state = state; - RemoveSubscriptionRel(sub->oid, relid); - logicalrep_worker_stop(sub->oid, relid); + remove_rel->relid = relid; + remove_rel->state = state; + + sub_remove_rels = lappend(sub_remove_rels, remove_rel); + + logicalrep_worker_stop(WORKERTYPE_TABLESYNC, sub->oid, relid); /* * For READY state, we would have already dropped the @@ -983,10 +1099,10 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, * * It is possible that the origin is not yet created for * tablesync worker, this can happen for the states before - * SUBREL_STATE_FINISHEDCOPY. The tablesync worker or - * apply worker can also concurrently try to drop the - * origin and by this time the origin might be already - * removed. For these reasons, passing missing_ok = true. + * SUBREL_STATE_DATASYNC. The tablesync worker or apply + * worker can also concurrently try to drop the origin and + * by this time the origin might be already removed. For + * these reasons, passing missing_ok = true. */ ReplicationOriginNameForLogicalRep(sub->oid, relid, originname, sizeof(originname)); @@ -1006,10 +1122,10 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, * to be at the end because otherwise if there is an error while doing * the database operations we won't be able to rollback dropped slots. */ - for (off = 0; off < remove_rel_len; off++) + foreach_ptr(SubRemoveRels, sub_remove_rel, sub_remove_rels) { - if (sub_remove_rels[off].state != SUBREL_STATE_READY && - sub_remove_rels[off].state != SUBREL_STATE_SYNCDONE) + if (sub_remove_rel->state != SUBREL_STATE_READY && + sub_remove_rel->state != SUBREL_STATE_SYNCDONE) { char syncslotname[NAMEDATALEN] = {0}; @@ -1023,11 +1139,39 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, * dropped slots and fail. For these reasons, we allow * missing_ok = true for the drop. 
*/ - ReplicationSlotNameForTablesync(sub->oid, sub_remove_rels[off].relid, + ReplicationSlotNameForTablesync(sub->oid, sub_remove_rel->relid, syncslotname, sizeof(syncslotname)); ReplicationSlotDropAtPubNode(wrconn, syncslotname, true); } } + + /* + * Next remove state for sequences we should not care about anymore + * using the data we collected above + */ + for (off = 0; off < seq_count; off++) + { + Oid relid = subseq_local_oids[off]; + + if (!bsearch(&relid, pubrel_local_oids, + list_length(pubrels), sizeof(Oid), oid_cmp)) + { + /* + * This locking ensures that the state of rels won't change + * till we are done with this refresh operation. + */ + if (!rel) + rel = table_open(SubscriptionRelRelationId, AccessExclusiveLock); + + RemoveSubscriptionRel(sub->oid, relid); + + ereport(DEBUG1, + errmsg_internal("sequence \"%s.%s\" removed from subscription \"%s\"", + get_namespace_name(get_rel_namespace(relid)), + get_rel_name(relid), + sub->name)); + } + } } PG_FINALLY(); { @@ -1040,18 +1184,74 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, } /* - * Common checks for altering failover and two_phase options. + * Marks all sequences with INIT state. + */ +static void +AlterSubscription_refresh_seq(Subscription *sub) +{ + char *err = NULL; + WalReceiverConn *wrconn; + bool must_use_password; + + /* Load the library providing us libpq calls. */ + load_file("libpqwalreceiver", false); + + /* Try to connect to the publisher. */ + must_use_password = sub->passwordrequired && !sub->ownersuperuser; + wrconn = walrcv_connect(sub->conninfo, true, true, must_use_password, + sub->name, &err); + if (!wrconn) + ereport(ERROR, + errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("subscription \"%s\" could not connect to the publisher: %s", + sub->name, err)); + + PG_TRY(); + { + List *subrel_states; + + check_publications_origin_sequences(wrconn, sub->publications, true, + sub->origin, NULL, 0, sub->name); + + /* Get local sequence list. */ + subrel_states = GetSubscriptionRelations(sub->oid, false, true, false); + foreach_ptr(SubscriptionRelState, subrel, subrel_states) + { + Oid relid = subrel->relid; + + UpdateSubscriptionRelState(sub->oid, relid, SUBREL_STATE_INIT, + InvalidXLogRecPtr, false); + ereport(DEBUG1, + errmsg_internal("sequence \"%s.%s\" of subscription \"%s\" set to INIT state", + get_namespace_name(get_rel_namespace(relid)), + get_rel_name(relid), + sub->name)); + } + } + PG_FINALLY(); + { + walrcv_disconnect(wrconn); + } + PG_END_TRY(); +} + +/* + * Common checks for altering failover, two_phase, and retain_dead_tuples + * options. */ static void CheckAlterSubOption(Subscription *sub, const char *option, bool slot_needs_update, bool isTopLevel) { + Assert(strcmp(option, "failover") == 0 || + strcmp(option, "two_phase") == 0 || + strcmp(option, "retain_dead_tuples") == 0); + /* - * The checks in this function are required only for failover and - * two_phase options. + * Altering the retain_dead_tuples option does not update the slot on the + * publisher. */ - Assert(strcmp(option, "failover") == 0 || - strcmp(option, "two_phase") == 0); + Assert(!slot_needs_update || strcmp(option, "retain_dead_tuples") != 0); /* * Do not allow changing the option if the subscription is enabled. This @@ -1063,6 +1263,39 @@ CheckAlterSubOption(Subscription *sub, const char *option, * the publisher by the existing walsender, so we could have allowed that * even when the subscription is enabled. But we kept this restriction for * the sake of consistency and simplicity. 
+	 *
+	 * Additionally, do not allow changing the retain_dead_tuples option when
+	 * the subscription is enabled to prevent race conditions arising from the
+	 * new option value being acknowledged asynchronously by the launcher and
+	 * apply workers.
+	 *
+	 * Without the restriction, a race condition may arise when a user
+	 * disables and immediately re-enables the retain_dead_tuples option. In
+	 * this case, the launcher might drop the slot upon noticing the disabled
+	 * action, while the apply worker may keep maintaining
+	 * oldest_nonremovable_xid without noticing the option change. During this
+	 * period, a transaction ID wraparound could falsely make this ID appear
+	 * as if it originates from the future w.r.t. the transaction ID stored in
+	 * the slot maintained by the launcher.
+	 *
+	 * Similarly, if the user enables retain_dead_tuples concurrently with the
+	 * launcher starting the worker, the apply worker may start calculating
+	 * oldest_nonremovable_xid before the launcher notices the enable action.
+	 * Consequently, the launcher may update slot.xmin to a newer value than
+	 * that maintained by the worker. In subsequent cycles, upon integrating
+	 * the worker's oldest_nonremovable_xid, the launcher might detect a
+	 * retreat in the calculated xmin, necessitating additional handling.
+	 *
+	 * XXX To address the above race conditions, we can define
+	 * oldest_nonremovable_xid as a FullTransactionId and add a check to
+	 * disallow retreating the conflict slot's xmin. For now, we kept the
+	 * implementation simple by disallowing changes to retain_dead_tuples,
+	 * but in the future we can change this after some more analysis.
+	 *
+	 * Note that we could restrict only the enabling of retain_dead_tuples to
+	 * avoid the race conditions described above, but we maintain the
+	 * restriction for both enable and disable operations for the sake of
+	 * consistency.
 	 */
 	if (sub->enabled)
 		ereport(ERROR,
@@ -1110,6 +1343,11 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt,
 	bool		update_tuple = false;
 	bool		update_failover = false;
 	bool		update_two_phase = false;
+	bool		check_pub_rdt = false;
+	bool		retain_dead_tuples;
+	int			max_retention;
+	bool		retention_active;
+	char	   *origin;
 	Subscription *sub;
 	Form_pg_subscription form;
 	bits32		supported_opts;
@@ -1118,7 +1356,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt,
 	rel = table_open(SubscriptionRelationId, RowExclusiveLock);
 
 	/* Fetch the existing tuple. */
-	tup = SearchSysCacheCopy2(SUBSCRIPTIONNAME, MyDatabaseId,
+	tup = SearchSysCacheCopy2(SUBSCRIPTIONNAME, ObjectIdGetDatum(MyDatabaseId),
 							  CStringGetDatum(stmt->subname));
 
 	if (!HeapTupleIsValid(tup))
@@ -1137,6 +1375,11 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt,
 
 	sub = GetSubscription(subid, false);
 
+	retain_dead_tuples = sub->retaindeadtuples;
+	origin = sub->origin;
+	max_retention = sub->maxretention;
+	retention_active = sub->retentionactive;
+
 	/*
 	 * Don't allow non-superuser modification of a subscription with
 	 * password_required=false.
@@ -1165,6 +1408,8 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | + SUBOPT_RETAIN_DEAD_TUPLES | + SUBOPT_MAX_RETENTION_DURATION | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, @@ -1267,7 +1512,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, IsSet(opts.specified_opts, SUBOPT_SLOT_NAME)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("slot_name and two_phase cannot be altered at the same time"))); + errmsg("\"slot_name\" and \"two_phase\" cannot be altered at the same time"))); /* * Note that workers may still survive even if the @@ -1283,7 +1528,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, if (logicalrep_workers_find(subid, true, true)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot alter two_phase when logical replication worker is still running"), + errmsg("cannot alter \"two_phase\" when logical replication worker is still running"), errhint("Try again after some time."))); /* @@ -1297,7 +1542,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, LookupGXactBySubid(subid)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot disable two_phase when prepared transactions are present"), + errmsg("cannot disable \"two_phase\" when prepared transactions exist"), errhint("Resolve these transactions and try again."))); /* Change system catalog accordingly */ @@ -1325,11 +1570,99 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, replaces[Anum_pg_subscription_subfailover - 1] = true; } + if (IsSet(opts.specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + { + values[Anum_pg_subscription_subretaindeadtuples - 1] = + BoolGetDatum(opts.retaindeadtuples); + replaces[Anum_pg_subscription_subretaindeadtuples - 1] = true; + + /* + * Update the retention status only if there's a change in + * the retain_dead_tuples option value. + * + * Automatically marking retention as active when + * retain_dead_tuples is enabled may not always be ideal, + * especially if retention was previously stopped and the + * user toggles retain_dead_tuples without adjusting the + * publisher workload. However, this behavior provides a + * convenient way for users to manually refresh the + * retention status. Since retention will be stopped again + * unless the publisher workload is reduced, this approach + * is acceptable for now. + */ + if (opts.retaindeadtuples != sub->retaindeadtuples) + { + values[Anum_pg_subscription_subretentionactive - 1] = + BoolGetDatum(opts.retaindeadtuples); + replaces[Anum_pg_subscription_subretentionactive - 1] = true; + + retention_active = opts.retaindeadtuples; + } + + CheckAlterSubOption(sub, "retain_dead_tuples", false, isTopLevel); + + /* + * Workers may continue running even after the + * subscription has been disabled. + * + * To prevent race conditions (as described in + * CheckAlterSubOption()), ensure that all worker + * processes have already exited before proceeding. + */ + if (logicalrep_workers_find(subid, true, true)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot alter retain_dead_tuples when logical replication worker is still running"), + errhint("Try again after some time."))); + + /* + * Notify the launcher to manage the replication slot for + * conflict detection. 
This ensures that the replication slot + * is efficiently handled (created, updated, or dropped) + * in response to any configuration changes. + */ + ApplyLauncherWakeupAtCommit(); + + check_pub_rdt = opts.retaindeadtuples; + retain_dead_tuples = opts.retaindeadtuples; + } + + if (IsSet(opts.specified_opts, SUBOPT_MAX_RETENTION_DURATION)) + { + values[Anum_pg_subscription_submaxretention - 1] = + Int32GetDatum(opts.maxretention); + replaces[Anum_pg_subscription_submaxretention - 1] = true; + + max_retention = opts.maxretention; + } + + /* + * Ensure that system configuration parameters are set + * appropriately to support retain_dead_tuples and + * max_retention_duration. + */ + if (IsSet(opts.specified_opts, SUBOPT_RETAIN_DEAD_TUPLES) || + IsSet(opts.specified_opts, SUBOPT_MAX_RETENTION_DURATION)) + CheckSubDeadTupleRetention(true, !sub->enabled, NOTICE, + retain_dead_tuples, + retention_active, + (max_retention > 0)); + if (IsSet(opts.specified_opts, SUBOPT_ORIGIN)) { values[Anum_pg_subscription_suborigin - 1] = CStringGetTextDatum(opts.origin); replaces[Anum_pg_subscription_suborigin - 1] = true; + + /* + * Check if changes from different origins may be received + * from the publisher when the origin is changed to ANY + * and retain_dead_tuples is enabled. + */ + check_pub_rdt = retain_dead_tuples && + pg_strcasecmp(opts.origin, LOGICALREP_ORIGIN_ANY) == 0; + + origin = opts.origin; } update_tuple = true; @@ -1347,6 +1680,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot enable subscription that does not have a slot name"))); + /* + * Check track_commit_timestamp only when enabling the + * subscription in case it was disabled after creation. See + * comments atop CheckSubDeadTupleRetention() for details. + */ + CheckSubDeadTupleRetention(opts.enabled, !opts.enabled, + WARNING, sub->retaindeadtuples, + sub->retentionactive, false); + values[Anum_pg_subscription_subenabled - 1] = BoolGetDatum(opts.enabled); replaces[Anum_pg_subscription_subenabled - 1] = true; @@ -1355,6 +1697,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, ApplyLauncherWakeupAtCommit(); update_tuple = true; + + /* + * The subscription might be initially created with + * connect=false and retain_dead_tuples=true, meaning the + * remote server's status may not be checked. Ensure this + * check is conducted now. + */ + check_pub_rdt = sub->retaindeadtuples && opts.enabled; break; } @@ -1369,6 +1719,13 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, CStringGetTextDatum(stmt->conninfo); replaces[Anum_pg_subscription_subconninfo - 1] = true; update_tuple = true; + + /* + * Since the remote server configuration might have changed, + * perform a check to ensure it permits enabling + * retain_dead_tuples. + */ + check_pub_rdt = sub->retaindeadtuples; break; case ALTER_SUBSCRIPTION_SET_PUBLICATION: @@ -1393,8 +1750,8 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, errhint("Use ALTER SUBSCRIPTION ... SET PUBLICATION ... WITH (refresh = false)."))); /* - * See ALTER_SUBSCRIPTION_REFRESH for details why this is - * not allowed. + * See ALTER_SUBSCRIPTION_REFRESH_PUBLICATION for details + * why this is not allowed. */ if (sub->twophasestate == LOGICALREP_TWOPHASE_STATE_ENABLED && opts.copy_data) ereport(ERROR, @@ -1448,8 +1805,8 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, "ALTER SUBSCRIPTION ... DROP PUBLICATION ...
WITH (refresh = false)"))); /* - * See ALTER_SUBSCRIPTION_REFRESH for details why this is - * not allowed. + * See ALTER_SUBSCRIPTION_REFRESH_PUBLICATION for details + * why this is not allowed. */ if (sub->twophasestate == LOGICALREP_TWOPHASE_STATE_ENABLED && opts.copy_data) ereport(ERROR, @@ -1473,12 +1830,13 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, break; } - case ALTER_SUBSCRIPTION_REFRESH: + case ALTER_SUBSCRIPTION_REFRESH_PUBLICATION: { if (!sub->enabled) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("ALTER SUBSCRIPTION ... REFRESH is not allowed for disabled subscriptions"))); + errmsg("%s is not allowed for disabled subscriptions", + "ALTER SUBSCRIPTION ... REFRESH PUBLICATION"))); parse_subscription_options(pstate, stmt->options, SUBOPT_COPY_DATA, &opts); @@ -1490,8 +1848,8 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, * * But, having reached this two-phase commit "enabled" state * we must not allow any subsequent table initialization to - * occur. So the ALTER SUBSCRIPTION ... REFRESH is disallowed - * when the user had requested two_phase = on mode. + * occur. So the ALTER SUBSCRIPTION ... REFRESH PUBLICATION is + * disallowed when the user had requested two_phase = on mode. * * The exception to this restriction is when copy_data = * false, because when copy_data is false the tablesync will @@ -1503,16 +1861,29 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, if (sub->twophasestate == LOGICALREP_TWOPHASE_STATE_ENABLED && opts.copy_data) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("ALTER SUBSCRIPTION ... REFRESH with copy_data is not allowed when two_phase is enabled"), - errhint("Use ALTER SUBSCRIPTION ... REFRESH with copy_data = false, or use DROP/CREATE SUBSCRIPTION."))); + errmsg("ALTER SUBSCRIPTION ... REFRESH PUBLICATION with copy_data is not allowed when two_phase is enabled"), + errhint("Use ALTER SUBSCRIPTION ... REFRESH PUBLICATION with copy_data = false, or use DROP/CREATE SUBSCRIPTION."))); - PreventInTransactionBlock(isTopLevel, "ALTER SUBSCRIPTION ... REFRESH"); + PreventInTransactionBlock(isTopLevel, "ALTER SUBSCRIPTION ... REFRESH PUBLICATION"); AlterSubscription_refresh(sub, opts.copy_data, NULL); break; } + case ALTER_SUBSCRIPTION_REFRESH_SEQUENCES: + { + if (!sub->enabled) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("%s is not allowed for disabled subscriptions", + "ALTER SUBSCRIPTION ... REFRESH SEQUENCES")); + + AlterSubscription_refresh_seq(sub); + + break; + } + case ALTER_SUBSCRIPTION_SKIP: { parse_subscription_options(pstate, stmt->options, SUBOPT_LSN, &opts); @@ -1524,7 +1895,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, * If the user sets subskiplsn, we do a sanity check to make * sure that the specified LSN is a probable value. 
*/ - if (!XLogRecPtrIsInvalid(opts.lsn)) + if (XLogRecPtrIsValid(opts.lsn)) { RepOriginId originid; char originname[NAMEDATALEN]; @@ -1536,10 +1907,10 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, remote_lsn = replorigin_get_progress(originid, false); /* Check the given LSN is at least a future LSN */ - if (!XLogRecPtrIsInvalid(remote_lsn) && opts.lsn < remote_lsn) + if (XLogRecPtrIsValid(remote_lsn) && opts.lsn < remote_lsn) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("skip WAL location (LSN %X/%X) must be greater than origin LSN %X/%X", + errmsg("skip WAL location (LSN %X/%08X) must be greater than origin LSN %X/%08X", LSN_FORMAT_ARGS(opts.lsn), LSN_FORMAT_ARGS(remote_lsn)))); } @@ -1568,14 +1939,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, } /* - * Try to acquire the connection necessary for altering the slot, if - * needed. + * Try to acquire the connection necessary either for modifying the slot + * or for checking if the remote server permits enabling + * retain_dead_tuples. * * This has to be at the end because otherwise if there is an error while * doing the database operations we won't be able to rollback altered * slot. */ - if (update_failover || update_two_phase) + if (update_failover || update_two_phase || check_pub_rdt) { bool must_use_password; char *err; @@ -1584,10 +1956,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, /* Load the library providing us libpq calls. */ load_file("libpqwalreceiver", false); - /* Try to connect to the publisher. */ + /* + * Try to connect to the publisher, using the new connection string if + * available. + */ must_use_password = sub->passwordrequired && !sub->ownersuperuser; - wrconn = walrcv_connect(sub->conninfo, true, true, must_use_password, - sub->name, &err); + wrconn = walrcv_connect(stmt->conninfo ? stmt->conninfo : sub->conninfo, + true, true, must_use_password, sub->name, + &err); if (!wrconn) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), @@ -1596,9 +1972,17 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, PG_TRY(); { - walrcv_alter_slot(wrconn, sub->slotname, - update_failover ? &opts.failover : NULL, - update_two_phase ? &opts.twophase : NULL); + if (retain_dead_tuples) + check_pub_dead_tuple_retention(wrconn); + + check_publications_origin_tables(wrconn, sub->publications, false, + retain_dead_tuples, origin, NULL, 0, + sub->name); + + if (update_failover || update_two_phase) + walrcv_alter_slot(wrconn, sub->slotname, + update_failover ? &opts.failover : NULL, + update_two_phase ? &opts.twophase : NULL); } PG_FINALLY(); { @@ -1645,12 +2029,14 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) bool must_use_password; /* - * Lock pg_subscription with AccessExclusiveLock to ensure that the - * launcher doesn't restart new worker during dropping the subscription + * The launcher may concurrently start a new worker for this subscription. + * During initialization, the worker checks for subscription validity and + * exits if the subscription has already been dropped. See + * InitializeLogRepWorker. 
*/ - rel = table_open(SubscriptionRelationId, AccessExclusiveLock); + rel = table_open(SubscriptionRelationId, RowExclusiveLock); - tup = SearchSysCache2(SUBSCRIPTIONNAME, MyDatabaseId, + tup = SearchSysCache2(SUBSCRIPTIONNAME, ObjectIdGetDatum(MyDatabaseId), CStringGetDatum(stmt->subname)); if (!HeapTupleIsValid(tup)) @@ -1750,7 +2136,7 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) { LogicalRepWorker *w = (LogicalRepWorker *) lfirst(lc); - logicalrep_worker_stop(w->subid, w->relid); + logicalrep_worker_stop(w->type, w->subid, w->relid); } list_free(subworkers); @@ -1773,7 +2159,7 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) * the apply and tablesync workers and they can't restart because of * exclusive lock on the subscription. */ - rstates = GetSubscriptionRelations(subid, true); + rstates = GetSubscriptionRelations(subid, true, false, true); foreach(lc, rstates) { SubscriptionRelState *rstate = (SubscriptionRelState *) lfirst(lc); @@ -1788,7 +2174,7 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) * * It is possible that the origin is not yet created for tablesync * worker so passing missing_ok = true. This can happen for the states - * before SUBREL_STATE_FINISHEDCOPY. + * before SUBREL_STATE_DATASYNC. */ ReplicationOriginNameForLogicalRep(subid, relid, originname, sizeof(originname)); @@ -2035,7 +2421,7 @@ AlterSubscriptionOwner(const char *name, Oid newOwnerId) rel = table_open(SubscriptionRelationId, RowExclusiveLock); - tup = SearchSysCacheCopy2(SUBSCRIPTIONNAME, MyDatabaseId, + tup = SearchSysCacheCopy2(SUBSCRIPTIONNAME, ObjectIdGetDatum(MyDatabaseId), CStringGetDatum(name)); if (!HeapTupleIsValid(tup)) @@ -2086,21 +2472,30 @@ AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId) * Check and log a warning if the publisher has subscribed to the same table, * its partition ancestors (if it's a partition), or its partition children (if * it's a partitioned table), from some other publishers. This check is - * required only if "copy_data = true" and "origin = none" for CREATE - * SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements to notify the - * user that data having origin might have been copied. + * required in the following scenarios: * - * This check need not be performed on the tables that are already added - * because incremental sync for those tables will happen through WAL and the - * origin of the data can be identified from the WAL records. + * 1) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH PUBLICATION + * statements with "copy_data = true" and "origin = none": + * - Warn the user that data with an origin might have been copied. + * - This check is skipped for tables already added, as incremental sync via + * WAL allows origin tracking. The list of such tables is in + * subrel_local_oids. * - * subrel_local_oids contains the list of relation oids that are already - * present on the subscriber. + * 2) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH PUBLICATION + * statements with "retain_dead_tuples = true" and "origin = any", and for + * ALTER SUBSCRIPTION statements that modify retain_dead_tuples or origin, + * or when the publisher's status changes (e.g., due to a connection string + * update): + * - Warn the user that only conflict detection info for local changes on + * the publisher is retained. Data from other origins may lack sufficient + * details for reliable conflict detection. + * - See comments atop worker.c for more details. 
*/ static void -check_publications_origin(WalReceiverConn *wrconn, List *publications, - bool copydata, char *origin, Oid *subrel_local_oids, - int subrel_count, char *subname) +check_publications_origin_tables(WalReceiverConn *wrconn, List *publications, + bool copydata, bool retain_dead_tuples, + char *origin, Oid *subrel_local_oids, + int subrel_count, char *subname) { WalRcvExecResult *res; StringInfoData cmd; @@ -2108,9 +2503,29 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, Oid tableRow[1] = {TEXTOID}; List *publist = NIL; int i; + bool check_rdt; + bool check_table_sync; + bool origin_none = origin && + pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) == 0; + + /* + * Enable retain_dead_tuples checks only when origin is set to 'any', + * since with origin='none' only local changes are replicated to the + * subscriber. + */ + check_rdt = retain_dead_tuples && !origin_none; + + /* + * Enable table synchronization checks only when origin is 'none', to + * ensure that data from other origins is not inadvertently copied. + */ + check_table_sync = copydata && origin_none; - if (!copydata || !origin || - (pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) != 0)) + /* retain_dead_tuples and table sync checks occur separately */ + Assert(!(check_rdt && check_table_sync)); + + /* Return if no checks are required */ + if (!check_rdt && !check_table_sync) return; initStringInfo(&cmd); @@ -2127,18 +2542,25 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, appendStringInfoString(&cmd, ")\n"); /* - * In case of ALTER SUBSCRIPTION ... REFRESH, subrel_local_oids contains - * the list of relation oids that are already present on the subscriber. - * This check should be skipped for these tables. + * In case of ALTER SUBSCRIPTION ... REFRESH PUBLICATION, + * subrel_local_oids contains the list of relation oids that are already + * present on the subscriber. This check should be skipped for these + * tables if checking for table sync scenario. However, when handling the + * retain_dead_tuples scenario, ensure all tables are checked, as some + * existing tables may now include changes from other origins due to newly + * created subscriptions on the publisher. */ - for (i = 0; i < subrel_count; i++) + if (check_table_sync) { - Oid relid = subrel_local_oids[i]; - char *schemaname = get_namespace_name(get_rel_namespace(relid)); - char *tablename = get_rel_name(relid); + for (i = 0; i < subrel_count; i++) + { + Oid relid = subrel_local_oids[i]; + char *schemaname = get_namespace_name(get_rel_namespace(relid)); + char *tablename = get_rel_name(relid); - appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", - schemaname, tablename); + appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", + schemaname, tablename); + } } res = walrcv_exec(wrconn, cmd.data, 1, tableRow); @@ -2150,7 +2572,7 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, errmsg("could not receive list of replicated tables from the publisher: %s", res->err))); - /* Process tables. */ + /* Process publications. */ slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) { @@ -2173,22 +2595,140 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, * XXX: For simplicity, we don't check whether the table has any data or * not. 
If the table doesn't have any data then we don't need to * distinguish between data having origin and data not having origin so we - * can avoid logging a warning in that case. + * can avoid logging a warning for the table sync scenario. + */ + if (publist) + { + StringInfoData pubnames; + + /* Prepare the list of publication(s) for warning message. */ + initStringInfo(&pubnames); + GetPublicationsStr(publist, &pubnames, false); + + if (check_table_sync) + ereport(WARNING, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin", + subname), + errdetail_plural("The subscription subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", + "The subscription subscribes to publications (%s) that contain tables that are written to by other subscriptions.", + list_length(publist), pubnames.data), + errhint("Verify that initial data copied from the publisher tables did not come from other origins.")); + else + ereport(WARNING, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("subscription \"%s\" enabled retain_dead_tuples but might not reliably detect conflicts for changes from different origins", + subname), + errdetail_plural("The subscription subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", + "The subscription subscribes to publications (%s) that contain tables that are written to by other subscriptions.", + list_length(publist), pubnames.data), + errhint("Consider using origin = NONE or disabling retain_dead_tuples.")); + } + + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); +} + +/* + * This function is similar to check_publications_origin_tables and serves the + * same purpose for sequences. + */ +static void +check_publications_origin_sequences(WalReceiverConn *wrconn, List *publications, + bool copydata, char *origin, + Oid *subrel_local_oids, int subrel_count, + char *subname) +{ + WalRcvExecResult *res; + StringInfoData cmd; + TupleTableSlot *slot; + Oid tableRow[1] = {TEXTOID}; + List *publist = NIL; + + /* + * Enable sequence synchronization checks only when origin is 'none', to + * ensure that sequence data from other origins is not inadvertently + * copied. This check is necessary if the publisher is running PG19 or + * later, where logical replication sequence synchronization is supported. + */ + if (!copydata || pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) != 0 || + walrcv_server_version(wrconn) < 190000) + return; + + initStringInfo(&cmd); + appendStringInfoString(&cmd, + "SELECT DISTINCT P.pubname AS pubname\n" + "FROM pg_publication P,\n" + " LATERAL pg_get_publication_sequences(P.pubname) GPS\n" + " JOIN pg_subscription_rel PS ON (GPS.relid = PS.srrelid),\n" + " pg_class C JOIN pg_namespace N ON (N.oid = C.relnamespace)\n" + "WHERE C.oid = GPS.relid AND P.pubname IN ("); + + GetPublicationsStr(publications, &cmd, true); + appendStringInfoString(&cmd, ")\n"); + + /* + * In case of ALTER SUBSCRIPTION ... REFRESH PUBLICATION, + * subrel_local_oids contains the list of relations that are already + * present on the subscriber. This check should be skipped as these will + * not be re-synced.
+ */ + for (int i = 0; i < subrel_count; i++) + { + Oid relid = subrel_local_oids[i]; + char *schemaname = get_namespace_name(get_rel_namespace(relid)); + char *seqname = get_rel_name(relid); + + appendStringInfo(&cmd, + "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", + schemaname, seqname); + } + + res = walrcv_exec(wrconn, cmd.data, 1, tableRow); + pfree(cmd.data); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not receive list of replicated sequences from the publisher: %s", + res->err))); + + /* Process publications. */ + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + char *pubname; + bool isnull; + + pubname = TextDatumGetCString(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + + ExecClearTuple(slot); + publist = list_append_unique(publist, makeString(pubname)); + } + + /* + * Log a warning if the publisher has subscribed to the same sequence from + * some other publisher. We cannot know the origin of sequence data + * during the initial sync. */ if (publist) { - StringInfo pubnames = makeStringInfo(); + StringInfoData pubnames; /* Prepare the list of publication(s) for warning message. */ - GetPublicationsStr(publist, pubnames, false); + initStringInfo(&pubnames); + GetPublicationsStr(publist, &pubnames, false); + ereport(WARNING, errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin", subname), - errdetail_plural("The subscription being created subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", - "The subscription being created subscribes to publications (%s) that contain tables that are written to by other subscriptions.", - list_length(publist), pubnames->data), - errhint("Verify that initial data copied from the publisher tables did not come from other origins.")); + errdetail_plural("The subscription subscribes to a publication (%s) that contains sequences that are written to by other subscriptions.", + "The subscription subscribes to publications (%s) that contain sequences that are written to by other subscriptions.", + list_length(publist), pubnames.data), + errhint("Verify that initial data copied from the publisher sequences did not come from other origins.")); } ExecDropSingleTupleTableSlot(slot); @@ -2197,8 +2737,134 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, } /* - * Get the list of tables which belong to specified publications on the - * publisher connection. + * Determine whether retain_dead_tuples can be enabled based on the + * publisher's status. + * + * This option is disallowed if the publisher is running a version earlier + * than PG19, or if the publisher is in recovery (i.e., it is a standby + * server). + * + * See comments atop worker.c for a detailed explanation.
+ */ +static void +check_pub_dead_tuple_retention(WalReceiverConn *wrconn) +{ + WalRcvExecResult *res; + Oid RecoveryRow[1] = {BOOLOID}; + TupleTableSlot *slot; + bool isnull; + bool remote_in_recovery; + + if (walrcv_server_version(wrconn) < 190000) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot enable retain_dead_tuples if the publisher is running a version earlier than PostgreSQL 19")); + + res = walrcv_exec(wrconn, "SELECT pg_is_in_recovery()", 1, RecoveryRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not obtain recovery progress from the publisher: %s", + res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + elog(ERROR, "failed to fetch tuple for the recovery progress"); + + remote_in_recovery = DatumGetBool(slot_getattr(slot, 1, &isnull)); + + if (remote_in_recovery) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot enable retain_dead_tuples if the publisher is in recovery")); + + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); +} + +/* + * Check if the subscriber's configuration is adequate to enable the + * retain_dead_tuples option. + * + * Issue an ERROR if the wal_level does not support the use of replication + * slots when check_guc is set to true. + * + * Issue a WARNING if track_commit_timestamp is not enabled when check_guc is + * set to true. This is only to highlight the importance of enabling + * track_commit_timestamp instead of catching all the misconfigurations, as + * this setting can be adjusted after subscription creation. Without it, the + * apply worker will simply skip conflict detection. + * + * Issue a WARNING or NOTICE if the subscription is disabled and the retention + * is active. Do not raise an ERROR since users can only modify + * retain_dead_tuples for disabled subscriptions. As long as the + * subscription is re-enabled promptly, it will not pose issues. + * + * Issue a NOTICE to inform users that max_retention_duration is + * ineffective when retain_dead_tuples is disabled for a subscription. An ERROR + * is not issued because setting max_retention_duration causes no harm, + * even when it is ineffective. + */ +void +CheckSubDeadTupleRetention(bool check_guc, bool sub_disabled, + int elevel_for_sub_disabled, + bool retain_dead_tuples, bool retention_active, + bool max_retention_set) +{ + Assert(elevel_for_sub_disabled == NOTICE || + elevel_for_sub_disabled == WARNING); + + if (retain_dead_tuples) + { + if (check_guc && wal_level < WAL_LEVEL_REPLICA) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("\"wal_level\" is insufficient to create the replication slot required by retain_dead_tuples"), + errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")); + + if (check_guc && !track_commit_timestamp) + ereport(WARNING, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("commit timestamp and origin data required for detecting conflicts won't be retained"), + errhint("Consider setting \"%s\" to true.", + "track_commit_timestamp")); + + if (sub_disabled && retention_active) + ereport(elevel_for_sub_disabled, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("deleted rows to detect conflicts would not be removed until the subscription is enabled"), + (elevel_for_sub_disabled > NOTICE) + ?
errhint("Consider setting %s to false.", + "retain_dead_tuples") : 0); + } + else if (max_retention_set) + { + ereport(NOTICE, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("max_retention_duration is ineffective when retain_dead_tuples is disabled")); + } +} + +/* + * Return true iff 'rv' is a member of the list. + */ +static bool +list_member_rangevar(const List *list, RangeVar *rv) +{ + foreach_ptr(PublicationRelKind, relinfo, list) + { + if (equal(relinfo->rv, rv)) + return true; + } + + return false; +} + +/* + * Get the list of tables and sequences which belong to specified publications + * on the publisher connection. * * Note that we don't support the case where the column list is different for * the same table in different publications to avoid sending unwanted column @@ -2206,26 +2872,28 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, * list and row filter are specified for different publications. */ static List * -fetch_table_list(WalReceiverConn *wrconn, List *publications) +fetch_relation_list(WalReceiverConn *wrconn, List *publications) { WalRcvExecResult *res; StringInfoData cmd; TupleTableSlot *slot; - Oid tableRow[3] = {TEXTOID, TEXTOID, InvalidOid}; - List *tablelist = NIL; + Oid tableRow[4] = {TEXTOID, TEXTOID, CHAROID, InvalidOid}; + List *relationlist = NIL; int server_version = walrcv_server_version(wrconn); bool check_columnlist = (server_version >= 150000); - StringInfo pub_names = makeStringInfo(); + int column_count = check_columnlist ? 4 : 3; + StringInfoData pub_names; initStringInfo(&cmd); + initStringInfo(&pub_names); /* Build the pub_names comma-separated string. */ - GetPublicationsStr(publications, pub_names, true); + GetPublicationsStr(publications, &pub_names, true); - /* Get the list of tables from the publisher. */ + /* Get the list of relations from the publisher */ if (server_version >= 160000) { - tableRow[2] = INT2VECTOROID; + tableRow[3] = INT2VECTOROID; /* * From version 16, we allowed passing multiple publications to the @@ -2240,19 +2908,28 @@ fetch_table_list(WalReceiverConn *wrconn, List *publications) * to worry if different publications have specified them in a * different order. See pub_collist_validate. 
*/ - appendStringInfo(&cmd, "SELECT DISTINCT n.nspname, c.relname, gpt.attrs\n" - " FROM pg_class c\n" + appendStringInfo(&cmd, "SELECT DISTINCT n.nspname, c.relname, c.relkind, gpt.attrs\n" + " FROM pg_class c\n" " JOIN pg_namespace n ON n.oid = c.relnamespace\n" " JOIN ( SELECT (pg_get_publication_tables(VARIADIC array_agg(pubname::text))).*\n" " FROM pg_publication\n" " WHERE pubname IN ( %s )) AS gpt\n" " ON gpt.relid = c.oid\n", - pub_names->data); + pub_names.data); + + /* From version 19, inclusion of sequences in the target is supported */ + if (server_version >= 190000) + appendStringInfo(&cmd, + "UNION ALL\n" + " SELECT DISTINCT s.schemaname, s.sequencename, " CppAsString2(RELKIND_SEQUENCE) "::\"char\" AS relkind, NULL::int2vector AS attrs\n" + " FROM pg_catalog.pg_publication_sequences s\n" + " WHERE s.pubname IN ( %s )", + pub_names.data); } else { - tableRow[2] = NAMEARRAYOID; - appendStringInfoString(&cmd, "SELECT DISTINCT t.schemaname, t.tablename \n"); + tableRow[3] = NAMEARRAYOID; + appendStringInfoString(&cmd, "SELECT DISTINCT t.schemaname, t.tablename, " CppAsString2(RELKIND_RELATION) "::\"char\" AS relkind \n"); /* Get column lists for each relation if the publisher supports it */ if (check_columnlist) @@ -2260,12 +2937,12 @@ fetch_table_list(WalReceiverConn *wrconn, List *publications) appendStringInfo(&cmd, "FROM pg_catalog.pg_publication_tables t\n" " WHERE t.pubname IN ( %s )", - pub_names->data); + pub_names.data); } - destroyStringInfo(pub_names); + pfree(pub_names.data); - res = walrcv_exec(wrconn, cmd.data, check_columnlist ? 3 : 2, tableRow); + res = walrcv_exec(wrconn, cmd.data, column_count, tableRow); pfree(cmd.data); if (res->status != WALRCV_OK_TUPLES) @@ -2281,22 +2958,28 @@ fetch_table_list(WalReceiverConn *wrconn, List *publications) char *nspname; char *relname; bool isnull; - RangeVar *rv; + char relkind; + PublicationRelKind *relinfo = palloc_object(PublicationRelKind); nspname = TextDatumGetCString(slot_getattr(slot, 1, &isnull)); Assert(!isnull); relname = TextDatumGetCString(slot_getattr(slot, 2, &isnull)); Assert(!isnull); + relkind = DatumGetChar(slot_getattr(slot, 3, &isnull)); + Assert(!isnull); - rv = makeRangeVar(nspname, relname, -1); + relinfo->rv = makeRangeVar(nspname, relname, -1); + relinfo->relkind = relkind; - if (check_columnlist && list_member(tablelist, rv)) + if (relkind != RELKIND_SEQUENCE && + check_columnlist && + list_member_rangevar(relationlist, relinfo->rv)) ereport(ERROR, errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot use different column lists for table \"%s.%s\" in different publications", nspname, relname)); else - tablelist = lappend(tablelist, rv); + relationlist = lappend(relationlist, relinfo); ExecClearTuple(slot); } @@ -2304,7 +2987,7 @@ fetch_table_list(WalReceiverConn *wrconn, List *publications) walrcv_clear_result(res); - return tablelist; + return relationlist; } /* diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 54ad38247aa32..1d9565b09fcd7 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -42,6 +42,7 @@ #include "catalog/pg_foreign_table.h" #include "catalog/pg_inherits.h" #include "catalog/pg_largeobject.h" +#include "catalog/pg_largeobject_metadata.h" #include "catalog/pg_namespace.h" #include "catalog/pg_opclass.h" #include "catalog/pg_policy.h" @@ -430,8 +431,8 @@ static void AlterConstrUpdateConstraintEntry(ATAlterConstraint *cmdcon, Relation static ObjectAddress ATExecValidateConstraint(List **wqueue, Relation rel, 
char *constrName, bool recurse, bool recursing, LOCKMODE lockmode); -static void QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, - HeapTuple contuple, LOCKMODE lockmode); +static void QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation fkrel, + Oid pkrelid, HeapTuple contuple, LOCKMODE lockmode); static void QueueCheckConstraintValidation(List **wqueue, Relation conrel, Relation rel, char *constrName, HeapTuple contuple, bool recurse, bool recursing, LOCKMODE lockmode); @@ -721,7 +722,6 @@ static void QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, List *partConstraint, bool validate_default); static void CloneRowTriggersToPartition(Relation parent, Relation partition); -static void DetachAddConstraintIfNeeded(List **wqueue, Relation partRel); static void DropClonedTriggersFromPartition(Oid partitionId); static ObjectAddress ATExecDetachPartition(List **wqueue, AlteredTableInfo *tab, Relation rel, RangeVar *name, @@ -740,6 +740,11 @@ static void ATDetachCheckNoForeignKeyRefs(Relation partition); static char GetAttributeCompression(Oid atttypid, const char *compression); static char GetAttributeStorage(Oid atttypid, const char *storagemode); +static void ATExecMergePartitions(List **wqueue, AlteredTableInfo *tab, Relation rel, + PartitionCmd *cmd, AlterTableUtilityContext *context); +static void ATExecSplitPartition(List **wqueue, AlteredTableInfo *tab, + Relation rel, PartitionCmd *cmd, + AlterTableUtilityContext *context); /* ---------------------------------------------------------------- * DefineRelation @@ -999,7 +1004,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, Assert(colDef->cooked_default == NULL); - rawEnt = (RawColumnDefault *) palloc(sizeof(RawColumnDefault)); + rawEnt = palloc_object(RawColumnDefault); rawEnt->attnum = attnum; rawEnt->raw_default = colDef->raw_default; rawEnt->generated = colDef->generated; @@ -1009,7 +1014,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, { CookedConstraint *cooked; - cooked = (CookedConstraint *) palloc(sizeof(CookedConstraint)); + cooked = palloc_object(CookedConstraint); cooked->contype = CONSTR_DEFAULT; cooked->conoid = InvalidOid; /* until created */ cooked->name = NULL; @@ -2294,7 +2299,7 @@ ExecuteTruncateGuts(List *explicit_rels, xl_heap_truncate xlrec; int i = 0; - /* should only get here if wal_level >= logical */ + /* should only get here if effective_wal_level is 'logical' */ Assert(XLogLogicalInfoActive()); logrelids = palloc(list_length(relids_logged) * sizeof(Oid)); @@ -2389,12 +2394,15 @@ truncate_check_rel(Oid relid, Form_pg_class reltuple) /* * Most system catalogs can't be truncated at all, or at least not unless * allow_system_table_mods=on. As an exception, however, we allow - * pg_largeobject to be truncated as part of pg_upgrade, because we need - * to change its relfilenode to match the old cluster, and allowing a - * TRUNCATE command to be executed is the easiest way of doing that. + * pg_largeobject and pg_largeobject_metadata to be truncated as part of + * pg_upgrade, because we need to change their relfilenodes to match the + * old cluster, and allowing a TRUNCATE command to be executed is the easiest + * way of doing that.
*/ if (!allowSystemTableMods && IsSystemClass(relid, reltuple) - && (!IsBinaryUpgrade || relid != LargeObjectRelationId)) + && (!IsBinaryUpgrade || + (relid != LargeObjectRelationId && + relid != LargeObjectMetadataRelationId))) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied: \"%s\" is a system catalog", @@ -2711,8 +2719,7 @@ MergeAttributes(List *columns, const List *supers, char relpersistence, RelationGetRelationName(relation)))); /* If existing rel is temp, it must belong to this session */ - if (relation->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !relation->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(relation)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg(!is_partition @@ -4834,6 +4841,11 @@ AlterTableGetLockLevel(List *cmds) cmd_lockmode = ShareUpdateExclusiveLock; break; + case AT_MergePartitions: + case AT_SplitPartition: + cmd_lockmode = AccessExclusiveLock; + break; + default: /* oops */ elog(ERROR, "unrecognized alter table type: %d", (int) cmd->subtype); @@ -5269,6 +5281,12 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, /* No command-specific prep needed */ pass = AT_PASS_MISC; break; + case AT_MergePartitions: + case AT_SplitPartition: + ATSimplePermissions(cmd->subtype, rel, ATT_PARTITIONED_TABLE); + /* No command-specific prep needed */ + pass = AT_PASS_MISC; + break; default: /* oops */ elog(ERROR, "unrecognized alter table type: %d", (int) cmd->subtype); @@ -5665,6 +5683,22 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, case AT_DetachPartitionFinalize: address = ATExecDetachPartitionFinalize(rel, ((PartitionCmd *) cmd->def)->name); break; + case AT_MergePartitions: + cmd = ATParseTransformCmd(wqueue, tab, rel, cmd, false, lockmode, + cur_pass, context); + Assert(cmd != NULL); + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + ATExecMergePartitions(wqueue, tab, rel, (PartitionCmd *) cmd->def, + context); + break; + case AT_SplitPartition: + cmd = ATParseTransformCmd(wqueue, tab, rel, cmd, false, lockmode, + cur_pass, context); + Assert(cmd != NULL); + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + ATExecSplitPartition(wqueue, tab, rel, (PartitionCmd *) cmd->def, + context); + break; default: /* oops */ elog(ERROR, "unrecognized alter table type: %d", (int) cmd->subtype); @@ -6203,7 +6237,7 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap) NewColumnValue *ex = lfirst(l); /* expr already planned */ - ex->exprstate = ExecInitExpr((Expr *) ex->expr, NULL); + ex->exprstate = ExecInitExpr(ex->expr, NULL); } notnull_attrs = notnull_virtual_attrs = NIL; @@ -6566,7 +6600,7 @@ ATGetQueueEntry(List **wqueue, Relation rel) * Not there, so add it. Note that we make a copy of the relation's * existing descriptor before anything interesting can happen to it. */ - tab = (AlteredTableInfo *) palloc0(sizeof(AlteredTableInfo)); + tab = palloc0_object(AlteredTableInfo); tab->relid = relid; tab->rel = NULL; /* set later */ tab->relkind = rel->rd_rel->relkind; @@ -6705,6 +6739,10 @@ alter_table_type_to_string(AlterTableType cmdtype) return "DETACH PARTITION"; case AT_DetachPartitionFinalize: return "DETACH PARTITION ... FINALIZE"; + case AT_MergePartitions: + return "MERGE PARTITIONS"; + case AT_SplitPartition: + return "SPLIT PARTITION"; case AT_AddIdentity: return "ALTER COLUMN ... 
ADD IDENTITY"; case AT_SetIdentity: @@ -7374,7 +7412,7 @@ ATExecAddColumn(List **wqueue, AlteredTableInfo *tab, Relation rel, /* make sure datatype is legal for a column */ CheckAttributeType(NameStr(attribute->attname), attribute->atttypid, attribute->attcollation, list_make1_oid(rel->rd_rel->reltype), - 0); + (attribute->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL ? CHKATYPE_IS_VIRTUAL : 0)); InsertPgAttributeTuples(attrdesc, tupdesc, myrelid, NULL, NULL); @@ -7404,7 +7442,7 @@ ATExecAddColumn(List **wqueue, AlteredTableInfo *tab, Relation rel, { RawColumnDefault *rawEnt; - rawEnt = (RawColumnDefault *) palloc(sizeof(RawColumnDefault)); + rawEnt = palloc_object(RawColumnDefault); rawEnt->attnum = attribute->attnum; rawEnt->raw_default = copyObject(colDef->raw_default); rawEnt->generated = colDef->generated; @@ -7505,7 +7543,7 @@ ATExecAddColumn(List **wqueue, AlteredTableInfo *tab, Relation rel, defval = expression_planner(defval); /* Add the new default to the newvals list */ - newval = (NewColumnValue *) palloc0(sizeof(NewColumnValue)); + newval = palloc0_object(NewColumnValue); newval->attnum = attribute->attnum; newval->expr = defval; newval->is_generated = (colDef->generated != '\0'); @@ -8174,7 +8212,7 @@ ATExecColumnDefault(Relation rel, const char *colName, /* SET DEFAULT */ RawColumnDefault *rawEnt; - rawEnt = (RawColumnDefault *) palloc(sizeof(RawColumnDefault)); + rawEnt = palloc_object(RawColumnDefault); rawEnt->attnum = attnum; rawEnt->raw_default = newDefault; rawEnt->generated = '\0'; @@ -8279,6 +8317,31 @@ ATExecAddIdentity(Relation rel, const char *colName, errmsg("column \"%s\" of relation \"%s\" must be declared NOT NULL before identity can be added", colName, RelationGetRelationName(rel)))); + /* + * On the other hand, if a not-null constraint exists, then verify that + * it's compatible. + */ + if (attTup->attnotnull) + { + HeapTuple contup; + Form_pg_constraint conForm; + + contup = findNotNullConstraintAttnum(RelationGetRelid(rel), + attnum); + if (!HeapTupleIsValid(contup)) + elog(ERROR, "cache lookup failed for not-null constraint on column \"%s\" of relation \"%s\"", + colName, RelationGetRelationName(rel)); + + conForm = (Form_pg_constraint) GETSTRUCT(contup); + if (!conForm->convalidated) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("incompatible NOT VALID constraint \"%s\" on relation \"%s\"", + NameStr(conForm->conname), RelationGetRelationName(rel)), + errhint("You might need to validate it using %s.", + "ALTER TABLE ... 
VALIDATE CONSTRAINT")); + } + if (attTup->attidentity) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -8609,7 +8672,7 @@ ATExecSetExpression(AlteredTableInfo *tab, Relation rel, const char *colName, rel->rd_att->constr && rel->rd_att->constr->num_check > 0) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns on tables with check constraints"), + errmsg("ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns in tables with check constraints"), errdetail("Column \"%s\" of relation \"%s\" is a virtual generated column.", colName, RelationGetRelationName(rel)))); @@ -8627,7 +8690,7 @@ ATExecSetExpression(AlteredTableInfo *tab, Relation rel, const char *colName, GetRelationPublications(RelationGetRelid(rel)) != NIL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns on tables that are part of a publication"), + errmsg("ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns in tables that are part of a publication"), errdetail("Column \"%s\" of relation \"%s\" is a virtual generated column.", colName, RelationGetRelationName(rel)))); @@ -8677,7 +8740,7 @@ ATExecSetExpression(AlteredTableInfo *tab, Relation rel, const char *colName, false, false); /* Prepare to store the new expression, in the catalogs */ - rawEnt = (RawColumnDefault *) palloc(sizeof(RawColumnDefault)); + rawEnt = palloc_object(RawColumnDefault); rawEnt->attnum = attnum; rawEnt->raw_default = newExpr; rawEnt->generated = attgenerated; @@ -8694,7 +8757,7 @@ ATExecSetExpression(AlteredTableInfo *tab, Relation rel, const char *colName, /* Prepare for table rewrite */ defval = (Expr *) build_column_default(rel, attnum); - newval = (NewColumnValue *) palloc0(sizeof(NewColumnValue)); + newval = palloc0_object(NewColumnValue); newval->attnum = attnum; newval->expr = expression_planner(defval); newval->is_generated = true; @@ -8986,7 +9049,7 @@ ATExecSetStatistics(Relation rel, const char *colName, int16 colNum, Node *newVa memset(repl_null, false, sizeof(repl_null)); memset(repl_repl, false, sizeof(repl_repl)); if (!newtarget_default) - repl_val[Anum_pg_attribute_attstattarget - 1] = newtarget; + repl_val[Anum_pg_attribute_attstattarget - 1] = Int16GetDatum(newtarget); else repl_null[Anum_pg_attribute_attstattarget - 1] = true; repl_repl[Anum_pg_attribute_attstattarget - 1] = true; @@ -9655,7 +9718,7 @@ ATExecAddStatistics(AlteredTableInfo *tab, Relation rel, /* The CreateStatsStmt has already been through transformStatsStmt */ Assert(stmt->transformed); - address = CreateStatistics(stmt); + address = CreateStatistics(stmt, !is_rebuild); return address; } @@ -9922,7 +9985,7 @@ ATAddCheckNNConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, { NewConstraint *newcon; - newcon = (NewConstraint *) palloc0(sizeof(NewConstraint)); + newcon = palloc0_object(NewConstraint); newcon->name = ccon->name; newcon->contype = ccon->contype; newcon->qual = ccon->expr; @@ -10189,7 +10252,7 @@ ATAddForeignKeyConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, if (pk_has_without_overlaps && !with_period) ereport(ERROR, errcode(ERRCODE_INVALID_FOREIGN_KEY), - errmsg("foreign key must use PERIOD when referencing a primary using WITHOUT OVERLAPS")); + errmsg("foreign key must use PERIOD when referencing a primary key using WITHOUT OVERLAPS")); /* * Now we can check permissions. 
@@ -10330,8 +10393,8 @@ ATAddForeignKeyConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, for_overlaps ? errmsg("could not identify an overlaps operator for foreign key") : errmsg("could not identify an equality operator for foreign key"), - errdetail("Could not translate compare type %d for operator family \"%s\", input type %s, access method \"%s\".", - cmptype, get_opfamily_name(opfamily, false), format_type_be(opcintype), get_am_name(amid))); + errdetail("Could not translate compare type %d for operator family \"%s\" of access method \"%s\".", + cmptype, get_opfamily_name(opfamily, false), get_am_name(amid))); /* * There had better be a primary equality operator for the index. @@ -10919,7 +10982,7 @@ addFkRecurseReferenced(Constraint *fkconstraint, Relation rel, false); if (map) { - mapped_pkattnum = palloc(sizeof(AttrNumber) * numfks); + mapped_pkattnum = palloc_array(AttrNumber, numfks); for (int j = 0; j < numfks; j++) mapped_pkattnum[j] = map->attnums[pkattnum[j] - 1]; } @@ -11054,7 +11117,7 @@ addFkRecurseReferencing(List **wqueue, Constraint *fkconstraint, Relation rel, tab = ATGetQueueEntry(wqueue, rel); - newcon = (NewConstraint *) palloc0(sizeof(NewConstraint)); + newcon = palloc0_object(NewConstraint); newcon->name = get_constraint_name(parentConstr); newcon->contype = CONSTR_FOREIGN; newcon->refrelid = RelationGetRelid(pkrel); @@ -11858,6 +11921,7 @@ AttachPartitionForeignKey(List **wqueue, if (queueValidation) { Relation conrel; + Oid confrelid; conrel = table_open(ConstraintRelationId, RowExclusiveLock); @@ -11865,9 +11929,11 @@ AttachPartitionForeignKey(List **wqueue, if (!HeapTupleIsValid(partcontup)) elog(ERROR, "cache lookup failed for constraint %u", partConstrOid); + confrelid = ((Form_pg_constraint) GETSTRUCT(partcontup))->confrelid; + /* Use the same lock as for AT_ValidateConstraint */ - QueueFKConstraintValidation(wqueue, conrel, partition, partcontup, - ShareUpdateExclusiveLock); + QueueFKConstraintValidation(wqueue, conrel, partition, confrelid, + partcontup, ShareUpdateExclusiveLock); ReleaseSysCache(partcontup); table_close(conrel, RowExclusiveLock); } @@ -12463,14 +12529,17 @@ ATExecAlterConstrEnforceability(List **wqueue, ATAlterConstraint *cmdcon, /* * Tell Phase 3 to check that the constraint is satisfied by existing - * rows. + * rows. Only applies to leaf partitions, and (for constraints that + * reference a partitioned table) only if this is not one of the + * pg_constraint rows that exist solely to support action triggers. 
*/ - if (rel->rd_rel->relkind == RELKIND_RELATION) + if (rel->rd_rel->relkind == RELKIND_RELATION && + currcon->confrelid == pkrelid) { AlteredTableInfo *tab; NewConstraint *newcon; - newcon = (NewConstraint *) palloc0(sizeof(NewConstraint)); + newcon = palloc0_object(NewConstraint); newcon->name = fkconstraint->conname; newcon->contype = CONSTR_FOREIGN; newcon->refrelid = currcon->confrelid; @@ -12907,8 +12976,9 @@ ATExecValidateConstraint(List **wqueue, Relation rel, char *constrName, con->contype != CONSTRAINT_NOTNULL) ereport(ERROR, errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("constraint \"%s\" of relation \"%s\" is not a foreign key, check, or not-null constraint", - constrName, RelationGetRelationName(rel))); + errmsg("cannot validate constraint \"%s\" of relation \"%s\"", + constrName, RelationGetRelationName(rel)), + errdetail("This operation is not supported for this type of constraint.")); if (!con->conenforced) ereport(ERROR, @@ -12919,7 +12989,8 @@ { if (con->contype == CONSTRAINT_FOREIGN) { - QueueFKConstraintValidation(wqueue, conrel, rel, tuple, lockmode); + QueueFKConstraintValidation(wqueue, conrel, rel, con->confrelid, + tuple, lockmode); } else if (con->contype == CONSTRAINT_CHECK) { @@ -12952,8 +13023,8 @@ * for the specified relation and all its children. */ static void -QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, - HeapTuple contuple, LOCKMODE lockmode) +QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation fkrel, + Oid pkrelid, HeapTuple contuple, LOCKMODE lockmode) { Form_pg_constraint con; AlteredTableInfo *tab; @@ -12964,7 +13035,17 @@ QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, Assert(con->contype == CONSTRAINT_FOREIGN); Assert(!con->convalidated); - if (rel->rd_rel->relkind == RELKIND_RELATION) + /* + * Add the validation to phase 3's queue; not needed for partitioned + * tables themselves, only for their partitions. + * + * When the referenced table (pkrelid) is partitioned, the referencing + * table (fkrel) has one pg_constraint row pointing to each partition + * thereof. These rows are there only to support action triggers; no + * table scan is needed for them, so skip them as well. + */ + if (fkrel->rd_rel->relkind == RELKIND_RELATION && + con->confrelid == pkrelid) { NewConstraint *newcon; Constraint *fkconstraint; @@ -12974,7 +13055,7 @@ /* for now this is all we need */ fkconstraint->conname = pstrdup(NameStr(con->conname)); - newcon = (NewConstraint *) palloc0(sizeof(NewConstraint)); + newcon = palloc0_object(NewConstraint); newcon->name = fkconstraint->conname; newcon->contype = CONSTR_FOREIGN; newcon->refrelid = con->confrelid; @@ -12983,15 +13064,16 @@ newcon->qual = (Node *) fkconstraint; /* Find or create work queue entry for this table */ - tab = ATGetQueueEntry(wqueue, rel); + tab = ATGetQueueEntry(wqueue, fkrel); tab->constraints = lappend(tab->constraints, newcon); } /* * If the table at either end of the constraint is partitioned, we need to - * recurse and handle every constraint that is a child of this constraint. + * recurse and handle every unvalidated constraint that is a child of this + * constraint.
*/ - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || + if (fkrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || get_rel_relkind(con->confrelid) == RELKIND_PARTITIONED_TABLE) { ScanKeyData pkey; @@ -13023,8 +13105,12 @@ childrel = table_open(childcon->conrelid, lockmode); - QueueFKConstraintValidation(wqueue, conrel, childrel, childtup, - lockmode); + /* + * Note that pkrelid must be passed as-is during recursion, + * as it is required to identify the root referenced table. + */ + QueueFKConstraintValidation(wqueue, conrel, childrel, pkrelid, + childtup, lockmode); table_close(childrel, NoLock); } @@ -13032,7 +13118,11 @@ } /* - * Now update the catalog, while we have the door open. + * Now mark the pg_constraint row as validated (even for rows we didn't + * check, notably the ones for partitions on the referenced side). + * + * We rely on transaction abort to roll back this change if phase 3 + * ultimately finds violating rows. This is a bit ugly. */ copyTuple = heap_copytuple(contuple); copy_con = (Form_pg_constraint) GETSTRUCT(copyTuple); @@ -13113,7 +13203,7 @@ QueueCheckConstraintValidation(List **wqueue, Relation conrel, Relation rel, } /* Queue validation for phase 3 */ - newcon = (NewConstraint *) palloc0(sizeof(NewConstraint)); + newcon = palloc0_object(NewConstraint); newcon->name = constrName; newcon->contype = CONSTR_CHECK; newcon->refrelid = InvalidOid; @@ -14400,7 +14490,7 @@ ATPrepAlterColumnType(List **wqueue, /* make sure datatype is legal for a column */ CheckAttributeType(colName, targettype, targetcollid, list_make1_oid(rel->rd_rel->reltype), - 0); + (attTup->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL ? CHKATYPE_IS_VIRTUAL : 0)); if (attTup->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) { @@ -14458,6 +14548,9 @@ ATPrepAlterColumnType(List **wqueue, /* Fix collations after all else */ assign_expr_collations(pstate, transform); + /* Expand virtual generated columns in the expr. */ + transform = expand_generated_columns_in_expr(transform, rel, 1); + /* Plan the expr now so we can accurately assess the need to rewrite. */ transform = (Node *) expression_planner((Expr *) transform); @@ -14465,7 +14558,7 @@ * Add a work queue item to make ATRewriteTable update the column * contents. */ - newval = (NewColumnValue *) palloc0(sizeof(NewColumnValue)); + newval = palloc0_object(NewColumnValue); newval->attnum = attnum; newval->expr = (Expr *) transform; newval->is_generated = false; @@ -15385,9 +15478,12 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) /* * Re-parse the index and constraint definitions, and attach them to the * appropriate work queue entries. We do this before dropping because in - * the case of a FOREIGN KEY constraint, we might not yet have exclusive - * lock on the table the constraint is attached to, and we need to get - * that before reparsing/dropping. + * the case of a constraint on another table, we might not yet have + * exclusive lock on the table the constraint is attached to, and we need + * to get that before reparsing/dropping. (That's possible at least for + * FOREIGN KEY, CHECK, and EXCLUSION constraints; in non-FK cases it + * requires a dependency on the target table's composite type in the other + * table's constraint expressions.)
* * We can't rely on the output of deparsing to tell us which relation to * operate on, because concurrent activity might have made the name @@ -15403,7 +15499,6 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Form_pg_constraint con; Oid relid; Oid confrelid; - char contype; bool conislocal; tup = SearchSysCache1(CONSTROID, ObjectIdGetDatum(oldId)); @@ -15420,7 +15515,6 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) elog(ERROR, "could not identify relation associated with constraint %u", oldId); } confrelid = con->confrelid; - contype = con->contype; conislocal = con->conislocal; ReleaseSysCache(tup); @@ -15438,12 +15532,12 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) continue; /* - * When rebuilding an FK constraint that references the table we're - * modifying, we might not yet have any lock on the FK's table, so get - * one now. We'll need AccessExclusiveLock for the DROP CONSTRAINT - * step, so there's no value in asking for anything weaker. + * When rebuilding another table's constraint that references the + * table we're modifying, we might not yet have any lock on the other + * table, so get one now. We'll need AccessExclusiveLock for the DROP + * CONSTRAINT step, so there's no value in asking for anything weaker. */ - if (relid != tab->relid && contype == CONSTRAINT_FOREIGN) + if (relid != tab->relid) LockRelationOid(relid, AccessExclusiveLock); ATPostAlterTypeParse(oldId, relid, confrelid, @@ -15457,6 +15551,14 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Oid relid; relid = IndexGetRelation(oldId, false); + + /* + * As above, make sure we have lock on the index's table if it's not + * the same table. + */ + if (relid != tab->relid) + LockRelationOid(relid, AccessExclusiveLock); + ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); @@ -15473,6 +15575,20 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Oid relid; relid = StatisticsGetRelation(oldId, false); + + /* + * As above, make sure we have lock on the statistics object's table + * if it's not the same table. However, we take + * ShareUpdateExclusiveLock here, aligning with the lock level used in + * CreateStatistics and RemoveStatisticsById. + * + * CAUTION: this should be done after all cases that grab + * AccessExclusiveLock, else we risk causing deadlock due to needing + * to promote our table lock. 
+ */ + if (relid != tab->relid) + LockRelationOid(relid, ShareUpdateExclusiveLock); + ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); @@ -15696,7 +15812,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, { AlterDomainStmt *stmt = (AlterDomainStmt *) stm; - if (stmt->subtype == 'C') /* ADD CONSTRAINT */ + if (stmt->subtype == AD_AddConstraint) { Constraint *con = castNode(Constraint, stmt->def); AlterTableCmd *cmd = makeNode(AlterTableCmd); @@ -15939,7 +16055,7 @@ ATExecAlterColumnGenericOptions(Relation rel, options, fdw->fdwvalidator); - if (PointerIsValid(DatumGetPointer(datum))) + if (DatumGetPointer(datum) != NULL) repl_val[Anum_pg_attribute_attfdwoptions - 1] = datum; else repl_null[Anum_pg_attribute_attfdwoptions - 1] = true; @@ -17199,15 +17315,13 @@ ATExecAddInherit(Relation child_rel, RangeVar *parent, LOCKMODE lockmode) RelationGetRelationName(parent_rel)))); /* If parent rel is temp, it must belong to this session */ - if (parent_rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !parent_rel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(parent_rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit from temporary relation of another session"))); /* Ditto for the child */ - if (child_rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !child_rel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(child_rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit to temporary relation of another session"))); @@ -18619,7 +18733,7 @@ ATExecGenericOptions(Relation rel, List *options) options, fdw->fdwvalidator); - if (PointerIsValid(DatumGetPointer(datum))) + if (DatumGetPointer(datum) != NULL) repl_val[Anum_pg_foreign_table_ftoptions - 1] = datum; else repl_null[Anum_pg_foreign_table_ftoptions - 1] = true; @@ -19184,7 +19298,7 @@ register_on_commit_action(Oid relid, OnCommitAction action) oldcxt = MemoryContextSwitchTo(CacheMemoryContext); - oc = (OnCommitItem *) palloc(sizeof(OnCommitItem)); + oc = palloc_object(OnCommitItem); oc->relid = relid; oc->oncommit = action; oc->creating_subid = GetCurrentSubTransactionId(); @@ -19757,6 +19871,8 @@ ComputePartitionAttrs(ParseState *pstate, Relation rel, List *partParams, AttrNu /* Expression */ Node *expr = pelem->expr; char partattname[16]; + Bitmapset *expr_attrs = NULL; + int i; Assert(expr != NULL); atttype = exprType(expr); @@ -19780,43 +19896,36 @@ ComputePartitionAttrs(ParseState *pstate, Relation rel, List *partParams, AttrNu while (IsA(expr, CollateExpr)) expr = (Node *) ((CollateExpr *) expr)->arg; - if (IsA(expr, Var) && - ((Var *) expr)->varattno > 0) + /* + * Examine all the columns in the partition key expression. When + * the whole-row reference is present, examine all the columns of + * the partitioned table. + */ + pull_varattnos(expr, 1, &expr_attrs); + if (bms_is_member(0 - FirstLowInvalidHeapAttributeNumber, expr_attrs)) { - /* - * User wrote "(column)" or "(column COLLATE something)". - * Treat it like simple attribute anyway. 
- */ - partattrs[attn] = ((Var *) expr)->varattno; + expr_attrs = bms_add_range(expr_attrs, + 1 - FirstLowInvalidHeapAttributeNumber, + RelationGetNumberOfAttributes(rel) - FirstLowInvalidHeapAttributeNumber); + expr_attrs = bms_del_member(expr_attrs, 0 - FirstLowInvalidHeapAttributeNumber); } - else - { - Bitmapset *expr_attrs = NULL; - int i; - partattrs[attn] = 0; /* marks the column as expression */ - *partexprs = lappend(*partexprs, expr); + i = -1; + while ((i = bms_next_member(expr_attrs, i)) >= 0) + { + AttrNumber attno = i + FirstLowInvalidHeapAttributeNumber; - /* - * transformPartitionSpec() should have already rejected - * subqueries, aggregates, window functions, and SRFs, based - * on the EXPR_KIND_ for partition expressions. - */ + Assert(attno != 0); /* * Cannot allow system column references, since that would * make partition routing impossible: their values won't be * known yet when we need to do that. */ - pull_varattnos(expr, 1, &expr_attrs); - for (i = FirstLowInvalidHeapAttributeNumber; i < 0; i++) - { - if (bms_is_member(i - FirstLowInvalidHeapAttributeNumber, - expr_attrs)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("partition key expressions cannot contain system column references"))); - } + if (attno < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("partition key expressions cannot contain system column references"))); /* * Stored generated columns cannot work: They are computed @@ -19826,20 +19935,35 @@ ComputePartitionAttrs(ParseState *pstate, Relation rel, List *partParams, AttrNu * SET EXPRESSION would need to check whether the column is * used in partition keys). Seems safer to prohibit for now. */ - i = -1; - while ((i = bms_next_member(expr_attrs, i)) >= 0) - { - AttrNumber attno = i + FirstLowInvalidHeapAttributeNumber; + if (TupleDescAttr(RelationGetDescr(rel), attno - 1)->attgenerated) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("cannot use generated column in partition key"), + errdetail("Column \"%s\" is a generated column.", + get_attname(RelationGetRelid(rel), attno, false)), + parser_errposition(pstate, pelem->location))); + } - if (attno > 0 && - TupleDescAttr(RelationGetDescr(rel), attno - 1)->attgenerated) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("cannot use generated column in partition key"), - errdetail("Column \"%s\" is a generated column.", - get_attname(RelationGetRelid(rel), attno, false)), - parser_errposition(pstate, pelem->location))); - } + if (IsA(expr, Var) && + ((Var *) expr)->varattno > 0) + { + + /* + * User wrote "(column)" or "(column COLLATE something)". + * Treat it like simple attribute anyway. + */ + partattrs[attn] = ((Var *) expr)->varattno; + } + else + { + partattrs[attn] = 0; /* marks the column as expression */ + *partexprs = lappend(*partexprs, expr); + + /* + * transformPartitionSpec() should have already rejected + * subqueries, aggregates, window functions, and SRFs, based + * on the EXPR_KIND_ for partition expressions. + */ /* * Preprocess the expression before checking for mutability. @@ -20143,6 +20267,40 @@ QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, } } +/* + * attachPartitionTable: attach a new partition to the partitioned table + * + * wqueue: the ALTER TABLE work queue; can be NULL when not running as part + * of an ALTER TABLE sequence. + * rel: partitioned relation; + * attachrel: relation of attached partition; + * bound: bounds of attached relation. 
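+ *
+ * Both relations must be suitably locked by the caller; in the ATTACH
+ * PARTITION, MERGE PARTITIONS and SPLIT PARTITION paths, attachrel is
+ * already locked with AccessExclusiveLock by this point.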
+ */ +static void +attachPartitionTable(List **wqueue, Relation rel, Relation attachrel, PartitionBoundSpec *bound) +{ + /* + * Create an inheritance; the relevant checks are performed inside the + * function. + */ + CreateInheritance(attachrel, rel, true); + + /* Update the pg_class entry. */ + StorePartitionBound(attachrel, rel, bound); + + /* Ensure there exists a correct set of indexes in the partition. */ + AttachPartitionEnsureIndexes(wqueue, rel, attachrel); + + /* and triggers */ + CloneRowTriggersToPartition(rel, attachrel); + + /* + * Clone foreign key constraints. Callee is responsible for setting up + * for phase 3 constraint verification. + */ + CloneForeignKeyConstraints(wqueue, rel, attachrel); +} + /* * ALTER TABLE ATTACH PARTITION FOR VALUES * @@ -20278,15 +20436,13 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd, RelationGetRelationName(rel)))); /* If the parent is temp, it must belong to this session */ - if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !rel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot attach as partition of temporary relation of another session"))); /* Ditto for the partition */ - if (attachrel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !attachrel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(attachrel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot attach temporary relation of another session as partition"))); @@ -20346,26 +20502,10 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd, check_new_partition_bound(RelationGetRelationName(attachrel), rel, cmd->bound, pstate); - /* OK to create inheritance. Rest of the checks performed there */ - CreateInheritance(attachrel, rel, true); - - /* Update the pg_class entry. */ - StorePartitionBound(attachrel, rel, cmd->bound); - - /* Ensure there exists a correct set of indexes in the partition. */ - AttachPartitionEnsureIndexes(wqueue, rel, attachrel); - - /* and triggers */ - CloneRowTriggersToPartition(rel, attachrel); - - /* - * Clone foreign key constraints. Callee is responsible for setting up - * for phase 3 constraint verification. - */ - CloneForeignKeyConstraints(wqueue, rel, attachrel); + attachPartitionTable(wqueue, rel, attachrel, cmd->bound); /* - * Generate partition constraint from the partition bound specification. + * Generate a partition constraint from the partition bound specification. * If the parent itself is a partition, make sure to include its * constraint as well. */ @@ -20489,8 +20629,8 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) idxes = RelationGetIndexList(rel); attachRelIdxs = RelationGetIndexList(attachrel); - attachrelIdxRels = palloc(sizeof(Relation) * list_length(attachRelIdxs)); - attachInfos = palloc(sizeof(IndexInfo *) * list_length(attachRelIdxs)); + attachrelIdxRels = palloc_array(Relation, list_length(attachRelIdxs)); + attachInfos = palloc_array(IndexInfo *, list_length(attachRelIdxs)); /* Build arrays of all existing indexes and their IndexInfos */ foreach_oid(cldIdxId, attachRelIdxs) @@ -20889,12 +21029,6 @@ ATExecDetachPartition(List **wqueue, AlteredTableInfo *tab, Relation rel, char *parentrelname; char *partrelname; - /* - * Add a new constraint to the partition being detached, which - * supplants the partition constraint (unless there is one already). 
- */ - DetachAddConstraintIfNeeded(wqueue, partRel); - /* * We're almost done now; the only traces that remain are the * pg_inherits tuple and the partition's relpartbounds. Before we can @@ -20964,9 +21098,17 @@ ATExecDetachPartition(List **wqueue, AlteredTableInfo *tab, Relation rel, tab->rel = rel; } + /* + * Detaching the partition might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + /* Do the final part of detaching */ DetachPartitionFinalize(rel, partRel, concurrent, defaultPartOid); + PopActiveSnapshot(); + ObjectAddressSet(address, RelationRelationId, RelationGetRelid(partRel)); /* keep our lock until commit */ @@ -21343,49 +21485,6 @@ ATExecDetachPartitionFinalize(Relation rel, RangeVar *name) return address; } -/* - * DetachAddConstraintIfNeeded - * Subroutine for ATExecDetachPartition. Create a constraint that - * takes the place of the partition constraint, but avoid creating - * a dupe if a constraint already exists which implies the needed - * constraint. - */ -static void -DetachAddConstraintIfNeeded(List **wqueue, Relation partRel) -{ - List *constraintExpr; - - constraintExpr = RelationGetPartitionQual(partRel); - constraintExpr = (List *) eval_const_expressions(NULL, (Node *) constraintExpr); - - /* - * Avoid adding a new constraint if the needed constraint is implied by an - * existing constraint - */ - if (!PartConstraintImpliedByRelConstraint(partRel, constraintExpr)) - { - AlteredTableInfo *tab; - Constraint *n; - - tab = ATGetQueueEntry(wqueue, partRel); - - /* Add constraint on partition, equivalent to the partition constraint */ - n = makeNode(Constraint); - n->contype = CONSTR_CHECK; - n->conname = NULL; - n->location = -1; - n->is_no_inherit = false; - n->raw_expr = NULL; - n->cooked_expr = nodeToString(make_ands_explicit(constraintExpr)); - n->is_enforced = true; - n->initially_valid = true; - n->skip_validation = true; - /* It's a re-add, since it nominally already exists */ - ATAddCheckNNConstraint(wqueue, tab, partRel, n, - true, false, true, ShareUpdateExclusiveLock); - } -} - /* * DropClonedTriggersFromPartition * subroutine for ATExecDetachPartition to remove any triggers that were @@ -21694,7 +21793,8 @@ refuseDupeIndexAttach(Relation parentIdx, Relation partIdx, Relation partitionTb errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", RelationGetRelationName(partIdx), RelationGetRelationName(parentIdx)), - errdetail("Another index is already attached for partition \"%s\".", + errdetail("Another index \"%s\" is already attached for partition \"%s\".", + get_rel_name(existingIdx), RelationGetRelationName(partitionTbl)))); } @@ -22001,3 +22101,1239 @@ GetAttributeStorage(Oid atttypid, const char *storagemode) return cstorage; } + +/* + * buildExpressionExecutionStates: build the needed expression execution states + * for new partition (newPartRel) checks and initialize expressions for + * generated columns. All expressions should be created in "tab" + * (AlteredTableInfo structure). + */ +static void +buildExpressionExecutionStates(AlteredTableInfo *tab, Relation newPartRel, EState *estate) +{ + /* + * Build the needed expression execution states. Here, we expect only NOT + * NULL and CHECK constraint. + */ + foreach_ptr(NewConstraint, con, tab->constraints) + { + switch (con->contype) + { + case CONSTR_CHECK: + + /* + * We already expanded virtual expression in + * createTableConstraints. 
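+				 * (That is, expand_generated_columns_in_expr() has already
+				 * replaced any references to virtual generated columns with
+				 * their generation expressions.)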
+ */ + con->qualstate = ExecPrepareExpr((Expr *) con->qual, estate); + break; + case CONSTR_NOTNULL: + /* Nothing to do here. */ + break; + default: + elog(ERROR, "unrecognized constraint type: %d", + (int) con->contype); + } + } + + /* Expression already planned in createTableConstraints */ + foreach_ptr(NewColumnValue, ex, tab->newvals) + ex->exprstate = ExecInitExpr((Expr *) ex->expr, NULL); +} + +/* + * evaluateGeneratedExpressionsAndCheckConstraints: evaluate any generated + * expressions for "tab" (AlteredTableInfo structure) whose inputs come from + * the new tuple (insertslot) of the new partition (newPartRel). + */ +static void +evaluateGeneratedExpressionsAndCheckConstraints(AlteredTableInfo *tab, + Relation newPartRel, + TupleTableSlot *insertslot, + ExprContext *econtext) +{ + econtext->ecxt_scantuple = insertslot; + + foreach_ptr(NewColumnValue, ex, tab->newvals) + { + if (!ex->is_generated) + continue; + + insertslot->tts_values[ex->attnum - 1] + = ExecEvalExpr(ex->exprstate, + econtext, + &insertslot->tts_isnull[ex->attnum - 1]); + } + + foreach_ptr(NewConstraint, con, tab->constraints) + { + switch (con->contype) + { + case CONSTR_CHECK: + if (!ExecCheck(con->qualstate, econtext)) + ereport(ERROR, + errcode(ERRCODE_CHECK_VIOLATION), + errmsg("check constraint \"%s\" of relation \"%s\" is violated by some row", + con->name, RelationGetRelationName(newPartRel)), + errtableconstraint(newPartRel, con->name)); + break; + case CONSTR_NOTNULL: + case CONSTR_FOREIGN: + /* Nothing to do here */ + break; + default: + elog(ERROR, "unrecognized constraint type: %d", + (int) con->contype); + } + } +} + +/* + * getAttributesList: build a list of columns (ColumnDef) based on parent_rel + */ +static List * +getAttributesList(Relation parent_rel) +{ + AttrNumber parent_attno; + TupleDesc modelDesc; + List *colList = NIL; + + modelDesc = RelationGetDescr(parent_rel); + + for (parent_attno = 1; parent_attno <= modelDesc->natts; + parent_attno++) + { + Form_pg_attribute attribute = TupleDescAttr(modelDesc, + parent_attno - 1); + ColumnDef *def; + + /* Ignore dropped columns in the parent. */ + if (attribute->attisdropped) + continue; + + def = makeColumnDef(NameStr(attribute->attname), attribute->atttypid, + attribute->atttypmod, attribute->attcollation); + + def->is_not_null = attribute->attnotnull; + + /* Copy identity. */ + def->identity = attribute->attidentity; + + /* Copy attgenerated. */ + def->generated = attribute->attgenerated; + + def->storage = attribute->attstorage; + + /* Likewise, copy compression. */ + if (CompressionMethodIsValid(attribute->attcompression)) + def->compression = + pstrdup(GetCompressionMethodName(attribute->attcompression)); + else + def->compression = NULL; + + /* Add to column list. */ + colList = lappend(colList, def); + } + + return colList; +} + +/* + * createTableConstraints: + * create check constraints, default values, and generated values for newRel + * based on parent_rel. tab is pending-work queue for newRel, we may need it in + * MergePartitionsMoveRows. + */ +static void +createTableConstraints(List **wqueue, AlteredTableInfo *tab, + Relation parent_rel, Relation newRel) +{ + TupleDesc tupleDesc; + TupleConstr *constr; + AttrMap *attmap; + AttrNumber parent_attno; + int ccnum; + List *constraints = NIL; + List *cookedConstraints = NIL; + + tupleDesc = RelationGetDescr(parent_rel); + constr = tupleDesc->constr; + + if (!constr) + return; + + /* + * Construct a map from the parent relation's attnos to the child rel's. 
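+	 * (Afterwards, attmap->attnums[parent_attno - 1] gives the attribute
+	 * number in newRel that corresponds to the parent's parent_attno.)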
+	 * This re-checks type match, etc., although it shouldn't be possible to
+	 * have a failure since both tables are locked.
+	 */
+	attmap = build_attrmap_by_name(RelationGetDescr(newRel),
+								   tupleDesc,
+								   false);
+
+	/* Loop over the default values. */
+	for (parent_attno = 1; parent_attno <= tupleDesc->natts; parent_attno++)
+	{
+		Form_pg_attribute attribute = TupleDescAttr(tupleDesc,
+													parent_attno - 1);
+
+		/* Ignore dropped columns in the parent. */
+		if (attribute->attisdropped)
+			continue;
+
+		/* Copy the default, if there is one and it should be copied. */
+		if (attribute->atthasdef)
+		{
+			Node	   *this_default = NULL;
+			bool		found_whole_row;
+			AttrNumber	num;
+			Node	   *def;
+			NewColumnValue *newval;
+
+			if (attribute->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL)
+				this_default = build_generation_expression(parent_rel, attribute->attnum);
+			else
+			{
+				this_default = TupleDescGetDefault(tupleDesc, attribute->attnum);
+				if (this_default == NULL)
+					elog(ERROR, "default expression not found for attribute %d of relation \"%s\"",
+						 attribute->attnum, RelationGetRelationName(parent_rel));
+			}
+
+			num = attmap->attnums[parent_attno - 1];
+			def = map_variable_attnos(this_default, 1, 0, attmap, InvalidOid, &found_whole_row);
+
+			if (found_whole_row && attribute->attgenerated != '\0')
+				elog(ERROR, "cannot convert whole-row table reference");
+
+			/* Add a pre-cooked default expression. */
+			StoreAttrDefault(newRel, num, def, true);
+
+			/*
+			 * Stored generated column expressions in parent_rel might
+			 * reference the tableoid, whose value is of course not the same
+			 * for newRel as for parent_rel.  If so, these stored generated
+			 * columns require recomputation for newRel within
+			 * MergePartitionsMoveRows.
+			 */
+			if (attribute->attgenerated == ATTRIBUTE_GENERATED_STORED)
+			{
+				newval = palloc0_object(NewColumnValue);
+				newval->attnum = num;
+				newval->expr = expression_planner((Expr *) def);
+				newval->is_generated = (attribute->attgenerated != '\0');
+				tab->newvals = lappend(tab->newvals, newval);
+			}
+		}
+	}
+
+	/* Loop over the CHECK constraints. */
+	for (ccnum = 0; ccnum < constr->num_check; ccnum++)
+	{
+		char	   *ccname = constr->check[ccnum].ccname;
+		char	   *ccbin = constr->check[ccnum].ccbin;
+		bool		ccenforced = constr->check[ccnum].ccenforced;
+		bool		ccnoinherit = constr->check[ccnum].ccnoinherit;
+		bool		ccvalid = constr->check[ccnum].ccvalid;
+		Node	   *ccbin_node;
+		bool		found_whole_row;
+		Constraint *newconstr;
+
+		/*
+		 * A partitioned table cannot have a NO INHERIT check constraint
+		 * (see the StoreRelCheck function for details).
+		 */
+		Assert(!ccnoinherit);
+
+		ccbin_node = map_variable_attnos(stringToNode(ccbin),
+										 1, 0,
+										 attmap,
+										 InvalidOid, &found_whole_row);
+
+		/*
+		 * For the moment we have to reject whole-row variables (as CREATE
+		 * TABLE ... LIKE and inheritance do).
+		 */
+		if (found_whole_row)
+			elog(ERROR, "constraint \"%s\" contains a whole-row reference to table \"%s\"",
+				 ccname,
+				 RelationGetRelationName(parent_rel));
+
+		newconstr = makeNode(Constraint);
+		newconstr->contype = CONSTR_CHECK;
+		newconstr->conname = pstrdup(ccname);
+		newconstr->deferrable = false;
+		newconstr->initdeferred = false;
+		newconstr->is_enforced = ccenforced;
+		newconstr->skip_validation = !ccvalid;
+		newconstr->initially_valid = ccvalid;
+		newconstr->is_no_inherit = ccnoinherit;
+		newconstr->raw_expr = NULL;
+		newconstr->cooked_expr = nodeToString(ccbin_node);
+		newconstr->location = -1;
+		constraints = lappend(constraints, newconstr);
+	}
+
+	/* Install all CHECK constraints. */
+	cookedConstraints = AddRelationNewConstraints(newRel, NIL, constraints,
+												  false, true, true, NULL);
+
+	/* Make the additional catalog changes visible. */
+	CommandCounterIncrement();
+
+	/*
+	 * A check constraint expression of parent_rel may reference the
+	 * tableoid, whose value is different in newRel; in that case,
+	 * MergePartitionsMoveRows must evaluate the check constraint again for
+	 * newRel.  We detect whether a check constraint references the tableoid
+	 * via pull_varattnos.
+	 */
+	foreach_ptr(CookedConstraint, ccon, cookedConstraints)
+	{
+		if (!ccon->skip_validation)
+		{
+			Node	   *qual;
+			Bitmapset  *attnums = NULL;
+
+			Assert(ccon->contype == CONSTR_CHECK);
+			qual = expand_generated_columns_in_expr(ccon->expr, newRel, 1);
+			pull_varattnos(qual, 1, &attnums);
+
+			/*
+			 * Add a recheck only if the constraint references the tableoid
+			 * (TableOidAttributeNumber).
+			 */
+			if (bms_is_member(TableOidAttributeNumber - FirstLowInvalidHeapAttributeNumber,
+							  attnums))
+			{
+				NewConstraint *newcon;
+
+				newcon = palloc0_object(NewConstraint);
+				newcon->name = ccon->name;
+				newcon->contype = CONSTR_CHECK;
+				newcon->qual = qual;
+
+				tab->constraints = lappend(tab->constraints, newcon);
+			}
+		}
+	}
+
+	/* Don't need the cookedConstraints anymore. */
+	list_free_deep(cookedConstraints);
+
+	/* Reproduce the not-null constraints. */
+	if (constr->has_not_null)
+	{
+		List	   *nnconstraints;
+
+		/*
+		 * The "include_noinh" argument is false because a partitioned table
+		 * can't have a NO INHERIT constraint.
+		 */
+		nnconstraints = RelationGetNotNullConstraints(RelationGetRelid(parent_rel),
+													  false, false);
+
+		Assert(list_length(nnconstraints) > 0);
+
+		/*
+		 * We already set pg_attribute.attnotnull in createPartitionTable, so
+		 * there is no need to call set_attnotnull again.
+		 */
+		AddRelationNewConstraints(newRel, NIL, nnconstraints, false, true, true, NULL);
+	}
+}
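A note on the tableoid test above: pull_varattnos() collects column references offset by FirstLowInvalidHeapAttributeNumber, so that system columns such as tableoid map to valid (non-negative) bitmapset members. A minimal sketch of the idea, as a hypothetical helper that is not part of the patch:

	static bool
	expr_references_tableoid(Node *expr, Index varno)
	{
		Bitmapset  *attnums = NULL;

		/* Collect all column references in expr, offset to be non-negative. */
		pull_varattnos(expr, varno, &attnums);

		return bms_is_member(TableOidAttributeNumber - FirstLowInvalidHeapAttributeNumber,
							 attnums);
	}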
+/*
+ * createPartitionTable:
+ *
+ * Create a new partition (newPartName) for the partitioned table (parent_rel).
+ * ownerId is determined by the partition on which the operation is performed,
+ * so it is passed separately.  The new partition will inherit the access
+ * method and persistence from the parent table.
+ *
+ * Returns the created relation (locked in AccessExclusiveLock mode).
+ */
+static Relation
+createPartitionTable(List **wqueue, RangeVar *newPartName,
+					 Relation parent_rel, Oid ownerId)
+{
+	Relation	newRel;
+	Oid			newRelId;
+	Oid			existingRelid;
+	TupleDesc	descriptor;
+	List	   *colList = NIL;
+	Oid			relamId;
+	Oid			namespaceId;
+	AlteredTableInfo *new_partrel_tab;
+	Form_pg_class parent_relform = parent_rel->rd_rel;
+
+	/* If the parent rel is temp, it must belong to this session. */
+	if (RELATION_IS_OTHER_TEMP(parent_rel))
+		ereport(ERROR,
+				errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				errmsg("cannot create as partition of temporary relation of another session"));
+
+	/* Build the new partition's column list from the parent's columns. */
+	colList = getAttributesList(parent_rel);
+
+	/* Create a tuple descriptor from the relation schema. */
+	descriptor = BuildDescForRelation(colList);
+
+	/* Look up the access method for the new relation. */
+	relamId = (parent_relform->relam != InvalidOid) ? parent_relform->relam : HEAP_TABLE_AM_OID;
+
+	/* Look up the namespace in which we are supposed to create the relation.
*/ + namespaceId = + RangeVarGetAndCheckCreationNamespace(newPartName, NoLock, &existingRelid); + if (OidIsValid(existingRelid)) + ereport(ERROR, + errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" already exists", newPartName->relname)); + + /* + * We intended to create the partition with the same persistence as the + * parent table, but we still need to recheck because that might be + * affected by the search_path. If the parent is permanent, so must be + * all of its partitions. + */ + if (parent_relform->relpersistence != RELPERSISTENCE_TEMP && + newPartName->relpersistence == RELPERSISTENCE_TEMP) + ereport(ERROR, + errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot create a temporary relation as partition of permanent relation \"%s\"", + RelationGetRelationName(parent_rel))); + + /* Permanent rels cannot be partitions belonging to a temporary parent. */ + if (newPartName->relpersistence != RELPERSISTENCE_TEMP && + parent_relform->relpersistence == RELPERSISTENCE_TEMP) + ereport(ERROR, + errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot create a permanent relation as partition of temporary relation \"%s\"", + RelationGetRelationName(parent_rel))); + + /* Create the relation. */ + newRelId = heap_create_with_catalog(newPartName->relname, + namespaceId, + parent_relform->reltablespace, + InvalidOid, + InvalidOid, + InvalidOid, + ownerId, + relamId, + descriptor, + NIL, + RELKIND_RELATION, + newPartName->relpersistence, + false, + false, + ONCOMMIT_NOOP, + (Datum) 0, + true, + allowSystemTableMods, + true, + InvalidOid, + NULL); + + /* + * We must bump the command counter to make the newly-created relation + * tuple visible for opening. + */ + CommandCounterIncrement(); + + /* + * Open the new partition with no lock, because we already have an + * AccessExclusiveLock placed there after creation. + */ + newRel = table_open(newRelId, NoLock); + + /* Find or create a work queue entry for the newly created table. */ + new_partrel_tab = ATGetQueueEntry(wqueue, newRel); + + /* Create constraints, default values, and generated values. */ + createTableConstraints(wqueue, new_partrel_tab, parent_rel, newRel); + + /* + * Need to call CommandCounterIncrement, so a fresh relcache entry has + * newly installed constraint info. + */ + CommandCounterIncrement(); + + return newRel; +} + +/* + * MergePartitionsMoveRows: scan partitions to be merged (mergingPartitions) + * of the partitioned table and move rows into the new partition + * (newPartRel). We also verify check constraints against these rows. + */ +static void +MergePartitionsMoveRows(List **wqueue, List *mergingPartitions, Relation newPartRel) +{ + CommandId mycid; + EState *estate; + AlteredTableInfo *tab; + ListCell *ltab; + + /* The FSM is empty, so don't bother using it. */ + int ti_options = TABLE_INSERT_SKIP_FSM; + BulkInsertState bistate; /* state of bulk inserts for partition */ + TupleTableSlot *dstslot; + + /* Find the work queue entry for the new partition table: newPartRel. */ + tab = ATGetQueueEntry(wqueue, newPartRel); + + /* Generate the constraint and default execution states. */ + estate = CreateExecutorState(); + + buildExpressionExecutionStates(tab, newPartRel, estate); + + mycid = GetCurrentCommandId(true); + + /* Prepare a BulkInsertState for table_tuple_insert. */ + bistate = GetBulkInsertState(); + + /* Create the necessary tuple slot. 
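+	 * table_slot_create() returns a slot of whatever slot type the new
+	 * partition's table access method prefers.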
+	 */
+	dstslot = table_slot_create(newPartRel, NULL);
+
+	foreach_oid(merging_oid, mergingPartitions)
+	{
+		ExprContext *econtext;
+		TupleTableSlot *srcslot;
+		TupleConversionMap *tuple_map;
+		TableScanDesc scan;
+		MemoryContext oldCxt;
+		Snapshot	snapshot;
+		Relation	mergingPartition;
+
+		econtext = GetPerTupleExprContext(estate);
+
+		/*
+		 * The partition was already locked in the
+		 * transformPartitionCmdForMerge function.
+		 */
+		mergingPartition = table_open(merging_oid, NoLock);
+
+		/* Create a source tuple slot for the partition being merged. */
+		srcslot = table_slot_create(mergingPartition, NULL);
+
+		/*
+		 * Compute the map for moving the attributes of the merged partition
+		 * to the new partition.
+		 */
+		tuple_map = convert_tuples_by_name(RelationGetDescr(mergingPartition),
+										   RelationGetDescr(newPartRel));
+
+		/* Scan through the rows. */
+		snapshot = RegisterSnapshot(GetLatestSnapshot());
+		scan = table_beginscan(mergingPartition, snapshot, 0, NULL);
+
+		/*
+		 * Switch to per-tuple memory context and reset it for each tuple
+		 * produced, so we don't leak memory.
+		 */
+		oldCxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
+
+		while (table_scan_getnextslot(scan, ForwardScanDirection, srcslot))
+		{
+			TupleTableSlot *insertslot;
+
+			CHECK_FOR_INTERRUPTS();
+
+			if (tuple_map)
+			{
+				/* Need to use a map to copy the attributes. */
+				insertslot = execute_attr_map_slot(tuple_map->attrMap, srcslot, dstslot);
+			}
+			else
+			{
+				slot_getallattrs(srcslot);
+
+				/* Copy the attributes directly. */
+				insertslot = dstslot;
+
+				ExecClearTuple(insertslot);
+
+				memcpy(insertslot->tts_values, srcslot->tts_values,
+					   sizeof(Datum) * srcslot->tts_nvalid);
+				memcpy(insertslot->tts_isnull, srcslot->tts_isnull,
+					   sizeof(bool) * srcslot->tts_nvalid);
+
+				ExecStoreVirtualTuple(insertslot);
+			}
+
+			/*
+			 * Constraints and GENERATED expressions might reference the
+			 * tableoid column, so fill tts_tableOid with the desired value.
+			 * (We must do this each time, because it gets overwritten with
+			 * newrel's OID during storing.)
+			 */
+			insertslot->tts_tableOid = RelationGetRelid(newPartRel);
+
+			/*
+			 * Now, evaluate any generated expressions whose inputs come from
+			 * the new tuple.  We assume these columns won't reference each
+			 * other, so that there's no ordering dependency.
+			 */
+			evaluateGeneratedExpressionsAndCheckConstraints(tab, newPartRel,
+															insertslot, econtext);
+
+			/* Write the tuple out to the new relation. */
+			table_tuple_insert(newPartRel, insertslot, mycid,
+							   ti_options, bistate);
+
+			ResetExprContext(econtext);
+		}
+
+		MemoryContextSwitchTo(oldCxt);
+		table_endscan(scan);
+		UnregisterSnapshot(snapshot);
+
+		if (tuple_map)
+			free_conversion_map(tuple_map);
+
+		ExecDropSingleTupleTableSlot(srcslot);
+		table_close(mergingPartition, NoLock);
+	}
+
+	FreeExecutorState(estate);
+	ExecDropSingleTupleTableSlot(dstslot);
+	FreeBulkInsertState(bistate);
+
+	table_finish_bulk_insert(newPartRel, ti_options);
+
+	/*
+	 * We already processed newPartRel here, so it needs no further ALTER
+	 * TABLE processing; delete its work queue entry.
+	 */
+	foreach(ltab, *wqueue)
+	{
+		tab = (AlteredTableInfo *) lfirst(ltab);
+		if (tab->relid == RelationGetRelid(newPartRel))
+		{
+			*wqueue = list_delete_cell(*wqueue, ltab);
+			break;
+		}
+	}
+}
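The work-queue cleanup that closes MergePartitionsMoveRows() reappears almost verbatim in deleteSplitPartitionContext() further down; if it seems worth it, the pattern could be factored into a small helper along these lines (a hypothetical sketch, not part of the patch):

	static void
	drop_wqueue_entry(List **wqueue, Oid relid)
	{
		ListCell   *ltab;

		/* Remove the ALTER TABLE work queue entry for relid, if any. */
		foreach(ltab, *wqueue)
		{
			AlteredTableInfo *tab = (AlteredTableInfo *) lfirst(ltab);

			if (tab->relid == relid)
			{
				*wqueue = list_delete_cell(*wqueue, ltab);
				break;
			}
		}
	}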
+/*
+ * detachPartitionTable: detach the partition "child_rel" from the
+ * partitioned table "parent_rel", with "defaultPartOid" identifying the
+ * default partition, if any.
+ */
+static void
+detachPartitionTable(Relation parent_rel, Relation child_rel, Oid defaultPartOid)
+{
+	/* Remove the pg_inherits row first. */
+	RemoveInheritance(child_rel, parent_rel, false);
+
+	/*
+	 * Detaching the partition might involve TOAST table access, so ensure we
+	 * have a valid snapshot.
+	 */
+	PushActiveSnapshot(GetTransactionSnapshot());
+
+	/* Do the final part of detaching. */
+	DetachPartitionFinalize(parent_rel, child_rel, false, defaultPartOid);
+
+	PopActiveSnapshot();
+}
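In the merge code below, ObjectAddress structs are filled in field by field in several places; for what it's worth, the existing ObjectAddressSet() macro expresses the same thing (objectSubId is set to 0), e.g.:

	ObjectAddress object;

	/* Same as assigning classId and objectId by hand with objectSubId = 0. */
	ObjectAddressSet(object, RelationRelationId, mergingPartitionOid);
	performDeletionCheck(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);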
+/*
+ * ALTER TABLE MERGE PARTITIONS INTO
+ */
+static void
+ATExecMergePartitions(List **wqueue, AlteredTableInfo *tab, Relation rel,
+					  PartitionCmd *cmd, AlterTableUtilityContext *context)
+{
+	Relation	newPartRel;
+	List	   *mergingPartitions = NIL;
+	Oid			defaultPartOid;
+	Oid			existingRelid;
+	Oid			ownerId = InvalidOid;
+	Oid			save_userid;
+	int			save_sec_context;
+	int			save_nestlevel;
+
+	/*
+	 * Check the ownership of the partitions being merged: partitions with
+	 * different owners cannot be merged.  Also, collect the OIDs of these
+	 * partitions during the check.
+	 */
+	foreach_node(RangeVar, name, cmd->partlist)
+	{
+		Relation	mergingPartition;
+
+		/*
+		 * We are going to detach and remove this partition.  We already took
+		 * AccessExclusiveLock in transformPartitionCmdForMerge, so NoLock is
+		 * fine here.
+		 */
+		mergingPartition = table_openrv_extended(name, NoLock, false);
+		Assert(CheckRelationLockedByMe(mergingPartition, AccessExclusiveLock, false));
+
+		if (OidIsValid(ownerId))
+		{
+			/* Do the partitions being merged have different owners? */
+			if (ownerId != mergingPartition->rd_rel->relowner)
+				ereport(ERROR,
+						errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						errmsg("partitions being merged have different owners"));
+		}
+		else
+			ownerId = mergingPartition->rd_rel->relowner;
+
+		/* Store the next merging partition into the list. */
+		mergingPartitions = lappend_oid(mergingPartitions,
+										RelationGetRelid(mergingPartition));
+
+		table_close(mergingPartition, NoLock);
+	}
+
+	/* Look up the existing relation by the new partition name. */
+	RangeVarGetAndCheckCreationNamespace(cmd->name, NoLock, &existingRelid);
+
+	/*
+	 * Check whether this name is already taken.  This helps us to detect the
+	 * situation where one of the merging partitions has the same name as the
+	 * new partition.  Otherwise, this would fail later on anyway, but
+	 * catching it here allows us to emit a nicer error message.
+	 */
+	if (OidIsValid(existingRelid))
+	{
+		if (list_member_oid(mergingPartitions, existingRelid))
+		{
+			/*
+			 * The new partition has the same name as one of the merging
+			 * partitions.
+			 */
+			char		tmpRelName[NAMEDATALEN];
+
+			/* Generate a temporary name. */
+			sprintf(tmpRelName, "merge-%u-%X-tmp", RelationGetRelid(rel), MyProcPid);
+
+			/*
+			 * Rename the existing partition to a temporary name, leaving its
+			 * old name free for the new partition.  We needn't worry about
+			 * this any further because we're going to drop the existing
+			 * partition anyway.
+			 */
+			RenameRelationInternal(existingRelid, tmpRelName, true, false);
+
+			/*
+			 * We must bump the command counter to make the new partition
+			 * tuple visible for the rename.
+			 */
+			CommandCounterIncrement();
+		}
+		else
+		{
+			ereport(ERROR,
+					errcode(ERRCODE_DUPLICATE_TABLE),
+					errmsg("relation \"%s\" already exists", cmd->name->relname));
+		}
+	}
+
+	defaultPartOid =
+		get_default_oid_from_partdesc(RelationGetPartitionDesc(rel, true));
+
+	/* Detach all merging partitions. */
+	foreach_oid(mergingPartitionOid, mergingPartitions)
+	{
+		Relation	child_rel;
+
+		child_rel = table_open(mergingPartitionOid, NoLock);
+
+		detachPartitionTable(rel, child_rel, defaultPartOid);
+
+		table_close(child_rel, NoLock);
+	}
+
+	/*
+	 * Perform a preliminary check to determine whether it's safe to drop all
+	 * merging partitions before we actually do so later.  After merging rows
+	 * into the new partition via MergePartitionsMoveRows, all old partitions
+	 * need to be dropped.  However, since the drop behavior is DROP_RESTRICT
+	 * and the merge process (MergePartitionsMoveRows) can be time-consuming,
+	 * performing an early check on the drop eligibility of the old
+	 * partitions is preferable.
+	 */
+	foreach_oid(mergingPartitionOid, mergingPartitions)
+	{
+		ObjectAddress object;
+
+		/* Get the OID of the relation to be dropped later. */
+		object.objectId = mergingPartitionOid;
+		object.classId = RelationRelationId;
+		object.objectSubId = 0;
+
+		performDeletionCheck(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
+	}
+
+	/*
+	 * Create a table for the new partition, using the partitioned table as a
+	 * model.
+	 */
+	Assert(OidIsValid(ownerId));
+	newPartRel = createPartitionTable(wqueue, cmd->name, rel, ownerId);
+
+	/*
+	 * Switch to the table owner's userid, so that any index functions are run
+	 * as that user.  Also, lock down security-restricted operations and
+	 * arrange to make GUC variable changes local to this command.
+	 *
+	 * Need to do it after determining the namespace in the
+	 * createPartitionTable() call.
+	 */
+	GetUserIdAndSecContext(&save_userid, &save_sec_context);
+	SetUserIdAndSecContext(ownerId,
+						   save_sec_context | SECURITY_RESTRICTED_OPERATION);
+	save_nestlevel = NewGUCNestLevel();
+	RestrictSearchPath();
+
+	/* Copy data from the merged partitions to the new partition. */
+	MergePartitionsMoveRows(wqueue, mergingPartitions, newPartRel);
+
+	/* Drop the current partitions before attaching the new one. */
+	foreach_oid(mergingPartitionOid, mergingPartitions)
+	{
+		ObjectAddress object;
+
+		object.objectId = mergingPartitionOid;
+		object.classId = RelationRelationId;
+		object.objectSubId = 0;
+
+		performDeletion(&object, DROP_RESTRICT, 0);
+	}
+
+	list_free(mergingPartitions);
+
+	/*
+	 * Attach the new partition to the partitioned table.  Passing wqueue =
+	 * NULL means that verification of each cloned constraint is not needed.
+	 */
+	attachPartitionTable(NULL, rel, newPartRel, cmd->bound);
+
+	/* Keep the lock until commit. */
+	table_close(newPartRel, NoLock);
+
+	/* Roll back any GUC changes executed by index functions. */
+	AtEOXact_GUC(false, save_nestlevel);
+
+	/* Restore the userid and security context. */
+	SetUserIdAndSecContext(save_userid, save_sec_context);
+}
+
+/*
+ * Struct with the context of the new partition for inserting rows from the
+ * split partition.
+ */
+typedef struct SplitPartitionContext
+{
+	ExprState  *partqualstate;	/* expression for checking a slot for a
+								 * partition (NULL for DEFAULT partition) */
+	BulkInsertState bistate;	/* state of bulk inserts for partition */
+	TupleTableSlot *dstslot;	/* slot for inserting row into partition */
+	AlteredTableInfo *tab;		/* structure with generated column expressions
+								 * and check constraint expressions.
*/ + Relation partRel; /* relation for partition */ +} SplitPartitionContext; + +/* + * createSplitPartitionContext: create context for partition and fill it + */ +static SplitPartitionContext * +createSplitPartitionContext(Relation partRel) +{ + SplitPartitionContext *pc; + + pc = palloc0_object(SplitPartitionContext); + pc->partRel = partRel; + + /* + * Prepare a BulkInsertState for table_tuple_insert. The FSM is empty, so + * don't bother using it. + */ + pc->bistate = GetBulkInsertState(); + + /* Create a destination tuple slot for the new partition. */ + pc->dstslot = table_slot_create(pc->partRel, NULL); + + return pc; +} + +/* + * deleteSplitPartitionContext: delete context for partition + */ +static void +deleteSplitPartitionContext(SplitPartitionContext *pc, List **wqueue, int ti_options) +{ + ListCell *ltab; + + ExecDropSingleTupleTableSlot(pc->dstslot); + FreeBulkInsertState(pc->bistate); + + table_finish_bulk_insert(pc->partRel, ti_options); + + /* + * We don't need to process this pc->partRel so delete the ALTER TABLE + * queue of it. + */ + foreach(ltab, *wqueue) + { + AlteredTableInfo *tab = (AlteredTableInfo *) lfirst(ltab); + + if (tab->relid == RelationGetRelid(pc->partRel)) + { + *wqueue = list_delete_cell(*wqueue, ltab); + break; + } + } + + pfree(pc); +} + +/* + * SplitPartitionMoveRows: scan split partition (splitRel) of partitioned table + * (rel) and move rows into new partitions. + * + * New partitions description: + * partlist: list of pointers to SinglePartitionSpec structures. It contains + * the partition specification details for all new partitions. + * newPartRels: list of Relations, new partitions created in + * ATExecSplitPartition. + */ +static void +SplitPartitionMoveRows(List **wqueue, Relation rel, Relation splitRel, + List *partlist, List *newPartRels) +{ + /* The FSM is empty, so don't bother using it. */ + int ti_options = TABLE_INSERT_SKIP_FSM; + CommandId mycid; + EState *estate; + ListCell *listptr, + *listptr2; + TupleTableSlot *srcslot; + ExprContext *econtext; + TableScanDesc scan; + Snapshot snapshot; + MemoryContext oldCxt; + List *partContexts = NIL; + TupleConversionMap *tuple_map; + SplitPartitionContext *defaultPartCtx = NULL, + *pc; + + mycid = GetCurrentCommandId(true); + + estate = CreateExecutorState(); + + forboth(listptr, partlist, listptr2, newPartRels) + { + SinglePartitionSpec *sps = (SinglePartitionSpec *) lfirst(listptr); + + pc = createSplitPartitionContext((Relation) lfirst(listptr2)); + + /* Find the work queue entry for the new partition table: newPartRel. */ + pc->tab = ATGetQueueEntry(wqueue, pc->partRel); + + buildExpressionExecutionStates(pc->tab, pc->partRel, estate); + + if (sps->bound->is_default) + { + /* + * We should not create a structure to check the partition + * constraint for the new DEFAULT partition. + */ + defaultPartCtx = pc; + } + else + { + List *partConstraint; + + /* Build expression execution states for partition check quals. */ + partConstraint = get_qual_from_partbound(rel, sps->bound); + partConstraint = + (List *) eval_const_expressions(NULL, + (Node *) partConstraint); + /* Make a boolean expression for ExecCheck(). */ + partConstraint = list_make1(make_ands_explicit(partConstraint)); + + /* + * Map the vars in the constraint expression from rel's attnos to + * splitRel's. 
+			 */
+			partConstraint = map_partition_varattnos(partConstraint,
+													 1, splitRel, rel);
+
+			pc->partqualstate =
+				ExecPrepareExpr((Expr *) linitial(partConstraint), estate);
+			Assert(pc->partqualstate != NULL);
+		}
+
+		/* Store the partition context into a list. */
+		partContexts = lappend(partContexts, pc);
+	}
+
+	econtext = GetPerTupleExprContext(estate);
+
+	/* Create the necessary tuple slot. */
+	srcslot = table_slot_create(splitRel, NULL);
+
+	/*
+	 * Compute the map for moving the attributes of the split partition to
+	 * the new partition (computed for the first new partition; the other new
+	 * partitions can use the same map).
+	 */
+	pc = (SplitPartitionContext *) lfirst(list_head(partContexts));
+	tuple_map = convert_tuples_by_name(RelationGetDescr(splitRel),
+									   RelationGetDescr(pc->partRel));
+
+	/* Scan through the rows. */
+	snapshot = RegisterSnapshot(GetLatestSnapshot());
+	scan = table_beginscan(splitRel, snapshot, 0, NULL);
+
+	/*
+	 * Switch to per-tuple memory context and reset it for each tuple
+	 * produced, so we don't leak memory.
+	 */
+	oldCxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
+
+	while (table_scan_getnextslot(scan, ForwardScanDirection, srcslot))
+	{
+		bool		found = false;
+		TupleTableSlot *insertslot;
+
+		CHECK_FOR_INTERRUPTS();
+
+		econtext->ecxt_scantuple = srcslot;
+
+		/* Search for the partition matching the current slot, srcslot. */
+		foreach(listptr, partContexts)
+		{
+			pc = (SplitPartitionContext *) lfirst(listptr);
+
+			/* Skip the DEFAULT partition. */
+			if (pc->partqualstate && ExecCheck(pc->partqualstate, econtext))
+			{
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+		{
+			/* Use the DEFAULT partition if it exists. */
+			if (defaultPartCtx)
+				pc = defaultPartCtx;
+			else
+				ereport(ERROR,
+						errcode(ERRCODE_CHECK_VIOLATION),
+						errmsg("cannot find partition for split partition row"),
+						errtable(splitRel));
+		}
+
+		if (tuple_map)
+		{
+			/* Need to use a map to copy the attributes. */
+			insertslot = execute_attr_map_slot(tuple_map->attrMap, srcslot, pc->dstslot);
+		}
+		else
+		{
+			/* Extract the data from the old tuple. */
+			slot_getallattrs(srcslot);
+
+			/* Copy the attributes directly. */
+			insertslot = pc->dstslot;
+
+			ExecClearTuple(insertslot);
+
+			memcpy(insertslot->tts_values, srcslot->tts_values,
+				   sizeof(Datum) * srcslot->tts_nvalid);
+			memcpy(insertslot->tts_isnull, srcslot->tts_isnull,
+				   sizeof(bool) * srcslot->tts_nvalid);
+
+			ExecStoreVirtualTuple(insertslot);
+		}
+
+		/*
+		 * Constraints and GENERATED expressions might reference the tableoid
+		 * column, so fill tts_tableOid with the desired value.  (We must do
+		 * this each time, because it gets overwritten with newrel's OID
+		 * during storing.)
+		 */
+		insertslot->tts_tableOid = RelationGetRelid(pc->partRel);
+
+		/*
+		 * Now, evaluate any generated expressions whose inputs come from the
+		 * new tuple.  We assume these columns won't reference each other, so
+		 * that there's no ordering dependency.
+		 */
+		evaluateGeneratedExpressionsAndCheckConstraints(pc->tab, pc->partRel,
+														insertslot, econtext);
+
+		/* Write the tuple out to the new relation.
*/ + table_tuple_insert(pc->partRel, insertslot, mycid, + ti_options, pc->bistate); + + ResetExprContext(econtext); + } + + MemoryContextSwitchTo(oldCxt); + + table_endscan(scan); + UnregisterSnapshot(snapshot); + + if (tuple_map) + free_conversion_map(tuple_map); + + ExecDropSingleTupleTableSlot(srcslot); + + FreeExecutorState(estate); + + foreach_ptr(SplitPartitionContext, spc, partContexts) + deleteSplitPartitionContext(spc, wqueue, ti_options); +} + +/* + * ALTER TABLE SPLIT PARTITION INTO + */ +static void +ATExecSplitPartition(List **wqueue, AlteredTableInfo *tab, Relation rel, + PartitionCmd *cmd, AlterTableUtilityContext *context) +{ + Relation splitRel; + Oid splitRelOid; + ListCell *listptr, + *listptr2; + bool isSameName = false; + char tmpRelName[NAMEDATALEN]; + List *newPartRels = NIL; + ObjectAddress object; + Oid defaultPartOid; + Oid save_userid; + int save_sec_context; + int save_nestlevel; + + defaultPartOid = get_default_oid_from_partdesc(RelationGetPartitionDesc(rel, true)); + + /* + * Partition is already locked in the transformPartitionCmdForSplit + * function. + */ + splitRel = table_openrv(cmd->name, NoLock); + + splitRelOid = RelationGetRelid(splitRel); + + /* Check descriptions of new partitions. */ + foreach_node(SinglePartitionSpec, sps, cmd->partlist) + { + Oid existingRelid; + + /* Look up the existing relation by the new partition name. */ + RangeVarGetAndCheckCreationNamespace(sps->name, NoLock, &existingRelid); + + /* + * This would fail later on anyway if the relation already exists. But + * by catching it here, we can emit a nicer error message. + */ + if (existingRelid == splitRelOid && !isSameName) + /* One new partition can have the same name as a split partition. */ + isSameName = true; + else if (OidIsValid(existingRelid)) + ereport(ERROR, + errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" already exists", sps->name->relname)); + } + + /* Detach the split partition. */ + detachPartitionTable(rel, splitRel, defaultPartOid); + + /* + * Perform a preliminary check to determine whether it's safe to drop the + * split partition before we actually do so later. After merging rows into + * the new partitions via SplitPartitionMoveRows, all old partitions need + * to be dropped. However, since the drop behavior is DROP_RESTRICT and + * the merge process (SplitPartitionMoveRows) can be time-consuming, + * performing an early check on the drop eligibility of old partitions is + * preferable. + */ + object.objectId = splitRelOid; + object.classId = RelationRelationId; + object.objectSubId = 0; + performDeletionCheck(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); + + /* + * If a new partition has the same name as the split partition, then we + * should rename the split partition to reuse its name. + */ + if (isSameName) + { + /* + * We must bump the command counter to make the split partition tuple + * visible for renaming. + */ + CommandCounterIncrement(); + /* Rename partition. */ + sprintf(tmpRelName, "split-%u-%X-tmp", RelationGetRelid(rel), MyProcPid); + RenameRelationInternal(splitRelOid, tmpRelName, true, false); + + /* + * We must bump the command counter to make the split partition tuple + * visible after renaming. + */ + CommandCounterIncrement(); + } + + /* Create new partitions (like a split partition), without indexes. 
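+	 * (Indexes are created later: attachPartitionTable() runs
+	 * AttachPartitionEnsureIndexes() when each new partition is attached.)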
+	 */
+	foreach_node(SinglePartitionSpec, sps, cmd->partlist)
+	{
+		Relation	newPartRel;
+
+		newPartRel = createPartitionTable(wqueue, sps->name, rel,
+										  splitRel->rd_rel->relowner);
+		newPartRels = lappend(newPartRels, newPartRel);
+	}
+
+	/*
+	 * Switch to the table owner's userid, so that any index functions are run
+	 * as that user.  Also, lock down security-restricted operations and
+	 * arrange to make GUC variable changes local to this command.
+	 *
+	 * Need to do it after determining the namespace in the
+	 * createPartitionTable() call.
+	 */
+	GetUserIdAndSecContext(&save_userid, &save_sec_context);
+	SetUserIdAndSecContext(splitRel->rd_rel->relowner,
+						   save_sec_context | SECURITY_RESTRICTED_OPERATION);
+	save_nestlevel = NewGUCNestLevel();
+	RestrictSearchPath();
+
+	/* Copy data from the split partition to the new partitions. */
+	SplitPartitionMoveRows(wqueue, rel, splitRel, cmd->partlist, newPartRels);
+
+	/* Keep the lock until commit. */
+	table_close(splitRel, NoLock);
+
+	/* Attach the new partitions to the partitioned table. */
+	forboth(listptr, cmd->partlist, listptr2, newPartRels)
+	{
+		SinglePartitionSpec *sps = (SinglePartitionSpec *) lfirst(listptr);
+		Relation	newPartRel = (Relation) lfirst(listptr2);
+
+		/*
+		 * Passing wqueue = NULL means that verification of each cloned
+		 * constraint is not needed.
+		 */
+		attachPartitionTable(NULL, rel, newPartRel, sps->bound);
+
+		/* Keep the lock until commit. */
+		table_close(newPartRel, NoLock);
+	}
+
+	/* Drop the split partition. */
+	object.classId = RelationRelationId;
+	object.objectId = splitRelOid;
+	object.objectSubId = 0;
+	/* DROP_CASCADE should not be needed here. */
+	performDeletion(&object, DROP_RESTRICT, 0);
+
+	/* Roll back any GUC changes executed by index functions. */
+	AtEOXact_GUC(false, save_nestlevel);
+
+	/* Restore the userid and security context. */
+	SetUserIdAndSecContext(save_userid, save_sec_context);
+}
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index a9005cc7212b6..df31eace47ac9 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -500,7 +500,7 @@ DropTableSpace(DropTableSpaceStmt *stmt)
 	 * mustn't delete. So instead, we force a checkpoint which will clean
 	 * out any lingering files, and try again.
*/ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); /* * On Windows, an unlinked file persists in the directory listing diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 67f8e70f9c166..12c97f2c023b3 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -30,7 +30,6 @@ #include "catalog/pg_proc.h" #include "catalog/pg_trigger.h" #include "catalog/pg_type.h" -#include "commands/dbcommands.h" #include "commands/trigger.h" #include "executor/executor.h" #include "miscadmin.h" @@ -80,6 +79,7 @@ static bool GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp); @@ -871,7 +871,7 @@ CreateTriggerFiringOn(CreateTrigStmt *stmt, const char *queryString, CStringGetDatum(trigname)); values[Anum_pg_trigger_tgfoid - 1] = ObjectIdGetDatum(funcoid); values[Anum_pg_trigger_tgtype - 1] = Int16GetDatum(tgtype); - values[Anum_pg_trigger_tgenabled - 1] = trigger_fires_when; + values[Anum_pg_trigger_tgenabled - 1] = CharGetDatum(trigger_fires_when); values[Anum_pg_trigger_tgisinternal - 1] = BoolGetDatum(isInternal); values[Anum_pg_trigger_tgconstrrelid - 1] = ObjectIdGetDatum(constrrelid); values[Anum_pg_trigger_tgconstrindid - 1] = ObjectIdGetDatum(indexOid); @@ -1991,7 +1991,7 @@ RelationBuildTriggers(Relation relation) } /* Build trigdesc */ - trigdesc = (TriggerDesc *) palloc0(sizeof(TriggerDesc)); + trigdesc = palloc0_object(TriggerDesc); trigdesc->triggers = triggers; trigdesc->numtriggers = numtrigs; for (i = 0; i < numtrigs; i++) @@ -2096,7 +2096,7 @@ CopyTriggerDesc(TriggerDesc *trigdesc) if (trigdesc == NULL || trigdesc->numtriggers <= 0) return NULL; - newdesc = (TriggerDesc *) palloc(sizeof(TriggerDesc)); + newdesc = palloc_object(TriggerDesc); memcpy(newdesc, trigdesc, sizeof(TriggerDesc)); trigger = (Trigger *) palloc(trigdesc->numtriggers * sizeof(Trigger)); @@ -2284,6 +2284,8 @@ FindTriggerIncompatibleWithInheritance(TriggerDesc *trigdesc) { Trigger *trigger = &trigdesc->triggers[i]; + if (!TRIGGER_FOR_ROW(trigger->tgtype)) + continue; if (trigger->tgoldtable != NULL || trigger->tgnewtable != NULL) return trigger->tgname; } @@ -2544,6 +2546,15 @@ ExecARInsertTriggers(EState *estate, ResultRelInfo *relinfo, { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; + if (relinfo->ri_FdwRoutine && transition_capture && + transition_capture->tcs_insert_new_table) + { + Assert(relinfo->ri_RootResultRelInfo); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot collect transition tuples from child foreign tables"))); + } + if ((trigdesc && trigdesc->trig_insert_after_row) || (transition_capture && transition_capture->tcs_insert_new_table)) AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, @@ -2693,7 +2704,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_delete) { TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -2708,9 +2720,17 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; + /* + * Get a copy of the on-disk tuple we are planning to delete. 
In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE DELETE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). + */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - LockTupleExclusive, slot, &epqslot_candidate, - tmresult, tmfd)) + LockTupleExclusive, slot, !is_merge_delete, + &epqslot_candidate, tmresult, tmfd)) return false; /* @@ -2787,6 +2807,15 @@ ExecARDeleteTriggers(EState *estate, { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; + if (relinfo->ri_FdwRoutine && transition_capture && + transition_capture->tcs_delete_old_table) + { + Assert(relinfo->ri_RootResultRelInfo); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot collect transition tuples from child foreign tables"))); + } + if ((trigdesc && trigdesc->trig_delete_after_row) || (transition_capture && transition_capture->tcs_delete_old_table)) { @@ -2800,6 +2829,7 @@ ExecARDeleteTriggers(EState *estate, tupleid, LockTupleExclusive, slot, + false, NULL, NULL, NULL); @@ -2944,7 +2974,8 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_update) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; TupleTableSlot *oldslot = ExecGetTriggerOldSlot(estate, relinfo); @@ -2965,10 +2996,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; - /* get a copy of the on-disk tuple we are planning to update */ + /* + * Get a copy of the on-disk tuple we are planning to update. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE UPDATE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). + */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - lockmode, oldslot, &epqslot_candidate, - tmresult, tmfd)) + lockmode, oldslot, !is_merge_update, + &epqslot_candidate, tmresult, tmfd)) return false; /* cancel the update action */ /* @@ -3115,6 +3153,16 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; + if (relinfo->ri_FdwRoutine && transition_capture && + (transition_capture->tcs_update_old_table || + transition_capture->tcs_update_new_table)) + { + Assert(relinfo->ri_RootResultRelInfo); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot collect transition tuples from child foreign tables"))); + } + if ((trigdesc && trigdesc->trig_update_after_row) || (transition_capture && (transition_capture->tcs_update_old_table || @@ -3142,6 +3190,7 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, tupleid, LockTupleExclusive, oldslot, + false, NULL, NULL, NULL); @@ -3298,6 +3347,7 @@ GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp) @@ -3357,29 +3407,30 @@ GetTupleForTrigger(EState *estate, if (tmfd.traversed) { /* - * Recheck the tuple using EPQ. For MERGE, we leave this - * to the caller (it must do additional rechecking, and - * might end up executing a different action entirely). 
+ * Recheck the tuple using EPQ, if requested. Otherwise, + * just return that it was concurrently updated. */ - if (estate->es_plannedstmt->commandType == CMD_MERGE) + if (do_epq_recheck) { - if (tmresultp) - *tmresultp = TM_Updated; - return false; + *epqslot = EvalPlanQual(epqstate, + relation, + relinfo->ri_RangeTableIndex, + oldslot); + + /* + * If PlanQual failed for updated tuple - we must not + * process this tuple! + */ + if (TupIsNull(*epqslot)) + { + *epqslot = NULL; + return false; + } } - - *epqslot = EvalPlanQual(epqstate, - relation, - relinfo->ri_RangeTableIndex, - oldslot); - - /* - * If PlanQual failed for updated tuple - we must not - * process this tuple! - */ - if (TupIsNull(*epqslot)) + else { - *epqslot = NULL; + if (tmresultp) + *tmresultp = TM_Updated; return false; } } @@ -4850,7 +4901,7 @@ GetAfterTriggersTableData(Oid relid, CmdType cmdType) oldcxt = MemoryContextSwitchTo(CurTransactionContext); - table = (AfterTriggersTableData *) palloc0(sizeof(AfterTriggersTableData)); + table = palloc0_object(AfterTriggersTableData); table->relid = relid; table->cmdType = cmdType; qs->tables = lappend(qs->tables, table); @@ -4999,7 +5050,7 @@ MakeTransitionCaptureState(TriggerDesc *trigdesc, Oid relid, CmdType cmdType) MemoryContextSwitchTo(oldcxt); /* Now build the TransitionCaptureState struct, in caller's context */ - state = (TransitionCaptureState *) palloc0(sizeof(TransitionCaptureState)); + state = palloc0_object(TransitionCaptureState); state->tcs_delete_old_table = need_old_del; state->tcs_update_old_table = need_old_upd; state->tcs_update_new_table = need_new_upd; diff --git a/src/backend/commands/tsearchcmds.c b/src/backend/commands/tsearchcmds.c index ab16d42ad56ba..ec4580e17c900 100644 --- a/src/backend/commands/tsearchcmds.c +++ b/src/backend/commands/tsearchcmds.c @@ -1027,7 +1027,7 @@ DefineTSConfiguration(List *names, List *parameters, ObjectAddress *copied) * know that they will be used. 
*/ max_slots = MAX_CATALOG_MULTI_INSERT_BYTES / sizeof(FormData_pg_ts_config_map); - slot = palloc(sizeof(TupleTableSlot *) * max_slots); + slot = palloc_array(TupleTableSlot *, max_slots); ScanKeyInit(&skey, Anum_pg_ts_config_map_mapcfg, @@ -1058,10 +1058,10 @@ DefineTSConfiguration(List *names, List *parameters, ObjectAddress *copied) memset(slot[slot_stored_count]->tts_isnull, false, slot[slot_stored_count]->tts_tupleDescriptor->natts * sizeof(bool)); - slot[slot_stored_count]->tts_values[Anum_pg_ts_config_map_mapcfg - 1] = cfgOid; - slot[slot_stored_count]->tts_values[Anum_pg_ts_config_map_maptokentype - 1] = cfgmap->maptokentype; - slot[slot_stored_count]->tts_values[Anum_pg_ts_config_map_mapseqno - 1] = cfgmap->mapseqno; - slot[slot_stored_count]->tts_values[Anum_pg_ts_config_map_mapdict - 1] = cfgmap->mapdict; + slot[slot_stored_count]->tts_values[Anum_pg_ts_config_map_mapcfg - 1] = ObjectIdGetDatum(cfgOid); + slot[slot_stored_count]->tts_values[Anum_pg_ts_config_map_maptokentype - 1] = Int32GetDatum(cfgmap->maptokentype); + slot[slot_stored_count]->tts_values[Anum_pg_ts_config_map_mapseqno - 1] = Int32GetDatum(cfgmap->mapseqno); + slot[slot_stored_count]->tts_values[Anum_pg_ts_config_map_mapdict - 1] = ObjectIdGetDatum(cfgmap->mapdict); ExecStoreVirtualTuple(slot[slot_stored_count]); slot_stored_count++; @@ -1261,7 +1261,7 @@ getTokenTypes(Oid prsId, List *tokennames) { if (strcmp(strVal(val), list[j].alias) == 0) { - TSTokenTypeItem *ts = (TSTokenTypeItem *) palloc0(sizeof(TSTokenTypeItem)); + TSTokenTypeItem *ts = palloc0_object(TSTokenTypeItem); ts->num = list[j].lexid; ts->name = pstrdup(strVal(val)); @@ -1344,7 +1344,7 @@ MakeConfigurationMapping(AlterTSConfigurationStmt *stmt, * Convert list of dictionary names to array of dict OIDs */ ndict = list_length(stmt->dicts); - dictIds = (Oid *) palloc(sizeof(Oid) * ndict); + dictIds = palloc_array(Oid, ndict); i = 0; foreach(c, stmt->dicts) { @@ -1432,7 +1432,7 @@ MakeConfigurationMapping(AlterTSConfigurationStmt *stmt, /* Allocate the slots to use and initialize them */ nslots = Min(ntoken * ndict, MAX_CATALOG_MULTI_INSERT_BYTES / sizeof(FormData_pg_ts_config_map)); - slot = palloc(sizeof(TupleTableSlot *) * nslots); + slot = palloc_array(TupleTableSlot *, nslots); for (i = 0; i < nslots; i++) slot[i] = MakeSingleTupleTableSlot(RelationGetDescr(relMap), &TTSOpsHeapTuple); diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 45ae7472ab5ad..be6ffd6ddb091 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -126,7 +126,7 @@ static Oid findTypeSubscriptingFunction(List *procname, Oid typeOid); static Oid findRangeSubOpclass(List *opcname, Oid subtype); static Oid findRangeCanonicalFunction(List *procname, Oid typeOid); static Oid findRangeSubtypeDiffFunction(List *procname, Oid subtype); -static void validateDomainCheckConstraint(Oid domainoid, const char *ccbin); +static void validateDomainCheckConstraint(Oid domainoid, const char *ccbin, LOCKMODE lockmode); static void validateDomainNotNullConstraint(Oid domainoid); static List *get_rels_with_domain(Oid domainOid, LOCKMODE lockmode); static void checkEnumOwner(HeapTuple tup); @@ -939,11 +939,19 @@ DefineDomain(ParseState *pstate, CreateDomainStmt *stmt) break; case CONSTR_NOTNULL: - if (nullDefined && !typNotNull) + if (nullDefined) + { + if (!typNotNull) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting NULL/NOT NULL constraints"), + parser_errposition(pstate, constr->location)); + ereport(ERROR, 
- errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting NULL/NOT NULL constraints"), + errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("redundant NOT NULL constraint definition"), parser_errposition(pstate, constr->location)); + } if (constr->is_no_inherit) ereport(ERROR, errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -1734,6 +1742,9 @@ DefineRange(ParseState *pstate, CreateRangeStmt *stmt) false, /* Type NOT NULL */ InvalidOid); /* typcollation */ + /* Ensure these new types are visible to ProcedureCreate */ + CommandCounterIncrement(); + /* And create the constructor functions for this range type */ makeRangeConstructors(typeName, typeNamespace, typoid, rangeSubtype); makeMultirangeConstructors(multirangeTypeName, typeNamespace, @@ -1756,7 +1767,7 @@ DefineRange(ParseState *pstate, CreateRangeStmt *stmt) * impossible to define a polymorphic constructor; we have to generate new * constructor functions explicitly for each range type. * - * We actually define 4 functions, with 0 through 3 arguments. This is just + * We actually define 2 functions, with 2 or 3 arguments. This is just * to offer more convenience for the user. */ static void @@ -2978,7 +2989,7 @@ AlterDomainAddConstraint(List *names, Node *newConstraint, * to. */ if (!constr->skip_validation) - validateDomainCheckConstraint(domainoid, ccbin); + validateDomainCheckConstraint(domainoid, ccbin, ShareLock); /* * We must send out an sinval message for the domain, to ensure that @@ -3090,7 +3101,12 @@ AlterDomainValidateConstraint(List *names, const char *constrName) val = SysCacheGetAttrNotNull(CONSTROID, tuple, Anum_pg_constraint_conbin); conbin = TextDatumGetCString(val); - validateDomainCheckConstraint(domainoid, conbin); + /* + * Locking related relations with ShareUpdateExclusiveLock is ok because + * not-yet-valid constraints are still enforced against concurrent inserts + * or updates. + */ + validateDomainCheckConstraint(domainoid, conbin, ShareUpdateExclusiveLock); /* * Now update the catalog, while we have the door open. @@ -3183,9 +3199,16 @@ validateDomainNotNullConstraint(Oid domainoid) /* * Verify that all columns currently using the domain satisfy the given check * constraint expression. + * + * It is used both to validate existing constraints and to add newly created + * check constraints to a domain. + * + * The lockmode is used for relations using the domain. It should be + * ShareLock when adding a new constraint to a domain. It can be + * ShareUpdateExclusiveLock when validating an existing constraint.
*/ static void -validateDomainCheckConstraint(Oid domainoid, const char *ccbin) +validateDomainCheckConstraint(Oid domainoid, const char *ccbin, LOCKMODE lockmode) { Expr *expr = (Expr *) stringToNode(ccbin); List *rels; @@ -3202,9 +3225,7 @@ validateDomainCheckConstraint(Oid domainoid, const char *ccbin) exprstate = ExecPrepareExpr(expr, estate); /* Fetch relation list with attributes based on this domain */ - /* ShareLock is sufficient to prevent concurrent data changes */ - - rels = get_rels_with_domain(domainoid, ShareLock); + rels = get_rels_with_domain(domainoid, lockmode); foreach(rt, rels) { @@ -3230,7 +3251,6 @@ validateDomainCheckConstraint(Oid domainoid, const char *ccbin) Datum d; bool isNull; Datum conResult; - Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum - 1); d = slot_getattr(slot, attnum, &isNull); @@ -3243,6 +3263,8 @@ validateDomainCheckConstraint(Oid domainoid, const char *ccbin) if (!isNull && !DatumGetBool(conResult)) { + Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum - 1); + /* * In principle the auxiliary information for this error * should be errdomainconstraint(), but errtablecol() @@ -3425,10 +3447,10 @@ get_rels_with_domain(Oid domainOid, LOCKMODE lockmode) } /* Build the RelToCheck entry with enough space for all atts */ - rtc = (RelToCheck *) palloc(sizeof(RelToCheck)); + rtc = palloc_object(RelToCheck); rtc->rel = rel; rtc->natts = 0; - rtc->atts = (int *) palloc(sizeof(int) * RelationGetNumberOfAttributes(rel)); + rtc->atts = palloc_array(int, RelationGetNumberOfAttributes(rel)); result = lappend(result, rtc); } diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index 0d638e29d0066..ef4de1e7fd124 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -30,7 +30,6 @@ #include "commands/defrem.h" #include "commands/seclabel.h" #include "commands/user.h" -#include "lib/qunique.h" #include "libpq/crypt.h" #include "miscadmin.h" #include "storage/lmgr.h" @@ -490,7 +489,8 @@ CreateRole(ParseState *pstate, CreateRoleStmt *stmt) * Advance command counter so we can see new record; else tests in * AddRoleMems may fail. */ - CommandCounterIncrement(); + if (addroleto || adminmembers || rolemembers) + CommandCounterIncrement(); /* Default grant. */ InitGrantRoleOptions(&popt); @@ -1904,8 +1904,7 @@ AddRoleMems(Oid currentUserId, const char *rolename, Oid roleid, else { Oid objectId; - Oid *newmembers = (Oid *) palloc(3 * sizeof(Oid)); - int nnewmembers; + Oid *newmembers = palloc_object(Oid); /* * The values for these options can be taken directly from 'popt'. 
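The hunks above and below convert bare palloc(sizeof(...)) calls to the type-safe wrappers from src/include/utils/palloc.h. For reference, those macros are defined essentially as follows (paraphrased from palloc.h; the real definitions live there):

    #define palloc_object(type)        ((type *) palloc(sizeof(type)))
    #define palloc0_object(type)       ((type *) palloc0(sizeof(type)))
    #define palloc_array(type, count)  ((type *) palloc(sizeof(type) * (count)))
    #define palloc0_array(type, count) ((type *) palloc0(sizeof(type) * (count)))

Besides brevity, the returned pointer's type is derived from the macro argument, so a mismatch between the target variable and the allocated size shows up as a compiler warning rather than a silent under-allocation.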
@@ -1924,7 +1923,7 @@ AddRoleMems(Oid currentUserId, const char *rolename, Oid roleid, */ if ((popt->specified & GRANT_ROLE_SPECIFIED_INHERIT) != 0) new_record[Anum_pg_auth_members_inherit_option - 1] = - popt->inherit; + BoolGetDatum(popt->inherit); else { HeapTuple mrtup; @@ -1935,34 +1934,24 @@ AddRoleMems(Oid currentUserId, const char *rolename, Oid roleid, elog(ERROR, "cache lookup failed for role %u", memberid); mrform = (Form_pg_authid) GETSTRUCT(mrtup); new_record[Anum_pg_auth_members_inherit_option - 1] = - mrform->rolinherit; + BoolGetDatum(mrform->rolinherit); ReleaseSysCache(mrtup); } /* get an OID for the new row and insert it */ objectId = GetNewOidWithIndex(pg_authmem_rel, AuthMemOidIndexId, Anum_pg_auth_members_oid); - new_record[Anum_pg_auth_members_oid - 1] = objectId; + new_record[Anum_pg_auth_members_oid - 1] = ObjectIdGetDatum(objectId); tuple = heap_form_tuple(pg_authmem_dsc, new_record, new_record_nulls); CatalogTupleInsert(pg_authmem_rel, tuple); - /* - * Record dependencies on the roleid, member, and grantor, as if a - * pg_auth_members entry were an object ACL. - * updateAclDependencies() requires an input array that is - * palloc'd (it will free it), sorted, and de-duped. - */ - newmembers[0] = roleid; - newmembers[1] = memberid; - newmembers[2] = grantorId; - qsort(newmembers, 3, sizeof(Oid), oid_cmp); - nnewmembers = qunique(newmembers, 3, sizeof(Oid), oid_cmp); - + /* updateAclDependencies wants to pfree array inputs */ + newmembers[0] = grantorId; updateAclDependencies(AuthMemRelationId, objectId, 0, InvalidOid, 0, NULL, - nnewmembers, newmembers); + 1, newmembers); } /* CCI after each change, in case there are duplicates in list */ @@ -2306,7 +2295,7 @@ initialize_revoke_actions(CatCList *memlist) if (memlist->n_members == 0) return NULL; - result = palloc(sizeof(RevokeRoleGrantAction) * memlist->n_members); + result = palloc_array(RevokeRoleGrantAction, memlist->n_members); for (i = 0; i < memlist->n_members; i++) result[i] = RRG_NOOP; return result; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 33a33bf6b1cfa..0528d1b6ecbed 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -37,6 +37,7 @@ #include "catalog/namespace.h" #include "catalog/pg_database.h" #include "catalog/pg_inherits.h" +#include "commands/async.h" #include "commands/cluster.h" #include "commands/defrem.h" #include "commands/progress.h" @@ -56,6 +57,7 @@ #include "utils/fmgroids.h" #include "utils/guc.h" #include "utils/guc_hooks.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/snapmgr.h" #include "utils/syscache.h" @@ -123,7 +125,7 @@ static void vac_truncate_clog(TransactionId frozenXID, MultiXactId minMulti, TransactionId lastSaneFrozenXid, MultiXactId lastSaneMinMulti); -static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, +static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params, BufferAccessStrategy bstrategy); static double compute_parallel_delay(void); static VacOptValue get_vacoptval_from_boolean(DefElem *def); @@ -219,9 +221,10 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("BUFFER_USAGE_LIMIT option must be 0 or between %d kB and %d kB", + errmsg("%s option must be 0 or between %d kB and %d kB", + "BUFFER_USAGE_LIMIT", MIN_BAS_VAC_RING_SIZE_KB, MAX_BAS_VAC_RING_SIZE_KB), - hintmsg ? errhint("%s", _(hintmsg)) : 0)); + hintmsg ? 
errhint_internal("%s", _(hintmsg)) : 0)); } ring_size = result; @@ -229,7 +232,8 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) else if (!vacstmt->is_vacuumcmd) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized ANALYZE option \"%s\"", opt->defname), + errmsg("unrecognized %s option \"%s\"", + "ANALYZE", opt->defname), parser_errposition(pstate, opt->location))); /* Parse options available on VACUUM */ @@ -265,35 +269,24 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) params.truncate = get_vacoptval_from_boolean(opt); else if (strcmp(opt->defname, "parallel") == 0) { - if (opt->arg == NULL) - { + int nworkers = defGetInt32(opt); + + if (nworkers < 0 || nworkers > MAX_PARALLEL_WORKER_LIMIT) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("parallel option requires a value between 0 and %d", + errmsg("%s option must be between 0 and %d", + "PARALLEL", MAX_PARALLEL_WORKER_LIMIT), parser_errposition(pstate, opt->location))); - } - else - { - int nworkers; - - nworkers = defGetInt32(opt); - if (nworkers < 0 || nworkers > MAX_PARALLEL_WORKER_LIMIT) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("parallel workers for vacuum must be between 0 and %d", - MAX_PARALLEL_WORKER_LIMIT), - parser_errposition(pstate, opt->location))); - /* - * Disable parallel vacuum, if user has specified parallel - * degree as zero. - */ - if (nworkers == 0) - params.nworkers = -1; - else - params.nworkers = nworkers; - } + /* + * Disable parallel vacuum, if user has specified parallel degree + * as zero. + */ + if (nworkers == 0) + params.nworkers = -1; + else + params.nworkers = nworkers; } else if (strcmp(opt->defname, "skip_database_stats") == 0) skip_database_stats = defGetBoolean(opt); @@ -302,7 +295,8 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized VACUUM option \"%s\"", opt->defname), + errmsg("unrecognized %s option \"%s\"", + "VACUUM", opt->defname), parser_errposition(pstate, opt->location))); } @@ -415,8 +409,12 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) /* user-invoked vacuum is never "for wraparound" */ params.is_wraparound = false; - /* user-invoked vacuum uses VACOPT_VERBOSE instead of log_min_duration */ - params.log_min_duration = -1; + /* + * user-invoked vacuum uses VACOPT_VERBOSE instead of + * log_vacuum_min_duration and log_analyze_min_duration + */ + params.log_vacuum_min_duration = -1; + params.log_analyze_min_duration = -1; /* * Later, in vacuum_rel(), we check if a reloption override was specified. @@ -464,7 +462,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) } /* Now go through the common routine */ - vacuum(vacstmt->rels, ¶ms, bstrategy, vac_context, isTopLevel); + vacuum(vacstmt->rels, params, bstrategy, vac_context, isTopLevel); /* Finally, clean up the vacuum memory context */ MemoryContextDelete(vac_context); @@ -493,7 +491,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) * memory context that will not disappear at transaction commit. 
*/ void -vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, +vacuum(List *relations, const VacuumParams params, BufferAccessStrategy bstrategy, MemoryContext vac_context, bool isTopLevel) { static bool in_vacuum = false; @@ -502,9 +500,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, volatile bool in_outer_xact, use_own_xacts; - Assert(params != NULL); - - stmttype = (params->options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE"; + stmttype = (params.options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE"; /* * We cannot run VACUUM inside a user transaction block; if we were inside @@ -514,7 +510,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, * * ANALYZE (without VACUUM) can run either way. */ - if (params->options & VACOPT_VACUUM) + if (params.options & VACOPT_VACUUM) { PreventInTransactionBlock(isTopLevel, stmttype); in_outer_xact = false; @@ -537,7 +533,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, * Build list of relation(s) to process, putting any new data in * vac_context for safekeeping. */ - if (params->options & VACOPT_ONLY_DATABASE_STATS) + if (params.options & VACOPT_ONLY_DATABASE_STATS) { /* We don't process any tables in this case */ Assert(relations == NIL); @@ -553,7 +549,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, List *sublist; MemoryContext old_context; - sublist = expand_vacuum_rel(vrel, vac_context, params->options); + sublist = expand_vacuum_rel(vrel, vac_context, params.options); old_context = MemoryContextSwitchTo(vac_context); newrels = list_concat(newrels, sublist); MemoryContextSwitchTo(old_context); @@ -561,7 +557,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, relations = newrels; } else - relations = get_all_vacuum_rels(vac_context, params->options); + relations = get_all_vacuum_rels(vac_context, params.options); /* * Decide whether we need to start/commit our own transactions. @@ -577,11 +573,11 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, * transaction block, and also in an autovacuum worker, use own * transactions so we can release locks sooner. */ - if (params->options & VACOPT_VACUUM) + if (params.options & VACOPT_VACUUM) use_own_xacts = true; else { - Assert(params->options & VACOPT_ANALYZE); + Assert(params.options & VACOPT_ANALYZE); if (AmAutoVacuumWorkerProcess()) use_own_xacts = true; else if (in_outer_xact) @@ -632,13 +628,13 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, { VacuumRelation *vrel = lfirst_node(VacuumRelation, cur); - if (params->options & VACOPT_VACUUM) + if (params.options & VACOPT_VACUUM) { if (!vacuum_rel(vrel->oid, vrel->relation, params, bstrategy)) continue; } - if (params->options & VACOPT_ANALYZE) + if (params.options & VACOPT_ANALYZE) { /* * If using separate xacts, start one for analyze. Otherwise, @@ -702,8 +698,8 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, StartTransactionCommand(); } - if ((params->options & VACOPT_VACUUM) && - !(params->options & VACOPT_SKIP_DATABASE_STATS)) + if ((params.options & VACOPT_VACUUM) && + !(params.options & VACOPT_SKIP_DATABASE_STATS)) { /* * Update pg_database.datfrozenxid, and truncate pg_xact if possible. @@ -1101,7 +1097,7 @@ get_all_vacuum_rels(MemoryContext vac_context, int options) * minimum). 
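The signature changes above pass VacuumParams by value rather than by pointer, so later assignments such as params.index_cleanup = ... in vacuum_rel() scribble on a private copy and cannot leak back to the caller. A minimal stand-alone sketch of the semantics (hypothetical types, not PostgreSQL code):

    #include <assert.h>

    typedef struct { int options; } Params;    /* stand-in for VacuumParams */

    static void
    callee(Params p)            /* by value: p is the callee's own copy */
    {
        p.options = 42;         /* modifies the copy only */
    }

    int
    main(void)
    {
        Params p = {.options = 7};

        callee(p);
        assert(p.options == 7); /* caller's struct is untouched */
        return 0;
    }

This is why the explicit memcpy() for toast_vacuum_params can move to the top of vacuum_rel(): the only copy that still has to be made by hand is the one forwarded to the recursive TOAST-table call.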
*/ bool -vacuum_get_cutoffs(Relation rel, const VacuumParams *params, +vacuum_get_cutoffs(Relation rel, const VacuumParams params, struct VacuumCutoffs *cutoffs) { int freeze_min_age, @@ -1117,10 +1113,10 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params, aggressiveMXIDCutoff; /* Use mutable copies of freeze age parameters */ - freeze_min_age = params->freeze_min_age; - multixact_freeze_min_age = params->multixact_freeze_min_age; - freeze_table_age = params->freeze_table_age; - multixact_freeze_table_age = params->multixact_freeze_table_age; + freeze_min_age = params.freeze_min_age; + multixact_freeze_min_age = params.multixact_freeze_min_age; + freeze_table_age = params.freeze_table_age; + multixact_freeze_table_age = params.multixact_freeze_table_age; /* Set pg_class fields in cutoffs */ cutoffs->relfrozenxid = rel->rd_rel->relfrozenxid; @@ -1151,8 +1147,8 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params, /* * Also compute the multixact age for which freezing is urgent. This is - * normally autovacuum_multixact_freeze_max_age, but may be less if we are - * short of multixact member space. + * normally autovacuum_multixact_freeze_max_age, but may be less if + * multixact members are bloated. */ effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); @@ -1948,6 +1944,12 @@ vac_truncate_clog(TransactionId frozenXID, return; } + /* + * Freeze any old transaction IDs in the async notification queue before + * CLOG truncation. + */ + AsyncNotifyFreezeXids(frozenXID); + /* * Advance the oldest value for commit timestamps before truncating, so * that if a user requests a timestamp for a transaction we're truncating @@ -1971,7 +1973,7 @@ vac_truncate_clog(TransactionId frozenXID, * signaling twice? */ SetTransactionIdLimit(frozenXID, oldestxid_datoid); - SetMultiXactIdLimit(minMulti, minmulti_datoid, false); + SetMultiXactIdLimit(minMulti, minmulti_datoid); LWLockRelease(WrapLimitsVacuumLock); } @@ -1997,7 +1999,7 @@ vac_truncate_clog(TransactionId frozenXID, * At entry and exit, we are not inside a transaction. */ static bool -vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, +vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params, BufferAccessStrategy bstrategy) { LOCKMODE lmode; @@ -2008,13 +2010,18 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, Oid save_userid; int save_sec_context; int save_nestlevel; + VacuumParams toast_vacuum_params; - Assert(params != NULL); + /* + * This function scribbles on the parameters, so make a copy early to + * avoid affecting the TOAST table (if we do end up recursing to it). + */ + memcpy(&toast_vacuum_params, ¶ms, sizeof(VacuumParams)); /* Begin a transaction for vacuuming this relation */ StartTransactionCommand(); - if (!(params->options & VACOPT_FULL)) + if (!(params.options & VACOPT_FULL)) { /* * In lazy vacuum, we can set the PROC_IN_VACUUM flag, which lets @@ -2040,7 +2047,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); MyProc->statusFlags |= PROC_IN_VACUUM; - if (params->is_wraparound) + if (params.is_wraparound) MyProc->statusFlags |= PROC_VACUUM_FOR_WRAPAROUND; ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; LWLockRelease(ProcArrayLock); @@ -2064,12 +2071,12 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * vacuum, but just ShareUpdateExclusiveLock for concurrent vacuum. Either * way, we can be sure that no other backend is vacuuming the same table. 
*/ - lmode = (params->options & VACOPT_FULL) ? + lmode = (params.options & VACOPT_FULL) ? AccessExclusiveLock : ShareUpdateExclusiveLock; /* open the relation and get the appropriate lock on it */ - rel = vacuum_open_relation(relid, relation, params->options, - params->log_min_duration >= 0, lmode); + rel = vacuum_open_relation(relid, relation, params.options, + params.log_vacuum_min_duration >= 0, lmode); /* leave if relation could not be opened or locked */ if (!rel) @@ -2084,8 +2091,8 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * This is only safe to do because we hold a session lock on the main * relation that prevents concurrent deletion. */ - if (OidIsValid(params->toast_parent)) - priv_relid = params->toast_parent; + if (OidIsValid(params.toast_parent)) + priv_relid = params.toast_parent; else priv_relid = RelationGetRelid(rel); @@ -2098,7 +2105,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, */ if (!vacuum_is_permitted_for_relation(priv_relid, rel->rd_rel, - params->options & ~VACOPT_ANALYZE)) + params.options & ~VACOPT_ANALYZE)) { relation_close(rel, lmode); PopActiveSnapshot(); @@ -2169,7 +2176,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * Set index_cleanup option based on index_cleanup reloption if it wasn't * specified in VACUUM command, or when running in an autovacuum worker */ - if (params->index_cleanup == VACOPTVALUE_UNSPECIFIED) + if (params.index_cleanup == VACOPTVALUE_UNSPECIFIED) { StdRdOptIndexCleanup vacuum_index_cleanup; @@ -2180,56 +2187,74 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, ((StdRdOptions *) rel->rd_options)->vacuum_index_cleanup; if (vacuum_index_cleanup == STDRD_OPTION_VACUUM_INDEX_CLEANUP_AUTO) - params->index_cleanup = VACOPTVALUE_AUTO; + params.index_cleanup = VACOPTVALUE_AUTO; else if (vacuum_index_cleanup == STDRD_OPTION_VACUUM_INDEX_CLEANUP_ON) - params->index_cleanup = VACOPTVALUE_ENABLED; + params.index_cleanup = VACOPTVALUE_ENABLED; else { Assert(vacuum_index_cleanup == STDRD_OPTION_VACUUM_INDEX_CLEANUP_OFF); - params->index_cleanup = VACOPTVALUE_DISABLED; + params.index_cleanup = VACOPTVALUE_DISABLED; } } +#ifdef USE_INJECTION_POINTS + if (params.index_cleanup == VACOPTVALUE_AUTO) + INJECTION_POINT("vacuum-index-cleanup-auto", NULL); + else if (params.index_cleanup == VACOPTVALUE_DISABLED) + INJECTION_POINT("vacuum-index-cleanup-disabled", NULL); + else if (params.index_cleanup == VACOPTVALUE_ENABLED) + INJECTION_POINT("vacuum-index-cleanup-enabled", NULL); +#endif + /* * Check if the vacuum_max_eager_freeze_failure_rate table storage * parameter was specified. This overrides the GUC value. 
*/ if (rel->rd_options != NULL && ((StdRdOptions *) rel->rd_options)->vacuum_max_eager_freeze_failure_rate >= 0) - params->max_eager_freeze_failure_rate = + params.max_eager_freeze_failure_rate = ((StdRdOptions *) rel->rd_options)->vacuum_max_eager_freeze_failure_rate; /* * Set truncate option based on truncate reloption or GUC if it wasn't * specified in VACUUM command, or when running in an autovacuum worker */ - if (params->truncate == VACOPTVALUE_UNSPECIFIED) + if (params.truncate == VACOPTVALUE_UNSPECIFIED) { StdRdOptions *opts = (StdRdOptions *) rel->rd_options; if (opts && opts->vacuum_truncate_set) { if (opts->vacuum_truncate) - params->truncate = VACOPTVALUE_ENABLED; + params.truncate = VACOPTVALUE_ENABLED; else - params->truncate = VACOPTVALUE_DISABLED; + params.truncate = VACOPTVALUE_DISABLED; } else if (vacuum_truncate) - params->truncate = VACOPTVALUE_ENABLED; + params.truncate = VACOPTVALUE_ENABLED; else - params->truncate = VACOPTVALUE_DISABLED; + params.truncate = VACOPTVALUE_DISABLED; } +#ifdef USE_INJECTION_POINTS + if (params.truncate == VACOPTVALUE_AUTO) + INJECTION_POINT("vacuum-truncate-auto", NULL); + else if (params.truncate == VACOPTVALUE_DISABLED) + INJECTION_POINT("vacuum-truncate-disabled", NULL); + else if (params.truncate == VACOPTVALUE_ENABLED) + INJECTION_POINT("vacuum-truncate-enabled", NULL); +#endif + /* * Remember the relation's TOAST relation for later, if the caller asked * us to process it. In VACUUM FULL, though, the toast table is * automatically rebuilt by cluster_rel so we shouldn't recurse to it, * unless PROCESS_MAIN is disabled. */ - if ((params->options & VACOPT_PROCESS_TOAST) != 0 && - ((params->options & VACOPT_FULL) == 0 || - (params->options & VACOPT_PROCESS_MAIN) == 0)) + if ((params.options & VACOPT_PROCESS_TOAST) != 0 && + ((params.options & VACOPT_FULL) == 0 || + (params.options & VACOPT_PROCESS_MAIN) == 0)) toast_relid = rel->rd_rel->reltoastrelid; else toast_relid = InvalidOid; @@ -2252,16 +2277,16 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * table is required (e.g., PROCESS_TOAST is set), we force PROCESS_MAIN * to be set when we recurse to the TOAST table. */ - if (params->options & VACOPT_PROCESS_MAIN) + if (params.options & VACOPT_PROCESS_MAIN) { /* * Do the actual work --- either FULL or "lazy" vacuum */ - if (params->options & VACOPT_FULL) + if (params.options & VACOPT_FULL) { ClusterParams cluster_params = {0}; - if ((params->options & VACOPT_VERBOSE) != 0) + if ((params.options & VACOPT_VERBOSE) != 0) cluster_params.options |= CLUOPT_VERBOSE; /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */ @@ -2299,19 +2324,16 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, */ if (toast_relid != InvalidOid) { - VacuumParams toast_vacuum_params; - /* * Force VACOPT_PROCESS_MAIN so vacuum_rel() processes it. Likewise, * set toast_parent so that the privilege checks are done on the main * relation. NB: This is only safe to do because we hold a session * lock on the main relation that prevents concurrent deletion. 
*/ - memcpy(&toast_vacuum_params, params, sizeof(VacuumParams)); toast_vacuum_params.options |= VACOPT_PROCESS_MAIN; toast_vacuum_params.toast_parent = relid; - vacuum_rel(toast_relid, NULL, &toast_vacuum_params, bstrategy); + vacuum_rel(toast_relid, NULL, toast_vacuum_params, bstrategy); } /* diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 2b9d548cdeb10..8a37c08871a0a 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -63,7 +63,7 @@ typedef struct PVShared */ Oid relid; int elevel; - uint64 queryid; + int64 queryid; /* * Fields for both index vacuum and cleanup. @@ -268,7 +268,7 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, /* * Compute the number of parallel vacuum workers to launch */ - will_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes); + will_parallel_vacuum = palloc0_array(bool, nindexes); parallel_workers = parallel_vacuum_compute_workers(indrels, nindexes, nrequested_workers, will_parallel_vacuum); @@ -279,7 +279,7 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, return NULL; } - pvs = (ParallelVacuumState *) palloc0(sizeof(ParallelVacuumState)); + pvs = palloc0_object(ParallelVacuumState); pvs->indrels = indrels; pvs->nindexes = nindexes; pvs->will_parallel_vacuum = will_parallel_vacuum; @@ -444,7 +444,7 @@ parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats) if (indstats->istat_updated) { - istats[i] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + istats[i] = palloc0_object(IndexBulkDeleteResult); memcpy(istats[i], &indstats->istat, sizeof(IndexBulkDeleteResult)); } else diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c index 6f0301555e0ae..4cc2af7b5ecdc 100644 --- a/src/backend/commands/view.c +++ b/src/backend/commands/view.c @@ -22,7 +22,6 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "parser/analyze.h" -#include "parser/parse_relation.h" #include "rewrite/rewriteDefine.h" #include "rewrite/rewriteHandler.h" #include "rewrite/rewriteSupport.h" @@ -362,6 +361,7 @@ DefineView(ViewStmt *stmt, const char *queryString, ListCell *cell; bool check_option; ObjectAddress address; + ObjectAddress temp_object; /* * Run parse analysis to convert the raw parse tree to a Query. Note this @@ -484,12 +484,14 @@ DefineView(ViewStmt *stmt, const char *queryString, */ view = copyObject(stmt->view); /* don't corrupt original command */ if (view->relpersistence == RELPERSISTENCE_PERMANENT - && isQueryUsingTempRelation(viewParse)) + && query_uses_temp_object(viewParse, &temp_object)) { view->relpersistence = RELPERSISTENCE_TEMP; ereport(NOTICE, (errmsg("view \"%s\" will be a temporary view", - view->relname))); + view->relname), + errdetail("It depends on temporary %s.", + getObjectDescription(&temp_object, false)))); } /* diff --git a/src/backend/commands/wait.c b/src/backend/commands/wait.c new file mode 100644 index 0000000000000..a37bddaefb27c --- /dev/null +++ b/src/backend/commands/wait.c @@ -0,0 +1,211 @@ +/*------------------------------------------------------------------------- + * + * wait.c + * Implements WAIT FOR, which allows waiting for events such as + * time passing or an LSN having been replayed on a replica.
+ * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/commands/wait.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/xlogrecovery.h" +#include "access/xlogwait.h" +#include "commands/defrem.h" +#include "commands/wait.h" +#include "executor/executor.h" +#include "parser/parse_node.h" +#include "storage/proc.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/pg_lsn.h" +#include "utils/snapmgr.h" + + +void +ExecWaitStmt(ParseState *pstate, WaitStmt *stmt, DestReceiver *dest) +{ + XLogRecPtr lsn; + int64 timeout = 0; + WaitLSNResult waitLSNResult; + bool throw = true; + TupleDesc tupdesc; + TupOutputState *tstate; + const char *result = ""; + bool timeout_specified = false; + bool no_throw_specified = false; + + /* Parse and validate the mandatory LSN */ + lsn = DatumGetLSN(DirectFunctionCall1(pg_lsn_in, + CStringGetDatum(stmt->lsn_literal))); + + foreach_node(DefElem, defel, stmt->options) + { + if (strcmp(defel->defname, "timeout") == 0) + { + char *timeout_str; + const char *hintmsg; + double result; + + if (timeout_specified) + errorConflictingDefElem(defel, pstate); + timeout_specified = true; + + timeout_str = defGetString(defel); + + if (!parse_real(timeout_str, &result, GUC_UNIT_MS, &hintmsg)) + { + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid timeout value: \"%s\"", timeout_str), + hintmsg ? errhint("%s", _(hintmsg)) : 0); + } + + /* + * Get rid of any fractional part in the input. This is so we + * don't fail on just-out-of-range values that would round into + * range. + */ + result = rint(result); + + /* Range check */ + if (unlikely(isnan(result) || !FLOAT8_FITS_IN_INT64(result))) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("timeout value is out of range")); + + if (result < 0) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("timeout cannot be negative")); + + timeout = (int64) result; + } + else if (strcmp(defel->defname, "no_throw") == 0) + { + if (no_throw_specified) + errorConflictingDefElem(defel, pstate); + + no_throw_specified = true; + + throw = !defGetBoolean(defel); + } + else + { + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("option \"%s\" not recognized", + defel->defname), + parser_errposition(pstate, defel->location)); + } + } + + /* + * We are going to wait for LSN replay. We must first ensure that we + * don't hold a snapshot and that, correspondingly, our MyProc->xmin is + * invalid. Otherwise, our snapshot could prevent the replay of WAL + * records, implying a kind of self-deadlock. This is the reason why WAIT + * FOR is a command, not a procedure or function. + * + * First, check that there is no active snapshot. According to + * PlannedStmtRequiresSnapshot(), even in an atomic context, CallStmt is + * processed with a snapshot. Thankfully, we can pop this snapshot, + * because PortalRunUtility() tolerates that. + */ + if (ActiveSnapshotSet()) + PopActiveSnapshot(); + + /* + * Second, invalidate the catalog snapshot, if any. That completes the + * preparation. + */ + InvalidateCatalogSnapshot(); + + /* Give up if there is still an active or registered snapshot. 
*/ + if (HaveRegisteredOrActiveSnapshot()) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAIT FOR must only be called without an active or registered snapshot"), + errdetail("WAIT FOR cannot be executed from a function or a procedure, or within a transaction with an isolation level higher than READ COMMITTED.")); + + /* + * As a result, we hold no snapshot, and correspondingly our xmin + * should be unset. + */ + Assert(MyProc->xmin == InvalidTransactionId); + + waitLSNResult = WaitForLSN(WAIT_LSN_TYPE_REPLAY, lsn, timeout); + + /* + * Process the result of WaitForLSN(). Throw appropriate error if needed. + */ + switch (waitLSNResult) + { + case WAIT_LSN_RESULT_SUCCESS: + /* Nothing to do on success */ + result = "success"; + break; + + case WAIT_LSN_RESULT_TIMEOUT: + if (throw) + ereport(ERROR, + errcode(ERRCODE_QUERY_CANCELED), + errmsg("timed out while waiting for target LSN %X/%08X to be replayed; current replay LSN %X/%08X", + LSN_FORMAT_ARGS(lsn), + LSN_FORMAT_ARGS(GetXLogReplayRecPtr(NULL)))); + else + result = "timeout"; + break; + + case WAIT_LSN_RESULT_NOT_IN_RECOVERY: + if (throw) + { + if (PromoteIsTriggered()) + { + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errdetail("Recovery ended before replaying target LSN %X/%08X; last replay LSN %X/%08X.", + LSN_FORMAT_ARGS(lsn), + LSN_FORMAT_ARGS(GetXLogReplayRecPtr(NULL)))); + } + else + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Waiting for the replay LSN can only be executed during recovery.")); + } + else + result = "not in recovery"; + break; + } + + /* need a tuple descriptor representing a single TEXT column */ + tupdesc = WaitStmtResultDesc(stmt); + + /* prepare for projection of tuples */ + tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); + + /* Send it */ + do_text_output_oneline(tstate, result); + + end_tup_output(tstate); +} + +TupleDesc +WaitStmtResultDesc(WaitStmt *stmt) +{ + TupleDesc tupdesc; + + /* Need a tuple descriptor representing a single TEXT column */ + tupdesc = CreateTemplateTupleDesc(1); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", + TEXTOID, -1, 0); + return tupdesc; +} diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c index f1569879b529d..c35744b105e59 100644 --- a/src/backend/executor/execExpr.c +++ b/src/backend/executor/execExpr.c @@ -1312,7 +1312,7 @@ ExecInitExprRec(Expr *node, ExprState *state, } /* Set up the primary fmgr lookup information */ - finfo = palloc0(sizeof(FmgrInfo)); + finfo = palloc0_object(FmgrInfo); fcinfo = palloc0(SizeForFunctionCallInfo(2)); fmgr_info(cmpfuncid, finfo); fmgr_info_set_expr((Node *) node, finfo); @@ -1388,7 +1388,7 @@ ExecInitExprRec(Expr *node, ExprState *state, /* allocate scratch memory used by all steps of AND/OR */ if (boolexpr->boolop != NOT_EXPR) - scratch.d.boolexpr.anynull = (bool *) palloc(sizeof(bool)); + scratch.d.boolexpr.anynull = palloc_object(bool); /* * For each argument evaluate the argument itself, then @@ -1521,11 +1521,11 @@ ExecInitExprRec(Expr *node, ExprState *state, ReleaseTupleDesc(tupDesc); /* create workspace for column values */ - values = (Datum *) palloc(sizeof(Datum) * ncolumns); - nulls = (bool *) palloc(sizeof(bool) * ncolumns); + values = palloc_array(Datum, ncolumns); + nulls = palloc_array(bool, ncolumns); /* create shared composite-type-lookup cache struct */ - rowcachep = 
palloc(sizeof(ExprEvalRowtypeCache)); + rowcachep = palloc_object(ExprEvalRowtypeCache); rowcachep->cacheptr = NULL; /* emit code to evaluate the composite input value */ @@ -1634,7 +1634,7 @@ ExecInitExprRec(Expr *node, ExprState *state, scratch.opcode = EEOP_IOCOERCE_SAFE; /* lookup the source type's output function */ - scratch.d.iocoerce.finfo_out = palloc0(sizeof(FmgrInfo)); + scratch.d.iocoerce.finfo_out = palloc0_object(FmgrInfo); scratch.d.iocoerce.fcinfo_data_out = palloc0(SizeForFunctionCallInfo(1)); getTypeOutputInfo(exprType((Node *) iocoerce->arg), @@ -1646,7 +1646,7 @@ ExecInitExprRec(Expr *node, ExprState *state, 1, InvalidOid, NULL, NULL); /* lookup the result type's input function */ - scratch.d.iocoerce.finfo_in = palloc0(sizeof(FmgrInfo)); + scratch.d.iocoerce.finfo_in = palloc0_object(FmgrInfo); scratch.d.iocoerce.fcinfo_data_in = palloc0(SizeForFunctionCallInfo(3)); getTypeInputInfo(iocoerce->resulttype, @@ -1699,8 +1699,8 @@ ExecInitExprRec(Expr *node, ExprState *state, elemstate->parent = state->parent; elemstate->ext_params = state->ext_params; - elemstate->innermost_caseval = (Datum *) palloc(sizeof(Datum)); - elemstate->innermost_casenull = (bool *) palloc(sizeof(bool)); + elemstate->innermost_caseval = palloc_object(Datum); + elemstate->innermost_casenull = palloc_object(bool); ExecInitExprRec(acoerce->elemexpr, elemstate, &elemstate->resvalue, &elemstate->resnull); @@ -1727,8 +1727,7 @@ ExecInitExprRec(Expr *node, ExprState *state, if (elemstate) { /* Set up workspace for array_map */ - scratch.d.arraycoerce.amstate = - (ArrayMapState *) palloc0(sizeof(ArrayMapState)); + scratch.d.arraycoerce.amstate = palloc0_object(ArrayMapState); } else { @@ -1783,8 +1782,8 @@ ExecInitExprRec(Expr *node, ExprState *state, if (caseExpr->arg != NULL) { /* Evaluate testexpr into caseval/casenull workspace */ - caseval = palloc(sizeof(Datum)); - casenull = palloc(sizeof(bool)); + caseval = palloc_object(Datum); + casenull = palloc_object(bool); ExecInitExprRec(caseExpr->arg, state, caseval, casenull); @@ -1930,9 +1929,9 @@ ExecInitExprRec(Expr *node, ExprState *state, */ scratch.opcode = EEOP_ARRAYEXPR; scratch.d.arrayexpr.elemvalues = - (Datum *) palloc(sizeof(Datum) * nelems); + palloc_array(Datum, nelems); scratch.d.arrayexpr.elemnulls = - (bool *) palloc(sizeof(bool) * nelems); + palloc_array(bool, nelems); scratch.d.arrayexpr.nelems = nelems; /* fill remaining fields of step */ @@ -2006,9 +2005,9 @@ ExecInitExprRec(Expr *node, ExprState *state, /* space for the individual field datums */ scratch.d.row.elemvalues = - (Datum *) palloc(sizeof(Datum) * nelems); + palloc_array(Datum, nelems); scratch.d.row.elemnulls = - (bool *) palloc(sizeof(bool) * nelems); + palloc_array(bool, nelems); /* as explained above, make sure any extra columns are null */ memset(scratch.d.row.elemnulls, true, sizeof(bool) * nelems); @@ -2109,7 +2108,7 @@ ExecInitExprRec(Expr *node, ExprState *state, BTORDER_PROC, lefttype, righttype, opfamily); /* Set up the primary fmgr lookup information */ - finfo = palloc0(sizeof(FmgrInfo)); + finfo = palloc0_object(FmgrInfo); fcinfo = palloc0(SizeForFunctionCallInfo(2)); fmgr_info(proc, finfo); fmgr_info_set_expr((Node *) node, finfo); @@ -2252,7 +2251,7 @@ ExecInitExprRec(Expr *node, ExprState *state, */ /* Perform function lookup */ - finfo = palloc0(sizeof(FmgrInfo)); + finfo = palloc0_object(FmgrInfo); fcinfo = palloc0(SizeForFunctionCallInfo(2)); fmgr_info(typentry->cmp_proc, finfo); fmgr_info_set_expr((Node *) node, finfo); @@ -2261,10 +2260,8 @@ 
ExecInitExprRec(Expr *node, ExprState *state, scratch.opcode = EEOP_MINMAX; /* allocate space to store arguments */ - scratch.d.minmax.values = - (Datum *) palloc(sizeof(Datum) * nelems); - scratch.d.minmax.nulls = - (bool *) palloc(sizeof(bool) * nelems); + scratch.d.minmax.values = palloc_array(Datum, nelems); + scratch.d.minmax.nulls = palloc_array(bool, nelems); scratch.d.minmax.nelems = nelems; scratch.d.minmax.op = minmaxexpr->op; @@ -2313,10 +2310,8 @@ ExecInitExprRec(Expr *node, ExprState *state, /* allocate space for storing all the arguments */ if (nnamed) { - scratch.d.xmlexpr.named_argvalue = - (Datum *) palloc(sizeof(Datum) * nnamed); - scratch.d.xmlexpr.named_argnull = - (bool *) palloc(sizeof(bool) * nnamed); + scratch.d.xmlexpr.named_argvalue = palloc_array(Datum, nnamed); + scratch.d.xmlexpr.named_argnull = palloc_array(bool, nnamed); } else { @@ -2326,10 +2321,8 @@ ExecInitExprRec(Expr *node, ExprState *state, if (nargs) { - scratch.d.xmlexpr.argvalue = - (Datum *) palloc(sizeof(Datum) * nargs); - scratch.d.xmlexpr.argnull = - (bool *) palloc(sizeof(bool) * nargs); + scratch.d.xmlexpr.argvalue = palloc_array(Datum, nargs); + scratch.d.xmlexpr.argnull = palloc_array(bool, nargs); } else { @@ -2398,15 +2391,15 @@ ExecInitExprRec(Expr *node, ExprState *state, { JsonConstructorExprState *jcstate; - jcstate = palloc0(sizeof(JsonConstructorExprState)); + jcstate = palloc0_object(JsonConstructorExprState); scratch.opcode = EEOP_JSON_CONSTRUCTOR; scratch.d.json_constructor.jcstate = jcstate; jcstate->constructor = ctor; - jcstate->arg_values = (Datum *) palloc(sizeof(Datum) * nargs); - jcstate->arg_nulls = (bool *) palloc(sizeof(bool) * nargs); - jcstate->arg_types = (Oid *) palloc(sizeof(Oid) * nargs); + jcstate->arg_values = palloc_array(Datum, nargs); + jcstate->arg_nulls = palloc_array(bool, nargs); + jcstate->arg_types = palloc_array(Oid, nargs); jcstate->nargs = nargs; foreach(lc, args) @@ -2680,7 +2673,7 @@ ExprEvalPushStep(ExprState *es, const ExprEvalStep *s) if (es->steps_alloc == 0) { es->steps_alloc = 16; - es->steps = palloc(sizeof(ExprEvalStep) * es->steps_alloc); + es->steps = palloc_array(ExprEvalStep, es->steps_alloc); } else if (es->steps_alloc == es->steps_len) { @@ -2732,7 +2725,7 @@ ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, Oid funcid, FUNC_MAX_ARGS))); /* Allocate function lookup data and parameter workspace for this call */ - scratch->d.func.finfo = palloc0(sizeof(FmgrInfo)); + scratch->d.func.finfo = palloc0_object(FmgrInfo); scratch->d.func.fcinfo_data = palloc0(SizeForFunctionCallInfo(nargs)); flinfo = scratch->d.func.finfo; fcinfo = scratch->d.func.fcinfo_data; @@ -3557,8 +3550,7 @@ ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest, * during executor initialization. That means we don't need typcache.c to * provide compiled exprs. 
*/ - constraint_ref = (DomainConstraintRef *) - palloc(sizeof(DomainConstraintRef)); + constraint_ref = palloc_object(DomainConstraintRef); InitDomainConstraintRef(ctest->resulttype, constraint_ref, CurrentMemoryContext, @@ -3588,9 +3580,9 @@ ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest, if (scratch->d.domaincheck.checkvalue == NULL) { scratch->d.domaincheck.checkvalue = - (Datum *) palloc(sizeof(Datum)); + palloc_object(Datum); scratch->d.domaincheck.checknull = - (bool *) palloc(sizeof(bool)); + palloc_object(bool); } /* @@ -3608,8 +3600,8 @@ ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest, ExprEvalStep scratch2 = {0}; /* Yes, so make output workspace for MAKE_READONLY */ - domainval = (Datum *) palloc(sizeof(Datum)); - domainnull = (bool *) palloc(sizeof(bool)); + domainval = palloc_object(Datum); + domainnull = palloc_object(bool); /* Emit MAKE_READONLY */ scratch2.opcode = EEOP_MAKE_READONLY; @@ -4159,7 +4151,7 @@ ExecBuildHash32FromAttrs(TupleDesc desc, const TupleTableSlotOps *ops, * one column to hash or an initial value plus one column. */ if ((int64) numCols + (init_value != 0) > 1) - iresult = palloc(sizeof(NullableDatum)); + iresult = palloc_object(NullableDatum); /* find the highest attnum so we deform the tuple to that point */ for (int i = 0; i < numCols; i++) @@ -4325,7 +4317,7 @@ ExecBuildHash32Expr(TupleDesc desc, const TupleTableSlotOps *ops, * than one expression to hash or an initial value plus one expression. */ if ((int64) num_exprs + (init_value != 0) > 1) - iresult = palloc(sizeof(NullableDatum)); + iresult = palloc_object(NullableDatum); if (init_value == 0) { @@ -4371,7 +4363,7 @@ ExecBuildHash32Expr(TupleDesc desc, const TupleTableSlotOps *ops, funcid = hashfunc_oids[i]; /* Allocate hash function lookup data. 
*/ - finfo = palloc0(sizeof(FmgrInfo)); + finfo = palloc0_object(FmgrInfo); fcinfo = palloc0(SizeForFunctionCallInfo(1)); fmgr_info(funcid, finfo); @@ -4540,7 +4532,7 @@ ExecBuildGroupingEqual(TupleDesc ldesc, TupleDesc rdesc, InvokeFunctionExecuteHook(foid); /* Set up the primary fmgr lookup information */ - finfo = palloc0(sizeof(FmgrInfo)); + finfo = palloc0_object(FmgrInfo); fcinfo = palloc0(SizeForFunctionCallInfo(2)); fmgr_info(foid, finfo); fmgr_info_set_expr(NULL, finfo); @@ -4676,7 +4668,7 @@ ExecBuildParamSetEqual(TupleDesc desc, InvokeFunctionExecuteHook(foid); /* Set up the primary fmgr lookup information */ - finfo = palloc0(sizeof(FmgrInfo)); + finfo = palloc0_object(FmgrInfo); fcinfo = palloc0(SizeForFunctionCallInfo(2)); fmgr_info(foid, finfo); fmgr_info_set_expr(NULL, finfo); @@ -4749,7 +4741,7 @@ ExecInitJsonExpr(JsonExpr *jsexpr, ExprState *state, Datum *resv, bool *resnull, ExprEvalStep *scratch) { - JsonExprState *jsestate = palloc0(sizeof(JsonExprState)); + JsonExprState *jsestate = palloc0_object(JsonExprState); ListCell *argexprlc; ListCell *argnamelc; List *jumps_return_null = NIL; @@ -4800,14 +4792,14 @@ ExecInitJsonExpr(JsonExpr *jsexpr, ExprState *state, { Expr *argexpr = (Expr *) lfirst(argexprlc); String *argname = lfirst_node(String, argnamelc); - JsonPathVariable *var = palloc(sizeof(*var)); + JsonPathVariable *var = palloc_object(JsonPathVariable); var->name = argname->sval; var->namelen = strlen(var->name); var->typid = exprType((Node *) argexpr); var->typmod = exprTypmod((Node *) argexpr); - ExecInitExprRec((Expr *) argexpr, state, &var->value, &var->isnull); + ExecInitExprRec(argexpr, state, &var->value, &var->isnull); jsestate->args = lappend(jsestate->args, var); } @@ -4874,7 +4866,7 @@ ExecInitJsonExpr(JsonExpr *jsexpr, ExprState *state, FunctionCallInfo fcinfo; getTypeInputInfo(jsexpr->returning->typid, &typinput, &typioparam); - finfo = palloc0(sizeof(FmgrInfo)); + finfo = palloc0_object(FmgrInfo); fcinfo = palloc0(SizeForFunctionCallInfo(3)); fmgr_info(typinput, finfo); fmgr_info_set_expr((Node *) jsexpr->returning, finfo); diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 8a72b5e70a4ec..5e7bd933afc3d 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -2815,7 +2815,7 @@ ExecJustHashVarImpl(ExprState *state, TupleTableSlot *slot, bool *isnull) *isnull = false; if (!fcinfo->args[0].isnull) - return DatumGetUInt32(hashop->d.hashdatum.fn_addr(fcinfo)); + return hashop->d.hashdatum.fn_addr(fcinfo); else return (Datum) 0; } @@ -2849,7 +2849,7 @@ ExecJustHashVarVirtImpl(ExprState *state, TupleTableSlot *slot, bool *isnull) *isnull = false; if (!fcinfo->args[0].isnull) - return DatumGetUInt32(hashop->d.hashdatum.fn_addr(fcinfo)); + return hashop->d.hashdatum.fn_addr(fcinfo); else return (Datum) 0; } @@ -2892,7 +2892,7 @@ ExecJustHashOuterVarStrict(ExprState *state, ExprContext *econtext, if (!fcinfo->args[0].isnull) { *isnull = false; - return DatumGetUInt32(hashop->d.hashdatum.fn_addr(fcinfo)); + return hashop->d.hashdatum.fn_addr(fcinfo); } else { @@ -3283,7 +3283,7 @@ ExecEvalNextValueExpr(ExprState *state, ExprEvalStep *op) *op->resvalue = Int32GetDatum((int32) newval); break; case INT8OID: - *op->resvalue = Int64GetDatum((int64) newval); + *op->resvalue = Int64GetDatum(newval); break; default: elog(ERROR, "unsupported sequence type %u", @@ -4393,7 +4393,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco * is the equality 
function and we need not-equals. */ if (!inclause) - result = !result; + result = BoolGetDatum(!DatumGetBool(result)); } } @@ -5228,7 +5228,6 @@ ExecEvalJsonCoercionFinish(ExprState *state, ExprEvalStep *op) * JsonBehavior expression. */ jsestate->escontext.error_occurred = false; - jsestate->escontext.error_occurred = false; jsestate->escontext.details_wanted = true; } } diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index 255bd795361a2..8eb4c25e1cb0f 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -14,15 +14,18 @@ */ #include "postgres.h" +#include <math.h> + +#include "access/htup_details.h" #include "access/parallel.h" #include "common/hashfn.h" #include "executor/executor.h" #include "miscadmin.h" #include "utils/lsyscache.h" -static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2); +static int TupleHashTableMatch(struct tuplehash_hash *tb, MinimalTuple tuple1, MinimalTuple tuple2); static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb, - const MinimalTuple tuple); + MinimalTuple tuple); static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 hash); @@ -143,10 +146,10 @@ execTuplesHashPrepare(int numCols, * eqfuncoids: OIDs of equality comparison functions to use * hashfunctions: FmgrInfos of datatype-specific hashing functions to use * collations: collations to use in comparisons - * nbuckets: initial estimate of hashtable size - * additionalsize: size of data stored in ->additional - * metacxt: memory context for long-lived allocation, but not per-entry data - * tablecxt: memory context in which to store table entries + * nelements: initial estimate of hashtable size + * additionalsize: size of data that may be stored along with the hash entry + * metacxt: memory context for long-lived data and the simplehash table + * tuplescxt: memory context in which to store the hashed tuples themselves * tempcxt: short-lived context for evaluating hash and comparison functions * use_variable_hash_iv: if true, adjust hash IV per-parallel-worker * @@ -156,6 +159,26 @@ execTuplesHashPrepare(int numCols, * * Note that the keyColIdx, hashfunctions, and collations arrays must be * allocated in storage that will live as long as the hashtable does. + * + * The metacxt and tuplescxt are separate because it's usually desirable for + * tuplescxt to be a BumpContext to avoid memory wastage, while metacxt must + * support pfree in case the simplehash table needs to be enlarged. (We could + * simplify the API of TupleHashTables by managing the tuplescxt internally. + * But that would be disadvantageous to nodeAgg.c and nodeSubplan.c, which use + * a single tuplescxt for multiple TupleHashTables that are reset together.) + * + * LookupTupleHashEntry, FindTupleHashEntry, and related functions may leak + * memory in the tempcxt. It is the caller's responsibility to reset that + * context reasonably often, typically once per tuple. (We do it that way, + * rather than managing an extra context within the hashtable, because in many + * cases the caller can specify a tempcxt that it needs to reset per-tuple + * anyway.) + * + * We don't currently provide DestroyTupleHashTable functionality; the hash + * table will be cleaned up at destruction of the metacxt. (Some callers + * bother to delete the tuplescxt explicitly, though it'd be sufficient to + * ensure it's a child of the metacxt.)
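The comment above recommends a BumpContext for tuplescxt. A hedged sketch of how a caller might set up the pair of contexts (the names are illustrative; AllocSetContextCreate, BumpContextCreate, and ALLOCSET_DEFAULT_SIZES are the real memory-context APIs):

    /* Long-lived context: must support pfree() so simplehash can grow. */
    MemoryContext metacxt = AllocSetContextCreate(CurrentMemoryContext,
                                                  "hashtable metadata",
                                                  ALLOCSET_DEFAULT_SIZES);

    /*
     * Tuple storage: bump allocation adds no per-chunk header, so densely
     * packed MinimalTuples waste no space.  A bump context supports only
     * wholesale reset, which matches ResetTupleHashTable's usage.  Making
     * it a child of metacxt also satisfies the cleanup expectation above.
     */
    MemoryContext tuplescxt = BumpContextCreate(metacxt,
                                                "hashtable tuples",
                                                ALLOCSET_DEFAULT_SIZES);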
There's not much point in working + * harder than this so long as the expression-evaluation infrastructure + * behaves similarly. */ TupleHashTable BuildTupleHashTable(PlanState *parent, @@ -166,37 +189,47 @@ BuildTupleHashTable(PlanState *parent, const Oid *eqfuncoids, FmgrInfo *hashfunctions, Oid *collations, - long nbuckets, + double nelements, Size additionalsize, MemoryContext metacxt, - MemoryContext tablecxt, + MemoryContext tuplescxt, MemoryContext tempcxt, bool use_variable_hash_iv) { TupleHashTable hashtable; - Size entrysize; - Size hash_mem_limit; + uint32 nbuckets; MemoryContext oldcontext; - bool allow_jit; uint32 hash_iv = 0; - Assert(nbuckets > 0); - additionalsize = MAXALIGN(additionalsize); - entrysize = sizeof(TupleHashEntryData) + additionalsize; + /* + * tuplehash_create requires a uint32 element count, so we had better + * clamp the given nelements to fit in that. As long as we have to do + * that, we might as well protect against completely insane input like + * zero or NaN. But it is not our job here to enforce issues like staying + * within hash_mem: the caller should have done that, and we don't have + * enough info to second-guess. + */ + if (isnan(nelements) || nelements <= 0) + nbuckets = 1; + else if (nelements >= PG_UINT32_MAX) + nbuckets = PG_UINT32_MAX; + else + nbuckets = (uint32) nelements; - /* Limit initial table size request to not more than hash_mem */ - hash_mem_limit = get_hash_memory_limit() / entrysize; - if (nbuckets > hash_mem_limit) - nbuckets = hash_mem_limit; + /* tuplescxt must be separate, else ResetTupleHashTable breaks things */ + Assert(metacxt != tuplescxt); + + /* ensure additionalsize is maxalign'ed */ + additionalsize = MAXALIGN(additionalsize); oldcontext = MemoryContextSwitchTo(metacxt); - hashtable = (TupleHashTable) palloc(sizeof(TupleHashTableData)); + hashtable = palloc_object(TupleHashTableData); hashtable->numCols = numCols; hashtable->keyColIdx = keyColIdx; hashtable->tab_collations = collations; - hashtable->tablecxt = tablecxt; + hashtable->tuplescxt = tuplescxt; hashtable->tempcxt = tempcxt; hashtable->additionalsize = additionalsize; hashtable->tableslot = NULL; /* will be made on first lookup */ @@ -224,16 +257,6 @@ BuildTupleHashTable(PlanState *parent, hashtable->tableslot = MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc), &TTSOpsMinimalTuple); - /* - * If the caller fails to make the metacxt different from the tablecxt, - * allowing JIT would lead to the generated functions to a) live longer - * than the query or b) be re-generated each time the table is being - * reset. Therefore prevent JIT from being used in that case, by not - * providing a parent node (which prevents accessing the JitContext in the - * EState). - */ - allow_jit = (metacxt != tablecxt); - /* build hash ExprState for all columns */ hashtable->tab_hash_expr = ExecBuildHash32FromAttrs(inputDesc, inputOps, @@ -241,7 +264,7 @@ BuildTupleHashTable(PlanState *parent, collations, numCols, keyColIdx, - allow_jit ? parent : NULL, + parent, hash_iv); /* build comparator for all columns */ @@ -250,7 +273,7 @@ BuildTupleHashTable(PlanState *parent, &TTSOpsMinimalTuple, numCols, keyColIdx, eqfuncoids, collations, - allow_jit ? parent : NULL); + parent); /* * While not pretty, it's ok to not shut down this context, but instead @@ -267,13 +290,77 @@ BuildTupleHashTable(PlanState *parent, /* * Reset contents of the hashtable to be empty, preserving all the non-content - * state. 
Note that the tablecxt passed to BuildTupleHashTable() should - * also be reset, otherwise there will be leaks. + * state. + * + * Note: in usages where several TupleHashTables share a tuplescxt, all must + * be reset together, as the first one's reset call will destroy all their + * data. The additional reset calls for the rest will redundantly reset the + * tuplescxt. But because of mcxt.c's isReset flag, that's cheap enough that + * we need not avoid it. */ void ResetTupleHashTable(TupleHashTable hashtable) { tuplehash_reset(hashtable->hashtab); + MemoryContextReset(hashtable->tuplescxt); +} + +/* + * Estimate the amount of space needed for a TupleHashTable with nentries + * entries, if the tuples have average data width tupleWidth and the caller + * requires additionalsize extra space per entry. + * + * Return SIZE_MAX if it'd overflow size_t. + * + * nentries is "double" because this is meant for use by the planner, + * which typically works with double rowcount estimates. So we'd need to + * clamp to integer somewhere and that might as well be here. We do expect + * the value not to be NaN or negative, else the result will be garbage. + */ +Size +EstimateTupleHashTableSpace(double nentries, + Size tupleWidth, + Size additionalsize) +{ + Size sh_space; + double tuples_space; + + /* First estimate the space needed for the simplehash table */ + sh_space = tuplehash_estimate_space(nentries); + + /* Give up if that's already too big */ + if (sh_space >= SIZE_MAX) + return sh_space; + + /* + * Compute space needed for hashed tuples with additional data. nentries + * must be somewhat sane, so it should be safe to compute this product. + * + * We assume that the hashed tuples will be kept in a BumpContext so that + * there is not additional per-tuple overhead. + * + * (Note that this is only accurate if MEMORY_CONTEXT_CHECKING is off, + * else bump.c will add a MemoryChunk header to each tuple. However, it + * seems undesirable for debug builds to make different planning choices + * than production builds, so we assume the production behavior always.) + */ + tuples_space = nentries * (MAXALIGN(SizeofMinimalTupleHeader) + + MAXALIGN(tupleWidth) + + MAXALIGN(additionalsize)); + + /* + * Check for size_t overflow. This coding is trickier than it may appear, + * because on 64-bit machines SIZE_MAX cannot be represented exactly as a + * double. We must cast it explicitly to suppress compiler warnings about + * an inexact conversion, and we must trust that any double value that + * compares strictly less than "(double) SIZE_MAX" will cast to a + * representable size_t value. + */ + if (sh_space + tuples_space >= (double) SIZE_MAX) + return SIZE_MAX; + + /* We don't bother estimating size of the miscellaneous overhead data */ + return (Size) (sh_space + tuples_space); } /* @@ -288,7 +375,7 @@ ResetTupleHashTable(TupleHashTable hashtable) * * If isnew isn't NULL, then a new entry is created if no existing entry * matches. On return, *isnew is true if the entry is newly created, - * false if it existed already. ->additional_data in the new entry has + * false if it existed already. The additional data in the new entry has * been zeroed. 
*/ TupleHashEntry @@ -413,7 +500,7 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, */ static uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb, - const MinimalTuple tuple) + MinimalTuple tuple) { TupleHashTable hashtable = (TupleHashTable) tb->private_data; uint32 hashkey; @@ -483,10 +570,10 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, /* created new entry */ *isnew = true; - MemoryContextSwitchTo(hashtable->tablecxt); + MemoryContextSwitchTo(hashtable->tuplescxt); /* - * Copy the first tuple into the table context, and request + * Copy the first tuple into the tuples context, and request * additionalsize extra bytes before the allocation. * * The caller can get a pointer to the additional data with @@ -511,7 +598,7 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, * See whether two tuples (presumably of the same hash value) match */ static int -TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2) +TupleHashTableMatch(struct tuplehash_hash *tb, MinimalTuple tuple1, MinimalTuple tuple2) { TupleTableSlot *slot1; TupleTableSlot *slot2; diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index bdf862b24062e..0b3a31f170351 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -114,6 +114,7 @@ #include "executor/executor.h" #include "nodes/nodeFuncs.h" #include "storage/lmgr.h" +#include "utils/injection_point.h" #include "utils/multirangetypes.h" #include "utils/rangetypes.h" #include "utils/snapmgr.h" @@ -128,7 +129,7 @@ typedef enum static bool check_exclusion_or_unique_constraint(Relation heap, Relation index, IndexInfo *indexInfo, - ItemPointer tupleid, + const ItemPointerData *tupleid, const Datum *values, const bool *isnull, EState *estate, bool newIndex, CEOUC_WAIT_MODE waitMode, @@ -187,8 +188,8 @@ ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative) /* * allocate space for result arrays */ - relationDescs = (RelationPtr) palloc(len * sizeof(Relation)); - indexInfoArray = (IndexInfo **) palloc(len * sizeof(IndexInfo *)); + relationDescs = palloc_array(Relation, len); + indexInfoArray = palloc_array(IndexInfo *, len); resultRelInfo->ri_NumIndices = len; resultRelInfo->ri_IndexRelationDescs = relationDescs; @@ -279,7 +280,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * executor is performing an UPDATE that could not use an * optimization like heapam's HOT (in more general terms a * call to table_tuple_update() took place and set - * 'update_indexes' to TUUI_All). Receiving this hint makes + * 'update_indexes' to TU_All). Receiving this hint makes * us consider if we should pass down the 'indexUnchanged' * hint in turn. That's something that we figure out for * each index_insert() call iff 'update' is true. @@ -290,7 +291,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * HOT has been applied and any updated columns are indexed * only by summarizing indexes (or in more general terms a * call to table_tuple_update() took place and set - * 'update_indexes' to TUUI_Summarizing). We can (and must) + * 'update_indexes' to TU_Summarizing). We can (and must) * therefore only update the indexes that have * 'amsummarizing' = true. 
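A side note on the `const ItemPointerData *` spelling that the execIndexing.c prototypes below switch to: ItemPointer is itself a pointer typedef, so writing `const ItemPointer` would make the pointer, not the pointee, read-only. A sketch of the distinction (struct fields abridged from storage/itemptr.h):

    typedef struct ItemPointerData
    {
        unsigned short ip_posid;    /* offset number; block id omitted */
    } ItemPointerData;

    typedef ItemPointerData *ItemPointer;

    void
    example(const ItemPointer a, const ItemPointerData *b)
    {
        a->ip_posid = 1;        /* compiles: only the pointer 'a' is const */
        /* b->ip_posid = 1;        error: 'b' points to const data */
    }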
* @@ -541,7 +542,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, bool ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate, ItemPointer conflictTid, - ItemPointer tupleid, List *arbiterIndexes) + const ItemPointerData *tupleid, List *arbiterIndexes) { int i; int numIndices; @@ -703,7 +704,7 @@ ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, static bool check_exclusion_or_unique_constraint(Relation heap, Relation index, IndexInfo *indexInfo, - ItemPointer tupleid, + const ItemPointerData *tupleid, const Datum *values, const bool *isnull, EState *estate, bool newIndex, CEOUC_WAIT_MODE waitMode, @@ -943,6 +944,11 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, ExecDropSingleTupleTableSlot(existing_slot); +#ifdef USE_INJECTION_POINTS + if (!conflict) + INJECTION_POINT("check-exclusion-or-unique-constraint-no-conflict", NULL); +#endif + return !conflict; } @@ -955,7 +961,7 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, void check_exclusion_constraint(Relation heap, Relation index, IndexInfo *indexInfo, - ItemPointer tupleid, + const ItemPointerData *tupleid, const Datum *values, const bool *isnull, EState *estate, bool newIndex) { diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 0391798dd2c33..797d8b1ca1cbc 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -84,7 +84,6 @@ static void ExecutePlan(QueryDesc *queryDesc, uint64 numberTuples, ScanDirection direction, DestReceiver *dest); -static bool ExecCheckOneRelPerms(RTEPermissionInfo *perminfo); static bool ExecCheckPermissionsModified(Oid relOid, Oid userid, Bitmapset *modifiedCols, AclMode requiredPerms); @@ -190,7 +189,7 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) nParamExec = list_length(queryDesc->plannedstmt->paramExecTypes); estate->es_param_exec_vals = (ParamExecData *) - palloc0(nParamExec * sizeof(ParamExecData)); + palloc0_array(ParamExecData, nParamExec); } /* We now require all callers to provide sourceText */ @@ -643,7 +642,7 @@ ExecCheckPermissions(List *rangeTable, List *rteperminfos, * ExecCheckOneRelPerms * Check access permissions for a single relation. */ -static bool +bool ExecCheckOneRelPerms(RTEPermissionInfo *perminfo) { AclMode requiredPerms; @@ -877,7 +876,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) if (plannedstmt->rowMarks) { estate->es_rowmarks = (ExecRowMark **) - palloc0(estate->es_range_table_size * sizeof(ExecRowMark *)); + palloc0_array(ExecRowMark *, estate->es_range_table_size); foreach(l, plannedstmt->rowMarks) { PlanRowMark *rc = (PlanRowMark *) lfirst(l); @@ -921,7 +920,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) if (relation) CheckValidRowMarkRel(relation, rc->markType); - erm = (ExecRowMark *) palloc(sizeof(ExecRowMark)); + erm = palloc_object(ExecRowMark); erm->relation = relation; erm->relid = relid; erm->rti = rc->rti; @@ -1037,6 +1036,9 @@ InitPlan(QueryDesc *queryDesc, int eflags) * Generally the parser and/or planner should have noticed any such mistake * already, but let's make sure. * + * For INSERT ON CONFLICT, the result relation is required to support the + * onConflictAction, regardless of whether a conflict actually occurs. + * * For MERGE, mergeActions is the list of actions that may be performed. The * result relation is required to support every action, regardless of whether * or not they are all executed. 
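+ *
+ * For illustration, mirroring the call sites updated below: a plain INSERT
+ * with no ON CONFLICT clause and no MERGE actions is now checked as
+ *
+ *     CheckValidResultRel(resultRelInfo, CMD_INSERT, ONCONFLICT_NONE, NIL);
+ *
+ * while INSERT ... ON CONFLICT DO UPDATE passes ONCONFLICT_UPDATE so that
+ * UPDATE support is verified up front.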
@@ -1046,7 +1048,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) */ void CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation, - List *mergeActions) + OnConflictAction onConflictAction, List *mergeActions) { Relation resultRel = resultRelInfo->ri_RelationDesc; FdwRoutine *fdwroutine; @@ -1059,7 +1061,23 @@ CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation, { case RELKIND_RELATION: case RELKIND_PARTITIONED_TABLE: - CheckCmdReplicaIdentity(resultRel, operation); + + /* + * For MERGE, check that the target relation supports each action. + * For other operations, just check the operation itself. + */ + if (operation == CMD_MERGE) + foreach_node(MergeAction, action, mergeActions) + CheckCmdReplicaIdentity(resultRel, action->commandType); + else + CheckCmdReplicaIdentity(resultRel, operation); + + /* + * For INSERT ON CONFLICT DO UPDATE, additionally check that the + * target relation supports UPDATE. + */ + if (onConflictAction == ONCONFLICT_UPDATE) + CheckCmdReplicaIdentity(resultRel, CMD_UPDATE); break; case RELKIND_SEQUENCE: ereport(ERROR, @@ -1244,9 +1262,9 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, int n = resultRelInfo->ri_TrigDesc->numtriggers; resultRelInfo->ri_TrigFunctions = (FmgrInfo *) - palloc0(n * sizeof(FmgrInfo)); + palloc0_array(FmgrInfo, n); resultRelInfo->ri_TrigWhenExprs = (ExprState **) - palloc0(n * sizeof(ExprState *)); + palloc0_array(ExprState *, n); if (instrument_options) resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false); } @@ -1308,10 +1326,9 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, * Get a ResultRelInfo for a trigger target relation. * * Most of the time, triggers are fired on one of the result relations of the - * query, and so we can just return a member of the es_result_relations array, - * or the es_tuple_routing_result_relations list (if any). (Note: in self-join - * situations there might be multiple members with the same OID; if so it - * doesn't matter which one we pick.) + * query, and so we can just return a suitable one we already made and stored + * in the es_opened_result_relations or es_tuple_routing_result_relations + * Lists. * * However, it is sometimes necessary to fire triggers on other relations; * this happens mainly when an RI update trigger queues additional triggers @@ -1331,11 +1348,20 @@ ExecGetTriggerResultRel(EState *estate, Oid relid, Relation rel; MemoryContext oldcontext; + /* + * Before creating a new ResultRelInfo, check if we've already made and + * cached one for this relation. We must ensure that the given + * 'rootRelInfo' matches the one stored in the cached ResultRelInfo as + * trigger handling for partitions can result in mixed requirements for + * what ri_RootResultRelInfo is set to. 
+ */ + /* Search through the query result relations */ foreach(l, estate->es_opened_result_relations) { rInfo = lfirst(l); - if (RelationGetRelid(rInfo->ri_RelationDesc) == relid) + if (RelationGetRelid(rInfo->ri_RelationDesc) == relid && + rInfo->ri_RootResultRelInfo == rootRelInfo) return rInfo; } @@ -1346,7 +1372,8 @@ ExecGetTriggerResultRel(EState *estate, Oid relid, foreach(l, estate->es_tuple_routing_result_relations) { rInfo = (ResultRelInfo *) lfirst(l); - if (RelationGetRelid(rInfo->ri_RelationDesc) == relid) + if (RelationGetRelid(rInfo->ri_RelationDesc) == relid && + rInfo->ri_RootResultRelInfo == rootRelInfo) return rInfo; } @@ -1354,7 +1381,8 @@ ExecGetTriggerResultRel(EState *estate, Oid relid, foreach(l, estate->es_trig_target_relations) { rInfo = (ResultRelInfo *) lfirst(l); - if (RelationGetRelid(rInfo->ri_RelationDesc) == relid) + if (RelationGetRelid(rInfo->ri_RelationDesc) == relid && + rInfo->ri_RootResultRelInfo == rootRelInfo) return rInfo; } /* Nope, so we need a new one */ @@ -2550,7 +2578,7 @@ ExecFindRowMark(EState *estate, Index rti, bool missing_ok) ExecAuxRowMark * ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) { - ExecAuxRowMark *aerm = (ExecAuxRowMark *) palloc0(sizeof(ExecAuxRowMark)); + ExecAuxRowMark *aerm = palloc0_object(ExecAuxRowMark); char resname[32]; aerm->rowmark = erm; @@ -2706,8 +2734,7 @@ EvalPlanQualInit(EPQState *epqstate, EState *parentestate, * EvalPlanQualBegin(). */ epqstate->tuple_table = NIL; - epqstate->relsubs_slot = (TupleTableSlot **) - palloc0(rtsize * sizeof(TupleTableSlot *)); + epqstate->relsubs_slot = palloc0_array(TupleTableSlot *, rtsize); /* ... and remember data that EvalPlanQualBegin will need */ epqstate->plan = subplan; @@ -3046,8 +3073,7 @@ EvalPlanQualStart(EPQState *epqstate, Plan *planTree) /* now make the internal param workspace ... */ i = list_length(parentestate->es_plannedstmt->paramExecTypes); - rcestate->es_param_exec_vals = (ParamExecData *) - palloc0(i * sizeof(ParamExecData)); + rcestate->es_param_exec_vals = palloc0_array(ParamExecData, i); /* ... and copy down all values, whether really needed or not */ while (--i >= 0) { @@ -3066,6 +3092,18 @@ EvalPlanQualStart(EPQState *epqstate, Plan *planTree) */ rcestate->es_unpruned_relids = parentestate->es_unpruned_relids; + /* + * Also make the PartitionPruneInfo and the results of pruning available. + * These need to match exactly so that we initialize all the same Append + * and MergeAppend subplans as the parent did. + */ + rcestate->es_part_prune_infos = parentestate->es_part_prune_infos; + rcestate->es_part_prune_states = parentestate->es_part_prune_states; + rcestate->es_part_prune_results = parentestate->es_part_prune_results; + + /* We'll also borrow the es_partition_directory from the parent state */ + rcestate->es_partition_directory = parentestate->es_partition_directory; + /* * Initialize private state information for each SubPlan. We must do this * before running ExecInitNode on the main query tree, since @@ -3090,8 +3128,7 @@ EvalPlanQualStart(EPQState *epqstate, Plan *planTree) * EvalPlanQualFetchRowMark() can efficiently access the to be fetched * rowmark. 
*/ - epqstate->relsubs_rowmark = (ExecAuxRowMark **) - palloc0(rtsize * sizeof(ExecAuxRowMark *)); + epqstate->relsubs_rowmark = palloc0_array(ExecAuxRowMark *, rtsize); foreach(l, epqstate->arowMarks) { ExecAuxRowMark *earm = (ExecAuxRowMark *) lfirst(l); @@ -3183,6 +3220,13 @@ EvalPlanQualEnd(EPQState *epqstate) MemoryContextSwitchTo(oldcontext); + /* + * NULLify the partition directory before freeing the executor state. + * Since EvalPlanQualStart() just borrowed the parent EState's directory, + * we'd better leave it up to the parent to delete it. + */ + estate->es_partition_directory = NULL; + FreeExecutorState(estate); /* Mark EPQState idle */ diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index f3e77bda27906..40d8fa44c19cf 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -40,10 +40,12 @@ #include "executor/nodeSeqscan.h" #include "executor/nodeSort.h" #include "executor/nodeSubplan.h" +#include "executor/nodeTidrangescan.h" #include "executor/tqueue.h" #include "jit/jit.h" #include "nodes/nodeFuncs.h" #include "pgstat.h" +#include "storage/bufmgr.h" #include "tcop/tcopprot.h" #include "utils/datum.h" #include "utils/dsa.h" @@ -77,6 +79,7 @@ typedef struct FixedParallelExecutorState dsa_pointer param_exec; int eflags; int jit_flags; + int dirtied_localbufs; /* Just for debugging purposes */ } FixedParallelExecutorState; /* @@ -189,6 +192,7 @@ ExecSerializePlan(Plan *plan, EState *estate) pstmt->permInfos = estate->es_rteperminfos; pstmt->resultRelations = NIL; pstmt->appendRelations = NIL; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* * Transfer only parallel-safe subplans, leaving a NULL "hole" in the list @@ -265,6 +269,11 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) ExecForeignScanEstimate((ForeignScanState *) planstate, e->pcxt); break; + case T_TidRangeScanState: + if (planstate->plan->parallel_aware) + ExecTidRangeScanEstimate((TidRangeScanState *) planstate, + e->pcxt); + break; case T_AppendState: if (planstate->plan->parallel_aware) ExecAppendEstimate((AppendState *) planstate, @@ -492,6 +501,11 @@ ExecParallelInitializeDSM(PlanState *planstate, ExecForeignScanInitializeDSM((ForeignScanState *) planstate, d->pcxt); break; + case T_TidRangeScanState: + if (planstate->plan->parallel_aware) + ExecTidRangeScanInitializeDSM((TidRangeScanState *) planstate, + d->pcxt); + break; case T_AppendState: if (planstate->plan->parallel_aware) ExecAppendInitializeDSM((AppendState *) planstate, @@ -635,7 +649,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, ExecSetParamPlanMulti(sendParams, GetPerTupleExprContext(estate)); /* Allocate object for return value. 
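+    * (palloc0_object(ParallelExecutorInfo) is merely notation for
+    * (ParallelExecutorInfo *) palloc0(sizeof(ParallelExecutorInfo)); this
+    * and the similar palloc_object/palloc_array conversions in this patch
+    * do not change behavior.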
*/ - pei = palloc0(sizeof(ParallelExecutorInfo)); + pei = palloc0_object(ParallelExecutorInfo); pei->finished = false; pei->planstate = planstate; @@ -756,6 +770,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, fpes->param_exec = InvalidDsaPointer; fpes->eflags = estate->es_top_eflags; fpes->jit_flags = estate->es_jit_flags; + fpes->dirtied_localbufs = dirtied_localbufs; shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXECUTOR_FIXED, fpes); /* Store query string */ @@ -993,6 +1008,11 @@ ExecParallelReInitializeDSM(PlanState *planstate, ExecForeignScanReInitializeDSM((ForeignScanState *) planstate, pcxt); break; + case T_TidRangeScanState: + if (planstate->plan->parallel_aware) + ExecTidRangeScanReInitializeDSM((TidRangeScanState *) planstate, + pcxt); + break; case T_AppendState: if (planstate->plan->parallel_aware) ExecAppendReInitializeDSM((AppendState *) planstate, pcxt); @@ -1361,6 +1381,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) ExecForeignScanInitializeWorker((ForeignScanState *) planstate, pwcxt); break; + case T_TidRangeScanState: + if (planstate->plan->parallel_aware) + ExecTidRangeScanInitializeWorker((TidRangeScanState *) planstate, + pwcxt); + break; case T_AppendState: if (planstate->plan->parallel_aware) ExecAppendInitializeWorker((AppendState *) planstate, pwcxt); @@ -1442,6 +1467,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Get fixed-size state. */ fpes = shm_toc_lookup(toc, PARALLEL_KEY_EXECUTOR_FIXED, false); + dirtied_localbufs = fpes->dirtied_localbufs; /* Set up DestReceiver, SharedExecutorInstrumentation, and QueryDesc. */ receiver = ExecParallelGetReceiver(seg, toc); diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 514eae1037dc3..e30db12113b44 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -15,6 +15,7 @@ #include "access/table.h" #include "access/tableam.h" +#include "catalog/index.h" #include "catalog/partition.h" #include "executor/execPartition.h" #include "executor/executor.h" @@ -173,11 +174,11 @@ static void FormPartitionKeyDatum(PartitionDispatch pd, EState *estate, Datum *values, bool *isnull); -static int get_partition_for_tuple(PartitionDispatch pd, Datum *values, - bool *isnull); +static int get_partition_for_tuple(PartitionDispatch pd, const Datum *values, + const bool *isnull); static char *ExecBuildSlotPartitionKeyDescription(Relation rel, - Datum *values, - bool *isnull, + const Datum *values, + const bool *isnull, int maxfieldlen); static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri); static List *adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap); @@ -226,7 +227,7 @@ ExecSetupPartitionTupleRouting(EState *estate, Relation rel) * The reason for this is that a common case is for INSERT to insert a * single tuple into a partitioned table and this must be fast. */ - proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); + proute = palloc0_object(PartitionTupleRouting); proute->partition_root = rel; proute->memcxt = CurrentMemoryContext; /* Rest of members initialized by zeroing */ @@ -360,8 +361,12 @@ ExecFindPartition(ModifyTableState *mtstate, true, false); if (rri) { + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + /* Verify this ResultRelInfo allows INSERTs */ - CheckValidResultRel(rri, CMD_INSERT, NIL); + CheckValidResultRel(rri, CMD_INSERT, + node ? 
node->onConflictAction : ONCONFLICT_NONE, + NIL); /* * Initialize information needed to insert this and @@ -486,6 +491,65 @@ ExecFindPartition(ModifyTableState *mtstate, return rri; } +/* + * IsIndexCompatibleAsArbiter + * Return true if two indexes are identical for INSERT ON CONFLICT + * purposes. + * + * Only indexes of the same relation are supported. + */ +static bool +IsIndexCompatibleAsArbiter(Relation arbiterIndexRelation, + IndexInfo *arbiterIndexInfo, + Relation indexRelation, + IndexInfo *indexInfo) +{ + Assert(arbiterIndexRelation->rd_index->indrelid == indexRelation->rd_index->indrelid); + + /* must match whether they're unique */ + if (arbiterIndexInfo->ii_Unique != indexInfo->ii_Unique) + return false; + + /* No support currently for comparing exclusion indexes. */ + if (arbiterIndexInfo->ii_ExclusionOps != NULL || + indexInfo->ii_ExclusionOps != NULL) + return false; + + /* the "nulls not distinct" criterion must match */ + if (arbiterIndexInfo->ii_NullsNotDistinct != + indexInfo->ii_NullsNotDistinct) + return false; + + /* number of key attributes must match */ + if (arbiterIndexInfo->ii_NumIndexKeyAttrs != + indexInfo->ii_NumIndexKeyAttrs) + return false; + + for (int i = 0; i < arbiterIndexInfo->ii_NumIndexKeyAttrs; i++) + { + if (arbiterIndexRelation->rd_indcollation[i] != + indexRelation->rd_indcollation[i]) + return false; + + if (arbiterIndexRelation->rd_opfamily[i] != + indexRelation->rd_opfamily[i]) + return false; + + if (arbiterIndexRelation->rd_index->indkey.values[i] != + indexRelation->rd_index->indkey.values[i]) + return false; + } + + if (list_difference(RelationGetIndexExpressions(arbiterIndexRelation), + RelationGetIndexExpressions(indexRelation)) != NIL) + return false; + + if (list_difference(RelationGetIndexPredicate(arbiterIndexRelation), + RelationGetIndexPredicate(indexRelation)) != NIL) + return false; + return true; +} + /* * ExecInitPartitionInfo * Lock the partition and initialize ResultRelInfo. Also setup other @@ -527,7 +591,8 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, * partition-key becomes a DELETE+INSERT operation, so this check is still * required when the operation is CMD_UPDATE. */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT, NIL); + CheckValidResultRel(leaf_part_rri, CMD_INSERT, + node ? node->onConflictAction : ONCONFLICT_NONE, NIL); /* * Open partition indices. The user may have asked to check for conflicts @@ -684,44 +749,117 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, { TupleDesc partrelDesc = RelationGetDescr(partrel); ExprContext *econtext = mtstate->ps.ps_ExprContext; - ListCell *lc; List *arbiterIndexes = NIL; + int additional_arbiters = 0; /* * If there is a list of arbiter indexes, map it to a list of indexes - * in the partition. We do that by scanning the partition's index - * list and searching for ancestry relationships to each index in the - * ancestor table. + * in the partition. We also add any "identical indexes" to any of + * those, to cover the case where one of them is concurrently being + * reindexed. 
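+    *
+    * For example, while REINDEX CONCURRENTLY is rebuilding an arbiter's
+    * child index, the partition transiently carries both the original
+    * index and its "_ccnew" twin; treating both as arbiters keeps all
+    * concurrent inserters agreeing on one set of conflict-detection
+    * indexes.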
 */
        if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL)
        {
-           List       *childIdxs;
-
-           childIdxs = RelationGetIndexList(leaf_part_rri->ri_RelationDesc);
+           List       *unparented_idxs = NIL,
+                      *arbiters_listidxs = NIL;

-           foreach(lc, childIdxs)
+           for (int listidx = 0; listidx < leaf_part_rri->ri_NumIndices; listidx++)
            {
-               Oid         childIdx = lfirst_oid(lc);
+               Oid         indexoid;
                List       *ancestors;
-               ListCell   *lc2;

-               ancestors = get_partition_ancestors(childIdx);
-               foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes)
+               /*
+                * If one of this index's ancestors is in the root's arbiter
+                * list, then use this index as arbiter for this partition.
+                * Otherwise, if this index has no parent, track it for later,
+                * in case REINDEX CONCURRENTLY is working on one of the
+                * arbiters.
+                *
+                * XXX get_partition_ancestors is slow: it scans pg_inherits
+                * each time. Consider a syscache or some other way to cache?
+                */
+               indexoid = RelationGetRelid(leaf_part_rri->ri_IndexRelationDescs[listidx]);
+               ancestors = get_partition_ancestors(indexoid);
+               if (ancestors != NIL)
                {
-                   if (list_member_oid(ancestors, lfirst_oid(lc2)))
-                       arbiterIndexes = lappend_oid(arbiterIndexes, childIdx);
+                   foreach_oid(parent_idx, rootResultRelInfo->ri_onConflictArbiterIndexes)
+                   {
+                       if (list_member_oid(ancestors, parent_idx))
+                       {
+                           arbiterIndexes = lappend_oid(arbiterIndexes, indexoid);
+                           arbiters_listidxs = lappend_int(arbiters_listidxs, listidx);
+                           break;
+                       }
+                   }
                }
+               else
+                   unparented_idxs = lappend_int(unparented_idxs, listidx);
                list_free(ancestors);
            }
+
+           /*
+            * If we found any indexes with no ancestors, it's possible that
+            * some arbiter index is undergoing concurrent reindex. Match each
+            * unparented index against the arbiters; add matching ones as
+            * "additional arbiters".
+            *
+            * This is critical so that all concurrent transactions use the
+            * same set as arbiters during REINDEX CONCURRENTLY, to avoid
+            * spurious "duplicate key" errors.
+            */
+           if (unparented_idxs && arbiterIndexes)
+           {
+               foreach_int(unparented_i, unparented_idxs)
+               {
+                   Relation    unparented_rel;
+                   IndexInfo  *unparented_ii;
+
+                   unparented_rel = leaf_part_rri->ri_IndexRelationDescs[unparented_i];
+                   unparented_ii = leaf_part_rri->ri_IndexRelationInfo[unparented_i];
+
+                   Assert(!list_member_oid(arbiterIndexes,
+                                           unparented_rel->rd_index->indexrelid));
+
+                   /* Ignore indexes not ready */
+                   if (!unparented_ii->ii_ReadyForInserts)
+                       continue;
+
+                   foreach_int(arbiter_i, arbiters_listidxs)
+                   {
+                       Relation    arbiter_rel;
+                       IndexInfo  *arbiter_ii;
+
+                       arbiter_rel = leaf_part_rri->ri_IndexRelationDescs[arbiter_i];
+                       arbiter_ii = leaf_part_rri->ri_IndexRelationInfo[arbiter_i];
+
+                       /*
+                        * If the non-ancestor index is compatible with the
+                        * arbiter, use the non-ancestor as arbiter too.
+                        */
+                       if (IsIndexCompatibleAsArbiter(arbiter_rel,
+                                                      arbiter_ii,
+                                                      unparented_rel,
+                                                      unparented_ii))
+                       {
+                           arbiterIndexes = lappend_oid(arbiterIndexes,
+                                                        unparented_rel->rd_index->indexrelid);
+                           additional_arbiters++;
+                           break;
+                       }
+                   }
+               }
+           }
+           list_free(unparented_idxs);
+           list_free(arbiters_listidxs);
        }

        /*
-        * If the resulting lists are of inequal length, something is wrong.
-        * (This shouldn't happen, since arbiter index selection should not
-        * pick up an invalid index.)
+        * We expect to find as many arbiter indexes on this partition as the
+        * root has, plus however many "additional arbiters" (to wit: those
+        * being concurrently rebuilt) we found.
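+        *
+        * For instance, with one arbiter index on the root and one compatible
+        * rebuilt twin found on this partition, arbiterIndexes has two
+        * members and additional_arbiters is 1, so the check below passes.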
*/ if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) != - list_length(arbiterIndexes)) + list_length(arbiterIndexes) - additional_arbiters) elog(ERROR, "invalid arbiter index list"); leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes; @@ -850,7 +988,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, &found_whole_row); /* We ignore the value of found_whole_row. */ onconfl->oc_WhereClause = - ExecInitQual((List *) clause, &mtstate->ps); + ExecInitQual(clause, &mtstate->ps); } } } @@ -1060,10 +1198,8 @@ ExecInitRoutingInfo(ModifyTableState *mtstate, if (proute->max_partitions == 0) { proute->max_partitions = 8; - proute->partitions = (ResultRelInfo **) - palloc(sizeof(ResultRelInfo *) * proute->max_partitions); - proute->is_borrowed_rel = (bool *) - palloc(sizeof(bool) * proute->max_partitions); + proute->partitions = palloc_array(ResultRelInfo *, proute->max_partitions); + proute->is_borrowed_rel = palloc_array(bool, proute->max_partitions); } else { @@ -1178,10 +1314,8 @@ ExecInitPartitionDispatchInfo(EState *estate, if (proute->max_dispatch == 0) { proute->max_dispatch = 4; - proute->partition_dispatch_info = (PartitionDispatch *) - palloc(sizeof(PartitionDispatch) * proute->max_dispatch); - proute->nonleaf_partitions = (ResultRelInfo **) - palloc(sizeof(ResultRelInfo *) * proute->max_dispatch); + proute->partition_dispatch_info = palloc_array(PartitionDispatch, proute->max_dispatch); + proute->nonleaf_partitions = palloc_array(ResultRelInfo *, proute->max_dispatch); } else { @@ -1391,7 +1525,7 @@ FormPartitionKeyDatum(PartitionDispatch pd, * found or -1 if none found. */ static int -get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) +get_partition_for_tuple(PartitionDispatch pd, const Datum *values, const bool *isnull) { int bound_offset = -1; int part_index = -1; @@ -1612,8 +1746,8 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) */ static char * ExecBuildSlotPartitionKeyDescription(Relation rel, - Datum *values, - bool *isnull, + const Datum *values, + const bool *isnull, int maxfieldlen) { StringInfoData buf; @@ -2073,7 +2207,7 @@ CreatePartitionPruneState(EState *estate, PartitionPruneInfo *pruneinfo, * arrays are in partition bounds order. */ pprune->nparts = partdesc->nparts; - pprune->subplan_map = palloc(sizeof(int) * partdesc->nparts); + pprune->subplan_map = palloc_array(int, partdesc->nparts); if (partdesc->nparts == pinfo->nparts && memcmp(partdesc->oids, pinfo->relid_map, @@ -2100,8 +2234,8 @@ CreatePartitionPruneState(EState *estate, PartitionPruneInfo *pruneinfo, * attached. Cope with that by creating a map that skips any * mismatches. 
*/ - pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts); - pprune->leafpart_rti_map = palloc(sizeof(int) * partdesc->nparts); + pprune->subpart_map = palloc_array(int, partdesc->nparts); + pprune->leafpart_rti_map = palloc_array(int, partdesc->nparts); for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++) { @@ -2252,16 +2386,14 @@ InitPartitionPruneContext(PartitionPruneContext *context, context->partsupfunc = partkey->partsupfunc; /* We'll look up type-specific support functions as needed */ - context->stepcmpfuncs = (FmgrInfo *) - palloc0(sizeof(FmgrInfo) * n_steps * partnatts); + context->stepcmpfuncs = palloc0_array(FmgrInfo, n_steps * partnatts); context->ppccontext = CurrentMemoryContext; context->planstate = planstate; context->exprcontext = econtext; /* Initialize expression state for each expression we need */ - context->exprstates = (ExprState **) - palloc0(sizeof(ExprState *) * n_steps * partnatts); + context->exprstates = palloc0_array(ExprState *, n_steps * partnatts); foreach(lc, pruning_steps) { PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc); @@ -2362,7 +2494,7 @@ InitExecPartitionPruneContexts(PartitionPruneState *prunestate, * indexes to new ones. For convenience of initialization, we use * 1-based indexes in this array and leave pruned items as 0. */ - new_subplan_indexes = (int *) palloc0(sizeof(int) * n_total_subplans); + new_subplan_indexes = palloc0_array(int, n_total_subplans); newidx = 1; i = -1; while ((i = bms_next_member(initially_valid_subplans, i)) >= 0) diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 53ddd25c42db9..860f79f9cc1a0 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -14,12 +14,14 @@ #include "postgres.h" +#include "access/commit_ts.h" #include "access/genam.h" #include "access/gist.h" #include "access/relscan.h" #include "access/tableam.h" #include "access/transam.h" #include "access/xact.h" +#include "access/heapam.h" #include "catalog/pg_am_d.h" #include "commands/trigger.h" #include "executor/executor.h" @@ -36,7 +38,7 @@ static bool tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, - TypeCacheEntry **eq); + TypeCacheEntry **eq, Bitmapset *columns); /* * Setup a ScanKey for a search in the relation 'rel' for a tuple 'key' that @@ -219,9 +221,9 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, if (!isIdxSafeToSkipDuplicates) { if (eq == NULL) - eq = palloc0(sizeof(*eq) * outslot->tts_tupleDescriptor->natts); + eq = palloc0_array(TypeCacheEntry *, outslot->tts_tupleDescriptor->natts); - if (!tuples_equal(outslot, searchslot, eq)) + if (!tuples_equal(outslot, searchslot, eq, NULL)) continue; } @@ -277,10 +279,13 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, /* * Compare the tuples in the slots by checking if they have equal values. + * + * If 'columns' is not null, only the columns specified within it will be + * considered for the equality check, ignoring all other columns. */ static bool tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, - TypeCacheEntry **eq) + TypeCacheEntry **eq, Bitmapset *columns) { int attrnum; @@ -305,6 +310,14 @@ tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, if (att->attisdropped || att->attgenerated) continue; + /* + * Ignore columns that are not listed for checking. 
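+    * (System columns have negative attribute numbers, so attnum is
+    * offset by FirstLowInvalidHeapAttributeNumber before probing the
+    * bitmapset, per the usual convention for attribute bitmapsets.)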
+    */
+   if (columns &&
+       !bms_is_member(att->attnum - FirstLowInvalidHeapAttributeNumber,
+                      columns))
+       continue;
+
    /*
     * If one value is NULL and other is not, then they are certainly not
     * equal
@@ -365,7 +378,7 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode,

    Assert(equalTupleDescs(desc, outslot->tts_tupleDescriptor));

-   eq = palloc0(sizeof(*eq) * outslot->tts_tupleDescriptor->natts);
+   eq = palloc0_array(TypeCacheEntry *, outslot->tts_tupleDescriptor->natts);

    /* Start a heap scan. */
    InitDirtySnapshot(snap);
@@ -380,7 +393,7 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode,
    /* Try to find the tuple */
    while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot))
    {
-       if (!tuples_equal(scanslot, searchslot, eq))
+       if (!tuples_equal(scanslot, searchslot, eq, NULL))
            continue;

        found = true;
@@ -455,6 +468,236 @@ BuildConflictIndexInfo(ResultRelInfo *resultRelInfo, Oid conflictindex)
    }
}

+/*
+ * If the tuple is recently dead and was deleted by a transaction with a newer
+ * commit timestamp than previously recorded, update the associated transaction
+ * ID, commit time, and origin. This helps ensure that conflict detection uses
+ * the most recent and relevant deletion metadata.
+ */
+static void
+update_most_recent_deletion_info(TupleTableSlot *scanslot,
+                                 TransactionId oldestxmin,
+                                 TransactionId *delete_xid,
+                                 TimestampTz *delete_time,
+                                 RepOriginId *delete_origin)
+{
+   BufferHeapTupleTableSlot *hslot;
+   HeapTuple   tuple;
+   Buffer      buf;
+   bool        recently_dead = false;
+   TransactionId xmax;
+   TimestampTz localts;
+   RepOriginId localorigin;
+
+   hslot = (BufferHeapTupleTableSlot *) scanslot;
+
+   tuple = ExecFetchSlotHeapTuple(scanslot, false, NULL);
+   buf = hslot->buffer;
+
+   LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+   /*
+    * We do not consider HEAPTUPLE_DEAD status because it indicates either
+    * tuples whose inserting transaction was aborted (meaning there is no
+    * commit timestamp or origin), or tuples deleted by a transaction older
+    * than oldestxmin, making it safe to ignore them during conflict
+    * detection (see comments atop worker.c for details).
+    */
+   if (HeapTupleSatisfiesVacuum(tuple, oldestxmin, buf) == HEAPTUPLE_RECENTLY_DEAD)
+       recently_dead = true;
+
+   LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+   if (!recently_dead)
+       return;
+
+   xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
+   if (!TransactionIdIsValid(xmax))
+       return;
+
+   /* Select the dead tuple with the most recent commit timestamp */
+   if (TransactionIdGetCommitTsData(xmax, &localts, &localorigin) &&
+       TimestampDifferenceExceeds(*delete_time, localts, 0))
+   {
+       *delete_xid = xmax;
+       *delete_time = localts;
+       *delete_origin = localorigin;
+   }
+}
+
+/*
+ * Searches the relation 'rel' for the most recently deleted tuple that matches
+ * the values in 'searchslot' and is not yet removable by VACUUM. The function
+ * returns the transaction ID, origin, and commit timestamp of the transaction
+ * that deleted this tuple.
+ *
+ * 'oldestxmin' acts as a cutoff transaction ID. Tuples deleted by transactions
+ * with IDs >= 'oldestxmin' are considered recently dead and are eligible for
+ * conflict detection.
+ *
+ * Instead of stopping at the first match, we scan all matching dead tuples to
+ * identify the most recent deletion. This is crucial because only the latest
+ * deletion is relevant for resolving conflicts.
+ *
+ * For example, consider a scenario on the subscriber where a row is deleted,
+ * re-inserted, and then deleted again only on the subscriber:
+ *
+ * - (pk, 1) - deleted at 9:00,
+ * - (pk, 1) - deleted at 9:02,
+ *
+ * Now, a remote update arrives: (pk, 1) -> (pk, 2), timestamped at 9:01.
+ *
+ * If we mistakenly return the older deletion (9:00), the system may wrongly
+ * apply the remote update using a last-update-wins strategy. Instead, we must
+ * recognize the more recent deletion at 9:02 and skip the update. See
+ * comments atop worker.c for details. Note, as of now, conflict resolution
+ * is not implemented. Consequently, the system may incorrectly report the
+ * older tuple as the conflicted one, leading to misleading results.
+ *
+ * The commit timestamp of the deleting transaction is used to determine which
+ * tuple was deleted most recently.
+ */
+bool
+RelationFindDeletedTupleInfoSeq(Relation rel, TupleTableSlot *searchslot,
+                                TransactionId oldestxmin,
+                                TransactionId *delete_xid,
+                                RepOriginId *delete_origin,
+                                TimestampTz *delete_time)
+{
+   TupleTableSlot *scanslot;
+   TableScanDesc scan;
+   TypeCacheEntry **eq;
+   Bitmapset  *indexbitmap;
+   TupleDesc   desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel);
+
+   Assert(equalTupleDescs(desc, searchslot->tts_tupleDescriptor));
+
+   *delete_xid = InvalidTransactionId;
+   *delete_origin = InvalidRepOriginId;
+   *delete_time = 0;
+
+   /*
+    * If the relation has a replica identity key or a primary key that is
+    * unusable for locating deleted tuples (see
+    * IsIndexUsableForFindingDeletedTuple), a full table scan becomes
+    * necessary. In such cases, comparing the entire tuple is not required,
+    * since the remote tuple might not include all column values. Instead,
+    * the indexed columns alone are sufficient to identify the target tuple
+    * (see logicalrep_rel_mark_updatable).
+    */
+   indexbitmap = RelationGetIndexAttrBitmap(rel,
+                                            INDEX_ATTR_BITMAP_IDENTITY_KEY);
+
+   /* fallback to PK if no replica identity */
+   if (!indexbitmap)
+       indexbitmap = RelationGetIndexAttrBitmap(rel,
+                                                INDEX_ATTR_BITMAP_PRIMARY_KEY);
+
+   eq = palloc0_array(TypeCacheEntry *, searchslot->tts_tupleDescriptor->natts);
+
+   /*
+    * Start a heap scan using SnapshotAny to identify dead tuples that are
+    * not visible under a standard MVCC snapshot. Tuples from transactions
+    * not yet committed or those just committed prior to the scan are
+    * excluded in update_most_recent_deletion_info().
+    */
+   scan = table_beginscan(rel, SnapshotAny, 0, NULL);
+   scanslot = table_slot_create(rel, NULL);
+
+   table_rescan(scan, NULL);
+
+   /* Try to find the tuple */
+   while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot))
+   {
+       if (!tuples_equal(scanslot, searchslot, eq, indexbitmap))
+           continue;
+
+       update_most_recent_deletion_info(scanslot, oldestxmin, delete_xid,
+                                        delete_time, delete_origin);
+   }
+
+   table_endscan(scan);
+   ExecDropSingleTupleTableSlot(scanslot);
+
+   return *delete_time != 0;
+}
+
+/*
+ * Similar to RelationFindDeletedTupleInfoSeq(), but uses an index scan to
+ * locate the deleted tuple.
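+ *
+ * Both functions share the same contract: they return true iff a matching
+ * recently-dead tuple was found (that is, iff *delete_time ends up
+ * nonzero), with *delete_xid, *delete_origin and *delete_time describing
+ * the most recent deletion.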
+ */ +bool +RelationFindDeletedTupleInfoByIndex(Relation rel, Oid idxoid, + TupleTableSlot *searchslot, + TransactionId oldestxmin, + TransactionId *delete_xid, + RepOriginId *delete_origin, + TimestampTz *delete_time) +{ + Relation idxrel; + ScanKeyData skey[INDEX_MAX_KEYS]; + int skey_attoff; + IndexScanDesc scan; + TupleTableSlot *scanslot; + TypeCacheEntry **eq = NULL; + bool isIdxSafeToSkipDuplicates; + TupleDesc desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel); + + Assert(equalTupleDescs(desc, searchslot->tts_tupleDescriptor)); + Assert(OidIsValid(idxoid)); + + *delete_xid = InvalidTransactionId; + *delete_time = 0; + *delete_origin = InvalidRepOriginId; + + isIdxSafeToSkipDuplicates = (GetRelationIdentityOrPK(rel) == idxoid); + + scanslot = table_slot_create(rel, NULL); + + idxrel = index_open(idxoid, RowExclusiveLock); + + /* Build scan key. */ + skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); + + /* + * Start an index scan using SnapshotAny to identify dead tuples that are + * not visible under a standard MVCC snapshot. Tuples from transactions + * not yet committed or those just committed prior to the scan are + * excluded in update_most_recent_deletion_info(). + */ + scan = index_beginscan(rel, idxrel, SnapshotAny, NULL, skey_attoff, 0); + + index_rescan(scan, skey, skey_attoff, NULL, 0); + + /* Try to find the tuple */ + while (index_getnext_slot(scan, ForwardScanDirection, scanslot)) + { + /* + * Avoid expensive equality check if the index is primary key or + * replica identity index. + */ + if (!isIdxSafeToSkipDuplicates) + { + if (eq == NULL) + eq = palloc0_array(TypeCacheEntry *, scanslot->tts_tupleDescriptor->natts); + + if (!tuples_equal(scanslot, searchslot, eq, NULL)) + continue; + } + + update_most_recent_deletion_info(scanslot, oldestxmin, delete_xid, + delete_time, delete_origin); + } + + index_endscan(scan); + + index_close(idxrel, NoLock); + + ExecDropSingleTupleTableSlot(scanslot); + + return *delete_time != 0; +} + /* * Find the tuple that violates the passed unique index (conflictindex). * @@ -609,10 +852,10 @@ ExecSimpleRelationInsert(ResultRelInfo *resultRelInfo, conflictindexes, false); /* - * Checks the conflict indexes to fetch the conflicting local tuple - * and reports the conflict. We perform this check here, instead of + * Checks the conflict indexes to fetch the conflicting local row and + * reports the conflict. We perform this check here, instead of * performing an additional index scan before the actual insertion and - * reporting the conflict if any conflicting tuples are found. This is + * reporting the conflict if any conflicting rows are found. 
This is
     * to avoid the overhead of executing the extra scan for each INSERT
     * operation, even when no conflict arises, which could introduce
     * significant overhead to replication, particularly in cases where
@@ -670,7 +913,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo,
        resultRelInfo->ri_TrigDesc->trig_update_before_row)
    {
        if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo,
-                                 tid, NULL, slot, NULL, NULL))
+                                 tid, NULL, slot, NULL, NULL, false))
            skip_tuple = true;  /* "do nothing" */
    }
@@ -746,7 +989,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo,
        resultRelInfo->ri_TrigDesc->trig_delete_before_row)
    {
        skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
-                                          tid, NULL, NULL, NULL, NULL);
+                                          tid, NULL, NULL, NULL, NULL, false);
    }

    if (!skip_tuple)
@@ -869,18 +1112,36 @@ CheckCmdReplicaIdentity(Relation rel, CmdType cmd)

 /*
- * Check if we support writing into specific relkind.
+ * Check whether we support writing into the given relkind of the local
+ * relation, and whether it matches the relkind of the corresponding relation
+ * on the publisher.
 *
 * The nspname and relname are only needed for error reporting.
 */
void
-CheckSubscriptionRelkind(char relkind, const char *nspname,
-                         const char *relname)
+CheckSubscriptionRelkind(char localrelkind, char remoterelkind,
+                         const char *nspname, const char *relname)
{
-   if (relkind != RELKIND_RELATION && relkind != RELKIND_PARTITIONED_TABLE)
+   if (localrelkind != RELKIND_RELATION &&
+       localrelkind != RELKIND_PARTITIONED_TABLE &&
+       localrelkind != RELKIND_SEQUENCE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("cannot use relation \"%s.%s\" as logical replication target",
                        nspname, relname),
-                errdetail_relkind_not_supported(relkind)));
+                errdetail_relkind_not_supported(localrelkind)));
+
+   /*
+    * Allow RELKIND_RELATION and RELKIND_PARTITIONED_TABLE to be treated
+    * interchangeably, but ensure that sequences (RELKIND_SEQUENCE) match
+    * exactly on both publisher and subscriber.
+    */
+   if ((localrelkind == RELKIND_SEQUENCE && remoterelkind != RELKIND_SEQUENCE) ||
+       (localrelkind != RELKIND_SEQUENCE && remoterelkind == RELKIND_SEQUENCE))
+       ereport(ERROR,
+               errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+       /* translator: 3rd and 4th %s are "sequence" or "table" */
+               errmsg("relation \"%s.%s\" type mismatch: source \"%s\", target \"%s\"",
+                      nspname, relname,
+                      remoterelkind == RELKIND_SEQUENCE ? "sequence" : "table",
+                      localrelkind == RELKIND_SEQUENCE ? "sequence" : "table"));
}
diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c
index 90726949a8708..31ed4783c1d3c 100644
--- a/src/backend/executor/execScan.c
+++ b/src/backend/executor/execScan.c
@@ -134,7 +134,7 @@ ExecScanReScan(ScanState *node)

        /*
         * If an FDW or custom scan provider has replaced the join with a
-        * scan, there are multiple RTIs; reset the epqScanDone flag for
+        * scan, there are multiple RTIs; reset the relsubs_done flag for
         * all of them.
 */
        if (IsA(node->ps.plan, ForeignScan))
diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c
index 8e02d68824fad..b0dc2cfa66f9f 100644
--- a/src/backend/executor/execTuples.c
+++ b/src/backend/executor/execTuples.c
@@ -2283,7 +2283,7 @@ TupleDescGetAttInMetadata(TupleDesc tupdesc)
    int32      *atttypmods;
    AttInMetadata *attinmeta;

-   attinmeta = (AttInMetadata *) palloc(sizeof(AttInMetadata));
+   attinmeta = palloc_object(AttInMetadata);

    /* "Bless" the tupledesc so that we can make rowtype datums with it */
    attinmeta->tupdesc = BlessTupleDesc(tupdesc);
@@ -2447,7 +2447,7 @@ begin_tup_output_tupdesc(DestReceiver *dest,
{
    TupOutputState *tstate;

-   tstate = (TupOutputState *) palloc(sizeof(TupOutputState));
+   tstate = palloc_object(TupOutputState);

    tstate->slot = MakeSingleTupleTableSlot(tupdesc, tts_ops);
    tstate->dest = dest;
diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c
index fdc65c2b42b33..09acdb18652d9 100644
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@@ -161,6 +161,7 @@ CreateExecutorState(void)
    estate->es_use_parallel_mode = false;
    estate->es_parallel_workers_to_launch = 0;
    estate->es_parallel_workers_launched = 0;
+   estate->es_tempbufs_flushed = false;    /* have this backend's temp buffers been flushed? */

    estate->es_jit_flags = 0;
    estate->es_jit = NULL;
diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c
index 359aafea681b9..d9cc723daa0d1 100644
--- a/src/backend/executor/functions.c
+++ b/src/backend/executor/functions.c
@@ -143,6 +143,7 @@ typedef struct SQLFunctionCache
{
    SQLFunctionHashEntry *func; /* associated SQLFunctionHashEntry */

+   bool        active;         /* are we executing this cache entry? */
    bool        lazyEvalOK;     /* true if lazyEval is safe */
    bool        shutdown_reg;   /* true if registered shutdown callback */
    bool        lazyEval;       /* true if using lazyEval for result query */
@@ -255,7 +256,7 @@ prepare_sql_fn_parse_info(HeapTuple procedureTuple,
    Form_pg_proc procedureStruct = (Form_pg_proc) GETSTRUCT(procedureTuple);
    int         nargs;

-   pinfo = (SQLFunctionParseInfoPtr) palloc0(sizeof(SQLFunctionParseInfo));
+   pinfo = palloc0_object(SQLFunctionParseInfo);

    /* Function's name (only) can be used to qualify argument names */
    pinfo->fname = pstrdup(NameStr(procedureStruct->proname));
@@ -556,6 +557,28 @@ init_sql_fcache(FunctionCallInfo fcinfo, bool lazyEvalOK)
        finfo->fn_extra = fcache;
    }

+   /*
+    * If the SQLFunctionCache is marked as active, we must have errored out
+    * of a prior execution. Reset state. (It might seem that we could also
+    * reach this during recursive invocation of a SQL function, but we won't
+    * because that case won't involve re-use of the same FmgrInfo.)
+    */
+   if (fcache->active)
+   {
+       /*
+        * In general, this stanza should clear all the same fields that
+        * ShutdownSQLFunction would. Note we must clear fcache->cplan
+        * without doing ReleaseCachedPlan, because error cleanup from the
+        * prior execution would have taken care of releasing that plan.
+        * Likewise, if tstore is still set then it is pointing at garbage.
+        */
+       fcache->cplan = NULL;
+       fcache->eslist = NULL;
+       fcache->tstore = NULL;
+       fcache->shutdown_reg = false;
+       fcache->active = false;
+   }
+
    /*
     * If we are resuming execution of a set-returning function, just keep
     * using the same cache.
We do not ask funccache.c to re-validate the @@ -1597,6 +1620,9 @@ fmgr_sql(PG_FUNCTION_ARGS) */ fcache = init_sql_fcache(fcinfo, lazyEvalOK); + /* Mark fcache as active */ + fcache->active = true; + /* Remember info that we might need later to construct tuplestore */ fcache->tscontext = tscontext; fcache->randomAccess = randomAccess; @@ -1853,6 +1879,9 @@ fmgr_sql(PG_FUNCTION_ARGS) if (es == NULL) fcache->eslist = NULL; + /* Mark fcache as inactive */ + fcache->active = false; + error_context_stack = sqlerrcontext.previous; return result; @@ -2454,7 +2483,7 @@ check_sql_stmt_retval(List *queryTreeList, rte = makeNode(RangeTblEntry); rte->rtekind = RTE_SUBQUERY; rte->subquery = parse; - rte->eref = rte->alias = makeAlias("*SELECT*", colnames); + rte->eref = makeAlias("unnamed_subquery", colnames); rte->lateral = false; rte->inh = false; rte->inFromCl = true; @@ -2587,7 +2616,7 @@ get_sql_fn_result_tlist(List *queryTreeList) DestReceiver * CreateSQLFunctionDestReceiver(void) { - DR_sqlfunction *self = (DR_sqlfunction *) palloc0(sizeof(DR_sqlfunction)); + DR_sqlfunction *self = palloc0_object(DR_sqlfunction); self->pub.receiveSlot = sqlfunction_receive; self->pub.rStartup = sqlfunction_startup; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 56e635f47000d..9e11c662a7c1d 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -280,6 +280,7 @@ WalUsageAdd(WalUsage *dst, WalUsage *add) dst->wal_bytes += add->wal_bytes; dst->wal_records += add->wal_records; dst->wal_fpi += add->wal_fpi; + dst->wal_fpi_bytes += add->wal_fpi_bytes; dst->wal_buffers_full += add->wal_buffers_full; } @@ -289,5 +290,6 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) dst->wal_bytes += add->wal_bytes - sub->wal_bytes; dst->wal_records += add->wal_records - sub->wal_records; dst->wal_fpi += add->wal_fpi - sub->wal_fpi; + dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes; dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full; } diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 377e016d73225..a18556f62ecca 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -267,7 +267,6 @@ #include "utils/acl.h" #include "utils/builtins.h" #include "utils/datum.h" -#include "utils/dynahash.h" #include "utils/expandeddatum.h" #include "utils/injection_point.h" #include "utils/logtape.h" @@ -403,12 +402,12 @@ static void find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated); static bool find_cols_walker(Node *node, FindColsContext *context); static void build_hash_tables(AggState *aggstate); -static void build_hash_table(AggState *aggstate, int setno, long nbuckets); +static void build_hash_table(AggState *aggstate, int setno, double nbuckets); static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck); static void hash_create_memory(AggState *aggstate); -static long hash_choose_num_buckets(double hashentrysize, - long ngroups, Size memory); +static double hash_choose_num_buckets(double hashentrysize, + double ngroups, Size memory); static int hash_choose_num_partitions(double input_groups, double hashentrysize, int used_bits, @@ -1458,7 +1457,7 @@ find_cols_walker(Node *node, FindColsContext *context) * We have a separate hashtable and associated perhash data structure for each * grouping set for which we're doing hashing. 
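+ *
+ * For example, GROUP BY GROUPING SETS ((a), (b)) executed by hashing
+ * builds two hash tables, one keyed by a and one keyed by b.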
* - * The contents of the hash tables always live in the hashcontext's per-tuple + * The contents of the hash tables live in the aggstate's hash_tuplescxt * memory context (there is only one of these for all tables together, since * they are all reset at the same time). */ @@ -1470,7 +1469,7 @@ build_hash_tables(AggState *aggstate) for (setno = 0; setno < aggstate->num_hashes; ++setno) { AggStatePerHash perhash = &aggstate->perhash[setno]; - long nbuckets; + double nbuckets; Size memory; if (perhash->hashtable != NULL) @@ -1479,8 +1478,6 @@ build_hash_tables(AggState *aggstate) continue; } - Assert(perhash->aggnode->numGroups > 0); - memory = aggstate->hash_mem_limit / aggstate->num_hashes; /* choose reasonable number of buckets per hashtable */ @@ -1506,11 +1503,11 @@ build_hash_tables(AggState *aggstate) * Build a single hashtable for this grouping set. */ static void -build_hash_table(AggState *aggstate, int setno, long nbuckets) +build_hash_table(AggState *aggstate, int setno, double nbuckets) { AggStatePerHash perhash = &aggstate->perhash[setno]; MemoryContext metacxt = aggstate->hash_metacxt; - MemoryContext tablecxt = aggstate->hash_tablecxt; + MemoryContext tuplescxt = aggstate->hash_tuplescxt; MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory; Size additionalsize; @@ -1536,7 +1533,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets) nbuckets, additionalsize, metacxt, - tablecxt, + tuplescxt, tmpcxt, DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); } @@ -1869,7 +1866,7 @@ hash_agg_check_limits(AggState *aggstate) uint64 ngroups = aggstate->hash_ngroups_current; Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, + Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt, true); Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); @@ -1923,7 +1920,7 @@ hash_agg_enter_spill_mode(AggState *aggstate) aggstate->hash_tapeset = LogicalTapeSetCreate(true, NULL, -1); - aggstate->hash_spills = palloc(sizeof(HashAggSpill) * aggstate->num_hashes); + aggstate->hash_spills = palloc_array(HashAggSpill, aggstate->num_hashes); for (int setno = 0; setno < aggstate->num_hashes; setno++) { @@ -1960,7 +1957,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); /* memory for hash entries */ - entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true); + entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt, true); /* memory for byref transition states */ hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); @@ -2043,22 +2040,22 @@ hash_create_memory(AggState *aggstate) /* and no smaller than ALLOCSET_DEFAULT_INITSIZE */ maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE); - aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt, - "HashAgg table context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - maxBlockSize); + aggstate->hash_tuplescxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg hashed tuples", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + maxBlockSize); } /* * Choose a reasonable number of buckets for the initial hash table size. 
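+ * For example, with hashentrysize = 64 bytes and 1MB of memory, at most
+ * 16384 entries would fit; after halving (to leave room for group keys
+ * and transition values), ngroups estimates above 8192 get clamped to
+ * 8192.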
*/ -static long -hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory) +static double +hash_choose_num_buckets(double hashentrysize, double ngroups, Size memory) { - long max_nbuckets; - long nbuckets = ngroups; + double max_nbuckets; + double nbuckets = ngroups; max_nbuckets = memory / hashentrysize; @@ -2066,12 +2063,16 @@ hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory) * Underestimating is better than overestimating. Too many buckets crowd * out space for group keys and transition state values. */ - max_nbuckets >>= 1; + max_nbuckets /= 2; if (nbuckets > max_nbuckets) nbuckets = max_nbuckets; - return Max(nbuckets, 1); + /* + * BuildTupleHashTable will clamp any obviously-insane result, so we don't + * need to be too careful here. + */ + return nbuckets; } /* @@ -2115,7 +2116,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize, npartitions = (int) dpartitions; /* ceil(log2(npartitions)) */ - partition_bits = my_log2(npartitions); + partition_bits = pg_ceil_log2_32(npartitions); /* make sure that we don't exhaust the hash bits */ if (partition_bits + used_bits >= 32) @@ -2708,7 +2709,6 @@ agg_refill_hash_table(AggState *aggstate) /* free memory and reset hash tables */ ReScanExprContext(aggstate->hashcontext); - MemoryContextReset(aggstate->hash_tablecxt); for (int setno = 0; setno < aggstate->num_hashes; setno++) ResetTupleHashTable(aggstate->perhash[setno].hashtable); @@ -2912,7 +2912,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) perhash = &aggstate->perhash[aggstate->current_set]; - ResetTupleHashIterator(hashtable, &perhash->hashiter); + ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter); continue; } @@ -2999,9 +2999,9 @@ hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, int used_bits, } #endif - spill->partitions = palloc0(sizeof(LogicalTape *) * npartitions); - spill->ntuples = palloc0(sizeof(int64) * npartitions); - spill->hll_card = palloc0(sizeof(hyperLogLogState) * npartitions); + spill->partitions = palloc0_array(LogicalTape *, npartitions); + spill->ntuples = palloc0_array(int64, npartitions); + spill->hll_card = palloc0_array(hyperLogLogState, npartitions); for (int i = 0; i < npartitions; i++) spill->partitions[i] = LogicalTapeCreate(tapeset); @@ -3097,7 +3097,7 @@ static HashAggBatch * hashagg_batch_new(LogicalTape *input_tape, int setno, int64 input_tuples, double input_card, int used_bits) { - HashAggBatch *batch = palloc0(sizeof(HashAggBatch)); + HashAggBatch *batch = palloc0_object(HashAggBatch); batch->setno = setno; batch->used_bits = used_bits; @@ -3368,8 +3368,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) aggstate->maxsets = numGroupingSets; aggstate->numphases = numPhases; - aggstate->aggcontexts = (ExprContext **) - palloc0(sizeof(ExprContext *) * numGroupingSets); + aggstate->aggcontexts = palloc0_array(ExprContext *, numGroupingSets); /* * Create expression contexts. We need three or more, one for @@ -3492,15 +3491,15 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) * For each phase, prepare grouping set data and fmgr lookup data for * compare functions. Accumulate all_grouped_cols in passing. 
*/ - aggstate->phases = palloc0(numPhases * sizeof(AggStatePerPhaseData)); + aggstate->phases = palloc0_array(AggStatePerPhaseData, numPhases); aggstate->num_hashes = numHashes; if (numHashes) { - aggstate->perhash = palloc0(sizeof(AggStatePerHashData) * numHashes); + aggstate->perhash = palloc0_array(AggStatePerHashData, numHashes); aggstate->phases[0].numsets = 0; - aggstate->phases[0].gset_lengths = palloc(numHashes * sizeof(int)); - aggstate->phases[0].grouped_cols = palloc(numHashes * sizeof(Bitmapset *)); + aggstate->phases[0].gset_lengths = palloc_array(int, numHashes); + aggstate->phases[0].grouped_cols = palloc_array(Bitmapset *, numHashes); } phase = 0; @@ -3598,8 +3597,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) * Build a separate function for each subset of columns that * need to be compared. */ - phasedata->eqfunctions = - (ExprState **) palloc0(aggnode->numCols * sizeof(ExprState *)); + phasedata->eqfunctions = palloc0_array(ExprState *, aggnode->numCols); /* for each grouping set */ for (int k = 0; k < phasedata->numsets; k++) @@ -3655,27 +3653,24 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) * allocate my private per-agg working storage */ econtext = aggstate->ss.ps.ps_ExprContext; - econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs); - econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs); + econtext->ecxt_aggvalues = palloc0_array(Datum, numaggs); + econtext->ecxt_aggnulls = palloc0_array(bool, numaggs); - peraggs = (AggStatePerAgg) palloc0(sizeof(AggStatePerAggData) * numaggs); - pertransstates = (AggStatePerTrans) palloc0(sizeof(AggStatePerTransData) * numtrans); + peraggs = palloc0_array(AggStatePerAggData, numaggs); + pertransstates = palloc0_array(AggStatePerTransData, numtrans); aggstate->peragg = peraggs; aggstate->pertrans = pertransstates; - aggstate->all_pergroups = - (AggStatePerGroup *) palloc0(sizeof(AggStatePerGroup) - * (numGroupingSets + numHashes)); + aggstate->all_pergroups = palloc0_array(AggStatePerGroup, numGroupingSets + numHashes); pergroups = aggstate->all_pergroups; if (node->aggstrategy != AGG_HASHED) { for (i = 0; i < numGroupingSets; i++) { - pergroups[i] = (AggStatePerGroup) palloc0(sizeof(AggStatePerGroupData) - * numaggs); + pergroups[i] = palloc0_array(AggStatePerGroupData, numaggs); } aggstate->pergroups = pergroups; @@ -3688,7 +3683,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) if (use_hashing) { Plan *outerplan = outerPlan(node); - uint64 totalGroups = 0; + double totalGroups = 0; aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc, &TTSOpsMinimalTuple); @@ -4375,8 +4370,7 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans, pfree(ops); } - pertrans->sortstates = (Tuplesortstate **) - palloc0(sizeof(Tuplesortstate *) * numGroupingSets); + pertrans->sortstates = palloc0_array(Tuplesortstate *, numGroupingSets); } @@ -4429,18 +4423,18 @@ ExecEndAgg(AggState *node) hashagg_reset_spill_state(node); + /* Release hash tables too */ if (node->hash_metacxt != NULL) { MemoryContextDelete(node->hash_metacxt); node->hash_metacxt = NULL; } - if (node->hash_tablecxt != NULL) + if (node->hash_tuplescxt != NULL) { - MemoryContextDelete(node->hash_tablecxt); - node->hash_tablecxt = NULL; + MemoryContextDelete(node->hash_tuplescxt); + node->hash_tuplescxt = NULL; } - for (transno = 0; transno < node->numtrans; transno++) { AggStatePerTrans pertrans = &node->pertrans[transno]; @@ -4556,8 +4550,7 @@ ExecReScanAgg(AggState *node) node->hash_ngroups_current = 0; 
ReScanExprContext(node->hashcontext); - MemoryContextReset(node->hash_tablecxt); - /* Rebuild an empty hash table */ + /* Rebuild empty hash table(s) */ build_hash_tables(node); node->table_filled = false; /* iterator will be reset when the table is filled */ diff --git a/src/backend/executor/nodeAppend.c b/src/backend/executor/nodeAppend.c index a11b36c717662..77c4dd9e4b18f 100644 --- a/src/backend/executor/nodeAppend.c +++ b/src/backend/executor/nodeAppend.c @@ -263,7 +263,7 @@ ExecInitAppend(Append *node, EState *estate, int eflags) { AsyncRequest *areq; - areq = palloc(sizeof(AsyncRequest)); + areq = palloc_object(AsyncRequest); areq->requestor = (PlanState *) appendstate; areq->requestee = appendplanstates[i]; areq->request_index = i; diff --git a/src/backend/executor/nodeFunctionscan.c b/src/backend/executor/nodeFunctionscan.c index 644363582d913..af75dd8fc5e5d 100644 --- a/src/backend/executor/nodeFunctionscan.c +++ b/src/backend/executor/nodeFunctionscan.c @@ -333,7 +333,7 @@ ExecInitFunctionScan(FunctionScan *node, EState *estate, int eflags) */ ExecAssignExprContext(estate, &scanstate->ss.ps); - scanstate->funcstates = palloc(nfuncs * sizeof(FunctionScanPerFuncState)); + scanstate->funcstates = palloc_array(FunctionScanPerFuncState, nfuncs); natts = 0; i = 0; diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c index dc7d1830259f5..572a54df6add2 100644 --- a/src/backend/executor/nodeGather.c +++ b/src/backend/executor/nodeGather.c @@ -36,6 +36,7 @@ #include "executor/tqueue.h" #include "miscadmin.h" #include "optimizer/optimizer.h" +#include "storage/bufmgr.h" #include "utils/wait_event.h" @@ -161,6 +162,17 @@ ExecGather(PlanState *pstate) { ParallelContext *pcxt; + /* + * Flush temporary buffers if this parallel section contains + * any objects with temporary storage type. Don't bother to do it + * more than once per query execution. + */ + if (gather->process_temp_tables && !estate->es_tempbufs_flushed) + { + FlushAllBuffers(); + estate->es_tempbufs_flushed = true; + } + /* Initialize, or re-initialize, shared state needed by workers. */ if (!node->pei) node->pei = ExecInitParallelPlan(outerPlanState(node), diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c index 15f8459706773..4232a5a3a0bb4 100644 --- a/src/backend/executor/nodeGatherMerge.c +++ b/src/backend/executor/nodeGatherMerge.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "access/htup_details.h" #include "executor/executor.h" #include "executor/execParallel.h" #include "executor/nodeGatherMerge.h" @@ -21,6 +22,7 @@ #include "lib/binaryheap.h" #include "miscadmin.h" #include "optimizer/optimizer.h" +#include "storage/bufmgr.h" /* * When we read tuples from workers, it's a good idea to read several at once @@ -144,8 +146,7 @@ ExecInitGatherMerge(GatherMerge *node, EState *estate, int eflags) int i; gm_state->gm_nkeys = node->numCols; - gm_state->gm_sortkeys = - palloc0(sizeof(SortSupportData) * node->numCols); + gm_state->gm_sortkeys = palloc0_array(SortSupportData, node->numCols); for (i = 0; i < node->numCols; i++) { @@ -205,6 +206,13 @@ ExecGatherMerge(PlanState *pstate) { ParallelContext *pcxt; + /* Same as in ExecGather above */ + if (gm->process_temp_tables && !estate->es_tempbufs_flushed) + { + FlushAllBuffers(); + estate->es_tempbufs_flushed = true; + } + /* Initialize, or re-initialize, shared state needed by workers.
*/ if (!node->pei) node->pei = ExecInitParallelPlan(outerPlanState(node), @@ -417,8 +425,7 @@ gather_merge_setup(GatherMergeState *gm_state) for (i = 0; i < nreaders; i++) { /* Allocate the tuple array with length MAX_TUPLE_STORE */ - gm_state->gm_tuple_buffers[i].tuple = - (MinimalTuple *) palloc0(sizeof(MinimalTuple) * MAX_TUPLE_STORE); + gm_state->gm_tuple_buffers[i].tuple = palloc0_array(MinimalTuple, MAX_TUPLE_STORE); /* Initialize tuple slot for worker */ gm_state->gm_slots[i + 1] = diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 8d2201ab67fa5..88441859bf98d 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -36,7 +36,6 @@ #include "executor/nodeHashjoin.h" #include "miscadmin.h" #include "port/pg_bitutils.h" -#include "utils/dynahash.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" @@ -340,7 +339,7 @@ MultiExecParallelHash(HashState *node) */ hashtable->curbatch = -1; hashtable->nbuckets = pstate->nbuckets; - hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets); hashtable->totalTuples = pstate->total_tuples; /* @@ -480,7 +479,7 @@ ExecHashTableCreate(HashState *state) &nbuckets, &nbatch, &num_skew_mcvs); /* nbuckets must be a power of 2 */ - log2_nbuckets = my_log2(nbuckets); + log2_nbuckets = pg_ceil_log2_32(nbuckets); Assert(nbuckets == (1 << log2_nbuckets)); /* @@ -851,85 +850,91 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, /* * Optimize the total amount of memory consumed by the hash node. * - * The nbatch calculation above focuses on the size of the in-memory hash - * table, assuming no per-batch overhead. Now adjust the number of batches - * and the size of the hash table to minimize total memory consumed by the - * hash node. - * - * Each batch file has a BLCKSZ buffer, and we may need two files per - * batch (inner and outer side). So with enough batches this can be - * significantly more memory than the hashtable itself. + * The nbatch calculation above focuses on the in-memory hash table, + * assuming no per-batch overhead. But each batch may have two files, each + * with a BLCKSZ buffer. For large nbatch values these buffers may use + * significantly more memory than the hash table. * * The total memory usage may be expressed by this formula: * - * (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ) <= hash_table_bytes + * (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ) * * where (inner_rel_bytes / nbatch) is the size of the in-memory hash * table and (2 * nbatch * BLCKSZ) is the amount of memory used by file - * buffers. But for sufficiently large values of inner_rel_bytes value - * there may not be a nbatch value that would make both parts fit into - * hash_table_bytes. - * - * In this case we can't enforce the memory limit - we're going to exceed - * it. We can however minimize the impact and use as little memory as - * possible. (We haven't really enforced it before either, as we simply - * ignored the batch files.) + * buffers. * - * The formula for total memory usage says that given an inner relation of - * size inner_rel_bytes, we may divide it into an arbitrary number of - * batches. This determines both the size of the in-memory hash table and - * the amount of memory needed for batch files. These two terms work in - * opposite ways - when one decreases, the other increases. + * The nbatch calculation however ignores the second part. 
And for very + * large inner_rel_bytes, there may be no nbatch that keeps total memory + * usage under the budget (work_mem * hash_mem_multiplier). To deal with + * that, we will adjust nbatch to minimize total memory consumption across + * both the hashtable and file buffers. * - * For low nbatch values, the hash table takes most of the memory, but at - * some point the batch files start to dominate. If you combine these two - * terms, the memory consumption (for a fixed size of the inner relation) - * has a u-shape, with a minimum at some nbatch value. + * As we increase the size of the hashtable, the number of batches + * decreases, and the total memory usage follows a U-shaped curve. We find + * the minimum nbatch by "walking back" -- checking if halving nbatch + * would lower the total memory usage. We stop when it no longer helps. * - * Our goal is to find this nbatch value, minimizing the memory usage. We - * calculate the memory usage with half the batches (i.e. nbatch/2), and - * if it's lower than the current memory usage we know it's better to use - * fewer batches. We repeat this until reducing the number of batches does - * not reduce the memory usage - we found the optimum. We know the optimum - * exists, thanks to the u-shape. + * We only reduce the number of batches. Adding batches reduces memory + * usage only when most of the memory is used by the hash table, with + * total memory usage within the limit or not far from it. We don't want + * to start batching when not needed, even if that would reduce memory + * usage. * - * We only want to do this when exceeding the memory limit, not every - * time. The goal is not to minimize memory usage in every case, but to - * minimize the memory usage when we can't stay within the memory limit. + * While growing the hashtable, we also adjust the number of buckets to + * maintain a load factor of NTUP_PER_BUCKET while squeezing tuples back + * from batches into the hashtable. * - * For this reason we only consider reducing the number of batches. We - * could try the opposite direction too, but that would save memory only - * when most of the memory is used by the hash table. And the hash table - * was used for the initial sizing, so we shouldn't be exceeding the - * memory limit too much. We might save memory by using more batches, but - * it would result in spilling more batch files, which does not seem like - * a great trade off. + * Note that we can only change nbuckets during initial hashtable sizing. + * Once we start building the hash, nbuckets is fixed (we may still grow + * the hash table). * - * While growing the hashtable, we also adjust the number of buckets, to - * not have more than one tuple per bucket (load factor 1). We can only do - * this during the initial sizing - once we start building the hash, - * nbucket is fixed. + * We double several parameters (space_allowed, nbuckets, num_skew_mcvs), + * which introduces a risk of overflow. We avoid this by exiting the loop + * early. We could do something smarter (e.g. capping nbuckets and + * continuing), but the complexity is not worth it. Such cases are + * extremely rare, and this is a best-effort attempt to reduce memory + * usage.
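+ * + * As a concrete illustration (hypothetical numbers, BLCKSZ = 8kB): for a + * 1TB inner relation and a 4MB budget, the initial nbatch is ~256k, so + * the batch buffers alone would need ~4GB. Halving nbatch to 128k and + * doubling the hash table to 8MB roughly halves the total, and repeating + * this bottoms out near nbatch = space_allowed / BLCKSZ -- around 8k + * batches and a 128MB hash table, i.e. roughly 256MB in total instead of + * more than 4GB.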
*/ - while (nbatch > 0) + while (nbatch > 1) { - /* how much memory are we using with current nbatch value */ - size_t current_space = hash_table_bytes + (2 * nbatch * BLCKSZ); + /* Check that buckets won't overflow MaxAllocSize */ + if (nbuckets > (MaxAllocSize / sizeof(HashJoinTuple) / 2)) + break; + + /* num_skew_mcvs should be less than nbuckets */ + Assert((*num_skew_mcvs) < (INT_MAX / 2)); - /* how much memory would we use with half the batches */ - size_t new_space = hash_table_bytes * 2 + (nbatch * BLCKSZ); + /* + * Check that space_allowed won't overflow SIZE_MAX. + * + * We don't use hash_table_bytes here, because it does not include the + * skew buckets. And we want to limit the overall memory limit. + */ + if ((*space_allowed) > (SIZE_MAX / 2)) + break; - /* If the memory usage would not decrease, we found the optimum. */ - if (current_space < new_space) + /* + * Will halving the number of batches and doubling the size of the + * hashtable reduce overall memory usage? + * + * This is the same as (S = space_allowed): + * + * (S + 2 * nbatch * BLCKSZ) < (S * 2 + nbatch * BLCKSZ) + * + * but avoiding intermediate overflow. + */ + if (nbatch < (*space_allowed) / BLCKSZ) break; /* - * It's better to use half the batches, so do that and adjust the - * nbucket in the opposite direction, and double the allowance. + * MaxAllocSize is sufficiently small that we are not worried about + * overflowing nbuckets. */ - nbatch /= 2; nbuckets *= 2; + *num_skew_mcvs = (*num_skew_mcvs) * 2; *space_allowed = (*space_allowed) * 2; + + nbatch /= 2; } Assert(nbuckets > 0); @@ -995,14 +1000,14 @@ ExecHashIncreaseBatchSize(HashJoinTable hashtable) * How much additional memory would doubling nbatch use? Each batch may * require two buffered files (inner/outer), with a BLCKSZ buffer. */ - size_t batchSpace = (hashtable->nbatch * 2 * BLCKSZ); + size_t batchSpace = (hashtable->nbatch * 2 * (size_t) BLCKSZ); /* * Compare the new space needed for doubling nbatch and for enlarging the * in-memory hash table. If doubling the hash table needs less memory, * just do that. Otherwise, continue with doubling the nbatch. * - * We're either doubling spaceAllowed of batchSpace, so which of those + * We're either doubling spaceAllowed or batchSpace, so which of those * increases the memory usage the least is the same as comparing the * values directly. 
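+ * + * For example (hypothetical numbers, BLCKSZ = 8kB): with nbatch = 1024 + * the batch files currently use batchSpace = 16MB, so with spaceAllowed + * = 64MB it is cheaper to double nbatch (adding another 16MB of buffers) + * than to double the in-memory hash table (adding 64MB).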
*/ @@ -3499,7 +3504,7 @@ ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno) dsa_get_address(hashtable->area, hashtable->batches[batchno].shared->buckets); hashtable->nbuckets = hashtable->parallel_state->nbuckets; - hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets); hashtable->current_chunk = NULL; hashtable->current_chunk_shared = InvalidDsaPointer; hashtable->batches[batchno].at_least_one_chunk = false; diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 5661ad7683004..cc50bee19eb93 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -899,7 +899,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) */ if (OidIsValid(hash->skewTable)) { - hashstate->skew_hashfunction = palloc0(sizeof(FmgrInfo)); + hashstate->skew_hashfunction = palloc0_object(FmgrInfo); hashstate->skew_collation = linitial_oid(node->hashcollations); fmgr_info(outer_hashfuncid[0], hashstate->skew_hashfunction); } @@ -1541,8 +1541,7 @@ ExecReScanHashJoin(HashJoinState *node) /* accumulate stats from old hash table, if wanted */ /* (this should match ExecShutdownHash) */ if (hashNode->ps.instrument && !hashNode->hinstrument) - hashNode->hinstrument = (HashInstrumentation *) - palloc0(sizeof(HashInstrumentation)); + hashNode->hinstrument = palloc0_object(HashInstrumentation); if (hashNode->hinstrument) ExecHashAccumInstrumentation(hashNode->hinstrument, hashNode->hashtable); diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f464cca9507a5..6bea42f128ff1 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -693,8 +693,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) * Now create an array to mark the attribute numbers of the keys that * need to be converted from cstring to name. 
*/ - indexstate->ioss_NameCStringAttNums = (AttrNumber *) - palloc(sizeof(AttrNumber) * namecount); + indexstate->ioss_NameCStringAttNums = palloc_array(AttrNumber, namecount); for (int attnum = 0; attnum < indnkeyatts; attnum++) { diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 7fcaa37fe6253..72b135e5dcf05 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -65,7 +65,7 @@ static int cmp_orderbyvals(const Datum *adist, const bool *anulls, static int reorderqueue_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg); static void reorderqueue_push(IndexScanState *node, TupleTableSlot *slot, - Datum *orderbyvals, bool *orderbynulls); + const Datum *orderbyvals, const bool *orderbynulls); static HeapTuple reorderqueue_pop(IndexScanState *node); @@ -458,7 +458,7 @@ reorderqueue_cmp(const pairingheap_node *a, const pairingheap_node *b, */ static void reorderqueue_push(IndexScanState *node, TupleTableSlot *slot, - Datum *orderbyvals, bool *orderbynulls) + const Datum *orderbyvals, const bool *orderbynulls) { IndexScanDesc scandesc = node->iss_ScanDesc; EState *estate = node->ss.ps.state; @@ -466,12 +466,10 @@ reorderqueue_push(IndexScanState *node, TupleTableSlot *slot, ReorderTuple *rt; int i; - rt = (ReorderTuple *) palloc(sizeof(ReorderTuple)); + rt = palloc_object(ReorderTuple); rt->htup = ExecCopySlotHeapTuple(slot); - rt->orderbyvals = - (Datum *) palloc(sizeof(Datum) * scandesc->numberOfOrderBys); - rt->orderbynulls = - (bool *) palloc(sizeof(bool) * scandesc->numberOfOrderBys); + rt->orderbyvals = palloc_array(Datum, scandesc->numberOfOrderBys); + rt->orderbynulls = palloc_array(bool, scandesc->numberOfOrderBys); for (i = 0; i < node->iss_NumOrderByKeys; i++) { if (!orderbynulls[i]) diff --git a/src/backend/executor/nodeMemoize.c b/src/backend/executor/nodeMemoize.c index 609deb12afb2a..7444391e8a194 100644 --- a/src/backend/executor/nodeMemoize.c +++ b/src/backend/executor/nodeMemoize.c @@ -66,6 +66,7 @@ #include "postgres.h" +#include "access/htup_details.h" #include "common/hashfn.h" #include "executor/executor.h" #include "executor/nodeMemoize.h" @@ -554,7 +555,7 @@ cache_lookup(MemoizeState *mstate, bool *found) oldcontext = MemoryContextSwitchTo(mstate->tableContext); /* Allocate a new key */ - entry->key = key = (MemoizeKey *) palloc(sizeof(MemoizeKey)); + entry->key = key = palloc_object(MemoizeKey); key->params = ExecCopySlotMinimalTuple(mstate->probeslot); /* Update the total cache memory utilization */ @@ -633,7 +634,7 @@ cache_store_tuple(MemoizeState *mstate, TupleTableSlot *slot) oldcontext = MemoryContextSwitchTo(mstate->tableContext); - tuple = (MemoizeTuple *) palloc(sizeof(MemoizeTuple)); + tuple = palloc_object(MemoizeTuple); tuple->mintuple = ExecCopySlotMinimalTuple(slot); tuple->next = NULL; diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c index 405e8f942857f..300bcd5cf33a9 100644 --- a/src/backend/executor/nodeMergeAppend.c +++ b/src/backend/executor/nodeMergeAppend.c @@ -122,11 +122,11 @@ ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags) mergestate->ms_prune_state = NULL; } - mergeplanstates = (PlanState **) palloc(nplans * sizeof(PlanState *)); + mergeplanstates = palloc_array(PlanState *, nplans); mergestate->mergeplans = mergeplanstates; mergestate->ms_nplans = nplans; - mergestate->ms_slots = (TupleTableSlot **) palloc0(sizeof(TupleTableSlot *) * nplans); + mergestate->ms_slots = 
palloc0_array(TupleTableSlot *, nplans); mergestate->ms_heap = binaryheap_allocate(nplans, heap_compare_slots, mergestate); @@ -174,7 +174,7 @@ ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags) * initialize sort-key information */ mergestate->ms_nkeys = node->numCols; - mergestate->ms_sortkeys = palloc0(sizeof(SortSupportData) * node->numCols); + mergestate->ms_sortkeys = palloc0_array(SortSupportData, node->numCols); for (i = 0; i < node->numCols; i++) { diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 2bc89bf84dc3f..874b71e6608ea 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -64,9 +64,11 @@ #include "nodes/nodeFuncs.h" #include "optimizer/optimizer.h" #include "rewrite/rewriteHandler.h" +#include "rewrite/rewriteManip.h" #include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/datum.h" +#include "utils/injection_point.h" #include "utils/rel.h" #include "utils/snapmgr.h" @@ -579,8 +581,8 @@ ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo, oldContext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); - values = palloc(sizeof(*values) * natts); - nulls = palloc(sizeof(*nulls) * natts); + values = palloc_array(Datum, natts); + nulls = palloc_array(bool, natts); slot_getallattrs(slot); memcpy(nulls, slot->tts_isnull, sizeof(*nulls) * natts); @@ -960,10 +962,8 @@ ExecInsert(ModifyTableContext *context, if (resultRelInfo->ri_Slots == NULL) { - resultRelInfo->ri_Slots = palloc(sizeof(TupleTableSlot *) * - resultRelInfo->ri_BatchSize); - resultRelInfo->ri_PlanSlots = palloc(sizeof(TupleTableSlot *) * - resultRelInfo->ri_BatchSize); + resultRelInfo->ri_Slots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize); + resultRelInfo->ri_PlanSlots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize); } /* @@ -1185,6 +1185,7 @@ ExecInsert(ModifyTableContext *context, * if we're going to go ahead with the insertion, instead of * waiting for the whole transaction to complete. */ + INJECTION_POINT("exec-insert-before-insert-speculative", NULL); specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); /* insert the tuple, with the speculative token */ @@ -1473,7 +1474,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRDeleteTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, - epqreturnslot, result, &context->tmfd); + epqreturnslot, result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; @@ -2116,7 +2118,8 @@ ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRUpdateTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, slot, - result, &context->tmfd); + result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; @@ -3399,7 +3402,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * the tuple moved, and setting our current * resultRelInfo to that. 
*/ - if (ItemPointerIndicatesMovedPartitions(&context->tmfd.ctid)) + if (ItemPointerIndicatesMovedPartitions(tupleid)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("tuple to be merged was already moved to another partition due to concurrent update"))); @@ -3447,12 +3450,13 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (ItemPointerIsValid(&lockedtid)) UnlockTuple(resultRelInfo->ri_RelationDesc, &lockedtid, InplaceUpdateTupleLock); - LockTuple(resultRelInfo->ri_RelationDesc, &context->tmfd.ctid, + LockTuple(resultRelInfo->ri_RelationDesc, tupleid, InplaceUpdateTupleLock); - lockedtid = context->tmfd.ctid; + lockedtid = *tupleid; } + if (!table_tuple_fetch_row_version(resultRelationDesc, - &context->tmfd.ctid, + tupleid, SnapshotAny, resultRelInfo->ri_oldTupleSlot)) elog(ERROR, "failed to fetch the target tuple"); @@ -3463,7 +3467,28 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, /* Switch lists, if necessary */ if (!*matched) + { actionStates = mergeActions[MERGE_WHEN_NOT_MATCHED_BY_SOURCE]; + + /* + * If we have both NOT MATCHED BY SOURCE + * and NOT MATCHED BY TARGET actions (a + * full join between the source and target + * relations), the single previously + * matched tuple from the outer plan node + * is treated as two not matched tuples, + * in the same way as if they had not + * matched to start with. Therefore, we + * must adjust the outer plan node's tuple + * count, if we're instrumenting the + * query, to get the correct "skipped" row + * count --- see show_modifytable_info(). + */ + if (outerPlanState(mtstate)->instrument && + mergeActions[MERGE_WHEN_NOT_MATCHED_BY_SOURCE] && + mergeActions[MERGE_WHEN_NOT_MATCHED_BY_TARGET]) + InstrUpdateTupleCount(outerPlanState(mtstate)->instrument, 1.0); + } } /* @@ -3735,6 +3760,7 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate) switch (action->commandType) { case CMD_INSERT: + /* INSERT actions always use rootRelInfo */ ExecCheckPlanOutput(rootRelInfo->ri_RelationDesc, action->targetList); @@ -3774,9 +3800,23 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate) } else { - /* not partitioned? use the stock relation and slot */ - tgtslot = resultRelInfo->ri_newTupleSlot; - tgtdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + /* + * If the MERGE targets an inherited table, we insert + * into the root table, so we must initialize its + * "new" tuple slot, if not already done, and use its + * relation descriptor for the projection. + * + * For non-inherited tables, rootRelInfo and + * resultRelInfo are the same, and the "new" tuple + * slot will already have been initialized. + */ + if (rootRelInfo->ri_newTupleSlot == NULL) + rootRelInfo->ri_newTupleSlot = + table_slot_create(rootRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + + tgtslot = rootRelInfo->ri_newTupleSlot; + tgtdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc); } action_state->mas_proj = @@ -3809,6 +3849,114 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate) } } } + + /* + * If the MERGE targets an inherited table, any INSERT actions will use + * rootRelInfo, and rootRelInfo will not be in the resultRelInfo array. + * Therefore we must initialize its WITH CHECK OPTION constraints and + * RETURNING projection, as ExecInitModifyTable did for the resultRelInfo + * entries. 
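+ * For example, a MERGE into the root of a plain inheritance tree with an + * INSERT action inserts the new rows into the root table itself, so the + * root's WITH CHECK OPTION and RETURNING state must be built here even + * though the root is not among the planner's result relations.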
+ * + * Note that the planner does not build a withCheckOptionList or + * returningList for the root relation, but as in ExecInitPartitionInfo, + * we can use the first resultRelInfo entry as a reference to calculate + * the attno's for the root table. + */ + if (rootRelInfo != mtstate->resultRelInfo && + rootRelInfo->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE && + (mtstate->mt_merge_subcommands & MERGE_INSERT) != 0) + { + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Relation rootRelation = rootRelInfo->ri_RelationDesc; + Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; + int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + AttrMap *part_attmap = NULL; + bool found_whole_row; + + if (node->withCheckOptionLists != NIL) + { + List *wcoList; + List *wcoExprs = NIL; + + /* There should be as many WCO lists as result rels */ + Assert(list_length(node->withCheckOptionLists) == + list_length(node->resultRelations)); + + /* + * Use the first WCO list as a reference. In the most common case, + * this will be for the same relation as rootRelInfo, and so there + * will be no need to adjust its attno's. + */ + wcoList = linitial(node->withCheckOptionLists); + if (rootRelation != firstResultRel) + { + /* Convert any Vars in it to contain the root's attno's */ + part_attmap = + build_attrmap_by_name(RelationGetDescr(rootRelation), + RelationGetDescr(firstResultRel), + false); + + wcoList = (List *) + map_variable_attnos((Node *) wcoList, + firstVarno, 0, + part_attmap, + RelationGetForm(rootRelation)->reltype, + &found_whole_row); + } + + foreach(lc, wcoList) + { + WithCheckOption *wco = lfirst_node(WithCheckOption, lc); + ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), + &mtstate->ps); + + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + rootRelInfo->ri_WithCheckOptions = wcoList; + rootRelInfo->ri_WithCheckOptionExprs = wcoExprs; + } + + if (node->returningLists != NIL) + { + List *returningList; + + /* There should be as many returning lists as result rels */ + Assert(list_length(node->returningLists) == + list_length(node->resultRelations)); + + /* + * Use the first returning list as a reference. In the most common + * case, this will be for the same relation as rootRelInfo, and so + * there will be no need to adjust its attno's. 
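+ * (For instance, if the first result relation has columns (a, b, + * <dropped>, c) and the root has (a, b, c), the attribute map converts a + * Var referencing the child's column 4 into the root's column 3.)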
+ */ + returningList = linitial(node->returningLists); + if (rootRelation != firstResultRel) + { + /* Convert any Vars in it to contain the root's attno's */ + if (part_attmap == NULL) + part_attmap = + build_attrmap_by_name(RelationGetDescr(rootRelation), + RelationGetDescr(firstResultRel), + false); + + returningList = (List *) + map_variable_attnos((Node *) returningList, + firstVarno, 0, + part_attmap, + RelationGetForm(rootRelation)->reltype, + &found_whole_row); + } + rootRelInfo->ri_returningList = returningList; + + /* Initialize the RETURNING projection */ + rootRelInfo->ri_projectReturning = + ExecBuildProjectionInfo(returningList, econtext, + mtstate->ps.ps_ResultTupleSlot, + &mtstate->ps, + RelationGetDescr(rootRelation)); + } + } } /* @@ -4595,8 +4743,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) mtstate->mt_done = false; mtstate->mt_nrels = nrels; - mtstate->resultRelInfo = (ResultRelInfo *) - palloc(nrels * sizeof(ResultRelInfo)); + mtstate->resultRelInfo = palloc_array(ResultRelInfo, nrels); mtstate->mt_merge_pending_not_matched = NULL; mtstate->mt_merge_inserted = 0; @@ -4685,7 +4832,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) /* * Verify result relation is a valid target for the current operation */ - CheckValidResultRel(resultRelInfo, operation, mergeActions); + CheckValidResultRel(resultRelInfo, operation, node->onConflictAction, + mergeActions); resultRelInfo++; i++; diff --git a/src/backend/executor/nodeProjectSet.c b/src/backend/executor/nodeProjectSet.c index 880f39fb2ff1e..7d621cebc7bc6 100644 --- a/src/backend/executor/nodeProjectSet.c +++ b/src/backend/executor/nodeProjectSet.c @@ -267,10 +267,8 @@ ExecInitProjectSet(ProjectSet *node, EState *estate, int eflags) /* Create workspace for per-tlist-entry expr state & SRF-is-done state */ state->nelems = list_length(node->plan.targetlist); - state->elems = (Node **) - palloc(sizeof(Node *) * state->nelems); - state->elemdone = (ExprDoneCond *) - palloc(sizeof(ExprDoneCond) * state->nelems); + state->elems = palloc_array(Node *, state->nelems); + state->elemdone = palloc_array(ExprDoneCond, state->nelems); /* * Build expressions to evaluate targetlist. We can't use diff --git a/src/backend/executor/nodeRecursiveunion.c b/src/backend/executor/nodeRecursiveunion.c index 40f66fd0680b2..cd0ad51dcd297 100644 --- a/src/backend/executor/nodeRecursiveunion.c +++ b/src/backend/executor/nodeRecursiveunion.c @@ -35,7 +35,6 @@ build_hash_table(RecursiveUnionState *rustate) TupleDesc desc = ExecGetResultType(outerPlanState(rustate)); Assert(node->numCols > 0); - Assert(node->numGroups > 0); /* * If both child plans deliver the same fixed tuple slot type, we can tell @@ -53,7 +52,7 @@ build_hash_table(RecursiveUnionState *rustate) node->numGroups, 0, rustate->ps.state->es_query_cxt, - rustate->tableContext, + rustate->tuplesContext, rustate->tempContext, false); } @@ -197,7 +196,7 @@ ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags) rustate->hashfunctions = NULL; rustate->hashtable = NULL; rustate->tempContext = NULL; - rustate->tableContext = NULL; + rustate->tuplesContext = NULL; /* initialize processing state */ rustate->recursing = false; @@ -209,7 +208,8 @@ ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags) * If hashing, we need a per-tuple memory context for comparisons, and a * longer-lived context to store the hash table. 
The table can't just be * kept in the per-query context because we want to be able to throw it - * away when rescanning. + * away when rescanning. We can use a BumpContext to save storage, + * because we will have no need to delete individual table entries. */ if (node->numCols > 0) { @@ -217,10 +217,10 @@ ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags) AllocSetContextCreate(CurrentMemoryContext, "RecursiveUnion", ALLOCSET_DEFAULT_SIZES); - rustate->tableContext = - AllocSetContextCreate(CurrentMemoryContext, - "RecursiveUnion hash table", - ALLOCSET_DEFAULT_SIZES); + rustate->tuplesContext = + BumpContextCreate(CurrentMemoryContext, + "RecursiveUnion hashed tuples", + ALLOCSET_DEFAULT_SIZES); } /* @@ -288,11 +288,11 @@ ExecEndRecursiveUnion(RecursiveUnionState *node) tuplestore_end(node->working_table); tuplestore_end(node->intermediate_table); - /* free subsidiary stuff including hashtable */ + /* free subsidiary stuff including hashtable data */ if (node->tempContext) MemoryContextDelete(node->tempContext); - if (node->tableContext) - MemoryContextDelete(node->tableContext); + if (node->tuplesContext) + MemoryContextDelete(node->tuplesContext); /* * close down subplans @@ -328,10 +328,6 @@ ExecReScanRecursiveUnion(RecursiveUnionState *node) if (outerPlan->chgParam == NULL) ExecReScan(outerPlan); - /* Release any hashtable storage */ - if (node->tableContext) - MemoryContextReset(node->tableContext); - /* Empty hashtable if needed */ if (plan->numCols > 0) ResetTupleHashTable(node->hashtable); diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index 6b3db7548ed99..c28bc6fc62044 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -228,7 +228,7 @@ tablesample_init(SampleScanState *scanstate) ListCell *arg; scanstate->donetuples = 0; - params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum)); + params = palloc_array(Datum, list_length(scanstate->args)); i = 0; foreach(arg, scanstate->args) diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index ed35c58c2c346..94047d29430d6 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -131,8 +131,12 @@ ExecSeqScanWithQual(PlanState *pstate) { SeqScanState *node = castNode(SeqScanState, pstate); + /* + * Use pg_assume() for != NULL tests to make the compiler realize no + * runtime check for the field is needed in ExecScanExtended(). 
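+ * + * (pg_assume(expr) is checked like an Assert in assertion-enabled + * builds, while in optimized builds it becomes a compiler-level + * assumption that the condition holds, which is what lets the compiler + * drop the runtime NULL test here.)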
+ */ Assert(pstate->state->es_epq_active == NULL); - Assert(pstate->qual != NULL); + pg_assume(pstate->qual != NULL); Assert(pstate->ps_ProjInfo == NULL); return ExecScanExtended(&node->ss, @@ -153,7 +157,7 @@ ExecSeqScanWithProject(PlanState *pstate) Assert(pstate->state->es_epq_active == NULL); Assert(pstate->qual == NULL); - Assert(pstate->ps_ProjInfo != NULL); + pg_assume(pstate->ps_ProjInfo != NULL); return ExecScanExtended(&node->ss, (ExecScanAccessMtd) SeqNext, @@ -173,8 +177,8 @@ ExecSeqScanWithQualProject(PlanState *pstate) SeqScanState *node = castNode(SeqScanState, pstate); Assert(pstate->state->es_epq_active == NULL); - Assert(pstate->qual != NULL); - Assert(pstate->ps_ProjInfo != NULL); + pg_assume(pstate->qual != NULL); + pg_assume(pstate->ps_ProjInfo != NULL); return ExecScanExtended(&node->ss, (ExecScanAccessMtd) SeqNext, diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c index 4068481a52392..9e0f9274fb190 100644 --- a/src/backend/executor/nodeSetOp.c +++ b/src/backend/executor/nodeSetOp.c @@ -88,7 +88,6 @@ build_hash_table(SetOpState *setopstate) TupleDesc desc = ExecGetResultType(outerPlanState(setopstate)); Assert(node->strategy == SETOP_HASHED); - Assert(node->numGroups > 0); /* * If both child plans deliver the same fixed tuple slot type, we can tell @@ -106,11 +105,20 @@ build_hash_table(SetOpState *setopstate) node->numGroups, sizeof(SetOpStatePerGroupData), setopstate->ps.state->es_query_cxt, - setopstate->tableContext, + setopstate->tuplesContext, econtext->ecxt_per_tuple_memory, false); } +/* Planner support routine to estimate space needed for hash table */ +Size +EstimateSetOpHashTableSpace(double nentries, Size tupleWidth) +{ + return EstimateTupleHashTableSpace(nentries, + tupleWidth, + sizeof(SetOpStatePerGroupData)); +} + /* * We've completed processing a tuple group. Decide how many copies (if any) * of its representative row to emit, and store the count into numOutput. @@ -589,13 +597,15 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) /* * If hashing, we also need a longer-lived context to store the hash * table. The table can't just be kept in the per-query context because - * we want to be able to throw it away in ExecReScanSetOp. + * we want to be able to throw it away in ExecReScanSetOp. We can use a + * BumpContext to save storage, because we will have no need to delete + * individual table entries. 
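+ * (A bump context is append-only: it does not support pfree() or + * repalloc() of individual chunks, which lets it skip most per-chunk + * bookkeeping and pack the stored tuples more densely.)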
*/ if (node->strategy == SETOP_HASHED) - setopstate->tableContext = - AllocSetContextCreate(CurrentMemoryContext, - "SetOp hash table", - ALLOCSET_DEFAULT_SIZES); + setopstate->tuplesContext = + BumpContextCreate(CurrentMemoryContext, + "SetOp hashed tuples", + ALLOCSET_DEFAULT_SIZES); /* * initialize child nodes @@ -680,9 +690,9 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) void ExecEndSetOp(SetOpState *node) { - /* free subsidiary stuff including hashtable */ - if (node->tableContext) - MemoryContextDelete(node->tableContext); + /* free subsidiary stuff including hashtable data */ + if (node->tuplesContext) + MemoryContextDelete(node->tuplesContext); ExecEndNode(outerPlanState(node)); ExecEndNode(innerPlanState(node)); @@ -721,11 +731,7 @@ ExecReScanSetOp(SetOpState *node) return; } - /* Release any hashtable storage */ - if (node->tableContext) - MemoryContextReset(node->tableContext); - - /* And rebuild an empty hashtable */ + /* Else, we must rebuild the hashtable */ ResetTupleHashTable(node->hashtable); node->table_filled = false; } diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index f7f6fc2da0b95..c8b7bd9eb6610 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -34,7 +34,6 @@ #include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" -#include "optimizer/optimizer.h" #include "utils/array.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -102,6 +101,7 @@ ExecHashSubPlan(SubPlanState *node, ExprContext *econtext, bool *isNull) { + bool result = false; SubPlan *subplan = node->subplan; PlanState *planstate = node->planstate; TupleTableSlot *slot; @@ -132,14 +132,6 @@ ExecHashSubPlan(SubPlanState *node, node->projLeft->pi_exprContext = econtext; slot = ExecProject(node->projLeft); - /* - * Note: because we are typically called in a per-tuple context, we have - * to explicitly clear the projected tuple before returning. Otherwise, - * we'll have a double-free situation: the per-tuple context will probably - * be reset before we're called again, and then the tuple slot will think - * it still needs to free the tuple. - */ - /* * If the LHS is all non-null, probe for an exact match in the main hash * table. If we find one, the result is TRUE. Otherwise, scan the @@ -161,19 +153,10 @@ ExecHashSubPlan(SubPlanState *node, slot, node->cur_eq_comp, node->lhs_hash_expr) != NULL) - { - ExecClearTuple(slot); - return BoolGetDatum(true); - } - if (node->havenullrows && - findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + result = true; + else if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } - ExecClearTuple(slot); - return BoolGetDatum(false); } /* @@ -186,34 +169,31 @@ ExecHashSubPlan(SubPlanState *node, * aren't provably unequal to the LHS; if so, the result is UNKNOWN. * Otherwise, the result is FALSE. 
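+ * + * For example, "3 IN (1, 2, NULL)" finds no exact match for 3, and 3 is + * not provably unequal to the NULL row, so the result is UNKNOWN; with + * "3 IN (1, 2)" there are no NULLs anywhere and the result is FALSE.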
*/ - if (node->hashnulls == NULL) - { - ExecClearTuple(slot); - return BoolGetDatum(false); - } - if (slotAllNulls(slot)) - { - ExecClearTuple(slot); + else if (node->hashnulls == NULL) + /* just return FALSE */ ; + else if (slotAllNulls(slot)) *isNull = true; - return BoolGetDatum(false); - } /* Scan partly-null table first, since more likely to get a match */ - if (node->havenullrows && - findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + else if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } - if (node->havehashrows && - findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + else if (node->havehashrows && + findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } + + /* + * Note: because we are typically called in a per-tuple context, we have + * to explicitly clear the projected tuple before returning. Otherwise, + * we'll have a double-free situation: the per-tuple context will probably + * be reset before we're called again, and then the tuple slot will think + * it still needs to free the tuple. + */ ExecClearTuple(slot); - return BoolGetDatum(false); + + /* Also must reset the innerecontext after each hashtable lookup. */ + ResetExprContext(node->innerecontext); + + return BoolGetDatum(result); } /* @@ -500,7 +480,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) int ncols = node->numCols; ExprContext *innerecontext = node->innerecontext; MemoryContext oldcontext; - long nbuckets; + double nentries; TupleTableSlot *slot; Assert(subplan->subLinkType == ANY_SUBLINK); @@ -525,13 +505,10 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) * saves a needless fetch inner op step for the hashing ExprState created * in BuildTupleHashTable(). */ - MemoryContextReset(node->hashtablecxt); node->havehashrows = false; node->havenullrows = false; - nbuckets = clamp_cardinality_to_long(planstate->plan->plan_rows); - if (nbuckets < 1) - nbuckets = 1; + nentries = planstate->plan->plan_rows; if (node->hashtable) ResetTupleHashTable(node->hashtable); @@ -544,22 +521,22 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) node->tab_eq_funcoids, node->tab_hash_funcs, node->tab_collations, - nbuckets, - 0, + nentries, + 0, /* no additional data */ node->planstate->state->es_query_cxt, - node->hashtablecxt, - node->hashtempcxt, + node->tuplesContext, + innerecontext->ecxt_per_tuple_memory, false); if (!subplan->unknownEqFalse) { if (ncols == 1) - nbuckets = 1; /* there can only be one entry */ + nentries = 1; /* there can only be one entry */ else { - nbuckets /= 16; - if (nbuckets < 1) - nbuckets = 1; + nentries /= 16; + if (nentries < 1) + nentries = 1; } if (node->hashnulls) @@ -573,11 +550,11 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) node->tab_eq_funcoids, node->tab_hash_funcs, node->tab_collations, - nbuckets, - 0, + nentries, + 0, /* no additional data */ node->planstate->state->es_query_cxt, - node->hashtablecxt, - node->hashtempcxt, + node->tuplesContext, + innerecontext->ecxt_per_tuple_memory, false); } else @@ -639,7 +616,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) /* * Reset innerecontext after each inner tuple to free any memory used - * during ExecProject. + * during ExecProject and hashtable lookup. 
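+ * (The hash tables use innerecontext's per-tuple memory as their + * temporary context, so this reset also frees lookup scratch space.)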
*/ ResetExprContext(innerecontext); } @@ -656,6 +633,55 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) MemoryContextSwitchTo(oldcontext); } +/* Planner support routine to estimate space needed for hash table(s) */ +Size +EstimateSubplanHashTableSpace(double nentries, + Size tupleWidth, + bool unknownEqFalse) +{ + Size tab1space, + tab2space; + + /* Estimate size of main hashtable */ + tab1space = EstimateTupleHashTableSpace(nentries, + tupleWidth, + 0 /* no additional data */ ); + + /* Give up if that's already too big */ + if (tab1space >= SIZE_MAX) + return tab1space; + + /* Done if we don't need a hashnulls table */ + if (unknownEqFalse) + return tab1space; + + /* + * Adjust the rowcount estimate in the same way that buildSubPlanHash + * will, except that we don't bother with the special case for a single + * hash column. (We skip that detail because it'd be notationally painful + * for our caller to provide the column count, and this table has + * relatively little impact on the total estimate anyway.) + */ + nentries /= 16; + if (nentries < 1) + nentries = 1; + + /* + * It might be sane to also reduce the tupleWidth, but on the other hand + * we are not accounting for the space taken by the tuples' null bitmaps. + * Leave it alone for now. + */ + tab2space = EstimateTupleHashTableSpace(nentries, + tupleWidth, + 0 /* no additional data */ ); + + /* Guard against overflow */ + if (tab2space >= SIZE_MAX - tab1space) + return SIZE_MAX; + + return tab1space + tab2space; +} + /* * execTuplesUnequal * Return true if two tuples are definitely unequal in the indicated @@ -857,8 +883,7 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) sstate->projRight = NULL; sstate->hashtable = NULL; sstate->hashnulls = NULL; - sstate->hashtablecxt = NULL; - sstate->hashtempcxt = NULL; + sstate->tuplesContext = NULL; sstate->innerecontext = NULL; sstate->keyColIdx = NULL; sstate->tab_eq_funcoids = NULL; @@ -909,16 +934,11 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) *righttlist; ListCell *l; - /* We need a memory context to hold the hash table(s) */ - sstate->hashtablecxt = - AllocSetContextCreate(CurrentMemoryContext, - "Subplan HashTable Context", - ALLOCSET_DEFAULT_SIZES); - /* and a small one for the hash tables to use as temp storage */ - sstate->hashtempcxt = - AllocSetContextCreate(CurrentMemoryContext, - "Subplan HashTable Temp Context", - ALLOCSET_SMALL_SIZES); + /* We need a memory context to hold the hash table(s)' tuples */ + sstate->tuplesContext = + BumpContextCreate(CurrentMemoryContext, + "SubPlan hashed tuples", + ALLOCSET_DEFAULT_SIZES); /* and a short-lived exprcontext for function evaluation */ sstate->innerecontext = CreateExprContext(estate); diff --git a/src/backend/executor/nodeTableFuncscan.c b/src/backend/executor/nodeTableFuncscan.c index 83ade3f943763..4abada0e03e0f 100644 --- a/src/backend/executor/nodeTableFuncscan.c +++ b/src/backend/executor/nodeTableFuncscan.c @@ -192,8 +192,8 @@ ExecInitTableFuncScan(TableFuncScan *node, EState *estate, int eflags) scanstate->notnulls = tf->notnulls; /* these are allocated now and initialized later */ - scanstate->in_functions = palloc(sizeof(FmgrInfo) * tupdesc->natts); - scanstate->typioparams = palloc(sizeof(Oid) * tupdesc->natts); + scanstate->in_functions = palloc_array(FmgrInfo, tupdesc->natts); + scanstate->typioparams = palloc_array(Oid, tupdesc->natts); /* * Fill in the necessary fmgr infos. 
@@ -363,7 +363,7 @@ tfuncInitialize(TableFuncScanState *tstate, ExprContext *econtext, Datum doc) char *ns_uri; char *ns_name; - value = ExecEvalExpr((ExprState *) expr, econtext, &isnull); + value = ExecEvalExpr(expr, econtext, &isnull); if (isnull) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index ab2eab9596e42..4ceb181d622c0 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -72,7 +72,7 @@ MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate) else elog(ERROR, "could not identify CTID variable"); - tidopexpr = (TidOpExpr *) palloc(sizeof(TidOpExpr)); + tidopexpr = palloc_object(TidOpExpr); tidopexpr->inclusive = false; /* for now */ switch (expr->opno) @@ -128,9 +128,11 @@ TidExprListCreate(TidRangeScanState *tidrangestate) * TidRangeEval * * Compute and set node's block and offset range to scan by evaluating - * the trss_tidexprs. Returns false if we detect the range cannot + * node->trss_tidexprs. Returns false if we detect the range cannot * contain any tuples. Returns true if it's possible for the range to - * contain tuples. + * contain tuples. We don't bother validating that trss_mintid is less + * than or equal to trss_maxtid, as the scan_set_tidrange() table AM + * function will handle that. * ---------------------------------------------------------------- */ static bool @@ -272,6 +274,16 @@ TidRangeNext(TidRangeScanState *node) static bool TidRangeRecheck(TidRangeScanState *node, TupleTableSlot *slot) { + if (!TidRangeEval(node)) + return false; + + Assert(ItemPointerIsValid(&slot->tts_tid)); + + /* Recheck the ctid is still within range */ + if (ItemPointerCompare(&slot->tts_tid, &node->trss_mintid) < 0 || + ItemPointerCompare(&slot->tts_tid, &node->trss_maxtid) > 0) + return false; + return true; } @@ -403,3 +415,83 @@ ExecInitTidRangeScan(TidRangeScan *node, EState *estate, int eflags) */ return tidrangestate; } + +/* ---------------------------------------------------------------- + * Parallel Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecTidRangeScanEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void +ExecTidRangeScanEstimate(TidRangeScanState *node, ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->trss_pscanlen = + table_parallelscan_estimate(node->ss.ss_currentRelation, + estate->es_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, node->trss_pscanlen); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecTidRangeScanInitializeDSM + * + * Set up a parallel TID range scan descriptor. 
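+ * + * The descriptor is allocated in the DSM space reserved by + * ExecTidRangeScanEstimate and keyed by the plan node ID, which is how + * ExecTidRangeScanInitializeWorker finds it from within each worker.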
+ * ---------------------------------------------------------------- + */ +void +ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + ParallelTableScanDesc pscan; + + pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen); + table_parallelscan_initialize(node->ss.ss_currentRelation, + pscan, + estate->es_snapshot); + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + node->ss.ss_currentScanDesc = + table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, + pscan); +} + +/* ---------------------------------------------------------------- + * ExecTidRangeScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecTidRangeScanReInitializeDSM(TidRangeScanState *node, + ParallelContext *pcxt) +{ + ParallelTableScanDesc pscan; + + pscan = node->ss.ss_currentScanDesc->rs_parallel; + table_parallelscan_reinitialize(node->ss.ss_currentRelation, pscan); +} + +/* ---------------------------------------------------------------- + * ExecTidRangeScanInitializeWorker + * + * Copy relevant information from TOC into planstate. + * ---------------------------------------------------------------- + */ +void +ExecTidRangeScanInitializeWorker(TidRangeScanState *node, + ParallelWorkerContext *pwcxt) +{ + ParallelTableScanDesc pscan; + + pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->ss.ss_currentScanDesc = + table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, + pscan); +} diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c index 5e56e29a15fc4..35fcd5625db32 100644 --- a/src/backend/executor/nodeTidscan.c +++ b/src/backend/executor/nodeTidscan.c @@ -78,7 +78,7 @@ TidExprListCreate(TidScanState *tidstate) foreach(l, node->tidquals) { Expr *expr = (Expr *) lfirst(l); - TidExpr *tidexpr = (TidExpr *) palloc0(sizeof(TidExpr)); + TidExpr *tidexpr = palloc0_object(TidExpr); if (is_opclause(expr)) { @@ -402,12 +402,23 @@ TidNext(TidScanState *node) static bool TidRecheck(TidScanState *node, TupleTableSlot *slot) { + ItemPointer match; + + /* WHERE CURRENT OF always intends to resolve to the latest tuple */ + if (node->tss_isCurrentOf) + return true; + + if (node->tss_TidList == NULL) + TidListEval(node); + /* - * XXX shouldn't we check here to make sure tuple matches TID list? In - * runtime-key case this is not certain, is it? However, in the WHERE - * CURRENT OF case it might not match anyway ... + * Binary search the TidList to see if this ctid is mentioned and return + * true if it is. */ - return true; + match = (ItemPointer) bsearch(&slot->tts_tid, node->tss_TidList, + node->tss_NumTids, sizeof(ItemPointerData), + itemptr_comparator); + return match != NULL; } diff --git a/src/backend/executor/nodeWindowAgg.c b/src/backend/executor/nodeWindowAgg.c index 9a1acce2b5d36..d92d632e248d2 100644 --- a/src/backend/executor/nodeWindowAgg.c +++ b/src/backend/executor/nodeWindowAgg.c @@ -69,6 +69,16 @@ typedef struct WindowObjectData int readptr; /* tuplestore read pointer for this fn */ int64 markpos; /* row that markptr is positioned on */ int64 seekpos; /* row that readptr is positioned on */ + uint8 **notnull_info; /* not-null info for each function argument */ + int64 *num_notnull_info; /* track size (number of tuples in + * partition) of the notnull_info array + * for each function argument */ + + /* + * Null treatment options.
One of: NO_NULLTREATMENT, PARSER_IGNORE_NULLS, + * PARSER_RESPECT_NULLS or IGNORE_NULLS. + */ + int ignore_nulls; } WindowObjectData; /* @@ -96,9 +106,10 @@ typedef struct WindowStatePerFuncData bool plain_agg; /* is it just a plain aggregate function? */ int aggno; /* if so, index of its WindowStatePerAggData */ + uint8 ignore_nulls; /* null treatment option, as in WindowObjectData */ WindowObject winobj; /* object used in window function API */ -} WindowStatePerFuncData; +} WindowStatePerFuncData; /* * For plain aggregate window functions, we also have one of these. @@ -182,8 +193,8 @@ static void begin_partition(WindowAggState *winstate); static void spool_tuples(WindowAggState *winstate, int64 pos); static void release_partition(WindowAggState *winstate); -static int row_is_in_frame(WindowAggState *winstate, int64 pos, - TupleTableSlot *slot); +static int row_is_in_frame(WindowObject winobj, int64 pos, + TupleTableSlot *slot, bool fetch_tuple); static void update_frameheadpos(WindowAggState *winstate); static void update_frametailpos(WindowAggState *winstate); static void update_grouptailpos(WindowAggState *winstate); @@ -198,6 +209,38 @@ static bool are_peers(WindowAggState *winstate, TupleTableSlot *slot1, static bool window_gettupleslot(WindowObject winobj, int64 pos, TupleTableSlot *slot); +static Datum ignorenulls_getfuncarginframe(WindowObject winobj, int argno, + int relpos, int seektype, + bool set_mark, bool *isnull, + bool *isout); +static Datum gettuple_eval_partition(WindowObject winobj, int argno, + int64 abs_pos, bool *isnull, + bool *isout); +static void init_notnull_info(WindowObject winobj, + WindowStatePerFunc perfuncstate); +static void grow_notnull_info(WindowObject winobj, + int64 pos, int argno); +static uint8 get_notnull_info(WindowObject winobj, + int64 pos, int argno); +static void put_notnull_info(WindowObject winobj, + int64 pos, int argno, bool isnull); + +/* + * The not-null-info bit array consists of 2-bit items + */ +#define NN_UNKNOWN 0x00 /* value not calculated yet */ +#define NN_NULL 0x01 /* NULL */ +#define NN_NOTNULL 0x02 /* NOT NULL */ +#define NN_MASK 0x03 /* mask for one 2-bit item */ +#define NN_BITS_PER_MEMBER 2 /* number of bits per item */ +/* number of items per variable */ +#define NN_ITEM_PER_VAR (BITS_PER_BYTE / NN_BITS_PER_MEMBER) +/* convert map position to byte offset */ +#define NN_POS_TO_BYTES(pos) ((pos) / NN_ITEM_PER_VAR) +/* convert byte offset to map position */ +#define NN_BYTES_TO_POS(bytes) ((bytes) * NN_ITEM_PER_VAR) +/* calculate shift bits */ +#define NN_SHIFT(pos) (((pos) % NN_ITEM_PER_VAR) * NN_BITS_PER_MEMBER) /* * initialize_windowaggregate @@ -942,7 +985,8 @@ eval_windowaggregates(WindowAggState *winstate) * Exit loop if no more rows can be in frame. Skip aggregation if * current row is not in frame but there might be more in the frame.
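+ * (row_is_in_frame reports these two cases as -1 and 0 respectively; a + * positive return value means the row is in frame and gets aggregated.)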
*/ - ret = row_is_in_frame(winstate, winstate->aggregatedupto, agg_row_slot); + ret = row_is_in_frame(agg_winobj, winstate->aggregatedupto, + agg_row_slot, false); if (ret < 0) break; if (ret == 0) @@ -1263,6 +1307,22 @@ begin_partition(WindowAggState *winstate) winobj->markpos = -1; winobj->seekpos = -1; + + /* reset null map */ + if (winobj->ignore_nulls == IGNORE_NULLS || + winobj->ignore_nulls == PARSER_IGNORE_NULLS) + { + int numargs = perfuncstate->numArguments; + + for (int j = 0; j < numargs; j++) + { + int n = winobj->num_notnull_info[j]; + + if (n > 0) + memset(winobj->notnull_info[j], 0, + NN_POS_TO_BYTES(n)); + } + } } } @@ -1412,8 +1472,8 @@ release_partition(WindowAggState *winstate) * to our window framing rule * * The caller must have already determined that the row is in the partition - * and fetched it into a slot. This function just encapsulates the framing - * rules. + * and fetched it into a slot if fetch_tuple is false. + * This function just encapsulates the framing rules. * * Returns: * -1, if the row is out of frame and no succeeding rows can be in frame @@ -1423,8 +1483,10 @@ release_partition(WindowAggState *winstate) * May clobber winstate->temp_slot_2. */ static int -row_is_in_frame(WindowAggState *winstate, int64 pos, TupleTableSlot *slot) +row_is_in_frame(WindowObject winobj, int64 pos, TupleTableSlot *slot, + bool fetch_tuple) { + WindowAggState *winstate = winobj->winstate; int frameOptions = winstate->frameOptions; Assert(pos >= 0); /* else caller error */ @@ -1453,9 +1515,14 @@ row_is_in_frame(WindowAggState *winstate, int64 pos, TupleTableSlot *slot) else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) { /* following row that is not peer is out of frame */ - if (pos > winstate->currentpos && - !are_peers(winstate, slot, winstate->ss.ss_ScanTupleSlot)) - return -1; + if (pos > winstate->currentpos) + { + if (fetch_tuple) /* need to fetch tuple? */ + if (!window_gettupleslot(winobj, pos, slot)) + return -1; + if (!are_peers(winstate, slot, winstate->ss.ss_ScanTupleSlot)) + return -1; + } } else Assert(false); @@ -2594,14 +2661,14 @@ ExecInitWindowAgg(WindowAgg *node, EState *estate, int eflags) numfuncs = winstate->numfuncs; numaggs = winstate->numaggs; econtext = winstate->ss.ps.ps_ExprContext; - econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numfuncs); - econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numfuncs); + econtext->ecxt_aggvalues = palloc0_array(Datum, numfuncs); + econtext->ecxt_aggnulls = palloc0_array(bool, numfuncs); /* * allocate per-wfunc/per-agg state information. 
*/ - perfunc = (WindowStatePerFunc) palloc0(sizeof(WindowStatePerFuncData) * numfuncs); - peragg = (WindowStatePerAgg) palloc0(sizeof(WindowStatePerAggData) * numaggs); + perfunc = palloc0_array(WindowStatePerFuncData, numfuncs); + peragg = palloc0_array(WindowStatePerAggData, numaggs); winstate->perfunc = perfunc; winstate->peragg = peragg; @@ -2619,14 +2686,17 @@ ExecInitWindowAgg(WindowAgg *node, EState *estate, int eflags) elog(ERROR, "WindowFunc with winref %u assigned to WindowAgg with winref %u", wfunc->winref, node->winref); - /* Look for a previous duplicate window function */ + /* + * Look for a previous duplicate window function, which needs the same + * ignore_nulls value + */ for (i = 0; i <= wfuncno; i++) { if (equal(wfunc, perfunc[i].wfunc) && !contain_volatile_functions((Node *) wfunc)) break; } - if (i <= wfuncno) + if (i <= wfuncno && wfunc->ignore_nulls == perfunc[i].ignore_nulls) { /* Found a match to an existing entry, so just mark it */ wfuncstate->wfuncno = i; @@ -2679,6 +2749,8 @@ ExecInitWindowAgg(WindowAgg *node, EState *estate, int eflags) winobj->argstates = wfuncstate->args; winobj->localmem = NULL; perfuncstate->winobj = winobj; + winobj->ignore_nulls = wfunc->ignore_nulls; + init_notnull_info(winobj, perfuncstate); /* It's a real window function, so set up to call it. */ fmgr_info_cxt(wfunc->winfnoid, &perfuncstate->flinfo, @@ -3214,12 +3286,315 @@ window_gettupleslot(WindowObject winobj, int64 pos, TupleTableSlot *slot) return true; } +/* gettuple_eval_partition + * get tuple in a partition and evaluate the window function's argument + * expression on it. + */ +static Datum +gettuple_eval_partition(WindowObject winobj, int argno, + int64 abs_pos, bool *isnull, bool *isout) +{ + WindowAggState *winstate; + ExprContext *econtext; + TupleTableSlot *slot; + + winstate = winobj->winstate; + slot = winstate->temp_slot_1; + if (!window_gettupleslot(winobj, abs_pos, slot)) + { + /* out of partition */ + if (isout) + *isout = true; + *isnull = true; + return (Datum) 0; + } + + if (isout) + *isout = false; + econtext = winstate->ss.ps.ps_ExprContext; + econtext->ecxt_outertuple = slot; + return ExecEvalExpr((ExprState *) list_nth + (winobj->argstates, argno), + econtext, isnull); +} + +/* + * ignorenulls_getfuncarginframe + * For IGNORE NULLS, get the next nonnull value in the frame, moving forward + * or backward until we find a value or reach the frame's end. 
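The scan this function performs can be modeled on a plain array. The sketch below is a simplified standalone analogue (not patch code) of the WINDOW_SEEK_HEAD direction: walk forward from the frame head and return the index of the (relpos + 1)-th non-null value, or -1 once the frame is exhausted:

    #include <stdbool.h>

    /*
     * Simplified analogue of the WINDOW_SEEK_HEAD scan: count non-null
     * values from the frame head and return the index of the
     * (relpos + 1)-th one, or -1 if the frame ends first.
     */
    static int
    nth_notnull_from_head(const bool *isnull, int frame_len, int relpos)
    {
        int seen = 0;

        for (int pos = 0; pos < frame_len; pos++)
        {
            if (isnull[pos])
                continue;           /* skipped, exactly as IGNORE NULLS demands */
            if (seen == relpos)
                return pos;         /* found the requested value */
            seen++;
        }
        return -1;                  /* out of frame */
    }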
+ */ +static Datum +ignorenulls_getfuncarginframe(WindowObject winobj, int argno, + int relpos, int seektype, bool set_mark, + bool *isnull, bool *isout) +{ + WindowAggState *winstate; + ExprContext *econtext; + TupleTableSlot *slot; + Datum datum; + int64 abs_pos; + int64 mark_pos; + int notnull_offset; + int notnull_relpos; + int forward; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + econtext = winstate->ss.ps.ps_ExprContext; + slot = winstate->temp_slot_1; + datum = (Datum) 0; + notnull_offset = 0; + notnull_relpos = abs(relpos); + + switch (seektype) + { + case WINDOW_SEEK_CURRENT: + elog(ERROR, "WINDOW_SEEK_CURRENT is not supported for WinGetFuncArgInFrame"); + abs_pos = mark_pos = 0; /* keep compiler quiet */ + break; + case WINDOW_SEEK_HEAD: + /* rejecting relpos < 0 is easy and simplifies code below */ + if (relpos < 0) + goto out_of_frame; + update_frameheadpos(winstate); + abs_pos = winstate->frameheadpos; + mark_pos = winstate->frameheadpos; + forward = 1; + break; + case WINDOW_SEEK_TAIL: + /* rejecting relpos > 0 is easy and simplifies code below */ + if (relpos > 0) + goto out_of_frame; + update_frametailpos(winstate); + abs_pos = winstate->frametailpos - 1; + mark_pos = 0; /* keep compiler quiet */ + forward = -1; + break; + default: + elog(ERROR, "unrecognized window seek type: %d", seektype); + abs_pos = mark_pos = 0; /* keep compiler quiet */ + break; + } + + /* + * Get the next nonnull value in the frame, moving forward or backward + * until we find a value or reach the frame's end. + */ + do + { + int inframe; + int v; + + /* + * Check the apparent out-of-frame case. We need to do this because we + * may not call window_gettupleslot before row_is_in_frame, which + * assumes abs_pos is never negative. + */ + if (abs_pos < 0) + goto out_of_frame; + + /* check whether row is in frame */ + inframe = row_is_in_frame(winobj, abs_pos, slot, true); + if (inframe == -1) + goto out_of_frame; + else if (inframe == 0) + goto advance; + + if (isout) + *isout = false; + + v = get_notnull_info(winobj, abs_pos, argno); + if (v == NN_NULL) /* this row is known to be NULL */ + goto advance; + + else if (v == NN_UNKNOWN) /* need to check NULL or not */ + { + if (!window_gettupleslot(winobj, abs_pos, slot)) + goto out_of_frame; + + econtext->ecxt_outertuple = slot; + datum = ExecEvalExpr( + (ExprState *) list_nth(winobj->argstates, + argno), econtext, + isnull); + if (!*isnull) + notnull_offset++; + + /* record the row status */ + put_notnull_info(winobj, abs_pos, argno, *isnull); + } + else /* this row is known to be NOT NULL */ + { + notnull_offset++; + if (notnull_offset > notnull_relpos) + { + /* datum must be set before we can exit this loop */ + if (!window_gettupleslot(winobj, abs_pos, slot)) + goto out_of_frame; + + econtext->ecxt_outertuple = slot; + datum = ExecEvalExpr( + (ExprState *) list_nth + (winobj->argstates, argno), + econtext, isnull); + } + } +advance: + abs_pos += forward; + } while (notnull_offset <= notnull_relpos); + + if (set_mark) + WinSetMarkPosition(winobj, mark_pos); + + return datum; + +out_of_frame: + if (isout) + *isout = true; + *isnull = true; + return (Datum) 0; +} + + +/* + * init_notnull_info + * Initialize the not null map.
+ */ +static void +init_notnull_info(WindowObject winobj, WindowStatePerFunc perfuncstate) +{ + int numargs = perfuncstate->numArguments; + + if (winobj->ignore_nulls == PARSER_IGNORE_NULLS) + { + winobj->notnull_info = palloc0_array(uint8 *, numargs); + winobj->num_notnull_info = palloc0_array(int64, numargs); + } +} + +/* + * grow_notnull_info + * expand notnull_info if necessary. + * pos: not null info position + * argno: argument number + */ +static void +grow_notnull_info(WindowObject winobj, int64 pos, int argno) +{ +/* initial number of notnull info members */ +#define INIT_NOT_NULL_INFO_NUM 128 + + if (pos >= winobj->num_notnull_info[argno]) + { + /* We may be called in a short-lived context */ + MemoryContext oldcontext = MemoryContextSwitchTo + (winobj->winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + for (;;) + { + Size oldsize = NN_POS_TO_BYTES + (winobj->num_notnull_info[argno]); + Size newsize; + + if (oldsize == 0) /* memory has not been allocated yet for this + * arg */ + { + newsize = NN_POS_TO_BYTES(INIT_NOT_NULL_INFO_NUM); + winobj->notnull_info[argno] = palloc0(newsize); + } + else + { + newsize = oldsize * 2; + winobj->notnull_info[argno] = + repalloc0(winobj->notnull_info[argno], oldsize, newsize); + } + winobj->num_notnull_info[argno] = NN_BYTES_TO_POS(newsize); + if (winobj->num_notnull_info[argno] > pos) + break; + } + MemoryContextSwitchTo(oldcontext); + } +} + +/* + * get_notnull_info + * retrieve a not null map entry + * pos: map position + * argno: argument number + */ +static uint8 +get_notnull_info(WindowObject winobj, int64 pos, int argno) +{ + uint8 *mbp; + uint8 mb; + int64 bpos; + + grow_notnull_info(winobj, pos, argno); + bpos = NN_POS_TO_BYTES(pos); + mbp = winobj->notnull_info[argno]; + mb = mbp[bpos]; + return (mb >> (NN_SHIFT(pos))) & NN_MASK; +} + +/* + * put_notnull_info + * update a not null map entry + * pos: map position + * argno: argument number + * isnull: whether the value is NULL + */ +static void +put_notnull_info(WindowObject winobj, int64 pos, int argno, bool isnull) +{ + uint8 *mbp; + uint8 mb; + int64 bpos; + uint8 val = isnull ? NN_NULL : NN_NOTNULL; + int shift; + + grow_notnull_info(winobj, pos, argno); + bpos = NN_POS_TO_BYTES(pos); + mbp = winobj->notnull_info[argno]; + mb = mbp[bpos]; + shift = NN_SHIFT(pos); + mb &= ~(NN_MASK << shift); /* clear the old entry */ + mb |= (val << shift); /* store the new entry */ + mbp[bpos] = mb; +} /*********************************************************************** * API exposed to window functions ***********************************************************************/ +/* + * WinCheckAndInitializeNullTreatment + * Check the null treatment clause and set ignore_nulls + * + * Window functions should call this to check if they are being called with + * a null treatment clause when they don't allow it, or to set ignore_nulls.
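For illustration, a window function that accepts a null treatment clause might use this entry point as sketched below. The function name my_first_value is invented (it is not part of the patch), and real built-ins do more argument handling; fmgr.h and windowapi.h context is assumed:

    /* Hypothetical window function accepting IGNORE NULLS (sketch only). */
    Datum
    my_first_value(PG_FUNCTION_ARGS)
    {
        WindowObject winobj = PG_WINDOW_OBJECT();
        Datum       result;
        bool        isnull;

        /* errors out if a null treatment clause were disallowed here;
         * otherwise latches PARSER_IGNORE_NULLS into IGNORE_NULLS */
        WinCheckAndInitializeNullTreatment(winobj, true, fcinfo);

        result = WinGetFuncArgInFrame(winobj, 0, 0, WINDOW_SEEK_HEAD, true,
                                      &isnull, NULL);
        if (isnull)
            PG_RETURN_NULL();
        PG_RETURN_DATUM(result);
    }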
+ */ +void +WinCheckAndInitializeNullTreatment(WindowObject winobj, + bool allowNullTreatment, + FunctionCallInfo fcinfo) +{ + Assert(WindowObjectIsValid(winobj)); + if (winobj->ignore_nulls != NO_NULLTREATMENT && !allowNullTreatment) + { + const char *funcname = get_func_name(fcinfo->flinfo->fn_oid); + + if (!funcname) + elog(ERROR, "could not get function name"); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function %s does not allow RESPECT/IGNORE NULLS", + funcname))); + } + else if (winobj->ignore_nulls == PARSER_IGNORE_NULLS) + winobj->ignore_nulls = IGNORE_NULLS; +} + /* * WinGetPartitionLocalMemory * Get working memory that lives till end of partition processing @@ -3378,23 +3753,33 @@ WinGetFuncArgInPartition(WindowObject winobj, int argno, bool *isnull, bool *isout) { WindowAggState *winstate; - ExprContext *econtext; - TupleTableSlot *slot; - bool gottuple; int64 abs_pos; + int64 mark_pos; + Datum datum; + bool null_treatment; + int notnull_offset; + int notnull_relpos; + int forward; + bool myisout; Assert(WindowObjectIsValid(winobj)); winstate = winobj->winstate; - econtext = winstate->ss.ps.ps_ExprContext; - slot = winstate->temp_slot_1; + + null_treatment = (winobj->ignore_nulls == IGNORE_NULLS && relpos != 0); switch (seektype) { case WINDOW_SEEK_CURRENT: - abs_pos = winstate->currentpos + relpos; + if (null_treatment) + abs_pos = winstate->currentpos; + else + abs_pos = winstate->currentpos + relpos; break; case WINDOW_SEEK_HEAD: - abs_pos = relpos; + if (null_treatment) + abs_pos = 0; + else + abs_pos = relpos; break; case WINDOW_SEEK_TAIL: spool_tuples(winstate, -1); @@ -3406,25 +3791,94 @@ WinGetFuncArgInPartition(WindowObject winobj, int argno, break; } - gottuple = window_gettupleslot(winobj, abs_pos, slot); - - if (!gottuple) + /* Easy case if IGNORE NULLS is not specified */ + if (!null_treatment) { + /* get tuple and evaluate in partition */ + datum = gettuple_eval_partition(winobj, argno, + abs_pos, isnull, &myisout); + if (!myisout && set_mark) + WinSetMarkPosition(winobj, abs_pos); if (isout) - *isout = true; - *isnull = true; - return (Datum) 0; + *isout = myisout; + return datum; } + + /* Prepare for loop */ + notnull_offset = 0; + notnull_relpos = abs(relpos); + forward = relpos > 0 ? 1 : -1; + myisout = false; + datum = (Datum) 0; + + /* + * In the IGNORE NULLS + WINDOW_SEEK_CURRENT + relpos > 0 case, we may + * fetch beyond the current row + relpos to find the target row. If we + * marked at abs_pos, the next call to WinGetFuncArgInPartition or + * WinGetFuncArgInFrame (when a window function has multiple arguments) + * could fail with "cannot fetch row before WindowObject's mark + * position". So keep the mark position at currentpos. + */ + if (seektype == WINDOW_SEEK_CURRENT && relpos > 0) + mark_pos = winstate->currentpos; else { - if (isout) - *isout = false; - if (set_mark) - WinSetMarkPosition(winobj, abs_pos); - econtext->ecxt_outertuple = slot; - return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno), - econtext, isnull); + /* + * In other cases we have no idea which row position callers will + * fetch next. Also, in the relpos < 0 case (we move backward), we + * cannot set the mark there either. For those cases we always set + * the mark at 0. + */ + mark_pos = 0; } + + /* + * Get the next nonnull value in the partition, moving forward or backward + * until we find a value or reach the partition's end. We cache the + * nullness status because we may repeat this process many times.
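The semantics being implemented here can be pinned down with a tiny model. Assuming the SQL-standard behavior of, say, lag(x) IGNORE NULLS over the per-partition values 1, NULL, NULL, 4 (yielding NULL, 1, 1, 1), a simplified backward scan looks like this, with -1 standing in for SQL NULL:

    #include <stdbool.h>

    /* Simplified model of lag(x) IGNORE NULLS: step backward from the
     * current row past NULLs to the first non-null value (relpos = 1). */
    static int
    lag_ignore_nulls(const int *vals, const bool *isnull, int row)
    {
        for (int pos = row - 1; pos >= 0; pos--)
        {
            if (!isnull[pos])
                return vals[pos];   /* previous non-null value */
        }
        return -1;                  /* out of partition: result is NULL */
    }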
+ */ + do + { + int nn_info; /* NOT NULL status */ + + abs_pos += forward; + if (abs_pos < 0) /* clearly out of partition */ + break; + + /* check NOT NULL cached info */ + nn_info = get_notnull_info(winobj, abs_pos, argno); + if (nn_info == NN_NOTNULL) /* this row is known to be NOT NULL */ + notnull_offset++; + else if (nn_info == NN_NULL) /* this row is known to be NULL */ + continue; /* keep on moving forward or backward */ + else /* need to check NULL or not */ + { + /* + * NOT NULL info does not exist yet. Get tuple and evaluate func + * arg in partition. We ignore the return value from + * gettuple_eval_partition because we are just interested in + * whether we are inside or outside of partition, NULL or NOT + * NULL. + */ + (void) gettuple_eval_partition(winobj, argno, + abs_pos, isnull, &myisout); + if (myisout) /* out of partition? */ + break; + if (!*isnull) + notnull_offset++; + /* record the row status */ + put_notnull_info(winobj, abs_pos, argno, *isnull); + } + } while (notnull_offset < notnull_relpos); + + /* get tuple and evaluate func arg in partition */ + datum = gettuple_eval_partition(winobj, argno, + abs_pos, isnull, &myisout); + if (!myisout && set_mark) + WinSetMarkPosition(winobj, mark_pos); + if (isout) + *isout = myisout; + + return datum; } /* @@ -3476,6 +3930,10 @@ WinGetFuncArgInFrame(WindowObject winobj, int argno, econtext = winstate->ss.ps.ps_ExprContext; slot = winstate->temp_slot_1; + if (winobj->ignore_nulls == IGNORE_NULLS) + return ignorenulls_getfuncarginframe(winobj, argno, relpos, seektype, + set_mark, isnull, isout); + switch (seektype) { case WINDOW_SEEK_CURRENT: @@ -3624,7 +4082,7 @@ WinGetFuncArgInFrame(WindowObject winobj, int argno, goto out_of_frame; /* The code above does not detect all out-of-frame cases, so check */ - if (row_is_in_frame(winstate, abs_pos, slot) <= 0) + if (row_is_in_frame(winobj, abs_pos, slot, false) <= 0) goto out_of_frame; if (isout) diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index ecb2e4ccaa1ca..34a63a977b6e4 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -68,7 +68,7 @@ static int _SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, bool fire_triggers); static ParamListInfo _SPI_convert_params(int nargs, Oid *argtypes, - Datum *Values, const char *Nulls); + const Datum *Values, const char *Nulls); static int _SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount); @@ -669,7 +669,7 @@ SPI_execute_extended(const char *src, /* Execute a previously prepared plan */ int -SPI_execute_plan(SPIPlanPtr plan, Datum *Values, const char *Nulls, +SPI_execute_plan(SPIPlanPtr plan, const Datum *Values, const char *Nulls, bool read_only, long tcount) { SPIExecuteOptions options; @@ -771,7 +771,7 @@ SPI_execute_plan_with_paramlist(SPIPlanPtr plan, ParamListInfo params, */ int SPI_execute_snapshot(SPIPlanPtr plan, - Datum *Values, const char *Nulls, + const Datum *Values, const char *Nulls, Snapshot snapshot, Snapshot crosscheck_snapshot, bool read_only, bool fire_triggers, long tcount) { @@ -811,7 +811,7 @@ SPI_execute_snapshot(SPIPlanPtr plan, int SPI_execute_with_args(const char *src, int nargs, Oid *argtypes, - Datum *Values, const char *Nulls, + const Datum *Values, const char *Nulls, bool read_only, long tcount) { int res; @@ -1130,8 +1130,8 @@ SPI_modifytuple(Relation rel, HeapTuple tuple, int natts, int *attnum, SPI_result = 0; numberOfAttributes = rel->rd_att->natts; - v = (Datum *) palloc(numberOfAttributes * sizeof(Datum)); - n = (bool *) 
palloc(numberOfAttributes * sizeof(bool)); + v = palloc_array(Datum, numberOfAttributes); + n = palloc_array(bool, numberOfAttributes); /* fetch old values and nulls */ heap_deform_tuple(tuple, rel->rd_att, v, n); @@ -1258,7 +1258,7 @@ SPI_getbinval(HeapTuple tuple, TupleDesc tupdesc, int fnumber, bool *isnull) { SPI_result = SPI_ERROR_NOATTRIBUTE; *isnull = true; - return (Datum) NULL; + return (Datum) 0; } return heap_getattr(tuple, fnumber, tupdesc, isnull); @@ -1443,7 +1443,7 @@ SPI_freetuptable(SPITupleTable *tuptable) */ Portal SPI_cursor_open(const char *name, SPIPlanPtr plan, - Datum *Values, const char *Nulls, + const Datum *Values, const char *Nulls, bool read_only) { Portal portal; @@ -2141,8 +2141,7 @@ spi_dest_startup(DestReceiver *self, int operation, TupleDesc typeinfo) ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(tuptabcxt); - _SPI_current->tuptable = tuptable = (SPITupleTable *) - palloc0(sizeof(SPITupleTable)); + _SPI_current->tuptable = tuptable = palloc0_object(SPITupleTable); tuptable->tuptabcxt = tuptabcxt; tuptable->subid = GetCurrentSubTransactionId(); @@ -2155,7 +2154,7 @@ spi_dest_startup(DestReceiver *self, int operation, TupleDesc typeinfo) /* set up initial allocations */ tuptable->alloced = 128; - tuptable->vals = (HeapTuple *) palloc(tuptable->alloced * sizeof(HeapTuple)); + tuptable->vals = palloc_array(HeapTuple, tuptable->alloced); tuptable->numvals = 0; tuptable->tupdesc = CreateTupleDescCopy(typeinfo); @@ -2847,7 +2846,7 @@ _SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, */ static ParamListInfo _SPI_convert_params(int nargs, Oid *argtypes, - Datum *Values, const char *Nulls) + const Datum *Values, const char *Nulls) { ParamListInfo paramLI; @@ -3162,7 +3161,7 @@ _SPI_make_plan_non_temp(SPIPlanPtr plan) oldcxt = MemoryContextSwitchTo(plancxt); /* Copy the _SPI_plan struct and subsidiary data into the new context */ - newplan = (SPIPlanPtr) palloc0(sizeof(_SPI_plan)); + newplan = palloc0_object(_SPI_plan); newplan->magic = _SPI_PLAN_MAGIC; newplan->plancxt = plancxt; newplan->parse_mode = plan->parse_mode; @@ -3170,7 +3169,7 @@ _SPI_make_plan_non_temp(SPIPlanPtr plan) newplan->nargs = plan->nargs; if (plan->nargs > 0) { - newplan->argtypes = (Oid *) palloc(plan->nargs * sizeof(Oid)); + newplan->argtypes = palloc_array(Oid, plan->nargs); memcpy(newplan->argtypes, plan->argtypes, plan->nargs * sizeof(Oid)); } else @@ -3227,7 +3226,7 @@ _SPI_save_plan(SPIPlanPtr plan) oldcxt = MemoryContextSwitchTo(plancxt); /* Copy the SPI plan into its own context */ - newplan = (SPIPlanPtr) palloc0(sizeof(_SPI_plan)); + newplan = palloc0_object(_SPI_plan); newplan->magic = _SPI_PLAN_MAGIC; newplan->plancxt = plancxt; newplan->parse_mode = plan->parse_mode; @@ -3235,7 +3234,7 @@ _SPI_save_plan(SPIPlanPtr plan) newplan->nargs = plan->nargs; if (plan->nargs > 0) { - newplan->argtypes = (Oid *) palloc(plan->nargs * sizeof(Oid)); + newplan->argtypes = palloc_array(Oid, plan->nargs); memcpy(newplan->argtypes, plan->argtypes, plan->nargs * sizeof(Oid)); } else @@ -3369,7 +3368,7 @@ SPI_register_trigger_data(TriggerData *tdata) if (tdata->tg_newtable) { EphemeralNamedRelation enr = - palloc(sizeof(EphemeralNamedRelationData)); + palloc_object(EphemeralNamedRelationData); int rc; enr->md.name = tdata->tg_trigger->tgnewtable; @@ -3386,7 +3385,7 @@ SPI_register_trigger_data(TriggerData *tdata) if (tdata->tg_oldtable) { EphemeralNamedRelation enr = - palloc(sizeof(EphemeralNamedRelationData)); + palloc_object(EphemeralNamedRelationData); int rc; enr->md.name = 
tdata->tg_trigger->tgoldtable; diff --git a/src/backend/executor/tqueue.c b/src/backend/executor/tqueue.c index 6c5e1f1262d82..d9bf59f672d57 100644 --- a/src/backend/executor/tqueue.c +++ b/src/backend/executor/tqueue.c @@ -120,7 +120,7 @@ CreateTupleQueueDestReceiver(shm_mq_handle *handle) { TQueueDestReceiver *self; - self = (TQueueDestReceiver *) palloc0(sizeof(TQueueDestReceiver)); + self = palloc0_object(TQueueDestReceiver); self->pub.receiveSlot = tqueueReceiveSlot; self->pub.rStartup = tqueueStartupReceiver; @@ -138,7 +138,7 @@ CreateTupleQueueDestReceiver(shm_mq_handle *handle) TupleQueueReader * CreateTupleQueueReader(shm_mq_handle *handle) { - TupleQueueReader *reader = palloc0(sizeof(TupleQueueReader)); + TupleQueueReader *reader = palloc0_object(TupleQueueReader); reader->queue = handle; diff --git a/src/backend/executor/tstoreReceiver.c b/src/backend/executor/tstoreReceiver.c index 562de67645771..2012618e230fe 100644 --- a/src/backend/executor/tstoreReceiver.c +++ b/src/backend/executor/tstoreReceiver.c @@ -25,6 +25,7 @@ #include "access/detoast.h" #include "access/tupconvert.h" #include "executor/tstoreReceiver.h" +#include "varatt.h" typedef struct @@ -237,7 +238,7 @@ tstoreDestroyReceiver(DestReceiver *self) DestReceiver * CreateTuplestoreDestReceiver(void) { - TStoreState *self = (TStoreState *) palloc0(sizeof(TStoreState)); + TStoreState *self = palloc0_object(TStoreState); self->pub.receiveSlot = tstoreReceiveSlot_notoast; /* might change */ self->pub.rStartup = tstoreStartupReceiver; diff --git a/src/backend/foreign/foreign.c b/src/backend/foreign/foreign.c index a57e59f27ea64..fa3f4c7524751 100644 --- a/src/backend/foreign/foreign.c +++ b/src/backend/foreign/foreign.c @@ -66,7 +66,7 @@ GetForeignDataWrapperExtended(Oid fdwid, bits16 flags) fdwform = (Form_pg_foreign_data_wrapper) GETSTRUCT(tp); - fdw = (ForeignDataWrapper *) palloc(sizeof(ForeignDataWrapper)); + fdw = palloc_object(ForeignDataWrapper); fdw->fdwid = fdwid; fdw->owner = fdwform->fdwowner; fdw->fdwname = pstrdup(NameStr(fdwform->fdwname)); @@ -140,7 +140,7 @@ GetForeignServerExtended(Oid serverid, bits16 flags) serverform = (Form_pg_foreign_server) GETSTRUCT(tp); - server = (ForeignServer *) palloc(sizeof(ForeignServer)); + server = palloc_object(ForeignServer); server->serverid = serverid; server->servername = pstrdup(NameStr(serverform->srvname)); server->owner = serverform->srvowner; @@ -227,7 +227,7 @@ GetUserMapping(Oid userid, Oid serverid) MappingUserName(userid), server->servername))); } - um = (UserMapping *) palloc(sizeof(UserMapping)); + um = palloc_object(UserMapping); um->umid = ((Form_pg_user_mapping) GETSTRUCT(tp))->oid; um->userid = userid; um->serverid = serverid; @@ -265,7 +265,7 @@ GetForeignTable(Oid relid) elog(ERROR, "cache lookup failed for foreign table %u", relid); tableform = (Form_pg_foreign_table) GETSTRUCT(tp); - ft = (ForeignTable *) palloc(sizeof(ForeignTable)); + ft = palloc_object(ForeignTable); ft->relid = relid; ft->serverid = tableform->ftserver; @@ -463,7 +463,7 @@ GetFdwRoutineForRelation(Relation relation, bool makecopy) /* We have valid cached data --- does the caller want a copy? 
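The mechanical conversions in this and the surrounding files lean on the type-safe allocation wrappers from utils/palloc.h, which expand approximately as follows (paraphrased from the tree, not part of this patch):

    /* The wrappers supply the cast and sizeof() so call sites cannot get
     * the type or element size wrong. */
    #define palloc_object(type)        ((type *) palloc(sizeof(type)))
    #define palloc0_object(type)       ((type *) palloc0(sizeof(type)))
    #define palloc_array(type, count)  ((type *) palloc(sizeof(type) * (count)))
    #define palloc0_array(type, count) ((type *) palloc0(sizeof(type) * (count)))

    /* So each hunk above is a pure rewrite, e.g.: */
    fdw = (ForeignDataWrapper *) palloc(sizeof(ForeignDataWrapper));  /* before */
    fdw = palloc_object(ForeignDataWrapper);                          /* after  */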
*/ if (makecopy) { - fdwroutine = (FdwRoutine *) palloc(sizeof(FdwRoutine)); + fdwroutine = palloc_object(FdwRoutine); memcpy(fdwroutine, relation->rd_fdwroutine, sizeof(FdwRoutine)); return fdwroutine; } diff --git a/src/backend/jit/llvm/Makefile b/src/backend/jit/llvm/Makefile index e8c12060b93df..68677ba42e189 100644 --- a/src/backend/jit/llvm/Makefile +++ b/src/backend/jit/llvm/Makefile @@ -31,7 +31,7 @@ endif # All files in this directory use LLVM. CFLAGS += $(LLVM_CFLAGS) CXXFLAGS += $(LLVM_CXXFLAGS) -override CPPFLAGS := $(LLVM_CPPFLAGS) $(CPPFLAGS) +override CPPFLAGS += $(LLVM_CPPFLAGS) SHLIB_LINK += $(LLVM_LIBS) # Because this module includes C++ files, we need to use a C++ diff --git a/src/backend/jit/llvm/llvmjit.c b/src/backend/jit/llvm/llvmjit.c index 46511624f0166..15c3475ede2f9 100644 --- a/src/backend/jit/llvm/llvmjit.c +++ b/src/backend/jit/llvm/llvmjit.c @@ -54,6 +54,7 @@ typedef struct LLVMJitHandle /* types & functions commonly needed for JITing */ LLVMTypeRef TypeSizeT; +LLVMTypeRef TypeDatum; LLVMTypeRef TypeParamBool; LLVMTypeRef TypeStorageBool; LLVMTypeRef TypePGFunction; @@ -499,7 +500,7 @@ llvm_copy_attributes_at_index(LLVMValueRef v_from, LLVMValueRef v_to, uint32 ind if (num_attributes == 0) return; - attrs = palloc(sizeof(LLVMAttributeRef) * num_attributes); + attrs = palloc_array(LLVMAttributeRef, num_attributes); LLVMGetAttributesAtIndex(v_from, index, attrs); for (int attno = 0; attno < num_attributes; attno++) @@ -1011,6 +1012,7 @@ llvm_create_types(void) LLVMDisposeMemoryBuffer(buf); TypeSizeT = llvm_pg_var_type("TypeSizeT"); + TypeDatum = llvm_pg_var_type("TypeDatum"); TypeParamBool = load_return_type(llvm_types_module, "FunctionReturningBool"); TypeStorageBool = llvm_pg_var_type("TypeStorageBool"); TypePGFunction = llvm_pg_var_type("TypePGFunction"); @@ -1121,9 +1123,9 @@ llvm_resolve_symbols(LLVMOrcDefinitionGeneratorRef GeneratorObj, void *Ctx, LLVMOrcCLookupSet LookupSet, size_t LookupSetSize) { #if LLVM_VERSION_MAJOR > 14 - LLVMOrcCSymbolMapPairs symbols = palloc0(sizeof(LLVMOrcCSymbolMapPair) * LookupSetSize); + LLVMOrcCSymbolMapPairs symbols = palloc0_array(LLVMOrcCSymbolMapPair, LookupSetSize); #else - LLVMOrcCSymbolMapPairs symbols = palloc0(sizeof(LLVMJITCSymbolMapPair) * LookupSetSize); + LLVMOrcCSymbolMapPairs symbols = palloc0_array(LLVMJITCSymbolMapPair, LookupSetSize); #endif LLVMErrorRef error; LLVMOrcMaterializationUnitRef mu; diff --git a/src/backend/jit/llvm/llvmjit_deform.c b/src/backend/jit/llvm/llvmjit_deform.c index c562edd094bb2..89167d15c3fb5 100644 --- a/src/backend/jit/llvm/llvmjit_deform.c +++ b/src/backend/jit/llvm/llvmjit_deform.c @@ -156,12 +156,12 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc, b = LLVMCreateBuilderInContext(lc); - attcheckattnoblocks = palloc(sizeof(LLVMBasicBlockRef) * natts); - attstartblocks = palloc(sizeof(LLVMBasicBlockRef) * natts); - attisnullblocks = palloc(sizeof(LLVMBasicBlockRef) * natts); - attcheckalignblocks = palloc(sizeof(LLVMBasicBlockRef) * natts); - attalignblocks = palloc(sizeof(LLVMBasicBlockRef) * natts); - attstoreblocks = palloc(sizeof(LLVMBasicBlockRef) * natts); + attcheckattnoblocks = palloc_array(LLVMBasicBlockRef, natts); + attstartblocks = palloc_array(LLVMBasicBlockRef, natts); + attisnullblocks = palloc_array(LLVMBasicBlockRef, natts); + attcheckalignblocks = palloc_array(LLVMBasicBlockRef, natts); + attalignblocks = palloc_array(LLVMBasicBlockRef, natts); + attstoreblocks = palloc_array(LLVMBasicBlockRef, natts); known_alignment = 0; @@ -479,8 +479,8 @@ 
slot_compile_deform(LLVMJitContext *context, TupleDesc desc, l_gep(b, LLVMInt8TypeInContext(lc), v_tts_nulls, &l_attno, 1, "")); /* store zero datum */ LLVMBuildStore(b, - l_sizet_const(0), - l_gep(b, TypeSizeT, v_tts_values, &l_attno, 1, "")); + l_datum_const(0), + l_gep(b, TypeDatum, v_tts_values, &l_attno, 1, "")); LLVMBuildBr(b, b_next); attguaranteedalign = false; @@ -644,7 +644,7 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc, } /* compute address to store value at */ - v_resultp = l_gep(b, TypeSizeT, v_tts_values, &l_attno, 1, ""); + v_resultp = l_gep(b, TypeDatum, v_tts_values, &l_attno, 1, ""); /* store null-byte (false) */ LLVMBuildStore(b, l_int8_const(lc, 0), @@ -663,7 +663,7 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc, v_tmp_loaddata = LLVMBuildPointerCast(b, v_attdatap, vartypep, ""); v_tmp_loaddata = l_load(b, vartype, v_tmp_loaddata, "attr_byval"); - v_tmp_loaddata = LLVMBuildZExt(b, v_tmp_loaddata, TypeSizeT, ""); + v_tmp_loaddata = LLVMBuildSExt(b, v_tmp_loaddata, TypeDatum, ""); LLVMBuildStore(b, v_tmp_loaddata, v_resultp); } @@ -675,7 +675,7 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc, v_tmp_loaddata = LLVMBuildPtrToInt(b, v_attdatap, - TypeSizeT, + TypeDatum, "attr_ptr"); LLVMBuildStore(b, v_tmp_loaddata, v_resultp); } diff --git a/src/backend/jit/llvm/llvmjit_expr.c b/src/backend/jit/llvm/llvmjit_expr.c index 890bcb0b0a79d..f9c7f29e7280f 100644 --- a/src/backend/jit/llvm/llvmjit_expr.c +++ b/src/backend/jit/llvm/llvmjit_expr.c @@ -297,7 +297,7 @@ llvm_compile_expr(ExprState *state) "v.econtext.aggnulls"); /* allocate blocks for each op upfront, so we can do jumps easily */ - opblocks = palloc(sizeof(LLVMBasicBlockRef) * state->steps_len); + opblocks = palloc_array(LLVMBasicBlockRef, state->steps_len); for (int opno = 0; opno < state->steps_len; opno++) opblocks[opno] = l_bb_append_v(eval_fn, "b.op.%d.start", opno); @@ -316,7 +316,7 @@ llvm_compile_expr(ExprState *state) op = &state->steps[opno]; opcode = ExecEvalStepOp(state, op); - v_resvaluep = l_ptr_const(op->resvalue, l_ptr(TypeSizeT)); + v_resvaluep = l_ptr_const(op->resvalue, l_ptr(TypeDatum)); v_resnullp = l_ptr_const(op->resnull, l_ptr(TypeStorageBool)); switch (opcode) @@ -326,7 +326,7 @@ llvm_compile_expr(ExprState *state) LLVMValueRef v_tmpisnull; LLVMValueRef v_tmpvalue; - v_tmpvalue = l_load(b, TypeSizeT, v_tmpvaluep, ""); + v_tmpvalue = l_load(b, TypeDatum, v_tmpvaluep, ""); v_tmpisnull = l_load(b, TypeStorageBool, v_tmpisnullp, ""); LLVMBuildStore(b, v_tmpisnull, v_isnullp); @@ -336,7 +336,7 @@ llvm_compile_expr(ExprState *state) } case EEOP_DONE_NO_RETURN: - LLVMBuildRet(b, l_sizet_const(0)); + LLVMBuildRet(b, l_datum_const(0)); break; case EEOP_INNER_FETCHSOME: @@ -478,7 +478,7 @@ llvm_compile_expr(ExprState *state) } v_attnum = l_int32_const(lc, op->d.var.attnum); - value = l_load_gep1(b, TypeSizeT, v_values, v_attnum, ""); + value = l_load_gep1(b, TypeDatum, v_values, v_attnum, ""); isnull = l_load_gep1(b, TypeStorageBool, v_nulls, v_attnum, ""); LLVMBuildStore(b, value, v_resvaluep); LLVMBuildStore(b, isnull, v_resnullp); @@ -562,13 +562,13 @@ llvm_compile_expr(ExprState *state) /* load data */ v_attnum = l_int32_const(lc, op->d.assign_var.attnum); - v_value = l_load_gep1(b, TypeSizeT, v_values, v_attnum, ""); + v_value = l_load_gep1(b, TypeDatum, v_values, v_attnum, ""); v_isnull = l_load_gep1(b, TypeStorageBool, v_nulls, v_attnum, ""); /* compute addresses of targets */ v_resultnum = l_int32_const(lc, op->d.assign_var.resultnum); v_rvaluep = 
l_gep(b, - TypeSizeT, + TypeDatum, v_resultvalues, &v_resultnum, 1, ""); v_risnullp = l_gep(b, @@ -595,13 +595,13 @@ llvm_compile_expr(ExprState *state) size_t resultnum = op->d.assign_tmp.resultnum; /* load data */ - v_value = l_load(b, TypeSizeT, v_tmpvaluep, ""); + v_value = l_load(b, TypeDatum, v_tmpvaluep, ""); v_isnull = l_load(b, TypeStorageBool, v_tmpisnullp, ""); /* compute addresses of targets */ v_resultnum = l_int32_const(lc, resultnum); v_rvaluep = - l_gep(b, TypeSizeT, v_resultvalues, &v_resultnum, 1, ""); + l_gep(b, TypeDatum, v_resultvalues, &v_resultnum, 1, ""); v_risnullp = l_gep(b, TypeStorageBool, v_resultnulls, &v_resultnum, 1, ""); @@ -650,7 +650,7 @@ llvm_compile_expr(ExprState *state) LLVMValueRef v_constvalue, v_constnull; - v_constvalue = l_sizet_const(op->d.constval.value); + v_constvalue = l_datum_const(op->d.constval.value); v_constnull = l_sbool_const(op->d.constval.isnull); LLVMBuildStore(b, v_constvalue, v_resvaluep); @@ -698,8 +698,8 @@ llvm_compile_expr(ExprState *state) LLVMBuildStore(b, l_sbool_const(1), v_resnullp); /* create blocks for checking args, one for each */ - b_checkargnulls = - palloc(sizeof(LLVMBasicBlockRef *) * op->d.func.nargs); + b_checkargnulls = (LLVMBasicBlockRef *) + palloc(sizeof(LLVMBasicBlockRef) * op->d.func.nargs); for (int argno = 0; argno < op->d.func.nargs; argno++) b_checkargnulls[argno] = l_bb_before_v(b_nonull, "b.%d.isnull.%d", opno, @@ -798,7 +798,7 @@ llvm_compile_expr(ExprState *state) LLVMBuildStore(b, l_sbool_const(0), v_boolanynullp); v_boolnull = l_load(b, TypeStorageBool, v_resnullp, ""); - v_boolvalue = l_load(b, TypeSizeT, v_resvaluep, ""); + v_boolvalue = l_load(b, TypeDatum, v_resvaluep, ""); /* check if current input is NULL */ LLVMBuildCondBr(b, @@ -818,7 +818,7 @@ llvm_compile_expr(ExprState *state) LLVMPositionBuilderAtEnd(b, b_boolcheckfalse); LLVMBuildCondBr(b, LLVMBuildICmp(b, LLVMIntEQ, v_boolvalue, - l_sizet_const(0), ""), + l_datum_const(0), ""), b_boolisfalse, b_boolcont); @@ -846,7 +846,7 @@ llvm_compile_expr(ExprState *state) /* set resnull to true */ LLVMBuildStore(b, l_sbool_const(1), v_resnullp); /* reset resvalue */ - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); LLVMBuildBr(b, opblocks[opno + 1]); break; @@ -889,7 +889,7 @@ llvm_compile_expr(ExprState *state) if (opcode == EEOP_BOOL_OR_STEP_FIRST) LLVMBuildStore(b, l_sbool_const(0), v_boolanynullp); v_boolnull = l_load(b, TypeStorageBool, v_resnullp, ""); - v_boolvalue = l_load(b, TypeSizeT, v_resvaluep, ""); + v_boolvalue = l_load(b, TypeDatum, v_resvaluep, ""); LLVMBuildCondBr(b, LLVMBuildICmp(b, LLVMIntEQ, v_boolnull, @@ -908,7 +908,7 @@ llvm_compile_expr(ExprState *state) LLVMPositionBuilderAtEnd(b, b_boolchecktrue); LLVMBuildCondBr(b, LLVMBuildICmp(b, LLVMIntEQ, v_boolvalue, - l_sizet_const(1), ""), + l_datum_const(1), ""), b_boolistrue, b_boolcont); @@ -936,7 +936,7 @@ llvm_compile_expr(ExprState *state) /* set resnull to true */ LLVMBuildStore(b, l_sbool_const(1), v_resnullp); /* reset resvalue */ - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); LLVMBuildBr(b, opblocks[opno + 1]); break; @@ -948,13 +948,13 @@ llvm_compile_expr(ExprState *state) LLVMValueRef v_negbool; /* compute !boolvalue */ - v_boolvalue = l_load(b, TypeSizeT, v_resvaluep, ""); + v_boolvalue = l_load(b, TypeDatum, v_resvaluep, ""); v_negbool = LLVMBuildZExt(b, LLVMBuildICmp(b, LLVMIntEQ, v_boolvalue, - l_sizet_const(0), + l_datum_const(0), ""), - TypeSizeT, 
""); + TypeDatum, ""); /* * Store it back in resvalue. We can ignore resnull here; @@ -977,7 +977,7 @@ llvm_compile_expr(ExprState *state) b_qualfail = l_bb_before_v(opblocks[opno + 1], "op.%d.qualfail", opno); - v_resvalue = l_load(b, TypeSizeT, v_resvaluep, ""); + v_resvalue = l_load(b, TypeDatum, v_resvaluep, ""); v_resnull = l_load(b, TypeStorageBool, v_resnullp, ""); v_nullorfalse = @@ -985,7 +985,7 @@ llvm_compile_expr(ExprState *state) LLVMBuildICmp(b, LLVMIntEQ, v_resnull, l_sbool_const(1), ""), LLVMBuildICmp(b, LLVMIntEQ, v_resvalue, - l_sizet_const(0), ""), + l_datum_const(0), ""), ""); LLVMBuildCondBr(b, @@ -998,7 +998,7 @@ llvm_compile_expr(ExprState *state) /* set resnull to false */ LLVMBuildStore(b, l_sbool_const(0), v_resnullp); /* set resvalue to false */ - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); /* and jump out */ LLVMBuildBr(b, opblocks[op->d.qualexpr.jumpdone]); break; @@ -1051,7 +1051,7 @@ llvm_compile_expr(ExprState *state) /* Transfer control if current result is null or false */ - v_resvalue = l_load(b, TypeSizeT, v_resvaluep, ""); + v_resvalue = l_load(b, TypeDatum, v_resvaluep, ""); v_resnull = l_load(b, TypeStorageBool, v_resnullp, ""); v_nullorfalse = @@ -1059,7 +1059,7 @@ llvm_compile_expr(ExprState *state) LLVMBuildICmp(b, LLVMIntEQ, v_resnull, l_sbool_const(1), ""), LLVMBuildICmp(b, LLVMIntEQ, v_resvalue, - l_sizet_const(0), ""), + l_datum_const(0), ""), ""); LLVMBuildCondBr(b, @@ -1078,8 +1078,8 @@ llvm_compile_expr(ExprState *state) LLVMBuildSelect(b, LLVMBuildICmp(b, LLVMIntEQ, v_resnull, l_sbool_const(1), ""), - l_sizet_const(1), - l_sizet_const(0), + l_datum_const(1), + l_datum_const(0), ""); LLVMBuildStore(b, v_resvalue, v_resvaluep); LLVMBuildStore(b, l_sbool_const(0), v_resnullp); @@ -1097,8 +1097,8 @@ llvm_compile_expr(ExprState *state) LLVMBuildSelect(b, LLVMBuildICmp(b, LLVMIntEQ, v_resnull, l_sbool_const(1), ""), - l_sizet_const(0), - l_sizet_const(1), + l_datum_const(0), + l_datum_const(1), ""); LLVMBuildStore(b, v_resvalue, v_resvaluep); LLVMBuildStore(b, l_sbool_const(0), v_resnullp); @@ -1148,11 +1148,11 @@ llvm_compile_expr(ExprState *state) if (opcode == EEOP_BOOLTEST_IS_TRUE || opcode == EEOP_BOOLTEST_IS_FALSE) { - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); } else { - LLVMBuildStore(b, l_sizet_const(1), v_resvaluep); + LLVMBuildStore(b, l_datum_const(1), v_resvaluep); } LLVMBuildBr(b, opblocks[opno + 1]); @@ -1170,14 +1170,14 @@ llvm_compile_expr(ExprState *state) else { LLVMValueRef v_value = - l_load(b, TypeSizeT, v_resvaluep, ""); + l_load(b, TypeDatum, v_resvaluep, ""); v_value = LLVMBuildZExt(b, LLVMBuildICmp(b, LLVMIntEQ, v_value, - l_sizet_const(0), + l_datum_const(0), ""), - TypeSizeT, ""); + TypeDatum, ""); LLVMBuildStore(b, v_value, v_resvaluep); } LLVMBuildBr(b, opblocks[opno + 1]); @@ -1279,11 +1279,11 @@ llvm_compile_expr(ExprState *state) v_casenull; v_casevaluep = l_ptr_const(op->d.casetest.value, - l_ptr(TypeSizeT)); + l_ptr(TypeDatum)); v_casenullp = l_ptr_const(op->d.casetest.isnull, l_ptr(TypeStorageBool)); - v_casevalue = l_load(b, TypeSizeT, v_casevaluep, ""); + v_casevalue = l_load(b, TypeDatum, v_casevaluep, ""); v_casenull = l_load(b, TypeStorageBool, v_casenullp, ""); LLVMBuildStore(b, v_casevalue, v_resvaluep); LLVMBuildStore(b, v_casenull, v_resnullp); @@ -1345,9 +1345,9 @@ llvm_compile_expr(ExprState *state) LLVMPositionBuilderAtEnd(b, b_notnull); v_valuep = 
l_ptr_const(op->d.make_readonly.value, - l_ptr(TypeSizeT)); + l_ptr(TypeDatum)); - v_value = l_load(b, TypeSizeT, v_valuep, ""); + v_value = l_load(b, TypeDatum, v_valuep, ""); v_params[0] = v_value; v_ret = @@ -1415,11 +1415,11 @@ llvm_compile_expr(ExprState *state) b_calloutput); LLVMPositionBuilderAtEnd(b, b_skipoutput); - v_output_skip = l_sizet_const(0); + v_output_skip = l_datum_const(0); LLVMBuildBr(b, b_input); LLVMPositionBuilderAtEnd(b, b_calloutput); - v_resvalue = l_load(b, TypeSizeT, v_resvaluep, ""); + v_resvalue = l_load(b, TypeDatum, v_resvaluep, ""); /* set arg[0] */ LLVMBuildStore(b, @@ -1449,7 +1449,7 @@ llvm_compile_expr(ExprState *state) incoming_values[1] = v_output; incoming_blocks[1] = b_calloutput; - v_output = LLVMBuildPhi(b, TypeSizeT, "output"); + v_output = LLVMBuildPhi(b, TypeDatum, "output"); LLVMAddIncoming(v_output, incoming_values, incoming_blocks, lengthof(incoming_blocks)); @@ -1463,7 +1463,7 @@ llvm_compile_expr(ExprState *state) { LLVMBuildCondBr(b, LLVMBuildICmp(b, LLVMIntEQ, v_output, - l_sizet_const(0), ""), + l_datum_const(0), ""), opblocks[opno + 1], b_inputcall); } @@ -1564,9 +1564,9 @@ llvm_compile_expr(ExprState *state) LLVMPositionBuilderAtEnd(b, b_bothargnull); LLVMBuildStore(b, l_sbool_const(0), v_resnullp); if (opcode == EEOP_NOT_DISTINCT) - LLVMBuildStore(b, l_sizet_const(1), v_resvaluep); + LLVMBuildStore(b, l_datum_const(1), v_resvaluep); else - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); LLVMBuildBr(b, opblocks[opno + 1]); @@ -1574,9 +1574,9 @@ llvm_compile_expr(ExprState *state) LLVMPositionBuilderAtEnd(b, b_anyargnull); LLVMBuildStore(b, l_sbool_const(0), v_resnullp); if (opcode == EEOP_NOT_DISTINCT) - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); else - LLVMBuildStore(b, l_sizet_const(1), v_resvaluep); + LLVMBuildStore(b, l_datum_const(1), v_resvaluep); LLVMBuildBr(b, opblocks[opno + 1]); /* neither argument is null: compare */ @@ -1592,8 +1592,8 @@ llvm_compile_expr(ExprState *state) LLVMBuildZExt(b, LLVMBuildICmp(b, LLVMIntEQ, v_result, - l_sizet_const(0), ""), - TypeSizeT, ""); + l_datum_const(0), ""), + TypeDatum, ""); } LLVMBuildStore(b, v_fcinfo_isnull, v_resnullp); @@ -1691,7 +1691,7 @@ llvm_compile_expr(ExprState *state) ""), LLVMBuildICmp(b, LLVMIntEQ, v_retval, - l_sizet_const(1), + l_datum_const(1), ""), ""); LLVMBuildCondBr(b, v_argsequal, b_argsequal, b_hasnull); @@ -1699,7 +1699,7 @@ llvm_compile_expr(ExprState *state) /* build block setting result to NULL, if args are equal */ LLVMPositionBuilderAtEnd(b, b_argsequal); LLVMBuildStore(b, l_sbool_const(1), v_resnullp); - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); LLVMBuildBr(b, opblocks[opno + 1]); break; @@ -1755,7 +1755,7 @@ llvm_compile_expr(ExprState *state) LLVMPositionBuilderAtEnd(b, b_isnull); - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); LLVMBuildStore(b, l_sbool_const(1), v_resnullp); LLVMBuildBr(b, opblocks[op->d.returningexpr.jumpdone]); @@ -1861,7 +1861,7 @@ llvm_compile_expr(ExprState *state) LLVMBuildICmp(b, LLVMIntEQ, v_retval, - l_sizet_const(0), ""), + l_datum_const(0), ""), opblocks[opno + 1], opblocks[op->d.rowcompare_step.jumpdone]); @@ -1891,7 +1891,7 @@ llvm_compile_expr(ExprState *state) */ v_cmpresult = LLVMBuildTrunc(b, - l_load(b, TypeSizeT, v_resvaluep, ""), + l_load(b, TypeDatum, v_resvaluep, ""), 
LLVMInt32TypeInContext(lc), ""); switch (cmptype) @@ -1920,7 +1920,7 @@ llvm_compile_expr(ExprState *state) v_cmpresult, l_int32_const(lc, 0), ""); - v_result = LLVMBuildZExt(b, v_result, TypeSizeT, ""); + v_result = LLVMBuildZExt(b, v_result, TypeDatum, ""); LLVMBuildStore(b, l_sbool_const(0), v_resnullp); LLVMBuildStore(b, v_result, v_resvaluep); @@ -1961,11 +1961,11 @@ llvm_compile_expr(ExprState *state) v_casenull; v_casevaluep = l_ptr_const(op->d.casetest.value, - l_ptr(TypeSizeT)); + l_ptr(TypeDatum)); v_casenullp = l_ptr_const(op->d.casetest.isnull, l_ptr(TypeStorageBool)); - v_casevalue = l_load(b, TypeSizeT, v_casevaluep, ""); + v_casevalue = l_load(b, TypeDatum, v_casevaluep, ""); v_casenull = l_load(b, TypeStorageBool, v_casenullp, ""); LLVMBuildStore(b, v_casevalue, v_resvaluep); LLVMBuildStore(b, v_casenull, v_resnullp); @@ -2014,7 +2014,7 @@ llvm_compile_expr(ExprState *state) { LLVMValueRef v_initvalue; - v_initvalue = l_sizet_const(op->d.hashdatum_initvalue.init_value); + v_initvalue = l_datum_const(op->d.hashdatum_initvalue.init_value); LLVMBuildStore(b, v_initvalue, v_resvaluep); LLVMBuildStore(b, l_sbool_const(0), v_resnullp); @@ -2053,24 +2053,24 @@ llvm_compile_expr(ExprState *state) LLVMValueRef tmp; tmp = l_ptr_const(&op->d.hashdatum.iresult->value, - l_ptr(TypeSizeT)); + l_ptr(TypeDatum)); /* * Fetch the previously hashed value from where the * previous hash operation stored it. */ - v_prevhash = l_load(b, TypeSizeT, tmp, "prevhash"); + v_prevhash = l_load(b, TypeDatum, tmp, "prevhash"); /* * Rotate bits left by 1 bit. Be careful not to - * overflow uint32 when working with size_t. + * overflow uint32 when working with Datum. */ - v_tmp1 = LLVMBuildShl(b, v_prevhash, l_sizet_const(1), + v_tmp1 = LLVMBuildShl(b, v_prevhash, l_datum_const(1), ""); v_tmp1 = LLVMBuildAnd(b, v_tmp1, - l_sizet_const(0xffffffff), ""); + l_datum_const(0xffffffff), ""); v_tmp2 = LLVMBuildLShr(b, v_prevhash, - l_sizet_const(31), ""); + l_datum_const(31), ""); v_prevhash = LLVMBuildOr(b, v_tmp1, v_tmp2, "rotatedhash"); } @@ -2113,7 +2113,7 @@ llvm_compile_expr(ExprState *state) * the NULL result and goto jumpdone. */ LLVMBuildStore(b, l_sbool_const(1), v_resnullp); - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); LLVMBuildBr(b, opblocks[op->d.hashdatum.jumpdone]); } else @@ -2145,7 +2145,7 @@ llvm_compile_expr(ExprState *state) * Store a zero Datum when the Datum to hash is * NULL */ - LLVMBuildStore(b, l_sizet_const(0), v_resvaluep); + LLVMBuildStore(b, l_datum_const(0), v_resvaluep); } LLVMBuildBr(b, opblocks[opno + 1]); @@ -2178,24 +2178,24 @@ llvm_compile_expr(ExprState *state) LLVMValueRef tmp; tmp = l_ptr_const(&op->d.hashdatum.iresult->value, - l_ptr(TypeSizeT)); + l_ptr(TypeDatum)); /* * Fetch the previously hashed value from where the * previous hash operation stored it. */ - v_prevhash = l_load(b, TypeSizeT, tmp, "prevhash"); + v_prevhash = l_load(b, TypeDatum, tmp, "prevhash"); /* * Rotate bits left by 1 bit. Be careful not to - * overflow uint32 when working with size_t. + * overflow uint32 when working with Datum. 
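In plain C, the shl/and/lshr/or sequence emitted here is the classic rotate-left-by-one of a 32-bit hash carried in a wider Datum. A sketch, assuming (as the masking guarantees across iterations) that the high 32 bits of prevhash are zero:

    #include <stdint.h>

    static uint64_t
    rotate_hash(uint64_t prevhash)
    {
        uint64_t tmp1 = (prevhash << 1) & 0xffffffff;   /* shift, stay in 32 bits */
        uint64_t tmp2 = prevhash >> 31;                 /* old bit 31 wraps to bit 0 */

        return tmp1 | tmp2;
    }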
*/ - v_tmp1 = LLVMBuildShl(b, v_prevhash, l_sizet_const(1), + v_tmp1 = LLVMBuildShl(b, v_prevhash, l_datum_const(1), ""); v_tmp1 = LLVMBuildAnd(b, v_tmp1, - l_sizet_const(0xffffffff), ""); + l_datum_const(0xffffffff), ""); v_tmp2 = LLVMBuildLShr(b, v_prevhash, - l_sizet_const(31), ""); + l_datum_const(31), ""); v_prevhash = LLVMBuildOr(b, v_tmp1, v_tmp2, "rotatedhash"); } @@ -2373,7 +2373,7 @@ llvm_compile_expr(ExprState *state) v_aggno = l_int32_const(lc, op->d.aggref.aggno); /* load agg value / null */ - value = l_load_gep1(b, TypeSizeT, v_aggvalues, v_aggno, "aggvalue"); + value = l_load_gep1(b, TypeDatum, v_aggvalues, v_aggno, "aggvalue"); isnull = l_load_gep1(b, TypeStorageBool, v_aggnulls, v_aggno, "aggnull"); /* and store result */ @@ -2408,7 +2408,7 @@ llvm_compile_expr(ExprState *state) v_wfuncno = l_load(b, LLVMInt32TypeInContext(lc), v_wfuncnop, "v_wfuncno"); /* load window func value / null */ - value = l_load_gep1(b, TypeSizeT, v_aggvalues, v_wfuncno, + value = l_load_gep1(b, TypeDatum, v_aggvalues, v_wfuncno, "windowvalue"); isnull = l_load_gep1(b, TypeStorageBool, v_aggnulls, v_wfuncno, "windownull"); @@ -2505,7 +2505,7 @@ llvm_compile_expr(ExprState *state) v_nullsp = l_ptr_const(nulls, l_ptr(TypeStorageBool)); /* create blocks for checking args */ - b_checknulls = palloc(sizeof(LLVMBasicBlockRef *) * nargs); + b_checknulls = palloc_array(LLVMBasicBlockRef, nargs); for (int argno = 0; argno < nargs; argno++) { b_checknulls[argno] = @@ -2585,8 +2585,8 @@ llvm_compile_expr(ExprState *state) LLVMBuildCondBr(b, LLVMBuildICmp(b, LLVMIntEQ, - LLVMBuildPtrToInt(b, v_pergroup_allaggs, TypeSizeT, ""), - l_sizet_const(0), ""), + LLVMBuildPtrToInt(b, v_pergroup_allaggs, TypeDatum, ""), + l_datum_const(0), ""), opblocks[jumpnull], opblocks[opno + 1]); break; @@ -2788,7 +2788,7 @@ llvm_compile_expr(ExprState *state) "transnullp"); LLVMBuildStore(b, l_load(b, - TypeSizeT, + TypeDatum, v_transvaluep, "transvalue"), l_funcvaluep(b, v_fcinfo, 0)); @@ -2826,7 +2826,7 @@ llvm_compile_expr(ExprState *state) b_nocall = l_bb_before_v(opblocks[opno + 1], "op.%d.transnocall", opno); - v_transvalue = l_load(b, TypeSizeT, v_transvaluep, ""); + v_transvalue = l_load(b, TypeDatum, v_transvaluep, ""); v_transnull = l_load(b, TypeStorageBool, v_transnullp, ""); /* @@ -2956,7 +2956,7 @@ llvm_compile_expr(ExprState *state) */ { - CompiledExprState *cstate = palloc0(sizeof(CompiledExprState)); + CompiledExprState *cstate = palloc0_object(CompiledExprState); cstate->context = context; cstate->funcname = funcname; @@ -3068,7 +3068,7 @@ build_EvalXFuncInt(LLVMBuilderRef b, LLVMModuleRef mod, const char *funcname, elog(ERROR, "parameter mismatch: %s expects %d passed %d", funcname, LLVMCountParams(v_fn), nargs + 2); - params = palloc(sizeof(LLVMValueRef) * (2 + nargs)); + params = palloc_array(LLVMValueRef, (2 + nargs)); params[argno++] = v_state; params[argno++] = l_ptr_const(op, l_ptr(StructExprEvalStep)); diff --git a/src/backend/jit/llvm/llvmjit_inline.cpp b/src/backend/jit/llvm/llvmjit_inline.cpp index 2764c3bbe2f03..51b32cd9f940d 100644 --- a/src/backend/jit/llvm/llvmjit_inline.cpp +++ b/src/backend/jit/llvm/llvmjit_inline.cpp @@ -238,7 +238,11 @@ llvm_build_inline_plan(LLVMContextRef lc, llvm::Module *mod) llvm_split_symbol_name(symbolName.data(), &cmodname, &cfuncname); +#if LLVM_VERSION_MAJOR >= 21 + funcGUID = llvm::GlobalValue::getGUIDAssumingExternalLinkage(cfuncname); +#else funcGUID = llvm::GlobalValue::getGUID(cfuncname); +#endif /* already processed */ if (inlineState.processed) diff --git 
a/src/backend/jit/llvm/llvmjit_types.c b/src/backend/jit/llvm/llvmjit_types.c index dbe0282e98f4b..167cd554b9c07 100644 --- a/src/backend/jit/llvm/llvmjit_types.c +++ b/src/backend/jit/llvm/llvmjit_types.c @@ -47,6 +47,7 @@ */ PGFunction TypePGFunction; size_t TypeSizeT; +Datum TypeDatum; bool TypeStorageBool; ExecEvalSubroutine TypeExecEvalSubroutine; diff --git a/src/backend/jit/llvm/llvmjit_wrap.cpp b/src/backend/jit/llvm/llvmjit_wrap.cpp index da850d67ab647..c31a57b8563e8 100644 --- a/src/backend/jit/llvm/llvmjit_wrap.cpp +++ b/src/backend/jit/llvm/llvmjit_wrap.cpp @@ -53,7 +53,14 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(llvm::orc::ObjectLayer, LLVMOrcObjectLayerRef LLVMOrcObjectLayerRef LLVMOrcCreateRTDyldObjectLinkingLayerWithSafeSectionMemoryManager(LLVMOrcExecutionSessionRef ES) { +#if LLVM_VERSION_MAJOR >= 21 + return wrap(new llvm::orc::RTDyldObjectLinkingLayer( + *unwrap(ES), [](const llvm::MemoryBuffer&) { + return std::make_unique<llvm::backend::SectionMemoryManager>(nullptr, true); + })); +#else return wrap(new llvm::orc::RTDyldObjectLinkingLayer( *unwrap(ES), [] { return std::make_unique<llvm::backend::SectionMemoryManager>(nullptr, true); })); +#endif } #endif diff --git a/src/backend/jit/llvm/meson.build b/src/backend/jit/llvm/meson.build index c8e06dfbe351b..805fbd6900679 100644 --- a/src/backend/jit/llvm/meson.build +++ b/src/backend/jit/llvm/meson.build @@ -53,7 +53,7 @@ llvm_irgen_args = [ if ccache.found() llvm_irgen_command = ccache - llvm_irgen_args = [clang.path()] + llvm_irgen_args + llvm_irgen_args = [clang.full_path()] + llvm_irgen_args else llvm_irgen_command = clang endif diff --git a/src/backend/lib/README b/src/backend/lib/README index f2fb591237dba..c28cbe356f0b3 100644 --- a/src/backend/lib/README +++ b/src/backend/lib/README @@ -1,8 +1,6 @@ This directory contains a general purpose data structures, for use anywhere in the backend: -binaryheap.c - a binary heap - bipartite_match.c - Hopcroft-Karp maximum cardinality algorithm for bipartite graphs bloomfilter.c - probabilistic, space-efficient set membership testing @@ -21,8 +19,6 @@ pairingheap.c - a pairing heap rbtree.c - a red-black tree -stringinfo.c - an extensible string type - Aside from the inherent characteristics of the data structures, there are a few practical differences between the binary heap and the pairing heap. The diff --git a/src/backend/lib/bipartite_match.c b/src/backend/lib/bipartite_match.c index 5af789652c794..ed54f190494b4 100644 --- a/src/backend/lib/bipartite_match.c +++ b/src/backend/lib/bipartite_match.c @@ -38,7 +38,7 @@ static bool hk_depth_search(BipartiteMatchState *state, int u); BipartiteMatchState * BipartiteMatch(int u_size, int v_size, short **adjacency) { - BipartiteMatchState *state = palloc(sizeof(BipartiteMatchState)); + BipartiteMatchState *state = palloc_object(BipartiteMatchState); if (u_size < 0 || u_size >= SHRT_MAX || v_size < 0 || v_size >= SHRT_MAX) diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c index b8d031f201520..82f6aa966de97 100644 --- a/src/backend/lib/dshash.c +++ b/src/backend/lib/dshash.c @@ -31,6 +31,8 @@ #include "postgres.h" +#include + #include "common/hashfn.h" #include "lib/dshash.h" #include "storage/lwlock.h" @@ -209,7 +211,7 @@ dshash_create(dsa_area *area, const dshash_parameters *params, void *arg) dsa_pointer control; /* Allocate the backend-local object representing the hash table. */ - hash_table = palloc(sizeof(dshash_table)); + hash_table = palloc_object(dshash_table); /* Allocate the control object in shared memory.
*/ control = dsa_allocate(area, sizeof(dshash_table_control)); @@ -274,7 +276,7 @@ dshash_attach(dsa_area *area, const dshash_parameters *params, dsa_pointer control; /* Allocate the backend-local object representing the hash table. */ - hash_table = palloc(sizeof(dshash_table)); + hash_table = palloc_object(dshash_table); /* Find the control object in shared memory. */ control = handle; diff --git a/src/backend/lib/integerset.c b/src/backend/lib/integerset.c index f4153b0e15a24..aca1df2ad5a1f 100644 --- a/src/backend/lib/integerset.c +++ b/src/backend/lib/integerset.c @@ -284,7 +284,7 @@ intset_create(void) { IntegerSet *intset; - intset = (IntegerSet *) palloc(sizeof(IntegerSet)); + intset = palloc_object(IntegerSet); intset->context = CurrentMemoryContext; intset->mem_used = GetMemoryChunkSpace(intset); diff --git a/src/backend/lib/pairingheap.c b/src/backend/lib/pairingheap.c index 0aef8a88f1b5e..3497dc7902afa 100644 --- a/src/backend/lib/pairingheap.c +++ b/src/backend/lib/pairingheap.c @@ -43,13 +43,27 @@ pairingheap_allocate(pairingheap_comparator compare, void *arg) { pairingheap *heap; - heap = (pairingheap *) palloc(sizeof(pairingheap)); + heap = palloc_object(pairingheap); + pairingheap_initialize(heap, compare, arg); + + return heap; +} + +/* + * pairingheap_initialize + * + * Same as pairingheap_allocate(), but initializes the pairing heap in-place + * rather than allocating a new chunk of memory. Useful for storing the + * pairing heap in shared memory. + */ +void +pairingheap_initialize(pairingheap *heap, pairingheap_comparator compare, + void *arg) +{ heap->ph_compare = compare; heap->ph_arg = arg; heap->ph_root = NULL; - - return heap; } /* diff --git a/src/backend/lib/rbtree.c b/src/backend/lib/rbtree.c index 3b5e5faa9bf5f..8fe6bfc539a21 100644 --- a/src/backend/lib/rbtree.c +++ b/src/backend/lib/rbtree.c @@ -7,9 +7,9 @@ * This code comes from Thomas Niemann's "Sorting and Searching Algorithms: * a Cookbook". * - * See http://www.cs.auckland.ac.nz/software/AlgAnim/niemann/s_man.htm for - * license terms: "Source code, when part of a software project, may be used - * freely without reference to the author." + * See https://web.archive.org/web/20131202103513/http://www.cs.auckland.ac.nz/software/AlgAnim/niemann/s_man.htm + * for license terms: "Source code, when part of a software project, may be + * used freely without reference to the author."
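To illustrate the new pairingheap_initialize() entry point above: the intended pattern is embedding the heap in a caller-managed (for example shared-memory) struct rather than palloc'ing it. The struct and init function below are invented for illustration:

    #include "lib/pairingheap.h"

    /* Hypothetical shared-memory state embedding a pairing heap. */
    typedef struct MySharedState
    {
        /* ... other shared fields ... */
        pairingheap heap;       /* lives inside this struct, not palloc'd */
    } MySharedState;

    static void
    my_shared_state_init(MySharedState *state,
                         pairingheap_comparator cmp, void *arg)
    {
        /* pairingheap_allocate() would instead palloc a fresh heap in the
         * current memory context; here we initialize in place. */
        pairingheap_initialize(&state->heap, cmp, arg);
    }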
* * Red-black trees are a type of balanced binary tree wherein (1) any child of * a red node is always black, and (2) every path from root to leaf traverses @@ -106,7 +106,7 @@ rbt_create(Size node_size, rbt_freefunc freefunc, void *arg) { - RBTree *tree = (RBTree *) palloc(sizeof(RBTree)); + RBTree *tree = palloc_object(RBTree); Assert(node_size > sizeof(RBTNode)); diff --git a/src/backend/libpq/auth-oauth.c b/src/backend/libpq/auth-oauth.c index 27f7af7be0024..32a370ede117b 100644 --- a/src/backend/libpq/auth-oauth.c +++ b/src/backend/libpq/auth-oauth.c @@ -108,7 +108,7 @@ oauth_init(Port *port, const char *selected_mech, const char *shadow_pass) errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("client selected an invalid SASL authentication mechanism")); - ctx = palloc0(sizeof(*ctx)); + ctx = palloc0_object(struct oauth_ctx); ctx->state = OAUTH_STATE_INIT; ctx->port = port; @@ -656,7 +656,7 @@ validate(Port *port, const char *auth) errmsg("validation of OAuth token requested without a validator loaded")); /* Call the validation function from the validator module */ - ret = palloc0(sizeof(ValidatorModuleResult)); + ret = palloc0_object(ValidatorModuleResult); if (!ValidatorCallbacks->validate_cb(validator_module_state, token, port->user_name, ret)) { @@ -785,14 +785,14 @@ load_validator_library(const char *libname) "OAuth validator", libname, "validate_cb")); /* Allocate memory for validator library private state data */ - validator_module_state = (ValidatorModuleState *) palloc0(sizeof(ValidatorModuleState)); + validator_module_state = palloc0_object(ValidatorModuleState); validator_module_state->sversion = PG_VERSION_NUM; if (ValidatorCallbacks->startup_cb != NULL) ValidatorCallbacks->startup_cb(validator_module_state); /* Shut down the library before cleaning up its state. */ - mcb = palloc0(sizeof(*mcb)); + mcb = palloc0_object(MemoryContextCallback); mcb->func = shutdown_validator_library; MemoryContextRegisterResetCallback(CurrentMemoryContext, mcb); diff --git a/src/backend/libpq/auth-scram.c b/src/backend/libpq/auth-scram.c index db778405724ad..2d6d97b4bb6ee 100644 --- a/src/backend/libpq/auth-scram.c +++ b/src/backend/libpq/auth-scram.c @@ -134,8 +134,6 @@ typedef struct { scram_state_enum state; - const char *username; /* username from startup packet */ - Port *port; bool channel_binding_in_use; @@ -242,7 +240,7 @@ scram_init(Port *port, const char *selected_mech, const char *shadow_pass) scram_state *state; bool got_secret; - state = (scram_state *) palloc0(sizeof(scram_state)); + state = palloc0_object(scram_state); state->port = port; state->state = SCRAM_AUTH_INIT; diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 9f4d05ffbd453..8045dbffe049f 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -70,14 +70,14 @@ static int CheckMD5Auth(Port *port, char *shadow_pass, /* Standard TCP port number for Ident service. 
Assigned by IANA */ #define IDENT_PORT 113 -static int ident_inet(hbaPort *port); +static int ident_inet(Port *port); /*---------------------------------------------------------------- * Peer authentication *---------------------------------------------------------------- */ -static int auth_peer(hbaPort *port); +static int auth_peer(Port *port); /*---------------------------------------------------------------- @@ -94,8 +94,16 @@ static int auth_peer(hbaPort *port); #define PGSQL_PAM_SERVICE "postgresql" /* Service name passed to PAM */ +/* Work around original Solaris' lack of "const" in the conv_proc signature */ +#ifdef _PAM_LEGACY_NONCONST +#define PG_PAM_CONST +#else +#define PG_PAM_CONST const +#endif + static int CheckPAMAuth(Port *port, const char *user, const char *password); -static int pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +static int pam_passwd_conv_proc(int num_msg, + PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr); static struct pam_conv pam_passw_conv = { @@ -990,8 +998,8 @@ pg_GSS_recvauth(Port *port) gbuf.length = buf.len; gbuf.value = buf.data; - elog(DEBUG4, "processing received GSS token of length %u", - (unsigned int) gbuf.length); + elog(DEBUG4, "processing received GSS token of length %zu", + gbuf.length); maj_stat = gss_accept_sec_context(&min_stat, &port->gss->ctx, @@ -1009,9 +1017,9 @@ pg_GSS_recvauth(Port *port) pfree(buf.data); elog(DEBUG5, "gss_accept_sec_context major: %u, " - "minor: %u, outlen: %u, outflags: %x", + "minor: %u, outlen: %zu, outflags: %x", maj_stat, min_stat, - (unsigned int) port->gss->outbuf.length, gflags); + port->gss->outbuf.length, gflags); CHECK_FOR_INTERRUPTS(); @@ -1026,8 +1034,8 @@ pg_GSS_recvauth(Port *port) /* * Negotiation generated data to be sent to the client. */ - elog(DEBUG4, "sending GSS response token of length %u", - (unsigned int) port->gss->outbuf.length); + elog(DEBUG4, "sending GSS response token of length %zu", + port->gss->outbuf.length); sendAuthRequest(port, AUTH_REQ_GSS_CONT, port->gss->outbuf.value, port->gss->outbuf.length); @@ -1572,6 +1580,15 @@ pg_SSPI_make_upn(char *accountname, *---------------------------------------------------------------- */ +/* + * Per RFC 1413, space and tab are whitespace in ident messages. + */ +static bool +is_ident_whitespace(const char c) +{ + return c == ' ' || c == '\t'; +} + /* * Parse the string "*ident_response" as a response from a query to an Ident * server. If it's a normal response indicating a user name, return true @@ -1605,14 +1622,14 @@ interpret_ident_response(const char *ident_response, int i; /* Index into *response_type */ cursor++; /* Go over colon */ - while (pg_isblank(*cursor)) + while (is_ident_whitespace(*cursor)) cursor++; /* skip blanks */ i = 0; - while (*cursor != ':' && *cursor != '\r' && !pg_isblank(*cursor) && + while (*cursor != ':' && *cursor != '\r' && !is_ident_whitespace(*cursor) && i < (int) (sizeof(response_type) - 1)) response_type[i++] = *cursor++; response_type[i] = '\0'; - while (pg_isblank(*cursor)) + while (is_ident_whitespace(*cursor)) cursor++; /* skip blanks */ if (strcmp(response_type, "USERID") != 0) return false; @@ -1635,7 +1652,7 @@ interpret_ident_response(const char *ident_response, else { cursor++; /* Go over colon */ - while (pg_isblank(*cursor)) + while (is_ident_whitespace(*cursor)) cursor++; /* skip blanks */ /* Rest of line is user name. Copy it over. 
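For reference, the response format being parsed (RFC 1413) and the narrowed whitespace rule look like this; the sample responses are illustrative, not taken from the patch:

    /*
     * Typical ident-server responses this parser must handle:
     *
     *   "6193, 23 : USERID : UNIX : stjohns\r\n"  => user name "stjohns"
     *   "6195, 23 : ERROR : NO-USER\r\n"          => authentication fails
     *
     * Only space and tab are skippable between tokens; unlike pg_isblank()
     * (which also accepts '\r'), '\r' must terminate parsing instead.
     */
    const char *cursor = " \t stjohns";

    while (is_ident_whitespace(*cursor))
        cursor++;               /* cursor now points at "stjohns" */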
*/ i = 0; @@ -1660,7 +1677,7 @@ interpret_ident_response(const char *ident_response, * latch was set would improve the responsiveness to timeouts/cancellations. */ static int -ident_inet(hbaPort *port) +ident_inet(Port *port) { const SockAddr remote_addr = port->raddr; const SockAddr local_addr = port->laddr; @@ -1845,7 +1862,7 @@ ident_inet(hbaPort *port) * Iff authorized, return STATUS_OK, otherwise return STATUS_ERROR. */ static int -auth_peer(hbaPort *port) +auth_peer(Port *port) { uid_t uid; gid_t gid; @@ -1917,7 +1934,7 @@ auth_peer(hbaPort *port) */ static int -pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +pam_passwd_conv_proc(int num_msg, PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr) { const char *passwd; @@ -2223,8 +2240,8 @@ InitializeLDAPConnection(Port *port, LDAP **ldap) if (!*ldap) { ereport(LOG, - (errmsg("could not initialize LDAP: error code %d", - (int) LdapGetLastError()))); + (errmsg("could not initialize LDAP: error code %lu", + LdapGetLastError()))); return STATUS_ERROR; } diff --git a/src/backend/libpq/be-secure-gssapi.c b/src/backend/libpq/be-secure-gssapi.c index 717ba9824f914..5d98c58ffa8be 100644 --- a/src/backend/libpq/be-secure-gssapi.c +++ b/src/backend/libpq/be-secure-gssapi.c @@ -46,11 +46,18 @@ * don't want the other side to send arbitrarily huge packets as we * would have to allocate memory for them to then pass them to GSSAPI. * - * Therefore, these two #define's are effectively part of the protocol + * Therefore, this #define is effectively part of the protocol * spec and can't ever be changed. */ -#define PQ_GSS_SEND_BUFFER_SIZE 16384 -#define PQ_GSS_RECV_BUFFER_SIZE 16384 +#define PQ_GSS_MAX_PACKET_SIZE 16384 /* includes uint32 header word */ + +/* + * However, during the authentication exchange we must cope with whatever + * message size the GSSAPI library wants to send (because our protocol + * doesn't support splitting those messages). Depending on configuration + * those messages might be as much as 64kB. + */ +#define PQ_GSS_AUTH_BUFFER_SIZE 65536 /* includes uint32 header word */ /* * Since we manage at most one GSS-encrypted connection per backend, @@ -114,9 +121,9 @@ be_gssapi_write(Port *port, const void *ptr, size_t len) * again, so if it offers a len less than that, something is wrong. * * Note: it may seem attractive to report partial write completion once - * we've successfully sent any encrypted packets. However, that can cause - * problems for callers; notably, pqPutMsgEnd's heuristic to send only - * full 8K blocks interacts badly with such a hack. We won't save much, + * we've successfully sent any encrypted packets. However, doing that + * expands the state space of this processing and has been responsible for + * bugs in the past (cf. commit d053a879b). We won't save much, * typically, by letting callers discard data early, so don't risk it. 
*/ if (len < PqGSSSendConsumed) @@ -210,12 +217,12 @@ be_gssapi_write(Port *port, const void *ptr, size_t len) errno = ECONNRESET; return -1; } - if (output.length > PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)) + if (output.length > PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)) { ereport(COMMERROR, (errmsg("server tried to send oversize GSSAPI packet (%zu > %zu)", (size_t) output.length, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)))); + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)))); errno = ECONNRESET; return -1; } @@ -346,12 +353,12 @@ be_gssapi_read(Port *port, void *ptr, size_t len) /* Decode the packet length and check for overlength packet */ input.length = pg_ntoh32(*(uint32 *) PqGSSRecvBuffer); - if (input.length > PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)) + if (input.length > PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)) { ereport(COMMERROR, (errmsg("oversize GSSAPI packet sent by the client (%zu > %zu)", (size_t) input.length, - PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)))); + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)))); errno = ECONNRESET; return -1; } @@ -517,10 +524,13 @@ secure_open_gssapi(Port *port) * that will never use them, and we ensure that the buffers are * sufficiently aligned for the length-word accesses that we do in some * places in this file. + * + * We'll use PQ_GSS_AUTH_BUFFER_SIZE-sized buffers until transport + * negotiation is complete, then switch to PQ_GSS_MAX_PACKET_SIZE. */ - PqGSSSendBuffer = malloc(PQ_GSS_SEND_BUFFER_SIZE); - PqGSSRecvBuffer = malloc(PQ_GSS_RECV_BUFFER_SIZE); - PqGSSResultBuffer = malloc(PQ_GSS_RECV_BUFFER_SIZE); + PqGSSSendBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); + PqGSSRecvBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); + PqGSSResultBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); if (!PqGSSSendBuffer || !PqGSSRecvBuffer || !PqGSSResultBuffer) ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), @@ -568,16 +578,16 @@ secure_open_gssapi(Port *port) /* * During initialization, packets are always fully consumed and - * shouldn't ever be over PQ_GSS_RECV_BUFFER_SIZE in length. + * shouldn't ever be over PQ_GSS_AUTH_BUFFER_SIZE in total length. * * Verify on our side that the client doesn't do something funny. */ - if (input.length > PQ_GSS_RECV_BUFFER_SIZE) + if (input.length > PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)) { ereport(COMMERROR, - (errmsg("oversize GSSAPI packet sent by the client (%zu > %d)", + (errmsg("oversize GSSAPI packet sent by the client (%zu > %zu)", (size_t) input.length, - PQ_GSS_RECV_BUFFER_SIZE))); + PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)))); return -1; } @@ -631,12 +641,12 @@ secure_open_gssapi(Port *port) { uint32 netlen = pg_hton32(output.length); - if (output.length > PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)) + if (output.length > PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)) { ereport(COMMERROR, (errmsg("server tried to send oversize GSSAPI packet (%zu > %zu)", (size_t) output.length, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)))); + PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)))); gss_release_buffer(&minor, &output); return -1; } @@ -691,12 +701,29 @@ secure_open_gssapi(Port *port) break; } + /* + * Release the large authentication buffers and allocate the ones we want + * for normal operation. 
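+	 * Once the connection is encrypted, both sides limit packets to
+	 * PQ_GSS_MAX_PACKET_SIZE, so the smaller buffers are sufficient from
+	 * here on.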
+ */ + free(PqGSSSendBuffer); + free(PqGSSRecvBuffer); + free(PqGSSResultBuffer); + PqGSSSendBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + PqGSSRecvBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + PqGSSResultBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + if (!PqGSSSendBuffer || !PqGSSRecvBuffer || !PqGSSResultBuffer) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + PqGSSSendLength = PqGSSSendNext = PqGSSSendConsumed = 0; + PqGSSRecvLength = PqGSSResultLength = PqGSSResultNext = 0; + /* * Determine the max packet size which will fit in our buffer, after * accounting for the length. be_gssapi_write will need this. */ major = gss_wrap_size_limit(&minor, port->gss->ctx, 1, GSS_C_QOP_DEFAULT, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32), + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32), &PqGSSMaxPktSize); if (GSS_ERROR(major)) diff --git a/src/backend/libpq/be-secure-openssl.c b/src/backend/libpq/be-secure-openssl.c index 64ff3ce3d6a7a..37f4d97f20919 100644 --- a/src/backend/libpq/be-secure-openssl.c +++ b/src/backend/libpq/be-secure-openssl.c @@ -87,8 +87,14 @@ static bool ssl_is_server_start; static int ssl_protocol_version_to_openssl(int v); static const char *ssl_protocol_version_to_string(int v); -/* for passing data back from verify_cb() */ -static const char *cert_errdetail; +struct CallbackErr +{ + /* + * Storage for passing certificate verification error details back + * from the callback. + */ + char *cert_errdetail; +}; /* ------------------------------------------------------------ */ /* Public interface */ @@ -443,6 +449,7 @@ be_tls_open_server(Port *port) int waitfor; unsigned long ecode; bool give_proto_hint; + static struct CallbackErr err_context; Assert(!port->ssl); Assert(!port->peer); @@ -477,6 +484,10 @@ be_tls_open_server(Port *port) SSLerrmessage(ERR_get_error())))); return -1; } + + err_context.cert_errdetail = NULL; + SSL_set_ex_data(port->ssl, 0, &err_context); + port->ssl_in_use = true; aloop: @@ -576,7 +587,7 @@ be_tls_open_server(Port *port) (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not accept SSL connection: %s", SSLerrmessage(ecode)), - cert_errdetail ? errdetail_internal("%s", cert_errdetail) : 0, + err_context.cert_errdetail ? errdetail_internal("%s", err_context.cert_errdetail) : 0, give_proto_hint ? errhint("This may indicate that the client does not support any SSL protocol version between %s and %s.", ssl_min_protocol_version ? @@ -585,7 +596,8 @@ be_tls_open_server(Port *port) ssl_protocol_version_to_string(ssl_min_protocol_version) : MIN_OPENSSL_TLS_VERSION, ssl_max_protocol_version ? ssl_protocol_version_to_string(ssl_max_protocol_version) : MAX_OPENSSL_TLS_VERSION) : 0)); - cert_errdetail = NULL; + if (err_context.cert_errdetail) + pfree(err_context.cert_errdetail); break; case SSL_ERROR_ZERO_RETURN: ereport(COMMERROR, @@ -1209,6 +1221,8 @@ verify_cb(int ok, X509_STORE_CTX *ctx) const char *errstring; StringInfoData str; X509 *cert; + SSL *ssl; + struct CallbackErr *cb_err; if (ok) { @@ -1221,6 +1235,13 @@ verify_cb(int ok, X509_STORE_CTX *ctx) errcode = X509_STORE_CTX_get_error(ctx); errstring = X509_verify_cert_error_string(errcode); + /* + * Extract the current SSL and CallbackErr object to use for passing error + * detail back from the callback. + */ + ssl = X509_STORE_CTX_get_ex_data(ctx, SSL_get_ex_data_X509_STORE_CTX_idx()); + cb_err = (struct CallbackErr *) SSL_get_ex_data(ssl, 0); + initStringInfo(&str); appendStringInfo(&str, _("Client certificate verification failed at depth %d: %s."), @@ -1271,7 +1292,7 @@ verify_cb(int ok, X509_STORE_CTX *ctx) } /* Store our detail message to be logged later. 
*/ - cert_errdetail = str.data; + cb_err->cert_errdetail = str.data; return ok; } @@ -1436,10 +1457,10 @@ initialize_ecdh(SSL_CTX *context, bool isServerStart) */ ereport(isServerStart ? FATAL : LOG, errcode(ERRCODE_CONFIG_FILE_ERROR), - errmsg("failed to set group names specified in ssl_groups: %s", + errmsg("could not set group names specified in ssl_groups: %s", SSLerrmessageExt(ERR_get_error(), _("No valid groups found"))), - errhint("Ensure that each group name is spelled correctly and supported by the installed version of OpenSSL")); + errhint("Ensure that each group name is spelled correctly and supported by the installed version of OpenSSL.")); return false; } #endif diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 332fad278351c..4c259f58d77b0 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -138,14 +138,11 @@ static int regexec_auth_token(const char *match, AuthToken *token, static void tokenize_error_callback(void *arg); -/* - * isblank() exists in the ISO C99 spec, but it's not very portable yet, - * so provide our own version. - */ -bool +static bool pg_isblank(const char c) { - return c == ' ' || c == '\t' || c == '\r'; + /* don't accept non-ASCII data */ + return (!IS_HIGHBIT_SET(c) && isblank(c)); } @@ -312,7 +309,7 @@ regcomp_auth_token(AuthToken *token, char *filename, int line_num, if (token->string[0] != '/') return 0; /* nothing to compile */ - token->regex = (regex_t *) palloc0(sizeof(regex_t)); + token->regex = palloc0_object(regex_t); wstr = palloc((strlen(token->string + 1) + 1) * sizeof(pg_wchar)); wlen = pg_mb2wchar_with_len(token->string + 1, wstr, strlen(token->string + 1)); @@ -894,7 +891,7 @@ tokenize_auth_file(const char *filename, FILE *file, List **tok_lines, * to this list. */ oldcxt = MemoryContextSwitchTo(tokenize_context); - tok_line = (TokenizedAuthLine *) palloc0(sizeof(TokenizedAuthLine)); + tok_line = palloc0_object(TokenizedAuthLine); tok_line->fields = current_line; tok_line->file_name = pstrdup(filename); tok_line->line_num = line_number; @@ -1075,7 +1072,7 @@ hostname_match(const char *pattern, const char *actual_hostname) * Check to see if a connecting IP matches a given host name. */ static bool -check_hostname(hbaPort *port, const char *hostname) +check_hostname(Port *port, const char *hostname) { struct addrinfo *gai_result, *gai; @@ -1342,7 +1339,7 @@ parse_hba_line(TokenizedAuthLine *tok_line, int elevel) AuthToken *token; HbaLine *parsedline; - parsedline = palloc0(sizeof(HbaLine)); + parsedline = palloc0_object(HbaLine); parsedline->sourcefile = pstrdup(file_name); parsedline->linenumber = line_num; parsedline->rawline = pstrdup(tok_line->raw_line); @@ -2528,7 +2525,7 @@ parse_hba_auth_opt(char *name, char *val, HbaLine *hbaline, * request. */ static void -check_hba(hbaPort *port) +check_hba(Port *port) { Oid roleid; ListCell *line; @@ -2625,7 +2622,7 @@ check_hba(hbaPort *port) } /* If no matching entry was found, then implicitly reject. 
*/ - hba = palloc0(sizeof(HbaLine)); + hba = palloc0_object(HbaLine); hba->auth_method = uaImplicitReject; port->hba = hba; } @@ -2761,7 +2758,7 @@ parse_ident_line(TokenizedAuthLine *tok_line, int elevel) Assert(tok_line->fields != NIL); field = list_head(tok_line->fields); - parsedline = palloc0(sizeof(IdentLine)); + parsedline = palloc0_object(IdentLine); parsedline->linenumber = line_num; /* Get the map token (must exist) */ @@ -2873,8 +2870,11 @@ check_ident_usermap(IdentLine *identLine, const char *usermap_name, !token_has_regexp(identLine->pg_user) && (ofs = strstr(identLine->pg_user->string, "\\1")) != NULL) { + const char *repl_str; + size_t repl_len; + char *old_pg_user; char *expanded_pg_user; - int offset; + size_t offset; /* substitution of the first argument requested */ if (matches[1].rm_so < 0) @@ -2886,18 +2886,33 @@ check_ident_usermap(IdentLine *identLine, const char *usermap_name, *error_p = true; return; } + repl_str = system_user + matches[1].rm_so; + repl_len = matches[1].rm_eo - matches[1].rm_so; /* - * length: original length minus length of \1 plus length of match - * plus null terminator + * It's allowed to have more than one \1 in the string, and we'll + * replace them all. But that's pretty unusual so we optimize on + * the assumption of only one occurrence, which motivates doing + * repeated replacements instead of making two passes over the + * string to determine the final length right away. */ - expanded_pg_user = palloc0(strlen(identLine->pg_user->string) - 2 + (matches[1].rm_eo - matches[1].rm_so) + 1); - offset = ofs - identLine->pg_user->string; - memcpy(expanded_pg_user, identLine->pg_user->string, offset); - memcpy(expanded_pg_user + offset, - system_user + matches[1].rm_so, - matches[1].rm_eo - matches[1].rm_so); - strcat(expanded_pg_user, ofs + 2); + old_pg_user = identLine->pg_user->string; + do + { + /* + * length: current length minus length of \1 plus length of + * replacement plus null terminator + */ + expanded_pg_user = palloc(strlen(old_pg_user) - 2 + repl_len + 1); + /* ofs points into the old_pg_user string at this point */ + offset = ofs - old_pg_user; + memcpy(expanded_pg_user, old_pg_user, offset); + memcpy(expanded_pg_user + offset, repl_str, repl_len); + strcpy(expanded_pg_user + offset + repl_len, ofs + 2); + if (old_pg_user != identLine->pg_user->string) + pfree(old_pg_user); + old_pg_user = expanded_pg_user; + } while ((ofs = strstr(old_pg_user + offset + repl_len, "\\1")) != NULL); /* * Mark the token as quoted, so it will only be compared literally @@ -3107,7 +3122,7 @@ load_ident(void) * method = uaImplicitReject. */ void -hba_getauthmethod(hbaPort *port) +hba_getauthmethod(Port *port) { check_hba(port); } diff --git a/src/backend/libpq/pg_ident.conf.sample b/src/backend/libpq/pg_ident.conf.sample index f5225f26cdf2c..8ee6c0ba31576 100644 --- a/src/backend/libpq/pg_ident.conf.sample +++ b/src/backend/libpq/pg_ident.conf.sample @@ -13,25 +13,25 @@ # user names to their corresponding PostgreSQL user names. Records # are of the form: # -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME # # (The uppercase quantities must be replaced by actual values.) # # MAPNAME is the (otherwise freely chosen) map name that was used in # pg_hba.conf. SYSTEM-USERNAME is the detected user name of the -# client. PG-USERNAME is the requested PostgreSQL user name. The -# existence of a record specifies that SYSTEM-USERNAME may connect as -# PG-USERNAME. +# client. DATABASE-USERNAME is the requested PostgreSQL user name. 
+# The existence of a record specifies that SYSTEM-USERNAME may connect +# as DATABASE-USERNAME. # -# If SYSTEM-USERNAME starts with a slash (/), it will be treated as a -# regular expression. Optionally this can contain a capture (a -# parenthesized subexpression). The substring matching the capture -# will be substituted for \1 (backslash-one) if present in -# PG-USERNAME. +# If SYSTEM-USERNAME starts with a slash (/), the rest of it will be +# treated as a regular expression. Optionally this can contain a capture +# (a parenthesized subexpression). The substring matching the capture +# will be substituted for \1 (backslash-one) if that appears in +# DATABASE-USERNAME. # -# PG-USERNAME can be "all", a user name, a group name prefixed with "+", or -# a regular expression (if it starts with a slash (/)). If it is a regular -# expression, the substring matching with \1 has no effect. +# DATABASE-USERNAME can be "all", a user name, a group name prefixed with "+", +# or a regular expression (if it starts with a slash (/)). If it is a regular +# expression, no substitution for \1 will occur. # # Multiple maps may be specified in this file and used by pg_hba.conf. # @@ -69,4 +69,4 @@ # Put your actual configuration here # ---------------------------------- -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index e5171467de18d..e33623fdbace7 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -178,7 +178,7 @@ pq_init(ClientSocket *client_sock) int latch_pos PG_USED_FOR_ASSERTS_ONLY; /* allocate the Port struct and copy the ClientSocket contents to it */ - port = palloc0(sizeof(Port)); + port = palloc0_object(Port); port->sock = client_sock->sock; memcpy(&port->raddr.addr, &client_sock->raddr.addr, client_sock->raddr.salen); port->raddr.salen = client_sock->raddr.salen; @@ -454,9 +454,9 @@ ListenServerPort(int family, const char *hostName, unsigned short portNumber, if (strlen(unixSocketPath) >= UNIXSOCK_PATH_BUFLEN) { ereport(LOG, - (errmsg("Unix-domain socket path \"%s\" is too long (maximum %d bytes)", + (errmsg("Unix-domain socket path \"%s\" is too long (maximum %zu bytes)", unixSocketPath, - (int) (UNIXSOCK_PATH_BUFLEN - 1)))); + (UNIXSOCK_PATH_BUFLEN - 1)))); return STATUS_ERROR; } if (Lock_AF_UNIX(unixSocketDir, unixSocketPath) != STATUS_OK) @@ -618,10 +618,10 @@ ListenServerPort(int family, const char *hostName, unsigned short portNumber, saved_errno == EADDRINUSE ? (addr->ai_family == AF_UNIX ? errhint("Is another postmaster already running on port %d?", - (int) portNumber) : + portNumber) : errhint("Is another postmaster already running on port %d?" 
" If not, wait a few seconds and retry.", - (int) portNumber)) : 0)); + portNumber)) : 0)); closesocket(fd); continue; } @@ -662,7 +662,7 @@ ListenServerPort(int family, const char *hostName, unsigned short portNumber, ereport(LOG, /* translator: first %s is IPv4 or IPv6 */ (errmsg("listening on %s address \"%s\", port %d", - familyDesc, addrDesc, (int) portNumber))); + familyDesc, addrDesc, portNumber))); ListenSockets[*NumListenSockets] = fd; (*NumListenSockets)++; @@ -858,7 +858,6 @@ RemoveSocketFiles(void) (void) unlink(sock_path); } /* Since we're about to exit, no need to reclaim storage */ - sock_paths = NIL; } diff --git a/src/backend/libpq/pqformat.c b/src/backend/libpq/pqformat.c index 1cc126772f7c0..67bdd3d93d05f 100644 --- a/src/backend/libpq/pqformat.c +++ b/src/backend/libpq/pqformat.c @@ -307,9 +307,8 @@ pq_endmessage(StringInfo buf) * * The data buffer is *not* freed, allowing to reuse the buffer with * pq_beginmessage_reuse. - -------------------------------- + * -------------------------------- */ - void pq_endmessage_reuse(StringInfo buf) { diff --git a/src/backend/libpq/pqmq.c b/src/backend/libpq/pqmq.c index f1a08bc32ca17..2b75de0ddef94 100644 --- a/src/backend/libpq/pqmq.c +++ b/src/backend/libpq/pqmq.c @@ -23,7 +23,7 @@ #include "tcop/tcopprot.h" #include "utils/builtins.h" -static shm_mq_handle *pq_mq_handle; +static shm_mq_handle *pq_mq_handle = NULL; static bool pq_mq_busy = false; static pid_t pq_mq_parallel_leader_pid = 0; static ProcNumber pq_mq_parallel_leader_proc_number = INVALID_PROC_NUMBER; @@ -66,7 +66,11 @@ pq_redirect_to_shm_mq(dsm_segment *seg, shm_mq_handle *mqh) static void pq_cleanup_redirect_to_shm_mq(dsm_segment *seg, Datum arg) { - pq_mq_handle = NULL; + if (pq_mq_handle != NULL) + { + pfree(pq_mq_handle); + pq_mq_handle = NULL; + } whereToSendOutput = DestNone; } @@ -131,8 +135,11 @@ mq_putmessage(char msgtype, const char *s, size_t len) if (pq_mq_busy) { if (pq_mq_handle != NULL) + { shm_mq_detach(pq_mq_handle); - pq_mq_handle = NULL; + pfree(pq_mq_handle); + pq_mq_handle = NULL; + } return EOF; } @@ -152,8 +159,6 @@ mq_putmessage(char msgtype, const char *s, size_t len) iov[1].data = s; iov[1].len = len; - Assert(pq_mq_handle != NULL); - for (;;) { /* @@ -161,6 +166,7 @@ mq_putmessage(char msgtype, const char *s, size_t len) * that the shared memory value is updated before we send the parallel * message signal right after this. */ + Assert(pq_mq_handle != NULL); result = shm_mq_sendv(pq_mq_handle, iov, 2, true, true); if (pq_mq_parallel_leader_pid != 0) @@ -323,7 +329,7 @@ pq_parse_errornotice(StringInfo msg, ErrorData *edata) edata->funcname = pstrdup(value); break; default: - elog(ERROR, "unrecognized error field code: %d", (int) code); + elog(ERROR, "unrecognized error field code: %d", code); break; } } diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 7d63cf94a6b44..72aaee36a6872 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -125,13 +125,17 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("postgres")); /* - * In the postmaster, absorb the environment values for LC_COLLATE and - * LC_CTYPE. Individual backends will change these later to settings - * taken from pg_database, but the postmaster cannot do that. If we leave - * these set to "C" then message localization might not work well in the - * postmaster. + * Collation is handled by pg_locale.c, and the behavior is dependent on + * the provider. strcoll(), etc., should not be called directly. 
+ */ + init_locale("LC_COLLATE", LC_COLLATE, "C"); + + /* + * In the postmaster, absorb the environment value for LC_CTYPE. + * Individual backends will change it later to pg_database.datctype, but + * the postmaster cannot do that. If we leave it set to "C" then message + * localization might not work well in the postmaster. */ - init_locale("LC_COLLATE", LC_COLLATE, ""); init_locale("LC_CTYPE", LC_CTYPE, ""); /* @@ -482,20 +486,29 @@ check_root(const char *progname) /* * At least on linux, set_ps_display() breaks /proc/$pid/environ. The * sanitizer library uses /proc/$pid/environ to implement getenv() as it wants - * to work independent of libc. When just using undefined and alignment - * sanitizers, the sanitizer library is only initialized when the first error - * occurs, by which time we've often already called set_ps_display(), - * preventing the sanitizer libraries from seeing the options. + * to work independent of libc. Depending on which sanitizers are enabled, + * the sanitizer library may not get initialized until after we've called + * set_ps_display(), preventing the sanitizer from seeing environment-supplied + * options. * * We can work around that by defining __ubsan_default_options, a weak symbol * libsanitizer uses to get defaults from the application, and return * getenv("UBSAN_OPTIONS"). But only if main already was reached, so that we * don't end up relying on a not-yet-working getenv(). * + * On the other hand, with different sanitizers enabled, libsanitizer can + * call this so early that it's not fully initialized itself, resulting in + * recursion and a core dump within libsanitizer. To prevent that, ensure + * that this function is built without any sanitizer callbacks in it. + * * As this function won't get called when not running a sanitizer, it doesn't * seem necessary to only compile it conditionally. 
*/ const char *__ubsan_default_options(void); + +#if __has_attribute(disable_sanitizer_instrumentation) +__attribute__((disable_sanitizer_instrumentation)) +#endif const char * __ubsan_default_options(void) { diff --git a/src/backend/meson.build b/src/backend/meson.build index 2b0db21480470..b831a541652bc 100644 --- a/src/backend/meson.build +++ b/src/backend/meson.build @@ -169,7 +169,7 @@ backend_mod_code = declare_dependency( compile_args: pg_mod_c_args, include_directories: postgres_inc, link_args: pg_mod_link_args, - sources: generated_headers + generated_backend_headers, + sources: generated_backend_headers_stamp, dependencies: backend_mod_deps, ) diff --git a/src/backend/nls.mk b/src/backend/nls.mk index b7d5dd46e4513..698b1083f4bd6 100644 --- a/src/backend/nls.mk +++ b/src/backend/nls.mk @@ -28,7 +28,7 @@ GETTEXT_FLAGS = $(BACKEND_COMMON_GETTEXT_FLAGS) \ error_cb:2:c-format gettext-files: generated-parser-sources generated-headers - find $(srcdir) $(srcdir)/../common $(srcdir)/../port -name '*.c' -print | LC_ALL=C sort >$@ + find $(srcdir) $(srcdir)/../common $(srcdir)/../port $(srcdir)/../include/ \( -name '*.c' -o -name "proctypelist.h" \) -print | LC_ALL=C sort >$@ my-clean: rm -f gettext-files diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index bf512cf806ff7..7b1e9d94103f8 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -538,7 +538,6 @@ bms_is_member(int x, const Bitmapset *a) int bms_member_index(Bitmapset *a, int x) { - int i; int bitnum; int wordnum; int result = 0; @@ -554,7 +553,7 @@ bms_member_index(Bitmapset *a, int x) bitnum = BITNUM(x); /* count bits in preceding words */ - for (i = 0; i < wordnum; i++) + for (int i = 0; i < wordnum; i++) { bitmapword w = a->words[i]; @@ -1306,7 +1305,6 @@ int bms_next_member(const Bitmapset *a, int prevbit) { int nwords; - int wordnum; bitmapword mask; Assert(bms_is_valid_set(a)); @@ -1316,7 +1314,7 @@ bms_next_member(const Bitmapset *a, int prevbit) nwords = a->nwords; prevbit++; mask = (~(bitmapword) 0) << BITNUM(prevbit); - for (wordnum = WORDNUM(prevbit); wordnum < nwords; wordnum++) + for (int wordnum = WORDNUM(prevbit); wordnum < nwords; wordnum++) { bitmapword w = a->words[wordnum]; @@ -1343,7 +1341,7 @@ bms_next_member(const Bitmapset *a, int prevbit) * * Returns largest member less than "prevbit", or -2 if there is none. * "prevbit" must NOT be more than one above the highest possible bit that can - * be set at the Bitmapset at its current size. + * be set in the Bitmapset at its current size. 
* * To ease finding the highest set bit for the initial loop, the special * prevbit value of -1 can be passed to have the function find the highest @@ -1366,7 +1364,6 @@ bms_next_member(const Bitmapset *a, int prevbit) int bms_prev_member(const Bitmapset *a, int prevbit) { - int wordnum; int ushiftbits; bitmapword mask; @@ -1379,6 +1376,10 @@ bms_prev_member(const Bitmapset *a, int prevbit) if (a == NULL || prevbit == 0) return -2; + /* Validate callers didn't give us something out of range */ + Assert(prevbit <= a->nwords * BITS_PER_BITMAPWORD); + Assert(prevbit >= -1); + /* transform -1 to the highest possible bit we could have set */ if (prevbit == -1) prevbit = a->nwords * BITS_PER_BITMAPWORD - 1; @@ -1387,7 +1388,7 @@ bms_prev_member(const Bitmapset *a, int prevbit) ushiftbits = BITS_PER_BITMAPWORD - (BITNUM(prevbit) + 1); mask = (~(bitmapword) 0) >> ushiftbits; - for (wordnum = WORDNUM(prevbit); wordnum >= 0; wordnum--) + for (int wordnum = WORDNUM(prevbit); wordnum >= 0; wordnum--) { bitmapword w = a->words[wordnum]; diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 475693b08bc5a..efd02eb01c405 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -204,7 +204,7 @@ copyObjectImpl(const void *from) default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(from)); - retval = 0; /* keep compiler quiet */ + retval = NULL; /* keep compiler quiet */ break; } diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl index 77659b0f76020..9ecddb1423143 100644 --- a/src/backend/nodes/gen_node_support.pl +++ b/src/backend/nodes/gen_node_support.pl @@ -1039,6 +1039,11 @@ sub elem print $off "\tWRITE_UINT_FIELD($f);\n"; print $rff "\tREAD_UINT_FIELD($f);\n" unless $no_read; } + elsif ($t eq 'int64') + { + print $off "\tWRITE_INT64_FIELD($f);\n"; + print $rff "\tREAD_INT64_FIELD($f);\n" unless $no_read; + } elsif ($t eq 'uint64' || $t eq 'AclMode') { @@ -1324,7 +1329,7 @@ sub elem # Node type. Squash constants if requested. if ($query_jumble_squash) { - print $jff "\tJUMBLE_ELEMENTS($f);\n" + print $jff "\tJUMBLE_ELEMENTS($f, node);\n" unless $query_jumble_ignore; } else diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 7bc823507f1b3..024a2b2fd8416 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -1274,12 +1274,8 @@ exprSetCollation(Node *expr, Oid collation) } break; case T_JsonBehavior: - { - JsonBehavior *behavior = (JsonBehavior *) expr; - - if (behavior->expr) - exprSetCollation(behavior->expr, collation); - } + Assert(((JsonBehavior *) expr)->expr == NULL || + exprCollation(((JsonBehavior *) expr)->expr) == collation); break; case T_NullTest: /* NullTest's result is boolean ... 
*/ @@ -2957,7 +2953,7 @@ expression_tree_mutator_impl(Node *node, */ #define FLATCOPY(newnode, node, nodetype) \ - ( (newnode) = (nodetype *) palloc(sizeof(nodetype)), \ + ( (newnode) = palloc_object(nodetype), \ memcpy((newnode), (node), sizeof(nodetype)) ) #define MUTATE(newfield, oldfield, fieldtype) \ @@ -4830,9 +4826,7 @@ planstate_walk_members(PlanState **planstates, int nplans, planstate_tree_walker_callback walker, void *context) { - int j; - - for (j = 0; j < nplans; j++) + for (int j = 0; j < nplans; j++) { if (PSWALK(planstates[j])) return true; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index ceac3fd862014..808909537e4cd 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -51,6 +51,12 @@ static void outDouble(StringInfo str, double d); #define WRITE_UINT_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname) +/* Write a signed integer field (anything written with INT64_FORMAT) */ +#define WRITE_INT64_FIELD(fldname) \ + appendStringInfo(str, \ + " :" CppAsString(fldname) " " INT64_FORMAT, \ + node->fldname) + /* Write an unsigned integer field (anything written with UINT64_FORMAT) */ #define WRITE_UINT64_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " " UINT64_FORMAT, \ @@ -340,8 +346,7 @@ outBitmapset(StringInfo str, const Bitmapset *bms) void outDatum(StringInfo str, Datum value, int typlen, bool typbyval) { - Size length, - i; + Size length; char *s; length = datumGetSize(value, typbyval, typlen); @@ -349,20 +354,20 @@ outDatum(StringInfo str, Datum value, int typlen, bool typbyval) if (typbyval) { s = (char *) (&value); - appendStringInfo(str, "%u [ ", (unsigned int) length); - for (i = 0; i < (Size) sizeof(Datum); i++) + appendStringInfo(str, "%zu [ ", length); + for (Size i = 0; i < (Size) sizeof(Datum); i++) appendStringInfo(str, "%d ", (int) (s[i])); appendStringInfoChar(str, ']'); } else { s = (char *) DatumGetPointer(value); - if (!PointerIsValid(s)) + if (!s) appendStringInfoString(str, "0 [ ]"); else { - appendStringInfo(str, "%u [ ", (unsigned int) length); - for (i = 0; i < length; i++) + appendStringInfo(str, "%zu [ ", length); + for (Size i = 0; i < length; i++) appendStringInfo(str, "%d ", (int) (s[i])); appendStringInfoChar(str, ']'); } @@ -428,8 +433,6 @@ _outBoolExpr(StringInfo str, const BoolExpr *node) static void _outForeignKeyOptInfo(StringInfo str, const ForeignKeyOptInfo *node) { - int i; - WRITE_NODE_TYPE("FOREIGNKEYOPTINFO"); WRITE_UINT_FIELD(con_relid); @@ -444,10 +447,10 @@ _outForeignKeyOptInfo(StringInfo str, const ForeignKeyOptInfo *node) WRITE_INT_FIELD(nmatched_ri); /* for compactness, just print the number of matches per column: */ appendStringInfoString(str, " :eclass"); - for (i = 0; i < node->nkeys; i++) + for (int i = 0; i < node->nkeys; i++) appendStringInfo(str, " %d", (node->eclass[i] != NULL)); appendStringInfoString(str, " :rinfos"); - for (i = 0; i < node->nkeys; i++) + for (int i = 0; i < node->nkeys; i++) appendStringInfo(str, " %d", list_length(node->rinfos[i])); } @@ -647,6 +650,8 @@ _outA_Expr(StringInfo str, const A_Expr *node) WRITE_NODE_FIELD(lexpr); WRITE_NODE_FIELD(rexpr); + WRITE_LOCATION_FIELD(rexpr_list_start); + WRITE_LOCATION_FIELD(rexpr_list_end); WRITE_LOCATION_FIELD(location); } diff --git a/src/backend/nodes/params.c b/src/backend/nodes/params.c index ec5946c5777dc..aeb8ace2c5419 100644 --- a/src/backend/nodes/params.c +++ b/src/backend/nodes/params.c @@ -166,13 +166,12 @@ paramlist_param_ref(ParseState 
*pstate, ParamRef *pref) Size EstimateParamListSpace(ParamListInfo paramLI) { - int i; Size sz = sizeof(int); if (paramLI == NULL || paramLI->numParams <= 0) return sz; - for (i = 0; i < paramLI->numParams; i++) + for (int i = 0; i < paramLI->numParams; i++) { ParamExternData *prm; ParamExternData prmdata; @@ -229,7 +228,6 @@ void SerializeParamList(ParamListInfo paramLI, char **start_address) { int nparams; - int i; /* Write number of parameters. */ if (paramLI == NULL || paramLI->numParams <= 0) @@ -240,7 +238,7 @@ SerializeParamList(ParamListInfo paramLI, char **start_address) *start_address += sizeof(int); /* Write each parameter in turn. */ - for (i = 0; i < nparams; i++) + for (int i = 0; i < nparams; i++) { ParamExternData *prm; ParamExternData prmdata; diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c index d1e82a63f09a8..ffc230af4278c 100644 --- a/src/backend/nodes/queryjumblefuncs.c +++ b/src/backend/nodes/queryjumblefuncs.c @@ -21,6 +21,11 @@ * tree(s) generated from the query. The executor can then use this value * to blame query costs on the proper queryId. * + * Arrays of two or more constants and PARAM_EXTERN parameters are "squashed" + * and contribute only once to the jumble. This has the effect that queries + * that differ only on the length of such lists have the same queryId. + * + * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -56,16 +61,18 @@ int compute_query_id = COMPUTE_QUERY_ID_AUTO; bool query_id_enabled = false; static JumbleState *InitJumble(void); -static uint64 DoJumble(JumbleState *jstate, Node *node); +static int64 DoJumble(JumbleState *jstate, Node *node); static void AppendJumble(JumbleState *jstate, const unsigned char *value, Size size); static void FlushPendingNulls(JumbleState *jstate); static void RecordConstLocation(JumbleState *jstate, - int location, bool squashed); + bool extern_param, + int location, int len); static void _jumbleNode(JumbleState *jstate, Node *node); -static void _jumbleElements(JumbleState *jstate, List *elements); -static void _jumbleA_Const(JumbleState *jstate, Node *node); static void _jumbleList(JumbleState *jstate, Node *node); +static void _jumbleElements(JumbleState *jstate, List *elements, Node *node); +static void _jumbleParam(JumbleState *jstate, Node *node); +static void _jumbleA_Const(JumbleState *jstate, Node *node); static void _jumbleVariableSetStmt(JumbleState *jstate, Node *node); static void _jumbleRangeTblEntry_eref(JumbleState *jstate, RangeTblEntry *rte, @@ -141,12 +148,12 @@ JumbleQuery(Query *query) * If we are unlucky enough to get a hash of zero, use 1 instead for * normal statements and 2 for utility queries. 
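+	 * (A queryId of zero is reserved to mean that no query ID has been
+	 * computed, e.g. because jumbling is disabled.)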
*/ - if (query->queryId == UINT64CONST(0)) + if (query->queryId == INT64CONST(0)) { if (query->utilityStmt) - query->queryId = UINT64CONST(2); + query->queryId = INT64CONST(2); else - query->queryId = UINT64CONST(1); + query->queryId = INT64CONST(1); } return jstate; @@ -174,7 +181,7 @@ InitJumble(void) { JumbleState *jstate; - jstate = (JumbleState *) palloc(sizeof(JumbleState)); + jstate = palloc_object(JumbleState); /* Set up workspace for query jumbling */ jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE); @@ -185,6 +192,7 @@ InitJumble(void) jstate->clocations_count = 0; jstate->highest_extern_param_id = 0; jstate->pending_nulls = 0; + jstate->has_squashed_lists = false; #ifdef USE_ASSERT_CHECKING jstate->total_jumble_len = 0; #endif @@ -197,7 +205,7 @@ InitJumble(void) * Jumble the given Node using the given JumbleState and return the resulting * jumble hash. */ -static uint64 +static int64 DoJumble(JumbleState *jstate, Node *node) { /* Jumble the given node */ @@ -207,10 +215,14 @@ DoJumble(JumbleState *jstate, Node *node) if (jstate->pending_nulls > 0) FlushPendingNulls(jstate); + /* Squashed list found, reset highest_extern_param_id */ + if (jstate->has_squashed_lists) + jstate->highest_extern_param_id = 0; + /* Process the jumble buffer and produce the hash value */ - return DatumGetUInt64(hash_any_extended(jstate->jumble, - jstate->jumble_len, - 0)); + return DatumGetInt64(hash_any_extended(jstate->jumble, + jstate->jumble_len, + 0)); } /* @@ -256,10 +268,10 @@ AppendJumbleInternal(JumbleState *jstate, const unsigned char *item, if (unlikely(jumble_len >= JUMBLE_SIZE)) { - uint64 start_hash; + int64 start_hash; - start_hash = DatumGetUInt64(hash_any_extended(jumble, - JUMBLE_SIZE, 0)); + start_hash = DatumGetInt64(hash_any_extended(jumble, + JUMBLE_SIZE, 0)); memcpy(jumble, &start_hash, sizeof(start_hash)); jumble_len = sizeof(start_hash); } @@ -373,15 +385,17 @@ FlushPendingNulls(JumbleState *jstate) /* - * Record location of constant within query string of query tree that is - * currently being walked. + * Record the location of some kind of constant within a query string. + * These are not only bare constants but also expressions that ultimately + * constitute a constant, such as those inside casts and simple function + * calls; if extern_param, then it corresponds to a PARAM_EXTERN Param. * - * 'squashed' signals that the constant represents the first or the last - * element in a series of merged constants, and everything but the first/last - * element contributes nothing to the jumble hash. + * If length is -1, it indicates a single such constant element. If + * it's a positive integer, it indicates the length of a squashable + * list of them. */ static void -RecordConstLocation(JumbleState *jstate, int location, bool squashed) +RecordConstLocation(JumbleState *jstate, bool extern_param, int location, int len) { /* -1 indicates unknown or undefined location */ if (location >= 0) @@ -396,9 +410,15 @@ RecordConstLocation(JumbleState *jstate, int location, bool squashed) sizeof(LocationLen)); } jstate->clocations[jstate->clocations_count].location = location; - /* initialize lengths to -1 to simplify third-party module usage */ - jstate->clocations[jstate->clocations_count].squashed = squashed; - jstate->clocations[jstate->clocations_count].length = -1; + + /* + * Lengths are either positive integers (indicating a squashable + * list), or -1. 
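+		 * A length of -1 denotes a single constant or PARAM_EXTERN
+		 * parameter.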
+ */ + Assert(len == -1 || len > 0); + jstate->clocations[jstate->clocations_count].length = len; + jstate->clocations[jstate->clocations_count].squashed = (len > -1); + jstate->clocations[jstate->clocations_count].extern_param = extern_param; jstate->clocations_count++; } } @@ -407,47 +427,74 @@ * Subroutine for _jumbleElements: Verify a few simple cases where we can * deduce that the expression is a constant: * - * - Ignore a possible wrapping RelabelType and CoerceViaIO. - * - If it's a FuncExpr, check that the function is an implicit + * - See through any wrapping RelabelType and CoerceViaIO layers. + * - If it's a FuncExpr, check that the function is a builtin * cast and its arguments are Const. - * - Otherwise test if the expression is a simple Const. + * - Otherwise test if the expression is a simple Const or a + * PARAM_EXTERN param. */ static bool -IsSquashableConst(Node *element) +IsSquashableConstant(Node *element) { - if (IsA(element, RelabelType)) - element = (Node *) ((RelabelType *) element)->arg; - - if (IsA(element, CoerceViaIO)) - element = (Node *) ((CoerceViaIO *) element)->arg; - - if (IsA(element, FuncExpr)) +restart: + switch (nodeTag(element)) { - FuncExpr *func = (FuncExpr *) element; - ListCell *temp; + case T_RelabelType: + /* Unwrap RelabelType */ + element = (Node *) ((RelabelType *) element)->arg; + goto restart; - if (func->funcformat != COERCE_IMPLICIT_CAST && - func->funcformat != COERCE_EXPLICIT_CAST) - return false; + case T_CoerceViaIO: + /* Unwrap CoerceViaIO */ + element = (Node *) ((CoerceViaIO *) element)->arg; + goto restart; - if (func->funcid > FirstGenbkiObjectId) - return false; + case T_Const: + return true; - foreach(temp, func->args) - { - Node *arg = lfirst(temp); + case T_Param: + return castNode(Param, element)->paramkind == PARAM_EXTERN; - if (!IsA(arg, Const)) /* XXX we could recurse here instead */ - return false; - } + case T_FuncExpr: + { + FuncExpr *func = (FuncExpr *) element; + ListCell *temp; - return true; - } + if (func->funcformat != COERCE_IMPLICIT_CAST && + func->funcformat != COERCE_EXPLICIT_CAST) + return false; - if (!IsA(element, Const)) - return false; + if (func->funcid > FirstGenbkiObjectId) + return false; - return true; + /* + * We can check function arguments recursively, being careful + * about recursing too deep. At each recursion level it's + * enough to test the stack on the first element. (Note that + * I wasn't able to hit this without bloating the stack + * artificially in this function: the parser errors out before + * stack size becomes a problem here.) + */ + foreach(temp, func->args) + { + Node *arg = lfirst(temp); + + if (!IsA(arg, Const)) + { + if (foreach_current_index(temp) == 0 && + stack_is_too_deep()) + return false; + else if (!IsSquashableConstant(arg)) + return false; + } + } + + return true; + } + + default: + return false; + } } /* @@ -457,39 +504,33 @@ IsSquashableConst(Node *element) * Return value indicates if squashing is possible. * * Note that this function searches only for explicit Const nodes with - * possibly very simple decorations on top, and does not try to simplify - * expressions. + * possibly very simple decorations on top and PARAM_EXTERN parameters, + * and does not try to simplify expressions. 
*/ static bool -IsSquashableConstList(List *elements, Node **firstExpr, Node **lastExpr) +IsSquashableConstantList(List *elements) { ListCell *temp; - /* - * If squashing is disabled, or the list is too short, we don't try to - * squash it. - */ + /* If the list is too short, we don't try to squash it. */ if (list_length(elements) < 2) return false; foreach(temp, elements) { - if (!IsSquashableConst(lfirst(temp))) + if (!IsSquashableConstant(lfirst(temp))) return false; } - *firstExpr = linitial(elements); - *lastExpr = llast(elements); - return true; } #define JUMBLE_NODE(item) \ _jumbleNode(jstate, (Node *) expr->item) -#define JUMBLE_ELEMENTS(list) \ - _jumbleElements(jstate, (List *) expr->list) +#define JUMBLE_ELEMENTS(list, node) \ + _jumbleElements(jstate, (List *) expr->list, node) #define JUMBLE_LOCATION(location) \ - RecordConstLocation(jstate, expr->location, false) + RecordConstLocation(jstate, false, expr->location, -1) #define JUMBLE_FIELD(item) \ do { \ if (sizeof(expr->item) == 8) \ @@ -516,42 +557,6 @@ do { \ #include "queryjumblefuncs.funcs.c" -/* - * We jumble lists of constant elements as one individual item regardless - * of how many elements are in the list. This means different queries - * jumble to the same query_id, if the only difference is the number of - * elements in the list. - */ -static void -_jumbleElements(JumbleState *jstate, List *elements) -{ - Node *first, - *last; - - if (IsSquashableConstList(elements, &first, &last)) - { - /* - * If this list of elements is squashable, keep track of the location - * of its first and last elements. When reading back the locations - * array, we'll see two consecutive locations with ->squashed set to - * true, indicating the location of initial and final elements of this - * list. - * - * For the limited set of cases we support now (implicit coerce via - * FuncExpr, Const) it's fine to use exprLocation of the 'last' - * expression, but if more complex composite expressions are to be - * supported (e.g., OpExpr or FuncExpr as an explicit call), more - * sophisticated tracking will be needed. - */ - RecordConstLocation(jstate, exprLocation(first), true); - RecordConstLocation(jstate, exprLocation(last), true); - } - else - { - _jumbleNode(jstate, (Node *) elements); - } -} - static void _jumbleNode(JumbleState *jstate, Node *node) { @@ -593,26 +598,6 @@ _jumbleNode(JumbleState *jstate, Node *node) break; } - /* Special cases to handle outside the automated code */ - switch (nodeTag(expr)) - { - case T_Param: - { - Param *p = (Param *) node; - - /* - * Update the highest Param id seen, in order to start - * normalization correctly. - */ - if (p->paramkind == PARAM_EXTERN && - p->paramid > jstate->highest_extern_param_id) - jstate->highest_extern_param_id = p->paramid; - } - break; - default: - break; - } - /* Ensure we added something to the jumble buffer */ Assert(jstate->total_jumble_len > prev_jumble_len); } @@ -648,6 +633,79 @@ _jumbleList(JumbleState *jstate, Node *node) } } +/* + * We try to jumble lists of expressions as one individual item regardless + * of how many elements are in the list. This is known as squashing, which + * results in different queries jumbling to the same query_id, if the only + * difference is the number of elements in the list. + * + * We allow constants and PARAM_EXTERN parameters to be squashed. To normalize + * such queries, we use the start and end locations of the list of elements in + * a list. 
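+ * (Consumers of the locations array, such as pg_stat_statements, can
+ * then replace the whole list with a single placeholder when building
+ * a normalized query string.)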
+ */ +static void +_jumbleElements(JumbleState *jstate, List *elements, Node *node) +{ + bool normalize_list = false; + + if (IsSquashableConstantList(elements)) + { + if (IsA(node, ArrayExpr)) + { + ArrayExpr *aexpr = (ArrayExpr *) node; + + if (aexpr->list_start > 0 && aexpr->list_end > 0) + { + RecordConstLocation(jstate, + false, + aexpr->list_start + 1, + (aexpr->list_end - aexpr->list_start) - 1); + normalize_list = true; + jstate->has_squashed_lists = true; + } + } + } + + if (!normalize_list) + { + _jumbleNode(jstate, (Node *) elements); + } +} + +/* + * We store the highest param ID of extern params. This can later be used + * to start the numbering of the placeholder for squashed lists. + */ +static void +_jumbleParam(JumbleState *jstate, Node *node) +{ + Param *expr = (Param *) node; + + JUMBLE_FIELD(paramkind); + JUMBLE_FIELD(paramid); + JUMBLE_FIELD(paramtype); + /* paramtypmod and paramcollid are ignored */ + + if (expr->paramkind == PARAM_EXTERN) + { + /* + * At this point, only external parameter locations outside of + * squashable lists will be recorded. + */ + RecordConstLocation(jstate, true, expr->location, -1); + + /* + * Update the highest Param id seen, in order to start normalization + * correctly. + * + * Note: This value is reset at the end of jumbling if there exists a + * squashable list. See the comment in the definition of JumbleState. + */ + if (expr->paramid > jstate->highest_extern_param_id) + jstate->highest_extern_param_id = expr->paramid; + } +} + static void _jumbleA_Const(JumbleState *jstate, Node *node) { diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 64d3a09f765bb..9a8ca27ec10cd 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -68,6 +68,12 @@ token = pg_strtok(&length); /* get field value */ \ local_node->fldname = atoui(token) +/* Read a signed integer field (anything written using INT64_FORMAT) */ +#define READ_INT64_FIELD(fldname) \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get field value */ \ + local_node->fldname = strtoi64(token, NULL, 10) + /* Read an unsigned integer field (anything written using UINT64_FORMAT) */ #define READ_UINT64_FIELD(fldname) \ token = pg_strtok(&length); /* skip :fldname */ \ token = pg_strtok(&length); /* get field value */ \ @@ -520,6 +526,8 @@ _readA_Expr(void) READ_NODE_FIELD(lexpr); READ_NODE_FIELD(rexpr); + READ_LOCATION_FIELD(rexpr_list_start); + READ_LOCATION_FIELD(rexpr_list_end); READ_LOCATION_FIELD(location); READ_DONE(); @@ -591,8 +599,7 @@ parseNodeString(void) Datum readDatum(bool typbyval) { - Size length, - i; + Size length; int tokenLength; const char *token; Datum res; @@ -615,18 +622,18 @@ readDatum(bool typbyval) elog(ERROR, "byval datum but length = %zu", length); res = (Datum) 0; s = (char *) (&res); - for (i = 0; i < (Size) sizeof(Datum); i++) + for (Size i = 0; i < (Size) sizeof(Datum); i++) { token = pg_strtok(&tokenLength); s[i] = (char) atoi(token); } } else if (length <= 0) - res = (Datum) NULL; + res = (Datum) 0; else { s = (char *) palloc(length); - for (i = 0; i < length; i++) + for (Size i = 0; i < length; i++) { token = pg_strtok(&tokenLength); s[i] = (char) atoi(token); diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 41031aa8f2fa8..a2fa96317dbb7 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -40,6 +40,7 @@ #include +#include "access/htup_details.h" #include "common/hashfn.h" #include "common/int.h" #include "nodes/bitmapset.h" @@ -363,15 +364,14 @@ 
tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp) * TBMIterateResult when any of these tuples are reported out. */ void -tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, +tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck) { BlockNumber currblk = InvalidBlockNumber; PagetableEntry *page = NULL; /* only valid when currblk is valid */ - int i; Assert(tbm->iterating == TBM_NOT_ITERATING); - for (i = 0; i < ntids; i++) + for (int i = 0; i < ntids; i++) { BlockNumber blk = ItemPointerGetBlockNumber(tids + i); OffsetNumber off = ItemPointerGetOffsetNumber(tids + i); @@ -470,12 +470,11 @@ static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage) { PagetableEntry *apage; - int wordnum; if (bpage->ischunk) { /* Scan b's chunk, mark each indicated page lossy in a */ - for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) + for (int wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) { bitmapword w = bpage->words[wordnum]; @@ -510,7 +509,7 @@ tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage) else { /* Both pages are exact, merge at the bit level */ - for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + for (int wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) apage->words[wordnum] |= bpage->words[wordnum]; apage->recheck |= bpage->recheck; } @@ -578,14 +577,13 @@ static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) { const PagetableEntry *bpage; - int wordnum; if (apage->ischunk) { /* Scan each bit in chunk, try to clear */ bool candelete = true; - for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) + for (int wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) { bitmapword w = apage->words[wordnum]; @@ -639,7 +637,7 @@ tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) { /* Both pages are exact, merge at the bit level */ Assert(!bpage->ischunk); - for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + for (int wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) { apage->words[wordnum] &= bpage->words[wordnum]; if (apage->words[wordnum] != 0) @@ -685,7 +683,7 @@ tbm_begin_private_iterate(TIDBitmap *tbm) * Create the TBMPrivateIterator struct, with enough trailing space to * serve the needs of the TBMIterateResult sub-struct. */ - iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator)); + iterator = palloc_object(TBMPrivateIterator); iterator->tbm = tbm; /* @@ -903,10 +901,9 @@ tbm_extract_page_tuple(TBMIterateResult *iteritem, uint32 max_offsets) { PagetableEntry *page = iteritem->internal_page; - int wordnum; int ntuples = 0; - for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + for (int wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) { bitmapword w = page->words[wordnum]; @@ -1471,7 +1468,7 @@ tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp) * Create the TBMSharedIterator struct, with enough trailing space to * serve the needs of the TBMIterateResult sub-struct. 
*/ - iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator)); + iterator = palloc0_object(TBMSharedIterator); istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp); diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README index 9c724ccfabf83..6c35baceedb2f 100644 --- a/src/backend/optimizer/README +++ b/src/backend/optimizer/README @@ -640,7 +640,6 @@ RelOptInfo - a relation or joined relations GroupResultPath - childless Result plan node (used for degenerate grouping) MaterialPath - a Material plan node MemoizePath - a Memoize plan node for caching tuples from sub-paths - UniquePath - remove duplicate rows (either by hashing or sorting) GatherPath - collect the results of parallel workers GatherMergePath - collect parallel results, preserving their common sort order ProjectionPath - a Result plan node with child (used for projection) @@ -648,7 +647,7 @@ RelOptInfo - a relation or joined relations SortPath - a Sort plan node applied to some sub-path IncrementalSortPath - an IncrementalSort plan node applied to some sub-path GroupPath - a Group plan node applied to some sub-path - UpperUniquePath - a Unique plan node applied to some sub-path + UniquePath - a Unique plan node applied to some sub-path AggPath - an Agg plan node applied to some sub-path GroupingSetsPath - an Agg plan node used to implement GROUPING SETS MinMaxAggPath - a Result plan node with subplans performing MIN/MAX @@ -1501,3 +1500,113 @@ breaking down aggregation or grouping over a partitioned relation into aggregation or grouping over its partitions is called partitionwise aggregation. Especially when the partition keys match the GROUP BY clause, this can be significantly faster than the regular method. + +Eager aggregation +----------------- + +Eager aggregation is a query optimization technique that partially +pushes aggregation past a join, and finalizes it once all the +relations are joined. Eager aggregation may reduce the number of +input rows to the join and thus could result in a better overall plan. + +To prove that the transformation is correct, let's first consider the +case where only inner joins are involved. In this case, we partition +the tables in the FROM clause into two groups: those that contain at +least one aggregation column, and those that do not contain any +aggregation columns. Each group can be treated as a single relation +formed by the Cartesian product of the tables within that group. +Therefore, without loss of generality, we can assume that the FROM +clause contains exactly two relations, R1 and R2, where R1 represents +the relation containing all aggregation columns, and R2 represents the +relation without any aggregation columns. + +Let the query be of the form: + +SELECT G, AGG(A) +FROM R1 JOIN R2 ON J +GROUP BY G; + +where G is the set of grouping keys that may include columns from R1 +and/or R2; AGG(A) is an aggregate function over columns A from R1; J +is the join condition between R1 and R2. + +The transformation of eager aggregation is: + + GROUP BY G, AGG(A) on (R1 JOIN R2 ON J) + = + GROUP BY G, AGG(agg_A) on ((GROUP BY G1, AGG(A) AS agg_A on R1) JOIN R2 ON J) + +This equivalence holds under the following conditions: + +1) AGG is decomposable, meaning that it can be computed in two stages: +a partial aggregation followed by a final aggregation; +2) The set G1 used in the pre-aggregation of R1 includes: + * all columns from R1 that are part of the grouping keys G, and + * all columns from R1 that appear in the join condition J. 
+3) The grouping operator for any column in G1 must be compatible with +the operator used for that column in the join condition J. + +Since G1 includes all columns from R1 that appear in either the +grouping keys G or the join condition J, all rows within each partial +group have identical values for both the grouping keys and the +join-relevant columns from R1, assuming compatible operators are used. +As a result, the rows within a partial group are indistinguishable in +terms of their contribution to the aggregation and their behavior in +the join. This ensures that all rows in the same partial group share +the same "destiny": they either all match or all fail to match a given +row in R2. Because the aggregate function AGG is decomposable, +aggregating the partial results after the join yields the same final +result as aggregating after the full join, thereby preserving query +semantics. Q.E.D. + +In the case where there are any outer joins, the situation becomes +more complex due to join order constraints and the semantics of +null-extension in outer joins. If the relations that contain at least +one aggregation column cannot be treated as a single relation because +of the join order constraints, partial aggregation paths will not be +generated, and thus the transformation is not applicable. Otherwise, +let R1 be the relation containing all aggregation columns, and R2, R3, +... be the remaining relations. From the inner join case, under the +aforementioned conditions, we have the equivalence: + + GROUP BY G, AGG(A) on (R1 JOIN R2 JOIN R3 ...) + = + GROUP BY G, AGG(agg_A) on ((GROUP BY G1, AGG(A) AS agg_A on R1) JOIN R2 JOIN R3 ...) + +To preserve correctness when outer joins are involved, we require an +additional condition: + +4) R1 must not be on the nullable side of any outer join. + +This condition ensures that partial aggregation over R1 does not +suppress any null-extended rows that would be introduced by outer +joins. If R1 is on the nullable side of an outer join, the +NULL-extended rows produced by the outer join would not be available +when we perform the partial aggregation, while with a +non-eager-aggregation plan these rows are available for the top-level +aggregation. Pushing partial aggregation in this case may result in +the rows being grouped differently than expected, or produce incorrect +values from the aggregate functions. + +During the construction of the join tree, we evaluate each base or +join relation to determine if eager aggregation can be applied. If +feasible, we create a separate RelOptInfo called a "grouped relation" +and generate grouped paths by adding sorted and hashed partial +aggregation paths on top of the non-grouped paths. To limit planning +time, we consider only the cheapest or suitably-sorted non-grouped +paths in this step. + +Another way to generate grouped paths is to join a grouped relation +with a non-grouped relation. Joining two grouped relations is +currently not supported. + +To further limit planning time, we currently adopt a strategy where +partial aggregation is pushed only to the lowest feasible level in the +join tree where it provides a significant reduction in row count. +This strategy also helps ensure that all grouped paths for the same +grouped relation produce the same set of rows, which is important to +support a fundamental assumption of the planner. + +If we have generated a grouped relation for the topmost join relation, +we need to finalize its paths at the end. 
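+
+As a concrete illustration (the table and column names here are
+invented for this example), consider:
+
+SELECT p.category, AVG(s.amount)
+FROM sales s JOIN products p ON s.product_id = p.id
+GROUP BY p.category;
+
+All aggregation columns come from "sales", so it plays the role of R1.
+Eager aggregation may partially aggregate "sales" grouped by
+s.product_id (its column in the join condition J; the grouping key
+p.category comes from the other side), join the partial results to
+"products", and then finalize the aggregates on top of the join.
+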
The final paths will +compete in the usual way with paths built from regular planning. diff --git a/src/backend/optimizer/geqo/geqo_erx.c b/src/backend/optimizer/geqo/geqo_erx.c index af289f7eeb713..f11a59e4a289e 100644 --- a/src/backend/optimizer/geqo/geqo_erx.c +++ b/src/backend/optimizer/geqo/geqo_erx.c @@ -62,7 +62,7 @@ alloc_edge_table(PlannerInfo *root, int num_gene) * directly; 0 will not be used */ - edge_table = (Edge *) palloc((num_gene + 1) * sizeof(Edge)); + edge_table = palloc_array(Edge, num_gene + 1); return edge_table; } diff --git a/src/backend/optimizer/geqo/geqo_eval.c b/src/backend/optimizer/geqo/geqo_eval.c index f07d1dc8ac69b..c65a31d0679d6 100644 --- a/src/backend/optimizer/geqo/geqo_eval.c +++ b/src/backend/optimizer/geqo/geqo_eval.c @@ -162,7 +162,7 @@ geqo_eval(PlannerInfo *root, Gene *tour, int num_gene) RelOptInfo * gimme_tree(PlannerInfo *root, Gene *tour, int num_gene) { - GeqoPrivateData *private = (GeqoPrivateData *) root->join_search_private; + GeqoPrivateData *private = GetGeqoPrivateData(root); List *clumps; int rel_count; @@ -191,7 +191,7 @@ gimme_tree(PlannerInfo *root, Gene *tour, int num_gene) cur_rel_index - 1); /* Make it into a single-rel clump */ - cur_clump = (Clump *) palloc(sizeof(Clump)); + cur_clump = palloc_object(Clump); cur_clump->joinrel = cur_rel; cur_clump->size = 1; @@ -264,6 +264,9 @@ merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, int num_gene, /* Keep searching if join order is not valid */ if (joinrel) { + bool is_top_rel = bms_equal(joinrel->relids, + root->all_query_rels); + /* Create paths for partitionwise joins. */ generate_partitionwise_join_paths(root, joinrel); @@ -273,12 +276,28 @@ merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, int num_gene, * rel once we know the final targetlist (see * grouping_planner). */ - if (!bms_equal(joinrel->relids, root->all_query_rels)) + if (!is_top_rel) generate_useful_gather_paths(root, joinrel, false); /* Find and save the cheapest paths for this joinrel */ set_cheapest(joinrel); + /* + * Except for the topmost scan/join rel, consider generating + * partial aggregation paths for the grouped relation on top + * of the paths of this rel. After that, we're done creating + * paths for the grouped relation, so run set_cheapest(). 
+ */ + if (joinrel->grouped_rel != NULL && !is_top_rel) + { + RelOptInfo *grouped_rel = joinrel->grouped_rel; + + Assert(IS_GROUPED_REL(grouped_rel)); + + generate_grouped_paths(root, grouped_rel, joinrel); + set_cheapest(grouped_rel); + } + /* Absorb new clump into old */ old_clump->joinrel = joinrel; old_clump->size += new_clump->size; diff --git a/src/backend/optimizer/geqo/geqo_main.c b/src/backend/optimizer/geqo/geqo_main.c index 38402ce58db25..0064556087a3f 100644 --- a/src/backend/optimizer/geqo/geqo_main.c +++ b/src/backend/optimizer/geqo/geqo_main.c @@ -47,6 +47,8 @@ int Geqo_generations; double Geqo_selection_bias; double Geqo_seed; +/* GEQO is treated as an in-core planner extension */ +int Geqo_planner_extension_id = -1; static int gimme_pool_size(int nr_rel); static int gimme_number_generations(int pool_size); @@ -98,10 +100,16 @@ geqo(PlannerInfo *root, int number_of_rels, List *initial_rels) int mutations = 0; #endif + if (Geqo_planner_extension_id < 0) + Geqo_planner_extension_id = GetPlannerExtensionId("geqo"); + /* set up private information */ - root->join_search_private = &private; + SetPlannerInfoExtensionState(root, Geqo_planner_extension_id, &private); private.initial_rels = initial_rels; +/* inform core planner that we may replan */ + root->assumeReplanning = true; + /* initialize private number generator */ geqo_set_seed(root, Geqo_seed); @@ -304,7 +312,7 @@ geqo(PlannerInfo *root, int number_of_rels, List *initial_rels) free_pool(root, pool); /* ... clear root pointer to our private storage */ - root->join_search_private = NULL; + SetPlannerInfoExtensionState(root, Geqo_planner_extension_id, NULL); return best_rel; } diff --git a/src/backend/optimizer/geqo/geqo_pmx.c b/src/backend/optimizer/geqo/geqo_pmx.c index 01d5571192543..af1cb86839154 100644 --- a/src/backend/optimizer/geqo/geqo_pmx.c +++ b/src/backend/optimizer/geqo/geqo_pmx.c @@ -48,10 +48,10 @@ void pmx(PlannerInfo *root, Gene *tour1, Gene *tour2, Gene *offspring, int num_gene) { - int *failed = (int *) palloc((num_gene + 1) * sizeof(int)); - int *from = (int *) palloc((num_gene + 1) * sizeof(int)); - int *indx = (int *) palloc((num_gene + 1) * sizeof(int)); - int *check_list = (int *) palloc((num_gene + 1) * sizeof(int)); + int *failed = palloc_array(int, num_gene + 1); + int *from = palloc_array(int, num_gene + 1); + int *indx = palloc_array(int, num_gene + 1); + int *check_list = palloc_array(int, num_gene + 1); int left, right, diff --git a/src/backend/optimizer/geqo/geqo_pool.c b/src/backend/optimizer/geqo/geqo_pool.c index b6de0d93f2817..d0f53d888ef43 100644 --- a/src/backend/optimizer/geqo/geqo_pool.c +++ b/src/backend/optimizer/geqo/geqo_pool.c @@ -46,17 +46,17 @@ alloc_pool(PlannerInfo *root, int pool_size, int string_length) int i; /* pool */ - new_pool = (Pool *) palloc(sizeof(Pool)); - new_pool->size = (int) pool_size; - new_pool->string_length = (int) string_length; + new_pool = palloc_object(Pool); + new_pool->size = pool_size; + new_pool->string_length = string_length; /* all chromosome */ - new_pool->data = (Chromosome *) palloc(pool_size * sizeof(Chromosome)); + new_pool->data = palloc_array(Chromosome, pool_size); /* all gene */ chromo = (Chromosome *) new_pool->data; /* vector of all chromos */ for (i = 0; i < pool_size; i++) - chromo[i].string = palloc((string_length + 1) * sizeof(Gene)); + chromo[i].string = palloc_array(Gene, string_length + 1); return new_pool; } @@ -163,8 +163,8 @@ alloc_chromo(PlannerInfo *root, int string_length) { Chromosome *chromo; - chromo = (Chromosome *) 
palloc(sizeof(Chromosome)); - chromo->string = (Gene *) palloc((string_length + 1) * sizeof(Gene)); + chromo = palloc_object(Chromosome); + chromo->string = palloc_array(Gene, string_length + 1); return chromo; } diff --git a/src/backend/optimizer/geqo/geqo_random.c b/src/backend/optimizer/geqo/geqo_random.c index 6c7a411f69f44..46d28baa2e62b 100644 --- a/src/backend/optimizer/geqo/geqo_random.c +++ b/src/backend/optimizer/geqo/geqo_random.c @@ -15,11 +15,10 @@ #include "optimizer/geqo_random.h" - void geqo_set_seed(PlannerInfo *root, double seed) { - GeqoPrivateData *private = (GeqoPrivateData *) root->join_search_private; + GeqoPrivateData *private = GetGeqoPrivateData(root); pg_prng_fseed(&private->random_state, seed); } @@ -27,7 +26,7 @@ geqo_set_seed(PlannerInfo *root, double seed) double geqo_rand(PlannerInfo *root) { - GeqoPrivateData *private = (GeqoPrivateData *) root->join_search_private; + GeqoPrivateData *private = GetGeqoPrivateData(root); return pg_prng_double(&private->random_state); } @@ -35,7 +34,7 @@ geqo_rand(PlannerInfo *root) int geqo_randint(PlannerInfo *root, int upper, int lower) { - GeqoPrivateData *private = (GeqoPrivateData *) root->join_search_private; + GeqoPrivateData *private = GetGeqoPrivateData(root); /* * In current usage, "lower" is never negative so we can just use diff --git a/src/backend/optimizer/geqo/geqo_recombination.c b/src/backend/optimizer/geqo/geqo_recombination.c index a5d3e47ad115e..41d35c179e14e 100644 --- a/src/backend/optimizer/geqo/geqo_recombination.c +++ b/src/backend/optimizer/geqo/geqo_recombination.c @@ -74,7 +74,7 @@ alloc_city_table(PlannerInfo *root, int num_gene) * palloc one extra location so that nodes numbered 1..n can be indexed * directly; 0 will not be used */ - city_table = (City *) palloc((num_gene + 1) * sizeof(City)); + city_table = palloc_array(City, num_gene + 1); return city_table; } diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 6cc6966b0600a..58e64e2063137 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -40,6 +40,7 @@ #include "optimizer/paths.h" #include "optimizer/plancat.h" #include "optimizer/planner.h" +#include "optimizer/prep.h" #include "optimizer/tlist.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" @@ -47,6 +48,7 @@ #include "port/pg_bitutils.h" #include "rewrite/rewriteManip.h" #include "utils/lsyscache.h" +#include "utils/selfuncs.h" /* Bitmask flags for pushdown_safety_info.unsafeFlags */ @@ -77,7 +79,9 @@ typedef enum pushdown_safe_type /* These parameters are set by GUC */ bool enable_geqo = false; /* just in case GUC doesn't set it */ +bool enable_eager_aggregate = true; int geqo_threshold; +double min_eager_agg_group_size; int min_parallel_table_scan_size; int min_parallel_index_scan_size; @@ -90,6 +94,7 @@ join_search_hook_type join_search_hook = NULL; static void set_base_rel_consider_startup(PlannerInfo *root); static void set_base_rel_sizes(PlannerInfo *root); +static void setup_simple_grouped_rels(PlannerInfo *root); static void set_base_rel_pathlists(PlannerInfo *root); static void set_rel_size(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); @@ -114,6 +119,7 @@ static void set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); static void set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); +static void set_grouped_rel_pathlist(PlannerInfo *root, RelOptInfo *rel); static void 
generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
							 List *live_childrels,
							 List *all_child_pathkeys);
@@ -182,6 +188,12 @@ make_one_rel(PlannerInfo *root, List *joinlist)
	 */
	set_base_rel_sizes(root);

+	/*
+	 * Build grouped relations for simple rels (i.e., base or "other" member
+	 * relations) where possible.
+	 */
+	setup_simple_grouped_rels(root);
+
	/*
	 * We should now have size estimates for every actual table involved in
	 * the query, and we also know which if any have been deleted from the
@@ -323,6 +335,39 @@ set_base_rel_sizes(PlannerInfo *root)
	}
 }

+/*
+ * setup_simple_grouped_rels
+ *	  For each simple relation, build a grouped simple relation if eager
+ *	  aggregation is possible and if this relation can produce grouped paths.
+ */
+static void
+setup_simple_grouped_rels(PlannerInfo *root)
+{
+	Index		rti;
+
+	/*
+	 * If there are no aggregate expressions or grouping expressions, eager
+	 * aggregation is not possible.
+	 */
+	if (root->agg_clause_list == NIL ||
+		root->group_expr_list == NIL)
+		return;
+
+	for (rti = 1; rti < root->simple_rel_array_size; rti++)
+	{
+		RelOptInfo *rel = root->simple_rel_array[rti];
+
+		/* there may be empty slots corresponding to non-baserel RTEs */
+		if (rel == NULL)
+			continue;
+
+		Assert(rel->relid == rti);	/* sanity check on array */
+		Assert(IS_SIMPLE_REL(rel)); /* sanity check on rel */
+
+		(void) build_simple_grouped_rel(root, rel);
+	}
+}
+
 /*
  * set_base_rel_pathlists
  *	  Finds all paths available for scanning each base-relation entry.
@@ -559,6 +604,15 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
	/* Now find the cheapest of the paths for this rel */
	set_cheapest(rel);

+	/*
+	 * If a grouped relation for this rel exists, build partial aggregation
+	 * paths for it.
+	 *
+	 * Note that this can only happen after we've called set_cheapest() for
+	 * this base rel, because we need its cheapest paths.
+	 */
+	set_grouped_rel_pathlist(root, rel);
+
 #ifdef OPTIMIZER_DEBUG
	pprint(rel);
 #endif
@@ -601,23 +655,25 @@ set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel,
	/* This should only be called for baserels and appendrel children. */
	Assert(IS_SIMPLE_REL(rel));

+	/* Set if the data source refers to temp storage somehow */
+	rel->needs_temp_safety = false;
+
	/* Assorted checks based on rtekind. */
	switch (rte->rtekind)
	{
		case RTE_RELATION:

			/*
-			 * Currently, parallel workers can't access the leader's temporary
-			 * tables.  We could possibly relax this if we wrote all of its
-			 * local buffers at the start of the query and made no changes
-			 * thereafter (maybe we could allow hint bit changes), and if we
-			 * taught the workers to read them.  Writing a large number of
-			 * temporary buffers could be expensive, though, and we don't have
-			 * the rest of the necessary infrastructure right now anyway.  So
-			 * for now, bail out if we see a temporary table.
+			 * It is not free to process objects that use temporary storage
+			 * in parallel, because we need to flush temporary buffers
+			 * beforehand.  So, hide this feature behind a GUC.
*/ if (get_rel_persistence(rte->relid) == RELPERSISTENCE_TEMP) - return; + { + if (!extended_parallel_processing) + return; + rel->needs_temp_safety = true; + } /* * Table sampling can be pushed down to workers if the sample @@ -629,7 +685,7 @@ set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel, if (proparallel != PROPARALLEL_SAFE) return; - if (!is_parallel_safe(root, (Node *) rte->tablesample->args)) + if (!is_parallel_safe(root, (Node *) rte->tablesample->args, &rel->needs_temp_safety)) return; } @@ -695,7 +751,7 @@ set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel, case RTE_FUNCTION: /* Check for parallel-restricted functions. */ - if (!is_parallel_safe(root, (Node *) rte->functions)) + if (!is_parallel_safe(root, (Node *) rte->functions, &rel->needs_temp_safety)) return; break; @@ -705,7 +761,7 @@ set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel, case RTE_VALUES: /* Check for parallel-restricted functions. */ - if (!is_parallel_safe(root, (Node *) rte->values_lists)) + if (!is_parallel_safe(root, (Node *) rte->values_lists, &rel->needs_temp_safety)) return; break; @@ -746,14 +802,14 @@ set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel, * outer join clauses work correctly. It would likely break equivalence * classes, too. */ - if (!is_parallel_safe(root, (Node *) rel->baserestrictinfo)) + if (!is_parallel_safe(root, (Node *) rel->baserestrictinfo, &rel->needs_temp_safety)) return; /* * Likewise, if the relation's outputs are not parallel-safe, give up. * (Usually, they're just Vars, but sometimes they're not.) */ - if (!is_parallel_safe(root, (Node *) rel->reltarget->exprs)) + if (!is_parallel_safe(root, (Node *) rel->reltarget->exprs, &rel->needs_temp_safety)) return; /* We have a winner. */ @@ -1305,6 +1361,35 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, add_paths_to_append_rel(root, rel, live_childrels); } +/* + * set_grouped_rel_pathlist + * If a grouped relation for the given 'rel' exists, build partial + * aggregation paths for it. + */ +static void +set_grouped_rel_pathlist(PlannerInfo *root, RelOptInfo *rel) +{ + RelOptInfo *grouped_rel; + + /* + * If there are no aggregate expressions or grouping expressions, eager + * aggregation is not possible. + */ + if (root->agg_clause_list == NIL || + root->group_expr_list == NIL) + return; + + /* Add paths to the grouped base relation if one exists. */ + grouped_rel = rel->grouped_rel; + if (grouped_rel) + { + Assert(IS_GROUPED_REL(grouped_rel)); + + generate_grouped_paths(root, grouped_rel, rel); + set_cheapest(grouped_rel); + } +} + /* * add_paths_to_append_rel @@ -1727,9 +1812,11 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * We generate a path for each ordering (pathkey list) appearing in * all_child_pathkeys. * - * We consider both cheapest-startup and cheapest-total cases, ie, for each - * interesting ordering, collect all the cheapest startup subpaths and all the - * cheapest total paths, and build a suitable path for each case. + * We consider the cheapest-startup and cheapest-total cases, and also the + * cheapest-fractional case when not all tuples need to be retrieved. For each + * interesting ordering, we collect all the cheapest startup subpaths, all the + * cheapest total paths, and, if applicable, all the cheapest fractional paths, + * and build a suitable path for each case. * * We don't currently generate any parameterized ordered paths here. 
While * it would not take much more code here to do so, it's very unclear that it @@ -1792,6 +1879,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, List *total_subpaths = NIL; List *fractional_subpaths = NIL; bool startup_neq_total = false; + bool fraction_neq_total = false; bool match_partition_order; bool match_partition_order_desc; int end_index; @@ -1894,14 +1982,18 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, double path_fraction = root->tuple_fraction; /* - * Merge Append considers only live children relations. Dummy - * relations must be filtered out before. + * We should not have a dummy child relation here. However, + * we cannot use childrel->rows to compute the tuple fraction, + * as childrel can be an upper relation with an unset row + * estimate. Instead, we use the row estimate from the + * cheapest_total path, which should already have been forced + * to a sane value. */ - Assert(childrel->rows > 0); + Assert(cheapest_total->rows > 0); /* Convert absolute limit to a path fraction */ if (path_fraction >= 1.0) - path_fraction /= childrel->rows; + path_fraction /= cheapest_total->rows; cheapest_fractional = get_cheapest_fractional_path_for_pathkeys(childrel->pathlist, @@ -1916,15 +2008,21 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, * XXX We might consider partially sorted paths too (with an * incremental sort on top). But we'd have to build all the * incremental paths, do the costing etc. + * + * Also, notice whether we actually have different paths for + * the "fractional" and "total" cases. This helps avoid + * generating two identical ordered append paths. */ - if (!cheapest_fractional) + if (cheapest_fractional == NULL) cheapest_fractional = cheapest_total; + else if (cheapest_fractional != cheapest_total) + fraction_neq_total = true; } /* * Notice whether we actually have different paths for the - * "cheapest" and "total" cases; frequently there will be no point - * in two create_merge_append_path() calls. + * "cheapest" and "total" cases. This helps avoid generating two + * identical ordered append paths. */ if (cheapest_startup != cheapest_total) startup_neq_total = true; @@ -1995,7 +2093,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, false, -1)); - if (fractional_subpaths) + if (fractional_subpaths && fraction_neq_total) add_path(rel, (Path *) create_append_path(root, rel, fractional_subpaths, @@ -2021,7 +2119,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, pathkeys, NULL)); - if (fractional_subpaths) + if (fractional_subpaths && fraction_neq_total) add_path(rel, (Path *) create_merge_append_path(root, rel, fractional_subpaths, @@ -2254,10 +2352,9 @@ set_dummy_rel_pathlist(RelOptInfo *rel) * return false. */ static bool -find_window_run_conditions(Query *subquery, RangeTblEntry *rte, Index rti, - AttrNumber attno, WindowFunc *wfunc, OpExpr *opexpr, - bool wfunc_left, bool *keep_original, - Bitmapset **run_cond_attrs) +find_window_run_conditions(Query *subquery, AttrNumber attno, + WindowFunc *wfunc, OpExpr *opexpr, bool wfunc_left, + bool *keep_original, Bitmapset **run_cond_attrs) { Oid prosupport; Expr *otherexpr; @@ -2445,8 +2542,8 @@ find_window_run_conditions(Query *subquery, RangeTblEntry *rte, Index rti, * will use the runCondition to stop returning tuples. 
*/ static bool -check_and_push_window_quals(Query *subquery, RangeTblEntry *rte, Index rti, - Node *clause, Bitmapset **run_cond_attrs) +check_and_push_window_quals(Query *subquery, Node *clause, + Bitmapset **run_cond_attrs) { OpExpr *opexpr = (OpExpr *) clause; bool keep_original = true; @@ -2485,9 +2582,8 @@ check_and_push_window_quals(Query *subquery, RangeTblEntry *rte, Index rti, TargetEntry *tle = list_nth(subquery->targetList, var1->varattno - 1); WindowFunc *wfunc = (WindowFunc *) tle->expr; - if (find_window_run_conditions(subquery, rte, rti, tle->resno, wfunc, - opexpr, true, &keep_original, - run_cond_attrs)) + if (find_window_run_conditions(subquery, tle->resno, wfunc, opexpr, + true, &keep_original, run_cond_attrs)) return keep_original; } @@ -2498,9 +2594,8 @@ check_and_push_window_quals(Query *subquery, RangeTblEntry *rte, Index rti, TargetEntry *tle = list_nth(subquery->targetList, var2->varattno - 1); WindowFunc *wfunc = (WindowFunc *) tle->expr; - if (find_window_run_conditions(subquery, rte, rti, tle->resno, wfunc, - opexpr, false, &keep_original, - run_cond_attrs)) + if (find_window_run_conditions(subquery, tle->resno, wfunc, opexpr, + false, &keep_original, run_cond_attrs)) return keep_original; } @@ -2532,6 +2627,7 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, RelOptInfo *sub_final_rel; Bitmapset *run_cond_attrs = NULL; ListCell *lc; + char *plan_name; /* * Must copy the Query so that planning doesn't mess up the RTE contents @@ -2622,7 +2718,7 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, * runCondition. */ if (!subquery->hasWindowFuncs || - check_and_push_window_quals(subquery, rte, rti, clause, + check_and_push_window_quals(subquery, clause, &run_cond_attrs)) { /* @@ -2674,8 +2770,9 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, Assert(root->plan_params == NIL); /* Generate a subroot and Paths for the subquery */ - rel->subroot = subquery_planner(root->glob, subquery, root, false, - tuple_fraction, NULL); + plan_name = choose_plan_name(root->glob, rte->eref->aliasname, false); + rel->subroot = subquery_planner(root->glob, subquery, plan_name, + root, false, tuple_fraction, NULL); /* Isolate the params needed by this specific subplan */ rel->subplan_params = root->plan_params; @@ -3335,6 +3432,345 @@ generate_useful_gather_paths(PlannerInfo *root, RelOptInfo *rel, bool override_r } } +/* + * generate_grouped_paths + * Generate paths for a grouped relation by adding sorted and hashed + * partial aggregation paths on top of paths of the ungrouped relation. + * + * The information needed is provided by the RelAggInfo structure stored in + * "grouped_rel". + */ +void +generate_grouped_paths(PlannerInfo *root, RelOptInfo *grouped_rel, + RelOptInfo *rel) +{ + RelAggInfo *agg_info = grouped_rel->agg_info; + AggClauseCosts agg_costs; + bool can_hash; + bool can_sort; + Path *cheapest_total_path = NULL; + Path *cheapest_partial_path = NULL; + double dNumGroups = 0; + double dNumPartialGroups = 0; + List *group_pathkeys = NIL; + + if (IS_DUMMY_REL(rel)) + { + mark_dummy_rel(grouped_rel); + return; + } + + /* + * We push partial aggregation only to the lowest possible level in the + * join tree that is deemed useful. 
+ */ + if (!bms_equal(agg_info->apply_agg_at, rel->relids) || + !agg_info->agg_useful) + return; + + MemSet(&agg_costs, 0, sizeof(AggClauseCosts)); + get_agg_clause_costs(root, AGGSPLIT_INITIAL_SERIAL, &agg_costs); + + /* + * Determine whether it's possible to perform sort-based implementations + * of grouping, and generate the pathkeys that represent the grouping + * requirements in that case. + */ + can_sort = grouping_is_sortable(agg_info->group_clauses); + if (can_sort) + { + RelOptInfo *top_grouped_rel; + List *top_group_tlist; + + top_grouped_rel = IS_OTHER_REL(rel) ? + rel->top_parent->grouped_rel : grouped_rel; + top_group_tlist = + make_tlist_from_pathtarget(top_grouped_rel->agg_info->target); + + group_pathkeys = + make_pathkeys_for_sortclauses(root, agg_info->group_clauses, + top_group_tlist); + } + + /* + * Determine whether we should consider hash-based implementations of + * grouping. + */ + Assert(root->numOrderedAggs == 0); + can_hash = (agg_info->group_clauses != NIL && + grouping_is_hashable(agg_info->group_clauses)); + + /* + * Consider whether we should generate partially aggregated non-partial + * paths. We can only do this if we have a non-partial path. + */ + if (rel->pathlist != NIL) + { + cheapest_total_path = rel->cheapest_total_path; + Assert(cheapest_total_path != NULL); + } + + /* + * If parallelism is possible for grouped_rel, then we should consider + * generating partially-grouped partial paths. However, if the ungrouped + * rel has no partial paths, then we can't. + */ + if (grouped_rel->consider_parallel && rel->partial_pathlist != NIL) + { + cheapest_partial_path = linitial(rel->partial_pathlist); + Assert(cheapest_partial_path != NULL); + } + + /* Estimate number of partial groups. */ + if (cheapest_total_path != NULL) + dNumGroups = estimate_num_groups(root, + agg_info->group_exprs, + cheapest_total_path->rows, + NULL, NULL); + if (cheapest_partial_path != NULL) + dNumPartialGroups = estimate_num_groups(root, + agg_info->group_exprs, + cheapest_partial_path->rows, + NULL, NULL); + + if (can_sort && cheapest_total_path != NULL) + { + ListCell *lc; + + /* + * Use any available suitably-sorted path as input, and also consider + * sorting the cheapest-total path and incremental sort on any paths + * with presorted keys. + * + * To save planning time, we ignore parameterized input paths unless + * they are the cheapest-total path. + */ + foreach(lc, rel->pathlist) + { + Path *input_path = (Path *) lfirst(lc); + Path *path; + bool is_sorted; + int presorted_keys; + + /* + * Ignore parameterized paths that are not the cheapest-total + * path. + */ + if (input_path->param_info && + input_path != cheapest_total_path) + continue; + + is_sorted = pathkeys_count_contained_in(group_pathkeys, + input_path->pathkeys, + &presorted_keys); + + /* + * Ignore paths that are not suitably or partially sorted, unless + * they are the cheapest total path (no need to deal with paths + * which have presorted keys when incremental sort is disabled). + */ + if (!is_sorted && input_path != cheapest_total_path && + (presorted_keys == 0 || !enable_incremental_sort)) + continue; + + /* + * Since the path originates from a non-grouped relation that is + * not aware of eager aggregation, we must ensure that it provides + * the correct input for partial aggregation. + */ + path = (Path *) create_projection_path(root, + grouped_rel, + input_path, + agg_info->agg_input); + + if (!is_sorted) + { + /* + * We've no need to consider both a sort and incremental sort. 
+ * We'll just do a sort if there are no presorted keys and an + * incremental sort when there are presorted keys. + */ + if (presorted_keys == 0 || !enable_incremental_sort) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + group_pathkeys, + -1.0); + else + path = (Path *) create_incremental_sort_path(root, + grouped_rel, + path, + group_pathkeys, + presorted_keys, + -1.0); + } + + /* + * qual is NIL because the HAVING clause cannot be evaluated until + * the final value of the aggregate is known. + */ + path = (Path *) create_agg_path(root, + grouped_rel, + path, + agg_info->target, + AGG_SORTED, + AGGSPLIT_INITIAL_SERIAL, + agg_info->group_clauses, + NIL, + &agg_costs, + dNumGroups); + + add_path(grouped_rel, path); + } + } + + if (can_sort && cheapest_partial_path != NULL) + { + ListCell *lc; + + /* Similar to above logic, but for partial paths. */ + foreach(lc, rel->partial_pathlist) + { + Path *input_path = (Path *) lfirst(lc); + Path *path; + bool is_sorted; + int presorted_keys; + + is_sorted = pathkeys_count_contained_in(group_pathkeys, + input_path->pathkeys, + &presorted_keys); + + /* + * Ignore paths that are not suitably or partially sorted, unless + * they are the cheapest partial path (no need to deal with paths + * which have presorted keys when incremental sort is disabled). + */ + if (!is_sorted && input_path != cheapest_partial_path && + (presorted_keys == 0 || !enable_incremental_sort)) + continue; + + /* + * Since the path originates from a non-grouped relation that is + * not aware of eager aggregation, we must ensure that it provides + * the correct input for partial aggregation. + */ + path = (Path *) create_projection_path(root, + grouped_rel, + input_path, + agg_info->agg_input); + + if (!is_sorted) + { + /* + * We've no need to consider both a sort and incremental sort. + * We'll just do a sort if there are no presorted keys and an + * incremental sort when there are presorted keys. + */ + if (presorted_keys == 0 || !enable_incremental_sort) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + group_pathkeys, + -1.0); + else + path = (Path *) create_incremental_sort_path(root, + grouped_rel, + path, + group_pathkeys, + presorted_keys, + -1.0); + } + + /* + * qual is NIL because the HAVING clause cannot be evaluated until + * the final value of the aggregate is known. + */ + path = (Path *) create_agg_path(root, + grouped_rel, + path, + agg_info->target, + AGG_SORTED, + AGGSPLIT_INITIAL_SERIAL, + agg_info->group_clauses, + NIL, + &agg_costs, + dNumPartialGroups); + + add_partial_path(grouped_rel, path); + } + } + + /* + * Add a partially-grouped HashAgg Path where possible + */ + if (can_hash && cheapest_total_path != NULL) + { + Path *path; + + /* + * Since the path originates from a non-grouped relation that is not + * aware of eager aggregation, we must ensure that it provides the + * correct input for partial aggregation. + */ + path = (Path *) create_projection_path(root, + grouped_rel, + cheapest_total_path, + agg_info->agg_input); + + /* + * qual is NIL because the HAVING clause cannot be evaluated until the + * final value of the aggregate is known. 
+ */ + path = (Path *) create_agg_path(root, + grouped_rel, + path, + agg_info->target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + agg_info->group_clauses, + NIL, + &agg_costs, + dNumGroups); + + add_path(grouped_rel, path); + } + + /* + * Now add a partially-grouped HashAgg partial Path where possible + */ + if (can_hash && cheapest_partial_path != NULL) + { + Path *path; + + /* + * Since the path originates from a non-grouped relation that is not + * aware of eager aggregation, we must ensure that it provides the + * correct input for partial aggregation. + */ + path = (Path *) create_projection_path(root, + grouped_rel, + cheapest_partial_path, + agg_info->agg_input); + + /* + * qual is NIL because the HAVING clause cannot be evaluated until the + * final value of the aggregate is known. + */ + path = (Path *) create_agg_path(root, + grouped_rel, + path, + agg_info->target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + agg_info->group_clauses, + NIL, + &agg_costs, + dNumPartialGroups); + + add_partial_path(grouped_rel, path); + } +} + /* * make_rel_from_joinlist * Build access paths using a "joinlist" to guide the join path search. @@ -3494,11 +3930,19 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels) * * After that, we're done creating paths for the joinrel, so run * set_cheapest(). + * + * In addition, we also run generate_grouped_paths() for the grouped + * relation of each just-processed joinrel, and run set_cheapest() for + * the grouped relation afterwards. */ foreach(lc, root->join_rel_level[lev]) { + bool is_top_rel; + rel = (RelOptInfo *) lfirst(lc); + is_top_rel = bms_equal(rel->relids, root->all_query_rels); + /* Create paths for partitionwise joins. */ generate_partitionwise_join_paths(root, rel); @@ -3508,12 +3952,28 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels) * once we know the final targetlist (see grouping_planner's and * its call to apply_scanjoin_target_to_paths). */ - if (!bms_equal(rel->relids, root->all_query_rels)) + if (!is_top_rel) generate_useful_gather_paths(root, rel, false); /* Find and save the cheapest paths for this rel */ set_cheapest(rel); + /* + * Except for the topmost scan/join rel, consider generating + * partial aggregation paths for the grouped relation on top of + * the paths of this rel. After that, we're done creating paths + * for the grouped relation, so run set_cheapest(). + */ + if (rel->grouped_rel != NULL && !is_top_rel) + { + RelOptInfo *grouped_rel = rel->grouped_rel; + + Assert(IS_GROUPED_REL(grouped_rel)); + + generate_grouped_paths(root, grouped_rel, rel); + set_cheapest(grouped_rel); + } + #ifdef OPTIMIZER_DEBUG pprint(rel); #endif @@ -4383,6 +4843,25 @@ generate_partitionwise_join_paths(PlannerInfo *root, RelOptInfo *rel) if (IS_DUMMY_REL(child_rel)) continue; + /* + * Except for the topmost scan/join rel, consider generating partial + * aggregation paths for the grouped relation on top of the paths of + * this partitioned child-join. After that, we're done creating paths + * for the grouped relation, so run set_cheapest(). + */ + if (child_rel->grouped_rel != NULL && + !bms_equal(IS_OTHER_REL(rel) ? 
+ rel->top_parent_relids : rel->relids, + root->all_query_rels)) + { + RelOptInfo *grouped_rel = child_rel->grouped_rel; + + Assert(IS_GROUPED_REL(grouped_rel)); + + generate_grouped_paths(root, grouped_rel, child_rel); + set_cheapest(grouped_rel); + } + #ifdef OPTIMIZER_DEBUG pprint(child_rel); #endif diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index 5d51f97f21906..eb9b7f3eabd02 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -495,7 +495,7 @@ addRangeClause(RangeQueryClause **rqlist, Node *clause, } /* No matching var found, so make a new clause-pair data structure */ - rqelem = (RangeQueryClause *) palloc(sizeof(RangeQueryClause)); + rqelem = palloc_object(RangeQueryClause); rqelem->var = var; if (is_lobound) { @@ -874,6 +874,10 @@ clause_selectivity_ext(PlannerInfo *root, varRelid, jointype, sjinfo); + + /* If no support, fall back on boolvarsel */ + if (s1 < 0) + s1 = boolvarsel(root, clause, varRelid); } else if (IsA(clause, ScalarArrayOpExpr)) { diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 3d44815ed5adf..205baa0dd5cf6 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -104,6 +104,7 @@ #include "optimizer/plancat.h" #include "optimizer/restrictinfo.h" #include "parser/parsetree.h" +#include "storage/bufmgr.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" #include "utils/spccache.h" @@ -129,6 +130,7 @@ double seq_page_cost = DEFAULT_SEQ_PAGE_COST; double random_page_cost = DEFAULT_RANDOM_PAGE_COST; +double write_page_cost = DEFAULT_WRITE_PAGE_COST; double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST; double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST; double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST; @@ -164,6 +166,8 @@ bool enable_partition_pruning = true; bool enable_presorted_aggregate = true; bool enable_async_append = true; +bool extended_parallel_processing = true; + typedef struct { PlannerInfo *root; @@ -257,32 +261,6 @@ clamp_width_est(int64 tuple_width) return (int32) tuple_width; } -/* - * clamp_cardinality_to_long - * Cast a Cardinality value to a sane long value. - */ -long -clamp_cardinality_to_long(Cardinality x) -{ - /* - * Just for paranoia's sake, ensure we do something sane with negative or - * NaN values. - */ - if (isnan(x)) - return LONG_MAX; - if (x <= 0) - return 0; - - /* - * If "long" is 64 bits, then LONG_MAX cannot be represented exactly as a - * double. Casting it to double and back may well result in overflow due - * to rounding, so avoid doing that. We trust that any double value that - * compares strictly less than "(double) LONG_MAX" will cast to a - * representable "long" value. - */ - return (x < (double) LONG_MAX) ? (long) x : LONG_MAX; -} - /* * cost_seqscan @@ -1366,8 +1344,9 @@ cost_tidrangescan(Path *path, PlannerInfo *root, { Selectivity selectivity; double pages; - Cost startup_cost = 0; - Cost run_cost = 0; + Cost startup_cost; + Cost cpu_run_cost; + Cost disk_run_cost; QualCost qpqual_cost; Cost cpu_per_tuple; QualCost tid_qual_cost; @@ -1399,8 +1378,8 @@ cost_tidrangescan(Path *path, PlannerInfo *root, * page is just a normal sequential page read. 
NOTE: it's desirable for * TID Range Scans to cost more than the equivalent Sequential Scans, * because Seq Scans have some performance advantages such as scan - * synchronization and parallelizability, and we'd prefer one of them to - * be picked unless a TID Range Scan really is better. + * synchronization, and we'd prefer one of them to be picked unless a TID + * Range Scan really is better. */ ntuples = selectivity * baserel->tuples; nseqpages = pages - 1.0; @@ -1417,7 +1396,7 @@ cost_tidrangescan(Path *path, PlannerInfo *root, &spc_seq_page_cost); /* disk costs; 1 random page and the remainder as seq pages */ - run_cost += spc_random_page_cost + spc_seq_page_cost * nseqpages; + disk_run_cost = spc_random_page_cost + spc_seq_page_cost * nseqpages; /* Add scanning CPU costs */ get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); @@ -1429,20 +1408,35 @@ cost_tidrangescan(Path *path, PlannerInfo *root, * can't be removed, this is a mistake and we're going to underestimate * the CPU cost a bit.) */ - startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple; + startup_cost = qpqual_cost.startup + tid_qual_cost.per_tuple; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple - tid_qual_cost.per_tuple; - run_cost += cpu_per_tuple * ntuples; + cpu_run_cost = cpu_per_tuple * ntuples; /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; - run_cost += path->pathtarget->cost.per_tuple * path->rows; + cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows; + + /* Adjust costing for parallelism, if used. */ + if (path->parallel_workers > 0) + { + double parallel_divisor = get_parallel_divisor(path); + + /* The CPU cost is divided among all the workers. */ + cpu_run_cost /= parallel_divisor; + + /* + * In the case of a parallel plan, the row count needs to represent + * the number of tuples processed per worker. + */ + path->rows = clamp_row_est(path->rows / parallel_divisor); + } /* we should not generate this path type when enable_tidscan=false */ Assert(enable_tidscan); path->disabled_nodes = 0; path->startup_cost = startup_cost; - path->total_cost = startup_cost + run_cost; + path->total_cost = startup_cost + cpu_run_cost + disk_run_cost; } /* @@ -2189,7 +2183,7 @@ append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers) * whichever is less. */ arrlen = Min(parallel_workers, numpaths); - costarr = (Cost *) palloc(sizeof(Cost) * arrlen); + costarr = palloc_array(Cost, arrlen); /* The first few paths will each be claimed by a different worker. */ path_index = 0; @@ -2247,7 +2241,7 @@ append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers) * Determines and returns the cost of an Append node. */ void -cost_append(AppendPath *apath) +cost_append(AppendPath *apath, PlannerInfo *root) { ListCell *l; @@ -2309,26 +2303,52 @@ cost_append(AppendPath *apath) foreach(l, apath->subpaths) { Path *subpath = (Path *) lfirst(l); - Path sort_path; /* dummy for result of cost_sort */ + int presorted_keys; + Path sort_path; /* dummy for result of + * cost_sort/cost_incremental_sort */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { /* * We'll need to insert a Sort node, so include costs for - * that. We can use the parent's LIMIT if any, since we + * that. We choose to use incremental sort if it is + * enabled and there are presorted keys; otherwise we use + * full sort. 
+ * + * We can use the parent's LIMIT if any, since we * certainly won't pull more than that many tuples from * any child. */ - cost_sort(&sort_path, - NULL, /* doesn't currently need root */ - pathkeys, - subpath->disabled_nodes, - subpath->total_cost, - subpath->rows, - subpath->pathtarget->width, - 0.0, - work_mem, - apath->limit_tuples); + if (enable_incremental_sort && presorted_keys > 0) + { + cost_incremental_sort(&sort_path, + root, + pathkeys, + presorted_keys, + subpath->disabled_nodes, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + apath->limit_tuples); + } + else + { + cost_sort(&sort_path, + root, + pathkeys, + subpath->disabled_nodes, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + apath->limit_tuples); + } + subpath = &sort_path; } @@ -2546,13 +2566,13 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, Cost input_startup_cost = mpath->subpath->startup_cost; Cost input_total_cost = mpath->subpath->total_cost; double tuples = mpath->subpath->rows; - double calls = mpath->calls; + Cardinality est_calls = mpath->est_calls; int width = mpath->subpath->pathtarget->width; double hash_mem_bytes; double est_entry_bytes; - double est_cache_entries; - double ndistinct; + Cardinality est_cache_entries; + Cardinality ndistinct; double evict_ratio; double hit_ratio; Cost startup_cost; @@ -2578,7 +2598,7 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, est_cache_entries = floor(hash_mem_bytes / est_entry_bytes); /* estimate on the distinct number of parameter values */ - ndistinct = estimate_num_groups(root, mpath->param_exprs, calls, NULL, + ndistinct = estimate_num_groups(root, mpath->param_exprs, est_calls, NULL, &estinfo); /* @@ -2590,7 +2610,10 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * certainly mean a MemoizePath will never survive add_path(). */ if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0) - ndistinct = calls; + ndistinct = est_calls; + + /* Remember the ndistinct estimate for EXPLAIN */ + mpath->est_unique_keys = ndistinct; /* * Since we've already estimated the maximum number of entries we can @@ -2618,9 +2641,12 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * must look at how many scans are estimated in total for this node and * how many of those scans we expect to get a cache hit. */ - hit_ratio = ((calls - ndistinct) / calls) * + hit_ratio = ((est_calls - ndistinct) / est_calls) * (est_cache_entries / Max(ndistinct, est_cache_entries)); + /* Remember the hit ratio estimate for EXPLAIN */ + mpath->est_hit_ratio = hit_ratio; + Assert(hit_ratio >= 0 && hit_ratio <= 1.0); /* @@ -3934,10 +3960,12 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path, * when we should not. Can we do better without expensive selectivity * computations? * - * The whole issue is moot if we are working from a unique-ified outer - * input, or if we know we don't need to mark/restore at all. + * The whole issue is moot if we know we don't need to mark/restore at + * all, or if we are working from a unique-ified outer input. 
*/
-	if (IsA(outer_path, UniquePath) || path->skip_mark_restore)
+	if (path->skip_mark_restore ||
+		RELATION_WAS_MADE_UNIQUE(outer_path->parent, extra->sjinfo,
+								 path->jpath.jointype))
		rescannedtuples = 0;
	else
	{
@@ -4113,7 +4141,7 @@ cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)

	/* Cache the result in suitably long-lived workspace */
	oldcontext = MemoryContextSwitchTo(root->planner_cxt);

-	cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache));
+	cache = palloc_object(MergeScanSelCache);
	cache->opfamily = pathkey->pk_opfamily;
	cache->collation = pathkey->pk_eclass->ec_collation;
	cache->cmptype = pathkey->pk_cmptype;
@@ -4332,7 +4360,8 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
	 * because we avoid contaminating the cache with a value that's wrong for
	 * non-unique-ified paths.
	 */
-	if (IsA(inner_path, UniquePath))
+	if (RELATION_WAS_MADE_UNIQUE(inner_path->parent, extra->sjinfo,
+								 path->jpath.jointype))
	{
		innerbucketsize = 1.0 / virtualbuckets;
		innermcvfreq = 0.0;
@@ -4535,10 +4564,24 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
 {
	QualCost	sp_cost;

-	/* Figure any cost for evaluating the testexpr */
+	/*
+	 * Figure any cost for evaluating the testexpr.
+	 *
+	 * Usually, SubPlan nodes are built very early, before we have constructed
+	 * any RelOptInfos for the parent query level, which means the parent root
+	 * does not yet contain enough information to safely consult statistics.
+	 * Therefore, we pass root as NULL here.  cost_qual_eval() is already
+	 * well-equipped to handle a NULL root.
+	 *
+	 * One exception is SubPlan nodes built for the initplans of MIN/MAX
+	 * aggregates from indexes (cf. SS_make_initplan_from_plan).  In this
+	 * case, having a NULL root is safe because testexpr will be NULL.
+	 * Besides, an initplan will by definition not consult anything from the
+	 * parent plan.
+	 */
	cost_qual_eval(&sp_cost,
				   make_ands_implicit((Expr *) subplan->testexpr),
-				   root);
+				   NULL);

	if (subplan->useHashTable)
	{
@@ -6614,3 +6657,25 @@ compute_gather_rows(Path *path)

	return clamp_row_est(path->rows * get_parallel_divisor(path));
 }
+
+/*
+ * Before launching parallel workers in a SELECT query, the leader process
+ * must flush all dirty pages in temp buffers to guarantee equal access to
+ * the data in each parallel worker.
+ * It seems difficult to calculate the specific set of tables, indexes, and
+ * TOAST tables that may be touched inside the subtree.  Moreover, stored
+ * procedures may also scan temporary tables.  So, it makes sense to flush
+ * all temporary buffers.
+ * Here we calculate the cost of such an operation, so that small queries do
+ * not activate an expensive parallel scan over temp resources.
+ */
+Cost
+tempbuf_flush_extra_cost(void)
+{
+	if (!extended_parallel_processing)
+		/* Fast exit if the feature is disabled */
+		return 0.0;
+
+	/* Hopefully, we have statistics on the number of dirtied buffers */
+	Assert(dirtied_localbufs >= 0);
+	return write_page_cost * dirtied_localbufs;
+}
diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c
index 441f12f6c50cf..1573ffc5ce0b2 100644
--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -1015,6 +1015,7 @@ find_computable_ec_member(PlannerInfo *root,
	{
		List	   *emvars;
		ListCell   *lc2;
+		bool		needs_temp_flush = false;

		/*
		 * We shouldn't be trying to sort by an equivalence class that
@@ -1049,9 +1050,11 @@ find_computable_ec_member(PlannerInfo *root,

		/*
		 * If requested, reject expressions that are not parallel-safe.
We * check this last because it's a rather expensive test. + * TODO: Not sure if it is really necessary. */ if (require_parallel_safe && - !is_parallel_safe(root, (Node *) em->em_expr)) + (!is_parallel_safe(root, (Node *) em->em_expr, &needs_temp_flush) || + needs_temp_flush)) continue; return em; /* found usable expression */ @@ -1093,6 +1096,7 @@ relation_can_be_sorted_early(PlannerInfo *root, RelOptInfo *rel, foreach(lc, target->exprs) { Expr *targetexpr = (Expr *) lfirst(lc); + bool needs_temp_flush = false; em = find_ec_member_matching_expr(ec, targetexpr, rel->relids); if (!em) @@ -1112,7 +1116,8 @@ relation_can_be_sorted_early(PlannerInfo *root, RelOptInfo *rel, * check this last because it's a rather expensive test. */ if (require_parallel_safe && - !is_parallel_safe(root, (Node *) em->em_expr)) + (!is_parallel_safe(root, (Node *) em->em_expr, &needs_temp_flush) || + needs_temp_flush)) continue; return true; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 601354ea3e056..5d4f81ee77e07 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -19,6 +19,7 @@ #include "access/stratnum.h" #include "access/sysattr.h" +#include "access/transam.h" #include "catalog/pg_am.h" #include "catalog/pg_amop.h" #include "catalog/pg_operator.h" @@ -1290,7 +1291,7 @@ group_similar_or_args(PlannerInfo *root, RelOptInfo *rel, RestrictInfo *rinfo) * which will be used to sort these arguments at the next step. */ i = -1; - matches = (OrArgIndexMatch *) palloc(sizeof(OrArgIndexMatch) * n); + matches = palloc_array(OrArgIndexMatch, n); foreach(lc, orargs) { Node *arg = lfirst(lc); @@ -1852,8 +1853,7 @@ choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel, List *paths) * same set of clauses; keep only the cheapest-to-scan of any such groups. * The surviving paths are put into an array for qsort'ing. */ - pathinfoarray = (PathClauseUsage **) - palloc(npaths * sizeof(PathClauseUsage *)); + pathinfoarray = palloc_array(PathClauseUsage *, npaths); clauselist = NIL; npaths = 0; foreach(l, paths) @@ -2089,7 +2089,7 @@ classify_index_clause_usage(Path *path, List **clauselist) Bitmapset *clauseids; ListCell *lc; - result = (PathClauseUsage *) palloc(sizeof(PathClauseUsage)); + result = palloc_object(PathClauseUsage); result->path = path; /* Recursively find the quals and preds used by the path */ @@ -3289,8 +3289,8 @@ match_rowcompare_to_indexcol(PlannerInfo *root, * * In this routine, we attempt to transform a list of OR-clause args into a * single SAOP expression matching the target index column. On success, - * return an IndexClause, containing the transformed expression or NULL, - * if failed. + * return an IndexClause containing the transformed expression. + * Return NULL if the transformation fails. 
*/ static IndexClause * match_orclause_to_indexcol(PlannerInfo *root, @@ -3298,85 +3298,59 @@ match_orclause_to_indexcol(PlannerInfo *root, int indexcol, IndexOptInfo *index) { - ListCell *lc; BoolExpr *orclause = (BoolExpr *) rinfo->orclause; - Node *indexExpr = NULL; List *consts = NIL; - ScalarArrayOpExpr *saopexpr = NULL; + Node *indexExpr = NULL; Oid matchOpno = InvalidOid; - IndexClause *iclause; Oid consttype = InvalidOid; Oid arraytype = InvalidOid; Oid inputcollid = InvalidOid; bool firstTime = true; bool haveNonConst = false; Index indexRelid = index->rel->relid; + ScalarArrayOpExpr *saopexpr; + IndexClause *iclause; + ListCell *lc; - Assert(IsA(orclause, BoolExpr)); - Assert(orclause->boolop == OR_EXPR); - - /* Ignore index if it doesn't support SAOP clauses */ + /* Forget it if index doesn't support SAOP clauses */ if (!index->amsearcharray) return NULL; /* * Try to convert a list of OR-clauses to a single SAOP expression. Each * OR entry must be in the form: (indexkey operator constant) or (constant - * operator indexkey). Operators of all the entries must match. To be - * effective, give up on the first non-matching entry. Exit is - * implemented as a break from the loop, which is catched afterwards. + * operator indexkey). Operators of all the entries must match. On + * discovery of anything unsupported, we give up by breaking out of the + * loop immediately and returning NULL. */ foreach(lc, orclause->args) { - RestrictInfo *subRinfo; + RestrictInfo *subRinfo = (RestrictInfo *) lfirst(lc); OpExpr *subClause; Oid opno; Node *leftop, *rightop; Node *constExpr; - if (!IsA(lfirst(lc), RestrictInfo)) + /* If it's not a RestrictInfo (i.e. it's a sub-AND), we can't use it */ + if (!IsA(subRinfo, RestrictInfo)) break; - subRinfo = (RestrictInfo *) lfirst(lc); - - /* Only operator clauses can match */ + /* Only operator clauses can match */ if (!IsA(subRinfo->clause, OpExpr)) break; subClause = (OpExpr *) subRinfo->clause; opno = subClause->opno; - /* Only binary operators can match */ + /* Only binary operators can match */ if (list_length(subClause->args) != 2) break; - /* - * The parameters below must match between sub-rinfo and its parent as - * make_restrictinfo() fills them with the same values, and further - * modifications are also the same for the whole subtree. However, - * still make a sanity check. - */ - Assert(subRinfo->is_pushed_down == rinfo->is_pushed_down); - Assert(subRinfo->is_clone == rinfo->is_clone); - Assert(subRinfo->security_level == rinfo->security_level); - Assert(bms_equal(subRinfo->incompatible_relids, rinfo->incompatible_relids)); - Assert(bms_equal(subRinfo->outer_relids, rinfo->outer_relids)); - - /* - * Also, check that required_relids in sub-rinfo is subset of parent's - * required_relids. - */ - Assert(bms_is_subset(subRinfo->required_relids, rinfo->required_relids)); - - /* Only the operator returning a boolean suit the transformation. */ - if (get_op_rettype(opno) != BOOLOID) - break; - /* * Check for clauses of the form: (indexkey operator constant) or - * (constant operator indexkey). See match_clause_to_indexcol's notes - * about const-ness. + * (constant operator indexkey). These tests should agree with + * match_opclause_to_indexcol. */ leftop = (Node *) linitial(subClause->args); rightop = (Node *) lsecond(subClause->args); @@ -3405,22 +3379,6 @@ match_orclause_to_indexcol(PlannerInfo *root, break; } - /* - * Ignore any RelabelType node above the operands. 
This is needed to - * be able to apply indexscanning in binary-compatible-operator cases. - * Note: we can assume there is at most one RelabelType node; - * eval_const_expressions() will have simplified if more than one. - */ - if (IsA(constExpr, RelabelType)) - constExpr = (Node *) ((RelabelType *) constExpr)->arg; - if (IsA(indexExpr, RelabelType)) - indexExpr = (Node *) ((RelabelType *) indexExpr)->arg; - - /* Forbid transformation for composite types, records. */ - if (type_is_rowtype(exprType(constExpr)) || - type_is_rowtype(exprType(indexExpr))) - break; - /* * Save information about the operator, type, and collation for the * first matching qual. Then, check that subsequent quals match the @@ -3438,54 +3396,71 @@ match_orclause_to_indexcol(PlannerInfo *root, * the expression collation matches the index collation. Also, * there must be an array type to construct an array later. */ - if (!IndexCollMatchesExprColl(index->indexcollations[indexcol], inputcollid) || + if (!IndexCollMatchesExprColl(index->indexcollations[indexcol], + inputcollid) || !op_in_opfamily(matchOpno, index->opfamily[indexcol]) || !OidIsValid(arraytype)) break; + + /* + * Disallow if either type is RECORD, mainly because we can't be + * positive that all the RHS expressions are the same record type. + */ + if (consttype == RECORDOID || exprType(indexExpr) == RECORDOID) + break; + firstTime = false; } else { - if (opno != matchOpno || + if (matchOpno != opno || inputcollid != subClause->inputcollid || consttype != exprType(constExpr)) break; } /* - * Check if our list of constants in match_clause_to_indexcol's - * understanding of const-ness have something other than Const. + * The righthand inputs don't necessarily have to be plain Consts, but + * make_SAOP_expr needs to know if any are not. */ if (!IsA(constExpr, Const)) haveNonConst = true; + consts = lappend(consts, constExpr); } /* - * Catch the break from the loop above. Normally, a foreach() loop ends - * up with a NULL list cell. A non-NULL list cell indicates a break from - * the foreach() loop. Free the consts list and return NULL then. + * Handle failed conversion from breaking out of the loop because of an + * unsupported qual. Also check that we have an indexExpr, just in case + * the OR list was somehow empty (it shouldn't be). Return NULL to + * indicate the conversion failed. */ - if (lc != NULL) + if (lc != NULL || indexExpr == NULL) { - list_free(consts); + list_free(consts); /* might as well */ return NULL; } + /* + * Build the new SAOP node. We use the indexExpr from the last OR arm; + * since all the arms passed match_index_to_operand, it shouldn't matter + * which one we use. But using "inputcollid" twice is a bit of a cheat: + * we might end up with an array Const node that is labeled with a + * collation despite its elements being of a noncollatable type. But + * nothing is likely to complain about that, so we don't bother being more + * accurate. + */ saopexpr = make_SAOP_expr(matchOpno, indexExpr, consttype, inputcollid, inputcollid, consts, haveNonConst); + Assert(saopexpr != NULL); /* - * Finally, build an IndexClause based on the SAOP node. Use - * make_simple_restrictinfo() to get RestrictInfo with clean selectivity - * estimations, because they may differ from the estimation made for an OR - * clause. Although it is not a lossy expression, keep the original rinfo - * in iclause->rinfo as prescribed. + * Finally, build an IndexClause based on the SAOP node. It's not lossy. 
*/ iclause = makeNode(IndexClause); iclause->rinfo = rinfo; iclause->indexquals = list_make1(make_simple_restrictinfo(root, - &saopexpr->xpr)); + (Expr *) saopexpr)); iclause->lossy = false; iclause->indexcol = indexcol; iclause->indexcols = NIL; @@ -4075,6 +4050,16 @@ check_index_predicates(PlannerInfo *root, RelOptInfo *rel) if (is_target_rel) continue; + /* + * If index is !amoptionalkey, also leave indrestrictinfo as set + * above. Otherwise we risk removing all quals for the first index + * key and then not being able to generate an indexscan at all. It + * would be better to be more selective, but we've not yet identified + * which if any of the quals match the first index key. + */ + if (!index->amoptionalkey) + continue; + /* Else compute indrestrictinfo as the non-implied quals */ index->indrestrictinfo = NIL; foreach(lcr, rel->baserestrictinfo) @@ -4142,47 +4127,26 @@ ec_member_matches_indexcol(PlannerInfo *root, RelOptInfo *rel, * a set of equality conditions, because the conditions constrain all * columns of some unique index. * - * The conditions can be represented in either or both of two ways: - * 1. A list of RestrictInfo nodes, where the caller has already determined - * that each condition is a mergejoinable equality with an expression in - * this relation on one side, and an expression not involving this relation - * on the other. The transient outer_is_left flag is used to identify which - * side we should look at: left side if outer_is_left is false, right side - * if it is true. - * 2. A list of expressions in this relation, and a corresponding list of - * equality operators. The caller must have already checked that the operators - * represent equality. (Note: the operators could be cross-type; the - * expressions should correspond to their RHS inputs.) + * The conditions are provided as a list of RestrictInfo nodes, where the + * caller has already determined that each condition is a mergejoinable + * equality with an expression in this relation on one side, and an + * expression not involving this relation on the other. The transient + * outer_is_left flag is used to identify which side we should look at: + * left side if outer_is_left is false, right side if it is true. * * The caller need only supply equality conditions arising from joins; * this routine automatically adds in any usable baserestrictinfo clauses. * (Note that the passed-in restrictlist will be destructively modified!) + * + * If extra_clauses isn't NULL, return baserestrictinfo clauses which were used + * to derive uniqueness. */ bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, - List *restrictlist, - List *exprlist, List *oprlist) -{ - return relation_has_unique_index_ext(root, rel, restrictlist, - exprlist, oprlist, NULL); -} - -/* - * relation_has_unique_index_ext - * Same as relation_has_unique_index_for(), but supports extra_clauses - * parameter. If extra_clauses isn't NULL, return baserestrictinfo clauses - * which were used to derive uniqueness. - */ -bool -relation_has_unique_index_ext(PlannerInfo *root, RelOptInfo *rel, - List *restrictlist, - List *exprlist, List *oprlist, - List **extra_clauses) + List *restrictlist, List **extra_clauses) { ListCell *ic; - Assert(list_length(exprlist) == list_length(oprlist)); - /* Short-circuit if no indexes... 
*/ if (rel->indexlist == NIL) return false; @@ -4225,7 +4189,7 @@ relation_has_unique_index_ext(PlannerInfo *root, RelOptInfo *rel, } /* Short-circuit the easy case */ - if (restrictlist == NIL && exprlist == NIL) + if (restrictlist == NIL) return false; /* Examine each index of the relation ... */ @@ -4247,14 +4211,12 @@ relation_has_unique_index_ext(PlannerInfo *root, RelOptInfo *rel, continue; /* - * Try to find each index column in the lists of conditions. This is + * Try to find each index column in the list of conditions. This is * O(N^2) or worse, but we expect all the lists to be short. */ for (c = 0; c < ind->nkeycolumns; c++) { - bool matched = false; ListCell *lc; - ListCell *lc2; foreach(lc, restrictlist) { @@ -4284,8 +4246,6 @@ relation_has_unique_index_ext(PlannerInfo *root, RelOptInfo *rel, if (match_index_to_operand(rexpr, c, ind)) { - matched = true; /* column is unique */ - if (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) { MemoryContext oldMemCtx = @@ -4303,43 +4263,11 @@ relation_has_unique_index_ext(PlannerInfo *root, RelOptInfo *rel, MemoryContextSwitchTo(oldMemCtx); } - break; + break; /* found a match; column is unique */ } } - if (matched) - continue; - - forboth(lc, exprlist, lc2, oprlist) - { - Node *expr = (Node *) lfirst(lc); - Oid opr = lfirst_oid(lc2); - - /* See if the expression matches the index key */ - if (!match_index_to_operand(expr, c, ind)) - continue; - - /* - * The equality operator must be a member of the index - * opfamily, else it is not asserting the right kind of - * equality behavior for this index. We assume the caller - * determined it is an equality operator, so we don't need to - * check any more tightly than this. - */ - if (!op_in_opfamily(opr, ind->opfamily[c])) - continue; - - /* - * XXX at some point we may need to check collations here too. - * For the moment we assume all collations reduce to the same - * notion of equality. - */ - - matched = true; /* column is unique */ - break; - } - - if (!matched) + if (lc == NULL) break; /* no match; this index doesn't help us */ } diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 26f0336f1e409..ea5b6415186a3 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -112,12 +112,12 @@ static void generate_mergejoin_paths(PlannerInfo *root, * "flipped around" if we are considering joining the rels in the opposite * direction from what's indicated in sjinfo. * - * Also, this routine and others in this module accept the special JoinTypes - * JOIN_UNIQUE_OUTER and JOIN_UNIQUE_INNER to indicate that we should - * unique-ify the outer or inner relation and then apply a regular inner - * join. These values are not allowed to propagate outside this module, - * however. Path cost estimation code may need to recognize that it's - * dealing with such a case --- the combination of nominal jointype INNER + * Also, this routine accepts the special JoinTypes JOIN_UNIQUE_OUTER and + * JOIN_UNIQUE_INNER to indicate that the outer or inner relation has been + * unique-ified and a regular inner join should then be applied. These values + * are not allowed to propagate outside this routine, however. Path cost + * estimation code, as well as match_unsorted_outer, may need to recognize that + * it's dealing with such a case --- the combination of nominal jointype INNER * with sjinfo->jointype == JOIN_SEMI indicates that. 
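+ * (One such check is the RELATION_WAS_MADE_UNIQUE test in + * match_unsorted_outer.)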
*/ void @@ -129,6 +129,7 @@ add_paths_to_joinrel(PlannerInfo *root, SpecialJoinInfo *sjinfo, List *restrictlist) { + JoinType save_jointype = jointype; JoinPathExtraData extra; bool mergejoin_allowed = true; ListCell *lc; @@ -154,27 +155,25 @@ add_paths_to_joinrel(PlannerInfo *root, /* * See if the inner relation is provably unique for this outer rel. * - * We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't - * matter since the executor can make the equivalent optimization anyway; - * we need not expend planner cycles on proofs. For JOIN_UNIQUE_INNER, we - * must be considering a semijoin whose inner side is not provably unique - * (else reduce_unique_semijoins would've simplified it), so there's no - * point in calling innerrel_is_unique. However, if the LHS covers all of - * the semijoin's min_lefthand, then it's appropriate to set inner_unique - * because the path produced by create_unique_path will be unique relative - * to the LHS. (If we have an LHS that's only part of the min_lefthand, - * that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid - * letting that value escape this module. + * We have some special cases: for JOIN_SEMI, it doesn't matter since the + * executor can make the equivalent optimization anyway. It also doesn't + * help enable use of Memoize, since a semijoin with a provably unique + * inner side should have been reduced to an inner join in that case. + * Therefore, we need not expend planner cycles on proofs. (For + * JOIN_ANTI, although it doesn't help the executor for the same reason, + * it can benefit Memoize paths.) For JOIN_UNIQUE_INNER, we must be + * considering a semijoin whose inner side is not provably unique (else + * reduce_unique_semijoins would've simplified it), so there's no point in + * calling innerrel_is_unique. However, if the LHS covers all of the + * semijoin's min_lefthand, then it's appropriate to set inner_unique + * because the unique relation produced by create_unique_paths will be + * unique relative to the LHS. (If we have an LHS that's only part of the + * min_lefthand, that is *not* true.) For JOIN_UNIQUE_OUTER, pass + * JOIN_INNER to avoid letting that value escape this module. */ switch (jointype) { case JOIN_SEMI: - case JOIN_ANTI: - - /* - * XXX it may be worth proving this to allow a Memoize to be - * considered for Nested Loop Semi/Anti Joins. - */ extra.inner_unique = false; /* well, unproven */ break; case JOIN_UNIQUE_INNER: @@ -201,6 +200,13 @@ add_paths_to_joinrel(PlannerInfo *root, break; } + /* + * If the outer or inner relation has been unique-ified, handle as a plain + * inner join. + */ + if (jointype == JOIN_UNIQUE_OUTER || jointype == JOIN_UNIQUE_INNER) + jointype = JOIN_INNER; + /* * Find potential mergejoin clauses. We can skip this if we are not * interested in doing a mergejoin. However, mergejoin may be our only @@ -331,7 +337,7 @@ add_paths_to_joinrel(PlannerInfo *root, joinrel->fdwroutine->GetForeignJoinPaths) joinrel->fdwroutine->GetForeignJoinPaths(root, joinrel, outerrel, innerrel, - jointype, &extra); + save_jointype, &extra); /* * 6. Finally, give extensions a chance to manipulate the path list. 
They @@ -341,7 +347,7 @@ add_paths_to_joinrel(PlannerInfo *root, */ if (set_join_pathlist_hook) set_join_pathlist_hook(root, joinrel, outerrel, innerrel, - jointype, &extra); + save_jointype, &extra); } /* @@ -715,16 +721,21 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel, return NULL; /* - * Currently we don't do this for SEMI and ANTI joins unless they're - * marked as inner_unique. This is because nested loop SEMI/ANTI joins - * don't scan the inner node to completion, which will mean memoize cannot - * mark the cache entry as complete. - * - * XXX Currently we don't attempt to mark SEMI/ANTI joins as inner_unique - * = true. Should we? See add_paths_to_joinrel() + * Currently we don't do this for SEMI and ANTI joins, because nested loop + * SEMI/ANTI joins don't scan the inner node to completion, which means + * memoize cannot mark the cache entry as complete. Nor can we mark the + * cache entry as complete after fetching the first inner tuple, because + * if that tuple and the current outer tuple don't satisfy the join + * clauses, a second inner tuple that satisfies the parameters would find + * the cache entry already marked as complete. The only exception is when + * the inner relation is provably unique, as in that case, there won't be + * a second matching tuple and we can safely mark the cache entry as + * complete after fetching the first inner tuple. Note that in such + * cases, the SEMI join should have been reduced to an inner join by + * reduce_unique_semijoins. */ - if (!extra->inner_unique && (jointype == JOIN_SEMI || - jointype == JOIN_ANTI)) + if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) && + !extra->inner_unique) return NULL; /* @@ -876,16 +887,13 @@ try_nestloop_path(PlannerInfo *root, /* * Check to see if proposed path is still parameterized, and reject if the * parameterization wouldn't be sensible --- unless allow_star_schema_join - * says to allow it anyway. Also, we must reject if have_dangerous_phv - * doesn't like the look of it, which could only happen if the nestloop is - * still parameterized. + * says to allow it anyway. */ required_outer = calc_nestloop_required_outer(outerrelids, outer_paramrels, innerrelids, inner_paramrels); if (required_outer && - ((!bms_overlap(required_outer, extra->param_source_rels) && - !allow_star_schema_join(root, outerrelids, inner_paramrels)) || - have_dangerous_phv(root, outerrelids, inner_paramrels))) + !bms_overlap(required_outer, extra->param_source_rels) && + !allow_star_schema_join(root, outerrelids, inner_paramrels)) { /* Waste no memory when we reject a path here */ bms_free(required_outer); @@ -1364,7 +1372,6 @@ sort_inner_and_outer(PlannerInfo *root, JoinType jointype, JoinPathExtraData *extra) { - JoinType save_jointype = jointype; Path *outer_path; Path *inner_path; Path *cheapest_partial_outer = NULL; @@ -1402,38 +1409,16 @@ sort_inner_and_outer(PlannerInfo *root, PATH_PARAM_BY_REL(inner_path, outerrel)) return; - /* - * If unique-ification is requested, do it and then handle as a plain - * inner join. - */ - if (jointype == JOIN_UNIQUE_OUTER) - { - outer_path = (Path *) create_unique_path(root, outerrel, - outer_path, extra->sjinfo); - Assert(outer_path); - jointype = JOIN_INNER; - } - else if (jointype == JOIN_UNIQUE_INNER) - { - inner_path = (Path *) create_unique_path(root, innerrel, - inner_path, extra->sjinfo); - Assert(inner_path); - jointype = JOIN_INNER; - } - /* * If the joinrel is parallel-safe, we may be able to consider a partial - * merge join. 
However, we can't handle JOIN_UNIQUE_OUTER, because the - * outer path will be partial, and therefore we won't be able to properly - * guarantee uniqueness. Similarly, we can't handle JOIN_FULL, JOIN_RIGHT - * and JOIN_RIGHT_ANTI, because they can produce false null extended rows. + * merge join. However, we can't handle JOIN_FULL, JOIN_RIGHT and + * JOIN_RIGHT_ANTI, because they can produce false null extended rows. * Also, the resulting path must not be parameterized. */ if (joinrel->consider_parallel && - save_jointype != JOIN_UNIQUE_OUTER && - save_jointype != JOIN_FULL && - save_jointype != JOIN_RIGHT && - save_jointype != JOIN_RIGHT_ANTI && + jointype != JOIN_FULL && + jointype != JOIN_RIGHT && + jointype != JOIN_RIGHT_ANTI && outerrel->partial_pathlist != NIL && bms_is_empty(joinrel->lateral_relids)) { @@ -1441,7 +1426,7 @@ sort_inner_and_outer(PlannerInfo *root, if (inner_path->parallel_safe) cheapest_safe_inner = inner_path; - else if (save_jointype != JOIN_UNIQUE_INNER) + else cheapest_safe_inner = get_cheapest_parallel_safe_total_inner(innerrel->pathlist); } @@ -1580,13 +1565,9 @@ generate_mergejoin_paths(PlannerInfo *root, List *trialsortkeys; Path *cheapest_startup_inner; Path *cheapest_total_inner; - JoinType save_jointype = jointype; int num_sortkeys; int sortkeycnt; - if (jointype == JOIN_UNIQUE_OUTER || jointype == JOIN_UNIQUE_INNER) - jointype = JOIN_INNER; - /* Look for useful mergeclauses (if any) */ mergeclauses = find_mergeclauses_for_outer_pathkeys(root, @@ -1636,10 +1617,6 @@ generate_mergejoin_paths(PlannerInfo *root, extra, is_partial); - /* Can't do anything else if inner path needs to be unique'd */ - if (save_jointype == JOIN_UNIQUE_INNER) - return; - /* * Look for presorted inner paths that satisfy the innersortkey list --- * or any truncation thereof, if we are allowed to build a mergejoin using @@ -1819,7 +1796,6 @@ match_unsorted_outer(PlannerInfo *root, JoinType jointype, JoinPathExtraData *extra) { - JoinType save_jointype = jointype; bool nestjoinOK; bool useallclauses; Path *inner_cheapest_total = innerrel->cheapest_total_path; @@ -1855,12 +1831,6 @@ match_unsorted_outer(PlannerInfo *root, nestjoinOK = false; useallclauses = true; break; - case JOIN_UNIQUE_OUTER: - case JOIN_UNIQUE_INNER: - jointype = JOIN_INNER; - nestjoinOK = true; - useallclauses = false; - break; default: elog(ERROR, "unrecognized join type: %d", (int) jointype); @@ -1873,24 +1843,20 @@ match_unsorted_outer(PlannerInfo *root, * If inner_cheapest_total is parameterized by the outer rel, ignore it; * we will consider it below as a member of cheapest_parameterized_paths, * but the other possibilities considered in this routine aren't usable. + * + * Furthermore, if the inner side is a unique-ified relation, we cannot + * generate any valid paths here, because the inner rel's dependency on + * the outer rel makes unique-ification meaningless. */ if (PATH_PARAM_BY_REL(inner_cheapest_total, outerrel)) + { inner_cheapest_total = NULL; - /* - * If we need to unique-ify the inner path, we will consider only the - * cheapest-total inner. 
- */ - if (save_jointype == JOIN_UNIQUE_INNER) - { - /* No way to do this with an inner path parameterized by outer rel */ - if (inner_cheapest_total == NULL) + if (RELATION_WAS_MADE_UNIQUE(innerrel, extra->sjinfo, jointype)) return; - inner_cheapest_total = (Path *) - create_unique_path(root, innerrel, inner_cheapest_total, extra->sjinfo); - Assert(inner_cheapest_total); } - else if (nestjoinOK) + + if (nestjoinOK) { /* * Consider materializing the cheapest inner path, unless @@ -1914,20 +1880,6 @@ match_unsorted_outer(PlannerInfo *root, if (PATH_PARAM_BY_REL(outerpath, innerrel)) continue; - /* - * If we need to unique-ify the outer path, it's pointless to consider - * any but the cheapest outer. (XXX we don't consider parameterized - * outers, nor inners, for unique-ified cases. Should we?) - */ - if (save_jointype == JOIN_UNIQUE_OUTER) - { - if (outerpath != outerrel->cheapest_total_path) - continue; - outerpath = (Path *) create_unique_path(root, outerrel, - outerpath, extra->sjinfo); - Assert(outerpath); - } - /* * The result will have this sort order (even if it is implemented as * a nestloop, and even if some of the mergeclauses are implemented by @@ -1936,21 +1888,7 @@ match_unsorted_outer(PlannerInfo *root, merge_pathkeys = build_join_pathkeys(root, joinrel, jointype, outerpath->pathkeys); - if (save_jointype == JOIN_UNIQUE_INNER) - { - /* - * Consider nestloop join, but only with the unique-ified cheapest - * inner path - */ - try_nestloop_path(root, - joinrel, - outerpath, - inner_cheapest_total, - merge_pathkeys, - jointype, - extra); - } - else if (nestjoinOK) + if (nestjoinOK) { /* * Consider nestloop joins using this outer path and various @@ -2001,17 +1939,13 @@ match_unsorted_outer(PlannerInfo *root, extra); } - /* Can't do anything else if outer path needs to be unique'd */ - if (save_jointype == JOIN_UNIQUE_OUTER) - continue; - /* Can't do anything else if inner rel is parameterized by outer */ if (inner_cheapest_total == NULL) continue; /* Generate merge join paths */ generate_mergejoin_paths(root, joinrel, innerrel, outerpath, - save_jointype, extra, useallclauses, + jointype, extra, useallclauses, inner_cheapest_total, merge_pathkeys, false); } @@ -2019,41 +1953,35 @@ match_unsorted_outer(PlannerInfo *root, /* * Consider partial nestloop and mergejoin plan if outerrel has any * partial path and the joinrel is parallel-safe. However, we can't - * handle JOIN_UNIQUE_OUTER, because the outer path will be partial, and - * therefore we won't be able to properly guarantee uniqueness. Nor can - * we handle joins needing lateral rels, since partial paths must not be - * parameterized. Similarly, we can't handle JOIN_FULL, JOIN_RIGHT and + * handle joins needing lateral rels, since partial paths must not be + * parameterized. Similarly, we can't handle JOIN_FULL, JOIN_RIGHT and * JOIN_RIGHT_ANTI, because they can produce false null extended rows. */ if (joinrel->consider_parallel && - save_jointype != JOIN_UNIQUE_OUTER && - save_jointype != JOIN_FULL && - save_jointype != JOIN_RIGHT && - save_jointype != JOIN_RIGHT_ANTI && + jointype != JOIN_FULL && + jointype != JOIN_RIGHT && + jointype != JOIN_RIGHT_ANTI && outerrel->partial_pathlist != NIL && bms_is_empty(joinrel->lateral_relids)) { if (nestjoinOK) consider_parallel_nestloop(root, joinrel, outerrel, innerrel, - save_jointype, extra); + jointype, extra); /* * If inner_cheapest_total is NULL or non parallel-safe then find the - * cheapest total parallel safe path. 
If doing JOIN_UNIQUE_INNER, we - * can't use any alternative inner path. + * cheapest total parallel safe path. */ if (inner_cheapest_total == NULL || !inner_cheapest_total->parallel_safe) { - if (save_jointype == JOIN_UNIQUE_INNER) - return; - - inner_cheapest_total = get_cheapest_parallel_safe_total_inner(innerrel->pathlist); + inner_cheapest_total = + get_cheapest_parallel_safe_total_inner(innerrel->pathlist); } if (inner_cheapest_total) consider_parallel_mergejoin(root, joinrel, outerrel, innerrel, - save_jointype, extra, + jointype, extra, inner_cheapest_total); } } @@ -2118,24 +2046,17 @@ consider_parallel_nestloop(PlannerInfo *root, JoinType jointype, JoinPathExtraData *extra) { - JoinType save_jointype = jointype; Path *inner_cheapest_total = innerrel->cheapest_total_path; Path *matpath = NULL; ListCell *lc1; - if (jointype == JOIN_UNIQUE_INNER) - jointype = JOIN_INNER; - /* - * Consider materializing the cheapest inner path, unless: 1) we're doing - * JOIN_UNIQUE_INNER, because in this case we have to unique-ify the - * cheapest inner path, 2) enable_material is off, 3) the cheapest inner - * path is not parallel-safe, 4) the cheapest inner path is parameterized - * by the outer rel, or 5) the cheapest inner path materializes its output - * anyway. + * Consider materializing the cheapest inner path, unless: 1) + * enable_material is off, 2) the cheapest inner path is not + * parallel-safe, 3) the cheapest inner path is parameterized by the outer + * rel, or 4) the cheapest inner path materializes its output anyway. */ - if (save_jointype != JOIN_UNIQUE_INNER && - enable_material && inner_cheapest_total->parallel_safe && + if (enable_material && inner_cheapest_total->parallel_safe && !PATH_PARAM_BY_REL(inner_cheapest_total, outerrel) && !ExecMaterializesOutput(inner_cheapest_total->pathtype)) { @@ -2169,23 +2090,6 @@ consider_parallel_nestloop(PlannerInfo *root, if (!innerpath->parallel_safe) continue; - /* - * If we're doing JOIN_UNIQUE_INNER, we can only use the inner's - * cheapest_total_path, and we have to unique-ify it. (We might - * be able to relax this to allow other safe, unparameterized - * inner paths, but right now create_unique_path is not on board - * with that.) 
- */ - if (save_jointype == JOIN_UNIQUE_INNER) - { - if (innerpath != innerrel->cheapest_total_path) - continue; - innerpath = (Path *) create_unique_path(root, innerrel, - innerpath, - extra->sjinfo); - Assert(innerpath); - } - try_partial_nestloop_path(root, joinrel, outerpath, innerpath, pathkeys, jointype, extra); @@ -2227,7 +2131,6 @@ hash_inner_and_outer(PlannerInfo *root, JoinType jointype, JoinPathExtraData *extra) { - JoinType save_jointype = jointype; bool isouterjoin = IS_OUTER_JOIN(jointype); List *hashclauses; ListCell *l; @@ -2290,6 +2193,8 @@ hash_inner_and_outer(PlannerInfo *root, Path *cheapest_startup_outer = outerrel->cheapest_startup_path; Path *cheapest_total_outer = outerrel->cheapest_total_path; Path *cheapest_total_inner = innerrel->cheapest_total_path; + ListCell *lc1; + ListCell *lc2; /* * If either cheapest-total path is parameterized by the other rel, we @@ -2301,114 +2206,74 @@ hash_inner_and_outer(PlannerInfo *root, PATH_PARAM_BY_REL(cheapest_total_inner, outerrel)) return; - /* Unique-ify if need be; we ignore parameterized possibilities */ - if (jointype == JOIN_UNIQUE_OUTER) - { - cheapest_total_outer = (Path *) - create_unique_path(root, outerrel, - cheapest_total_outer, extra->sjinfo); - Assert(cheapest_total_outer); - jointype = JOIN_INNER; - try_hashjoin_path(root, - joinrel, - cheapest_total_outer, - cheapest_total_inner, - hashclauses, - jointype, - extra); - /* no possibility of cheap startup here */ - } - else if (jointype == JOIN_UNIQUE_INNER) - { - cheapest_total_inner = (Path *) - create_unique_path(root, innerrel, - cheapest_total_inner, extra->sjinfo); - Assert(cheapest_total_inner); - jointype = JOIN_INNER; + /* + * Consider the cheapest startup outer together with the cheapest + * total inner, and then consider pairings of cheapest-total paths + * including parameterized ones. There is no use in generating + * parameterized paths on the basis of possibly cheap startup cost, so + * this is sufficient. + */ + if (cheapest_startup_outer != NULL) try_hashjoin_path(root, joinrel, - cheapest_total_outer, + cheapest_startup_outer, cheapest_total_inner, hashclauses, jointype, extra); - if (cheapest_startup_outer != NULL && - cheapest_startup_outer != cheapest_total_outer) - try_hashjoin_path(root, - joinrel, - cheapest_startup_outer, - cheapest_total_inner, - hashclauses, - jointype, - extra); - } - else + + foreach(lc1, outerrel->cheapest_parameterized_paths) { + Path *outerpath = (Path *) lfirst(lc1); + /* - * For other jointypes, we consider the cheapest startup outer - * together with the cheapest total inner, and then consider - * pairings of cheapest-total paths including parameterized ones. - * There is no use in generating parameterized paths on the basis - * of possibly cheap startup cost, so this is sufficient. + * We cannot use an outer path that is parameterized by the inner + * rel. */ - ListCell *lc1; - ListCell *lc2; - - if (cheapest_startup_outer != NULL) - try_hashjoin_path(root, - joinrel, - cheapest_startup_outer, - cheapest_total_inner, - hashclauses, - jointype, - extra); + if (PATH_PARAM_BY_REL(outerpath, innerrel)) + continue; - foreach(lc1, outerrel->cheapest_parameterized_paths) + foreach(lc2, innerrel->cheapest_parameterized_paths) { - Path *outerpath = (Path *) lfirst(lc1); + Path *innerpath = (Path *) lfirst(lc2); /* - * We cannot use an outer path that is parameterized by the - * inner rel. + * We cannot use an inner path that is parameterized by the + * outer rel, either. 
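+ * (Unlike a nestloop, a hash join cannot feed values from the + * current outer row to its inner side, so a path parameterized + * by the outer rel is of no use here.)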
*/ - if (PATH_PARAM_BY_REL(outerpath, innerrel)) + if (PATH_PARAM_BY_REL(innerpath, outerrel)) continue; - foreach(lc2, innerrel->cheapest_parameterized_paths) - { - Path *innerpath = (Path *) lfirst(lc2); - - /* - * We cannot use an inner path that is parameterized by - * the outer rel, either. - */ - if (PATH_PARAM_BY_REL(innerpath, outerrel)) - continue; - - if (outerpath == cheapest_startup_outer && - innerpath == cheapest_total_inner) - continue; /* already tried it */ + if (outerpath == cheapest_startup_outer && + innerpath == cheapest_total_inner) + continue; /* already tried it */ - try_hashjoin_path(root, - joinrel, - outerpath, - innerpath, - hashclauses, - jointype, - extra); - } + try_hashjoin_path(root, + joinrel, + outerpath, + innerpath, + hashclauses, + jointype, + extra); } } /* * If the joinrel is parallel-safe, we may be able to consider a - * partial hash join. However, we can't handle JOIN_UNIQUE_OUTER, - * because the outer path will be partial, and therefore we won't be - * able to properly guarantee uniqueness. Also, the resulting path - * must not be parameterized. + * partial hash join. + * + * However, we can't handle JOIN_RIGHT_SEMI, because the hash table is + * either a shared hash table or a private hash table per backend. In + * the shared case, there is no concurrency protection for the match + * flags, so multiple workers could inspect and set the flags + * concurrently, potentially producing incorrect results. In the + * private case, each worker has its own copy of the hash table, so no + * single process has all the match flags. + * + * Also, the resulting path must not be parameterized. */ if (joinrel->consider_parallel && - save_jointype != JOIN_UNIQUE_OUTER && + jointype != JOIN_RIGHT_SEMI && outerrel->partial_pathlist != NIL && bms_is_empty(joinrel->lateral_relids)) { @@ -2421,11 +2286,9 @@ hash_inner_and_outer(PlannerInfo *root, /* * Can we use a partial inner plan too, so that we can build a - * shared hash table in parallel? We can't handle - * JOIN_UNIQUE_INNER because we can't guarantee uniqueness. + * shared hash table in parallel? */ if (innerrel->partial_pathlist != NIL && - save_jointype != JOIN_UNIQUE_INNER && enable_parallel_hash) { cheapest_partial_inner = @@ -2441,19 +2304,17 @@ hash_inner_and_outer(PlannerInfo *root, * Normally, given that the joinrel is parallel-safe, the cheapest * total inner path will also be parallel-safe, but if not, we'll * have to search for the cheapest safe, unparameterized inner - * path. If doing JOIN_UNIQUE_INNER, we can't use any alternative - * inner path. If full, right, right-semi or right-anti join, we - * can't use parallelism (building the hash table in each backend) - * because no one process has all the match bits. + * path. If full, right, or right-anti join, we can't use + * parallelism (building the hash table in each backend) because + * no one process has all the match bits. 
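+ * (JOIN_RIGHT_SEMI needs no recheck here: it was already + * excluded above when deciding whether to consider a partial + * hash join at all.)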
*/ - if (save_jointype == JOIN_FULL || - save_jointype == JOIN_RIGHT || - save_jointype == JOIN_RIGHT_SEMI || - save_jointype == JOIN_RIGHT_ANTI) + if (jointype == JOIN_FULL || + jointype == JOIN_RIGHT || + jointype == JOIN_RIGHT_ANTI) cheapest_safe_inner = NULL; else if (cheapest_total_inner->parallel_safe) cheapest_safe_inner = cheapest_total_inner; - else if (save_jointype != JOIN_UNIQUE_INNER) + else cheapest_safe_inner = get_cheapest_parallel_safe_total_inner(innerrel->pathlist); diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 60d65762b5d5e..8827b9a5245a0 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -16,9 +16,11 @@ #include "miscadmin.h" #include "optimizer/appendinfo.h" +#include "optimizer/cost.h" #include "optimizer/joininfo.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" +#include "optimizer/planner.h" #include "partitioning/partbounds.h" #include "utils/memutils.h" @@ -35,6 +37,9 @@ static bool has_legal_joinclause(PlannerInfo *root, RelOptInfo *rel); static bool restriction_is_constant_false(List *restrictlist, RelOptInfo *joinrel, bool only_pushed_down); +static void make_grouped_join_rel(PlannerInfo *root, RelOptInfo *rel1, + RelOptInfo *rel2, RelOptInfo *joinrel, + SpecialJoinInfo *sjinfo, List *restrictlist); static void populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, RelOptInfo *joinrel, SpecialJoinInfo *sjinfo, List *restrictlist); @@ -444,8 +449,7 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, } else if (sjinfo->jointype == JOIN_SEMI && bms_equal(sjinfo->syn_righthand, rel2->relids) && - create_unique_path(root, rel2, rel2->cheapest_total_path, - sjinfo) != NULL) + create_unique_paths(root, rel2, sjinfo) != NULL) { /*---------- * For a semijoin, we can join the RHS to anything else by @@ -477,8 +481,7 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, } else if (sjinfo->jointype == JOIN_SEMI && bms_equal(sjinfo->syn_righthand, rel1->relids) && - create_unique_path(root, rel1, rel1->cheapest_total_path, - sjinfo) != NULL) + create_unique_paths(root, rel1, sjinfo) != NULL) { /* Reversed semijoin case */ if (match_sjinfo) @@ -565,9 +568,6 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, * Also, if the lateral reference is only indirect, we should reject * the join; whatever rel(s) the reference chain goes through must be * joined to first. - * - * Another case that might keep us from building a valid plan is the - * implementation restriction described by have_dangerous_phv(). 
*/ lateral_fwd = bms_overlap(rel1->relids, rel2->lateral_relids); lateral_rev = bms_overlap(rel2->relids, rel1->lateral_relids); @@ -584,9 +584,6 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, /* check there is a direct reference from rel2 to rel1 */ if (!bms_overlap(rel1->relids, rel2->direct_lateral_relids)) return false; /* only indirect refs, so reject */ - /* check we won't have a dangerous PHV */ - if (have_dangerous_phv(root, rel1->relids, rel2->lateral_relids)) - return false; /* might be unable to handle required PHV */ } else if (lateral_rev) { @@ -599,9 +596,6 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, /* check there is a direct reference from rel1 to rel2 */ if (!bms_overlap(rel2->relids, rel1->direct_lateral_relids)) return false; /* only indirect refs, so reject */ - /* check we won't have a dangerous PHV */ - if (have_dangerous_phv(root, rel2->relids, rel1->lateral_relids)) - return false; /* might be unable to handle required PHV */ } /* @@ -772,6 +766,10 @@ make_join_rel(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2) return joinrel; } + /* Build a grouped join relation for 'joinrel' if possible. */ + make_grouped_join_rel(root, rel1, rel2, joinrel, sjinfo, + restrictlist); + /* Add paths to the join relation. */ populate_joinrel_with_paths(root, rel1, rel2, joinrel, sjinfo, restrictlist); @@ -883,6 +881,186 @@ add_outer_joins_to_relids(PlannerInfo *root, Relids input_relids, return input_relids; } +/* + * make_grouped_join_rel + * Build a grouped join relation for the given "joinrel" if eager + * aggregation is applicable and the resulting grouped paths are considered + * useful. + * + * There are two strategies for generating grouped paths for a join relation: + * + * 1. Join a grouped (partially aggregated) input relation with a non-grouped + * input (e.g., AGG(B) JOIN A). + * + * 2. Apply partial aggregation (sorted or hashed) on top of existing + * non-grouped join paths (e.g., AGG(A JOIN B)). + * + * To limit planning effort and avoid an explosion of alternatives, we adopt a + * strategy where partial aggregation is only pushed to the lowest possible + * level in the join tree that is deemed useful. That is, if grouped paths can + * be built using the first strategy, we skip consideration of the second + * strategy for the same join level. + * + * Additionally, if there are multiple lowest useful levels where partial + * aggregation could be applied, such as in a join tree with relations A, B, + * and C where both "AGG(A JOIN B) JOIN C" and "A JOIN AGG(B JOIN C)" are valid + * placements, we choose only the first one encountered during join search. + * This avoids generating multiple versions of the same grouped relation based + * on different aggregation placements. + * + * These heuristics also ensure that all grouped paths for the same grouped + * relation produce the same set of rows, which is a basic assumption in the + * planner. + */ +static void +make_grouped_join_rel(PlannerInfo *root, RelOptInfo *rel1, + RelOptInfo *rel2, RelOptInfo *joinrel, + SpecialJoinInfo *sjinfo, List *restrictlist) +{ + RelOptInfo *grouped_rel; + RelOptInfo *grouped_rel1; + RelOptInfo *grouped_rel2; + bool rel1_empty; + bool rel2_empty; + Relids apply_agg_at; + + /* + * If there are no aggregate expressions or grouping expressions, eager + * aggregation is not possible. 
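+ * For example, a query like "SELECT a.x, sum(b.y) FROM a JOIN b ON ... + * GROUP BY a.x" supplies both, making a grouped path such as + * "AGG(B) JOIN A" (see the header comment above) worth considering.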
+ */ + if (root->agg_clause_list == NIL || + root->group_expr_list == NIL) + return; + + /* Retrieve the grouped relations for the two input rels */ + grouped_rel1 = rel1->grouped_rel; + grouped_rel2 = rel2->grouped_rel; + + rel1_empty = (grouped_rel1 == NULL || IS_DUMMY_REL(grouped_rel1)); + rel2_empty = (grouped_rel2 == NULL || IS_DUMMY_REL(grouped_rel2)); + + /* Find or construct a grouped joinrel for this joinrel */ + grouped_rel = joinrel->grouped_rel; + if (grouped_rel == NULL) + { + RelAggInfo *agg_info = NULL; + + /* + * Prepare the information needed to create grouped paths for this + * join relation. + */ + agg_info = create_rel_agg_info(root, joinrel, rel1_empty == rel2_empty); + if (agg_info == NULL) + return; + + /* + * If grouped paths for the given join relation are not considered + * useful, and no grouped paths can be built by joining grouped input + * relations, skip building the grouped join relation. + */ + if (!agg_info->agg_useful && + (rel1_empty == rel2_empty)) + return; + + /* build the grouped relation */ + grouped_rel = build_grouped_rel(root, joinrel); + grouped_rel->reltarget = agg_info->target; + + if (rel1_empty != rel2_empty) + { + /* + * If there is exactly one grouped input relation, then we can + * build grouped paths by joining the input relations. Set size + * estimates for the grouped join relation based on the input + * relations, and update the set of relids where partial + * aggregation is applied to that of the grouped input relation. + */ + set_joinrel_size_estimates(root, grouped_rel, + rel1_empty ? rel1 : grouped_rel1, + rel2_empty ? rel2 : grouped_rel2, + sjinfo, restrictlist); + agg_info->apply_agg_at = rel1_empty ? + grouped_rel2->agg_info->apply_agg_at : + grouped_rel1->agg_info->apply_agg_at; + } + else + { + /* + * Otherwise, grouped paths can be built by applying partial + * aggregation on top of existing non-grouped join paths. Set + * size estimates for the grouped join relation based on the + * estimated number of groups, and track the set of relids where + * partial aggregation is applied. Note that these values may be + * updated later if it is determined that grouped paths can be + * constructed by joining other input relations. + */ + grouped_rel->rows = agg_info->grouped_rows; + agg_info->apply_agg_at = bms_copy(joinrel->relids); + } + + grouped_rel->agg_info = agg_info; + joinrel->grouped_rel = grouped_rel; + } + + Assert(IS_GROUPED_REL(grouped_rel)); + + /* We may have already proven this grouped join relation to be dummy. */ + if (IS_DUMMY_REL(grouped_rel)) + return; + + /* + * Nothing to do if there's no grouped input relation. Also, joining two + * grouped relations is not currently supported. + */ + if (rel1_empty == rel2_empty) + return; + + /* + * Get the set of relids where partial aggregation is applied among the + * given input relations. + */ + apply_agg_at = rel1_empty ? + grouped_rel2->agg_info->apply_agg_at : + grouped_rel1->agg_info->apply_agg_at; + + /* + * If it's not the designated level, skip building grouped paths. + * + * One exception is when it is a subset of the previously recorded level. + * In that case, we need to update the designated level to this one, and + * adjust the size estimates for the grouped join relation accordingly. + * For example, suppose partial aggregation can be applied on top of (B + * JOIN C). If we first construct the join as ((A JOIN B) JOIN C), we'd + * record the designated level as including all three relations (A B C). 
+ * Later, when we consider (A JOIN (B JOIN C)), we encounter the smaller + * (B C) join level directly. Since this is a subset of the previous + * level and still valid for partial aggregation, we update the designated + * level to (B C), and adjust the size estimates accordingly. + */ + if (!bms_equal(apply_agg_at, grouped_rel->agg_info->apply_agg_at)) + { + if (bms_is_subset(apply_agg_at, grouped_rel->agg_info->apply_agg_at)) + { + /* Adjust the size estimates for the grouped join relation. */ + set_joinrel_size_estimates(root, grouped_rel, + rel1_empty ? rel1 : grouped_rel1, + rel2_empty ? rel2 : grouped_rel2, + sjinfo, restrictlist); + grouped_rel->agg_info->apply_agg_at = apply_agg_at; + } + else + return; + } + + /* Make paths for the grouped join relation. */ + populate_joinrel_with_paths(root, + rel1_empty ? rel1 : grouped_rel1, + rel2_empty ? rel2 : grouped_rel2, + grouped_rel, + sjinfo, + restrictlist); +} + /* * populate_joinrel_with_paths * Add paths to the given joinrel for given pair of joining relations. The @@ -895,6 +1073,8 @@ populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, RelOptInfo *joinrel, SpecialJoinInfo *sjinfo, List *restrictlist) { + RelOptInfo *unique_rel2; + /* * Consider paths using each rel as both outer and inner. Depending on * the join type, a provably empty outer or inner rel might mean the join @@ -1000,14 +1180,13 @@ populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, /* * If we know how to unique-ify the RHS and one input rel is * exactly the RHS (not a superset) we can consider unique-ifying - * it and then doing a regular join. (The create_unique_path + * it and then doing a regular join. (The create_unique_paths * check here is probably redundant with what join_is_legal did, * but if so the check is cheap because it's cached. So test * anyway to be sure.) */ if (bms_equal(sjinfo->syn_righthand, rel2->relids) && - create_unique_path(root, rel2, rel2->cheapest_total_path, - sjinfo) != NULL) + (unique_rel2 = create_unique_paths(root, rel2, sjinfo)) != NULL) { if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || restriction_is_constant_false(restrictlist, joinrel, false)) @@ -1015,10 +1194,10 @@ populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, mark_dummy_rel(joinrel); break; } - add_paths_to_joinrel(root, joinrel, rel1, rel2, + add_paths_to_joinrel(root, joinrel, rel1, unique_rel2, JOIN_UNIQUE_INNER, sjinfo, restrictlist); - add_paths_to_joinrel(root, joinrel, rel2, rel1, + add_paths_to_joinrel(root, joinrel, unique_rel2, rel1, JOIN_UNIQUE_OUTER, sjinfo, restrictlist); } @@ -1278,57 +1457,6 @@ has_legal_joinclause(PlannerInfo *root, RelOptInfo *rel) } -/* - * There's a pitfall for creating parameterized nestloops: suppose the inner - * rel (call it A) has a parameter that is a PlaceHolderVar, and that PHV's - * minimum eval_at set includes the outer rel (B) and some third rel (C). - * We might think we could create a B/A nestloop join that's parameterized by - * C. But we would end up with a plan in which the PHV's expression has to be - * evaluated as a nestloop parameter at the B/A join; and the executor is only - * set up to handle simple Vars as NestLoopParams. Rather than add complexity - * and overhead to the executor for such corner cases, it seems better to - * forbid the join. (Note that we can still make use of A's parameterized - * path with pre-joined B+C as the outer rel. 
have_join_order_restriction() - * ensures that we will consider making such a join even if there are not - * other reasons to do so.) - * - * So we check whether any PHVs used in the query could pose such a hazard. - * We don't have any simple way of checking whether a risky PHV would actually - * be used in the inner plan, and the case is so unusual that it doesn't seem - * worth working very hard on it. - * - * This needs to be checked in two places. If the inner rel's minimum - * parameterization would trigger the restriction, then join_is_legal() should - * reject the join altogether, because there will be no workable paths for it. - * But joinpath.c has to check again for every proposed nestloop path, because - * the inner path might have more than the minimum parameterization, causing - * some PHV to be dangerous for it that otherwise wouldn't be. - */ -bool -have_dangerous_phv(PlannerInfo *root, - Relids outer_relids, Relids inner_params) -{ - ListCell *lc; - - foreach(lc, root->placeholder_list) - { - PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(lc); - - if (!bms_is_subset(phinfo->ph_eval_at, inner_params)) - continue; /* ignore, could not be a nestloop param */ - if (!bms_overlap(phinfo->ph_eval_at, outer_relids)) - continue; /* ignore, not relevant to this join */ - if (bms_is_subset(phinfo->ph_eval_at, outer_relids)) - continue; /* safe, it can be eval'd within outerrel */ - /* Otherwise, it's potentially unsafe, so reject the join */ - return true; - } - - /* OK to perform the join */ - return false; -} - - /* * is_dummy_rel --- has relation been proven empty? */ @@ -1675,6 +1803,11 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, adjust_child_relids(joinrel->relids, nappinfos, appinfos))); + /* Build a grouped join relation for 'child_joinrel' if possible */ + make_grouped_join_rel(root, child_rel1, child_rel2, + child_joinrel, child_sjinfo, + child_restrictlist); + /* And make paths for the child join */ populate_joinrel_with_paths(root, child_rel1, child_rel2, child_joinrel, child_sjinfo, @@ -1857,8 +1990,7 @@ compute_partition_bounds(PlannerInfo *root, RelOptInfo *rel1, Assert(nparts > 0); joinrel->boundinfo = boundinfo; joinrel->nparts = nparts; - joinrel->part_rels = - (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts); + joinrel->part_rels = palloc0_array(RelOptInfo *, nparts); } else { diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 8b04d40d36d73..139fa1f875a1b 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -2147,154 +2147,126 @@ right_merge_direction(PlannerInfo *root, PathKey *pathkey) } /* - * pathkeys_useful_for_ordering - * Count the number of pathkeys that are useful for meeting the - * query's requested output ordering. - * - * Because we the have the possibility of incremental sort, a prefix list of - * keys is potentially useful for improving the performance of the requested - * ordering. Thus we return 0, if no valuable keys are found, or the number - * of leading keys shared by the list and the requested ordering.. 
+ * count_common_leading_pathkeys_ordered + * Returns the number of leading pathkeys which both lists have in common */ static int -pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys) +count_common_leading_pathkeys_ordered(List *keys1, List *keys2) { - int n_common_pathkeys; + int ncommon; - (void) pathkeys_count_contained_in(root->query_pathkeys, pathkeys, - &n_common_pathkeys); + (void) pathkeys_count_contained_in(keys1, keys2, &ncommon); - return n_common_pathkeys; + return ncommon; } /* - * pathkeys_useful_for_grouping - * Count the number of pathkeys that are useful for grouping (instead of - * explicit sort) - * - * Group pathkeys could be reordered to benefit from the ordering. The - * ordering may not be "complete" and may require incremental sort, but that's - * fine. So we simply count prefix pathkeys with a matching group key, and - * stop once we find the first pathkey without a match. - * - * So e.g. with pathkeys (a,b,c) and group keys (a,b,e) this determines (a,b) - * pathkeys are useful for grouping, and we might do incremental sort to get - * path ordered by (a,b,e). - * - * This logic is necessary to retain paths with ordering not matching grouping - * keys directly, without the reordering. - * - * Returns the length of pathkey prefix with matching group keys. + * count_common_leading_pathkeys_unordered + * Returns the number of leading PathKeys in 'keys2' which exist in + * 'keys1'. */ static int -pathkeys_useful_for_grouping(PlannerInfo *root, List *pathkeys) +count_common_leading_pathkeys_unordered(List *keys1, List *keys2) { - ListCell *key; - int n = 0; + int ncommon = 0; - /* no special ordering requested for grouping */ - if (root->group_pathkeys == NIL) + /* No point in searching keys2 when keys1 is empty */ + if (keys1 == NIL) return 0; - /* walk the pathkeys and search for matching group key */ - foreach(key, pathkeys) + /* walk keys2 and search for matching PathKeys in keys1 */ + foreach_node(PathKey, pathkey, keys2) { - PathKey *pathkey = (PathKey *) lfirst(key); - - /* no matching group key, we're done */ - if (!list_member_ptr(root->group_pathkeys, pathkey)) + /* + * return the number of matches so far as soon as keys1 doesn't + * contain the given keys2 key. + */ + if (!list_member_ptr(keys1, pathkey)) break; - n++; + ncommon++; } - return n; + return ncommon; } /* - * pathkeys_useful_for_distinct - * Count the number of pathkeys that are useful for DISTINCT or DISTINCT - * ON clause. - * - * DISTINCT keys could be reordered to benefit from the given pathkey list. As - * with pathkeys_useful_for_grouping, we return the number of leading keys in - * the list that are shared by the distinctClause pathkeys. + * truncate_useless_pathkeys + * Shorten the given PathKey List to just the useful PathKeys. If all + * PathKeys are useful, return the input List, otherwise return a new + * List containing only the useful PathKeys. */ -static int -pathkeys_useful_for_distinct(PlannerInfo *root, List *pathkeys) +List * +truncate_useless_pathkeys(PlannerInfo *root, + RelOptInfo *rel, + List *pathkeys) { - int n_common_pathkeys; + int nuseful; + int nuseful2; + int ntotal = list_length(pathkeys); /* - * distinct_pathkeys may have become empty if all of the pathkeys were - * determined to be redundant. Return 0 in this case. + * Here we determine how many items in 'pathkeys' might be useful for + * various Path sort ordering requirements the planner has. 
Operations + such as ORDER BY require a Path's pathkeys to match the PathKeys of the + ORDER BY in the same order; however, operations such as GROUP BY and + DISTINCT are less critical, as a Unique or GroupAggregate only needs to + care that all PathKeys exist in their subpath, and don't need to care + if they're in the same order as the clause in the query. */ - if (root->distinct_pathkeys == NIL) - return 0; + nuseful = count_common_leading_pathkeys_ordered(root->sort_pathkeys, + pathkeys); - /* walk the pathkeys and search for matching DISTINCT key */ - n_common_pathkeys = 0; - foreach_node(PathKey, pathkey, pathkeys) - { - /* no matching DISTINCT key, we're done */ - if (!list_member_ptr(root->distinct_pathkeys, pathkey)) - break; + /* Short-circuit at any point we discover *all* pathkeys are useful */ + if (nuseful == ntotal) + return pathkeys; - n_common_pathkeys++; - } + nuseful2 = count_common_leading_pathkeys_ordered(root->window_pathkeys, + pathkeys); + if (nuseful2 == ntotal) + return pathkeys; - return n_common_pathkeys; -} + nuseful = Max(nuseful, nuseful2); + nuseful2 = count_common_leading_pathkeys_ordered(root->setop_pathkeys, + pathkeys); + if (nuseful2 == ntotal) + return pathkeys; -/* - * pathkeys_useful_for_setop - * Count the number of leading common pathkeys root's 'setop_pathkeys' in - * 'pathkeys'. - */ -static int -pathkeys_useful_for_setop(PlannerInfo *root, List *pathkeys) -{ - int n_common_pathkeys; + nuseful = Max(nuseful, nuseful2); - (void) pathkeys_count_contained_in(root->setop_pathkeys, pathkeys, - &n_common_pathkeys); + /* + * Check if these pathkeys are useful for GROUP BY or DISTINCT. The order + * of the pathkeys does not matter here as Unique and GroupAggregate for + * these operations can take advantage of Paths presorted by any of the + * GROUP BY/DISTINCT pathkeys. + */ + nuseful2 = count_common_leading_pathkeys_unordered(root->group_pathkeys, + pathkeys); + if (nuseful2 == ntotal) + return pathkeys; - return n_common_pathkeys; -} + nuseful = Max(nuseful, nuseful2); + nuseful2 = count_common_leading_pathkeys_unordered(root->distinct_pathkeys, + pathkeys); -/* - * truncate_useless_pathkeys - * Shorten the given pathkey list to just the useful pathkeys. - */ -List * -truncate_useless_pathkeys(PlannerInfo *root, - RelOptInfo *rel, - List *pathkeys) -{ - int nuseful; - int nuseful2; + if (nuseful2 == ntotal) + return pathkeys; + + nuseful = Max(nuseful, nuseful2); - nuseful = pathkeys_useful_for_merging(root, rel, pathkeys); - nuseful2 = pathkeys_useful_for_ordering(root, pathkeys); - if (nuseful2 > nuseful) - nuseful = nuseful2; - nuseful2 = pathkeys_useful_for_grouping(root, pathkeys); - if (nuseful2 > nuseful) - nuseful = nuseful2; - nuseful2 = pathkeys_useful_for_distinct(root, pathkeys); - if (nuseful2 > nuseful) - nuseful = nuseful2; - nuseful2 = pathkeys_useful_for_setop(root, pathkeys); - if (nuseful2 > nuseful) - nuseful = nuseful2; + /* + * Finally, check how many PathKeys might be useful for Merge Joins. This + * is a bit more expensive, so do it last and only if we've not figured + * out that all the pathkeys are useful already.
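+ * Whichever requirement finds the most useful leading keys determines + * the truncation below; e.g. if only an ORDER BY on (a, b) is relevant, + * a path sorted by (a, b, c) keeps just (a, b).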
+ */ + nuseful2 = pathkeys_useful_for_merging(root, rel, pathkeys); + nuseful = Max(nuseful, nuseful2); /* * Note: not safe to modify input list destructively, but we can avoid * copying the list if we're not actually going to change it */ - if (nuseful == 0) - return NIL; - else if (nuseful == list_length(pathkeys)) + if (nuseful == ntotal) return pathkeys; else return list_copy_head(pathkeys, nuseful); @@ -2320,9 +2292,8 @@ has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel) { if (rel->joininfo != NIL || rel->has_eclass_joins) return true; /* might be able to use pathkeys for merging */ - if (root->group_pathkeys != NIL) - return true; /* might be able to use pathkeys for grouping */ if (root->query_pathkeys != NIL) - return true; /* might be able to use them for ordering */ + return true; /* the upper planner might need them */ + return false; /* definitely useless */ } diff --git a/src/backend/optimizer/path/tidpath.c b/src/backend/optimizer/path/tidpath.c index 2bfb338b81ced..3ddbc10bbdf1a 100644 --- a/src/backend/optimizer/path/tidpath.c +++ b/src/backend/optimizer/path/tidpath.c @@ -490,9 +490,8 @@ ec_member_matches_ctid(PlannerInfo *root, RelOptInfo *rel, /* * create_tidscan_paths - * Create paths corresponding to direct TID scans of the given rel. - * - * Candidate paths are added to the rel's pathlist (using add_path). + * Create paths corresponding to direct TID scans of the given rel and add + * them to the corresponding path list via add_path or add_partial_path. */ bool create_tidscan_paths(PlannerInfo *root, RelOptInfo *rel) @@ -553,7 +552,24 @@ create_tidscan_paths(PlannerInfo *root, RelOptInfo *rel) add_path(rel, (Path *) create_tidrangescan_path(root, rel, tidrangequals, - required_outer)); + required_outer, + 0)); + + /* If appropriate, consider parallel tid range scan. */ + if (rel->consider_parallel && required_outer == NULL) + { + int parallel_workers; + + parallel_workers = compute_parallel_worker(rel, rel->pages, -1, + max_parallel_workers_per_gather); + + if (parallel_workers > 0) + add_partial_path(rel, (Path *) create_tidrangescan_path(root, + rel, + tidrangequals, + required_outer, + parallel_workers)); + } } /* diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 4d55c2ea59162..e2784c01549c0 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -31,6 +31,7 @@ #include "optimizer/placeholder.h" #include "optimizer/planmain.h" #include "optimizer/restrictinfo.h" +#include "parser/parse_agg.h" #include "rewrite/rewriteManip.h" #include "utils/lsyscache.h" @@ -631,6 +632,7 @@ remove_leftjoinrel_from_query(PlannerInfo *root, int relid, * remove_join_clause_from_rels will touch it.) */ root->simple_rel_array[relid] = NULL; + root->simple_rte_array[relid] = NULL; /* And nuke the RelOptInfo, just in case there's another access path */ pfree(rel); @@ -990,11 +992,10 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, { /* * Examine the indexes to see if we have a matching unique index. - * relation_has_unique_index_ext automatically adds any usable + * relation_has_unique_index_for automatically adds any usable * restriction clauses for the rel, so we needn't do that here. 
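+ * (For instance, with a unique index on (id, type), join clauses + * covering only "id" can still prove distinctness if a restriction + * clause pins "type" to a constant; any such clauses used are returned + * via extra_clauses.)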
*/ - if (relation_has_unique_index_ext(root, rel, clause_list, NIL, NIL, - extra_clauses)) + if (relation_has_unique_index_for(root, rel, clause_list, extra_clauses)) return true; } else if (rel->rtekind == RTE_SUBQUERY) @@ -1175,6 +1176,8 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) } else if (query->groupingSets) { + List *gsets; + /* * If we have grouping sets with expressions, we probably don't have * uniqueness and analysis would be hard. Punt. @@ -1184,15 +1187,17 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) /* * If we have no groupClause (therefore no grouping expressions), we - * might have one or many empty grouping sets. If there's just one, - * then we're returning only one row and are certainly unique. But - * otherwise, we know we're certainly not unique. + * might have one or many empty grouping sets. If there's just one, + * or if the DISTINCT clause is used on the GROUP BY, then we're + * returning only one row and are certainly unique. But otherwise, we + * know we're certainly not unique. */ - if (list_length(query->groupingSets) == 1 && - ((GroupingSet *) linitial(query->groupingSets))->kind == GROUPING_SET_EMPTY) + if (query->groupDistinct) return true; - else - return false; + + gsets = expand_grouping_sets(query->groupingSets, false, -1); + + return (list_length(gsets) == 1); } else { @@ -1425,17 +1430,14 @@ innerrel_is_unique_ext(PlannerInfo *root, * * However, in normal planning mode, caching this knowledge is totally * pointless; it won't be queried again, because we build up joinrels - * from smaller to larger. It is useful in GEQO mode, where the - * knowledge can be carried across successive planning attempts; and - * it's likely to be useful when using join-search plugins, too. Hence - * cache when join_search_private is non-NULL. (Yeah, that's a hack, - * but it seems reasonable.) + * from smaller to larger. It's only useful when using GEQO or + * another planner extension that attempts planning multiple times. * * Also, allow callers to override that heuristic and force caching; * that's useful for reduce_unique_semijoins, which calls here before * the normal join search starts. */ - if (force_cache || root->join_search_private) + if (force_cache || root->assumeReplanning) { old_context = MemoryContextSwitchTo(root->planner_cxt); innerrel->non_unique_for_rels = @@ -1686,14 +1688,14 @@ add_non_redundant_clauses(PlannerInfo *root, } /* - * A custom callback for ChangeVarNodesExtended() providing - * Self-join elimination (SJE) related functionality + * A custom callback for ChangeVarNodesExtended() providing Self-join + * elimination (SJE) related functionality * - * SJE needs to skip the RangeTblRef node - * type. During SJE's last step, remove_rel_from_joinlist() removes - * remaining RangeTblRefs with target relid. If ChangeVarNodes() replaces - * the target relid before, remove_rel_from_joinlist() fails to identify - * the nodes to delete. + * SJE needs to skip the RangeTblRef node type. During SJE's last + * step, remove_rel_from_joinlist() removes remaining RangeTblRefs + * with target relid. If ChangeVarNodes() replaces the target relid + * before, remove_rel_from_joinlist() would fail to identify the nodes + * to delete. * * SJE also needs to change the relids within RestrictInfo's. */ @@ -1772,8 +1774,8 @@ replace_relid_callback(Node *node, ChangeVarNodes_context *context) /* * For self-join elimination, changing varnos could transform * "t1.a = t2.a" into "t1.a = t1.a". 
That is always true as long - as "t1.a" is not null. We use equal() to check for such a - case, and then we replace the qual with a check for not null + as "t1.a" is not null. We use equal() to check for such a + case, and then we replace the qual with a check for not null + (NullTest). */ if (leftOp != NULL && equal(leftOp, rightOp)) @@ -1979,10 +1981,12 @@ remove_self_join_rel(PlannerInfo *root, PlanRowMark *kmark, PlanRowMark *rmark, * remove_join_clause_from_rels will touch it.) */ root->simple_rel_array[toRemove->relid] = NULL; + root->simple_rte_array[toRemove->relid] = NULL; /* And nuke the RelOptInfo, just in case there's another access path. */ pfree(toRemove); + /* * Now repeat construction of attr_needed bits coming from all other * sources. @@ -2142,21 +2146,21 @@ remove_self_joins_one_group(PlannerInfo *root, Relids relids) while ((r = bms_next_member(relids, r)) > 0) { - RelOptInfo *inner = root->simple_rel_array[r]; + RelOptInfo *rrel = root->simple_rel_array[r]; k = r; while ((k = bms_next_member(relids, k)) > 0) { Relids joinrelids = NULL; - RelOptInfo *outer = root->simple_rel_array[k]; + RelOptInfo *krel = root->simple_rel_array[k]; List *restrictlist; List *selfjoinquals; List *otherjoinquals; ListCell *lc; bool jinfo_check = true; - PlanRowMark *omark = NULL; - PlanRowMark *imark = NULL; + PlanRowMark *kmark = NULL; + PlanRowMark *rmark = NULL; List *uclauses = NIL; /* A sanity check: the relations have the same Oid. */ @@ -2194,21 +2198,21 @@ remove_self_joins_one_group(PlannerInfo *root, Relids relids) { PlanRowMark *rowMark = (PlanRowMark *) lfirst(lc); - if (rowMark->rti == k) + if (rowMark->rti == r) { - Assert(imark == NULL); - imark = rowMark; + Assert(rmark == NULL); + rmark = rowMark; } - else if (rowMark->rti == r) + else if (rowMark->rti == k) { - Assert(omark == NULL); - omark = rowMark; + Assert(kmark == NULL); + kmark = rowMark; } - if (omark && imark) + if (kmark && rmark) break; } - if (omark && imark && omark->markType != imark->markType) + if (kmark && rmark && kmark->markType != rmark->markType) continue; /* @@ -2229,8 +2233,8 @@ remove_self_joins_one_group(PlannerInfo *root, Relids relids) * build_joinrel_restrictlist() routine. */ restrictlist = generate_join_implied_equalities(root, joinrelids, - inner->relids, - outer, NULL); + rrel->relids, + krel, NULL); if (restrictlist == NIL) continue; /* @@ -2240,7 +2244,7 @@ ... * otherjoinquals. */ split_selfjoin_quals(root, restrictlist, &selfjoinquals, - &otherjoinquals, inner->relid, outer->relid); + &otherjoinquals, rrel->relid, krel->relid); Assert(list_length(restrictlist) == (list_length(selfjoinquals) + list_length(otherjoinquals))); /* @@ -2251,17 +2255,17 @@ ... * degenerate case works only if both sides have the same clause. * So doesn't matter which side to add. */ - selfjoinquals = list_concat(selfjoinquals, outer->baserestrictinfo); + selfjoinquals = list_concat(selfjoinquals, krel->baserestrictinfo); /* - * Determine if the inner table can duplicate outer rows. We must - * bypass the unique rel cache here since we're possibly using a - * subset of join quals. We can use 'force_cache' == true when all - * join quals are self-join quals. Otherwise, we could end up - * putting false negatives in the cache. + * Determine if krel can duplicate rrel's rows. We must bypass + * the unique rel cache here since we're possibly using a subset + * of join quals.
We can use 'force_cache' == true when all join + * quals are self-join quals. Otherwise, we could end up putting + * false negatives in the cache. */ - if (!innerrel_is_unique_ext(root, joinrelids, inner->relids, - outer, JOIN_INNER, selfjoinquals, + if (!innerrel_is_unique_ext(root, joinrelids, rrel->relids, + krel, JOIN_INNER, selfjoinquals, list_length(otherjoinquals) == 0, &uclauses)) continue; @@ -2277,14 +2281,14 @@ remove_self_joins_one_group(PlannerInfo *root, Relids relids) * expressions, or we won't match the same row on each side of the * join. */ - if (!match_unique_clauses(root, inner, uclauses, outer->relid)) + if (!match_unique_clauses(root, rrel, uclauses, krel->relid)) continue; /* - * We can remove either relation, so remove the inner one in order - * to simplify this loop. + * Remove rrel RelOptInfo from the planner structures and the + * corresponding row mark. */ - remove_self_join_rel(root, omark, imark, outer, inner, restrictlist); + remove_self_join_rel(root, kmark, rmark, krel, rrel, restrictlist); result = bms_add_member(result, r); @@ -2359,8 +2363,7 @@ remove_self_joins_recurse(PlannerInfo *root, List *joinlist, Relids toRemove) * In order to find relations with the same oid we first build an array of * candidates and then sort it by oid. */ - candidates = (SelfJoinCandidate *) palloc(sizeof(SelfJoinCandidate) * - numRels); + candidates = palloc_array(SelfJoinCandidate, numRels); i = -1; j = 0; while ((i = bms_next_member(relids, i)) >= 0) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 4ad30b7627e6e..4997cd2722d5a 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -95,19 +95,17 @@ static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path int flags); static Memoize *create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags); -static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path, - int flags); static Gather *create_gather_plan(PlannerInfo *root, GatherPath *best_path); static Plan *create_projection_plan(PlannerInfo *root, ProjectionPath *best_path, int flags); -static Plan *inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe); +static Plan *inject_projection_plan(Plan *subplan, List *tlist, + ParallelSafe parallel_safe); static Sort *create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags); static IncrementalSort *create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path, int flags); static Group *create_group_plan(PlannerInfo *root, GroupPath *best_path); -static Unique *create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, - int flags); +static Unique *create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags); static Agg *create_agg_plan(PlannerInfo *root, AggPath *best_path); static Plan *create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path); static Result *create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path); @@ -228,7 +226,7 @@ static RecursiveUnion *make_recursive_union(List *tlist, Plan *righttree, int wtParam, List *distinctList, - long numGroups); + Cardinality numGroups); static BitmapAnd *make_bitmap_and(List *bitmapplans); static BitmapOr *make_bitmap_or(List *bitmapplans); static NestLoop *make_nestloop(List *tlist, @@ -284,7 +282,10 @@ static Material *make_material(Plan *lefttree); static Memoize *make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List
*param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids); + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, + Cardinality est_unique_keys, + double est_hit_ratio); static WindowAgg *make_windowagg(List *tlist, WindowClause *wc, int partNumCols, AttrNumber *partColIdx, Oid *partOperators, Oid *partCollations, int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators, Oid *ordCollations, @@ -293,21 +294,24 @@ static WindowAgg *make_windowagg(List *tlist, WindowClause *wc, static Group *make_group(List *tlist, List *qual, int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators, Oid *grpCollations, Plan *lefttree); -static Unique *make_unique_from_sortclauses(Plan *lefttree, List *distinctList); static Unique *make_unique_from_pathkeys(Plan *lefttree, - List *pathkeys, int numCols); + List *pathkeys, int numCols, + Relids relids); static Gather *make_gather(List *qptlist, List *qpqual, - int nworkers, int rescan_param, bool single_copy, Plan *subplan); + int nworkers, int rescan_param, bool single_copy, + Plan *subplan, bool process_temp_tables); static SetOp *make_setop(SetOpCmd cmd, SetOpStrategy strategy, List *tlist, Plan *lefttree, Plan *righttree, - List *groupList, long numGroups); + List *groupList, Cardinality numGroups); static LockRows *make_lockrows(Plan *lefttree, List *rowMarks, int epqParam); -static Result *make_result(List *tlist, Node *resconstantqual, Plan *subplan); +static Result *make_gating_result(List *tlist, Node *resconstantqual, + Plan *subplan); +static Result *make_one_row_result(List *tlist, Node *resconstantqual, + RelOptInfo *rel); static ProjectSet *make_project_set(List *tlist, Plan *subplan); static ModifyTable *make_modifytable(PlannerInfo *root, Plan *subplan, CmdType operation, bool canSetTag, Index nominalRelation, Index rootRelation, - bool partColsUpdated, List *resultRelations, List *updateColnosLists, List *withCheckOptionLists, List *returningLists, @@ -467,19 +471,9 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags) flags); break; case T_Unique: - if (IsA(best_path, UpperUniquePath)) - { - plan = (Plan *) create_upper_unique_plan(root, - (UpperUniquePath *) best_path, - flags); - } - else - { - Assert(IsA(best_path, UniquePath)); - plan = create_unique_plan(root, - (UniquePath *) best_path, - flags); - } + plan = (Plan *) create_unique_plan(root, + (UniquePath *) best_path, + flags); break; case T_Gather: plan = (Plan *) create_gather_plan(root, @@ -1022,36 +1016,36 @@ static Plan * create_gating_plan(PlannerInfo *root, Path *path, Plan *plan, List *gating_quals) { - Plan *gplan; - Plan *splan; + Result *gplan; Assert(gating_quals); /* - * We might have a trivial Result plan already. Stacking one Result atop - * another is silly, so if that applies, just discard the input plan. + * Since we need a Result node anyway, always return the path's requested + * tlist; that's never a wrong choice, even if the parent node didn't ask + * for CP_EXACT_TLIST. + */ + gplan = make_gating_result(build_path_tlist(root, path), + (Node *) gating_quals, plan); + + /* + * We might have had a trivial Result plan already. Stacking one Result + * atop another is silly, so if that applies, just discard the input plan. * (We're assuming its targetlist is uninteresting; it should be either - * the same as the result of build_path_tlist, or a simplified version.) + * the same as the result of build_path_tlist, or a simplified version. 
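+ * For example, a one-row Result built for a hypothetical "SELECT 1
+ * WHERE false" carries nothing in its tlist that the gating Result's
+ * own tlist does not already provide.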
+ * However, we preserve the set of relids that it purports to scan and + * attribute that to our replacement Result instead, and likewise for the + * result_type.) */ - splan = plan; if (IsA(plan, Result)) { Result *rplan = (Result *) plan; - if (rplan->plan.lefttree == NULL && - rplan->resconstantqual == NULL) - splan = NULL; + gplan->plan.lefttree = NULL; + gplan->relids = rplan->relids; + gplan->result_type = rplan->result_type; } - /* - * Since we need a Result node anyway, always return the path's requested - * tlist; that's never a wrong choice, even if the parent node didn't ask - * for CP_EXACT_TLIST. - */ - gplan = (Plan *) make_result(build_path_tlist(root, path), - (Node *) gating_quals, - splan); - /* * Notice that we don't change cost or size estimates when doing gating. * The costs of qual eval were already included in the subplan's cost. @@ -1064,12 +1058,12 @@ create_gating_plan(PlannerInfo *root, Path *path, Plan *plan, * in most cases we have only a very bad idea of the probability of the * gating qual being true. */ - copy_plan_costsize(gplan, plan); + copy_plan_costsize(&gplan->plan, plan); /* Gating quals could be unsafe, so better use the Path's safety flag */ - gplan->parallel_safe = path->parallel_safe; + gplan->plan.parallel_safe = path->parallel_safe; - return gplan; + return &gplan->plan; } /* @@ -1245,10 +1239,10 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) /* Generate a Result plan with constant-FALSE gating qual */ Plan *plan; - plan = (Plan *) make_result(tlist, - (Node *) list_make1(makeBoolConst(false, - false)), - NULL); + plan = (Plan *) make_one_row_result(tlist, + (Node *) list_make1(makeBoolConst(false, + false)), + best_path->path.parent); copy_generic_path_info(plan, (Path *) best_path); @@ -1318,6 +1312,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) Oid *sortOperators; Oid *collations; bool *nullsFirst; + int presorted_keys; /* * Compute sort column info, and adjust subplan's tlist as needed. @@ -1353,14 +1348,38 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) numsortkeys * sizeof(bool)) == 0); /* Now, insert a Sort node if subplan isn't sufficiently ordered */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - Sort *sort = make_sort(subplan, numsortkeys, + Plan *sort_plan; + + /* + * We choose to use incremental sort if it is enabled and + * there are presorted keys; otherwise we use full sort. 
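+ * For example (with hypothetical sort keys), if the required
+ * ordering is (a, b) and the subpath is already sorted by (a),
+ * then presorted_keys is 1 and the incremental sort need only
+ * order rows on b within each group of equal a values.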
+ */ + if (enable_incremental_sort && presorted_keys > 0) + { + sort_plan = (Plan *) + make_incrementalsort(subplan, numsortkeys, presorted_keys, sortColIdx, sortOperators, collations, nullsFirst); - label_sort_with_costsize(root, sort, best_path->limit_tuples); - subplan = (Plan *) sort; + label_incrementalsort_with_costsize(root, + (IncrementalSort *) sort_plan, + pathkeys, + best_path->limit_tuples); + } + else + { + sort_plan = (Plan *) make_sort(subplan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + + label_sort_with_costsize(root, (Sort *) sort_plan, + best_path->limit_tuples); + } + + subplan = sort_plan; } } @@ -1491,6 +1510,7 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, Oid *sortOperators; Oid *collations; bool *nullsFirst; + int presorted_keys; /* Build the child plan */ /* Must insist that all children return the same tlist */ @@ -1525,14 +1545,38 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, numsortkeys * sizeof(bool)) == 0); /* Now, insert a Sort node if subplan isn't sufficiently ordered */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - Sort *sort = make_sort(subplan, numsortkeys, + Plan *sort_plan; + + /* + * We choose to use incremental sort if it is enabled and there + * are presorted keys; otherwise we use full sort. + */ + if (enable_incremental_sort && presorted_keys > 0) + { + sort_plan = (Plan *) + make_incrementalsort(subplan, numsortkeys, presorted_keys, sortColIdx, sortOperators, collations, nullsFirst); - label_sort_with_costsize(root, sort, best_path->limit_tuples); - subplan = (Plan *) sort; + label_incrementalsort_with_costsize(root, + (IncrementalSort *) sort_plan, + pathkeys, + best_path->limit_tuples); + } + else + { + sort_plan = (Plan *) make_sort(subplan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + + label_sort_with_costsize(root, (Sort *) sort_plan, + best_path->limit_tuples); + } + + subplan = sort_plan; } subplans = lappend(subplans, subplan); @@ -1596,7 +1640,7 @@ create_group_result_plan(PlannerInfo *root, GroupResultPath *best_path) /* best_path->quals is just bare clauses */ quals = order_qual_clauses(root, best_path->quals); - plan = make_result(tlist, (Node *) quals, NULL); + plan = make_one_row_result(tlist, (Node *) quals, best_path->path.parent); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -1703,214 +1747,14 @@ create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags) plan = make_memoize(subplan, operators, collations, param_exprs, best_path->singlerow, best_path->binary_mode, - best_path->est_entries, keyparamids); + best_path->est_entries, keyparamids, best_path->est_calls, + best_path->est_unique_keys, best_path->est_hit_ratio); copy_generic_path_info(&plan->plan, (Path *) best_path); return plan; } -/* - * create_unique_plan - * Create a Unique plan for 'best_path' and (recursively) plans - * for its subpaths. - * - * Returns a Plan node. 
- */ -static Plan * -create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags) -{ - Plan *plan; - Plan *subplan; - List *in_operators; - List *uniq_exprs; - List *newtlist; - int nextresno; - bool newitems; - int numGroupCols; - AttrNumber *groupColIdx; - Oid *groupCollations; - int groupColPos; - ListCell *l; - - /* Unique doesn't project, so tlist requirements pass through */ - subplan = create_plan_recurse(root, best_path->subpath, flags); - - /* Done if we don't need to do any actual unique-ifying */ - if (best_path->umethod == UNIQUE_PATH_NOOP) - return subplan; - - /* - * As constructed, the subplan has a "flat" tlist containing just the Vars - * needed here and at upper levels. The values we are supposed to - * unique-ify may be expressions in these variables. We have to add any - * such expressions to the subplan's tlist. - * - * The subplan may have a "physical" tlist if it is a simple scan plan. If - * we're going to sort, this should be reduced to the regular tlist, so - * that we don't sort more data than we need to. For hashing, the tlist - * should be left as-is if we don't need to add any expressions; but if we - * do have to add expressions, then a projection step will be needed at - * runtime anyway, so we may as well remove unneeded items. Therefore - * newtlist starts from build_path_tlist() not just a copy of the - * subplan's tlist; and we don't install it into the subplan unless we are - * sorting or stuff has to be added. - */ - in_operators = best_path->in_operators; - uniq_exprs = best_path->uniq_exprs; - - /* initialize modified subplan tlist as just the "required" vars */ - newtlist = build_path_tlist(root, &best_path->path); - nextresno = list_length(newtlist) + 1; - newitems = false; - - foreach(l, uniq_exprs) - { - Expr *uniqexpr = lfirst(l); - TargetEntry *tle; - - tle = tlist_member(uniqexpr, newtlist); - if (!tle) - { - tle = makeTargetEntry((Expr *) uniqexpr, - nextresno, - NULL, - false); - newtlist = lappend(newtlist, tle); - nextresno++; - newitems = true; - } - } - - /* Use change_plan_targetlist in case we need to insert a Result node */ - if (newitems || best_path->umethod == UNIQUE_PATH_SORT) - subplan = change_plan_targetlist(subplan, newtlist, - best_path->path.parallel_safe); - - /* - * Build control information showing which subplan output columns are to - * be examined by the grouping step. Unfortunately we can't merge this - * with the previous loop, since we didn't then know which version of the - * subplan tlist we'd end up using. - */ - newtlist = subplan->targetlist; - numGroupCols = list_length(uniq_exprs); - groupColIdx = (AttrNumber *) palloc(numGroupCols * sizeof(AttrNumber)); - groupCollations = (Oid *) palloc(numGroupCols * sizeof(Oid)); - - groupColPos = 0; - foreach(l, uniq_exprs) - { - Expr *uniqexpr = lfirst(l); - TargetEntry *tle; - - tle = tlist_member(uniqexpr, newtlist); - if (!tle) /* shouldn't happen */ - elog(ERROR, "failed to find unique expression in subplan tlist"); - groupColIdx[groupColPos] = tle->resno; - groupCollations[groupColPos] = exprCollation((Node *) tle->expr); - groupColPos++; - } - - if (best_path->umethod == UNIQUE_PATH_HASH) - { - Oid *groupOperators; - - /* - * Get the hashable equality operators for the Agg node to use. - * Normally these are the same as the IN clause operators, but if - * those are cross-type operators then the equality operators are the - * ones for the IN clause operators' RHS datatype. 
- */ - groupOperators = (Oid *) palloc(numGroupCols * sizeof(Oid)); - groupColPos = 0; - foreach(l, in_operators) - { - Oid in_oper = lfirst_oid(l); - Oid eq_oper; - - if (!get_compatible_hash_operators(in_oper, NULL, &eq_oper)) - elog(ERROR, "could not find compatible hash operator for operator %u", - in_oper); - groupOperators[groupColPos++] = eq_oper; - } - - /* - * Since the Agg node is going to project anyway, we can give it the - * minimum output tlist, without any stuff we might have added to the - * subplan tlist. - */ - plan = (Plan *) make_agg(build_path_tlist(root, &best_path->path), - NIL, - AGG_HASHED, - AGGSPLIT_SIMPLE, - numGroupCols, - groupColIdx, - groupOperators, - groupCollations, - NIL, - NIL, - best_path->path.rows, - 0, - subplan); - } - else - { - List *sortList = NIL; - Sort *sort; - - /* Create an ORDER BY list to sort the input compatibly */ - groupColPos = 0; - foreach(l, in_operators) - { - Oid in_oper = lfirst_oid(l); - Oid sortop; - Oid eqop; - TargetEntry *tle; - SortGroupClause *sortcl; - - sortop = get_ordering_op_for_equality_op(in_oper, false); - if (!OidIsValid(sortop)) /* shouldn't happen */ - elog(ERROR, "could not find ordering operator for equality operator %u", - in_oper); - - /* - * The Unique node will need equality operators. Normally these - * are the same as the IN clause operators, but if those are - * cross-type operators then the equality operators are the ones - * for the IN clause operators' RHS datatype. - */ - eqop = get_equality_op_for_ordering_op(sortop, NULL); - if (!OidIsValid(eqop)) /* shouldn't happen */ - elog(ERROR, "could not find equality operator for ordering operator %u", - sortop); - - tle = get_tle_by_resno(subplan->targetlist, - groupColIdx[groupColPos]); - Assert(tle != NULL); - - sortcl = makeNode(SortGroupClause); - sortcl->tleSortGroupRef = assignSortGroupRef(tle, - subplan->targetlist); - sortcl->eqop = eqop; - sortcl->sortop = sortop; - sortcl->reverse_sort = false; - sortcl->nulls_first = false; - sortcl->hashable = false; /* no need to make this accurate */ - sortList = lappend(sortList, sortcl); - groupColPos++; - } - sort = make_sort_from_sortclauses(sortList, subplan); - label_sort_with_costsize(root, sort, -1.0); - plan = (Plan *) make_unique_from_sortclauses((Plan *) sort, sortList); - } - - /* Copy cost data from Path to Plan */ - copy_generic_path_info(plan, &best_path->path); - - return plan; -} - /* * create_gather_plan * @@ -1934,12 +1778,14 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) tlist = build_path_tlist(root, &best_path->path); + Assert(best_path->subpath->parallel_safe > PARALLEL_UNSAFE); gather_plan = make_gather(tlist, NIL, best_path->num_workers, assign_special_exec_param(root), best_path->single_copy, - subplan); + subplan, + best_path->subpath->parallel_safe == NEEDS_TEMP_FLUSH); copy_generic_path_info(&gather_plan->plan, &best_path->path); @@ -2093,8 +1939,7 @@ create_projection_plan(PlannerInfo *root, ProjectionPath *best_path, int flags) } else { - /* We need a Result node */ - plan = (Plan *) make_result(tlist, NULL, subplan); + plan = (Plan *) make_gating_result(tlist, NULL, subplan); copy_generic_path_info(plan, (Path *) best_path); } @@ -2114,11 +1959,11 @@ create_projection_plan(PlannerInfo *root, ProjectionPath *best_path, int flags) * to apply (since the tlist might be unsafe even if the child plan is safe). 
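* For instance (a hypothetical case), if the injected tlist computes a
* parallel-restricted function over the child's columns, the caller
* must pass a value marking the new plan parallel-unsafe even though
* the child scan itself is parallel-safe.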
*/ static Plan * -inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe) +inject_projection_plan(Plan *subplan, List *tlist, ParallelSafe parallel_safe) { Plan *plan; - plan = (Plan *) make_result(tlist, NULL, subplan); + plan = (Plan *) make_gating_result(tlist, NULL, subplan); /* * In principle, we should charge tlist eval cost plus cpu_per_tuple per @@ -2146,7 +1991,7 @@ inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe) * flag of the FDW's own Path node. */ Plan * -change_plan_targetlist(Plan *subplan, List *tlist, bool tlist_parallel_safe) +change_plan_targetlist(Plan *subplan, List *tlist, ParallelSafe tlist_parallel_safe) { /* * If the top plan node can't do projections and its existing target list @@ -2162,7 +2007,7 @@ change_plan_targetlist(Plan *subplan, List *tlist, bool tlist_parallel_safe) { /* Else we can just replace the plan node's tlist */ subplan->targetlist = tlist; - subplan->parallel_safe &= tlist_parallel_safe; + subplan->parallel_safe = tlist_parallel_safe; } return subplan; } @@ -2268,13 +2113,13 @@ create_group_plan(PlannerInfo *root, GroupPath *best_path) } /* - * create_upper_unique_plan + * create_unique_plan * * Create a Unique plan for 'best_path' and (recursively) plans * for its subpaths. */ static Unique * -create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flags) +create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags) { Unique *plan; Plan *subplan; @@ -2286,9 +2131,17 @@ create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flag subplan = create_plan_recurse(root, best_path->subpath, flags | CP_LABEL_TLIST); + /* + * make_unique_from_pathkeys calls find_ec_member_matching_expr, which + * will ignore any child EC members that don't belong to the given relids. + * Thus, if this unique path is based on a child relation, we must pass + * its relids. + */ plan = make_unique_from_pathkeys(subplan, best_path->path.pathkeys, - best_path->numkeys); + best_path->numkeys, + IS_OTHER_REL(best_path->path.parent) ? + best_path->path.parent->relids : NULL); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -2357,7 +2210,7 @@ remap_groupColIdx(PlannerInfo *root, List *groupClause) Assert(grouping_map); - new_grpColIdx = palloc0(sizeof(AttrNumber) * list_length(groupClause)); + new_grpColIdx = palloc0_array(AttrNumber, list_length(groupClause)); i = 0; foreach(lc, groupClause) @@ -2588,7 +2441,9 @@ create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path) /* Generate the output plan --- basically just a Result */ tlist = build_path_tlist(root, &best_path->path); - plan = make_result(tlist, (Node *) best_path->quals, NULL); + plan = make_one_row_result(tlist, (Node *) best_path->quals, + best_path->path.parent); + plan->result_type = RESULT_TYPE_MINMAX; copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -2644,9 +2499,9 @@ create_windowagg_plan(PlannerInfo *root, WindowAggPath *best_path) * Convert SortGroupClause lists into arrays of attr indexes and equality * operators, as wanted by executor. 
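* For example, a hypothetical "PARTITION BY a, b" fills partColIdx
* with the resnos of a and b, with the matching equality-operator and
* collation OIDs stored at the same array positions.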
*/ - partColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numPart); - partOperators = (Oid *) palloc(sizeof(Oid) * numPart); - partCollations = (Oid *) palloc(sizeof(Oid) * numPart); + partColIdx = palloc_array(AttrNumber, numPart); + partOperators = palloc_array(Oid, numPart); + partCollations = palloc_array(Oid, numPart); partNumCols = 0; foreach(lc, wc->partitionClause) @@ -2661,9 +2516,9 @@ create_windowagg_plan(PlannerInfo *root, WindowAggPath *best_path) partNumCols++; } - ordColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numOrder); - ordOperators = (Oid *) palloc(sizeof(Oid) * numOrder); - ordCollations = (Oid *) palloc(sizeof(Oid) * numOrder); + ordColIdx = palloc_array(AttrNumber, numOrder); + ordOperators = palloc_array(Oid, numOrder); + ordCollations = palloc_array(Oid, numOrder); ordNumCols = 0; foreach(lc, wc->orderClause) @@ -2712,7 +2567,6 @@ create_setop_plan(PlannerInfo *root, SetOpPath *best_path, int flags) List *tlist = build_path_tlist(root, &best_path->path); Plan *leftplan; Plan *rightplan; - long numGroups; /* * SetOp doesn't project, so tlist requirements pass through; moreover we @@ -2723,16 +2577,13 @@ create_setop_plan(PlannerInfo *root, SetOpPath *best_path, int flags) rightplan = create_plan_recurse(root, best_path->rightpath, flags | CP_LABEL_TLIST); - /* Convert numGroups to long int --- but 'ware overflow! */ - numGroups = clamp_cardinality_to_long(best_path->numGroups); - plan = make_setop(best_path->cmd, best_path->strategy, tlist, leftplan, rightplan, best_path->groupList, - numGroups); + best_path->numGroups); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -2752,7 +2603,6 @@ create_recursiveunion_plan(PlannerInfo *root, RecursiveUnionPath *best_path) Plan *leftplan; Plan *rightplan; List *tlist; - long numGroups; /* Need both children to produce same tlist, so force it */ leftplan = create_plan_recurse(root, best_path->leftpath, CP_EXACT_TLIST); @@ -2760,15 +2610,12 @@ create_recursiveunion_plan(PlannerInfo *root, RecursiveUnionPath *best_path) tlist = build_path_tlist(root, &best_path->path); - /* Convert numGroups to long int --- but 'ware overflow! 
*/ - numGroups = clamp_cardinality_to_long(best_path->numGroups); - plan = make_recursive_union(tlist, leftplan, rightplan, best_path->wtParam, best_path->distinctList, - numGroups); + best_path->numGroups); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -2823,7 +2670,6 @@ create_modifytable_plan(PlannerInfo *root, ModifyTablePath *best_path) best_path->canSetTag, best_path->nominalRelation, best_path->rootRelation, - best_path->partColsUpdated, best_path->resultRelations, best_path->updateColnosLists, best_path->withCheckOptionLists, @@ -4039,7 +3885,8 @@ create_resultscan_plan(PlannerInfo *root, Path *best_path, replace_nestloop_params(root, (Node *) scan_clauses); } - scan_plan = make_result(tlist, (Node *) scan_clauses, NULL); + scan_plan = make_one_row_result(tlist, (Node *) scan_clauses, + best_path->parent); copy_generic_path_info(&scan_plan->plan, best_path); @@ -4344,13 +4191,17 @@ create_nestloop_plan(PlannerInfo *root, NestLoop *join_plan; Plan *outer_plan; Plan *inner_plan; + Relids outerrelids; List *tlist = build_path_tlist(root, &best_path->jpath.path); List *joinrestrictclauses = best_path->jpath.joinrestrictinfo; List *joinclauses; List *otherclauses; - Relids outerrelids; List *nestParams; + List *outer_tlist; + ParallelSafe outer_parallel_safe; + bool needs_temp_flush = false; Relids saveOuterRels = root->curOuterRels; + ListCell *lc; /* * If the inner path is parameterized by the topmost parent of the outer @@ -4372,8 +4223,8 @@ create_nestloop_plan(PlannerInfo *root, outer_plan = create_plan_recurse(root, best_path->jpath.outerjoinpath, 0); /* For a nestloop, include outer relids in curOuterRels for inner side */ - root->curOuterRels = bms_union(root->curOuterRels, - best_path->jpath.outerjoinpath->parent->relids); + outerrelids = best_path->jpath.outerjoinpath->parent->relids; + root->curOuterRels = bms_union(root->curOuterRels, outerrelids); inner_plan = create_plan_recurse(root, best_path->jpath.innerjoinpath, 0); @@ -4412,9 +4263,71 @@ create_nestloop_plan(PlannerInfo *root, * Identify any nestloop parameters that should be supplied by this join * node, and remove them from root->curOuterParams. */ - outerrelids = best_path->jpath.outerjoinpath->parent->relids; - nestParams = identify_current_nestloop_params(root, outerrelids); + nestParams = identify_current_nestloop_params(root, + outerrelids, + PATH_REQ_OUTER((Path *) best_path)); + + /* + * While nestloop parameters that are Vars had better be available from + * the outer_plan already, there are edge cases where nestloop parameters + * that are PHVs won't be. In such cases we must add them to the + * outer_plan's tlist, since the executor's NestLoopParam machinery + * requires the params to be simple outer-Var references to that tlist. + * (This is cheating a little bit, because the outer path's required-outer + * relids might not be enough to allow evaluating such a PHV. But in + * practice, if we could have evaluated the PHV at the nestloop node, we + * can do so in the outer plan too.) 
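+ * (A hypothetical illustration: a PHV wrapping a constant emitted by
+ * a pulled-up subquery, say "TRUE AS flag", can be chosen as a
+ * nestloop param while being absent from the outer side's flat
+ * tlist; the loop below then adds it.)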
+ */ + outer_tlist = outer_plan->targetlist; + outer_parallel_safe = outer_plan->parallel_safe; + foreach(lc, nestParams) + { + NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); + PlaceHolderVar *phv; + TargetEntry *tle; + + if (IsA(nlp->paramval, Var)) + continue; /* nothing to do for simple Vars */ + /* Otherwise it must be a PHV */ + phv = castNode(PlaceHolderVar, nlp->paramval); + if (tlist_member((Expr *) phv, outer_tlist)) + continue; /* already available */ + + /* + * It's possible that nestloop parameter PHVs selected to evaluate + * here contain references to surviving root->curOuterParams items + * (that is, they reference values that will be supplied by some + * higher-level nestloop). Those need to be converted to Params now. + * Note: it's safe to do this after the tlist_member() check, because + * equal() won't pay attention to phv->phexpr. + */ + phv->phexpr = (Expr *) replace_nestloop_params(root, + (Node *) phv->phexpr); + + /* Make a shallow copy of outer_tlist, if we didn't already */ + if (outer_tlist == outer_plan->targetlist) + outer_tlist = list_copy(outer_tlist); + /* ... and add the needed expression */ + tle = makeTargetEntry((Expr *) copyObject(phv), + list_length(outer_tlist) + 1, + NULL, + true); + outer_tlist = lappend(outer_tlist, tle); + /* ... and track whether tlist is (still) parallel-safe */ + if (outer_parallel_safe > PARALLEL_UNSAFE) + { + if (!is_parallel_safe(root, (Node *) phv, &needs_temp_flush)) + outer_parallel_safe = PARALLEL_UNSAFE; + else if (needs_temp_flush) + outer_parallel_safe = NEEDS_TEMP_FLUSH; + } + } + if (outer_tlist != outer_plan->targetlist) + outer_plan = change_plan_targetlist(outer_plan, outer_tlist, + outer_parallel_safe); + + /* And finally, we can build the join plan node */ join_plan = make_nestloop(tlist, joinclauses, otherclauses, @@ -5933,7 +5846,7 @@ make_recursive_union(List *tlist, Plan *righttree, int wtParam, List *distinctList, - long numGroups) + Cardinality numGroups) { RecursiveUnion *node = makeNode(RecursiveUnion); Plan *plan = &node->plan; @@ -5958,9 +5871,9 @@ make_recursive_union(List *tlist, Oid *dupCollations; ListCell *slitem; - dupColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); - dupOperators = (Oid *) palloc(sizeof(Oid) * numCols); - dupCollations = (Oid *) palloc(sizeof(Oid) * numCols); + dupColIdx = palloc_array(AttrNumber, numCols); + dupOperators = palloc_array(Oid, numCols); + dupCollations = palloc_array(Oid, numCols); foreach(slitem, distinctList) { @@ -6639,7 +6552,9 @@ materialize_finished_plan(Plan *subplan) static Memoize * make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List *param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids) + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, Cardinality est_unique_keys, + double est_hit_ratio) { Memoize *node = makeNode(Memoize); Plan *plan = &node->plan; @@ -6657,6 +6572,9 @@ make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, node->binary_mode = binary_mode; node->est_entries = est_entries; node->keyparamids = keyparamids; + node->est_calls = est_calls; + node->est_unique_keys = est_unique_keys; + node->est_hit_ratio = est_hit_ratio; return node; } @@ -6665,15 +6583,11 @@ Agg * make_agg(List *tlist, List *qual, AggStrategy aggstrategy, AggSplit aggsplit, int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators, Oid *grpCollations, - List *groupingSets, List *chain, double dNumGroups, + List *groupingSets, List *chain, Cardinality numGroups, Size 
transitionSpace, Plan *lefttree) { Agg *node = makeNode(Agg); Plan *plan = &node->plan; - long numGroups; - - /* Reduce to long, but 'ware overflow! */ - numGroups = clamp_cardinality_to_long(dNumGroups); node->aggstrategy = aggstrategy; node->aggsplit = aggsplit; @@ -6761,61 +6675,14 @@ make_group(List *tlist, } /* - * distinctList is a list of SortGroupClauses, identifying the targetlist items - * that should be considered by the Unique filter. The input path must - * already be sorted accordingly. - */ -static Unique * -make_unique_from_sortclauses(Plan *lefttree, List *distinctList) -{ - Unique *node = makeNode(Unique); - Plan *plan = &node->plan; - int numCols = list_length(distinctList); - int keyno = 0; - AttrNumber *uniqColIdx; - Oid *uniqOperators; - Oid *uniqCollations; - ListCell *slitem; - - plan->targetlist = lefttree->targetlist; - plan->qual = NIL; - plan->lefttree = lefttree; - plan->righttree = NULL; - - /* - * convert SortGroupClause list into arrays of attr indexes and equality - * operators, as wanted by executor - */ - Assert(numCols > 0); - uniqColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); - uniqOperators = (Oid *) palloc(sizeof(Oid) * numCols); - uniqCollations = (Oid *) palloc(sizeof(Oid) * numCols); - - foreach(slitem, distinctList) - { - SortGroupClause *sortcl = (SortGroupClause *) lfirst(slitem); - TargetEntry *tle = get_sortgroupclause_tle(sortcl, plan->targetlist); - - uniqColIdx[keyno] = tle->resno; - uniqOperators[keyno] = sortcl->eqop; - uniqCollations[keyno] = exprCollation((Node *) tle->expr); - Assert(OidIsValid(uniqOperators[keyno])); - keyno++; - } - - node->numCols = numCols; - node->uniqColIdx = uniqColIdx; - node->uniqOperators = uniqOperators; - node->uniqCollations = uniqCollations; - - return node; -} - -/* - * as above, but use pathkeys to identify the sort columns and semantics + * pathkeys is a list of PathKeys, identifying the sort columns and semantics. + * The input plan must already be sorted accordingly. + * + * relids identifies the child relation being unique-ified, if any. */ static Unique * -make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols) +make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols, + Relids relids) { Unique *node = makeNode(Unique); Plan *plan = &node->plan; @@ -6836,9 +6703,9 @@ make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols) * prepare_sort_from_pathkeys ... maybe unify sometime? 
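* For example, with pathkeys (a, b, c) and numCols == 2 (hypothetical
* values), only the two leading keys are compared for uniqueness; the
* third merely reflects extra ordering the input happens to have.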
*/ Assert(numCols >= 0 && numCols <= list_length(pathkeys)); - uniqColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); - uniqOperators = (Oid *) palloc(sizeof(Oid) * numCols); - uniqCollations = (Oid *) palloc(sizeof(Oid) * numCols); + uniqColIdx = palloc_array(AttrNumber, numCols); + uniqOperators = palloc_array(Oid, numCols); + uniqCollations = palloc_array(Oid, numCols); foreach(lc, pathkeys) { @@ -6878,7 +6745,7 @@ make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols) foreach(j, plan->targetlist) { tle = (TargetEntry *) lfirst(j); - em = find_ec_member_matching_expr(ec, tle->expr, NULL); + em = find_ec_member_matching_expr(ec, tle->expr, relids); if (em) { /* found expr already in tlist */ @@ -6926,7 +6793,8 @@ make_gather(List *qptlist, int nworkers, int rescan_param, bool single_copy, - Plan *subplan) + Plan *subplan, + bool process_temp_tables) { Gather *node = makeNode(Gather); Plan *plan = &node->plan; @@ -6940,6 +6808,7 @@ make_gather(List *qptlist, node->single_copy = single_copy; node->invisible = false; node->initParam = NULL; + node->process_temp_tables = process_temp_tables; return node; } @@ -6952,7 +6821,7 @@ make_gather(List *qptlist, static SetOp * make_setop(SetOpCmd cmd, SetOpStrategy strategy, List *tlist, Plan *lefttree, Plan *righttree, - List *groupList, long numGroups) + List *groupList, Cardinality numGroups) { SetOp *node = makeNode(SetOp); Plan *plan = &node->plan; @@ -6973,10 +6842,10 @@ make_setop(SetOpCmd cmd, SetOpStrategy strategy, * convert SortGroupClause list into arrays of attr indexes and comparison * operators, as wanted by executor */ - cmpColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); - cmpOperators = (Oid *) palloc(sizeof(Oid) * numCols); - cmpCollations = (Oid *) palloc(sizeof(Oid) * numCols); - cmpNullsFirst = (bool *) palloc(sizeof(bool) * numCols); + cmpColIdx = palloc_array(AttrNumber, numCols); + cmpOperators = palloc_array(Oid, numCols); + cmpCollations = palloc_array(Oid, numCols); + cmpNullsFirst = palloc_array(bool, numCols); foreach(slitem, groupList) { @@ -7056,22 +6925,57 @@ make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, } /* - * make_result - * Build a Result plan node + * make_gating_result + * Build a Result plan node that performs projection of a subplan, and/or + * applies a one-time filter (resconstantqual) */ static Result * -make_result(List *tlist, - Node *resconstantqual, - Plan *subplan) +make_gating_result(List *tlist, + Node *resconstantqual, + Plan *subplan) { Result *node = makeNode(Result); Plan *plan = &node->plan; + Assert(subplan != NULL); + plan->targetlist = tlist; plan->qual = NIL; plan->lefttree = subplan; plan->righttree = NULL; + node->result_type = RESULT_TYPE_GATING; + node->resconstantqual = resconstantqual; + node->relids = NULL; + + return node; +} + +/* + * make_one_row_result + * Build a Result plan node that returns a single row (or possibly no rows, + * if the one-time filter defined by resconstantqual returns false) + * + * 'rel' should be this path's RelOptInfo. In essence, we're saying that this + * Result node generates all the tuples for that RelOptInfo. Note that the same + * consideration can never arise in make_gating_result(), because in that case + * the tuples are always coming from some subordinate node.
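+ * (For instance, a hypothetical "SELECT 2 + 2" is planned as a single
+ * one-row Result whose relids identify the RTE_RESULT base rel it
+ * stands in for.)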
+ */ +static Result * +make_one_row_result(List *tlist, + Node *resconstantqual, + RelOptInfo *rel) +{ + Result *node = makeNode(Result); + Plan *plan = &node->plan; + + plan->targetlist = tlist; + plan->qual = NIL; + plan->lefttree = NULL; + plan->righttree = NULL; + node->result_type = IS_UPPER_REL(rel) ? RESULT_TYPE_UPPER : + IS_JOIN_REL(rel) ? RESULT_TYPE_JOIN : RESULT_TYPE_SCAN; node->resconstantqual = resconstantqual; + node->relids = rel->relids; return node; } @@ -7103,7 +7007,6 @@ static ModifyTable * make_modifytable(PlannerInfo *root, Plan *subplan, CmdType operation, bool canSetTag, Index nominalRelation, Index rootRelation, - bool partColsUpdated, List *resultRelations, List *updateColnosLists, List *withCheckOptionLists, List *returningLists, @@ -7114,6 +7017,8 @@ make_modifytable(PlannerInfo *root, Plan *subplan, ModifyTable *node = makeNode(ModifyTable); bool returning_old_or_new = false; bool returning_old_or_new_valid = false; + bool transition_tables = false; + bool transition_tables_valid = false; List *fdw_private_list; Bitmapset *direct_modify_plans; ListCell *lc; @@ -7138,7 +7043,6 @@ make_modifytable(PlannerInfo *root, Plan *subplan, node->canSetTag = canSetTag; node->nominalRelation = nominalRelation; node->rootRelation = rootRelation; - node->partColsUpdated = partColsUpdated; node->resultRelations = resultRelations; if (!onconflict) { @@ -7260,8 +7164,8 @@ make_modifytable(PlannerInfo *root, Plan *subplan, * callback functions needed for that and (2) there are no local * structures that need to be run for each modified row: row-level * triggers on the foreign table, stored generated columns, WITH CHECK - * OPTIONs from parent views, or Vars returning OLD/NEW in the - * RETURNING list. + * OPTIONs from parent views, Vars returning OLD/NEW in the RETURNING + * list, or transition tables on the named relation. 
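+ * (For example, a hypothetical AFTER UPDATE trigger declared with
+ * REFERENCING NEW TABLE AS newtab on the named relation forces each
+ * modified row through the local executor so the transition table can
+ * be populated, which rules out direct modification.)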
*/ direct_modify = false; if (fdwroutine != NULL && @@ -7273,7 +7177,10 @@ make_modifytable(PlannerInfo *root, Plan *subplan, !has_row_triggers(root, rti, operation) && !has_stored_generated_columns(root, rti)) { - /* returning_old_or_new is the same for all result relations */ + /* + * returning_old_or_new and transition_tables are the same for all + * result relations, respectively + */ if (!returning_old_or_new_valid) { returning_old_or_new = @@ -7282,7 +7189,18 @@ make_modifytable(PlannerInfo *root, Plan *subplan, returning_old_or_new_valid = true; } if (!returning_old_or_new) - direct_modify = fdwroutine->PlanDirectModify(root, node, rti, i); + { + if (!transition_tables_valid) + { + transition_tables = has_transition_tables(root, + nominalRelation, + operation); + transition_tables_valid = true; + } + if (!transition_tables) + direct_modify = fdwroutine->PlanDirectModify(root, node, + rti, i); + } } if (direct_modify) direct_modify_plans = bms_add_member(direct_modify_plans, i); diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 01804b085b3ba..f7f76469a2dbf 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/nbtree.h" #include "catalog/pg_constraint.h" #include "catalog/pg_type.h" #include "nodes/makefuncs.h" @@ -81,6 +82,12 @@ typedef struct JoinTreeItem } JoinTreeItem; +static bool is_partial_agg_memory_risky(PlannerInfo *root); +static void create_agg_clause_infos(PlannerInfo *root); +static void create_grouping_expr_infos(PlannerInfo *root); +static EquivalenceClass *get_eclass_for_sortgroupclause(PlannerInfo *root, + SortGroupClause *sgc, + Expr *expr); static void extract_lateral_references(PlannerInfo *root, RelOptInfo *brel, Index rtindex); static List *deconstruct_recurse(PlannerInfo *root, Node *jtnode, @@ -431,8 +438,7 @@ remove_useless_groupby_columns(PlannerInfo *root) * Fill groupbyattnos[k] with a bitmapset of the column attnos of RTE k * that are GROUP BY items. */ - groupbyattnos = (Bitmapset **) palloc0(sizeof(Bitmapset *) * - (list_length(parse->rtable) + 1)); + groupbyattnos = palloc0_array(Bitmapset *, list_length(parse->rtable) + 1); foreach(lc, root->processed_groupClause) { SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); @@ -590,8 +596,7 @@ remove_useless_groupby_columns(PlannerInfo *root) * allocate the surplusvars[] array until we find something. */ if (surplusvars == NULL) - surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) * - (list_length(parse->rtable) + 1)); + surplusvars = palloc0_array(Bitmapset *, list_length(parse->rtable) + 1); /* Remember the attnos of the removable columns */ surplusvars[relid] = bms_difference(relattnos, best_keycolumns); @@ -628,6 +633,368 @@ remove_useless_groupby_columns(PlannerInfo *root) } } +/* + * setup_eager_aggregation + * Check if eager aggregation is applicable, and if so collect suitable + * aggregate expressions and grouping expressions in the query. + */ +void +setup_eager_aggregation(PlannerInfo *root) +{ + /* + * Don't apply eager aggregation if disabled by user. + */ + if (!enable_eager_aggregate) + return; + + /* + * Don't apply eager aggregation if there are no available GROUP BY + * clauses. + */ + if (!root->processed_groupClause) + return; + + /* + * For now we don't try to support grouping sets. + */ + if (root->parse->groupingSets) + return; + + /* + * For now we don't try to support DISTINCT or ORDER BY aggregates. 
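+ * (That is, examples like SUM(DISTINCT x) or ARRAY_AGG(y ORDER BY y):
+ * their results depend on seeing the whole input at once, so they
+ * cannot be finalized from per-relation partial states.)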
+ */ + if (root->numOrderedAggs > 0) + return; + + /* + * If there are any aggregates that do not support partial mode, or any + * partial aggregates that are non-serializable, do not apply eager + * aggregation. + */ + if (root->hasNonPartialAggs || root->hasNonSerialAggs) + return; + + /* + * We don't try to apply eager aggregation if there are set-returning + * functions in targetlist. + */ + if (root->parse->hasTargetSRFs) + return; + + /* + * Eager aggregation only makes sense if there are multiple base rels in + * the query. + */ + if (bms_membership(root->all_baserels) != BMS_MULTIPLE) + return; + + /* + * Don't apply eager aggregation if any aggregate poses a risk of + * excessive memory usage during partial aggregation. + */ + if (is_partial_agg_memory_risky(root)) + return; + + /* + * Collect aggregate expressions and plain Vars that appear in the + * targetlist and havingQual. + */ + create_agg_clause_infos(root); + + /* + * If there are no suitable aggregate expressions, we cannot apply eager + * aggregation. + */ + if (root->agg_clause_list == NIL) + return; + + /* + * Collect grouping expressions that appear in grouping clauses. + */ + create_grouping_expr_infos(root); +} + +/* + * is_partial_agg_memory_risky + * Check if any aggregate poses a risk of excessive memory usage during + * partial aggregation. + * + * We check if any aggregate has a negative aggtransspace value, which + * indicates that its transition state data can grow unboundedly in size. + * Applying eager aggregation in such cases risks high memory usage since + * partial aggregation results might be stored in join hash tables or + * materialized nodes. + */ +static bool +is_partial_agg_memory_risky(PlannerInfo *root) +{ + ListCell *lc; + + foreach(lc, root->aggtransinfos) + { + AggTransInfo *transinfo = lfirst_node(AggTransInfo, lc); + + if (transinfo->aggtransspace < 0) + return true; + } + + return false; +} + +/* + * create_agg_clause_infos + * Search the targetlist and havingQual for Aggrefs and plain Vars, and + * create an AggClauseInfo for each Aggref node. + */ +static void +create_agg_clause_infos(PlannerInfo *root) +{ + List *tlist_exprs; + List *agg_clause_list = NIL; + List *tlist_vars = NIL; + Relids aggregate_relids = NULL; + bool eager_agg_applicable = true; + ListCell *lc; + + Assert(root->agg_clause_list == NIL); + Assert(root->tlist_vars == NIL); + + tlist_exprs = pull_var_clause((Node *) root->processed_tlist, + PVC_INCLUDE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_RECURSE_PLACEHOLDERS); + + /* + * Aggregates within the HAVING clause need to be processed in the same + * way as those in the targetlist. Note that HAVING can contain Aggrefs + * but not WindowFuncs. 
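+ * (E.g., in a hypothetical "... GROUP BY g HAVING SUM(x) > 10", the
+ * SUM(x) Aggref is collected here exactly as one appearing in the
+ * SELECT list would be.)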
+ */ + if (root->parse->havingQual != NULL) + { + List *having_exprs; + + having_exprs = pull_var_clause((Node *) root->parse->havingQual, + PVC_INCLUDE_AGGREGATES | + PVC_RECURSE_PLACEHOLDERS); + if (having_exprs != NIL) + { + tlist_exprs = list_concat(tlist_exprs, having_exprs); + list_free(having_exprs); + } + } + + foreach(lc, tlist_exprs) + { + Expr *expr = (Expr *) lfirst(lc); + Aggref *aggref; + Relids agg_eval_at; + AggClauseInfo *ac_info; + + /* For now we don't try to support GROUPING() expressions */ + if (IsA(expr, GroupingFunc)) + { + eager_agg_applicable = false; + break; + } + + /* Collect plain Vars for future reference */ + if (IsA(expr, Var)) + { + tlist_vars = list_append_unique(tlist_vars, expr); + continue; + } + + aggref = castNode(Aggref, expr); + + Assert(aggref->aggorder == NIL); + Assert(aggref->aggdistinct == NIL); + + /* + * If there are any securityQuals, do not try to apply eager + * aggregation if any non-leakproof aggregate functions are present. + * This is overly strict, but for now... + */ + if (root->qual_security_level > 0 && + !get_func_leakproof(aggref->aggfnoid)) + { + eager_agg_applicable = false; + break; + } + + agg_eval_at = pull_varnos(root, (Node *) aggref); + + /* + * If all base relations in the query are referenced by aggregate + * functions, then eager aggregation is not applicable. + */ + aggregate_relids = bms_add_members(aggregate_relids, agg_eval_at); + if (bms_is_subset(root->all_baserels, aggregate_relids)) + { + eager_agg_applicable = false; + break; + } + + /* OK, create the AggClauseInfo node */ + ac_info = makeNode(AggClauseInfo); + ac_info->aggref = aggref; + ac_info->agg_eval_at = agg_eval_at; + + /* ... and add it to the list */ + agg_clause_list = list_append_unique(agg_clause_list, ac_info); + } + + list_free(tlist_exprs); + + if (eager_agg_applicable) + { + root->agg_clause_list = agg_clause_list; + root->tlist_vars = tlist_vars; + } + else + { + list_free_deep(agg_clause_list); + list_free(tlist_vars); + } +} + +/* + * create_grouping_expr_infos + * Create a GroupingExprInfo for each expression usable as grouping key. + * + * If any grouping expression is not suitable, we will just return with + * root->group_expr_list being NIL. + */ +static void +create_grouping_expr_infos(PlannerInfo *root) +{ + List *exprs = NIL; + List *sortgrouprefs = NIL; + List *ecs = NIL; + ListCell *lc, + *lc1, + *lc2, + *lc3; + + Assert(root->group_expr_list == NIL); + + foreach(lc, root->processed_groupClause) + { + SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); + TargetEntry *tle = get_sortgroupclause_tle(sgc, root->processed_tlist); + TypeCacheEntry *tce; + Oid equalimageproc; + + Assert(tle->ressortgroupref > 0); + + /* + * For now we only support plain Vars as grouping expressions. + */ + if (!IsA(tle->expr, Var)) + return; + + /* + * Eager aggregation is only possible if equality implies image + * equality for each grouping key. Otherwise, placing keys with + * different byte images into the same group may result in the loss of + * information that could be necessary to evaluate upper qual clauses. + * + * For instance, the NUMERIC data type is not supported, as values + * that are considered equal by the equality operator (e.g., 0 and + * 0.0) can have different scales. 
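+ * (Concretely: merging 0 and 0.0 during partial aggregation would
+ * retain one representative datum arbitrarily, so a later clause such
+ * as a hypothetical "key::text = '0.0'" could change its result.)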
+ */ + tce = lookup_type_cache(exprType((Node *) tle->expr), + TYPECACHE_BTREE_OPFAMILY); + if (!OidIsValid(tce->btree_opf) || + !OidIsValid(tce->btree_opintype)) + return; + + equalimageproc = get_opfamily_proc(tce->btree_opf, + tce->btree_opintype, + tce->btree_opintype, + BTEQUALIMAGE_PROC); + if (!OidIsValid(equalimageproc) || + !DatumGetBool(OidFunctionCall1Coll(equalimageproc, + tce->typcollation, + ObjectIdGetDatum(tce->btree_opintype)))) + return; + + exprs = lappend(exprs, tle->expr); + sortgrouprefs = lappend_int(sortgrouprefs, tle->ressortgroupref); + ecs = lappend(ecs, get_eclass_for_sortgroupclause(root, sgc, tle->expr)); + } + + /* + * Construct a GroupingExprInfo for each expression. + */ + forthree(lc1, exprs, lc2, sortgrouprefs, lc3, ecs) + { + Expr *expr = (Expr *) lfirst(lc1); + int sortgroupref = lfirst_int(lc2); + EquivalenceClass *ec = (EquivalenceClass *) lfirst(lc3); + GroupingExprInfo *ge_info; + + ge_info = makeNode(GroupingExprInfo); + ge_info->expr = (Expr *) copyObject(expr); + ge_info->sortgroupref = sortgroupref; + ge_info->ec = ec; + + root->group_expr_list = lappend(root->group_expr_list, ge_info); + } +} + +/* + * get_eclass_for_sortgroupclause + * Given a group clause and an expression, find an existing equivalence + * class that the expression is a member of; return NULL if none. + */ +static EquivalenceClass * +get_eclass_for_sortgroupclause(PlannerInfo *root, SortGroupClause *sgc, + Expr *expr) +{ + Oid opfamily, + opcintype, + collation; + CompareType cmptype; + Oid equality_op; + List *opfamilies; + + /* Punt if the group clause is not sortable */ + if (!OidIsValid(sgc->sortop)) + return NULL; + + /* Find the operator in pg_amop --- failure shouldn't happen */ + if (!get_ordering_op_properties(sgc->sortop, + &opfamily, &opcintype, &cmptype)) + elog(ERROR, "operator %u is not a valid ordering operator", + sgc->sortop); + + /* Because SortGroupClause doesn't carry collation, consult the expr */ + collation = exprCollation((Node *) expr); + + /* + * EquivalenceClasses need to contain opfamily lists based on the family + * membership of mergejoinable equality operators, which could belong to + * more than one opfamily. So we have to look up the opfamily's equality + * operator and get its membership. + */ + equality_op = get_opfamily_member_for_cmptype(opfamily, + opcintype, + opcintype, + COMPARE_EQ); + if (!OidIsValid(equality_op)) /* shouldn't happen */ + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + COMPARE_EQ, opcintype, opcintype, opfamily); + opfamilies = get_mergejoin_opfamilies(equality_op); + if (!opfamilies) /* certainly should find some */ + elog(ERROR, "could not find opfamilies for equality operator %u", + equality_op); + + /* Now find a matching EquivalenceClass */ + return get_eclass_for_sort_expr(root, expr, opfamilies, opcintype, + collation, sgc->tleSortGroupRef, + NULL, false); +} + /***************************************************************************** * * LATERAL REFERENCES @@ -3044,42 +3411,6 @@ add_base_clause_to_rel(PlannerInfo *root, Index relid, restrictinfo->security_level); } -/* - * expr_is_nonnullable - * Check to see if the Expr cannot be NULL - * - * If the Expr is a simple Var that is defined NOT NULL and meanwhile is not - * nulled by any outer joins, then we can know that it cannot be NULL. 
- */ -static bool -expr_is_nonnullable(PlannerInfo *root, Expr *expr) -{ - RelOptInfo *rel; - Var *var; - - /* For now only check simple Vars */ - if (!IsA(expr, Var)) - return false; - - var = (Var *) expr; - - /* could the Var be nulled by any outer joins? */ - if (!bms_is_empty(var->varnullingrels)) - return false; - - /* system columns cannot be NULL */ - if (var->varattno < 0) - return true; - - /* is the column defined NOT NULL? */ - rel = find_base_rel(root, var->varno); - if (var->varattno > 0 && - bms_is_member(var->varattno, rel->notnullattnums)) - return true; - - return false; -} - /* * restriction_is_always_true * Check to see if the RestrictInfo is always true. @@ -3116,7 +3447,7 @@ restriction_is_always_true(PlannerInfo *root, if (nulltest->argisrow) return false; - return expr_is_nonnullable(root, nulltest->arg); + return expr_is_nonnullable(root, nulltest->arg, true); } /* If it's an OR, check its sub-clauses */ @@ -3181,7 +3512,7 @@ restriction_is_always_false(PlannerInfo *root, if (nulltest->argisrow) return false; - return expr_is_nonnullable(root, nulltest->arg); + return expr_is_nonnullable(root, nulltest->arg, true); } /* If it's an OR, check its sub-clauses */ diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c index 64605be31781f..1a35c269e04ac 100644 --- a/src/backend/optimizer/plan/planagg.c +++ b/src/backend/optimizer/plan/planagg.c @@ -38,6 +38,7 @@ #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/planmain.h" +#include "optimizer/planner.h" #include "optimizer/subselect.h" #include "optimizer/tlist.h" #include "parser/parse_clause.h" @@ -335,10 +336,12 @@ build_minmax_path(PlannerInfo *root, MinMaxAggInfo *mminfo, * than before. (This means that when we are done, there will be no Vars * of level 1, which is why the subquery can become an initplan.) */ - subroot = (PlannerInfo *) palloc(sizeof(PlannerInfo)); + subroot = palloc_object(PlannerInfo); memcpy(subroot, root, sizeof(PlannerInfo)); subroot->query_level++; subroot->parent_root = root; + subroot->plan_name = choose_plan_name(root->glob, "minmax", true); + /* reset subplan-related stuff */ subroot->plan_params = NIL; subroot->outer_params = NULL; @@ -410,7 +413,7 @@ build_minmax_path(PlannerInfo *root, MinMaxAggInfo *mminfo, parse->limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid, sizeof(int64), Int64GetDatum(1), false, - FLOAT8PASSBYVAL); + true); /* * Generate the best paths for this query, telling query_planner that we diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index 5467e094ca7e0..af9492e20fa96 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -76,6 +76,9 @@ query_planner(PlannerInfo *root, root->placeholder_list = NIL; root->placeholder_array = NULL; root->placeholder_array_size = 0; + root->agg_clause_list = NIL; + root->group_expr_list = NIL; + root->tlist_vars = NIL; root->fkey_list = NIL; root->initial_rels = NIL; @@ -123,7 +126,7 @@ query_planner(PlannerInfo *root, (root->query_level > 1 || debug_parallel_query != DEBUG_PARALLEL_OFF)) final_rel->consider_parallel = - is_parallel_safe(root, parse->jointree->quals); + is_parallel_safe(root, parse->jointree->quals, &final_rel->needs_temp_safety); /* * The only path for it is a trivial Result path. 
We cheat a @@ -265,6 +268,12 @@ query_planner(PlannerInfo *root, */ extract_restriction_or_clauses(root); + /* + * Check if eager aggregation is applicable, and if so, set up + * root->agg_clause_list and root->group_expr_list. + */ + setup_eager_aggregation(root); + /* * Now expand appendrels by adding "otherrels" for their children. We * delay this to the end so that we have as much information as possible diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index ff65867eebee7..4f2bec2f5cd2c 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -58,6 +58,7 @@ #include "parser/parsetree.h" #include "partitioning/partdesc.h" #include "rewrite/rewriteManip.h" +#include "utils/acl.h" #include "utils/backend_status.h" #include "utils/lsyscache.h" #include "utils/rel.h" @@ -72,6 +73,12 @@ bool enable_distinct_reordering = true; /* Hook for plugins to get control in planner() */ planner_hook_type planner_hook = NULL; +/* Hook for plugins to get control after PlannerGlobal is initialized */ +planner_setup_hook_type planner_setup_hook = NULL; + +/* Hook for plugins to get control before PlannerGlobal is discarded */ +planner_shutdown_hook_type planner_shutdown_hook = NULL; + /* Hook for plugins to get control when grouping_planner() plans upper rels */ create_upper_paths_hook_type create_upper_paths_hook = NULL; @@ -231,7 +238,6 @@ static void add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, RelOptInfo *partially_grouped_rel, const AggClauseCosts *agg_costs, grouping_sets_data *gd, - double dNumGroups, GroupPathExtraData *extra); static RelOptInfo *create_partial_grouping_paths(PlannerInfo *root, RelOptInfo *grouped_rel, @@ -267,12 +273,35 @@ static bool group_by_has_partkey(RelOptInfo *input_rel, static int common_prefix_cmp(const void *a, const void *b); static List *generate_setop_child_grouplist(SetOperationStmt *op, List *targetlist); +static void create_final_unique_paths(PlannerInfo *root, RelOptInfo *input_rel, + List *sortPathkeys, List *groupClause, + SpecialJoinInfo *sjinfo, RelOptInfo *unique_rel); +static void create_partial_unique_paths(PlannerInfo *root, RelOptInfo *input_rel, + List *sortPathkeys, List *groupClause, + SpecialJoinInfo *sjinfo, RelOptInfo *unique_rel); /***************************************************************************** * * Query optimizer entry point * + * Inputs: + * parse: an analyzed-and-rewritten query tree for an optimizable statement + * query_string: source text for the query tree (used for error reports) + * cursorOptions: bitmask of CURSOR_OPT_XXX flags, see parsenodes.h + * boundParams: passed-in parameter values, or NULL if none + * es: ExplainState if being called from EXPLAIN, else NULL + * + * The result is a PlannedStmt tree. + * + * PARAM_EXTERN Param nodes within the parse tree can be replaced by Consts + * using values from boundParams, if those values are marked PARAM_FLAG_CONST. + * Parameter values not so marked are still relied on for estimation purposes. + * + * The ExplainState pointer is not currently used by the core planner, but it + * is passed through to some planner hooks so that they can report information + * back to EXPLAIN extension hooks. + * * To support loadable plugins that monitor or modify planner behavior, * we provide a hook variable that lets a plugin get control before and * after the standard planning process. 
@@ -231,7 +238,6 @@ static void add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
                                       RelOptInfo *partially_grouped_rel,
                                       const AggClauseCosts *agg_costs,
                                       grouping_sets_data *gd,
-                                      double dNumGroups,
                                       GroupPathExtraData *extra);
 static RelOptInfo *create_partial_grouping_paths(PlannerInfo *root,
                                                  RelOptInfo *grouped_rel,
@@ -267,12 +273,35 @@ static bool group_by_has_partkey(RelOptInfo *input_rel,
 static int  common_prefix_cmp(const void *a, const void *b);
 static List *generate_setop_child_grouplist(SetOperationStmt *op,
                                             List *targetlist);
+static void create_final_unique_paths(PlannerInfo *root, RelOptInfo *input_rel,
+                                      List *sortPathkeys, List *groupClause,
+                                      SpecialJoinInfo *sjinfo, RelOptInfo *unique_rel);
+static void create_partial_unique_paths(PlannerInfo *root, RelOptInfo *input_rel,
+                                        List *sortPathkeys, List *groupClause,
+                                        SpecialJoinInfo *sjinfo, RelOptInfo *unique_rel);
 
 
 /*****************************************************************************
  *
  *     Query optimizer entry point
  *
+ * Inputs:
+ *  parse: an analyzed-and-rewritten query tree for an optimizable statement
+ *  query_string: source text for the query tree (used for error reports)
+ *  cursorOptions: bitmask of CURSOR_OPT_XXX flags, see parsenodes.h
+ *  boundParams: passed-in parameter values, or NULL if none
+ *  es: ExplainState if being called from EXPLAIN, else NULL
+ *
+ * The result is a PlannedStmt tree.
+ *
+ * PARAM_EXTERN Param nodes within the parse tree can be replaced by Consts
+ * using values from boundParams, if those values are marked PARAM_FLAG_CONST.
+ * Parameter values not so marked are still relied on for estimation purposes.
+ *
+ * The ExplainState pointer is not currently used by the core planner, but it
+ * is passed through to some planner hooks so that they can report information
+ * back to EXPLAIN extension hooks.
+ *
  * To support loadable plugins that monitor or modify planner behavior,
  * we provide a hook variable that lets a plugin get control before and
  * after the standard planning process.  The plugin would normally call
@@ -284,14 +313,16 @@ static List *generate_setop_child_grouplist(SetOperationStmt *op,
  *****************************************************************************/
 
 PlannedStmt *
 planner(Query *parse, const char *query_string, int cursorOptions,
-        ParamListInfo boundParams)
+        ParamListInfo boundParams, ExplainState *es)
 {
     PlannedStmt *result;
 
     if (planner_hook)
-        result = (*planner_hook) (parse, query_string, cursorOptions, boundParams);
+        result = (*planner_hook) (parse, query_string, cursorOptions,
+                                  boundParams, es);
     else
-        result = standard_planner(parse, query_string, cursorOptions, boundParams);
+        result = standard_planner(parse, query_string, cursorOptions,
+                                  boundParams, es);
 
     pgstat_report_plan_id(result->planId, false);
 
@@ -300,7 +331,7 @@ planner(Query *parse, const char *query_string, int cursorOptions,
 
 PlannedStmt *
 standard_planner(Query *parse, const char *query_string, int cursorOptions,
-                 ParamListInfo boundParams)
+                 ParamListInfo boundParams, ExplainState *es)
 {
     PlannedStmt *result;
     PlannerGlobal *glob;
@@ -342,6 +373,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
     glob->transientPlan = false;
     glob->dependsOnRole = false;
     glob->partition_directory = NULL;
+    glob->rel_notnullatts_hash = NULL;
 
     /*
      * Assess whether it's feasible to use parallel mode for this query.  We
@@ -430,8 +462,13 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
         tuple_fraction = 0.0;
     }
 
+    /* Allow plugins to take control after we've initialized "glob" */
+    if (planner_setup_hook)
+        (*planner_setup_hook) (glob, parse, query_string, &tuple_fraction, es);
+
     /* primary planning entry point (may recurse for subqueries) */
-    root = subquery_planner(glob, parse, NULL, false, tuple_fraction, NULL);
+    root = subquery_planner(glob, parse, NULL, NULL, false, tuple_fraction,
+                            NULL);
 
     /* Select best Path and turn it into a Plan */
     final_rel = fetch_upper_rel(root, UPPERREL_FINAL, NULL);
@@ -477,6 +514,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
         gather->num_workers = 1;
         gather->single_copy = true;
         gather->invisible = (debug_parallel_query == DEBUG_PARALLEL_REGRESS);
+        gather->process_temp_tables = (best_path->parallel_safe == NEEDS_TEMP_FLUSH);
 
         /* Transfer any initPlans to the new top node */
         gather->plan.initPlan = top_plan->initPlan;
@@ -499,7 +537,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
         gather->plan.plan_rows = top_plan->plan_rows;
         gather->plan.plan_width = top_plan->plan_width;
         gather->plan.parallel_aware = false;
-        gather->plan.parallel_safe = false;
+        gather->plan.parallel_safe = PARALLEL_UNSAFE;
 
         /*
         * Delete the initplans' cost from top_plan.  We needn't add it to the
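Assignments such as `parallel_safe = PARALLEL_UNSAFE` and the `NEEDS_TEMP_FLUSH` comparison above imply that parallel_safe is being widened from a bool into a three-valued enum. A plausible shape, inferred purely from these call sites (the actual definition is not in this excerpt):

/* Hypothetical definition, inferred from the usages in this patch: */
typedef enum ParallelSafety
{
    PARALLEL_UNSAFE,            /* must run in the leader */
    PARALLEL_SAFE,              /* may run in any worker */
    NEEDS_TEMP_FLUSH,           /* parallel-safe only if the leader flushes
                                 * its local temp-table buffers first */
} ParallelSafety;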
@@ -557,6 +595,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 
     result->commandType = parse->commandType;
     result->queryId = parse->queryId;
+    result->planOrigin = PLAN_STMT_STANDARD;
     result->hasReturning = (parse->returningList != NIL);
     result->hasModifyingCTE = parse->hasModifyingCTE;
     result->canSetTag = parse->canSetTag;
@@ -607,6 +646,10 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
             result->jitFlags |= PGJIT_DEFORM;
     }
 
+    /* Allow plugins to take control before we discard "glob" */
+    if (planner_shutdown_hook)
+        (*planner_shutdown_hook) (glob, parse, query_string, result);
+
     if (glob->partition_directory != NULL)
         DestroyPartitionDirectory(glob->partition_directory);
 
@@ -621,6 +664,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
  *
  * glob is the global state for the current planner run.
  * parse is the querytree produced by the parser & rewriter.
+ * plan_name is the name to assign to this subplan (NULL at the top level).
  * parent_root is the immediate parent Query's info (NULL at the top level).
  * hasRecursion is true if this is a recursive WITH query.
  * tuple_fraction is the fraction of tuples we expect will be retrieved.
@@ -647,9 +691,9 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
  *--------------------
  */
 PlannerInfo *
-subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
-                 bool hasRecursion, double tuple_fraction,
-                 SetOperationStmt *setops)
+subquery_planner(PlannerGlobal *glob, Query *parse, char *plan_name,
+                 PlannerInfo *parent_root, bool hasRecursion,
+                 double tuple_fraction, SetOperationStmt *setops)
 {
     PlannerInfo *root;
     List       *newWithCheckOptions;
@@ -664,6 +708,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
     root->parse = parse;
     root->glob = glob;
     root->query_level = parent_root ? parent_root->query_level + 1 : 1;
+    root->plan_name = plan_name;
     root->parent_root = parent_root;
     root->plan_params = NIL;
     root->outer_params = NULL;
@@ -694,12 +739,12 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
     root->hasAlternativeSubPlans = false;
     root->placeholdersFrozen = false;
     root->hasRecursion = hasRecursion;
+    root->assumeReplanning = false;
     if (hasRecursion)
         root->wt_param_id = assign_special_exec_param(root);
     else
         root->wt_param_id = -1;
     root->non_recursive_path = NULL;
-    root->partColsUpdated = false;
 
     /*
      * Create the top-level join domain.  This won't have valid contents until
@@ -720,6 +765,18 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
      */
     transform_MERGE_to_join(parse);
 
+    /*
+     * Scan the rangetable for relation RTEs and retrieve the necessary
+     * catalog information for each relation.  Using this information, clear
+     * the inh flag for any relation that has no children, collect not-null
+     * attribute numbers for any relation that has column not-null
+     * constraints, and expand virtual generated columns for any relation that
+     * contains them.  Note that this step does not descend into sublinks and
+     * subqueries; if we pull up any sublinks or subqueries below, their
+     * relation RTEs are processed just before pulling them up.
+     */
+    parse = root->parse = preprocess_relation_rtes(root);
+
     /*
      * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so
     * that we don't need so many special cases to deal with that situation.
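The inheritance-flag portion of this consolidated pass corresponds to the per-RTE switch case deleted further down; a minimal sketch of that one piece, under the assumption that the rest of preprocess_relation_rtes() (not shown in this excerpt) handles the not-null and generated-column bookkeeping:

#include "catalog/pg_inherits.h"    /* for has_subclass() */

/* Sketch of the inh-clearing step only; the surrounding function is
 * defined elsewhere in the patch. */
static void
maybe_clear_inh_flag(RangeTblEntry *rte)
{
    if (rte->rtekind == RTE_RELATION && rte->inh)
    {
        /* no live children?  Then treat it as a plain base relation */
        rte->inh = has_subclass(rte->relid);
    }
}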
@@ -743,14 +800,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
      */
     preprocess_function_rtes(root);
 
-    /*
-     * Scan the rangetable for relations with virtual generated columns, and
-     * replace all Var nodes in the query that reference these columns with
-     * the generation expressions.  Recursion issues here are handled in the
-     * same way as for SubLinks.
-     */
-    parse = root->parse = expand_virtual_generated_columns(root);
-
     /*
      * Check to see if any subqueries in the jointree can be merged into this
      * query.
@@ -787,23 +836,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
 
         switch (rte->rtekind)
         {
-            case RTE_RELATION:
-                if (rte->inh)
-                {
-                    /*
-                     * Check to see if the relation actually has any children;
-                     * if not, clear the inh flag so we can treat it as a
-                     * plain base relation.
-                     *
-                     * Note: this could give a false-positive result, if the
-                     * rel once had children but no longer does.  We used to
-                     * be able to clear rte->inh later on when we discovered
-                     * that, but no more; we have to handle such cases as
-                     * full-fledged inheritance.
-                     */
-                    rte->inh = has_subclass(rte->relid);
-                }
-                break;
             case RTE_JOIN:
                 root->hasJoinRTEs = true;
                 if (IS_OUTER_JOIN(rte->jointype))
@@ -848,6 +880,38 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
             bms_make_singleton(parse->resultRelation);
     }
 
+    /*
+     * This would be a convenient time to check access permissions for all
+     * relations mentioned in the query, since it would be better to fail now,
+     * before doing any detailed planning.  However, for historical reasons,
+     * we leave this to be done at executor startup.
+     *
+     * Note, however, that we do need to check access permissions for any view
+     * relations mentioned in the query, in order to prevent information being
+     * leaked by selectivity estimation functions, which only check view owner
+     * permissions on underlying tables (see all_rows_selectable() and its
+     * callers).  This is a little ugly, because it means that access
+     * permissions for views will be checked twice, which is another reason
+     * why it would be better to do all the ACL checks here.
+     */
+    foreach(l, parse->rtable)
+    {
+        RangeTblEntry *rte = lfirst_node(RangeTblEntry, l);
+
+        if (rte->perminfoindex != 0 &&
+            rte->relkind == RELKIND_VIEW)
+        {
+            RTEPermissionInfo *perminfo;
+            bool        result;
+
+            perminfo = getRTEPermissionInfo(parse->rteperminfos, rte);
+            result = ExecCheckOneRelPerms(perminfo);
+            if (!result)
+                aclcheck_error(ACLCHECK_NO_PRIV, OBJECT_VIEW,
+                               get_rel_name(perminfo->relid));
+        }
+    }
+
     /*
      * Preprocess RowMark information.  We need to do this after subquery
     * pullup, so that all base relations are present.
@@ -1064,15 +1128,28 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
     if (parse->hasTargetSRFs)
         parse->hasTargetSRFs = expression_returns_set((Node *) parse->targetList);
 
+    /*
+     * If we have grouping sets, expand the groupingSets tree of this query to
+     * a flat list of grouping sets.  We need to do this before optimizing
+     * HAVING, since we can't easily tell if there's an empty grouping set
+     * until we have this representation.
+     */
+    if (parse->groupingSets)
+    {
+        parse->groupingSets =
+            expand_grouping_sets(parse->groupingSets, parse->groupDistinct, -1);
+    }
+
     /*
     * In some cases we may want to transfer a HAVING clause into WHERE.  We
     * cannot do so if the HAVING clause contains aggregates (obviously) or
     * volatile functions (since a HAVING clause is supposed to be executed
-    * only once per group).  We also can't do this if there are any nonempty
-    * grouping sets and the clause references any columns that are nullable
-    * by the grouping sets; moving such a clause into WHERE would potentially
-    * change the results.  (If there are only empty grouping sets, then the
-    * HAVING clause must be degenerate as discussed below.)
+    * only once per group).  We also can't do this if there are any grouping
+    * sets and the clause references any columns that are nullable by the
+    * grouping sets; the nulled values of those columns are not available
+    * before the grouping step.  (The test on groupClause might seem wrong,
+    * but it's okay: it's just an optimization to avoid running pull_varnos
+    * when there cannot be any Vars in the HAVING clause.)
     *
     * Also, it may be that the clause is so expensive to execute that we're
     * better off doing it only once per group, despite the loss of
@@ -1082,19 +1159,19 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
     * clause into WHERE, in hopes of eliminating tuples before aggregation
     * instead of after.
     *
-    * If the query has explicit grouping then we can simply move such a
+    * If the query has no empty grouping set then we can simply move such a
     * clause into WHERE; any group that fails the clause will not be in the
     * output because none of its tuples will reach the grouping or
-    * aggregation stage.  Otherwise we must have a degenerate (variable-free)
-    * HAVING clause, which we put in WHERE so that query_planner() can use it
-    * in a gating Result node, but also keep in HAVING to ensure that we
-    * don't emit a bogus aggregated row.  (This could be done better, but it
-    * seems not worth optimizing.)
+    * aggregation stage.  Otherwise we have to keep the clause in HAVING to
+    * ensure that we don't emit a bogus aggregated row.  But then the HAVING
+    * clause must be degenerate (variable-free), so we can copy it into WHERE
+    * so that query_planner() can use it in a gating Result node.  (This could
+    * be done better, but it seems not worth optimizing.)
     *
     * Note that a HAVING clause may contain expressions that are not fully
     * preprocessed.  This can happen if these expressions are part of
     * grouping items.  In such cases, they are replaced with GROUP Vars in
-    * the parser and then replaced back after we've done with expression
+    * the parser and then replaced back after we're done with expression
     * preprocessing on havingQual.  This is not an issue if the clause
     * remains in HAVING, because these expressions will be matched to lower
     * target items in setrefs.c.  However, if the clause is moved or copied
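The "no empty grouping set" condition that the next hunk adds checks only the first element of parse->groupingSets, which works if expand_grouping_sets() (called earlier) returns the flat list sorted so that a shorter set comes first — an assumption inferred from the test itself. In isolation:

/*
 * Illustrative restatement of the new test: after expansion, each
 * grouping set is a flat integer List, and an empty grouping set is
 * simply NIL.  Assuming an empty set sorts to the front, checking the
 * head of the list suffices.
 */
bool        have_empty_set = (parse->groupingSets != NIL &&
                              (List *) linitial(parse->groupingSets) == NIL);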
@@ -1119,8 +1196,11 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
             /* keep it in HAVING */
             newHaving = lappend(newHaving, havingclause);
         }
-        else if (parse->groupClause)
+        else if (parse->groupClause &&
+                 (parse->groupingSets == NIL ||
+                  (List *) linitial(parse->groupingSets) != NIL))
         {
+            /* There is GROUP BY, but no empty grouping set */
             Node       *whereclause;
 
             /* Preprocess the HAVING clause fully */
@@ -1133,6 +1213,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
         }
         else
         {
+            /* There is an empty grouping set (perhaps implicitly) */
             Node       *whereclause;
 
             /* Preprocess the HAVING clause fully */
@@ -1393,6 +1474,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
         List       *final_targets;
         List       *final_targets_contain_srfs;
         bool        final_target_parallel_safe;
+        bool        needs_temp_flush = false;
         RelOptInfo *current_rel;
         RelOptInfo *final_rel;
         FinalPathExtraData extra;
@@ -1444,7 +1526,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
 
         /* And check whether it's parallel safe */
         final_target_parallel_safe =
-            is_parallel_safe(root, (Node *) final_target->exprs);
+            is_parallel_safe(root, (Node *) final_target->exprs, &needs_temp_flush);
 
         /* The setop result tlist couldn't contain any SRFs */
         Assert(!parse->hasTargetSRFs);
@@ -1614,7 +1696,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
          */
         final_target = create_pathtarget(root, root->processed_tlist);
         final_target_parallel_safe =
-            is_parallel_safe(root, (Node *) final_target->exprs);
+            is_parallel_safe(root, (Node *) final_target->exprs, &needs_temp_flush);
 
         /*
          * If ORDER BY was given, consider whether we should use a post-sort
@@ -1627,7 +1709,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
                                                            final_target,
                                                            &have_postponed_srfs);
             sort_input_target_parallel_safe =
-                is_parallel_safe(root, (Node *) sort_input_target->exprs);
+                is_parallel_safe(root, (Node *) sort_input_target->exprs, &needs_temp_flush);
         }
         else
         {
@@ -1646,7 +1728,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
                                                           final_target,
                                                           activeWindows);
             grouping_target_parallel_safe =
-                is_parallel_safe(root, (Node *) grouping_target->exprs);
+                is_parallel_safe(root, (Node *) grouping_target->exprs, &needs_temp_flush);
         }
         else
         {
@@ -1665,7 +1747,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
         {
             scanjoin_target = make_group_input_target(root, final_target);
             scanjoin_target_parallel_safe =
-                is_parallel_safe(root, (Node *) scanjoin_target->exprs);
+                is_parallel_safe(root, (Node *) scanjoin_target->exprs, &needs_temp_flush);
         }
         else
         {
@@ -1694,9 +1776,10 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
             sort_input_target = linitial_node(PathTarget, sort_input_targets);
             Assert(!linitial_int(sort_input_targets_contain_srfs));
             /* likewise for grouping_target vs. scanjoin_target */
-            split_pathtarget_at_srfs(root, grouping_target, scanjoin_target,
-                                     &grouping_targets,
-                                     &grouping_targets_contain_srfs);
+            split_pathtarget_at_srfs_grouping(root,
+                                              grouping_target, scanjoin_target,
+                                              &grouping_targets,
+                                              &grouping_targets_contain_srfs);
             grouping_target = linitial_node(PathTarget, grouping_targets);
             Assert(!linitial_int(grouping_targets_contain_srfs));
             /* scanjoin_target will not have any SRFs precomputed for it */
@@ -1716,6 +1799,18 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
             scanjoin_targets_contain_srfs = NIL;
         }
 
+        /*
+         * Each path's individual target may or may not reference relations
+         * with temporary storage.  An earlier attempt to track this precisely
+         * ended up adding a new Target::needs_temp_flush field, which seems
+         * too invasive for this first attempt.  So just set the current_rel
+         * flag when temp-buffer flushing is needed, and let Gather do that
+         * job earlier than it otherwise might.
+         * XXX: we need to be sure that no new path has been created with all
+         * these target lists up to this point.
+         */
+        current_rel->needs_temp_safety |= needs_temp_flush;
+
         /* Apply scan/join target. */
         scanjoin_target_same_exprs = list_length(scanjoin_targets) == 1
             && equal(scanjoin_target->exprs, current_rel->reltarget->exprs);
@@ -1824,9 +1919,13 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
      * query.
      */
     if (current_rel->consider_parallel &&
-        is_parallel_safe(root, parse->limitOffset) &&
-        is_parallel_safe(root, parse->limitCount))
+        is_parallel_safe(root, parse->limitOffset, &needs_temp_flush) &&
+        is_parallel_safe(root, parse->limitCount, &needs_temp_flush))
+    {
         final_rel->consider_parallel = true;
+        final_rel->needs_temp_safety |=
+            current_rel->needs_temp_safety | needs_temp_flush;
+    }
 
     /*
      * If the current_rel belongs to a single FDW, so does the final_rel.
@@ -2063,7 +2162,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
                                         parse->canSetTag,
                                         parse->resultRelation,
                                         rootRelation,
-                                        root->partColsUpdated,
                                         resultRelations,
                                         updateColnosLists,
                                         withCheckOptionLists,
@@ -2119,10 +2217,13 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
 }
 
 /*
- * Do preprocessing for groupingSets clause and related data.  This handles the
- * preliminary steps of expanding the grouping sets, organizing them into lists
- * of rollups, and preparing annotations which will later be filled in with
- * size estimates.
+ * Do preprocessing for groupingSets clause and related data.
+ *
+ * We expect that parse->groupingSets has already been expanded into a flat
+ * list of grouping sets (that is, just integer Lists of ressortgroupref
+ * numbers) by expand_grouping_sets().  This function handles the preliminary
+ * steps of organizing the grouping sets into lists of rollups, and preparing
+ * annotations which will later be filled in with size estimates.
 */
 static grouping_sets_data *
 preprocess_grouping_sets(PlannerInfo *root)
@@ -2131,14 +2232,7 @@ preprocess_grouping_sets(PlannerInfo *root)
     List       *sets;
     int         maxref = 0;
     ListCell   *lc_set;
-    grouping_sets_data *gd = palloc0(sizeof(grouping_sets_data));
-
-    parse->groupingSets = expand_grouping_sets(parse->groupingSets, parse->groupDistinct, -1);
-
-    gd->any_hashable = false;
-    gd->unhashable_refs = NULL;
-    gd->unsortable_refs = NULL;
-    gd->unsortable_sets = NIL;
+    grouping_sets_data *gd = palloc0_object(grouping_sets_data);
 
     /*
      * We don't currently make any attempt to optimize the groupClause when
@@ -2146,6 +2240,12 @@ preprocess_grouping_sets(PlannerInfo *root)
      */
     root->processed_groupClause = parse->groupClause;
 
+    /* Detect unhashable and unsortable grouping expressions */
+    gd->any_hashable = false;
+    gd->unhashable_refs = NULL;
+    gd->unsortable_refs = NULL;
+    gd->unsortable_sets = NIL;
+
     if (parse->groupClause)
     {
         ListCell   *lc;
@@ -3868,8 +3968,11 @@ make_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
      * target list and HAVING quals are parallel-safe.
      */
     if (input_rel->consider_parallel && target_parallel_safe &&
-        is_parallel_safe(root, (Node *) havingQual))
+        is_parallel_safe(root, (Node *) havingQual, &grouped_rel->needs_temp_safety))
+    {
         grouped_rel->consider_parallel = true;
+        grouped_rel->needs_temp_safety |= input_rel->needs_temp_safety;
+    }
 
     /*
      * If the input rel belongs to a single FDW, so does the grouped rel.
@@ -3982,9 +4085,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel,
                                GroupPathExtraData *extra,
                                RelOptInfo **partially_grouped_rel_p)
 {
-    Path       *cheapest_path = input_rel->cheapest_total_path;
     RelOptInfo *partially_grouped_rel = NULL;
-    double      dNumGroups;
     PartitionwiseAggregateType patype = PARTITIONWISE_AGGREGATE_NONE;
 
     /*
@@ -4066,23 +4167,16 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel,
 
     /* Gather any partially grouped partial paths. */
     if (partially_grouped_rel && partially_grouped_rel->partial_pathlist)
-    {
         gather_grouping_paths(root, partially_grouped_rel);
-        set_cheapest(partially_grouped_rel);
-    }
 
-    /*
-     * Estimate number of groups.
-     */
-    dNumGroups = get_number_of_groups(root,
-                                      cheapest_path->rows,
-                                      gd,
-                                      extra->targetList);
+    /* Now choose the best path(s) for partially_grouped_rel. */
+    if (partially_grouped_rel && partially_grouped_rel->pathlist)
+        set_cheapest(partially_grouped_rel);
 
     /* Build final grouping paths */
     add_paths_to_grouping_rel(root, input_rel, grouped_rel,
                               partially_grouped_rel, agg_costs, gd,
-                              dNumGroups, extra);
+                              extra);
 
     /* Give a helpful error if we failed to find any implementation */
     if (grouped_rel->pathlist == NIL)
@@ -4497,8 +4591,11 @@ create_window_paths(PlannerInfo *root,
      * target list and active windows for non-parallel-safe constructs.
      */
     if (input_rel->consider_parallel && output_target_parallel_safe &&
-        is_parallel_safe(root, (Node *) activeWindows))
+        is_parallel_safe(root, (Node *) activeWindows, &window_rel->needs_temp_safety))
+    {
         window_rel->consider_parallel = true;
+        window_rel->needs_temp_safety |= input_rel->needs_temp_safety;
+    }
 
     /*
     * If the input rel belongs to a single FDW, so does the window rel.
@@ -4893,7 +4990,7 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
             limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
                                             sizeof(int64),
                                             Int64GetDatum(1), false,
-                                            FLOAT8PASSBYVAL);
+                                            true);
 
             /*
              * Apply a LimitPath onto the partial path to restrict the
@@ -4917,10 +5014,10 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
         else
         {
             add_partial_path(partial_distinct_rel, (Path *)
-                             create_upper_unique_path(root, partial_distinct_rel,
-                                                      sorted_path,
-                                                      list_length(root->distinct_pathkeys),
-                                                      numDistinctRows));
+                             create_unique_path(root, partial_distinct_rel,
+                                                sorted_path,
+                                                list_length(root->distinct_pathkeys),
+                                                numDistinctRows));
         }
     }
 }
@@ -5096,7 +5193,7 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
                 limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
                                                 sizeof(int64),
                                                 Int64GetDatum(1), false,
-                                                FLOAT8PASSBYVAL);
+                                                true);
 
                 /*
                  * If the query already has a LIMIT clause, then we could
@@ -5111,10 +5208,10 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
             else
             {
                 add_path(distinct_rel, (Path *)
-                         create_upper_unique_path(root, distinct_rel,
-                                                  sorted_path,
-                                                  list_length(root->distinct_pathkeys),
-                                                  numDistinctRows));
+                         create_unique_path(root, distinct_rel,
+                                            sorted_path,
+                                            list_length(root->distinct_pathkeys),
+                                            numDistinctRows));
             }
         }
     }
@@ -5905,8 +6002,8 @@ select_active_windows(PlannerInfo *root, WindowFuncLists *wflists)
     List       *result = NIL;
     ListCell   *lc;
     int         nActive = 0;
-    WindowClauseSortData *actives = palloc(sizeof(WindowClauseSortData)
-                                           * list_length(windowClause));
+    WindowClauseSortData *actives = palloc_array(WindowClauseSortData,
+                                                 list_length(windowClause));
 
     /* First, construct an array of the active windows */
     foreach(lc, windowClause)
@@ -6879,7 +6976,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid)
 *
 * tableOid is the table on which the index is to be built.  indexOid is the
 * OID of an index to be created or reindexed (which must be an index with
- * support for parallel builds - currently btree or BRIN).
+ * support for parallel builds - currently btree, GIN, or BRIN).
 *
 * Return value is the number of parallel worker processes to request.  It
 * may be unsafe to proceed if this is 0.  Note that this does not include the
@@ -6960,10 +7057,12 @@ plan_create_index_workers(Oid tableOid, Oid indexOid)
     * Currently, parallel workers can't access the leader's temporary tables.
     * Furthermore, any index predicate or index expressions must be parallel
     * safe.
+    * TODO: Is this hard to enable?
     */
    if (heap->rd_rel->relpersistence == RELPERSISTENCE_TEMP ||
-        !is_parallel_safe(root, (Node *) RelationGetIndexExpressions(index)) ||
-        !is_parallel_safe(root, (Node *) RelationGetIndexPredicate(index)))
+        !is_parallel_safe(root, (Node *) RelationGetIndexExpressions(index), &rel->needs_temp_safety) ||
+        !is_parallel_safe(root, (Node *) RelationGetIndexPredicate(index), &rel->needs_temp_safety) ||
+        rel->needs_temp_safety)
     {
         parallel_workers = 0;
         goto done;
@@ -7027,16 +7126,42 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
                           RelOptInfo *grouped_rel,
                           RelOptInfo *partially_grouped_rel,
                           const AggClauseCosts *agg_costs,
-                          grouping_sets_data *gd, double dNumGroups,
+                          grouping_sets_data *gd,
                           GroupPathExtraData *extra)
 {
     Query      *parse = root->parse;
     Path       *cheapest_path = input_rel->cheapest_total_path;
+    Path       *cheapest_partially_grouped_path = NULL;
     ListCell   *lc;
     bool        can_hash = (extra->flags & GROUPING_CAN_USE_HASH) != 0;
     bool        can_sort = (extra->flags & GROUPING_CAN_USE_SORT) != 0;
     List       *havingQual = (List *) extra->havingQual;
     AggClauseCosts *agg_final_costs = &extra->agg_final_costs;
+    double      dNumGroups = 0;
+    double      dNumFinalGroups = 0;
+
+    /*
+     * Estimate number of groups for non-split aggregation.
+     */
+    dNumGroups = get_number_of_groups(root,
+                                      cheapest_path->rows,
+                                      gd,
+                                      extra->targetList);
+
+    if (partially_grouped_rel && partially_grouped_rel->pathlist)
+    {
+        cheapest_partially_grouped_path =
+            partially_grouped_rel->cheapest_total_path;
+
+        /*
+         * Estimate number of groups for final phase of partial aggregation.
+         */
+        dNumFinalGroups =
+            get_number_of_groups(root,
+                                 cheapest_partially_grouped_path->rows,
+                                 gd,
+                                 extra->targetList);
+    }
 
     if (can_sort)
     {
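Why the separate dNumFinalGroups matters, with invented numbers (illustration only, not taken from the patch):

/*
 * Invented illustration: 1,000,000 input rows falling into 10,000
 * groups, partially aggregated by 4 workers.  The finalization step
 * then reads at most ~40,000 partially grouped rows (one per group per
 * worker), not 1,000,000.  Deriving dNumFinalGroups from
 * cheapest_partially_grouped_path->rows, instead of reusing the
 * raw-input dNumGroups, keeps the final phase's costing anchored to
 * the rows it will actually read.
 */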
@@ -7149,7 +7274,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 
                 path = make_ordered_path(root,
                                          grouped_rel,
                                          path,
-                                         partially_grouped_rel->cheapest_total_path,
+                                         cheapest_partially_grouped_path,
                                          info->pathkeys,
                                          -1.0);
 
@@ -7167,7 +7292,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
                                              info->clauses,
                                              havingQual,
                                              agg_final_costs,
-                                             dNumGroups));
+                                             dNumFinalGroups));
                 else
                     add_path(grouped_rel, (Path *)
                              create_group_path(root,
@@ -7175,7 +7300,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
                                                path,
                                                info->clauses,
                                                havingQual,
-                                               dNumGroups));
+                                               dNumFinalGroups));
             }
         }
 
@@ -7217,19 +7342,17 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
          */
         if (partially_grouped_rel && partially_grouped_rel->pathlist)
         {
-            Path       *path = partially_grouped_rel->cheapest_total_path;
-
             add_path(grouped_rel, (Path *)
                      create_agg_path(root,
                                      grouped_rel,
-                                     path,
+                                     cheapest_partially_grouped_path,
                                      grouped_rel->reltarget,
                                      AGG_HASHED,
                                      AGGSPLIT_FINAL_DESERIAL,
                                      root->processed_groupClause,
                                      havingQual,
                                      agg_final_costs,
-                                     dNumGroups));
+                                     dNumFinalGroups));
         }
     }
 
@@ -7269,6 +7392,7 @@ create_partial_grouping_paths(PlannerInfo *root,
 {
     Query      *parse = root->parse;
     RelOptInfo *partially_grouped_rel;
+    RelOptInfo *eager_agg_rel = NULL;
     AggClauseCosts *agg_partial_costs = &extra->agg_partial_costs;
     AggClauseCosts *agg_final_costs = &extra->agg_final_costs;
     Path       *cheapest_partial_path = NULL;
@@ -7279,6 +7403,15 @@ create_partial_grouping_paths(PlannerInfo *root,
     bool        can_hash = (extra->flags & GROUPING_CAN_USE_HASH) != 0;
     bool        can_sort = (extra->flags & GROUPING_CAN_USE_SORT) != 0;
 
+    /*
+     * Check whether any partially aggregated paths have been generated
+     * through eager aggregation.
+     */
+    if (input_rel->grouped_rel &&
+        !IS_DUMMY_REL(input_rel->grouped_rel) &&
+        input_rel->grouped_rel->pathlist != NIL)
+        eager_agg_rel = input_rel->grouped_rel;
+
     /*
      * Consider whether we should generate partially aggregated non-partial
      * paths.  We can only do this if we have a non-partial path, and only if
@@ -7300,11 +7433,13 @@ create_partial_grouping_paths(PlannerInfo *root,
 
     /*
      * If we can't partially aggregate partial paths, and we can't partially
-     * aggregate non-partial paths, then don't bother creating the new
+     * aggregate non-partial paths, and no partially aggregated paths were
+     * generated by eager aggregation, then don't bother creating the new
      * RelOptInfo at all, unless the caller specified force_rel_creation.
     */
     if (cheapest_total_path == NULL &&
         cheapest_partial_path == NULL &&
+        eager_agg_rel == NULL &&
         !force_rel_creation)
         return NULL;
 
@@ -7529,6 +7664,51 @@ create_partial_grouping_paths(PlannerInfo *root,
                                            dNumPartialPartialGroups));
     }
 
+    /*
+     * Add any partially aggregated paths generated by eager aggregation to
+     * the new upper relation after applying projection steps as needed.
+     */
+    if (eager_agg_rel)
+    {
+        /* Add the paths */
+        foreach(lc, eager_agg_rel->pathlist)
+        {
+            Path       *path = (Path *) lfirst(lc);
+
+            /* Shouldn't have any parameterized paths anymore */
+            Assert(path->param_info == NULL);
+
+            path = (Path *) create_projection_path(root,
+                                                   partially_grouped_rel,
+                                                   path,
+                                                   partially_grouped_rel->reltarget);
+
+            add_path(partially_grouped_rel, path);
+        }
+
+        /*
+         * Likewise add the partial paths, but only if parallelism is possible
+         * for partially_grouped_rel.
+         */
+        if (partially_grouped_rel->consider_parallel)
+        {
+            foreach(lc, eager_agg_rel->partial_pathlist)
+            {
+                Path       *path = (Path *) lfirst(lc);
+
+                /* Shouldn't have any parameterized paths anymore */
+                Assert(path->param_info == NULL);
+
+                path = (Path *) create_projection_path(root,
+                                                       partially_grouped_rel,
+                                                       path,
+                                                       partially_grouped_rel->reltarget);
+
+                add_partial_path(partially_grouped_rel, path);
+            }
+        }
+    }
+
     /*
     * If there is an FDW that's responsible for all baserels of the query,
     * let it consider adding partially grouped ForeignPaths.
@@ -7753,17 +7933,23 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
     check_stack_depth();
 
     /*
-     * If the rel is partitioned, we want to drop its existing paths and
-     * generate new ones.  This function would still be correct if we kept the
-     * existing paths: we'd modify them to generate the correct target above
-     * the partitioning Append, and then they'd compete on cost with paths
-     * generating the target below the Append.  However, in our current cost
-     * model the latter way is always the same or cheaper cost, so modifying
-     * the existing paths would just be useless work.  Moreover, when the cost
-     * is the same, varying roundoff errors might sometimes allow an existing
-     * path to be picked, resulting in undesirable cross-platform plan
-     * variations.  So we drop old paths and thereby force the work to be done
-     * below the Append, except in the case of a non-parallel-safe target.
+     * If the rel only has Append and MergeAppend paths, we want to drop its
+     * existing paths and generate new ones.  This function would still be
+     * correct if we kept the existing paths: we'd modify them to generate the
+     * correct target above the partitioning Append, and then they'd compete
+     * on cost with paths generating the target below the Append.  However, in
+     * our current cost model the latter way is always the same or cheaper
+     * cost, so modifying the existing paths would just be useless work.
+     * Moreover, when the cost is the same, varying roundoff errors might
+     * sometimes allow an existing path to be picked, resulting in undesirable
+     * cross-platform plan variations.  So we drop old paths and thereby force
+     * the work to be done below the Append.
+     *
+     * However, there are several cases when this optimization is not safe.
+     * If the rel isn't partitioned, then none of the paths will be Append or
+     * MergeAppend paths, so we should definitely not do this.  If it is
+     * partitioned but is a joinrel, it may have Append and MergeAppend paths,
+     * but it can also have join paths that we can't afford to discard.
     *
     * Some care is needed, because we have to allow
     * generate_useful_gather_paths to see the old partial paths in the next
@@ -7771,7 +7957,7 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
     * generate_useful_gather_paths to add path(s) to the main list, and
     * finally zap the partial pathlist.
     */
-    if (rel_is_partitioned)
+    if (rel_is_partitioned && IS_SIMPLE_REL(rel))
         rel->pathlist = NIL;
 
     /*
@@ -7797,7 +7983,7 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
     }
 
     /* Finish dropping old paths for a partitioned rel, per comment above */
-    if (rel_is_partitioned)
+    if (rel_is_partitioned && IS_SIMPLE_REL(rel))
         rel->partial_pathlist = NIL;
 
     /* Extract SRF-free scan/join target. */
@@ -8092,13 +8278,6 @@ create_partitionwise_grouping_paths(PlannerInfo *root,
 
         add_paths_to_append_rel(root, partially_grouped_rel,
                                 partially_grouped_live_children);
-
-        /*
-         * We need call set_cheapest, since the finalization step will use the
-         * cheapest path from the rel.
-         */
-        if (partially_grouped_rel->pathlist)
-            set_cheapest(partially_grouped_rel);
     }
 
     /* If possible, create append paths for fully grouped children. */
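For reference, IS_SIMPLE_REL() is the existing pathnodes.h test that distinguishes baserels (including partition children) from joinrels, which is what makes the new guard safe:

/* From src/include/nodes/pathnodes.h (upstream definition): */
#define IS_SIMPLE_REL(rel) \
    ((rel)->reloptkind == RELOPT_BASEREL || \
     (rel)->reloptkind == RELOPT_OTHER_MEMBER_REL)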
@@ -8248,3 +8427,628 @@ generate_setop_child_grouplist(SetOperationStmt *op, List *targetlist)
 
     return grouplist;
 }
+
+/*
+ * create_unique_paths
+ *    Build a new RelOptInfo containing Paths that represent elimination of
+ *    distinct rows from the input data.  Distinct-ness is defined according to
+ *    the needs of the semijoin represented by sjinfo.  If it is not possible
+ *    to identify how to make the data unique, NULL is returned.
+ *
+ * If used at all, this is likely to be called repeatedly on the same rel,
+ * so we cache the result.
+ */
+RelOptInfo *
+create_unique_paths(PlannerInfo *root, RelOptInfo *rel, SpecialJoinInfo *sjinfo)
+{
+    RelOptInfo *unique_rel;
+    List       *sortPathkeys = NIL;
+    List       *groupClause = NIL;
+    MemoryContext oldcontext;
+
+    /* Caller made a mistake if SpecialJoinInfo is the wrong one */
+    Assert(sjinfo->jointype == JOIN_SEMI);
+    Assert(bms_equal(rel->relids, sjinfo->syn_righthand));
+
+    /* If result already cached, return it */
+    if (rel->unique_rel)
+        return rel->unique_rel;
+
+    /* If it's not possible to unique-ify, return NULL */
+    if (!(sjinfo->semi_can_btree || sjinfo->semi_can_hash))
+        return NULL;
+
+    /*
+     * Punt if this is a child relation and we failed to build a unique-ified
+     * relation for its parent.  This can happen if all the RHS columns were
+     * found to be equated to constants when unique-ifying the parent table,
+     * leaving no columns to unique-ify.
+     */
+    if (IS_OTHER_REL(rel) && rel->top_parent->unique_rel == NULL)
+        return NULL;
+
+    /*
+     * When called during GEQO join planning, we are in a short-lived memory
+     * context.  We must make sure that the unique rel and any subsidiary data
+     * structures created for a baserel survive the GEQO cycle, else the
+     * baserel is trashed for future GEQO cycles.  On the other hand, when we
+     * are creating those for a joinrel during GEQO, we don't want them to
+     * clutter the main planning context.  Upshot is that the best solution is
+     * to explicitly allocate memory in the same context the given RelOptInfo
+     * is in.
+     */
+    oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(rel));
+
+    unique_rel = makeNode(RelOptInfo);
+    memcpy(unique_rel, rel, sizeof(RelOptInfo));
+
+    /*
+     * clear path info
+     */
+    unique_rel->pathlist = NIL;
+    unique_rel->ppilist = NIL;
+    unique_rel->partial_pathlist = NIL;
+    unique_rel->cheapest_startup_path = NULL;
+    unique_rel->cheapest_total_path = NULL;
+    unique_rel->cheapest_parameterized_paths = NIL;
+
+    /*
+     * Build the target list for the unique rel.  We also build the pathkeys
+     * that represent the ordering requirements for the sort-based
+     * implementation, and the list of SortGroupClause nodes that represent
+     * the columns to be grouped on for the hash-based implementation.
+     *
+     * For a child rel, we can construct these fields from those of its
+     * parent.
+     */
+    if (IS_OTHER_REL(rel))
+    {
+        PathTarget *child_unique_target;
+        PathTarget *parent_unique_target;
+
+        parent_unique_target = rel->top_parent->unique_rel->reltarget;
+
+        child_unique_target = copy_pathtarget(parent_unique_target);
+
+        /* Translate the target expressions */
+        child_unique_target->exprs = (List *)
+            adjust_appendrel_attrs_multilevel(root,
+                                              (Node *) parent_unique_target->exprs,
+                                              rel,
+                                              rel->top_parent);
+
+        unique_rel->reltarget = child_unique_target;
+
+        sortPathkeys = rel->top_parent->unique_pathkeys;
+        groupClause = rel->top_parent->unique_groupclause;
+    }
+    else
+    {
+        List       *newtlist;
+        int         nextresno;
+        List       *sortList = NIL;
+        ListCell   *lc1;
+        ListCell   *lc2;
+
+        /*
+         * The values we are supposed to unique-ify may be expressions in the
+         * variables of the input rel's targetlist.  We have to add any such
+         * expressions to the unique rel's targetlist.
+         *
+         * To complicate matters, some of the values to be unique-ified may be
+         * known redundant by the EquivalenceClass machinery (e.g., because
+         * they have been equated to constants).  There is no need to compare
+         * such values during unique-ification, and indeed we had better not
+         * try because the Vars involved may not have propagated as high as
+         * the semijoin's level.  We use make_pathkeys_for_sortclauses to
+         * detect such cases, which is a tad inefficient but it doesn't seem
+         * worth building specialized infrastructure for this.
+         */
+        newtlist = make_tlist_from_pathtarget(rel->reltarget);
+        nextresno = list_length(newtlist) + 1;
+
+        forboth(lc1, sjinfo->semi_rhs_exprs, lc2, sjinfo->semi_operators)
+        {
+            Expr       *uniqexpr = lfirst(lc1);
+            Oid         in_oper = lfirst_oid(lc2);
+            Oid         sortop;
+            TargetEntry *tle;
+            bool        made_tle = false;
+
+            tle = tlist_member(uniqexpr, newtlist);
+            if (!tle)
+            {
+                tle = makeTargetEntry(uniqexpr,
+                                      nextresno,
+                                      NULL,
+                                      false);
+                newtlist = lappend(newtlist, tle);
+                nextresno++;
+                made_tle = true;
+            }
+
+            /*
+             * Try to build an ORDER BY list to sort the input compatibly.  We
+             * do this for each sortable clause even when the clauses are not
+             * all sortable, so that we can detect clauses that are redundant
+             * according to the pathkey machinery.
+             */
+            sortop = get_ordering_op_for_equality_op(in_oper, false);
+            if (OidIsValid(sortop))
+            {
+                Oid         eqop;
+                SortGroupClause *sortcl;
+
+                /*
+                 * The Unique node will need equality operators.  Normally
+                 * these are the same as the IN clause operators, but if those
+                 * are cross-type operators then the equality operators are
+                 * the ones for the IN clause operators' RHS datatype.
+                 */
+                eqop = get_equality_op_for_ordering_op(sortop, NULL);
+                if (!OidIsValid(eqop))  /* shouldn't happen */
+                    elog(ERROR, "could not find equality operator for ordering operator %u",
+                         sortop);
+
+                sortcl = makeNode(SortGroupClause);
+                sortcl->tleSortGroupRef = assignSortGroupRef(tle, newtlist);
+                sortcl->eqop = eqop;
+                sortcl->sortop = sortop;
+                sortcl->reverse_sort = false;
+                sortcl->nulls_first = false;
+                sortcl->hashable = false;   /* no need to make this accurate */
+                sortList = lappend(sortList, sortcl);
+
+                /*
+                 * At each step, convert the SortGroupClause list to pathkey
+                 * form.  If the just-added SortGroupClause is redundant, the
+                 * result will be shorter than the SortGroupClause list.
+                 */
+                sortPathkeys = make_pathkeys_for_sortclauses(root, sortList,
+                                                             newtlist);
+                if (list_length(sortPathkeys) != list_length(sortList))
+                {
+                    /* Drop the redundant SortGroupClause */
+                    sortList = list_delete_last(sortList);
+                    Assert(list_length(sortPathkeys) == list_length(sortList));
+                    /* Undo tlist addition, if we made one */
+                    if (made_tle)
+                    {
+                        newtlist = list_delete_last(newtlist);
+                        nextresno--;
+                    }
+                    /* We need not consider this clause for hashing, either */
+                    continue;
+                }
+            }
+            else if (sjinfo->semi_can_btree)    /* shouldn't happen */
+                elog(ERROR, "could not find ordering operator for equality operator %u",
+                     in_oper);
+
+            if (sjinfo->semi_can_hash)
+            {
+                /* Create a GROUP BY list for the Agg node to use */
+                Oid         eq_oper;
+                SortGroupClause *groupcl;
+
+                /*
+                 * Get the hashable equality operators for the Agg node to
+                 * use.  Normally these are the same as the IN clause
+                 * operators, but if those are cross-type operators then the
+                 * equality operators are the ones for the IN clause
+                 * operators' RHS datatype.
+                 */
+                if (!get_compatible_hash_operators(in_oper, NULL, &eq_oper))
+                    elog(ERROR, "could not find compatible hash operator for operator %u",
+                         in_oper);
+
+                groupcl = makeNode(SortGroupClause);
+                groupcl->tleSortGroupRef = assignSortGroupRef(tle, newtlist);
+                groupcl->eqop = eq_oper;
+                groupcl->sortop = sortop;
+                groupcl->reverse_sort = false;
+                groupcl->nulls_first = false;
+                groupcl->hashable = true;
+                groupClause = lappend(groupClause, groupcl);
+            }
+        }
+
+        /*
+         * Done building the sortPathkeys and groupClause.  But the
+         * sortPathkeys are bogus if not all the clauses were sortable.
+         */
+        if (!sjinfo->semi_can_btree)
+            sortPathkeys = NIL;
+
+        /*
+         * It can happen that all the RHS columns are equated to constants.
+         * We'd have to do something special to unique-ify in that case, and
+         * it's such an unlikely-in-the-real-world case that it's not worth
+         * the effort.  So just punt if we found no columns to unique-ify.
+         */
+        if (sortPathkeys == NIL && groupClause == NIL)
+        {
+            MemoryContextSwitchTo(oldcontext);
+            return NULL;
+        }
+
+        /* Convert the required targetlist back to PathTarget form */
+        unique_rel->reltarget = create_pathtarget(root, newtlist);
+    }
+
+    /* build unique paths based on input rel's pathlist */
+    create_final_unique_paths(root, rel, sortPathkeys, groupClause,
+                              sjinfo, unique_rel);
+
+    /* build unique paths based on input rel's partial_pathlist */
+    create_partial_unique_paths(root, rel, sortPathkeys, groupClause,
+                                sjinfo, unique_rel);
+
+    /* Now choose the best path(s) */
+    set_cheapest(unique_rel);
+
+    /*
+     * There shouldn't be any partial paths for the unique relation;
+     * otherwise, we won't be able to properly guarantee uniqueness.
+     */
+    Assert(unique_rel->partial_pathlist == NIL);
+
+    /* Cache the result */
+    rel->unique_rel = unique_rel;
+    rel->unique_pathkeys = sortPathkeys;
+    rel->unique_groupclause = groupClause;
+
+    MemoryContextSwitchTo(oldcontext);
+
+    return unique_rel;
+}
+
+/*
+ * create_final_unique_paths
+ *    Create unique paths in 'unique_rel' based on 'input_rel' pathlist
+ */
+static void
+create_final_unique_paths(PlannerInfo *root, RelOptInfo *input_rel,
+                          List *sortPathkeys, List *groupClause,
+                          SpecialJoinInfo *sjinfo, RelOptInfo *unique_rel)
+{
+    Path       *cheapest_input_path = input_rel->cheapest_total_path;
+
+    /* Estimate number of output rows */
+    unique_rel->rows = estimate_num_groups(root,
+                                           sjinfo->semi_rhs_exprs,
+                                           cheapest_input_path->rows,
+                                           NULL,
+                                           NULL);
+
+    /* Consider sort-based implementations, if possible. */
+    if (sjinfo->semi_can_btree)
+    {
+        ListCell   *lc;
+
+        /*
+         * Use any available suitably-sorted path as input, and also consider
+         * sorting the cheapest-total path and incremental sort on any paths
+         * with presorted keys.
+         *
+         * To save planning time, we ignore parameterized input paths unless
+         * they are the cheapest-total path.
+         */
+        foreach(lc, input_rel->pathlist)
+        {
+            Path       *input_path = (Path *) lfirst(lc);
+            Path       *path;
+            bool        is_sorted;
+            int         presorted_keys;
+
+            /*
+             * Ignore parameterized paths that are not the cheapest-total
+             * path.
+             */
+            if (input_path->param_info &&
+                input_path != cheapest_input_path)
+                continue;
+
+            is_sorted = pathkeys_count_contained_in(sortPathkeys,
+                                                    input_path->pathkeys,
+                                                    &presorted_keys);
+
+            /*
+             * Ignore paths that are not suitably or partially sorted, unless
+             * they are the cheapest total path (no need to deal with paths
+             * which have presorted keys when incremental sort is disabled).
+             */
+            if (!is_sorted && input_path != cheapest_input_path &&
+                (presorted_keys == 0 || !enable_incremental_sort))
+                continue;
+
+            /*
+             * Make a separate ProjectionPath in case we need a Result node.
+             */
+            path = (Path *) create_projection_path(root,
+                                                   unique_rel,
+                                                   input_path,
+                                                   unique_rel->reltarget);
+
+            if (!is_sorted)
+            {
+                /*
+                 * We've no need to consider both a sort and incremental sort.
+                 * We'll just do a sort if there are no presorted keys and an
+                 * incremental sort when there are presorted keys.
+                 */
+                if (presorted_keys == 0 || !enable_incremental_sort)
+                    path = (Path *) create_sort_path(root,
+                                                     unique_rel,
+                                                     path,
+                                                     sortPathkeys,
+                                                     -1.0);
+                else
+                    path = (Path *) create_incremental_sort_path(root,
+                                                                 unique_rel,
+                                                                 path,
+                                                                 sortPathkeys,
+                                                                 presorted_keys,
+                                                                 -1.0);
+            }
+
+            path = (Path *) create_unique_path(root, unique_rel, path,
+                                               list_length(sortPathkeys),
+                                               unique_rel->rows);
+
+            add_path(unique_rel, path);
+        }
+    }
+
+    /* Consider hash-based implementation, if possible. */
+    if (sjinfo->semi_can_hash)
+    {
+        Path       *path;
+
+        /*
+         * Make a separate ProjectionPath in case we need a Result node.
+         */
+        path = (Path *) create_projection_path(root,
+                                               unique_rel,
+                                               cheapest_input_path,
+                                               unique_rel->reltarget);
+
+        path = (Path *) create_agg_path(root,
+                                        unique_rel,
+                                        path,
+                                        cheapest_input_path->pathtarget,
+                                        AGG_HASHED,
+                                        AGGSPLIT_SIMPLE,
+                                        groupClause,
+                                        NIL,
+                                        NULL,
+                                        unique_rel->rows);
+
+        add_path(unique_rel, path);
+    }
+}
+
+/*
+ * create_partial_unique_paths
+ *    Create unique paths in 'unique_rel' based on 'input_rel' partial_pathlist
+ */
+static void
+create_partial_unique_paths(PlannerInfo *root, RelOptInfo *input_rel,
+                            List *sortPathkeys, List *groupClause,
+                            SpecialJoinInfo *sjinfo, RelOptInfo *unique_rel)
+{
+    RelOptInfo *partial_unique_rel;
+    Path       *cheapest_partial_path;
+
+    /* nothing to do when there are no partial paths in the input rel */
+    if (!input_rel->consider_parallel || input_rel->partial_pathlist == NIL)
+        return;
+
+    /*
+     * nothing to do if there's anything in the targetlist that's
+     * parallel-restricted.
+     */
+    if (!is_parallel_safe(root, (Node *) unique_rel->reltarget->exprs,
+                          &unique_rel->needs_temp_safety))
+        return;
+
+    cheapest_partial_path = linitial(input_rel->partial_pathlist);
+
+    partial_unique_rel = makeNode(RelOptInfo);
+    memcpy(partial_unique_rel, input_rel, sizeof(RelOptInfo));
+
+    /*
+     * clear path info
+     */
+    partial_unique_rel->pathlist = NIL;
+    partial_unique_rel->ppilist = NIL;
+    partial_unique_rel->partial_pathlist = NIL;
+    partial_unique_rel->cheapest_startup_path = NULL;
+    partial_unique_rel->cheapest_total_path = NULL;
+    partial_unique_rel->cheapest_parameterized_paths = NIL;
+
+    /* Estimate number of output rows */
+    partial_unique_rel->rows = estimate_num_groups(root,
+                                                   sjinfo->semi_rhs_exprs,
+                                                   cheapest_partial_path->rows,
+                                                   NULL,
+                                                   NULL);
+    partial_unique_rel->reltarget = unique_rel->reltarget;
+
+    /* Consider sort-based implementations, if possible. */
+    if (sjinfo->semi_can_btree)
+    {
+        ListCell   *lc;
+
+        /*
+         * Use any available suitably-sorted path as input, and also consider
+         * sorting the cheapest partial path and incremental sort on any paths
+         * with presorted keys.
+         */
+        foreach(lc, input_rel->partial_pathlist)
+        {
+            Path       *input_path = (Path *) lfirst(lc);
+            Path       *path;
+            bool        is_sorted;
+            int         presorted_keys;
+
+            is_sorted = pathkeys_count_contained_in(sortPathkeys,
+                                                    input_path->pathkeys,
+                                                    &presorted_keys);
+
+            /*
+             * Ignore paths that are not suitably or partially sorted, unless
+             * they are the cheapest partial path (no need to deal with paths
+             * which have presorted keys when incremental sort is disabled).
+             */
+            if (!is_sorted && input_path != cheapest_partial_path &&
+                (presorted_keys == 0 || !enable_incremental_sort))
+                continue;
+
+            /*
+             * Make a separate ProjectionPath in case we need a Result node.
+             */
+            path = (Path *) create_projection_path(root,
+                                                   partial_unique_rel,
+                                                   input_path,
+                                                   partial_unique_rel->reltarget);
+
+            if (!is_sorted)
+            {
+                /*
+                 * We've no need to consider both a sort and incremental sort.
+                 * We'll just do a sort if there are no presorted keys and an
+                 * incremental sort when there are presorted keys.
+                 */
+                if (presorted_keys == 0 || !enable_incremental_sort)
+                    path = (Path *) create_sort_path(root,
+                                                     partial_unique_rel,
+                                                     path,
+                                                     sortPathkeys,
+                                                     -1.0);
+                else
+                    path = (Path *) create_incremental_sort_path(root,
+                                                                 partial_unique_rel,
+                                                                 path,
+                                                                 sortPathkeys,
+                                                                 presorted_keys,
+                                                                 -1.0);
+            }
+
+            path = (Path *) create_unique_path(root, partial_unique_rel, path,
+                                               list_length(sortPathkeys),
+                                               partial_unique_rel->rows);
+
+            add_partial_path(partial_unique_rel, path);
+        }
+    }
+
+    /* Consider hash-based implementation, if possible. */
+    if (sjinfo->semi_can_hash)
+    {
+        Path       *path;
+
+        /*
+         * Make a separate ProjectionPath in case we need a Result node.
+         */
+        path = (Path *) create_projection_path(root,
+                                               partial_unique_rel,
+                                               cheapest_partial_path,
+                                               partial_unique_rel->reltarget);
+
+        path = (Path *) create_agg_path(root,
+                                        partial_unique_rel,
+                                        path,
+                                        cheapest_partial_path->pathtarget,
+                                        AGG_HASHED,
+                                        AGGSPLIT_SIMPLE,
+                                        groupClause,
+                                        NIL,
+                                        NULL,
+                                        partial_unique_rel->rows);
+
+        add_partial_path(partial_unique_rel, path);
+    }
+
+    if (partial_unique_rel->partial_pathlist != NIL)
+    {
+        generate_useful_gather_paths(root, partial_unique_rel, true);
+        set_cheapest(partial_unique_rel);
+
+        /*
+         * Finally, create paths to unique-ify the final result.  This step is
+         * needed to remove any duplicates due to combining rows from parallel
+         * workers.
+         */
+        create_final_unique_paths(root, partial_unique_rel,
+                                  sortPathkeys, groupClause,
+                                  sjinfo, unique_rel);
+    }
+}
+
+/*
+ * Choose a unique name for some subroot.
+ *
+ * Modifies glob->subplanNames to track names already used.
+ */
+char *
+choose_plan_name(PlannerGlobal *glob, const char *name, bool always_number)
+{
+    unsigned    n;
+
+    /*
+     * If a numeric suffix is not required, then search the list of
+     * previously-assigned names for a match.  If none is found, then we can
+     * use the provided name without modification.
+     */
+    if (!always_number)
+    {
+        bool        found = false;
+
+        foreach_ptr(char, subplan_name, glob->subplanNames)
+        {
+            if (strcmp(subplan_name, name) == 0)
+            {
+                found = true;
+                break;
+            }
+        }
+
+        if (!found)
+        {
+            /* pstrdup here is just to avoid cast-away-const */
+            char       *chosen_name = pstrdup(name);
+
+            glob->subplanNames = lappend(glob->subplanNames, chosen_name);
+            return chosen_name;
+        }
+    }
+
+    /*
+     * If a numeric suffix is required or if the un-suffixed name is already
+     * in use, then loop until we find a positive integer that produces a
+     * novel name.
+     */
+    for (n = 1; true; ++n)
+    {
+        char       *proposed_name = psprintf("%s_%u", name, n);
+        bool        found = false;
+
+        foreach_ptr(char, subplan_name, glob->subplanNames)
+        {
+            if (strcmp(subplan_name, proposed_name) == 0)
+            {
+                found = true;
+                break;
+            }
+        }
+
+        if (!found)
+        {
+            glob->subplanNames = lappend(glob->subplanNames, proposed_name);
+            return proposed_name;
+        }
+
+        pfree(proposed_name);
+    }
+}
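choose_plan_name()'s behavior can be read straight off the code above; a usage sketch with invented names:

/* Repeated requests yield distinct names:
 *
 *   choose_plan_name(glob, "cte_x", false);   -> "cte_x"
 *   choose_plan_name(glob, "cte_x", false);   -> "cte_x_1"
 *   choose_plan_name(glob, "minmax", true);   -> "minmax_1"
 *   choose_plan_name(glob, "minmax", true);   -> "minmax_2"
 */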
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c
index 846e44186c366..72f5efd747600 100644
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -307,8 +312,12 @@ set_plan_references(PlannerInfo *root, Plan *plan)
         PlanRowMark *rc = lfirst_node(PlanRowMark, lc);
         PlanRowMark *newrc;
 
+        /* sanity check on existing row marks */
+        Assert(root->simple_rel_array[rc->rti] != NULL &&
+               root->simple_rte_array[rc->rti] != NULL);
+
         /* flat copy is enough since all fields are scalars */
-        newrc = (PlanRowMark *) palloc(sizeof(PlanRowMark));
+        newrc = palloc_object(PlanRowMark);
         memcpy(newrc, rc, sizeof(PlanRowMark));
 
         /* adjust indexes ... but *not* the rowmarkId */
@@ -541,7 +545,7 @@ add_rte_to_flat_rtable(PlannerGlobal *glob, List *rteperminfos,
     RangeTblEntry *newrte;
 
     /* flat copy to duplicate all the scalar fields */
-    newrte = (RangeTblEntry *) palloc(sizeof(RangeTblEntry));
+    newrte = palloc_object(RangeTblEntry);
     memcpy(newrte, rte, sizeof(RangeTblEntry));
 
     /* zap unneeded sub-structure */
@@ -1030,16 +1034,35 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
                  * expected to occur here, it seems safer to special-case
                  * it here and keep the assertions that ROWID_VARs
                  * shouldn't be seen by fix_scan_expr.
+                 *
+                 * We also must handle the case where set operations have
+                 * been short-circuited resulting in a dummy Result node.
+                 * prepunion.c uses varno==0 for the set op targetlist.
+                 * See generate_setop_tlist() and generate_append_tlist().
+                 * Here we rewrite these to use varno==1, which is the
+                 * varno of the first set-op child.  Without this, EXPLAIN
+                 * will have trouble displaying targetlists of dummy set
+                 * operations.
                 */
                foreach(l, splan->plan.targetlist)
                {
                    TargetEntry *tle = (TargetEntry *) lfirst(l);
                    Var        *var = (Var *) tle->expr;
 
-                    if (var && IsA(var, Var) && var->varno == ROWID_VAR)
-                        tle->expr = (Expr *) makeNullConst(var->vartype,
-                                                           var->vartypmod,
-                                                           var->varcollid);
+                    if (var && IsA(var, Var))
+                    {
+                        if (var->varno == ROWID_VAR)
+                            tle->expr = (Expr *) makeNullConst(var->vartype,
+                                                               var->vartypmod,
+                                                               var->varcollid);
+                        else if (var->varno == 0)
+                            tle->expr = (Expr *) makeVar(1,
+                                                         var->varattno,
+                                                         var->vartype,
+                                                         var->vartypmod,
+                                                         var->varcollid,
+                                                         var->varlevelsup);
+                    }
                }
 
                splan->plan.targetlist =
@@ -1052,6 +1075,8 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
                /* resconstantqual can't contain any subplan variable refs */
                splan->resconstantqual =
                    fix_scan_expr(root, splan->resconstantqual, rtoffset, 1);
+                /* adjust the relids set */
+                splan->relids = offset_relid_set(splan->relids, rtoffset);
            }
            break;
        case T_ProjectSet:
@@ -1557,7 +1582,7 @@ clean_up_removed_plan_level(Plan *parent, Plan *child)
    child->startup_cost += initplan_cost;
    child->total_cost += initplan_cost;
    if (unsafe_initplans)
-        child->parallel_safe = false;
+        child->parallel_safe = PARALLEL_UNSAFE;
 
    /*
     * Attach plans this way so that parent's initplans are processed
@@ -2003,7 +2028,7 @@ offset_relid_set(Relids relids, int rtoffset)
 static inline Var *
 copyVar(Var *var)
 {
-    Var        *newvar = (Var *) palloc(sizeof(Var));
+    Var        *newvar = palloc_object(Var);
 
    *newvar = *var;
    return newvar;
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index e7cb3fede6658..cd1061e339d43 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -20,6 +20,7 @@
 #include "catalog/pg_operator.h"
 #include "catalog/pg_type.h"
 #include "executor/executor.h"
+#include "executor/nodeSubplan.h"
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
@@ -79,8 +80,8 @@ static Node *convert_testexpr(PlannerInfo *root,
                               List *subst_nodes);
 static Node *convert_testexpr_mutator(Node *node,
                                       convert_testexpr_context *context);
-static bool subplan_is_hashable(Plan *plan);
-static bool subpath_is_hashable(Path *path);
+static bool subplan_is_hashable(Plan *plan, bool unknownEqFalse);
+static bool subpath_is_hashable(Path *path, bool unknownEqFalse);
 static bool testexpr_is_hashable(Node *testexpr, List *param_ids);
 static bool test_opexpr_is_hashable(OpExpr *testexpr, List *param_ids);
 static bool hash_ok_operator(OpExpr *expr);
@@ -103,6 +104,7 @@ static Bitmapset *finalize_plan(PlannerInfo *root,
                                 Bitmapset *scan_params);
 static bool finalize_primnode(Node *node, finalize_primnode_context *context);
 static bool finalize_agg_primnode(Node *node, finalize_primnode_context *context);
+static const char *sublinktype_to_string(SubLinkType subLinkType);
 
 
 /*
@@ -172,6 +174,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
     Plan       *plan;
     List       *plan_params;
     Node       *result;
+    const char *sublinkstr = sublinktype_to_string(subLinkType);
 
    /*
     * Copy the source Query node.  This is a quick and dirty kluge to resolve
@@ -218,8 +221,9 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
     Assert(root->plan_params == NIL);
 
     /* Generate Paths for the subquery */
-    subroot = subquery_planner(root->glob, subquery, root, false,
-                               tuple_fraction, NULL);
+    subroot = subquery_planner(root->glob, subquery,
+                               choose_plan_name(root->glob, sublinkstr, true),
+                               root, false, tuple_fraction, NULL);
 
     /* Isolate the params needed by this specific subplan */
     plan_params = root->plan_params;
@@ -264,9 +268,12 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
                                             &newtestexpr, &paramIds);
         if (subquery)
         {
+            char       *plan_name;
+
             /* Generate Paths for the ANY subquery; we'll need all rows */
-            subroot = subquery_planner(root->glob, subquery, root, false, 0.0,
-                                       NULL);
+            plan_name = choose_plan_name(root->glob, sublinkstr, true);
+            subroot = subquery_planner(root->glob, subquery, plan_name,
+                                       root, false, 0.0, NULL);
 
             /* Isolate the params needed by this specific subplan */
             plan_params = root->plan_params;
@@ -277,7 +284,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
             best_path = final_rel->cheapest_total_path;
 
             /* Now we can check if it'll fit in hash_mem */
-            if (subpath_is_hashable(best_path))
+            if (subpath_is_hashable(best_path, true))
             {
                 SubPlan    *hashplan;
                 AlternativeSubPlan *asplan;
@@ -324,15 +331,16 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path,
 {
     Node       *result;
     SubPlan    *splan;
-    bool        isInitPlan;
     ListCell   *lc;
 
     /*
-     * Initialize the SubPlan node.  Note plan_id, plan_name, and cost fields
-     * are set further down.
+     * Initialize the SubPlan node.
+     *
+     * Note: plan_id and cost fields are set further down.
      */
     splan = makeNode(SubPlan);
     splan->subLinkType = subLinkType;
+    splan->plan_name = subroot->plan_name;
     splan->testexpr = NULL;
     splan->paramIds = NIL;
     get_first_col_type(plan, &splan->firstColType, &splan->firstColTypmod,
@@ -391,7 +399,7 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path,
         Assert(testexpr == NULL);
         prm = generate_new_exec_param(root, BOOLOID, -1, InvalidOid);
         splan->setParam = list_make1_int(prm->paramid);
-        isInitPlan = true;
+        splan->isInitPlan = true;
         result = (Node *) prm;
     }
     else if (splan->parParam == NIL && subLinkType == EXPR_SUBLINK)
@@ -406,7 +414,7 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path,
                                    exprTypmod((Node *) te->expr),
                                    exprCollation((Node *) te->expr));
         splan->setParam = list_make1_int(prm->paramid);
-        isInitPlan = true;
+        splan->isInitPlan = true;
         result = (Node *) prm;
     }
     else if (splan->parParam == NIL && subLinkType == ARRAY_SUBLINK)
@@ -426,7 +434,7 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path,
                                    exprTypmod((Node *) te->expr),
                                    exprCollation((Node *) te->expr));
         splan->setParam = list_make1_int(prm->paramid);
-        isInitPlan = true;
+        splan->isInitPlan = true;
         result = (Node *) prm;
     }
     else if (splan->parParam == NIL && subLinkType == ROWCOMPARE_SUBLINK)
@@ -442,7 +450,7 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path,
                                     testexpr,
                                     params);
         splan->setParam = list_copy(splan->paramIds);
-        isInitPlan = true;
+        splan->isInitPlan = true;
 
         /*
         * The executable expression is returned to become part of the outer
@@ -476,12 +484,12 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path,
        /* It can be an initplan if there are no parParams. */
*/ if (splan->parParam == NIL) { - isInitPlan = true; + splan->isInitPlan = true; result = (Node *) makeNullConst(RECORDOID, -1, InvalidOid); } else { - isInitPlan = false; + splan->isInitPlan = false; result = (Node *) splan; } } @@ -517,7 +525,7 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path, */ if (subLinkType == ANY_SUBLINK && splan->parParam == NIL && - subplan_is_hashable(plan) && + subplan_is_hashable(plan, unknownEqFalse) && testexpr_is_hashable(splan->testexpr, splan->paramIds)) splan->useHashTable = true; @@ -536,7 +544,7 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path, plan = materialize_finished_plan(plan); result = (Node *) splan; - isInitPlan = false; + splan->isInitPlan = false; } /* @@ -547,7 +555,7 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path, root->glob->subroots = lappend(root->glob->subroots, subroot); splan->plan_id = list_length(root->glob->subplans); - if (isInitPlan) + if (splan->isInitPlan) root->init_plans = lappend(root->init_plans, splan); /* @@ -557,15 +565,10 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path, * there's no point since it won't get re-run without parameter changes * anyway. The input of a hashed subplan doesn't need REWIND either. */ - if (splan->parParam == NIL && !isInitPlan && !splan->useHashTable) + if (splan->parParam == NIL && !splan->isInitPlan && !splan->useHashTable) root->glob->rewindPlanIDs = bms_add_member(root->glob->rewindPlanIDs, splan->plan_id); - /* Label the subplan for EXPLAIN purposes */ - splan->plan_name = psprintf("%s %d", - isInitPlan ? "InitPlan" : "SubPlan", - splan->plan_id); - /* Lastly, fill in the cost estimates for use later */ cost_subplan(root, splan, plan); @@ -709,19 +712,19 @@ convert_testexpr_mutator(Node *node, * is suitable for hashing. We only look at the subquery itself. */ static bool -subplan_is_hashable(Plan *plan) +subplan_is_hashable(Plan *plan, bool unknownEqFalse) { - double subquery_size; + Size hashtablesize; /* - * The estimated size of the subquery result must fit in hash_mem. (Note: - * we use heap tuple overhead here even though the tuples will actually be - * stored as MinimalTuples; this provides some fudge factor for hashtable - * overhead.) + * The estimated size of the hashtable holding the subquery result must + * fit in hash_mem. (Note: reject on equality, to ensure that an estimate + * of SIZE_MAX disables hashing regardless of the hash_mem limit.) */ - subquery_size = plan->plan_rows * - (MAXALIGN(plan->plan_width) + MAXALIGN(SizeofHeapTupleHeader)); - if (subquery_size > get_hash_memory_limit()) + hashtablesize = EstimateSubplanHashTableSpace(plan->plan_rows, + plan->plan_width, + unknownEqFalse); + if (hashtablesize >= get_hash_memory_limit()) return false; return true; @@ -733,19 +736,19 @@ subplan_is_hashable(Plan *plan) * Identical to subplan_is_hashable, but work from a Path for the subplan. */ static bool -subpath_is_hashable(Path *path) +subpath_is_hashable(Path *path, bool unknownEqFalse) { - double subquery_size; + Size hashtablesize; /* - * The estimated size of the subquery result must fit in hash_mem. (Note: - * we use heap tuple overhead here even though the tuples will actually be - * stored as MinimalTuples; this provides some fudge factor for hashtable - * overhead.) + * The estimated size of the hashtable holding the subquery result must + * fit in hash_mem. (Note: reject on equality, to ensure that an estimate + * of SIZE_MAX disables hashing regardless of the hash_mem limit.) 
*/ - subquery_size = path->rows * - (MAXALIGN(path->pathtarget->width) + MAXALIGN(SizeofHeapTupleHeader)); - if (subquery_size > get_hash_memory_limit()) + hashtablesize = EstimateSubplanHashTableSpace(path->rows, + path->pathtarget->width, + unknownEqFalse); + if (hashtablesize >= get_hash_memory_limit()) return false; return true; @@ -965,8 +968,9 @@ SS_process_ctes(PlannerInfo *root) * Generate Paths for the CTE query. Always plan for full retrieval * --- we don't have enough info to predict otherwise. */ - subroot = subquery_planner(root->glob, subquery, root, - cte->cterecursive, 0.0, NULL); + subroot = subquery_planner(root->glob, subquery, + choose_plan_name(root->glob, cte->ctename, false), + root, cte->cterecursive, 0.0, NULL); /* * Since the current query level doesn't yet contain any RTEs, it @@ -989,10 +993,11 @@ SS_process_ctes(PlannerInfo *root) * Make a SubPlan node for it. This is just enough unlike * build_subplan that we can't share code. * - * Note plan_id, plan_name, and cost fields are set further down. + * Note: plan_id and cost fields are set further down. */ splan = makeNode(SubPlan); splan->subLinkType = CTE_SUBLINK; + splan->plan_name = subroot->plan_name; splan->testexpr = NULL; splan->paramIds = NIL; get_first_col_type(plan, &splan->firstColType, &splan->firstColTypmod, @@ -1004,7 +1009,7 @@ SS_process_ctes(PlannerInfo *root) * CTE scans are not considered for parallelism (cf * set_rel_consider_parallel). */ - splan->parallel_safe = false; + splan->parallel_safe = PARALLEL_UNSAFE; splan->setParam = NIL; splan->parParam = NIL; splan->args = NIL; @@ -1039,9 +1044,6 @@ SS_process_ctes(PlannerInfo *root) root->cte_plan_ids = lappend_int(root->cte_plan_ids, splan->plan_id); - /* Label the subplan for EXPLAIN purposes */ - splan->plan_name = psprintf("CTE %s", cte->ctename); - /* Lastly, fill in the cost estimates for use later */ cost_subplan(root, splan, plan); } @@ -1397,7 +1399,7 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, */ nsitem = addRangeTableEntryForSubquery(pstate, subselect, - makeAlias("ANY_subquery", NIL), + NULL, use_lateral, false); rte = nsitem->p_rte; @@ -1454,6 +1456,7 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, Query *parse = root->parse; Query *subselect = (Query *) sublink->subselect; Node *whereClause; + PlannerInfo subroot; int rtoffset; int varno; Relids clause_varnos; @@ -1515,6 +1518,35 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, if (contain_volatile_functions(whereClause)) return NULL; + /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. + * + * Note: we construct up an entirely dummy PlannerInfo for use here. This + * is fine because only the "glob" and "parse" links will be used in this + * case. + * + * Note: we temporarily assign back the WHERE clause so that any virtual + * generated column references within it can be expanded. It should be + * separated out again afterward. 
+ */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + subselect->jointree->quals = whereClause; + subselect = preprocess_relation_rtes(&subroot); + + /* + * Now separate out the WHERE clause again. + */ + whereClause = subselect->jointree->quals; + subselect->jointree->quals = NULL; + /* * The subquery must have a nonempty jointree, but we can make it so. */ @@ -1732,6 +1764,7 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, Node **testexpr, List **paramIds) { Node *whereClause; + PlannerInfo subroot; List *leftargs, *rightargs, *opids, @@ -1791,12 +1824,15 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, * parent aliases were flattened already, and we're not going to pull any * child Vars (of any description) into the parent. * - * Note: passing the parent's root to eval_const_expressions is - * technically wrong, but we can get away with it since only the - * boundParams (if any) are used, and those would be the same in a - * subroot. - */ - whereClause = eval_const_expressions(root, whereClause); + * Note: we construct up an entirely dummy PlannerInfo to pass to + * eval_const_expressions. This is fine because only the "glob" and + * "parse" links are used by eval_const_expressions. + */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + whereClause = eval_const_expressions(&subroot, whereClause); whereClause = (Node *) canonicalize_qual((Expr *) whereClause, false); whereClause = (Node *) make_ands_implicit((Expr *) whereClause); @@ -2272,7 +2308,7 @@ SS_charge_for_initplans(PlannerInfo *root, RelOptInfo *final_rel) path->startup_cost += initplan_cost; path->total_cost += initplan_cost; if (unsafe_initplans) - path->parallel_safe = false; + path->parallel_safe = PARALLEL_UNSAFE; } /* @@ -3151,7 +3187,8 @@ SS_make_initplan_from_plan(PlannerInfo *root, node = makeNode(SubPlan); node->subLinkType = EXPR_SUBLINK; node->plan_id = list_length(root->glob->subplans); - node->plan_name = psprintf("InitPlan %d", node->plan_id); + node->plan_name = subroot->plan_name; + node->isInitPlan = true; get_first_col_type(plan, &node->firstColType, &node->firstColTypmod, &node->firstColCollation); node->parallel_safe = plan->parallel_safe; @@ -3167,3 +3204,32 @@ SS_make_initplan_from_plan(PlannerInfo *root, /* Set costs of SubPlan using info from the plan tree */ cost_subplan(subroot, node, plan); } + +/* + * Get a string equivalent of a given subLinkType. + */ +static const char * +sublinktype_to_string(SubLinkType subLinkType) +{ + switch (subLinkType) + { + case EXISTS_SUBLINK: + return "exists"; + case ALL_SUBLINK: + return "all"; + case ANY_SUBLINK: + return "any"; + case ROWCOMPARE_SUBLINK: + return "rowcompare"; + case EXPR_SUBLINK: + return "expr"; + case MULTIEXPR_SUBLINK: + return "multiexpr"; + case ARRAY_SUBLINK: + return "array"; + case CTE_SUBLINK: + return "cte"; + } + Assert(false); + return "???"; +} diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 87dc6f56b576f..c3b726e93e7d3 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -4,10 +4,10 @@ * Planner preprocessing for subqueries and join tree manipulation. 
* * NOTE: the intended sequence for invoking these operations is + * preprocess_relation_rtes * replace_empty_jointree * pull_up_sublinks * preprocess_function_rtes - * expand_virtual_generated_columns * pull_up_subqueries * flatten_simple_union_all * do expression preprocessing (including flattening JOIN alias vars) @@ -36,6 +36,7 @@ #include "optimizer/clauses.h" #include "optimizer/optimizer.h" #include "optimizer/placeholder.h" +#include "optimizer/plancat.h" #include "optimizer/prep.h" #include "optimizer/subselect.h" #include "optimizer/tlist.h" @@ -102,6 +103,9 @@ typedef struct reduce_outer_joins_partial_state Relids unreduced_side; /* relids in its still-nullable side */ } reduce_outer_joins_partial_state; +static Query *expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation); static Node *pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, Relids *relids); static Node *pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, @@ -392,6 +396,181 @@ transform_MERGE_to_join(Query *parse) parse->mergeJoinCondition = NULL; /* join condition not needed */ } +/* + * preprocess_relation_rtes + * Do the preprocessing work for any relation RTEs in the FROM clause. + * + * This scans the rangetable for relation RTEs and retrieves the necessary + * catalog information for each relation. Using this information, it clears + * the inh flag for any relation that has no children, collects not-null + * attribute numbers for any relation that has column not-null constraints, and + * expands virtual generated columns for any relation that contains them. + * + * Note that expanding virtual generated columns may cause the query tree to + * have new copies of rangetable entries. Therefore, we have to use list_nth + * instead of foreach when iterating over the query's rangetable. + * + * Returns a modified copy of the query tree, if any relations with virtual + * generated columns are present. + */ +Query * +preprocess_relation_rtes(PlannerInfo *root) +{ + Query *parse = root->parse; + int rtable_size; + int rt_index; + + rtable_size = list_length(parse->rtable); + + for (rt_index = 0; rt_index < rtable_size; rt_index++) + { + RangeTblEntry *rte = rt_fetch(rt_index + 1, parse->rtable); + Relation relation; + + /* We only care about relation RTEs. */ + if (rte->rtekind != RTE_RELATION) + continue; + + /* + * We need not lock the relation since it was already locked by the + * rewriter. + */ + relation = table_open(rte->relid, NoLock); + + /* + * Check to see if the relation actually has any children; if not, + * clear the inh flag so we can treat it as a plain base relation. + * + * Note: this could give a false-positive result, if the rel once had + * children but no longer does. We used to be able to clear rte->inh + * later on when we discovered that, but no more; we have to handle + * such cases as full-fledged inheritance. + */ + if (rte->inh) + rte->inh = relation->rd_rel->relhassubclass; + + /* + * Check to see if the relation has any column not-null constraints; + * if so, retrieve the constraint information and store it in a + * relation OID based hash table. + */ + get_relation_notnullatts(root, relation); + + /* + * Check to see if the relation has any virtual generated columns; if + * so, replace all Var nodes in the query that reference these columns + * with the generation expressions. 
+ */ + parse = expand_virtual_generated_columns(root, parse, + rte, rt_index + 1, + relation); + + table_close(relation, NoLock); + } + + return parse; +} + +/* + * expand_virtual_generated_columns + * Expand virtual generated columns for the given relation. + * + * This checks whether the given relation has any virtual generated columns, + * and if so, replaces all Var nodes in the query that reference those columns + * with their generation expressions. + * + * Returns a modified copy of the query tree if the relation contains virtual + * generated columns. + */ +static Query * +expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation) +{ + TupleDesc tupdesc; + + /* Only normal relations can have virtual generated columns */ + Assert(rte->rtekind == RTE_RELATION); + + tupdesc = RelationGetDescr(relation); + if (tupdesc->constr && tupdesc->constr->has_generated_virtual) + { + List *tlist = NIL; + pullup_replace_vars_context rvcontext; + + for (int i = 0; i < tupdesc->natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + TargetEntry *tle; + + if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + { + Node *defexpr; + + defexpr = build_generation_expression(relation, i + 1); + ChangeVarNodes(defexpr, 1, rt_index, 0); + + tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + else + { + Var *var; + + var = makeVar(rt_index, + i + 1, + attr->atttypid, + attr->atttypmod, + attr->attcollation, + 0); + + tle = makeTargetEntry((Expr *) var, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + } + + Assert(list_length(tlist) > 0); + Assert(!rte->lateral); + + /* + * The relation's targetlist items are now in the appropriate form to + * insert into the query, except that we may need to wrap them in + * PlaceHolderVars. Set up required context data for + * pullup_replace_vars. + */ + rvcontext.root = root; + rvcontext.targetlist = tlist; + rvcontext.target_rte = rte; + rvcontext.result_relation = parse->resultRelation; + /* won't need these values */ + rvcontext.relids = NULL; + rvcontext.nullinfo = NULL; + /* pass NULL for outer_hasSubLinks */ + rvcontext.outer_hasSubLinks = NULL; + rvcontext.varno = rt_index; + /* this flag will be set below, if needed */ + rvcontext.wrap_option = REPLACE_WRAP_NONE; + /* initialize cache array with indexes 0 .. length(tlist) */ + rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * + sizeof(Node *)); + + /* + * If the query uses grouping sets, we need a PlaceHolderVar for each + * expression of the relation's targetlist items. (See comments in + * pull_up_simple_subquery().) + */ + if (parse->groupingSets) + rvcontext.wrap_option = REPLACE_WRAP_ALL; + + /* + * Apply pullup variable replacement throughout the query tree. + */ + parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); + } + + return parse; +} + /* * replace_empty_jointree * If the Query's jointree is empty, replace it with a dummy RTE_RESULT @@ -562,7 +741,7 @@ pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, * Make a modifiable copy of join node, but don't bother copying its * subnodes (yet). 
*/ - j = (JoinExpr *) palloc(sizeof(JoinExpr)); + j = palloc_object(JoinExpr); memcpy(j, jtnode, sizeof(JoinExpr)); jtlink = (Node *) j; @@ -887,13 +1066,15 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, /* * preprocess_function_rtes * Constant-simplify any FUNCTION RTEs in the FROM clause, and then - * attempt to "inline" any that are set-returning functions. + * attempt to "inline" any that can be converted to simple subqueries. * - * If an RTE_FUNCTION rtable entry invokes a set-returning function that + * If an RTE_FUNCTION rtable entry invokes a set-returning SQL function that * contains just a simple SELECT, we can convert the rtable entry to an - * RTE_SUBQUERY entry exposing the SELECT directly. This is especially - * useful if the subquery can then be "pulled up" for further optimization, - * but we do it even if not, to reduce executor overhead. + * RTE_SUBQUERY entry exposing the SELECT directly. Other sorts of functions + * are also inline-able if they have a support function that can generate + * the replacement sub-Query. This is especially useful if the subquery can + * then be "pulled up" for further optimization, but we do it even if not, + * to reduce executor overhead. * * This has to be done before we have started to do any optimization of * subqueries, else any such steps wouldn't get applied to subqueries @@ -928,7 +1109,7 @@ preprocess_function_rtes(PlannerInfo *root) eval_const_expressions(root, (Node *) rte->functions); /* Check safety of expansion, and expand if possible */ - funcquery = inline_set_returning_function(root, rte); + funcquery = inline_function_in_from(root, rte); if (funcquery) { /* Successful expansion, convert the RTE to a subquery */ @@ -949,128 +1130,6 @@ preprocess_function_rtes(PlannerInfo *root) } } -/* - * expand_virtual_generated_columns - * Expand all virtual generated column references in a query. - * - * This scans the rangetable for relations with virtual generated columns, and - * replaces all Var nodes in the query that reference these columns with the - * generation expressions. Note that we do not descend into subqueries; that - * is taken care of when the subqueries are planned. - * - * This has to be done after we have pulled up any SubLinks within the query's - * quals; otherwise any virtual generated column references within the SubLinks - * that should be transformed into joins wouldn't get expanded. - * - * Returns a modified copy of the query tree, if any relations with virtual - * generated columns are present. - */ -Query * -expand_virtual_generated_columns(PlannerInfo *root) -{ - Query *parse = root->parse; - int rt_index; - ListCell *lc; - - rt_index = 0; - foreach(lc, parse->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - Relation rel; - TupleDesc tupdesc; - - ++rt_index; - - /* - * Only normal relations can have virtual generated columns. 
- */ - if (rte->rtekind != RTE_RELATION) - continue; - - rel = table_open(rte->relid, NoLock); - - tupdesc = RelationGetDescr(rel); - if (tupdesc->constr && tupdesc->constr->has_generated_virtual) - { - List *tlist = NIL; - pullup_replace_vars_context rvcontext; - - for (int i = 0; i < tupdesc->natts; i++) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, i); - TargetEntry *tle; - - if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) - { - Node *defexpr; - - defexpr = build_generation_expression(rel, i + 1); - ChangeVarNodes(defexpr, 1, rt_index, 0); - - tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - else - { - Var *var; - - var = makeVar(rt_index, - i + 1, - attr->atttypid, - attr->atttypmod, - attr->attcollation, - 0); - - tle = makeTargetEntry((Expr *) var, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - } - - Assert(list_length(tlist) > 0); - Assert(!rte->lateral); - - /* - * The relation's targetlist items are now in the appropriate form - * to insert into the query, except that we may need to wrap them - * in PlaceHolderVars. Set up required context data for - * pullup_replace_vars. - */ - rvcontext.root = root; - rvcontext.targetlist = tlist; - rvcontext.target_rte = rte; - rvcontext.result_relation = parse->resultRelation; - /* won't need these values */ - rvcontext.relids = NULL; - rvcontext.nullinfo = NULL; - /* pass NULL for outer_hasSubLinks */ - rvcontext.outer_hasSubLinks = NULL; - rvcontext.varno = rt_index; - /* this flag will be set below, if needed */ - rvcontext.wrap_option = REPLACE_WRAP_NONE; - /* initialize cache array with indexes 0 .. length(tlist) */ - rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * - sizeof(Node *)); - - /* - * If the query uses grouping sets, we need a PlaceHolderVar for - * each expression of the relation's targetlist items. (See - * comments in pull_up_simple_subquery().) - */ - if (parse->groupingSets) - rvcontext.wrap_option = REPLACE_WRAP_ALL; - - /* - * Apply pullup variable replacement throughout the query tree. - */ - parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); - } - - table_close(rel, NoLock); - } - - return parse; -} - /* * pull_up_subqueries * Look for subqueries in the rangetable that can be pulled up into @@ -1299,6 +1358,7 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, subroot->parse = subquery; subroot->glob = root->glob; subroot->query_level = root->query_level; + subroot->plan_name = root->plan_name; subroot->parent_root = root->parent_root; subroot->plan_params = NIL; subroot->outer_params = NULL; @@ -1326,6 +1386,7 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, subroot->qual_security_level = 0; subroot->placeholdersFrozen = false; subroot->hasRecursion = false; + subroot->assumeReplanning = false; subroot->wt_param_id = -1; subroot->non_recursive_path = NULL; /* We don't currently need a top JoinDomain for the subroot */ @@ -1333,6 +1394,16 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, /* No CTEs to worry about */ Assert(subquery->cteList == NIL); + /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. 
+ */ + subquery = subroot->parse = preprocess_relation_rtes(subroot); + /* * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so * that we don't need so many special cases to deal with that situation. @@ -1352,13 +1423,6 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, */ preprocess_function_rtes(subroot); - /* - * Scan the rangetable for relations with virtual generated columns, and - * replace all Var nodes in the query that reference these columns with - * the generation expressions. - */ - subquery = subroot->parse = expand_virtual_generated_columns(subroot); - /* * Recursively pull up the subquery's subqueries, so that * pull_up_subqueries' processing is complete for its jointree and @@ -3176,8 +3240,7 @@ reduce_outer_joins_pass1(Node *jtnode) { reduce_outer_joins_pass1_state *result; - result = (reduce_outer_joins_pass1_state *) - palloc(sizeof(reduce_outer_joins_pass1_state)); + result = palloc_object(reduce_outer_joins_pass1_state); result->relids = NULL; result->contains_outer = false; result->sub_states = NIL; @@ -3529,7 +3592,7 @@ report_reduced_full_join(reduce_outer_joins_pass2_state *state2, { reduce_outer_joins_partial_state *statep; - statep = palloc(sizeof(reduce_outer_joins_partial_state)); + statep = palloc_object(reduce_outer_joins_partial_state); statep->full_join_rti = rtindex; statep->unreduced_side = relids; state2->partial_reduced = lappend(state2->partial_reduced, statep); diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index eab44da65b8f0..a01b02f3a7b6f 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -23,6 +23,8 @@ */ #include "postgres.h" +#include <math.h> + #include "access/htup_details.h" #include "catalog/pg_type.h" #include "miscadmin.h" @@ -74,6 +76,8 @@ static List *generate_append_tlist(List *colTypes, List *colCollations, List *input_tlists, List *refnames_tlist); static List *generate_setop_grouplist(SetOperationStmt *op, List *targetlist); +static PathTarget *create_setop_pathtarget(PlannerInfo *root, List *tlist, + List *child_pathlist); /* @@ -228,6 +232,7 @@ recurse_set_operations(Node *setOp, PlannerInfo *root, PlannerInfo *subroot; List *tlist; bool trivial_tlist; + char *plan_name; Assert(subquery != NULL); @@ -242,7 +247,9 @@ recurse_set_operations(Node *setOp, PlannerInfo *root, * parentOp, pass that down to encourage subquery_planner to consider * suitably-sorted Paths. */ - subroot = rel->subroot = subquery_planner(root->glob, subquery, root, + plan_name = choose_plan_name(root->glob, "setop", true); + subroot = rel->subroot = subquery_planner(root->glob, subquery, + plan_name, root, false, root->tuple_fraction, parentOp); @@ -519,6 +526,13 @@ build_setop_child_paths(PlannerInfo *root, RelOptInfo *rel, bool is_sorted; int presorted_keys; + /* If the input rel is dummy, propagate that to this query level */ + if (is_dummy_rel(final_rel)) + { + mark_dummy_rel(rel); + continue; + } + /* * Include the cheapest path as-is so that the set operation can be * cheaply implemented using a method which does not require the input @@ -759,6 +773,16 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, RelOptInfo *rel = lfirst(lc); Path *ordered_path; + /* + * Record the relids so that we can identify the correct + * UPPERREL_SETOP RelOptInfo below. 
+ */ + relids = bms_add_members(relids, rel->relids); + + /* Skip any UNION children that are proven not to yield any rows */ + if (is_dummy_rel(rel)) + continue; + cheapest_pathlist = lappend(cheapest_pathlist, rel->cheapest_total_path); @@ -797,16 +821,23 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, partial_pathlist = lappend(partial_pathlist, linitial(rel->partial_pathlist)); } - - relids = bms_union(relids, rel->relids); } /* Build result relation. */ result_rel = fetch_upper_rel(root, UPPERREL_SETOP, relids); - result_rel->reltarget = create_pathtarget(root, tlist); + result_rel->reltarget = create_setop_pathtarget(root, tlist, + cheapest_pathlist); result_rel->consider_parallel = consider_parallel; result_rel->consider_startup = (root->tuple_fraction > 0); + /* If all UNION children were dummy rels, make the resulting rel dummy */ + if (cheapest_pathlist == NIL) + { + mark_dummy_rel(result_rel); + + return result_rel; + } + /* * Append the child results together using the cheapest paths from each * union child. @@ -870,16 +901,37 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, double dNumGroups; bool can_sort = grouping_is_sortable(groupList); bool can_hash = grouping_is_hashable(groupList); + Path *first_path = linitial(cheapest_pathlist); /* - * XXX for the moment, take the number of distinct groups as equal to - * the total input size, i.e., the worst case. This is too - * conservative, but it's not clear how to get a decent estimate of - * the true size. One should note as well the propensity of novices - * to write UNION rather than UNION ALL even when they don't expect - * any duplicates... + * Estimate the number of UNION output rows. In the case when only a + * single UNION child remains, we can use estimate_num_groups() on + * that child. We must be careful not to do this when that child is + * the result of some other set operation as the targetlist will + * contain Vars with varno==0, which estimate_num_groups() wouldn't + * like. */ - dNumGroups = apath->rows; + if (list_length(cheapest_pathlist) == 1 && + first_path->parent->reloptkind != RELOPT_UPPER_REL) + { + dNumGroups = estimate_num_groups(root, + first_path->pathtarget->exprs, + first_path->rows, + NULL, + NULL); + } + else + { + /* + * Otherwise, for the moment, take the number of distinct groups + * as equal to the total input size, i.e., the worst case. This + * is too conservative, but it's not clear how to get a decent + * estimate of the true size. One should note as well the + * propensity of novices to write UNION rather than UNION ALL even + * when they don't expect any duplicates... 
+ */ + dNumGroups = apath->rows; + } if (can_hash) { @@ -892,7 +944,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, path = (Path *) create_agg_path(root, result_rel, apath, - create_pathtarget(root, tlist), + result_rel->reltarget, AGG_HASHED, AGGSPLIT_SIMPLE, groupList, @@ -908,7 +960,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, path = (Path *) create_agg_path(root, result_rel, gpath, - create_pathtarget(root, tlist), + result_rel->reltarget, AGG_HASHED, AGGSPLIT_SIMPLE, groupList, @@ -929,11 +981,11 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, make_pathkeys_for_sortclauses(root, groupList, tlist), -1.0); - path = (Path *) create_upper_unique_path(root, - result_rel, - path, - list_length(path->pathkeys), - dNumGroups); + path = (Path *) create_unique_path(root, + result_rel, + path, + list_length(path->pathkeys), + dNumGroups); add_path(result_rel, path); @@ -946,11 +998,11 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, make_pathkeys_for_sortclauses(root, groupList, tlist), -1.0); - path = (Path *) create_upper_unique_path(root, - result_rel, - path, - list_length(path->pathkeys), - dNumGroups); + path = (Path *) create_unique_path(root, + result_rel, + path, + list_length(path->pathkeys), + dNumGroups); add_path(result_rel, path); } } @@ -970,11 +1022,11 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, NULL); /* and make the MergeAppend unique */ - path = (Path *) create_upper_unique_path(root, - result_rel, - path, - list_length(tlist), - dNumGroups); + path = (Path *) create_unique_path(root, + result_rel, + path, + list_length(tlist), + dNumGroups); add_path(result_rel, path); } @@ -1130,7 +1182,81 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root, /* Build result relation. */ result_rel = fetch_upper_rel(root, UPPERREL_SETOP, bms_union(lrel->relids, rrel->relids)); - result_rel->reltarget = create_pathtarget(root, tlist); + + /* + * Create the PathTarget and set the width accordingly. For EXCEPT, since + * the set op result won't contain rows from the rpath, we only account + * for the width of the lpath. For INTERSECT, use both input paths. + */ + if (op->op == SETOP_EXCEPT) + result_rel->reltarget = create_setop_pathtarget(root, tlist, + list_make1(lpath)); + else + result_rel->reltarget = create_setop_pathtarget(root, tlist, + list_make2(lpath, rpath)); + + /* Check for provably empty setop inputs and add short-circuit paths. */ + if (op->op == SETOP_EXCEPT) + { + /* + * For EXCEPTs, if the left side is dummy then there's no need to + * inspect the right-hand side as scanning the right to find tuples to + * remove won't make the left-hand input any more empty. + */ + if (is_dummy_rel(lrel)) + { + mark_dummy_rel(result_rel); + + return result_rel; + } + + /* Handle EXCEPTs with dummy right input */ + if (is_dummy_rel(rrel)) + { + if (op->all) + { + Path *apath; + + /* + * EXCEPT ALL: If the right-hand input is dummy then we can + * simply scan the left-hand input. To keep createplan.c + * happy, use a single child Append to handle the translation + * between the set op targetlist and the targetlist of the + * left input. The Append will be removed in setrefs.c. + */ + apath = (Path *) create_append_path(root, result_rel, list_make1(lpath), + NIL, NIL, NULL, 0, false, -1); + + add_path(result_rel, apath); + + return result_rel; + } + else + { + /* + * To make EXCEPT with a dummy RHS work means having to + * deduplicate the left input. 
That could be done with + * AggPaths, but it doesn't seem worth the effort. Let the + * normal path generation code below handle this one. + */ + } + } + } + else + { + /* + * For INTERSECT, if either input is a dummy rel then we can mark the + * result_rel as dummy since intersecting with an empty relation can + * never yield any results. This is true regardless of INTERSECT or + * INTERSECT ALL. + */ + if (is_dummy_rel(lrel) || is_dummy_rel(rrel)) + { + mark_dummy_rel(result_rel); + + return result_rel; + } + } /* * Estimate number of distinct groups that we'll need hashtable entries @@ -1503,7 +1629,7 @@ generate_append_tlist(List *colTypes, List *colCollations, * If the inputs all agree on type and typmod of a particular column, use * that typmod; else use -1. */ - colTypmods = (int32 *) palloc(list_length(colTypes) * sizeof(int32)); + colTypmods = palloc_array(int32, list_length(colTypes)); foreach(tlistl, input_tlists) { @@ -1619,3 +1745,38 @@ generate_setop_grouplist(SetOperationStmt *op, List *targetlist) Assert(lg == NULL); return grouplist; } + +/* + * create_setop_pathtarget + * Do the normal create_pathtarget() work, plus set the resulting + * PathTarget's width to the average width of the Paths in child_pathlist + * weighted using the estimated row count of each path. + * + * Note: This is required because set op target lists use varno==0, which + * results in a type default width estimate rather than one that's based on + * statistics of the columns from the set op children. + */ +static PathTarget * +create_setop_pathtarget(PlannerInfo *root, List *tlist, List *child_pathlist) +{ + PathTarget *reltarget; + ListCell *lc; + double parent_rows = 0; + double parent_size = 0; + + reltarget = create_pathtarget(root, tlist); + + /* Calculate the total rows and total size. */ + foreach(lc, child_pathlist) + { + Path *path = (Path *) lfirst(lc); + + parent_rows += path->rows; + parent_size += path->parent->reltarget->width * path->rows; + } + + if (parent_rows > 0) + reltarget->width = rint(parent_size / parent_rows); + + return reltarget; +} diff --git a/src/backend/optimizer/util/Makefile b/src/backend/optimizer/util/Makefile index 4fb115cb118f5..87b4c3c086984 100644 --- a/src/backend/optimizer/util/Makefile +++ b/src/backend/optimizer/util/Makefile @@ -15,6 +15,7 @@ include $(top_builddir)/src/Makefile.global OBJS = \ appendinfo.o \ clauses.o \ + extendplan.o \ inherit.o \ joininfo.o \ orclauses.o \ diff --git a/src/backend/optimizer/util/appendinfo.c b/src/backend/optimizer/util/appendinfo.c index 5b3dc0d865399..271bb4e682826 100644 --- a/src/backend/optimizer/util/appendinfo.c +++ b/src/backend/optimizer/util/appendinfo.c @@ -516,6 +516,57 @@ adjust_appendrel_attrs_mutator(Node *node, return (Node *) newinfo; } + /* + * We have to process RelAggInfo nodes specially. + */ + if (IsA(node, RelAggInfo)) + { + RelAggInfo *oldinfo = (RelAggInfo *) node; + RelAggInfo *newinfo = makeNode(RelAggInfo); + + newinfo->target = (PathTarget *) + adjust_appendrel_attrs_mutator((Node *) oldinfo->target, + context); + + newinfo->agg_input = (PathTarget *) + adjust_appendrel_attrs_mutator((Node *) oldinfo->agg_input, + context); + + newinfo->group_clauses = oldinfo->group_clauses; + + newinfo->group_exprs = (List *) + adjust_appendrel_attrs_mutator((Node *) oldinfo->group_exprs, + context); + + return (Node *) newinfo; + } + + /* + * We have to process PathTarget nodes specially. 
+ */ + if (IsA(node, PathTarget)) + { + PathTarget *oldtarget = (PathTarget *) node; + PathTarget *newtarget = makeNode(PathTarget); + + /* Copy all flat-copiable fields */ + memcpy(newtarget, oldtarget, sizeof(PathTarget)); + + newtarget->exprs = (List *) + adjust_appendrel_attrs_mutator((Node *) oldtarget->exprs, + context); + + if (oldtarget->sortgrouprefs) + { + Size nbytes = list_length(oldtarget->exprs) * sizeof(Index); + + newtarget->sortgrouprefs = (Index *) palloc(nbytes); + memcpy(newtarget->sortgrouprefs, oldtarget->sortgrouprefs, nbytes); + } + + return (Node *) newtarget; + } + /* * NOTE: we do not need to recurse into sublinks, because they should * already have been converted to subplans before we see them. @@ -757,8 +808,7 @@ find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos) int i; /* Allocate an array that's certainly big enough */ - appinfos = (AppendRelInfo **) - palloc(sizeof(AppendRelInfo *) * bms_num_members(relids)); + appinfos = palloc_array(AppendRelInfo *, bms_num_members(relids)); i = -1; while ((i = bms_next_member(relids, i)) >= 0) diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 26a3e0500866c..9f82b5189da31 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -20,6 +20,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "catalog/pg_class.h" #include "catalog/pg_language.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" @@ -36,6 +37,7 @@ #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/optimizer.h" +#include "optimizer/pathnode.h" #include "optimizer/plancat.h" #include "optimizer/planmain.h" #include "parser/analyze.h" @@ -43,6 +45,7 @@ #include "parser/parse_collate.h" #include "parser/parse_func.h" #include "parser/parse_oper.h" +#include "parser/parsetree.h" #include "rewrite/rewriteHandler.h" #include "rewrite/rewriteManip.h" #include "tcop/tcopprot.h" @@ -79,7 +82,7 @@ typedef struct int nargs; List *args; int sublevels_up; -} substitute_actual_srf_parameters_context; +} substitute_actual_parameters_in_from_context; typedef struct { @@ -92,6 +95,7 @@ typedef struct char max_hazard; /* worst proparallel hazard found so far */ char max_interesting; /* worst proparallel hazard of interest */ List *safe_param_ids; /* PARAM_EXEC Param IDs to treat as safe */ + bool hasTempObject; } max_parallel_hazard_context; static bool contain_agg_clause_walker(Node *node, void *context); @@ -128,6 +132,8 @@ static Expr *simplify_function(Oid funcid, Oid result_collid, Oid input_collid, List **args_p, bool funcvariadic, bool process_args, bool allow_non_const, eval_const_expressions_context *context); +static Node *simplify_aggref(Aggref *aggref, + eval_const_expressions_context *context); static List *reorder_function_arguments(List *args, int pronargs, HeapTuple func_tuple); static List *add_function_defaults(List *args, int pronargs, @@ -151,10 +157,16 @@ static Node *substitute_actual_parameters(Node *expr, int nargs, List *args, static Node *substitute_actual_parameters_mutator(Node *node, substitute_actual_parameters_context *context); static void sql_inline_error_callback(void *arg); -static Query *substitute_actual_srf_parameters(Query *expr, - int nargs, List *args); -static Node *substitute_actual_srf_parameters_mutator(Node *node, - substitute_actual_srf_parameters_context *context); +static Query *inline_sql_function_in_from(PlannerInfo *root, + RangeTblFunction *rtfunc, + FuncExpr *fexpr, 
+ HeapTuple func_tuple, + Form_pg_proc funcform, + const char *src); +static Query *substitute_actual_parameters_in_from(Query *expr, + int nargs, List *args); +static Node *substitute_actual_parameters_in_from_mutator(Node *node, + substitute_actual_parameters_in_from_context *context); static bool pull_paramids_walker(Node *node, Bitmapset **context); @@ -228,7 +240,7 @@ contain_window_function(Node *clause) WindowFuncLists * find_window_functions(Node *clause, Index maxWinRef) { - WindowFuncLists *lists = palloc(sizeof(WindowFuncLists)); + WindowFuncLists *lists = palloc_object(WindowFuncLists); lists->numWindowFuncs = 0; lists->maxWinRef = maxWinRef; @@ -749,13 +761,17 @@ max_parallel_hazard(Query *parse) * * root->glob->maxParallelHazard must previously have been set to the * result of max_parallel_hazard() on the whole query. + * + * The expression may contain a reference to a subplan that uses temporary + * relations; if so, the optional needs_temp_flush output flag reports that. */ bool -is_parallel_safe(PlannerInfo *root, Node *node) +is_parallel_safe(PlannerInfo *root, Node *node, bool *needs_temp_flush) { max_parallel_hazard_context context; PlannerInfo *proot; ListCell *l; + bool is_safe; /* * Even if the original querytree contained nothing unsafe, we need to @@ -787,7 +803,20 @@ is_parallel_safe(PlannerInfo *root, Node *node) } } - return !max_parallel_hazard_walker(node, &context); + is_safe = !max_parallel_hazard_walker(node, &context); + + /* + * If the expression is parallel-safe, detect whether it needs temp buffers + * flushed before execution starts. Don't bother updating the flag if the + * expression is unsafe, since it can't be executed by parallel workers + * anyway. Some callers want only the negative result; if the incoming + * pointer is NULL, skip this step. + */ + if (needs_temp_flush && is_safe && context.hasTempObject) + *needs_temp_flush = NEEDS_TEMP_FLUSH; + + return is_safe; } /* core logic for all parallel-hazard checks */ @@ -909,6 +938,8 @@ max_parallel_hazard_walker(Node *node, max_parallel_hazard_context *context) max_parallel_hazard_test(PROPARALLEL_RESTRICTED, context)) return true; save_safe_param_ids = context->safe_param_ids; + context->hasTempObject = + context->hasTempObject || (subplan->parallel_safe == NEEDS_TEMP_FLUSH); context->safe_param_ids = list_concat_copy(context->safe_param_ids, subplan->paramIds); if (max_parallel_hazard_walker(subplan->testexpr, context)) @@ -1112,6 +1143,8 @@ contain_nonstrict_functions_walker(Node *node, void *context) return true; if (IsA(node, BooleanTest)) return true; + if (IsA(node, JsonConstructorExpr)) + return true; /* Check other function-containing nodes */ if (check_functions_in_node(node, contain_nonstrict_functions_checker, @@ -2242,7 +2275,8 @@ rowtype_field_matches(Oid rowtypeid, int fieldnum, * only operators and functions that are reasonable to try to execute. * * NOTE: "root" can be passed as NULL if the caller never wants to do any - * Param substitutions nor receive info about inlined functions. + * Param substitutions nor receive info about inlined functions nor reduce + * NullTest for Vars to constant true or constant false. * * NOTE: the planner assumes that this will always flatten nested AND and * OR clauses into N-argument form. See comments in prepqual.c. 
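To illustrate the revised is_parallel_safe() contract above, here is a minimal caller-side sketch. It assumes only what the patch shows: the new third argument may be NULL when the caller doesn't need the temp-flush hint, and the flag is only ever set (never cleared), so the caller must initialize it; the reaction to the flag is hypothetical.

	bool		needs_temp_flush = false;

	/*
	 * Check parallel safety and, when safe, learn whether local temp-buffer
	 * state would have to be flushed before parallel workers start.
	 */
	if (is_parallel_safe(root, (Node *) expr, &needs_temp_flush))
	{
		if (needs_temp_flush)
		{
			/* hypothetical: arrange for a temp-buffer flush before launch */
		}
	}
	else
	{
		/*
		 * Not parallel-safe.  A caller interested only in this negative
		 * result could have passed NULL and skipped the detection.
		 */
	}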
@@ -2572,6 +2606,7 @@ eval_const_expressions_mutator(Node *node, newexpr->winref = expr->winref; newexpr->winstar = expr->winstar; newexpr->winagg = expr->winagg; + newexpr->ignore_nulls = expr->ignore_nulls; newexpr->location = expr->location; return (Node *) newexpr; @@ -2621,6 +2656,11 @@ eval_const_expressions_mutator(Node *node, newexpr->location = expr->location; return (Node *) newexpr; } + case T_Aggref: + node = ece_generic_processing(node); + if (context->root != NULL) + return simplify_aggref((Aggref *) node, context); + return node; case T_OpExpr: { OpExpr *expr = (OpExpr *) node; @@ -3305,10 +3345,10 @@ eval_const_expressions_mutator(Node *node, context); /* - * We can remove null constants from the list. For a - * non-null constant, if it has not been preceded by any - * other non-null-constant expressions then it is the - * result. Otherwise, it's the next argument, but we can + * We can remove null constants from the list. For a + * nonnullable expression, if it has not been preceded by + * any non-null-constant expressions then it is the + * result. Otherwise, it's the next argument, but we can * drop following arguments since they will never be * reached. */ @@ -3321,6 +3361,14 @@ eval_const_expressions_mutator(Node *node, newargs = lappend(newargs, e); break; } + if (expr_is_nonnullable(context->root, (Expr *) e, false)) + { + if (newargs == NIL) + return e; /* first expr */ + newargs = lappend(newargs, e); + break; + } + newargs = lappend(newargs, e); } @@ -3333,6 +3381,13 @@ eval_const_expressions_mutator(Node *node, -1, coalesceexpr->coalescecollid); + /* + * If there's exactly one surviving argument, we no longer + * need COALESCE at all: the result is that argument + */ + if (list_length(newargs) == 1) + return (Node *) linitial(newargs); + newcoalesce = makeNode(CoalesceExpr); newcoalesce->coalescetype = coalesceexpr->coalescetype; newcoalesce->coalescecollid = coalesceexpr->coalescecollid; @@ -3493,6 +3548,20 @@ eval_const_expressions_mutator(Node *node, continue; } + /* + * A proven non-nullable field refutes the whole + * NullTest if the test is IS NULL; else we can + * discard it. + */ + if (relem && + expr_is_nonnullable(context->root, (Expr *) relem, + false)) + { + if (ntest->nulltesttype == IS_NULL) + return makeBoolConst(false, false); + continue; + } + /* * Else, make a scalar (argisrow == false) NullTest * for this field. Scalar semantics are required @@ -3537,6 +3606,28 @@ eval_const_expressions_mutator(Node *node, return makeBoolConst(result, false); } + if (!ntest->argisrow && arg && + expr_is_nonnullable(context->root, (Expr *) arg, false)) + { + bool result; + + switch (ntest->nulltesttype) + { + case IS_NULL: + result = false; + break; + case IS_NOT_NULL: + result = true; + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + result = false; /* keep compiler quiet */ + break; + } + + return makeBoolConst(result, false); + } newntest = makeNode(NullTest); newntest->arg = (Expr *) arg; @@ -4155,6 +4246,246 @@ simplify_function(Oid funcid, Oid result_type, int32 result_typmod, return newexpr; } +/* + * simplify_aggref + * Call the Aggref.aggfnoid's prosupport function to allow it to + * determine if simplification of the Aggref is possible. Returns the + * newly simplified node if conversion took place; otherwise, returns the + * original Aggref. + * + * See SupportRequestSimplifyAggref comments in supportnodes.h for further + * details. 
+ */ +static Node * +simplify_aggref(Aggref *aggref, eval_const_expressions_context *context) +{ + Oid prosupport = get_func_support(aggref->aggfnoid); + + if (OidIsValid(prosupport)) + { + SupportRequestSimplifyAggref req; + Node *newnode; + + /* + * Build a SupportRequestSimplifyAggref node to pass to the support + * function. + */ + req.type = T_SupportRequestSimplifyAggref; + req.root = context->root; + req.aggref = aggref; + + newnode = (Node *) DatumGetPointer(OidFunctionCall1(prosupport, + PointerGetDatum(&req))); + + /* + * We expect the support function to return either a new Node or NULL + * (when simplification isn't possible). + */ + Assert(newnode != (Node *) aggref || newnode == NULL); + + if (newnode != NULL) + return newnode; + } + + return (Node *) aggref; +} + +/* + * var_is_nonnullable: check to see if the Var cannot be NULL + * + * If the Var is defined NOT NULL and meanwhile is not nulled by any outer + * joins or grouping sets, then we can know that it cannot be NULL. + * + * use_rel_info indicates whether the corresponding RelOptInfo is available for + * use. + */ +bool +var_is_nonnullable(PlannerInfo *root, Var *var, bool use_rel_info) +{ + Bitmapset *notnullattnums = NULL; + + Assert(IsA(var, Var)); + + /* skip upper-level Vars */ + if (var->varlevelsup != 0) + return false; + + /* could the Var be nulled by any outer joins or grouping sets? */ + if (!bms_is_empty(var->varnullingrels)) + return false; + + /* system columns cannot be NULL */ + if (var->varattno < 0) + return true; + + /* + * Check if the Var is defined as NOT NULL. We retrieve the column NOT + * NULL constraint information from the corresponding RelOptInfo if it is + * available; otherwise, we search the hash table for this information. + */ + if (use_rel_info) + { + RelOptInfo *rel = find_base_rel(root, var->varno); + + notnullattnums = rel->notnullattnums; + } + else + { + RangeTblEntry *rte = planner_rt_fetch(var->varno, root); + + /* + * We must skip inheritance parent tables, as some child tables may + * have a NOT NULL constraint for a column while others may not. This + * cannot happen with partitioned tables, though. + */ + if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) + return false; + + notnullattnums = find_relation_notnullatts(root, rte->relid); + } + + if (var->varattno > 0 && + bms_is_member(var->varattno, notnullattnums)) + return true; + + return false; +} + +/* + * expr_is_nonnullable: check to see if the Expr cannot be NULL + * + * Returns true iff the given 'expr' cannot produce SQL NULLs. + * + * If 'use_rel_info' is true, nullability of Vars is checked via the + * corresponding RelOptInfo for the given Var. Some callers require + * nullability information before RelOptInfos are generated. These should + * pass 'use_rel_info' as false. + * + * For now, we support only a limited set of expression types. Support for + * additional node types can be added in the future. + */ +bool +expr_is_nonnullable(PlannerInfo *root, Expr *expr, bool use_rel_info) +{ + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + switch (nodeTag(expr)) + { + case T_Var: + { + if (root) + return var_is_nonnullable(root, (Var *) expr, use_rel_info); + } + break; + case T_Const: + return !((Const *) expr)->constisnull; + case T_CoalesceExpr: + { + /* + * A CoalesceExpr returns NULL if and only if all its + * arguments are NULL. 
Therefore, we can determine that a + * CoalesceExpr cannot be NULL if at least one of its + * arguments can be proven non-nullable. + */ + CoalesceExpr *coalesceexpr = (CoalesceExpr *) expr; + + foreach_ptr(Expr, arg, coalesceexpr->args) + { + if (expr_is_nonnullable(root, arg, use_rel_info)) + return true; + } + } + break; + case T_MinMaxExpr: + { + /* + * Like CoalesceExpr, a MinMaxExpr returns NULL only if all + * its arguments evaluate to NULL. + */ + MinMaxExpr *minmaxexpr = (MinMaxExpr *) expr; + + foreach_ptr(Expr, arg, minmaxexpr->args) + { + if (expr_is_nonnullable(root, arg, use_rel_info)) + return true; + } + } + break; + case T_CaseExpr: + { + /* + * A CASE expression is non-nullable if all branch results are + * non-nullable. We must also verify that the default result + * (ELSE) exists and is non-nullable. + */ + CaseExpr *caseexpr = (CaseExpr *) expr; + + /* The default result must be present and non-nullable */ + if (caseexpr->defresult == NULL || + !expr_is_nonnullable(root, caseexpr->defresult, use_rel_info)) + return false; + + /* All branch results must be non-nullable */ + foreach_ptr(CaseWhen, casewhen, caseexpr->args) + { + if (!expr_is_nonnullable(root, casewhen->result, use_rel_info)) + return false; + } + + return true; + } + break; + case T_ArrayExpr: + { + /* + * An ARRAY[] expression always returns a valid Array object, + * even if it is empty (ARRAY[]) or contains NULLs + * (ARRAY[NULL]). It never evaluates to a SQL NULL. + */ + return true; + } + case T_NullTest: + { + /* + * An IS NULL / IS NOT NULL expression always returns a + * boolean value. It never returns SQL NULL. + */ + return true; + } + case T_BooleanTest: + { + /* + * A BooleanTest expression always evaluates to a boolean + * value. It never returns SQL NULL. + */ + return true; + } + case T_DistinctExpr: + { + /* + * IS DISTINCT FROM never returns NULL, effectively acting as + * though NULL were a normal data value. + */ + return true; + } + case T_RelabelType: + { + /* + * RelabelType does not change the nullability of the data. + * The result is non-nullable if and only if the argument is + * non-nullable. + */ + return expr_is_nonnullable(root, ((RelabelType *) expr)->arg, + use_rel_info); + } + default: + break; + } + + return false; +} + /* * expand_function_arguments: convert named-notation args to positional args * and/or insert default args, as needed @@ -5049,50 +5380,42 @@ evaluate_expr(Expr *expr, Oid result_type, int32 result_typmod, /* - * inline_set_returning_function - * Attempt to "inline" a set-returning function in the FROM clause. + * inline_function_in_from + * Attempt to "inline" a function in the FROM clause. * * "rte" is an RTE_FUNCTION rangetable entry. If it represents a call of a - * set-returning SQL function that can safely be inlined, expand the function - * and return the substitute Query structure. Otherwise, return NULL. + * function that can be inlined, expand the function and return the + * substitute Query structure. Otherwise, return NULL. * * We assume that the RTE's expression has already been put through * eval_const_expressions(), which among other things will take care of * default arguments and named-argument notation. * * This has a good deal of similarity to inline_function(), but that's - * for the non-set-returning case, and there are enough differences to + * for the general-expression case, and there are enough differences to * justify separate functions. 
*/ Query * -inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) +inline_function_in_from(PlannerInfo *root, RangeTblEntry *rte) { RangeTblFunction *rtfunc; FuncExpr *fexpr; Oid func_oid; HeapTuple func_tuple; Form_pg_proc funcform; - char *src; - Datum tmp; - bool isNull; MemoryContext oldcxt; MemoryContext mycxt; + Datum tmp; + char *src; inline_error_callback_arg callback_arg; ErrorContextCallback sqlerrcontext; - SQLFunctionParseInfoPtr pinfo; - TypeFuncClass functypclass; - TupleDesc rettupdesc; - List *raw_parsetree_list; - List *querytree_list; - Query *querytree; + Query *querytree = NULL; Assert(rte->rtekind == RTE_FUNCTION); /* - * It doesn't make a lot of sense for a SQL SRF to refer to itself in its - * own FROM clause, since that must cause infinite recursion at runtime. - * It will cause this code to recurse too, so check for stack overflow. - * (There's no need to do more.) + * Guard against infinite recursion during expansion by checking for stack + * overflow. (There's no need to do more.) */ check_stack_depth(); @@ -5111,14 +5434,6 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) func_oid = fexpr->funcid; - /* - * The function must be declared to return a set, else inlining would - * change the results if the contained SELECT didn't return exactly one - * row. - */ - if (!fexpr->funcretset) - return NULL; - /* * Refuse to inline if the arguments contain any volatile functions or * sub-selects. Volatile functions are rejected because inlining may @@ -5149,24 +5464,10 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) funcform = (Form_pg_proc) GETSTRUCT(func_tuple); /* - * Forget it if the function is not SQL-language or has other showstopper - * properties. In particular it mustn't be declared STRICT, since we - * couldn't enforce that. It also mustn't be VOLATILE, because that is - * supposed to cause it to be executed with its own snapshot, rather than - * sharing the snapshot of the calling query. We also disallow returning - * SETOF VOID, because inlining would result in exposing the actual result - * of the function's last SELECT, which should not happen in that case. - * (Rechecking prokind, proretset, and pronargs is just paranoia.) + * If the function SETs any configuration parameters, inlining would cause + * us to miss making those changes. */ - if (funcform->prolang != SQLlanguageId || - funcform->prokind != PROKIND_FUNCTION || - funcform->proisstrict || - funcform->provolatile == PROVOLATILE_VOLATILE || - funcform->prorettype == VOIDOID || - funcform->prosecdef || - !funcform->proretset || - list_length(fexpr->args) != funcform->pronargs || - !heap_attisnull(func_tuple, Anum_pg_proc_proconfig, NULL)) + if (!heap_attisnull(func_tuple, Anum_pg_proc_proconfig, NULL)) { ReleaseSysCache(func_tuple); return NULL; @@ -5174,10 +5475,11 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) /* * Make a temporary memory context, so that we don't leak all the stuff - * that parsing might create. + * that parsing and rewriting might create. If we succeed, we'll copy + * just the finished query tree back up to the caller's context. 
*/ mycxt = AllocSetContextCreate(CurrentMemoryContext, - "inline_set_returning_function", + "inline_function_in_from", ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(mycxt); @@ -5185,9 +5487,30 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) tmp = SysCacheGetAttrNotNull(PROCOID, func_tuple, Anum_pg_proc_prosrc); src = TextDatumGetCString(tmp); + /* + * If the function has an attached support function that can handle + * SupportRequestInlineInFrom, then attempt to inline with that. + */ + if (funcform->prosupport) + { + SupportRequestInlineInFrom req; + + req.type = T_SupportRequestInlineInFrom; + req.root = root; + req.rtfunc = rtfunc; + req.proc = func_tuple; + + querytree = (Query *) + DatumGetPointer(OidFunctionCall1(funcform->prosupport, + PointerGetDatum(&req))); + } + /* * Setup error traceback support for ereport(). This is so that we can - * finger the function that bad information came from. + * finger the function that bad information came from. We don't install + * this while running the support function, since it'd be likely to do the + * wrong thing: any parse errors reported during that are very likely not + * against the raw function source text. */ callback_arg.proname = NameStr(funcform->proname); callback_arg.prosrc = src; @@ -5197,33 +5520,158 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) sqlerrcontext.previous = error_context_stack; error_context_stack = &sqlerrcontext; + /* + * If SupportRequestInlineInFrom didn't work, try our built-in inlining + * mechanism. + */ + if (!querytree) + querytree = inline_sql_function_in_from(root, rtfunc, fexpr, + func_tuple, funcform, src); + + if (!querytree) + goto fail; /* no luck there either, fail */ + + /* + * The result had better be a SELECT Query. + */ + Assert(IsA(querytree, Query)); + Assert(querytree->commandType == CMD_SELECT); + + /* + * Looks good --- substitute parameters into the query. + */ + querytree = substitute_actual_parameters_in_from(querytree, + funcform->pronargs, + fexpr->args); + + /* + * Copy the modified query out of the temporary memory context, and clean + * up. + */ + MemoryContextSwitchTo(oldcxt); + + querytree = copyObject(querytree); + + MemoryContextDelete(mycxt); + error_context_stack = sqlerrcontext.previous; + ReleaseSysCache(func_tuple); + + /* + * We don't have to fix collations here because the upper query is already + * parsed, ie, the collations in the RTE are what count. + */ + + /* + * Since there is now no trace of the function in the plan tree, we must + * explicitly record the plan's dependency on the function. + */ + record_plan_function_dependency(root, func_oid); + + /* + * We must also notice if the inserted query adds a dependency on the + * calling role due to RLS quals. + */ + if (querytree->hasRowSecurity) + root->glob->dependsOnRole = true; + + return querytree; + + /* Here if func is not inlinable: release temp memory and return NULL */ +fail: + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(mycxt); + error_context_stack = sqlerrcontext.previous; + ReleaseSysCache(func_tuple); + + return NULL; +} + +/* + * inline_sql_function_in_from + * + * This implements inline_function_in_from for SQL-language functions. + * Returns NULL if the function couldn't be inlined. + * + * The division of labor between here and inline_function_in_from is based + * on the rule that inline_function_in_from should make all checks that are + * certain to be required in both this case and the support-function case. 
+ * Support functions might also want to make checks analogous to the ones + * made here, but then again they might not, or they might just assume that + * the function they are attached to can validly be inlined. + */ +static Query * +inline_sql_function_in_from(PlannerInfo *root, + RangeTblFunction *rtfunc, + FuncExpr *fexpr, + HeapTuple func_tuple, + Form_pg_proc funcform, + const char *src) +{ + Datum sqlbody; + bool isNull; + List *querytree_list; + Query *querytree; + TypeFuncClass functypclass; + TupleDesc rettupdesc; + + /* + * The function must be declared to return a set, else inlining would + * change the results if the contained SELECT didn't return exactly one + * row. + */ + if (!fexpr->funcretset) + return NULL; + + /* + * Forget it if the function is not SQL-language or has other showstopper + * properties. In particular it mustn't be declared STRICT, since we + * couldn't enforce that. It also mustn't be VOLATILE, because that is + * supposed to cause it to be executed with its own snapshot, rather than + * sharing the snapshot of the calling query. We also disallow returning + * SETOF VOID, because inlining would result in exposing the actual result + * of the function's last SELECT, which should not happen in that case. + * (Rechecking prokind, proretset, and pronargs is just paranoia.) + */ + if (funcform->prolang != SQLlanguageId || + funcform->prokind != PROKIND_FUNCTION || + funcform->proisstrict || + funcform->provolatile == PROVOLATILE_VOLATILE || + funcform->prorettype == VOIDOID || + funcform->prosecdef || + !funcform->proretset || + list_length(fexpr->args) != funcform->pronargs) + return NULL; + /* If we have prosqlbody, pay attention to that not prosrc */ - tmp = SysCacheGetAttr(PROCOID, - func_tuple, - Anum_pg_proc_prosqlbody, - &isNull); + sqlbody = SysCacheGetAttr(PROCOID, + func_tuple, + Anum_pg_proc_prosqlbody, + &isNull); if (!isNull) { Node *n; - n = stringToNode(TextDatumGetCString(tmp)); + n = stringToNode(TextDatumGetCString(sqlbody)); if (IsA(n, List)) querytree_list = linitial_node(List, castNode(List, n)); else querytree_list = list_make1(n); if (list_length(querytree_list) != 1) - goto fail; + return NULL; querytree = linitial(querytree_list); /* Acquire necessary locks, then apply rewriter. */ AcquireRewriteLocks(querytree, true, false); querytree_list = pg_rewrite_query(querytree); if (list_length(querytree_list) != 1) - goto fail; + return NULL; querytree = linitial(querytree_list); } else { + SQLFunctionParseInfoPtr pinfo; + List *raw_parsetree_list; + /* * Set up to handle parameters while parsing the function body. We * can use the FuncExpr just created as the input for @@ -5240,14 +5688,14 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) */ raw_parsetree_list = pg_parse_query(src); if (list_length(raw_parsetree_list) != 1) - goto fail; + return NULL; querytree_list = pg_analyze_and_rewrite_withcb(linitial(raw_parsetree_list), src, (ParserSetupHook) sql_fn_parser_setup, pinfo, NULL); if (list_length(querytree_list) != 1) - goto fail; + return NULL; querytree = linitial(querytree_list); } @@ -5272,7 +5720,7 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) */ if (!IsA(querytree, Query) || querytree->commandType != CMD_SELECT) - goto fail; + return NULL; /* * Make sure the function (still) returns what it's declared to. 
This @@ -5294,7 +5742,7 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) (functypclass == TYPEFUNC_COMPOSITE || functypclass == TYPEFUNC_COMPOSITE_DOMAIN || functypclass == TYPEFUNC_RECORD)) - goto fail; /* reject not-whole-tuple-result cases */ + return NULL; /* reject not-whole-tuple-result cases */ /* * check_sql_fn_retval might've inserted a projection step, but that's @@ -5302,53 +5750,7 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) */ querytree = linitial_node(Query, querytree_list); - /* - * Looks good --- substitute parameters into the query. - */ - querytree = substitute_actual_srf_parameters(querytree, - funcform->pronargs, - fexpr->args); - - /* - * Copy the modified query out of the temporary memory context, and clean - * up. - */ - MemoryContextSwitchTo(oldcxt); - - querytree = copyObject(querytree); - - MemoryContextDelete(mycxt); - error_context_stack = sqlerrcontext.previous; - ReleaseSysCache(func_tuple); - - /* - * We don't have to fix collations here because the upper query is already - * parsed, ie, the collations in the RTE are what count. - */ - - /* - * Since there is now no trace of the function in the plan tree, we must - * explicitly record the plan's dependency on the function. - */ - record_plan_function_dependency(root, func_oid); - - /* - * We must also notice if the inserted query adds a dependency on the - * calling role due to RLS quals. - */ - if (querytree->hasRowSecurity) - root->glob->dependsOnRole = true; - return querytree; - - /* Here if func is not inlinable: release temp memory and return NULL */ -fail: - MemoryContextSwitchTo(oldcxt); - MemoryContextDelete(mycxt); - error_context_stack = sqlerrcontext.previous; - ReleaseSysCache(func_tuple); - - return NULL; } /* @@ -5358,23 +5760,23 @@ inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte) * that it needs its own code. 
*/ static Query * -substitute_actual_srf_parameters(Query *expr, int nargs, List *args) +substitute_actual_parameters_in_from(Query *expr, int nargs, List *args) { - substitute_actual_srf_parameters_context context; + substitute_actual_parameters_in_from_context context; context.nargs = nargs; context.args = args; context.sublevels_up = 1; return query_tree_mutator(expr, - substitute_actual_srf_parameters_mutator, + substitute_actual_parameters_in_from_mutator, &context, 0); } static Node * -substitute_actual_srf_parameters_mutator(Node *node, - substitute_actual_srf_parameters_context *context) +substitute_actual_parameters_in_from_mutator(Node *node, + substitute_actual_parameters_in_from_context *context) { Node *result; @@ -5384,7 +5786,7 @@ substitute_actual_srf_parameters_mutator(Node *node, { context->sublevels_up++; result = (Node *) query_tree_mutator((Query *) node, - substitute_actual_srf_parameters_mutator, + substitute_actual_parameters_in_from_mutator, context, 0); context->sublevels_up--; @@ -5409,7 +5811,7 @@ substitute_actual_srf_parameters_mutator(Node *node, } } return expression_tree_mutator(node, - substitute_actual_srf_parameters_mutator, + substitute_actual_parameters_in_from_mutator, context); } @@ -5492,8 +5894,8 @@ make_SAOP_expr(Oid oper, Node *leftexpr, Oid coltype, Oid arraycollid, get_typlenbyvalalign(coltype, &typlen, &typbyval, &typalign); - elems = (Datum *) palloc(sizeof(Datum) * list_length(exprs)); - nulls = (bool *) palloc(sizeof(bool) * list_length(exprs)); + elems = palloc_array(Datum, list_length(exprs)); + nulls = palloc_array(bool, list_length(exprs)); foreach_node(Const, value, exprs) { elems[i] = value->constvalue; diff --git a/src/backend/optimizer/util/extendplan.c b/src/backend/optimizer/util/extendplan.c new file mode 100644 index 0000000000000..2bc4ad32631aa --- /dev/null +++ b/src/backend/optimizer/util/extendplan.c @@ -0,0 +1,177 @@ +/*------------------------------------------------------------------------- + * + * extendplan.c + * Extend core planner objects with additional private state + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994-5, Regents of the University of California + * + * The interfaces defined in this file make it possible for loadable + * modules to store their own private state inside of key planner data + * structures -- specifically, the PlannerGlobal, PlannerInfo, and + * RelOptInfo structures. This can make it much easier to write + * reasonably efficient planner extensions; for instance, code that + * uses set_join_pathlist_hook can arrange to compute a key intermediate + * result once per joinrel rather than on every call. + * + * IDENTIFICATION + * src/backend/optimizer/util/extendplan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "optimizer/extendplan.h" +#include "port/pg_bitutils.h" +#include "utils/memutils.h" + +static const char **PlannerExtensionNameArray = NULL; +static int PlannerExtensionNamesAssigned = 0; +static int PlannerExtensionNamesAllocated = 0; + +/* + * Map the name of a planner extension to an integer ID. + * + * Within the lifetime of a particular backend, the same name will be mapped + * to the same ID every time. IDs are not stable across backends. Use the ID + * that you get from this function to call the remaining functions in this + * file. 
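As a usage sketch of these interfaces (every "my_"-prefixed name is hypothetical), an extension could resolve its ID lazily and attach per-joinrel state from set_join_pathlist_hook; retrieving the state again would use the matching getter, which is not shown in this excerpt:

#include "postgres.h"

#include "optimizer/extendplan.h"
#include "optimizer/paths.h"

/* Hypothetical private state an extension caches for each joinrel. */
typedef struct MyJoinState
{
	double		expensive_metric;
} MyJoinState;

static int	my_extension_id = -1;	/* stable within one backend */

static void
my_set_join_pathlist(PlannerInfo *root, RelOptInfo *joinrel,
					 RelOptInfo *outerrel, RelOptInfo *innerrel,
					 JoinType jointype, JoinPathExtraData *extra)
{
	MyJoinState *state;

	if (my_extension_id < 0)
		my_extension_id = GetPlannerExtensionId("my_extension");

	/*
	 * A real extension would first check whether state is already attached
	 * (the hook can run many times per joinrel) before computing it anew.
	 */
	state = palloc0(sizeof(MyJoinState));
	state->expensive_metric = joinrel->rows;	/* placeholder computation */
	SetRelOptInfoExtensionState(joinrel, my_extension_id, state);
}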
+ */ +int +GetPlannerExtensionId(const char *extension_name) +{ + /* Search for an existing extension by this name; if found, return ID. */ + for (int i = 0; i < PlannerExtensionNamesAssigned; ++i) + if (strcmp(PlannerExtensionNameArray[i], extension_name) == 0) + return i; + + /* If there is no array yet, create one. */ + if (PlannerExtensionNameArray == NULL) + { + PlannerExtensionNamesAllocated = 16; + PlannerExtensionNameArray = (const char **) + MemoryContextAlloc(TopMemoryContext, + PlannerExtensionNamesAllocated + * sizeof(char *)); + } + + /* If there's an array but it's currently full, expand it. */ + if (PlannerExtensionNamesAssigned >= PlannerExtensionNamesAllocated) + { + int i = pg_nextpower2_32(PlannerExtensionNamesAssigned + 1); + + PlannerExtensionNameArray = (const char **) + repalloc(PlannerExtensionNameArray, i * sizeof(char *)); + PlannerExtensionNamesAllocated = i; + } + + /* Assign and return new ID. */ + PlannerExtensionNameArray[PlannerExtensionNamesAssigned] = extension_name; + return PlannerExtensionNamesAssigned++; +} + +/* + * Store extension-specific state into a PlannerGlobal. + */ +void +SetPlannerGlobalExtensionState(PlannerGlobal *glob, int extension_id, + void *opaque) +{ + Assert(extension_id >= 0); + + /* If there is no array yet, create one. */ + if (glob->extension_state == NULL) + { + MemoryContext planner_cxt; + Size sz; + + planner_cxt = GetMemoryChunkContext(glob); + glob->extension_state_allocated = + Max(4, pg_nextpower2_32(extension_id + 1)); + sz = glob->extension_state_allocated * sizeof(void *); + glob->extension_state = MemoryContextAllocZero(planner_cxt, sz); + } + + /* If there's an array but it's currently full, expand it. */ + if (extension_id >= glob->extension_state_allocated) + { + int i; + + i = pg_nextpower2_32(extension_id + 1); + glob->extension_state = repalloc0_array(glob->extension_state, void *, + glob->extension_state_allocated, i); + glob->extension_state_allocated = i; + } + + glob->extension_state[extension_id] = opaque; +} + +/* + * Store extension-specific state into a PlannerInfo. + */ +void +SetPlannerInfoExtensionState(PlannerInfo *root, int extension_id, + void *opaque) +{ + Assert(extension_id >= 0); + + /* If there is no array yet, create one. */ + if (root->extension_state == NULL) + { + Size sz; + + root->extension_state_allocated = + Max(4, pg_nextpower2_32(extension_id + 1)); + sz = root->extension_state_allocated * sizeof(void *); + root->extension_state = MemoryContextAllocZero(root->planner_cxt, sz); + } + + /* If there's an array but it's currently full, expand it. */ + if (extension_id >= root->extension_state_allocated) + { + int i; + + i = pg_nextpower2_32(extension_id + 1); + root->extension_state = repalloc0_array(root->extension_state, void *, + root->extension_state_allocated, i); + root->extension_state_allocated = i; + } + + root->extension_state[extension_id] = opaque; +} + +/* + * Store extension-specific state into a RelOptInfo. + */ +void +SetRelOptInfoExtensionState(RelOptInfo *rel, int extension_id, + void *opaque) +{ + Assert(extension_id >= 0); + + /* If there is no array yet, create one. */ + if (rel->extension_state == NULL) + { + MemoryContext planner_cxt; + Size sz; + + planner_cxt = GetMemoryChunkContext(rel); + rel->extension_state_allocated = + Max(4, pg_nextpower2_32(extension_id + 1)); + sz = rel->extension_state_allocated * sizeof(void *); + rel->extension_state = MemoryContextAllocZero(planner_cxt, sz); + } + + /* If there's an array but it's currently full, expand it. 
*/ + if (extension_id >= rel->extension_state_allocated) + { + int i; + + i = pg_nextpower2_32(extension_id + 1); + rel->extension_state = repalloc0_array(rel->extension_state, void *, + rel->extension_state_allocated, i); + rel->extension_state_allocated = i; + } + + rel->extension_state[extension_id] = opaque; +} diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index 17e51cd75d744..6d5225079f87c 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -322,7 +322,6 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo, PlanRowMark *top_parentrc, LOCKMODE lockmode) { PartitionDesc partdesc; - Bitmapset *live_parts; int num_live_parts; int i; @@ -336,16 +335,6 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo, /* A partitioned table should always have a partition descriptor. */ Assert(partdesc); - /* - * Note down whether any partition key cols are being updated. Though it's - * the root partitioned table's updatedCols we are interested in, - * parent_updatedCols provided by the caller contains the root partrel's - * updatedCols translated to match the attribute ordering of parentrel. - */ - if (!root->partColsUpdated) - root->partColsUpdated = - has_partition_attrs(parentrel, parent_updatedCols, NULL); - /* Nothing further to do here if there are no partitions. */ if (partdesc->nparts == 0) return; @@ -356,10 +345,10 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo, * that survive pruning. Below, we will initialize child objects for the * surviving partitions. */ - relinfo->live_parts = live_parts = prune_append_rel_partitions(relinfo); + relinfo->live_parts = prune_append_rel_partitions(relinfo); /* Expand simple_rel_array and friends to hold child objects. */ - num_live_parts = bms_num_members(live_parts); + num_live_parts = bms_num_members(relinfo->live_parts); if (num_live_parts > 0) expand_planner_arrays(root, num_live_parts); @@ -378,7 +367,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo, * table itself, because it's not going to be scanned. */ i = -1; - while ((i = bms_next_member(live_parts, i)) >= 0) + while ((i = bms_next_member(relinfo->live_parts, i)) >= 0) { Oid childOID = partdesc->oids[i]; Relation childrel; @@ -466,8 +455,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index *childRTindex_p) { Query *parse = root->parse; - Oid parentOID PG_USED_FOR_ASSERTS_ONLY = - RelationGetRelid(parentrel); + Oid parentOID = RelationGetRelid(parentrel); Oid childOID = RelationGetRelid(childrel); RangeTblEntry *childrte; Index childRTindex; @@ -513,6 +501,13 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, *childrte_p = childrte; *childRTindex_p = childRTindex; + /* + * Retrieve column not-null constraint information for the child relation + * if its relation OID is different from the parent's. + */ + if (childOID != parentOID) + get_relation_notnullatts(root, childrel); + /* * Build an AppendRelInfo struct for each parent/child pair. 
*/ diff --git a/src/backend/optimizer/util/meson.build b/src/backend/optimizer/util/meson.build index b3bf913d09658..f71f56e37a162 100644 --- a/src/backend/optimizer/util/meson.build +++ b/src/backend/optimizer/util/meson.build @@ -3,6 +3,7 @@ backend_sources += files( 'appendinfo.c', 'clauses.c', + 'extendplan.c', 'inherit.c', 'joininfo.c', 'orclauses.c', diff --git a/src/backend/optimizer/util/paramassign.c b/src/backend/optimizer/util/paramassign.c index 3bd3ce37c8fce..4c13c5931b4c9 100644 --- a/src/backend/optimizer/util/paramassign.c +++ b/src/backend/optimizer/util/paramassign.c @@ -599,38 +599,46 @@ process_subquery_nestloop_params(PlannerInfo *root, List *subplan_params) } /* - * Identify any NestLoopParams that should be supplied by a NestLoop plan - * node with the specified lefthand rels. Remove them from the active - * root->curOuterParams list and return them as the result list. + * Identify any NestLoopParams that should be supplied by a NestLoop + * plan node with the specified lefthand rels and required-outer rels. + * Remove them from the active root->curOuterParams list and return + * them as the result list. * - * XXX Here we also hack up the returned Vars and PHVs so that they do not - * contain nullingrel sets exceeding what is available from the outer side. - * This is needed if we have applied outer join identity 3, - * (A leftjoin B on (Pab)) leftjoin C on (Pb*c) - * = A leftjoin (B leftjoin C on (Pbc)) on (Pab) - * and C contains lateral references to B. It's still safe to apply the - * identity, but the parser will have created those references in the form - * "b*" (i.e., with varnullingrels listing the A/B join), while what we will - * have available from the nestloop's outer side is just "b". We deal with - * that here by stripping the nullingrels down to what is available from the - * outer side according to leftrelids. - * - * That fixes matters for the case of forward application of identity 3. - * If the identity was applied in the reverse direction, we will have - * parameter Vars containing too few nullingrel bits rather than too many. - * Currently, that causes no problems because setrefs.c applies only a - * subset check to nullingrels in NestLoopParams, but we'd have to work - * harder if we ever want to tighten that check. This is all pretty annoying - * because it greatly weakens setrefs.c's cross-check, but the alternative + * Vars and PHVs appearing in the result list must have nullingrel sets + * that could validly appear in the lefthand rel's output. Ordinarily that + * would be true already, but if we have applied outer join identity 3, + * there could be more or fewer nullingrel bits in the nodes appearing in + * curOuterParams than are in the nominal leftrelids. We deal with that by + * forcing their nullingrel sets to include exactly the outer-join relids + * that appear in leftrelids and can null the respective Var or PHV. + * This fix is a bit ad-hoc and intellectually unsatisfactory, because it's + * essentially jumping to the conclusion that we've placed evaluation of + * the nestloop parameters correctly, and thus it defeats the intent of the + * subsequent nullingrel cross-checks in setrefs.c. But the alternative * seems to be to generate multiple versions of each laterally-parameterized * subquery, which'd be unduly expensive. 
*/ List * -identify_current_nestloop_params(PlannerInfo *root, Relids leftrelids) +identify_current_nestloop_params(PlannerInfo *root, + Relids leftrelids, + Relids outerrelids) { List *result; + Relids allleftrelids; ListCell *cell; + /* + * We'll be able to evaluate a PHV in the lefthand path if it uses the + * lefthand rels plus any available required-outer rels. But don't do so + * if it uses *only* required-outer rels; in that case it should be + * evaluated higher in the tree. For Vars, no such hair-splitting is + * necessary since they depend on only one relid. + */ + if (outerrelids) + allleftrelids = bms_union(leftrelids, outerrelids); + else + allleftrelids = leftrelids; + result = NIL; foreach(cell, root->curOuterParams) { @@ -646,25 +654,60 @@ identify_current_nestloop_params(PlannerInfo *root, Relids leftrelids) bms_is_member(nlp->paramval->varno, leftrelids)) { Var *var = (Var *) nlp->paramval; + RelOptInfo *rel = root->simple_rel_array[var->varno]; root->curOuterParams = foreach_delete_current(root->curOuterParams, cell); - var->varnullingrels = bms_intersect(var->varnullingrels, + var->varnullingrels = bms_intersect(rel->nulling_relids, leftrelids); result = lappend(result, nlp); } - else if (IsA(nlp->paramval, PlaceHolderVar) && - bms_is_subset(find_placeholder_info(root, - (PlaceHolderVar *) nlp->paramval)->ph_eval_at, - leftrelids)) + else if (IsA(nlp->paramval, PlaceHolderVar)) { PlaceHolderVar *phv = (PlaceHolderVar *) nlp->paramval; + PlaceHolderInfo *phinfo = find_placeholder_info(root, phv); + Relids eval_at = phinfo->ph_eval_at; - root->curOuterParams = foreach_delete_current(root->curOuterParams, - cell); - phv->phnullingrels = bms_intersect(phv->phnullingrels, - leftrelids); - result = lappend(result, nlp); + if (bms_is_subset(eval_at, allleftrelids) && + bms_overlap(eval_at, leftrelids)) + { + root->curOuterParams = foreach_delete_current(root->curOuterParams, + cell); + + /* + * Deal with an edge case: if the PHV was pulled up out of a + * subquery and it contains a subquery that was originally + * pushed down from this query level, then that will still be + * represented as a SubLink, because SS_process_sublinks won't + * recurse into outer PHVs, so it didn't get transformed + * during expression preprocessing in the subquery. We need a + * version of the PHV that has a SubPlan, which we can get + * from the current query level's placeholder_list. This is + * quite grotty of course, but dealing with it earlier in the + * handling of subplan params would be just as grotty, and it + * might end up being a waste of cycles if we don't decide to + * treat the PHV as a NestLoopParam. (Perhaps that whole + * mechanism should be redesigned someday, but today is not + * that day.) + */ + if (root->parse->hasSubLinks) + { + phv = copyObject(phinfo->ph_var); + + /* + * The ph_var will have empty nullingrels, but that + * doesn't matter since we're about to overwrite + * phv->phnullingrels. Other fields should be OK already. 
+ */ + nlp->paramval = (Var *) phv; + } + + phv->phnullingrels = + bms_intersect(get_placeholder_nulling_relids(root, phinfo), + leftrelids); + + result = lappend(result, nlp); + } } } return result; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index e0192d4a491d2..233495c219e67 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -16,6 +16,8 @@ #include <math.h> +#include "access/htup_details.h" +#include "executor/nodeSetOp.h" #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/extensible.h" @@ -46,7 +48,6 @@ typedef enum */ #define STD_FUZZ_FACTOR 1.01 -static List *translate_sub_tlist(List *tlist, int relid); static int append_total_cost_compare(const ListCell *a, const ListCell *b); static int append_startup_cost_compare(const ListCell *a, const ListCell *b); static List *reparameterize_pathlist_by_child(PlannerInfo *root, @@ -68,6 +69,12 @@ static bool pathlist_is_reparameterizable_by_child(List *pathlist, int compare_path_costs(Path *path1, Path *path2, CostSelector criterion) { + Cost startup_cost1 = path1->startup_cost; + Cost startup_cost2 = path2->startup_cost; + Cost total_cost1 = path1->total_cost; + Cost total_cost2 = path2->total_cost; + Cost extra_cost = tempbuf_flush_extra_cost(); + /* Number of disabled nodes, if different, trumps all else. */ if (unlikely(path1->disabled_nodes != path2->disabled_nodes)) { @@ -77,35 +84,50 @@ compare_path_costs(Path *path1, Path *path2, CostSelector criterion) return +1; } + /* + * Add the extra cost of flushing temporary buffers, for the duration + * of this comparison only. + */ + if (path1->parallel_safe == NEEDS_TEMP_FLUSH) + { + startup_cost1 += extra_cost; + total_cost1 += extra_cost; + } + if (path2->parallel_safe == NEEDS_TEMP_FLUSH) + { + startup_cost2 += extra_cost; + total_cost2 += extra_cost; + } + if (criterion == STARTUP_COST) { - if (path1->startup_cost < path2->startup_cost) + if (startup_cost1 < startup_cost2) return -1; - if (path1->startup_cost > path2->startup_cost) + if (startup_cost1 > startup_cost2) return +1; /* * If paths have the same startup cost (not at all unlikely), order * them by total cost. */ - if (path1->total_cost < path2->total_cost) + if (total_cost1 < total_cost2) return -1; - if (path1->total_cost > path2->total_cost) + if (total_cost1 > total_cost2) return +1; } else { - if (path1->total_cost < path2->total_cost) + if (total_cost1 < total_cost2) return -1; - if (path1->total_cost > path2->total_cost) + if (total_cost1 > total_cost2) return +1; /* * If paths have the same total cost, order them by startup cost. 
*/ - if (path1->startup_cost < path2->startup_cost) + if (startup_cost1 < startup_cost2) return -1; - if (path1->startup_cost > path2->startup_cost) + if (startup_cost1 > startup_cost2) return +1; } return 0; @@ -381,7 +403,6 @@ set_cheapest(RelOptInfo *parent_rel) parent_rel->cheapest_startup_path = cheapest_startup_path; parent_rel->cheapest_total_path = cheapest_total_path; - parent_rel->cheapest_unique_path = NULL; /* computed only if needed */ parent_rel->cheapest_parameterized_paths = parameterized_paths; } @@ -969,6 +990,17 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, return true; } +static inline ParallelSafe +parallel_safety(RelOptInfo *rel) +{ + if (!rel->consider_parallel) + return PARALLEL_UNSAFE; + + if (rel->needs_temp_safety) + return NEEDS_TEMP_FLUSH; + + return PARALLEL_SAFE; +} /***************************************************************************** * PATH NODE CREATION ROUTINES @@ -991,7 +1023,7 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->parallel_aware = (parallel_workers > 0); - pathnode->parallel_safe = rel->consider_parallel; + pathnode->parallel_safe = parallel_safety(rel); pathnode->parallel_workers = parallel_workers; pathnode->pathkeys = NIL; /* seqscan has unordered result */ @@ -1015,7 +1047,7 @@ create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->parallel_aware = false; - pathnode->parallel_safe = rel->consider_parallel; + pathnode->parallel_safe = parallel_safety(rel); pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* samplescan has unordered result */ @@ -1067,7 +1099,7 @@ create_index_path(PlannerInfo *root, pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = pathkeys; @@ -1110,7 +1142,7 @@ create_bitmap_heap_path(PlannerInfo *root, pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->path.parallel_aware = (parallel_degree > 0); - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = parallel_degree; pathnode->path.pathkeys = NIL; /* always unordered */ @@ -1162,7 +1194,7 @@ create_bitmap_and_path(PlannerInfo *root, * without actually iterating over the list of children. */ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = NIL; /* always unordered */ @@ -1214,7 +1246,7 @@ create_bitmap_or_path(PlannerInfo *root, * without actually iterating over the list of children. 
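The constructors below combine safety levels with Min() and test them for truth against PARALLEL_UNSAFE, which presumes that ParallelSafe is an ordered enum along these lines (its actual declaration is elsewhere in the patch, not in this excerpt):

typedef enum ParallelSafe
{
	PARALLEL_UNSAFE = 0,		/* plays the role of the old "false" */
	NEEDS_TEMP_FLUSH,			/* safe only after flushing temp buffers */
	PARALLEL_SAFE,				/* plays the role of the old "true" */
} ParallelSafe;

/*
 * With this ordering, Min(a, b) yields the more restrictive of two levels,
 * mirroring the boolean AND it replaces in the path constructors.
 */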
*/ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = NIL; /* always unordered */ @@ -1243,7 +1275,7 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals, pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = NIL; /* always unordered */ @@ -1262,7 +1294,8 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals, */ TidRangePath * create_tidrangescan_path(PlannerInfo *root, RelOptInfo *rel, - List *tidrangequals, Relids required_outer) + List *tidrangequals, Relids required_outer, + int parallel_workers) { TidRangePath *pathnode = makeNode(TidRangePath); @@ -1271,9 +1304,9 @@ create_tidrangescan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->path.pathtarget = rel->reltarget; pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); - pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; - pathnode->path.parallel_workers = 0; + pathnode->path.parallel_aware = (parallel_workers > 0); + pathnode->path.parallel_safe = parallel_safety(rel); + pathnode->path.parallel_workers = parallel_workers; pathnode->path.pathkeys = NIL; /* always unordered */ pathnode->tidrangequals = tidrangequals; @@ -1333,7 +1366,7 @@ create_append_path(PlannerInfo *root, required_outer); pathnode->path.parallel_aware = parallel_aware; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = parallel_workers; pathnode->path.pathkeys = pathkeys; @@ -1374,8 +1407,8 @@ create_append_path(PlannerInfo *root, { Path *subpath = (Path *) lfirst(l); - pathnode->path.parallel_safe = pathnode->path.parallel_safe && - subpath->parallel_safe; + pathnode->path.parallel_safe = Min(pathnode->path.parallel_safe, + subpath->parallel_safe); /* All child paths must have same parameterization */ Assert(bms_equal(PATH_REQ_OUTER(subpath), required_outer)); @@ -1404,12 +1437,12 @@ create_append_path(PlannerInfo *root, pathnode->path.total_cost = child->total_cost; } else - cost_append(pathnode); + cost_append(pathnode, root); /* Must do this last, else cost_append complains */ pathnode->path.pathkeys = child->pathkeys; } else - cost_append(pathnode); + cost_append(pathnode, root); /* If the caller provided a row estimate, override the computed value. 
*/ if (rows >= 0) @@ -1491,7 +1524,7 @@ create_merge_append_path(PlannerInfo *root, pathnode->path.pathtarget = rel->reltarget; pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = pathkeys; pathnode->subpaths = subpaths; @@ -1515,40 +1548,63 @@ create_merge_append_path(PlannerInfo *root, foreach(l, subpaths) { Path *subpath = (Path *) lfirst(l); + int presorted_keys; + Path sort_path; /* dummy for result of + * cost_sort/cost_incremental_sort */ /* All child paths should be unparameterized */ Assert(bms_is_empty(PATH_REQ_OUTER(subpath))); pathnode->path.rows += subpath->rows; - pathnode->path.parallel_safe = pathnode->path.parallel_safe && - subpath->parallel_safe; + pathnode->path.parallel_safe = Min(pathnode->path.parallel_safe, + subpath->parallel_safe); - if (pathkeys_contained_in(pathkeys, subpath->pathkeys)) - { - /* Subpath is adequately ordered, we won't need to sort it */ - input_disabled_nodes += subpath->disabled_nodes; - input_startup_cost += subpath->startup_cost; - input_total_cost += subpath->total_cost; - } - else + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - /* We'll need to insert a Sort node, so include cost for that */ - Path sort_path; /* dummy for result of cost_sort */ + /* + * We'll need to insert a Sort node, so include costs for that. We + * choose to use incremental sort if it is enabled and there are + * presorted keys; otherwise we use full sort. + * + * We can use the parent's LIMIT if any, since we certainly won't + * pull more than that many tuples from any child. + */ + if (enable_incremental_sort && presorted_keys > 0) + { + cost_incremental_sort(&sort_path, + root, + pathkeys, + presorted_keys, + subpath->disabled_nodes, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + pathnode->limit_tuples); + } + else + { + cost_sort(&sort_path, + root, + pathkeys, + subpath->disabled_nodes, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + pathnode->limit_tuples); + } - cost_sort(&sort_path, - root, - pathkeys, - subpath->disabled_nodes, - subpath->total_cost, - subpath->rows, - subpath->pathtarget->width, - 0.0, - work_mem, - pathnode->limit_tuples); - input_disabled_nodes += sort_path.disabled_nodes; - input_startup_cost += sort_path.startup_cost; - input_total_cost += sort_path.total_cost; + subpath = &sort_path; } + + input_disabled_nodes += subpath->disabled_nodes; + input_startup_cost += subpath->startup_cost; + input_total_cost += subpath->total_cost; } /* @@ -1593,7 +1649,7 @@ create_group_result_path(PlannerInfo *root, RelOptInfo *rel, pathnode->path.pathtarget = target; pathnode->path.param_info = NULL; /* there are no other rels... 
*/ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = NIL; pathnode->quals = havingqual; @@ -1642,8 +1698,7 @@ create_material_path(RelOptInfo *rel, Path *subpath) pathnode->path.pathtarget = rel->reltarget; pathnode->path.param_info = subpath->param_info; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = Min(parallel_safety(rel), subpath->parallel_safe); pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->path.pathkeys = subpath->pathkeys; @@ -1666,7 +1721,7 @@ create_material_path(RelOptInfo *rel, Path *subpath) MemoizePath * create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, List *param_exprs, List *hash_operators, - bool singlerow, bool binary_mode, double calls) + bool singlerow, bool binary_mode, Cardinality est_calls) { MemoizePath *pathnode = makeNode(MemoizePath); @@ -1677,8 +1732,8 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->path.pathtarget = rel->reltarget; pathnode->path.param_info = subpath->param_info; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = Min(parallel_safety(rel), + subpath->parallel_safe); pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->path.pathkeys = subpath->pathkeys; @@ -1687,7 +1742,6 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->param_exprs = param_exprs; pathnode->singlerow = singlerow; pathnode->binary_mode = binary_mode; - pathnode->calls = clamp_row_est(calls); /* * For now we set est_entries to 0. cost_memoize_rescan() does all the @@ -1697,6 +1751,12 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, */ pathnode->est_entries = 0; + pathnode->est_calls = clamp_row_est(est_calls); + + /* These will also be set later in cost_memoize_rescan() */ + pathnode->est_unique_keys = 0.0; + pathnode->est_hit_ratio = 0.0; + /* we should not generate this path type when enable_memoize=false */ Assert(enable_memoize); pathnode->path.disabled_nodes = subpath->disabled_nodes; @@ -1712,246 +1772,6 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, return pathnode; } -/* - * create_unique_path - * Creates a path representing elimination of distinct rows from the - * input data. Distinct-ness is defined according to the needs of the - * semijoin represented by sjinfo. If it is not possible to identify - * how to make the data unique, NULL is returned. - * - * If used at all, this is likely to be called repeatedly on the same rel; - * and the input subpath should always be the same (the cheapest_total path - * for the rel). So we cache the result. - */ -UniquePath * -create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, - SpecialJoinInfo *sjinfo) -{ - UniquePath *pathnode; - Path sort_path; /* dummy for result of cost_sort */ - Path agg_path; /* dummy for result of cost_agg */ - MemoryContext oldcontext; - int numCols; - - /* Caller made a mistake if subpath isn't cheapest_total ... */ - Assert(subpath == rel->cheapest_total_path); - Assert(subpath->parent == rel); - /* ... 
or if SpecialJoinInfo is the wrong one */ - Assert(sjinfo->jointype == JOIN_SEMI); - Assert(bms_equal(rel->relids, sjinfo->syn_righthand)); - - /* If result already cached, return it */ - if (rel->cheapest_unique_path) - return (UniquePath *) rel->cheapest_unique_path; - - /* If it's not possible to unique-ify, return NULL */ - if (!(sjinfo->semi_can_btree || sjinfo->semi_can_hash)) - return NULL; - - /* - * When called during GEQO join planning, we are in a short-lived memory - * context. We must make sure that the path and any subsidiary data - * structures created for a baserel survive the GEQO cycle, else the - * baserel is trashed for future GEQO cycles. On the other hand, when we - * are creating those for a joinrel during GEQO, we don't want them to - * clutter the main planning context. Upshot is that the best solution is - * to explicitly allocate memory in the same context the given RelOptInfo - * is in. - */ - oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(rel)); - - pathnode = makeNode(UniquePath); - - pathnode->path.pathtype = T_Unique; - pathnode->path.parent = rel; - pathnode->path.pathtarget = rel->reltarget; - pathnode->path.param_info = subpath->param_info; - pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; - pathnode->path.parallel_workers = subpath->parallel_workers; - - /* - * Assume the output is unsorted, since we don't necessarily have pathkeys - * to represent it. (This might get overridden below.) - */ - pathnode->path.pathkeys = NIL; - - pathnode->subpath = subpath; - - /* - * Under GEQO and when planning child joins, the sjinfo might be - * short-lived, so we'd better make copies of data structures we extract - * from it. - */ - pathnode->in_operators = copyObject(sjinfo->semi_operators); - pathnode->uniq_exprs = copyObject(sjinfo->semi_rhs_exprs); - - /* - * If the input is a relation and it has a unique index that proves the - * semi_rhs_exprs are unique, then we don't need to do anything. Note - * that relation_has_unique_index_for automatically considers restriction - * clauses for the rel, as well. - */ - if (rel->rtekind == RTE_RELATION && sjinfo->semi_can_btree && - relation_has_unique_index_for(root, rel, NIL, - sjinfo->semi_rhs_exprs, - sjinfo->semi_operators)) - { - pathnode->umethod = UNIQUE_PATH_NOOP; - pathnode->path.rows = rel->rows; - pathnode->path.disabled_nodes = subpath->disabled_nodes; - pathnode->path.startup_cost = subpath->startup_cost; - pathnode->path.total_cost = subpath->total_cost; - pathnode->path.pathkeys = subpath->pathkeys; - - rel->cheapest_unique_path = (Path *) pathnode; - - MemoryContextSwitchTo(oldcontext); - - return pathnode; - } - - /* - * If the input is a subquery whose output must be unique already, then we - * don't need to do anything. The test for uniqueness has to consider - * exactly which columns we are extracting; for example "SELECT DISTINCT - * x,y" doesn't guarantee that x alone is distinct. So we cannot check for - * this optimization unless semi_rhs_exprs consists only of simple Vars - * referencing subquery outputs. (Possibly we could do something with - * expressions in the subquery outputs, too, but for now keep it simple.) 
- */ - if (rel->rtekind == RTE_SUBQUERY) - { - RangeTblEntry *rte = planner_rt_fetch(rel->relid, root); - - if (query_supports_distinctness(rte->subquery)) - { - List *sub_tlist_colnos; - - sub_tlist_colnos = translate_sub_tlist(sjinfo->semi_rhs_exprs, - rel->relid); - - if (sub_tlist_colnos && - query_is_distinct_for(rte->subquery, - sub_tlist_colnos, - sjinfo->semi_operators)) - { - pathnode->umethod = UNIQUE_PATH_NOOP; - pathnode->path.rows = rel->rows; - pathnode->path.disabled_nodes = subpath->disabled_nodes; - pathnode->path.startup_cost = subpath->startup_cost; - pathnode->path.total_cost = subpath->total_cost; - pathnode->path.pathkeys = subpath->pathkeys; - - rel->cheapest_unique_path = (Path *) pathnode; - - MemoryContextSwitchTo(oldcontext); - - return pathnode; - } - } - } - - /* Estimate number of output rows */ - pathnode->path.rows = estimate_num_groups(root, - sjinfo->semi_rhs_exprs, - rel->rows, - NULL, - NULL); - numCols = list_length(sjinfo->semi_rhs_exprs); - - if (sjinfo->semi_can_btree) - { - /* - * Estimate cost for sort+unique implementation - */ - cost_sort(&sort_path, root, NIL, - subpath->disabled_nodes, - subpath->total_cost, - rel->rows, - subpath->pathtarget->width, - 0.0, - work_mem, - -1.0); - - /* - * Charge one cpu_operator_cost per comparison per input tuple. We - * assume all columns get compared at most of the tuples. (XXX - * probably this is an overestimate.) This should agree with - * create_upper_unique_path. - */ - sort_path.total_cost += cpu_operator_cost * rel->rows * numCols; - } - - if (sjinfo->semi_can_hash) - { - /* - * Estimate the overhead per hashtable entry at 64 bytes (same as in - * planner.c). - */ - int hashentrysize = subpath->pathtarget->width + 64; - - if (hashentrysize * pathnode->path.rows > get_hash_memory_limit()) - { - /* - * We should not try to hash. Hack the SpecialJoinInfo to - * remember this, in case we come through here again. 
- */ - sjinfo->semi_can_hash = false; - } - else - cost_agg(&agg_path, root, - AGG_HASHED, NULL, - numCols, pathnode->path.rows, - NIL, - subpath->disabled_nodes, - subpath->startup_cost, - subpath->total_cost, - rel->rows, - subpath->pathtarget->width); - } - - if (sjinfo->semi_can_btree && sjinfo->semi_can_hash) - { - if (agg_path.disabled_nodes < sort_path.disabled_nodes || - (agg_path.disabled_nodes == sort_path.disabled_nodes && - agg_path.total_cost < sort_path.total_cost)) - pathnode->umethod = UNIQUE_PATH_HASH; - else - pathnode->umethod = UNIQUE_PATH_SORT; - } - else if (sjinfo->semi_can_btree) - pathnode->umethod = UNIQUE_PATH_SORT; - else if (sjinfo->semi_can_hash) - pathnode->umethod = UNIQUE_PATH_HASH; - else - { - /* we can get here only if we abandoned hashing above */ - MemoryContextSwitchTo(oldcontext); - return NULL; - } - - if (pathnode->umethod == UNIQUE_PATH_HASH) - { - pathnode->path.disabled_nodes = agg_path.disabled_nodes; - pathnode->path.startup_cost = agg_path.startup_cost; - pathnode->path.total_cost = agg_path.total_cost; - } - else - { - pathnode->path.disabled_nodes = sort_path.disabled_nodes; - pathnode->path.startup_cost = sort_path.startup_cost; - pathnode->path.total_cost = sort_path.total_cost; - } - - rel->cheapest_unique_path = (Path *) pathnode; - - MemoryContextSwitchTo(oldcontext); - - return pathnode; -} - /* * create_gather_merge_path * @@ -2003,36 +1823,6 @@ create_gather_merge_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, return pathnode; } -/* - * translate_sub_tlist - get subquery column numbers represented by tlist - * - * The given targetlist usually contains only Vars referencing the given relid. - * Extract their varattnos (ie, the column numbers of the subquery) and return - * as an integer List. - * - * If any of the tlist items is not a simple Var, we cannot determine whether - * the subquery's uniqueness condition (if any) matches ours, so punt and - * return NIL. 
- */ -static List * -translate_sub_tlist(List *tlist, int relid) -{ - List *result = NIL; - ListCell *l; - - foreach(l, tlist) - { - Var *var = (Var *) lfirst(l); - - if (!var || !IsA(var, Var) || - var->varno != relid) - return NIL; /* punt */ - - result = lappend_int(result, var->varattno); - } - return result; -} - /* * create_gather_path * Creates a path corresponding to a gather scan, returning the @@ -2054,7 +1844,7 @@ create_gather_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = false; + pathnode->path.parallel_safe = PARALLEL_UNSAFE; pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = NIL; /* Gather has unordered result */ @@ -2097,8 +1887,8 @@ create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = Min(parallel_safety(rel), + subpath->parallel_safe); pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->path.pathkeys = pathkeys; pathnode->subpath = subpath; @@ -2126,7 +1916,7 @@ create_functionscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->parallel_aware = false; - pathnode->parallel_safe = rel->consider_parallel; + pathnode->parallel_safe = parallel_safety(rel); pathnode->parallel_workers = 0; pathnode->pathkeys = pathkeys; @@ -2152,7 +1942,7 @@ create_tablefuncscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->parallel_aware = false; - pathnode->parallel_safe = rel->consider_parallel; + pathnode->parallel_safe = parallel_safety(rel); pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* result is always unordered */ @@ -2178,7 +1968,7 @@ create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->parallel_aware = false; - pathnode->parallel_safe = rel->consider_parallel; + pathnode->parallel_safe = parallel_safety(rel); pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* result is always unordered */ @@ -2204,7 +1994,7 @@ create_ctescan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->parallel_aware = false; - pathnode->parallel_safe = rel->consider_parallel; + pathnode->parallel_safe = parallel_safety(rel); pathnode->parallel_workers = 0; pathnode->pathkeys = pathkeys; @@ -2230,7 +2020,7 @@ create_namedtuplestorescan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->parallel_aware = false; - pathnode->parallel_safe = rel->consider_parallel; + pathnode->parallel_safe = parallel_safety(rel); pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* result is always unordered */ @@ -2256,7 +2046,7 @@ create_resultscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->parallel_aware = false; - pathnode->parallel_safe = rel->consider_parallel; + pathnode->parallel_safe = parallel_safety(rel); pathnode->parallel_workers = 0; pathnode->pathkeys = 
NIL; /* result is always unordered */ @@ -2282,7 +2072,7 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->parallel_aware = false; - pathnode->parallel_safe = rel->consider_parallel; + pathnode->parallel_safe = parallel_safety(rel); pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* result is always unordered */ @@ -2325,7 +2115,7 @@ create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = 0; pathnode->path.rows = rows; pathnode->path.disabled_nodes = disabled_nodes; @@ -2379,7 +2169,7 @@ create_foreign_join_path(PlannerInfo *root, RelOptInfo *rel, pathnode->path.pathtarget = target ? target : rel->reltarget; pathnode->path.param_info = NULL; /* XXX see above */ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = 0; pathnode->path.rows = rows; pathnode->path.disabled_nodes = disabled_nodes; @@ -2428,7 +2218,7 @@ create_foreign_upper_path(PlannerInfo *root, RelOptInfo *rel, pathnode->path.pathtarget = target ? target : rel->reltarget; pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = parallel_safety(rel); pathnode->path.parallel_workers = 0; pathnode->path.rows = rows; pathnode->path.disabled_nodes = disabled_nodes; @@ -2592,8 +2382,9 @@ create_nestloop_path(PlannerInfo *root, required_outer, &restrict_clauses); pathnode->jpath.path.parallel_aware = false; - pathnode->jpath.path.parallel_safe = joinrel->consider_parallel && - outer_path->parallel_safe && inner_path->parallel_safe; + pathnode->jpath.path.parallel_safe = Min(Min(parallel_safety(joinrel), + outer_path->parallel_safe), + inner_path->parallel_safe); /* This is a foolish way to estimate parallel_workers, but for now... */ pathnode->jpath.path.parallel_workers = outer_path->parallel_workers; pathnode->jpath.path.pathkeys = pathkeys; @@ -2658,8 +2449,9 @@ create_mergejoin_path(PlannerInfo *root, required_outer, &restrict_clauses); pathnode->jpath.path.parallel_aware = false; - pathnode->jpath.path.parallel_safe = joinrel->consider_parallel && - outer_path->parallel_safe && inner_path->parallel_safe; + pathnode->jpath.path.parallel_safe = Min(Min(parallel_safety(joinrel), + outer_path->parallel_safe), + inner_path->parallel_safe); /* This is a foolish way to estimate parallel_workers, but for now... */ pathnode->jpath.path.parallel_workers = outer_path->parallel_workers; pathnode->jpath.path.pathkeys = pathkeys; @@ -2724,8 +2516,9 @@ create_hashjoin_path(PlannerInfo *root, &restrict_clauses); pathnode->jpath.path.parallel_aware = joinrel->consider_parallel && parallel_hash; - pathnode->jpath.path.parallel_safe = joinrel->consider_parallel && - outer_path->parallel_safe && inner_path->parallel_safe; + pathnode->jpath.path.parallel_safe = Min(Min(parallel_safety(joinrel), + outer_path->parallel_safe), + inner_path->parallel_safe); /* This is a foolish way to estimate parallel_workers, but for now... 
*/ pathnode->jpath.path.parallel_workers = outer_path->parallel_workers; @@ -2754,6 +2547,33 @@ create_hashjoin_path(PlannerInfo *root, return pathnode; } +static inline ParallelSafe +compute_parallel_safety(PlannerInfo *root, RelOptInfo *rel, + PathTarget *target, Path *subpath) +{ + ParallelSafe level = PARALLEL_SAFE; + bool needs_temp_flush = false; + + if (!rel->consider_parallel) + return PARALLEL_UNSAFE; + + if (rel->needs_temp_safety) + level = NEEDS_TEMP_FLUSH; + + if (subpath) + level = Min(level, subpath->parallel_safe); + + if (target) + { + if (!is_parallel_safe(root, (Node *) target->exprs, &needs_temp_flush)) + return PARALLEL_UNSAFE; + + if (needs_temp_flush) + level = Min(level, NEEDS_TEMP_FLUSH); + } + return level; +} + /* * create_projection_path * Creates a pathnode that represents performing a projection. @@ -2790,12 +2610,11 @@ create_projection_path(PlannerInfo *root, pathnode->path.pathtype = T_Result; pathnode->path.parent = rel; pathnode->path.pathtarget = target; - /* For now, assume we are above any joins, so no parameterization */ - pathnode->path.param_info = NULL; + pathnode->path.param_info = subpath->param_info; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe && - is_parallel_safe(root, (Node *) target->exprs); + + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, target, subpath); + pathnode->path.parallel_workers = subpath->parallel_workers; /* Projection does not change the sort order */ pathnode->path.pathkeys = subpath->pathkeys; @@ -2903,9 +2722,12 @@ apply_projection_to_path(PlannerInfo *root, * arrange for the subpath to return the required target list so that * workers can help project. But if there is something that is not * parallel-safe in the target expressions, then we can't. + * + * XXX: don't need flag here because create_projection_path will check the + * target safety anyway. */ if ((IsA(path, GatherPath) || IsA(path, GatherMergePath)) && - is_parallel_safe(root, (Node *) target->exprs)) + is_parallel_safe(root, (Node *) target->exprs, NULL)) { /* * We always use create_projection_path here, even if the subpath is @@ -2939,14 +2761,14 @@ apply_projection_to_path(PlannerInfo *root, } } else if (path->parallel_safe && - !is_parallel_safe(root, (Node *) target->exprs)) + !is_parallel_safe(root, (Node *) target->exprs, NULL)) { /* * We're inserting a parallel-restricted target list into a path * currently marked parallel-safe, so we have to mark it as no longer * safe. */ - path->parallel_safe = false; + path->parallel_safe = PARALLEL_UNSAFE; } return path; @@ -2977,9 +2799,7 @@ create_set_projection_path(PlannerInfo *root, /* For now, assume we are above any joins, so no parameterization */ pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe && - is_parallel_safe(root, (Node *) target->exprs); + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, target, subpath); pathnode->path.parallel_workers = subpath->parallel_workers; /* Projection does not change the sort order XXX? 
*/ pathnode->path.pathkeys = subpath->pathkeys; @@ -3046,11 +2866,9 @@ create_incremental_sort_path(PlannerInfo *root, pathnode->path.parent = rel; /* Sort doesn't project, so use source path's pathtarget */ pathnode->path.pathtarget = subpath->pathtarget; - /* For now, assume we are above any joins, so no parameterization */ - pathnode->path.param_info = NULL; + pathnode->path.param_info = subpath->param_info; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, NULL, subpath); pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->path.pathkeys = pathkeys; @@ -3094,11 +2912,9 @@ create_sort_path(PlannerInfo *root, pathnode->path.parent = rel; /* Sort doesn't project, so use source path's pathtarget */ pathnode->path.pathtarget = subpath->pathtarget; - /* For now, assume we are above any joins, so no parameterization */ - pathnode->path.param_info = NULL; + pathnode->path.param_info = subpath->param_info; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, NULL, subpath); pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->path.pathkeys = pathkeys; @@ -3143,8 +2959,7 @@ create_group_path(PlannerInfo *root, /* For now, assume we are above any joins, so no parameterization */ pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, NULL, subpath); pathnode->path.parallel_workers = subpath->parallel_workers; /* Group doesn't change sort ordering */ pathnode->path.pathkeys = subpath->pathkeys; @@ -3171,13 +2986,10 @@ create_group_path(PlannerInfo *root, } /* - * create_upper_unique_path + * create_unique_path * Creates a pathnode that represents performing an explicit Unique step * on presorted input. * - * This produces a Unique plan node, but the use-case is so different from - * create_unique_path that it doesn't seem worth trying to merge the two. 
- * * 'rel' is the parent relation associated with the result * 'subpath' is the path representing the source of data * 'numCols' is the number of grouping columns @@ -3186,24 +2998,22 @@ create_group_path(PlannerInfo *root, * The input path must be sorted on the grouping columns, plus possibly * additional columns; so the first numCols pathkeys are the grouping columns */ -UpperUniquePath * -create_upper_unique_path(PlannerInfo *root, - RelOptInfo *rel, - Path *subpath, - int numCols, - double numGroups) +UniquePath * +create_unique_path(PlannerInfo *root, + RelOptInfo *rel, + Path *subpath, + int numCols, + double numGroups) { - UpperUniquePath *pathnode = makeNode(UpperUniquePath); + UniquePath *pathnode = makeNode(UniquePath); pathnode->path.pathtype = T_Unique; pathnode->path.parent = rel; /* Unique doesn't project, so use source path's pathtarget */ pathnode->path.pathtarget = subpath->pathtarget; - /* For now, assume we are above any joins, so no parameterization */ - pathnode->path.param_info = NULL; + pathnode->path.param_info = subpath->param_info; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, NULL, subpath); pathnode->path.parallel_workers = subpath->parallel_workers; /* Unique doesn't change the input ordering */ pathnode->path.pathkeys = subpath->pathkeys; @@ -3256,11 +3066,9 @@ create_agg_path(PlannerInfo *root, pathnode->path.pathtype = T_Agg; pathnode->path.parent = rel; pathnode->path.pathtarget = target; - /* For now, assume we are above any joins, so no parameterization */ - pathnode->path.param_info = NULL; + pathnode->path.param_info = subpath->param_info; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, NULL, subpath); pathnode->path.parallel_workers = subpath->parallel_workers; if (aggstrategy == AGG_SORTED) @@ -3343,8 +3151,7 @@ create_groupingsets_path(PlannerInfo *root, pathnode->path.pathtarget = target; pathnode->path.param_info = subpath->param_info; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, NULL, subpath); pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->subpath = subpath; @@ -3504,7 +3311,7 @@ create_minmaxagg_path(PlannerInfo *root, /* For now, assume we are above any joins, so no parameterization */ pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = true; /* might change below */ + pathnode->path.parallel_safe = PARALLEL_SAFE; /* might change below */ pathnode->path.parallel_workers = 0; /* Result is one unordered row */ pathnode->path.rows = 1; @@ -3522,7 +3329,7 @@ create_minmaxagg_path(PlannerInfo *root, initplan_disabled_nodes += mminfo->path->disabled_nodes; initplan_cost += mminfo->pathcost; if (!mminfo->path->parallel_safe) - pathnode->path.parallel_safe = false; + pathnode->path.parallel_safe = PARALLEL_UNSAFE; } /* add tlist eval cost for each output row, plus cpu_tuple_cost */ @@ -3550,10 +3357,16 @@ create_minmaxagg_path(PlannerInfo *root, * we are in a subquery then it can be useful for the outer query to know * that this one is parallel-safe.) 
*/ - if (pathnode->path.parallel_safe) - pathnode->path.parallel_safe = - is_parallel_safe(root, (Node *) target->exprs) && - is_parallel_safe(root, (Node *) quals); + if (pathnode->path.parallel_safe > PARALLEL_UNSAFE) + { + bool needs_temp_flush = false; + + if (!is_parallel_safe(root, (Node *) target->exprs, &needs_temp_flush) || + !is_parallel_safe(root, (Node *) quals, &needs_temp_flush)) + pathnode->path.parallel_safe = PARALLEL_UNSAFE; + else if (needs_temp_flush) + pathnode->path.parallel_safe = NEEDS_TEMP_FLUSH; + } return pathnode; } @@ -3598,8 +3411,7 @@ create_windowagg_path(PlannerInfo *root, /* For now, assume we are above any joins, so no parameterization */ pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, NULL, subpath); pathnode->path.parallel_workers = subpath->parallel_workers; /* WindowAgg preserves the input sort order */ pathnode->path.pathkeys = subpath->pathkeys; @@ -3668,8 +3480,7 @@ create_setop_path(PlannerInfo *root, /* For now, assume we are above any joins, so no parameterization */ pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - leftpath->parallel_safe && rightpath->parallel_safe; + pathnode->path.parallel_safe = Min(compute_parallel_safety(root, rel, NULL, leftpath), rightpath->parallel_safe); pathnode->path.parallel_workers = leftpath->parallel_workers + rightpath->parallel_workers; /* SetOp preserves the input sort order if in sort mode */ @@ -3712,7 +3523,7 @@ create_setop_path(PlannerInfo *root, } else { - Size hashentrysize; + Size hashtablesize; /* * In hashed mode, we must read all the input before we can emit @@ -3741,11 +3552,12 @@ create_setop_path(PlannerInfo *root, /* * Also disable if it doesn't look like the hashtable will fit into - * hash_mem. + * hash_mem. (Note: reject on equality, to ensure that an estimate of + * SIZE_MAX disables hashing regardless of the hash_mem limit.) 
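+	 *
+	 * For illustration: if the numGroups estimate is large enough that the
+	 * size computation overflows, EstimateSetOpHashTableSpace() is assumed
+	 * to clamp its result to SIZE_MAX, and testing with '>=' rather than
+	 * '>' then disables hashing even in the corner case where
+	 * get_hash_memory_limit() itself returns SIZE_MAX.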
*/ - hashentrysize = MAXALIGN(leftpath->pathtarget->width) + - MAXALIGN(SizeofMinimalTupleHeader); - if (hashentrysize * numGroups > get_hash_memory_limit()) + hashtablesize = EstimateSetOpHashTableSpace(numGroups, + leftpath->pathtarget->width); + if (hashtablesize >= get_hash_memory_limit()) pathnode->path.disabled_nodes++; } pathnode->path.rows = outputRows; @@ -3785,8 +3597,7 @@ create_recursiveunion_path(PlannerInfo *root, /* For now, assume we are above any joins, so no parameterization */ pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - leftpath->parallel_safe && rightpath->parallel_safe; + pathnode->path.parallel_safe = Min(compute_parallel_safety(root, rel, NULL, leftpath), rightpath->parallel_safe); /* Foolish, but we'll do it like joins for now: */ pathnode->path.parallel_workers = leftpath->parallel_workers; /* RecursiveUnion result is always unsorted */ @@ -3825,7 +3636,7 @@ create_lockrows_path(PlannerInfo *root, RelOptInfo *rel, /* For now, assume we are above any joins, so no parameterization */ pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = false; + pathnode->path.parallel_safe = PARALLEL_UNSAFE; pathnode->path.parallel_workers = 0; pathnode->path.rows = subpath->rows; @@ -3863,8 +3674,6 @@ create_lockrows_path(PlannerInfo *root, RelOptInfo *rel, * 'canSetTag' is true if we set the command tag/es_processed * 'nominalRelation' is the parent RT index for use of EXPLAIN * 'rootRelation' is the partitioned/inherited table root RTI, or 0 if none - * 'partColsUpdated' is true if any partitioning columns are being updated, - * either from the target relation or a descendent partitioned table. * 'resultRelations' is an integer list of actual RT indexes of target rel(s) * 'updateColnosLists' is a list of UPDATE target column number lists * (one sublist per rel); or NIL if not an UPDATE @@ -3881,7 +3690,6 @@ create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, CmdType operation, bool canSetTag, Index nominalRelation, Index rootRelation, - bool partColsUpdated, List *resultRelations, List *updateColnosLists, List *withCheckOptionLists, List *returningLists, @@ -3907,7 +3715,7 @@ create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, /* For now, assume we are above any joins, so no parameterization */ pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = false; + pathnode->path.parallel_safe = PARALLEL_UNSAFE; pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = NIL; @@ -3947,7 +3755,6 @@ create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, pathnode->canSetTag = canSetTag; pathnode->nominalRelation = nominalRelation; pathnode->rootRelation = rootRelation; - pathnode->partColsUpdated = partColsUpdated; pathnode->resultRelations = resultRelations; pathnode->updateColnosLists = updateColnosLists; pathnode->withCheckOptionLists = withCheckOptionLists; @@ -3994,8 +3801,7 @@ create_limit_path(PlannerInfo *root, RelOptInfo *rel, /* For now, assume we are above any joins, so no parameterization */ pathnode->path.param_info = NULL; pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel && - subpath->parallel_safe; + pathnode->path.parallel_safe = compute_parallel_safety(root, rel, NULL, subpath); pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->path.rows = subpath->rows; 
pathnode->path.disabled_nodes = subpath->disabled_nodes; @@ -4117,7 +3923,7 @@ reparameterize_path(PlannerInfo *root, Path *path, case T_SeqScan: return create_seqscan_path(root, rel, required_outer, 0); case T_SampleScan: - return (Path *) create_samplescan_path(root, rel, required_outer); + return create_samplescan_path(root, rel, required_outer); case T_IndexScan: case T_IndexOnlyScan: { @@ -4236,7 +4042,7 @@ reparameterize_path(PlannerInfo *root, Path *path, mpath->hash_operators, mpath->singlerow, mpath->binary_mode, - mpath->calls); + mpath->est_calls); } default: break; diff --git a/src/backend/optimizer/util/placeholder.c b/src/backend/optimizer/util/placeholder.c index 41a4c81e94a75..e1cd00a72fbf7 100644 --- a/src/backend/optimizer/util/placeholder.c +++ b/src/backend/optimizer/util/placeholder.c @@ -545,3 +545,43 @@ contain_placeholder_references_walker(Node *node, return expression_tree_walker(node, contain_placeholder_references_walker, context); } + +/* + * Compute the set of outer-join relids that can null a placeholder. + * + * This is analogous to RelOptInfo.nulling_relids for Vars, but we compute it + * on-the-fly rather than saving it somewhere. Currently the value is needed + * at most once per query, so there's little value in doing otherwise. If it + * ever gains more widespread use, perhaps we should cache the result in + * PlaceHolderInfo. + */ +Relids +get_placeholder_nulling_relids(PlannerInfo *root, PlaceHolderInfo *phinfo) +{ + Relids result = NULL; + int relid = -1; + + /* + * Form the union of all potential nulling OJs for each baserel included + * in ph_eval_at. + */ + while ((relid = bms_next_member(phinfo->ph_eval_at, relid)) > 0) + { + RelOptInfo *rel = root->simple_rel_array[relid]; + + /* ignore the RTE_GROUP RTE */ + if (relid == root->group_rtindex) + continue; + + if (rel == NULL) /* must be an outer join */ + { + Assert(bms_is_member(relid, root->outer_join_rels)); + continue; + } + result = bms_add_members(result, rel->nulling_relids); + } + + /* Now remove any OJs already included in ph_eval_at, and we're done. 
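+	 * A worked example (the relids are hypothetical): with ph_eval_at =
+	 * {t1, oj2} and t1->nulling_relids = {oj2, oj3}, the union formed by
+	 * the loop above is {oj2, oj3}; deleting the members of ph_eval_at
+	 * leaves {oj3}, i.e. only an outer join above the placeholder's
+	 * evaluation level can null it.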
*/ + result = bms_del_members(result, phinfo->ph_eval_at); + return result; +} diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 59233b647302d..bf45c355b77fc 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -42,6 +42,7 @@ #include "parser/parse_relation.h" #include "parser/parsetree.h" #include "partitioning/partdesc.h" +#include "rewrite/rewriteHandler.h" #include "rewrite/rewriteManip.h" #include "statistics/statistics.h" #include "storage/bufmgr.h" @@ -59,6 +60,12 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; /* Hook for plugins to get control in get_relation_info() */ get_relation_info_hook_type get_relation_info_hook = NULL; +typedef struct NotnullHashEntry +{ + Oid relid; /* OID of the relation */ + Bitmapset *notnullattnums; /* attnums of NOT NULL columns */ +} NotnullHashEntry; + static void get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, Relation relation, bool inhparent); @@ -71,7 +78,8 @@ static List *get_relation_constraints(PlannerInfo *root, bool include_partition); static List *build_index_tlist(PlannerInfo *root, IndexOptInfo *index, Relation heapRelation); -static List *get_relation_statistics(RelOptInfo *rel, Relation relation); +static List *get_relation_statistics(PlannerInfo *root, RelOptInfo *rel, + Relation relation); static void set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, Relation relation); static PartitionScheme find_partition_scheme(PlannerInfo *root, @@ -172,27 +180,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * RangeTblEntry does get populated. */ if (!inhparent || relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - for (int i = 0; i < relation->rd_att->natts; i++) - { - CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); - - Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); - - if (attr->attnullability == ATTNULLABLE_VALID) - { - rel->notnullattnums = bms_add_member(rel->notnullattnums, - i + 1); - - /* - * Per RemoveAttributeById(), dropped columns will have their - * attnotnull unset, so we needn't check for dropped columns - * in the above condition. 
- */ - Assert(!attr->attisdropped); - } - } - } + rel->notnullattnums = find_relation_notnullatts(root, relationObjectId); /* * Estimate relation size --- unless it's an inheritance parent, in which @@ -291,11 +279,11 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->ncolumns = ncolumns = index->indnatts; info->nkeycolumns = nkeycolumns = index->indnkeyatts; - info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); - info->indexcollations = (Oid *) palloc(sizeof(Oid) * nkeycolumns); - info->opfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns); - info->opcintype = (Oid *) palloc(sizeof(Oid) * nkeycolumns); - info->canreturn = (bool *) palloc(sizeof(bool) * ncolumns); + info->indexkeys = palloc_array(int, ncolumns); + info->indexcollations = palloc_array(Oid, nkeycolumns); + info->opfamily = palloc_array(Oid, nkeycolumns); + info->opcintype = palloc_array(Oid, nkeycolumns); + info->canreturn = palloc_array(bool, ncolumns); for (i = 0; i < ncolumns; i++) { @@ -348,8 +336,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, Assert(amroutine->amcanorder); info->sortopfamily = info->opfamily; - info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); - info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); + info->reverse_sort = palloc_array(bool, nkeycolumns); + info->nulls_first = palloc_array(bool, nkeycolumns); for (i = 0; i < nkeycolumns; i++) { @@ -372,9 +360,9 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * corresponding btree opfamily for each opfamily of the * other index type. */ - info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns); - info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); - info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); + info->sortopfamily = palloc_array(Oid, nkeycolumns); + info->reverse_sort = palloc_array(bool, nkeycolumns); + info->nulls_first = palloc_array(bool, nkeycolumns); for (i = 0; i < nkeycolumns; i++) { @@ -441,13 +429,32 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. + * + * After fixing the varnos, we need to run the index expressions + * and predicate through const-simplification again, using a valid + * "root". This ensures that NullTest quals for Vars can be + * properly reduced. 
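+	 *
+	 * A hypothetical example: given "CREATE TABLE t (a int NOT NULL)"
+	 * and a partial index predicate "a IS NOT NULL", once the Vars
+	 * carry this rel's varno, eval_const_expressions() can consult the
+	 * column's not-null information and fold the NullTest to constant
+	 * true; with a stale varno the lookup would miss and the qual would
+	 * stay unsimplified.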
*/ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); - if (info->indexprs && varno != 1) - ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); - if (info->indpred && varno != 1) - ChangeVarNodes((Node *) info->indpred, 1, varno, 0); + if (info->indexprs) + { + if (varno != 1) + ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); + + info->indexprs = (List *) + eval_const_expressions(root, (Node *) info->indexprs); + } + if (info->indpred) + { + if (varno != 1) + ChangeVarNodes((Node *) info->indpred, 1, varno, 0); + + info->indpred = (List *) + eval_const_expressions(root, + (Node *) make_ands_explicit(info->indpred)); + info->indpred = make_ands_implicit((Expr *) info->indpred); + } /* Build targetlist using the completed indexprs data */ info->indextlist = build_index_tlist(root, info, relation); @@ -522,7 +529,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, rel->indexlist = indexinfos; - rel->statlist = get_relation_statistics(rel, relation); + rel->statlist = get_relation_statistics(root, rel, relation); /* Grab foreign-table info using the relcache, while we have it */ if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) @@ -683,6 +690,105 @@ get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, } } +/* + * get_relation_notnullatts - + * Retrieves column not-null constraint information for a given relation. + * + * We do this while we have the relcache entry open, and store the column + * not-null constraint information in a hash table based on the relation OID. + */ +void +get_relation_notnullatts(PlannerInfo *root, Relation relation) +{ + Oid relid = RelationGetRelid(relation); + NotnullHashEntry *hentry; + bool found; + Bitmapset *notnullattnums = NULL; + + /* bail out if the relation has no not-null constraints */ + if (relation->rd_att->constr == NULL || + !relation->rd_att->constr->has_not_null) + return; + + /* create the hash table if it hasn't been created yet */ + if (root->glob->rel_notnullatts_hash == NULL) + { + HTAB *hashtab; + HASHCTL hash_ctl; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(NotnullHashEntry); + hash_ctl.hcxt = CurrentMemoryContext; + + hashtab = hash_create("Relation NOT NULL attnums", + 64L, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + root->glob->rel_notnullatts_hash = hashtab; + } + + /* + * Create a hash entry for this relation OID, if we don't have one + * already. + */ + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_ENTER, + &found); + + /* bail out if a hash entry already exists for this relation OID */ + if (found) + return; + + /* collect the column not-null constraint information for this relation */ + for (int i = 0; i < relation->rd_att->natts; i++) + { + CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); + + Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); + + if (attr->attnullability == ATTNULLABLE_VALID) + { + notnullattnums = bms_add_member(notnullattnums, i + 1); + + /* + * Per RemoveAttributeById(), dropped columns will have their + * attnotnull unset, so we needn't check for dropped columns in + * the above condition. + */ + Assert(!attr->attisdropped); + } + } + + /* ... 
and initialize the new hash entry */ + hentry->notnullattnums = notnullattnums; +} + +/* + * find_relation_notnullatts - + * Searches the hash table and returns the column not-null constraint + * information for a given relation. + */ +Bitmapset * +find_relation_notnullatts(PlannerInfo *root, Oid relid) +{ + NotnullHashEntry *hentry; + bool found; + + if (root->glob->rel_notnullatts_hash == NULL) + return NULL; + + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_FIND, + &found); + if (!found) + return NULL; + + return hentry->notnullattnums; +} + /* * infer_arbiter_indexes - * Determine the unique indexes used to arbitrate speculative insertion. @@ -714,14 +820,21 @@ infer_arbiter_indexes(PlannerInfo *root) Relation relation; Oid indexOidFromConstraint = InvalidOid; List *indexList; - ListCell *l; + List *indexRelList = NIL; - /* Normalized inference attributes and inference expressions: */ + /* + * Required attributes and expressions used to match indexes to the clause + * given by the user. In the ON CONFLICT ON CONSTRAINT case, we compute + * these from that constraint's index to match all other indexes, to + * account for the case where that index is being concurrently reindexed. + */ + List *inferIndexExprs = (List *) onconflict->arbiterWhere; Bitmapset *inferAttrs = NULL; List *inferElems = NIL; /* Results */ List *results = NIL; + bool foundValid = false; /* * Quickly return NIL for ON CONFLICT DO NOTHING without an inference @@ -748,12 +861,14 @@ infer_arbiter_indexes(PlannerInfo *root) * well as a separate list of expression items. This simplifies matching * the cataloged definition of indexes. */ - foreach(l, onconflict->arbiterElems) + foreach_ptr(InferenceElem, elem, onconflict->arbiterElems) { - InferenceElem *elem = (InferenceElem *) lfirst(l); Var *var; int attno; + /* we cannot also have a constraint name, per grammar */ + Assert(!OidIsValid(onconflict->constraint)); + if (!IsA(elem->expr, Var)) { /* If not a plain Var, just shove it in inferElems for now */ @@ -774,49 +889,124 @@ infer_arbiter_indexes(PlannerInfo *root) } /* - * Lookup named constraint's index. This is not immediately returned - * because some additional sanity checks are required. + * Next, open all the indexes. We need this list for two things: first, + * if an ON CONSTRAINT clause was given, and that constraint's index is + * undergoing REINDEX CONCURRENTLY, then we need to consider all matches + * for that index. Second, if an attribute list was specified in the ON + * CONFLICT clause, we use the list to find the indexes whose attributes + * match that list. + */ + indexList = RelationGetIndexList(relation); + foreach_oid(indexoid, indexList) + { + Relation idxRel; + + /* obtain the same lock type that the executor will ultimately use */ + idxRel = index_open(indexoid, rte->rellockmode); + indexRelList = lappend(indexRelList, idxRel); + } + + /* + * If a constraint was named in the command, look up its index. We don't + * return it immediately because we need some additional sanity checks, + * and also because we need to include other indexes as arbiters to + * account for REINDEX CONCURRENTLY processing it. 
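+	 *
+	 * A hypothetical timeline that motivates this: REINDEX CONCURRENTLY
+	 * on the constraint's index creates a sibling index that becomes
+	 * indisready (receiving inserts) before it becomes indisvalid, and
+	 * later swaps the two.  An ON CONFLICT running concurrently must
+	 * treat both the old and the new index as arbiters, or two sessions
+	 * straddling the swap could infer disjoint arbiter sets and raise
+	 * spurious "duplicate key" errors.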
*/ if (onconflict->constraint != InvalidOid) { - indexOidFromConstraint = get_constraint_index(onconflict->constraint); + /* we cannot also have an explicit list of elements, per grammar */ + Assert(onconflict->arbiterElems == NIL); + indexOidFromConstraint = get_constraint_index(onconflict->constraint); if (indexOidFromConstraint == InvalidOid) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("constraint in ON CONFLICT clause has no associated index"))); + + /* + * Find the named constraint index to extract its attributes and + * predicates. + */ + foreach_ptr(RelationData, idxRel, indexRelList) + { + Form_pg_index idxForm = idxRel->rd_index; + + if (indexOidFromConstraint == idxForm->indexrelid) + { + /* Found it. */ + Assert(idxForm->indisready); + + /* + * Set up inferElems and inferIndexExprs to match the + * constraint index, so that we can match them in the loop + * below. + */ + for (int natt = 0; natt < idxForm->indnkeyatts; natt++) + { + int attno; + + attno = idxRel->rd_index->indkey.values[natt]; + if (attno != InvalidAttrNumber) + inferAttrs = + bms_add_member(inferAttrs, + attno - FirstLowInvalidHeapAttributeNumber); + } + + inferElems = RelationGetIndexExpressions(idxRel); + inferIndexExprs = RelationGetIndexPredicate(idxRel); + break; + } + } } /* * Using that representation, iterate through the list of indexes on the - * target relation to try and find a match + * target relation to find matches. */ - indexList = RelationGetIndexList(relation); - - foreach(l, indexList) + foreach_ptr(RelationData, idxRel, indexRelList) { - Oid indexoid = lfirst_oid(l); - Relation idxRel; Form_pg_index idxForm; Bitmapset *indexedAttrs; List *idxExprs; List *predExprs; AttrNumber natt; - ListCell *el; + bool match; /* - * Extract info from the relation descriptor for the index. Obtain - * the same lock type that the executor will ultimately use. + * Extract info from the relation descriptor for the index. * * Let executor complain about !indimmediate case directly, because * enforcement needs to occur there anyway when an inference clause is * omitted. */ - idxRel = index_open(indexoid, rte->rellockmode); idxForm = idxRel->rd_index; - if (!idxForm->indisvalid) - goto next; + /* + * Ignore indexes that aren't indisready, because we cannot trust + * their catalog structure yet. However, if any indexes are marked + * indisready but not yet indisvalid, we still consider them, because + * they might turn valid while we're running. Doing it this way + * allows a concurrent transaction with a slightly later catalog + * snapshot to infer the same set of indexes, which is critical to + * prevent spurious 'duplicate key' errors. + * + * Another critical aspect is that a unique index that isn't + * yet marked indisvalid=true might not be complete yet, meaning it + * wouldn't detect possible duplicate rows. In order to prevent false + * negatives, we require that the set of inferred indexes include at + * least one index that is marked valid. + */ + if (!idxForm->indisready) + continue; + + /* + * Ignore invalid indexes for partitioned tables. It's possible that + * some partitions don't have the index (yet), and then we would not + * find a match during ExecInitPartitionInfo. + */ + if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && + !idxForm->indisvalid) + continue; /* * Note that we do not perform a check against indcheckxmin (like e.g.
@@ -826,7 +1016,7 @@ infer_arbiter_indexes(PlannerInfo *root) */ /* - * Look for match on "ON constraint_name" variant, which may not be + * Look for a match for the "ON constraint_name" variant, which may not be a * unique constraint. This can only be a constraint name. */ if (indexOidFromConstraint == idxForm->indexrelid) @@ -836,32 +1026,37 @@ infer_arbiter_indexes(PlannerInfo *root) (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("ON CONFLICT DO UPDATE not supported with exclusion constraints"))); + /* Consider this one a match already */ results = lappend_oid(results, idxForm->indexrelid); - list_free(indexList); - index_close(idxRel, NoLock); - table_close(relation, NoLock); - return results; + foundValid |= idxForm->indisvalid; + continue; } else if (indexOidFromConstraint != InvalidOid) { - /* No point in further work for index in named constraint case */ - goto next; + /* + * In the case of "ON constraint_name DO UPDATE" we need to skip + * non-unique candidates. + */ + if (!idxForm->indisunique && onconflict->action == ONCONFLICT_UPDATE) + continue; + } + else + { + /* + * Only considering conventional inference at this point (not + * named constraints), so index under consideration can be + * immediately skipped if it's not unique. + */ + if (!idxForm->indisunique) + continue; } - - /* - * Only considering conventional inference at this point (not named - * constraints), so index under consideration can be immediately - * skipped if it's not unique - */ - if (!idxForm->indisunique) - goto next; /* * So-called unique constraints with WITHOUT OVERLAPS are really * exclusion constraints, so skip those too. */ if (idxForm->indisexclusion) - goto next; + continue; /* Build BMS representation of plain (non expression) index attrs */ indexedAttrs = NULL; @@ -876,17 +1071,25 @@ infer_arbiter_indexes(PlannerInfo *root) /* Non-expression attributes (if any) must match */ if (!bms_equal(indexedAttrs, inferAttrs)) - goto next; + continue; /* Expression attributes (if any) must match */ idxExprs = RelationGetIndexExpressions(idxRel); - if (idxExprs && varno != 1) - ChangeVarNodes((Node *) idxExprs, 1, varno, 0); - - foreach(el, onconflict->arbiterElems) + if (idxExprs) { - InferenceElem *elem = (InferenceElem *) lfirst(el); + if (varno != 1) + ChangeVarNodes((Node *) idxExprs, 1, varno, 0); + + idxExprs = (List *) eval_const_expressions(root, (Node *) idxExprs); + } + /* + * If arbiterElems are present, check them. (Note that if a + * constraint name was given in the command, this list is NIL.) + */ + match = true; + foreach_ptr(InferenceElem, elem, onconflict->arbiterElems) + { /* * Ensure that collation/opclass aspects of inference expression * element match. Even though this loop is primarily concerned * with matching expressions, this is a convenient point to check * this for both expressions and ordinary (non-expression) * attributes appearing as inference elements. */ if (!infer_collation_opclass_match(elem, idxRel, idxExprs)) - goto next; + { + match = false; + break; + } /* * Plain Vars don't factor into count of expression elements, and @@ -916,39 +1122,71 @@ infer_arbiter_indexes(PlannerInfo *root) list_member(idxExprs, elem->expr)) continue; - goto next; + match = false; + break; } + if (!match) + continue; /* - * Now that all inference elements were matched, ensure that the + * In case of inference from an attribute list, ensure that the * expression elements from inference clause are not missing any * cataloged expressions.
This does the right thing when unique * indexes redundantly repeat the same attribute, or if attributes * redundantly appear multiple times within an inference clause. + * + * In case a constraint was named, ensure the candidate has the same + * set of expressions as the named constraint's index. */ if (list_difference(idxExprs, inferElems) != NIL) - goto next; + continue; - /* - * If it's a partial index, its predicate must be implied by the ON - * CONFLICT's WHERE clause. - */ predExprs = RelationGetIndexPredicate(idxRel); - if (predExprs && varno != 1) - ChangeVarNodes((Node *) predExprs, 1, varno, 0); + if (predExprs) + { + if (varno != 1) + ChangeVarNodes((Node *) predExprs, 1, varno, 0); - if (!predicate_implied_by(predExprs, (List *) onconflict->arbiterWhere, false)) - goto next; + predExprs = (List *) + eval_const_expressions(root, + (Node *) make_ands_explicit(predExprs)); + predExprs = make_ands_implicit((Expr *) predExprs); + } + + /* + * Partial indexes affect each form of ON CONFLICT differently: if a + * constraint was named, then the predicates must be identical. In + * conventional inference, the index's predicate must be implied by + * the WHERE clause. + */ + if (OidIsValid(indexOidFromConstraint)) + { + if (list_difference(predExprs, inferIndexExprs) != NIL) + continue; + } + else + { + if (!predicate_implied_by(predExprs, inferIndexExprs, false)) + continue; + } + /* All good -- consider this index a match */ results = lappend_oid(results, idxForm->indexrelid); -next: + foundValid |= idxForm->indisvalid; + } + + /* Close all indexes */ + foreach_ptr(RelationData, idxRel, indexRelList) + { index_close(idxRel, NoLock); } list_free(indexList); + list_free(indexRelList); table_close(relation, NoLock); - if (results == NIL) + /* We require at least one indisvalid index */ + if (results == NIL || !foundValid) ereport(ERROR, (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), errmsg("there is no unique or exclusion constraint matching the ON CONFLICT specification"))); @@ -1321,6 +1559,14 @@ get_relation_constraints(PlannerInfo *root, cexpr = stringToNode(constr->check[i].ccbin); + /* + * Fix Vars to have the desired varno. This must be done before + * const-simplification because eval_const_expressions reduces + * NullTest for Vars based on varno. + */ + if (varno != 1) + ChangeVarNodes(cexpr, 1, varno, 0); + /* * Run each expression through const-simplification and * canonicalization. This is not just an optimization, but is @@ -1335,10 +1581,6 @@ get_relation_constraints(PlannerInfo *root, cexpr = (Node *) canonicalize_qual((Expr *) cexpr, true); - /* Fix Vars to have the desired varno */ - if (varno != 1) - ChangeVarNodes(cexpr, 1, varno, 0); - /* * Finally, convert to implicit-AND format (that is, a List) and * append the resulting item(s) to our output list. @@ -1392,6 +1634,14 @@ get_relation_constraints(PlannerInfo *root, result = list_concat(result, rel->partition_qual); } + /* + * Expand virtual generated columns in the constraint expressions. + */ + if (result) + result = (List *) expand_generated_columns_in_expr((Node *) result, + relation, + varno); + table_close(relation, NoLock); return result; @@ -1487,7 +1737,8 @@ get_relation_statistics_worker(List **stainfos, RelOptInfo *rel, * just the identifying metadata. Only stats actually built are considered.
*/ static List * -get_relation_statistics(RelOptInfo *rel, Relation relation) +get_relation_statistics(PlannerInfo *root, RelOptInfo *rel, + Relation relation) { Index varno = rel->relid; List *statoidlist; @@ -1519,8 +1770,8 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) keys = bms_add_member(keys, staForm->stxkeys.values[i]); /* - * Preprocess expressions (if any). We read the expressions, run them - * through eval_const_expressions, and fix the varnos. + * Preprocess expressions (if any). We read the expressions, fix the + * varnos, and run them through eval_const_expressions. * * XXX We don't know yet if there are any data for this stats object, * with either stxdinherit value. But it's reasonable to assume there @@ -1543,6 +1794,18 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) exprs = (List *) stringToNode(exprsString); pfree(exprsString); + /* + * Modify the copies we obtain from the relcache to have the + * correct varno for the parent relation, so that they match + * up correctly against qual clauses. + * + * This must be done before const-simplification because + * eval_const_expressions reduces NullTest for Vars based on + * varno. + */ + if (varno != 1) + ChangeVarNodes((Node *) exprs, 1, varno, 0); + /* * Run the expressions through eval_const_expressions. This is * not just an optimization, but is necessary, because the @@ -1551,18 +1814,10 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) * We must not use canonicalize_qual, however, since these * aren't qual expressions. */ - exprs = (List *) eval_const_expressions(NULL, (Node *) exprs); + exprs = (List *) eval_const_expressions(root, (Node *) exprs); /* May as well fix opfuncids too */ fix_opfuncids((Node *) exprs); - - /* - * Modify the copies we obtain from the relcache to have the - * correct varno for the parent relation, so that they match - * up correctly against qual clauses. - */ - if (varno != 1) - ChangeVarNodes((Node *) exprs, 1, varno, 0); } } @@ -2039,9 +2294,8 @@ join_selectivity(PlannerInfo *root, /* * function_selectivity * - * Returns the selectivity of a specified boolean function clause. - * This code executes registered procedures stored in the - * pg_proc relation, by calling the function manager. + * Attempt to estimate the selectivity of a specified boolean function clause + * by asking its support function. If the function lacks support, return -1. * * See clause_selectivity() for the meaning of the additional parameters. */ @@ -2059,15 +2313,8 @@ function_selectivity(PlannerInfo *root, SupportRequestSelectivity req; SupportRequestSelectivity *sresult; - /* - * If no support function is provided, use our historical default - * estimate, 0.3333333. This seems a pretty unprincipled choice, but - * Postgres has been using that estimate for function calls since 1992. - * The hoariness of this behavior suggests that we should not be in too - * much hurry to use another value. 
- */ if (!prosupport) - return (Selectivity) 0.3333333; + return (Selectivity) -1; /* no support function */ req.type = T_SupportRequestSelectivity; req.root = root; @@ -2084,9 +2331,8 @@ function_selectivity(PlannerInfo *root, DatumGetPointer(OidFunctionCall1(prosupport, PointerGetDatum(&req))); - /* If support function fails, use default */ if (sresult != &req) - return (Selectivity) 0.3333333; + return (Selectivity) -1; /* function did not honor request */ if (req.selectivity < 0.0 || req.selectivity > 1.0) elog(ERROR, "invalid function selectivity: %f", req.selectivity); @@ -2303,6 +2549,60 @@ has_row_triggers(PlannerInfo *root, Index rti, CmdType event) return result; } +/* + * has_transition_tables + * + * Detect whether the specified relation has any transition tables for event. + */ +bool +has_transition_tables(PlannerInfo *root, Index rti, CmdType event) +{ + RangeTblEntry *rte = planner_rt_fetch(rti, root); + Relation relation; + TriggerDesc *trigDesc; + bool result = false; + + Assert(rte->rtekind == RTE_RELATION); + + /* Currently foreign tables cannot have transition tables */ + if (rte->relkind == RELKIND_FOREIGN_TABLE) + return result; + + /* Assume we already have adequate lock */ + relation = table_open(rte->relid, NoLock); + + trigDesc = relation->trigdesc; + switch (event) + { + case CMD_INSERT: + if (trigDesc && + trigDesc->trig_insert_new_table) + result = true; + break; + case CMD_UPDATE: + if (trigDesc && + (trigDesc->trig_update_old_table || + trigDesc->trig_update_new_table)) + result = true; + break; + case CMD_DELETE: + if (trigDesc && + trigDesc->trig_delete_old_table) + result = true; + break; + /* There is no separate event for MERGE, only INSERT/UPDATE/DELETE */ + case CMD_MERGE: + result = false; + break; + default: + elog(ERROR, "unrecognized CmdType: %d", (int) event); + break; + } + + table_close(relation, NoLock); + return result; +} + /* * has_stored_generated_columns * @@ -2360,7 +2660,7 @@ get_dependent_generated_columns(PlannerInfo *root, Index rti, Bitmapset *attrs_used = NULL; /* skip if not generated column */ - if (!TupleDescAttr(tupdesc, defval->adnum - 1)->attgenerated) + if (!TupleDescCompactAttr(tupdesc, defval->adnum - 1)->attgenerated) continue; /* identify columns this generated column depends on */ @@ -2478,32 +2778,31 @@ find_partition_scheme(PlannerInfo *root, Relation relation) * array since the relcache entry may not survive after we have closed the * relation. 
*/ - part_scheme = (PartitionScheme) palloc0(sizeof(PartitionSchemeData)); + part_scheme = palloc0_object(PartitionSchemeData); part_scheme->strategy = partkey->strategy; part_scheme->partnatts = partkey->partnatts; - part_scheme->partopfamily = (Oid *) palloc(sizeof(Oid) * partnatts); + part_scheme->partopfamily = palloc_array(Oid, partnatts); memcpy(part_scheme->partopfamily, partkey->partopfamily, sizeof(Oid) * partnatts); - part_scheme->partopcintype = (Oid *) palloc(sizeof(Oid) * partnatts); + part_scheme->partopcintype = palloc_array(Oid, partnatts); memcpy(part_scheme->partopcintype, partkey->partopcintype, sizeof(Oid) * partnatts); - part_scheme->partcollation = (Oid *) palloc(sizeof(Oid) * partnatts); + part_scheme->partcollation = palloc_array(Oid, partnatts); memcpy(part_scheme->partcollation, partkey->partcollation, sizeof(Oid) * partnatts); - part_scheme->parttyplen = (int16 *) palloc(sizeof(int16) * partnatts); + part_scheme->parttyplen = palloc_array(int16, partnatts); memcpy(part_scheme->parttyplen, partkey->parttyplen, sizeof(int16) * partnatts); - part_scheme->parttypbyval = (bool *) palloc(sizeof(bool) * partnatts); + part_scheme->parttypbyval = palloc_array(bool, partnatts); memcpy(part_scheme->parttypbyval, partkey->parttypbyval, sizeof(bool) * partnatts); - part_scheme->partsupfunc = (FmgrInfo *) - palloc(sizeof(FmgrInfo) * partnatts); + part_scheme->partsupfunc = palloc_array(FmgrInfo, partnatts); for (i = 0; i < partnatts; i++) fmgr_info_copy(&part_scheme->partsupfunc[i], &partkey->partsupfunc[i], CurrentMemoryContext); @@ -2537,7 +2836,7 @@ set_baserel_partition_key_exprs(Relation relation, Assert(partkey != NULL); partnatts = partkey->partnatts; - partexprs = (List **) palloc(sizeof(List *) * partnatts); + partexprs = palloc_array(List *, partnatts); lc = list_head(partkey->partexprs); for (cnt = 0; cnt < partnatts; cnt++) @@ -2578,7 +2877,7 @@ set_baserel_partition_key_exprs(Relation relation, * expression lists to keep partition key expression handling code simple. * See build_joinrel_partition_info() and match_expr_to_partition_keys(). 
*/ - rel->nullable_partexprs = (List **) palloc0(sizeof(List *) * partnatts); + rel->nullable_partexprs = palloc0_array(List *, partnatts); } /* diff --git a/src/backend/optimizer/util/predtest.c b/src/backend/optimizer/util/predtest.c index ac28573cd0a5a..0a14658ed7d6c 100644 --- a/src/backend/optimizer/util/predtest.c +++ b/src/backend/optimizer/util/predtest.c @@ -967,7 +967,7 @@ arrayconst_startup_fn(Node *clause, PredIterInfo info) char elmalign; /* Create working state struct */ - state = (ArrayConstIterState *) palloc(sizeof(ArrayConstIterState)); + state = palloc_object(ArrayConstIterState); info->state = state; /* Deconstruct the array literal */ @@ -1046,7 +1046,7 @@ arrayexpr_startup_fn(Node *clause, PredIterInfo info) ArrayExpr *arrayexpr; /* Create working state struct */ - state = (ArrayExprIterState *) palloc(sizeof(ArrayExprIterState)); + state = palloc_object(ArrayExprIterState); info->state = state; /* Set up a dummy OpExpr to return as the per-item node */ diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index ff507331a061a..5514c1ba73542 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -16,6 +16,8 @@ #include <limits.h> +#include "access/nbtree.h" +#include "catalog/pg_constraint.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "optimizer/appendinfo.h" @@ -27,12 +29,16 @@ #include "optimizer/paths.h" #include "optimizer/placeholder.h" #include "optimizer/plancat.h" +#include "optimizer/planner.h" #include "optimizer/restrictinfo.h" #include "optimizer/tlist.h" +#include "parser/parse_oper.h" #include "parser/parse_relation.h" #include "rewrite/rewriteManip.h" #include "utils/hsearch.h" #include "utils/lsyscache.h" +#include "utils/selfuncs.h" +#include "utils/typcache.h" typedef struct JoinHashEntry @@ -83,6 +89,14 @@ static void build_child_join_reltarget(PlannerInfo *root, RelOptInfo *childrel, int nappinfos, AppendRelInfo **appinfos); +static bool eager_aggregation_possible_for_relation(PlannerInfo *root, + RelOptInfo *rel); +static bool init_grouping_targets(PlannerInfo *root, RelOptInfo *rel, + PathTarget *target, PathTarget *agg_input, + List **group_clauses, List **group_exprs); +static bool is_var_in_aggref_only(PlannerInfo *root, Var *var); +static bool is_var_needed_by_join(PlannerInfo *root, Var *var, RelOptInfo *rel); +static Index get_expression_sortgroupref(PlannerInfo *root, Expr *expr); /* @@ -106,11 +120,11 @@ setup_simple_rel_arrays(PlannerInfo *root) * exist yet. It'll be filled by later calls to build_simple_rel().
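 *
 * (For reference: palloc0_array(type, count) expands to
 *
 *     ((type *) palloc0(sizeof(type) * (count)))
 *
 * and palloc_object(type) to ((type *) palloc(sizeof(type))), so these
 * replacements are exactly equivalent to the spelled-out forms.)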
*/ root->simple_rel_array = (RelOptInfo **) - palloc0(size * sizeof(RelOptInfo *)); + palloc0_array(RelOptInfo *, size); /* simple_rte_array is an array equivalent of the rtable list */ root->simple_rte_array = (RangeTblEntry **) - palloc0(size * sizeof(RangeTblEntry *)); + palloc0_array(RangeTblEntry *, size); rti = 1; foreach(lc, root->parse->rtable) { @@ -127,7 +141,7 @@ setup_simple_rel_arrays(PlannerInfo *root) } root->append_rel_array = (AppendRelInfo **) - palloc0(size * sizeof(AppendRelInfo *)); + palloc0_array(AppendRelInfo *, size); /* * append_rel_array is filled with any already-existing AppendRelInfos, @@ -211,13 +225,13 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->consider_startup = (root->tuple_fraction > 0); rel->consider_param_startup = false; /* might get changed later */ rel->consider_parallel = false; /* might get changed later */ + rel->needs_temp_safety = false; /* might get changed later */ rel->reltarget = create_empty_pathtarget(); rel->pathlist = NIL; rel->ppilist = NIL; rel->partial_pathlist = NIL; rel->cheapest_startup_path = NULL; rel->cheapest_total_path = NULL; - rel->cheapest_unique_path = NULL; rel->cheapest_parameterized_paths = NIL; rel->relid = relid; rel->rtekind = rte->rtekind; @@ -269,6 +283,9 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->fdw_private = NULL; rel->unique_for_rels = NIL; rel->non_unique_for_rels = NIL; + rel->unique_rel = NULL; + rel->unique_pathkeys = NIL; + rel->unique_groupclause = NIL; rel->baserestrictinfo = NIL; rel->baserestrictcost.startup = 0; rel->baserestrictcost.per_tuple = 0; @@ -276,6 +293,8 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->joininfo = NIL; rel->has_eclass_joins = false; rel->consider_partitionwise_join = false; /* might get changed later */ + rel->agg_info = NULL; + rel->grouped_rel = NULL; rel->part_scheme = NULL; rel->nparts = -1; rel->boundinfo = NULL; @@ -355,9 +374,9 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->min_attr = 0; rel->max_attr = list_length(rte->eref->colnames); rel->attr_needed = (Relids *) - palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); + palloc0_array(Relids, rel->max_attr - rel->min_attr + 1); rel->attr_widths = (int32 *) - palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); + palloc0_array(int32, rel->max_attr - rel->min_attr + 1); break; case RTE_RESULT: /* RTE_RESULT has no columns, nor could it have whole-row Var */ @@ -406,6 +425,103 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) return rel; } +/* + * build_simple_grouped_rel + * Construct a new RelOptInfo representing a grouped version of the input + * simple relation. + */ +RelOptInfo * +build_simple_grouped_rel(PlannerInfo *root, RelOptInfo *rel) +{ + RelOptInfo *grouped_rel; + RelAggInfo *agg_info; + + /* + * We should have available aggregate expressions and grouping + * expressions, otherwise we cannot reach here. + */ + Assert(root->agg_clause_list != NIL); + Assert(root->group_expr_list != NIL); + + /* nothing to do for dummy rel */ + if (IS_DUMMY_REL(rel)) + return NULL; + + /* + * Prepare the information needed to create grouped paths for this simple + * relation. + */ + agg_info = create_rel_agg_info(root, rel, true); + if (agg_info == NULL) + return NULL; + + /* + * If grouped paths for the given simple relation are not considered + * useful, skip building the grouped relation. 
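+	 *
+	 * For example, with min_eager_agg_group_size = 8 (the value here is
+	 * hypothetical), a relation of 1000 rows estimated to collapse into
+	 * 500 groups (average group size 2) is not worth pre-aggregating,
+	 * whereas 1000 rows collapsing into 50 groups (average 20) is.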
+ */ + if (!agg_info->agg_useful) + return NULL; + + /* Track the set of relids at which partial aggregation is applied */ + agg_info->apply_agg_at = bms_copy(rel->relids); + + /* build the grouped relation */ + grouped_rel = build_grouped_rel(root, rel); + grouped_rel->reltarget = agg_info->target; + grouped_rel->rows = agg_info->grouped_rows; + grouped_rel->agg_info = agg_info; + + rel->grouped_rel = grouped_rel; + + return grouped_rel; +} + +/* + * build_grouped_rel + * Build a grouped relation by flat copying the input relation and resetting + * the necessary fields. + */ +RelOptInfo * +build_grouped_rel(PlannerInfo *root, RelOptInfo *rel) +{ + RelOptInfo *grouped_rel; + + grouped_rel = makeNode(RelOptInfo); + memcpy(grouped_rel, rel, sizeof(RelOptInfo)); + + /* + * clear path info + */ + grouped_rel->pathlist = NIL; + grouped_rel->ppilist = NIL; + grouped_rel->partial_pathlist = NIL; + grouped_rel->cheapest_startup_path = NULL; + grouped_rel->cheapest_total_path = NULL; + grouped_rel->cheapest_parameterized_paths = NIL; + + /* + * clear partition info + */ + grouped_rel->part_scheme = NULL; + grouped_rel->nparts = -1; + grouped_rel->boundinfo = NULL; + grouped_rel->partbounds_merged = false; + grouped_rel->partition_qual = NIL; + grouped_rel->part_rels = NULL; + grouped_rel->live_parts = NULL; + grouped_rel->all_partrels = NULL; + grouped_rel->partexprs = NULL; + grouped_rel->nullable_partexprs = NULL; + grouped_rel->consider_partitionwise_join = false; + + /* + * clear size estimates + */ + grouped_rel->rows = 0; + + return grouped_rel; +} + /* * find_base_rel * Find a base or otherrel relation entry, which must already exist. @@ -707,13 +823,13 @@ build_join_rel(PlannerInfo *root, joinrel->consider_startup = (root->tuple_fraction > 0); joinrel->consider_param_startup = false; joinrel->consider_parallel = false; + joinrel->needs_temp_safety = false; joinrel->reltarget = create_empty_pathtarget(); joinrel->pathlist = NIL; joinrel->ppilist = NIL; joinrel->partial_pathlist = NIL; joinrel->cheapest_startup_path = NULL; joinrel->cheapest_total_path = NULL; - joinrel->cheapest_unique_path = NULL; joinrel->cheapest_parameterized_paths = NIL; /* init direct_lateral_relids from children; we'll finish it up below */ joinrel->direct_lateral_relids = @@ -748,6 +864,9 @@ build_join_rel(PlannerInfo *root, joinrel->fdw_private = NULL; joinrel->unique_for_rels = NIL; joinrel->non_unique_for_rels = NIL; + joinrel->unique_rel = NULL; + joinrel->unique_pathkeys = NIL; + joinrel->unique_groupclause = NIL; joinrel->baserestrictinfo = NIL; joinrel->baserestrictcost.startup = 0; joinrel->baserestrictcost.per_tuple = 0; @@ -755,6 +874,8 @@ build_join_rel(PlannerInfo *root, joinrel->joininfo = NIL; joinrel->has_eclass_joins = false; joinrel->consider_partitionwise_join = false; /* might get changed later */ + joinrel->agg_info = NULL; + joinrel->grouped_rel = NULL; joinrel->parent = NULL; joinrel->top_parent = NULL; joinrel->top_parent_relids = NULL; @@ -840,9 +961,13 @@ build_join_rel(PlannerInfo *root, * here. 
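	 *
	 * (A sketch of the contract assumed for the extended is_parallel_safe(),
	 * inferred from its call sites in this patch:
	 *
	 *     bool is_parallel_safe(PlannerInfo *root, Node *node,
	 *                           bool *needs_temp_flush);
	 *
	 * it still returns false for outright parallel-unsafe expressions, but
	 * instead of failing it sets *needs_temp_flush when the only hazard
	 * found is of the kind the NEEDS_TEMP_FLUSH safety level covers.)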
*/ if (inner_rel->consider_parallel && outer_rel->consider_parallel && - is_parallel_safe(root, (Node *) restrictlist) && - is_parallel_safe(root, (Node *) joinrel->reltarget->exprs)) + is_parallel_safe(root, (Node *) restrictlist, &joinrel->needs_temp_safety) && + is_parallel_safe(root, (Node *) joinrel->reltarget->exprs, &joinrel->needs_temp_safety)) + { joinrel->consider_parallel = true; + joinrel->needs_temp_safety |= + (inner_rel->needs_temp_safety | outer_rel->needs_temp_safety); + } /* Add the joinrel to the PlannerInfo. */ add_join_rel(root, joinrel); @@ -906,7 +1031,6 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, joinrel->partial_pathlist = NIL; joinrel->cheapest_startup_path = NULL; joinrel->cheapest_total_path = NULL; - joinrel->cheapest_unique_path = NULL; joinrel->cheapest_parameterized_paths = NIL; joinrel->direct_lateral_relids = NULL; joinrel->lateral_relids = NULL; @@ -933,12 +1057,17 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, joinrel->useridiscurrent = false; joinrel->fdwroutine = NULL; joinrel->fdw_private = NULL; + joinrel->unique_rel = NULL; + joinrel->unique_pathkeys = NIL; + joinrel->unique_groupclause = NIL; joinrel->baserestrictinfo = NIL; joinrel->baserestrictcost.startup = 0; joinrel->baserestrictcost.per_tuple = 0; joinrel->joininfo = NIL; joinrel->has_eclass_joins = false; joinrel->consider_partitionwise_join = false; /* might get changed later */ + joinrel->agg_info = NULL; + joinrel->grouped_rel = NULL; joinrel->parent = parent_joinrel; joinrel->top_parent = parent_joinrel->top_parent ? parent_joinrel->top_parent : parent_joinrel; joinrel->top_parent_relids = joinrel->top_parent->relids; @@ -1488,7 +1617,6 @@ fetch_upper_rel(PlannerInfo *root, UpperRelationKind kind, Relids relids) upperrel->pathlist = NIL; upperrel->cheapest_startup_path = NULL; upperrel->cheapest_total_path = NULL; - upperrel->cheapest_unique_path = NULL; upperrel->cheapest_parameterized_paths = NIL; root->upper_rels[kind] = lappend(root->upper_rels[kind], upperrel); @@ -2364,9 +2492,8 @@ set_joinrel_partition_key_exprs(RelOptInfo *joinrel, PartitionScheme part_scheme = joinrel->part_scheme; int partnatts = part_scheme->partnatts; - joinrel->partexprs = (List **) palloc0(sizeof(List *) * partnatts); - joinrel->nullable_partexprs = - (List **) palloc0(sizeof(List *) * partnatts); + joinrel->partexprs = palloc0_array(List *, partnatts); + joinrel->nullable_partexprs = palloc0_array(List *, partnatts); /* * The joinrel's partition expressions are the same as those of the input @@ -2518,3 +2645,536 @@ build_child_join_reltarget(PlannerInfo *root, childrel->reltarget->cost.per_tuple = parentrel->reltarget->cost.per_tuple; childrel->reltarget->width = parentrel->reltarget->width; } + +/* + * create_rel_agg_info + * Create the RelAggInfo structure for the given relation if it can produce + * grouped paths. The given relation is the non-grouped one which has the + * reltarget already constructed. + * + * calculate_grouped_rows: if true, calculate the estimated number of grouped + * rows for the relation. If false, skip the estimation to avoid unnecessary + * planning overhead. + */ +RelAggInfo * +create_rel_agg_info(PlannerInfo *root, RelOptInfo *rel, + bool calculate_grouped_rows) +{ + ListCell *lc; + RelAggInfo *result; + PathTarget *agg_input; + PathTarget *target; + List *group_clauses = NIL; + List *group_exprs = NIL; + + /* + * The lists of aggregate expressions and grouping expressions should have + * been constructed. 
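+	 *
+	 * For example (a hypothetical query):
+	 *
+	 * select t2.x, sum(t1.y) from t1, t2 where t1.a = t2.a group by t2.x;
+	 *
+	 * when partial aggregation is pushed down to t1, the grouped rel's
+	 * 'target' is roughly {t1.a, PARTIAL sum(t1.y)}, grouped by t1.a,
+	 * while 'agg_input' (the target its input paths must emit) is
+	 * {t1.a, t1.y}.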
+ */ + Assert(root->agg_clause_list != NIL); + Assert(root->group_expr_list != NIL); + + /* + * If this is a child rel, the grouped rel for its parent rel must + * already have been created, if that is possible. So we can just use + * the parent's RelAggInfo if there is one, with appropriate variable + * substitutions. + */ + if (IS_OTHER_REL(rel)) + { + RelOptInfo *grouped_rel; + RelAggInfo *agg_info; + + grouped_rel = rel->top_parent->grouped_rel; + if (grouped_rel == NULL) + return NULL; + + Assert(IS_GROUPED_REL(grouped_rel)); + + /* Must do multi-level transformation */ + agg_info = (RelAggInfo *) + adjust_appendrel_attrs_multilevel(root, + (Node *) grouped_rel->agg_info, + rel, + rel->top_parent); + + agg_info->apply_agg_at = NULL; /* caller will change this later */ + + if (calculate_grouped_rows) + { + agg_info->grouped_rows = + estimate_num_groups(root, agg_info->group_exprs, + rel->rows, NULL, NULL); + + /* + * The grouped paths for the given relation are considered useful + * iff the average group size is no less than + * min_eager_agg_group_size. + */ + agg_info->agg_useful = + (rel->rows / agg_info->grouped_rows) >= min_eager_agg_group_size; + } + + return agg_info; + } + + /* Check if it's possible to produce grouped paths for this relation. */ + if (!eager_aggregation_possible_for_relation(root, rel)) + return NULL; + + /* + * Create targets for the grouped paths and for the input paths of the + * grouped paths. + */ + target = create_empty_pathtarget(); + agg_input = create_empty_pathtarget(); + + /* ... and initialize these targets */ + if (!init_grouping_targets(root, rel, target, agg_input, + &group_clauses, &group_exprs)) + return NULL; + + /* + * Eager aggregation is not applicable if there are no available grouping + * expressions. + */ + if (group_clauses == NIL) + return NULL; + + /* Add aggregates to the grouping target */ + foreach(lc, root->agg_clause_list) + { + AggClauseInfo *ac_info = lfirst_node(AggClauseInfo, lc); + Aggref *aggref; + + Assert(IsA(ac_info->aggref, Aggref)); + + aggref = (Aggref *) copyObject(ac_info->aggref); + mark_partial_aggref(aggref, AGGSPLIT_INITIAL_SERIAL); + + add_column_to_pathtarget(target, (Expr *) aggref, 0); + } + + /* Set the estimated eval cost and output width for both targets */ + set_pathtarget_cost_width(root, target); + set_pathtarget_cost_width(root, agg_input); + + /* build the RelAggInfo result */ + result = makeNode(RelAggInfo); + result->target = target; + result->agg_input = agg_input; + result->group_clauses = group_clauses; + result->group_exprs = group_exprs; + result->apply_agg_at = NULL; /* caller will change this later */ + + if (calculate_grouped_rows) + { + result->grouped_rows = estimate_num_groups(root, result->group_exprs, + rel->rows, NULL, NULL); + + /* + * The grouped paths for the given relation are considered useful iff + * the average group size is no less than min_eager_agg_group_size. + */ + result->agg_useful = + (rel->rows / result->grouped_rows) >= min_eager_agg_group_size; + } + + return result; +} + +/* + * eager_aggregation_possible_for_relation + * Check if it's possible to produce grouped paths for the given relation. + */ +static bool +eager_aggregation_possible_for_relation(PlannerInfo *root, RelOptInfo *rel) +{ + ListCell *lc; + int cur_relid; + + /* + * Check to see if the given relation is in the nullable side of an outer + * join.
In this case, we cannot push a partial aggregation down to the + * relation, because the NULL-extended rows produced by the outer join + * would not be available when we perform the partial aggregation, while + * with a non-eager-aggregation plan these rows are available for the + * top-level aggregation. Doing so may result in the rows being grouped + * differently than expected, or produce incorrect values from the + * aggregate functions. + */ + cur_relid = -1; + while ((cur_relid = bms_next_member(rel->relids, cur_relid)) >= 0) + { + RelOptInfo *baserel = find_base_rel_ignore_join(root, cur_relid); + + if (baserel == NULL) + continue; /* ignore outer joins in rel->relids */ + + if (!bms_is_subset(baserel->nulling_relids, rel->relids)) + return false; + } + + /* + * For now we don't try to support PlaceHolderVars. + */ + foreach(lc, rel->reltarget->exprs) + { + Expr *expr = lfirst(lc); + + if (IsA(expr, PlaceHolderVar)) + return false; + } + + /* Caller should only pass base relations or joins. */ + Assert(rel->reloptkind == RELOPT_BASEREL || + rel->reloptkind == RELOPT_JOINREL); + + /* + * Check if all aggregate expressions can be evaluated on this relation + * level. + */ + foreach(lc, root->agg_clause_list) + { + AggClauseInfo *ac_info = lfirst_node(AggClauseInfo, lc); + + Assert(IsA(ac_info->aggref, Aggref)); + + /* + * Give up if any aggregate requires relations other than the current + * one. If the aggregate requires the current relation plus + * additional relations, grouping the current relation could make some + * input rows unavailable for the higher aggregate and may reduce the + * number of input rows it receives. If the aggregate does not + * require the current relation at all, it should not be grouped, as + * we do not support joining two grouped relations. + */ + if (!bms_is_subset(ac_info->agg_eval_at, rel->relids)) + return false; + } + + return true; +} + +/* + * init_grouping_targets + * Initialize the target for grouped paths (target) as well as the target + * for paths that generate input for the grouped paths (agg_input). + * + * We also construct the list of SortGroupClauses and the list of grouping + * expressions for the partial aggregation, and return them in *group_clauses + * and *group_exprs. + * + * Return true if the targets could be initialized, false otherwise. + */ +static bool +init_grouping_targets(PlannerInfo *root, RelOptInfo *rel, + PathTarget *target, PathTarget *agg_input, + List **group_clauses, List **group_exprs) +{ + ListCell *lc; + List *possibly_dependent = NIL; + Index maxSortGroupRef; + + /* Identify the max sortgroupref */ + maxSortGroupRef = 0; + foreach(lc, root->processed_tlist) + { + Index ref = ((TargetEntry *) lfirst(lc))->ressortgroupref; + + if (ref > maxSortGroupRef) + maxSortGroupRef = ref; + } + + /* + * At this point, all Vars from this relation that are needed by upper + * joins or are required in the final targetlist should already be present + * in its reltarget. Therefore, we can safely iterate over this + * relation's reltarget->exprs to construct the PathTarget and grouping + * clauses for the grouped paths. + */ + foreach(lc, rel->reltarget->exprs) + { + Expr *expr = (Expr *) lfirst(lc); + Index sortgroupref; + + /* + * Given that PlaceHolderVar currently prevents us from doing eager + * aggregation, the source target cannot contain anything more complex + * than a Var.
+ */ + Assert(IsA(expr, Var)); + + /* + * Get the sortgroupref of the expr if it is found among, or can be + * deduced from, the original grouping expressions. + */ + sortgroupref = get_expression_sortgroupref(root, expr); + if (sortgroupref > 0) + { + SortGroupClause *sgc; + + /* Find the matching SortGroupClause */ + sgc = get_sortgroupref_clause(sortgroupref, root->processed_groupClause); + Assert(sgc->tleSortGroupRef <= maxSortGroupRef); + + /* + * If the target expression is to be used as a grouping key, it + * should be emitted by the grouped paths that have been pushed + * down to this relation level. + */ + add_column_to_pathtarget(target, expr, sortgroupref); + + /* + * ... and it also should be emitted by the input paths. + */ + add_column_to_pathtarget(agg_input, expr, sortgroupref); + + /* + * Record this SortGroupClause and grouping expression. Note that + * this SortGroupClause might have already been recorded. + */ + if (!list_member(*group_clauses, sgc)) + { + *group_clauses = lappend(*group_clauses, sgc); + *group_exprs = lappend(*group_exprs, expr); + } + } + else if (is_var_needed_by_join(root, (Var *) expr, rel)) + { + /* + * The expression is needed for an upper join but is neither in + * the GROUP BY clause nor derivable from it using EC (otherwise, + * it would have already been included in the targets above). We + * need to create a special SortGroupClause for this expression. + * + * It is important to include such expressions in the grouping + * keys. This is essential to ensure that an aggregated row from + * the partial aggregation matches the other side of the join if + * and only if each row in the partial group does. This ensures + * that all rows within the same partial group share the same + * 'destiny', which is crucial for maintaining correctness. + */ + SortGroupClause *sgc; + TypeCacheEntry *tce; + Oid equalimageproc; + + /* + * But first, check if equality implies image equality for this + * expression. If not, we cannot use it as a grouping key. See + * comments in create_grouping_expr_infos(). + */ + tce = lookup_type_cache(exprType((Node *) expr), + TYPECACHE_BTREE_OPFAMILY); + if (!OidIsValid(tce->btree_opf) || + !OidIsValid(tce->btree_opintype)) + return false; + + equalimageproc = get_opfamily_proc(tce->btree_opf, + tce->btree_opintype, + tce->btree_opintype, + BTEQUALIMAGE_PROC); + if (!OidIsValid(equalimageproc) || + !DatumGetBool(OidFunctionCall1Coll(equalimageproc, + tce->typcollation, + ObjectIdGetDatum(tce->btree_opintype)))) + return false; + + /* Create the SortGroupClause. */ + sgc = makeNode(SortGroupClause); + + /* Initialize the SortGroupClause. */ + sgc->tleSortGroupRef = ++maxSortGroupRef; + get_sort_group_operators(exprType((Node *) expr), + false, true, false, + &sgc->sortop, &sgc->eqop, NULL, + &sgc->hashable); + + /* This expression should be emitted by the grouped paths */ + add_column_to_pathtarget(target, expr, sgc->tleSortGroupRef); + + /* ... and it also should be emitted by the input paths. */ + add_column_to_pathtarget(agg_input, expr, sgc->tleSortGroupRef); + + /* Record this SortGroupClause and grouping expression */ + *group_clauses = lappend(*group_clauses, sgc); + *group_exprs = lappend(*group_exprs, expr); + } + else if (is_var_in_aggref_only(root, (Var *) expr)) + { + /* + * The expression is referenced by an aggregate function pushed + * down to this relation and does not appear elsewhere in the + * targetlist or havingQual. Add it to 'agg_input' but not to + * 'target'. 
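+	 *
+	 * (For instance, in the hypothetical query "select sum(t1.y) from
+	 * t1, t2 where t1.a = t2.a group by t1.x", t1.y is needed to compute
+	 * the partial aggregate at t1 but need not appear in the grouped
+	 * output.)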
+ */ + add_new_column_to_pathtarget(agg_input, expr); + } + else + { + /* + * The expression may be functionally dependent on other + * expressions in the target, but we cannot verify this until all + * target expressions have been constructed. + */ + possibly_dependent = lappend(possibly_dependent, expr); + } + } + + /* + * Now we can verify whether an expression is functionally dependent on + * others. + */ + foreach(lc, possibly_dependent) + { + Var *tvar; + List *deps = NIL; + RangeTblEntry *rte; + + tvar = lfirst_node(Var, lc); + rte = root->simple_rte_array[tvar->varno]; + + if (check_functional_grouping(rte->relid, tvar->varno, + tvar->varlevelsup, + target->exprs, &deps)) + { + /* + * The expression is functionally dependent on other target + * expressions, so it can be included in the targets. Since it + * will not be used as a grouping key, a sortgroupref is not + * needed for it. + */ + add_new_column_to_pathtarget(target, (Expr *) tvar); + add_new_column_to_pathtarget(agg_input, (Expr *) tvar); + } + else + { + /* + * We may arrive here with a grouping expression that is proven + * redundant by EquivalenceClass processing, such as 't1.a' in the + * query below. + * + * select max(t1.c) from t t1, t t2 where t1.a = 1 group by t1.a, + * t1.b; + * + * For now we just give up in this case. + */ + return false; + } + } + + return true; +} + +/* + * is_var_in_aggref_only + * Check whether the given Var appears in aggregate expressions and not + * elsewhere in the targetlist or havingQual. + */ +static bool +is_var_in_aggref_only(PlannerInfo *root, Var *var) +{ + ListCell *lc; + + /* + * Search the list of aggregate expressions for the Var. + */ + foreach(lc, root->agg_clause_list) + { + AggClauseInfo *ac_info = lfirst_node(AggClauseInfo, lc); + List *vars; + + Assert(IsA(ac_info->aggref, Aggref)); + + if (!bms_is_member(var->varno, ac_info->agg_eval_at)) + continue; + + vars = pull_var_clause((Node *) ac_info->aggref, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_RECURSE_PLACEHOLDERS); + + if (list_member(vars, var)) + { + list_free(vars); + break; + } + + list_free(vars); + } + + return (lc != NULL && !list_member(root->tlist_vars, var)); +} + +/* + * is_var_needed_by_join + * Check if the given Var is needed by joins above the current rel. + */ +static bool +is_var_needed_by_join(PlannerInfo *root, Var *var, RelOptInfo *rel) +{ + Relids relids; + int attno; + RelOptInfo *baserel; + + /* + * Note that when checking if the Var is needed by joins above, we want to + * exclude cases where the Var is only needed in the final targetlist. So + * include "relation 0" in the check. + */ + relids = bms_copy(rel->relids); + relids = bms_add_member(relids, 0); + + baserel = find_base_rel(root, var->varno); + attno = var->varattno - baserel->min_attr; + + return bms_nonempty_difference(baserel->attr_needed[attno], relids); +} + +/* + * get_expression_sortgroupref + * Return the sortgroupref of the given "expr" if it is found among the + * original grouping expressions, or is known equal to any of the original + * grouping expressions due to equivalence relationships. Return 0 if no + * match is found. 
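+ * For example (illustrative only), given
+ *
+ * select ... from t1 join t2 on t1.a = t2.a group by t1.a;
+ *
+ * t2.a is not itself a grouping expression, but the equivalence
+ * t1.a = t2.a allows it to inherit t1.a's sortgroupref.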
+ */ +static Index +get_expression_sortgroupref(PlannerInfo *root, Expr *expr) +{ + ListCell *lc; + + Assert(IsA(expr, Var)); + + foreach(lc, root->group_expr_list) + { + GroupingExprInfo *ge_info = lfirst_node(GroupingExprInfo, lc); + ListCell *lc1; + + Assert(IsA(ge_info->expr, Var)); + Assert(ge_info->sortgroupref > 0); + + if (equal(expr, ge_info->expr)) + return ge_info->sortgroupref; + + if (ge_info->ec == NULL || + !bms_is_member(((Var *) expr)->varno, ge_info->ec->ec_relids)) + continue; + + /* + * Scan the EquivalenceClass, looking for a match to the given + * expression. We ignore child members here. + */ + foreach(lc1, ge_info->ec->ec_members) + { + EquivalenceMember *em = (EquivalenceMember *) lfirst(lc1); + + /* Child members should not exist in ec_members */ + Assert(!em->em_is_child); + + if (equal(expr, em->em_expr)) + return ge_info->sortgroupref; + } + } + + /* no match is found */ + return 0; +} diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c index d2b4ecc5e5131..88eb26bf1c4f9 100644 --- a/src/backend/optimizer/util/tlist.c +++ b/src/backend/optimizer/util/tlist.c @@ -19,6 +19,7 @@ #include "optimizer/cost.h" #include "optimizer/optimizer.h" #include "optimizer/tlist.h" +#include "rewrite/rewriteManip.h" /* @@ -45,6 +46,8 @@ typedef struct typedef struct { + PlannerInfo *root; + bool is_grouping_target; /* true if processing grouping target */ /* This is a List of bare expressions: */ List *input_target_exprs; /* exprs available from input */ /* These are Lists of Lists of split_pathtarget_items: */ @@ -59,6 +62,12 @@ typedef struct Index current_sgref; /* current subexpr's sortgroupref, or 0 */ } split_pathtarget_context; +static void split_pathtarget_at_srfs_extended(PlannerInfo *root, + PathTarget *target, + PathTarget *input_target, + List **targets, + List **targets_contain_srfs, + bool is_grouping_target); static bool split_pathtarget_walker(Node *node, split_pathtarget_context *context); static void add_sp_item_to_pathtarget(PathTarget *target, @@ -467,7 +476,7 @@ extract_grouping_ops(List *groupClause) Oid *groupOperators; ListCell *glitem; - groupOperators = (Oid *) palloc(sizeof(Oid) * numCols); + groupOperators = palloc_array(Oid, numCols); foreach(glitem, groupClause) { @@ -493,7 +502,7 @@ extract_grouping_collations(List *groupClause, List *tlist) Oid *grpCollations; ListCell *glitem; - grpCollations = (Oid *) palloc(sizeof(Oid) * numCols); + grpCollations = palloc_array(Oid, numCols); foreach(glitem, groupClause) { @@ -518,7 +527,7 @@ extract_grouping_cols(List *groupClause, List *tlist) int colno = 0; ListCell *glitem; - grpColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); + grpColIdx = palloc_array(AttrNumber, numCols); foreach(glitem, groupClause) { @@ -822,6 +831,51 @@ apply_pathtarget_labeling_to_tlist(List *tlist, PathTarget *target) /* * split_pathtarget_at_srfs + * Split given PathTarget into multiple levels to position SRFs safely, + * performing exact matching against input_target. + * + * This is a wrapper for split_pathtarget_at_srfs_extended() that is used when + * both targets are on the same side of the grouping boundary (i.e., both are + * pre-grouping or both are post-grouping). In this case, no special handling + * for the grouping nulling bit is required. + * + * See split_pathtarget_at_srfs_extended() for more details. 
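+ *
+ * (For example, splitting final_target against sort_input_target in
+ * grouping_planner() stays entirely post-grouping, so this wrapper
+ * suffices; splitting grouping_target against scanjoin_target crosses
+ * the boundary and would need the variant below. Illustrative, not an
+ * exhaustive list of call sites.)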
+ */ +void +split_pathtarget_at_srfs(PlannerInfo *root, + PathTarget *target, PathTarget *input_target, + List **targets, List **targets_contain_srfs) +{ + split_pathtarget_at_srfs_extended(root, target, input_target, + targets, targets_contain_srfs, + false); +} + +/* + * split_pathtarget_at_srfs_grouping + * Split given PathTarget into multiple levels to position SRFs safely, + * ignoring the grouping nulling bit when matching against input_target. + * + * This variant is used when the targets cross the grouping boundary (i.e., + * target is post-grouping while input_target is pre-grouping). In this case, + * we need to ignore the grouping nulling bit when checking for expression + * availability to avoid incorrectly re-evaluating SRFs that have already been + * computed in input_target. + * + * See split_pathtarget_at_srfs_extended() for more details. + */ +void +split_pathtarget_at_srfs_grouping(PlannerInfo *root, + PathTarget *target, PathTarget *input_target, + List **targets, List **targets_contain_srfs) +{ + split_pathtarget_at_srfs_extended(root, target, input_target, + targets, targets_contain_srfs, + true); +} + +/* + * split_pathtarget_at_srfs_extended * Split given PathTarget into multiple levels to position SRFs safely * * The executor can only handle set-returning functions that appear at the @@ -860,6 +914,13 @@ apply_pathtarget_labeling_to_tlist(List *tlist, PathTarget *target) * already meant as a reference to a lower subexpression). So, don't expand * any tlist expressions that appear in input_target, if that's not NULL. * + * This check requires extra care when processing the grouping target + * (indicated by the is_grouping_target flag). In this case input_target is + * pre-grouping while target is post-grouping, so the latter may carry + * nullingrels bits from the grouping step that are absent in the former. We + * must ignore those bits to correctly recognize that the tlist expressions are + * available in input_target. + * * It's also important that we preserve any sortgroupref annotation appearing * in the given target, especially on expressions matching input_target items. * @@ -877,10 +938,11 @@ apply_pathtarget_labeling_to_tlist(List *tlist, PathTarget *target) * are only a few possible patterns for which levels contain SRFs. * But this representation decouples callers from that knowledge. */ -void -split_pathtarget_at_srfs(PlannerInfo *root, - PathTarget *target, PathTarget *input_target, - List **targets, List **targets_contain_srfs) +static void +split_pathtarget_at_srfs_extended(PlannerInfo *root, + PathTarget *target, PathTarget *input_target, + List **targets, List **targets_contain_srfs, + bool is_grouping_target) { split_pathtarget_context context; int max_depth; @@ -905,7 +967,12 @@ split_pathtarget_at_srfs(PlannerInfo *root, return; } - /* Pass any input_target exprs down to split_pathtarget_walker() */ + /* + * Pass 'root', the is_grouping_target flag, and any input_target exprs + * down to split_pathtarget_walker(). + */ + context.root = root; + context.is_grouping_target = is_grouping_target; context.input_target_exprs = input_target ? 
input_target->exprs : NIL; /* @@ -1076,9 +1143,27 @@ split_pathtarget_at_srfs(PlannerInfo *root, static bool split_pathtarget_walker(Node *node, split_pathtarget_context *context) { + Node *sanitized_node = node; + if (node == NULL) return false; + /* + * If we are crossing the grouping boundary (post-grouping target vs + * pre-grouping input_target), we must ignore the grouping nulling bit to + * correctly check if the subexpression is available in input_target. This + * aligns with the matching logic in set_upper_references(). + */ + if (context->is_grouping_target && + context->root->parse->hasGroupRTE && + context->root->parse->groupingSets != NIL) + { + sanitized_node = + remove_nulling_relids(node, + bms_make_singleton(context->root->group_rtindex), + NULL); + } + /* * A subexpression that matches an expression already computed in * input_target can be treated like a Var (which indeed it will be after @@ -1087,9 +1172,9 @@ split_pathtarget_walker(Node *node, split_pathtarget_context *context) * substructure. (Note in particular that this preserves the identity of * any expressions that appear as sortgrouprefs in input_target.) */ - if (list_member(context->input_target_exprs, node)) + if (list_member(context->input_target_exprs, sanitized_node)) { - split_pathtarget_item *item = palloc(sizeof(split_pathtarget_item)); + split_pathtarget_item *item = palloc_object(split_pathtarget_item); item->expr = node; item->sortgroupref = context->current_sgref; @@ -1109,7 +1194,7 @@ split_pathtarget_walker(Node *node, split_pathtarget_context *context) IsA(node, GroupingFunc) || IsA(node, WindowFunc)) { - split_pathtarget_item *item = palloc(sizeof(split_pathtarget_item)); + split_pathtarget_item *item = palloc_object(split_pathtarget_item); item->expr = node; item->sortgroupref = context->current_sgref; @@ -1124,7 +1209,7 @@ split_pathtarget_walker(Node *node, split_pathtarget_context *context) */ if (IS_SRF_CALL(node)) { - split_pathtarget_item *item = palloc(sizeof(split_pathtarget_item)); + split_pathtarget_item *item = palloc_object(split_pathtarget_item); List *save_input_vars = context->current_input_vars; List *save_input_srfs = context->current_input_srfs; int save_current_depth = context->current_depth; diff --git a/src/backend/parser/README b/src/backend/parser/README index e0c986a41efea..e26eb437a9f35 100644 --- a/src/backend/parser/README +++ b/src/backend/parser/README @@ -20,6 +20,7 @@ parse_cte.c handle Common Table Expressions (WITH clauses) parse_expr.c handle expressions like col, col + 3, x = 3 or x = 4 parse_enr.c handle ephemeral named rels (trigger transition tables, ...) 
parse_func.c handle functions, table.column and column identifiers +parse_jsontable.c handle JSON_TABLE parse_merge.c handle MERGE parse_node.c create nodes for various structures parse_oper.c handle operators in expressions diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index a16fdd65601d5..92be345d9a898 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -25,6 +25,7 @@ #include "postgres.h" #include "access/sysattr.h" +#include "catalog/dependency.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" #include "commands/defrem.h" @@ -55,6 +56,14 @@ #include "utils/syscache.h" +/* Passthrough data for transformPLAssignStmtTarget */ +typedef struct SelectStmtPassthrough +{ + PLAssignStmt *stmt; /* the assignment statement */ + Node *target; /* node representing the target variable */ + List *indirection; /* indirection yet to be applied to target */ +} SelectStmtPassthrough; + /* Hook for plugins to get control at end of parse analysis */ post_parse_analyze_hook_type post_parse_analyze_hook = NULL; @@ -64,7 +73,8 @@ static Query *transformInsertStmt(ParseState *pstate, InsertStmt *stmt); static OnConflictExpr *transformOnConflictClause(ParseState *pstate, OnConflictClause *onConflictClause); static int count_rowexpr_columns(ParseState *pstate, Node *expr); -static Query *transformSelectStmt(ParseState *pstate, SelectStmt *stmt); +static Query *transformSelectStmt(ParseState *pstate, SelectStmt *stmt, + SelectStmtPassthrough *passthru); static Query *transformValuesClause(ParseState *pstate, SelectStmt *stmt); static Query *transformSetOperationStmt(ParseState *pstate, SelectStmt *stmt); static Node *transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, @@ -75,6 +85,8 @@ static Query *transformReturnStmt(ParseState *pstate, ReturnStmt *stmt); static Query *transformUpdateStmt(ParseState *pstate, UpdateStmt *stmt); static Query *transformPLAssignStmt(ParseState *pstate, PLAssignStmt *stmt); +static List *transformPLAssignStmtTarget(ParseState *pstate, List *tlist, + SelectStmtPassthrough *passthru); static Query *transformDeclareCursorStmt(ParseState *pstate, DeclareCursorStmt *stmt); static Query *transformExplainStmt(ParseState *pstate, @@ -238,103 +250,24 @@ parse_sub_analyze(Node *parseTree, ParseState *parentParseState, return query; } -/* - * setQueryLocationAndLength - * Set query's location and length from statement and ParseState - * - * Some statements, like PreparableStmt, can be located within parentheses. - * For example "(SELECT 1)" or "COPY (UPDATE ...) to x;". For those, we - * cannot use the whole string from the statement's location or the SQL - * string would yield incorrectly. The parser will set stmt_len, reflecting - * the size of the statement within the parentheses. Thus, when stmt_len is - * available, we need to use it for the Query's stmt_len. - * - * For other cases, the parser can't provide the length of individual - * statements. However, we have the statement's location plus the length - * (p_stmt_len) and location (p_stmt_location) of the top level RawStmt, - * stored in pstate. Thus, the statement's length is the RawStmt's length - * minus how much we've advanced in the RawStmt's string. If p_stmt_len - * is 0, the SQL string is used up to its end. 
- */ -static void -setQueryLocationAndLength(ParseState *pstate, Query *qry, Node *parseTree) -{ - ParseLoc stmt_len = 0; - - switch (nodeTag(parseTree)) - { - case T_InsertStmt: - qry->stmt_location = ((InsertStmt *) parseTree)->stmt_location; - stmt_len = ((InsertStmt *) parseTree)->stmt_len; - break; - - case T_DeleteStmt: - qry->stmt_location = ((DeleteStmt *) parseTree)->stmt_location; - stmt_len = ((DeleteStmt *) parseTree)->stmt_len; - break; - - case T_UpdateStmt: - qry->stmt_location = ((UpdateStmt *) parseTree)->stmt_location; - stmt_len = ((UpdateStmt *) parseTree)->stmt_len; - break; - - case T_MergeStmt: - qry->stmt_location = ((MergeStmt *) parseTree)->stmt_location; - stmt_len = ((MergeStmt *) parseTree)->stmt_len; - break; - - case T_SelectStmt: - qry->stmt_location = ((SelectStmt *) parseTree)->stmt_location; - stmt_len = ((SelectStmt *) parseTree)->stmt_len; - break; - - case T_PLAssignStmt: - qry->stmt_location = ((PLAssignStmt *) parseTree)->location; - break; - - default: - qry->stmt_location = pstate->p_stmt_location; - break; - } - - if (stmt_len > 0) - { - /* Statement's length is known, use it */ - qry->stmt_len = stmt_len; - } - else if (pstate->p_stmt_len > 0) - { - /* - * The top RawStmt's length is known, so calculate the statement's - * length from the statement's location and the RawStmt's length and - * location. - */ - qry->stmt_len = pstate->p_stmt_len - (qry->stmt_location - pstate->p_stmt_location); - } - - /* The calculated statement length should be calculated as positive. */ - Assert(qry->stmt_len >= 0); -} - /* * transformTopLevelStmt - * transform a Parse tree into a Query tree. * - * This function is just responsible for storing location data - * from the RawStmt into the ParseState. + * This function is just responsible for transferring statement location data + * from the RawStmt into the finished Query. */ Query * transformTopLevelStmt(ParseState *pstate, RawStmt *parseTree) { Query *result; - /* Store RawStmt's length and location in pstate */ - pstate->p_stmt_len = parseTree->stmt_len; - pstate->p_stmt_location = parseTree->stmt_location; - /* We're at top level, so allow SELECT INTO */ result = transformOptionalSelectInto(pstate, parseTree->stmt); + result->stmt_location = parseTree->stmt_location; + result->stmt_len = parseTree->stmt_len; + return result; } @@ -450,7 +383,7 @@ transformStmt(ParseState *pstate, Node *parseTree) if (n->valuesLists) result = transformValuesClause(pstate, n); else if (n->op == SETOP_NONE) - result = transformSelectStmt(pstate, n); + result = transformSelectStmt(pstate, n, NULL); else result = transformSetOperationStmt(pstate, n); } @@ -496,14 +429,13 @@ transformStmt(ParseState *pstate, Node *parseTree) */ result = makeNode(Query); result->commandType = CMD_UTILITY; - result->utilityStmt = (Node *) parseTree; + result->utilityStmt = parseTree; break; } /* Mark as original query until we learn differently */ result->querySource = QSRC_ORIGINAL; result->canSetTag = true; - setQueryLocationAndLength(pstate, result, parseTree); return result; } @@ -857,7 +789,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) */ nsitem = addRangeTableEntryForSubquery(pstate, selectQuery, - makeAlias("*SELECT*", NIL), + NULL, false, false); addNSItemToQuery(pstate, nsitem, true, false, false); @@ -1454,11 +1386,19 @@ count_rowexpr_columns(ParseState *pstate, Node *expr) * transformSelectStmt - * transforms a Select Statement * + * This function is also used to transform the source expression of a + * PLAssignStmt. 
In that usage, passthru is non-NULL and we need to + * call transformPLAssignStmtTarget after the initial transformation of the + * SELECT's targetlist. (We could generalize this into an arbitrary callback + * function, but for now that would just be more notation with no benefit.) + * All the rest is the same as a regular SelectStmt. + * * Note: this covers only cases with no set operations and no VALUES lists; * see below for the other cases. */ static Query * -transformSelectStmt(ParseState *pstate, SelectStmt *stmt) +transformSelectStmt(ParseState *pstate, SelectStmt *stmt, + SelectStmtPassthrough *passthru) { Query *qry = makeNode(Query); Node *qual; @@ -1495,8 +1435,16 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt) qry->targetList = transformTargetList(pstate, stmt->targetList, EXPR_KIND_SELECT_TARGET); - /* mark column origins */ - markTargetListOrigins(pstate, qry->targetList); + /* + * If we're within a PLAssignStmt, do further transformation of the + * targetlist; that has to happen before we consider sorting or grouping. + * Otherwise, mark column origins (which are useless in a PLAssignStmt). + */ + if (passthru) + qry->targetList = transformPLAssignStmtTarget(pstate, qry->targetList, + passthru); + else + markTargetListOrigins(pstate, qry->targetList); /* transform WHERE */ qual = transformWhereClause(pstate, stmt->whereClause, @@ -1520,12 +1468,14 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt) qry->groupClause = transformGroupClause(pstate, stmt->groupClause, + stmt->groupByAll, &qry->groupingSets, &qry->targetList, qry->sortClause, EXPR_KIND_GROUP_BY, false /* allow SQL92 rules */ ); qry->groupDistinct = stmt->groupDistinct; + qry->groupByAll = stmt->groupByAll; if (stmt->distinctClause == NIL) { @@ -2180,7 +2130,6 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, { /* Process leaf SELECT */ Query *selectQuery; - char selectName[32]; ParseNamespaceItem *nsitem; RangeTblRef *rtr; ListCell *tl; @@ -2236,11 +2185,9 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, /* * Make the leaf query be a subquery in the top-level rangetable. 
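+ *
+ * (Passing a NULL alias, as in the transformInsertStmt change above,
+ * leaves it to addRangeTableEntryForSubquery to supply a default
+ * eref rather than the old "*SELECT* n" name.)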
*/ - snprintf(selectName, sizeof(selectName), "*SELECT* %d", - list_length(pstate->p_rtable) + 1); nsitem = addRangeTableEntryForSubquery(pstate, selectQuery, - makeAlias(selectName, NIL), + NULL, false, false); @@ -2694,8 +2641,7 @@ addNSItemForReturning(ParseState *pstate, const char *aliasname, colnames = pstate->p_target_nsitem->p_rte->eref->colnames; numattrs = list_length(colnames); - nscolumns = (ParseNamespaceColumn *) - palloc(numattrs * sizeof(ParseNamespaceColumn)); + nscolumns = palloc_array(ParseNamespaceColumn, numattrs); memcpy(nscolumns, pstate->p_target_nsitem->p_nscolumns, numattrs * sizeof(ParseNamespaceColumn)); @@ -2705,7 +2651,7 @@ addNSItemForReturning(ParseState *pstate, const char *aliasname, nscolumns[i].p_varreturningtype = returning_type; /* build the nsitem, copying most fields from the target relation */ - nsitem = (ParseNamespaceItem *) palloc(sizeof(ParseNamespaceItem)); + nsitem = palloc_object(ParseNamespaceItem); nsitem->p_names = makeAlias(aliasname, colnames); nsitem->p_rte = pstate->p_target_nsitem->p_rte; nsitem->p_rtindex = pstate->p_target_nsitem->p_rtindex; @@ -2846,20 +2792,13 @@ transformReturningClause(ParseState *pstate, Query *qry, static Query * transformPLAssignStmt(ParseState *pstate, PLAssignStmt *stmt) { - Query *qry = makeNode(Query); + Query *qry; ColumnRef *cref = makeNode(ColumnRef); List *indirection = stmt->indirection; int nnames = stmt->nnames; - SelectStmt *sstmt = stmt->val; Node *target; - Oid targettype; - int32 targettypmod; - Oid targetcollation; - List *tlist; - TargetEntry *tle; - Oid type_id; - Node *qual; - ListCell *l; + SelectStmtPassthrough passthru; + bool save_resolve_unknowns; /* * First, construct a ColumnRef for the target variable. If the target @@ -2885,33 +2824,62 @@ transformPLAssignStmt(ParseState *pstate, PLAssignStmt *stmt) /* * Transform the target reference. Typically we will get back a Param - * node, but there's no reason to be too picky about its type. + * node, but there's no reason to be too picky about its type. (Note that + * we must do this before calling transformSelectStmt. It's tempting to + * do it inside transformPLAssignStmtTarget, but we need to do it before + * adding any FROM tables to the pstate's namespace, else we might wrongly + * resolve the target as a table column.) */ target = transformExpr(pstate, (Node *) cref, EXPR_KIND_UPDATE_TARGET); - targettype = exprType(target); - targettypmod = exprTypmod(target); - targetcollation = exprCollation(target); + + /* Set up passthrough data for transformPLAssignStmtTarget */ + passthru.stmt = stmt; + passthru.target = target; + passthru.indirection = indirection; /* - * The rest mostly matches transformSelectStmt, except that we needn't - * consider WITH or INTO, and we build a targetlist our own way. + * To avoid duplicating a lot of code, we use transformSelectStmt to do + * almost all of the work. However, we need to do additional processing + * on the SELECT's targetlist after it's been transformed, but before + * possible addition of targetlist items for ORDER BY or GROUP BY. + * transformSelectStmt knows it should call transformPLAssignStmtTarget if + * it's passed a passthru argument. + * + * Also, disable resolution of unknown-type tlist items; PL/pgSQL wants to + * deal with that itself. 
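+ *
+ * (A hypothetical assignment exercising this path:
+ *
+ * total := sum(amount) FROM orders WHERE cust = c_id;
+ *
+ * the SELECT machinery handles the FROM/WHERE/GROUP BY parts, after
+ * which the single tlist item is coerced to the target variable's
+ * type.)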
*/ - qry->commandType = CMD_SELECT; - pstate->p_is_insert = false; - - /* make FOR UPDATE/FOR SHARE info available to addRangeTableEntry */ - pstate->p_locking_clause = sstmt->lockingClause; + save_resolve_unknowns = pstate->p_resolve_unknowns; + pstate->p_resolve_unknowns = false; + qry = transformSelectStmt(pstate, stmt->val, &passthru); + pstate->p_resolve_unknowns = save_resolve_unknowns; - /* make WINDOW info available for window functions, too */ - pstate->p_windowdefs = sstmt->windowClause; + return qry; +} - /* process the FROM clause */ - transformFromClause(pstate, sstmt->fromClause); +/* + * Callback function to adjust a SELECT's tlist to make the output suitable + * for assignment to a PLAssignStmt's target variable. + * + * Note: we actually modify the tle->expr in-place, but the function's API + * is set up to not presume that. + */ +static List * +transformPLAssignStmtTarget(ParseState *pstate, List *tlist, + SelectStmtPassthrough *passthru) +{ + PLAssignStmt *stmt = passthru->stmt; + Node *target = passthru->target; + List *indirection = passthru->indirection; + Oid targettype; + int32 targettypmod; + Oid targetcollation; + TargetEntry *tle; + Oid type_id; - /* initially transform the targetlist as if in SELECT */ - tlist = transformTargetList(pstate, sstmt->targetList, - EXPR_KIND_SELECT_TARGET); + targettype = exprType(target); + targettypmod = exprTypmod(target); + targetcollation = exprCollation(target); /* we should have exactly one targetlist item */ if (list_length(tlist) != 1) @@ -2989,96 +2957,7 @@ transformPLAssignStmt(ParseState *pstate, PLAssignStmt *stmt) pstate->p_expr_kind = EXPR_KIND_NONE; - qry->targetList = list_make1(tle); - - /* transform WHERE */ - qual = transformWhereClause(pstate, sstmt->whereClause, - EXPR_KIND_WHERE, "WHERE"); - - /* initial processing of HAVING clause is much like WHERE clause */ - qry->havingQual = transformWhereClause(pstate, sstmt->havingClause, - EXPR_KIND_HAVING, "HAVING"); - - /* - * Transform sorting/grouping stuff. Do ORDER BY first because both - * transformGroupClause and transformDistinctClause need the results. Note - * that these functions can also change the targetList, so it's passed to - * them by reference. 
- */ - qry->sortClause = transformSortClause(pstate, - sstmt->sortClause, - &qry->targetList, - EXPR_KIND_ORDER_BY, - false /* allow SQL92 rules */ ); - - qry->groupClause = transformGroupClause(pstate, - sstmt->groupClause, - &qry->groupingSets, - &qry->targetList, - qry->sortClause, - EXPR_KIND_GROUP_BY, - false /* allow SQL92 rules */ ); - - if (sstmt->distinctClause == NIL) - { - qry->distinctClause = NIL; - qry->hasDistinctOn = false; - } - else if (linitial(sstmt->distinctClause) == NULL) - { - /* We had SELECT DISTINCT */ - qry->distinctClause = transformDistinctClause(pstate, - &qry->targetList, - qry->sortClause, - false); - qry->hasDistinctOn = false; - } - else - { - /* We had SELECT DISTINCT ON */ - qry->distinctClause = transformDistinctOnClause(pstate, - sstmt->distinctClause, - &qry->targetList, - qry->sortClause); - qry->hasDistinctOn = true; - } - - /* transform LIMIT */ - qry->limitOffset = transformLimitClause(pstate, sstmt->limitOffset, - EXPR_KIND_OFFSET, "OFFSET", - sstmt->limitOption); - qry->limitCount = transformLimitClause(pstate, sstmt->limitCount, - EXPR_KIND_LIMIT, "LIMIT", - sstmt->limitOption); - qry->limitOption = sstmt->limitOption; - - /* transform window clauses after we have seen all window functions */ - qry->windowClause = transformWindowDefinitions(pstate, - pstate->p_windowdefs, - &qry->targetList); - - qry->rtable = pstate->p_rtable; - qry->rteperminfos = pstate->p_rteperminfos; - qry->jointree = makeFromExpr(pstate->p_joinlist, qual); - - qry->hasSubLinks = pstate->p_hasSubLinks; - qry->hasWindowFuncs = pstate->p_hasWindowFuncs; - qry->hasTargetSRFs = pstate->p_hasTargetSRFs; - qry->hasAggs = pstate->p_hasAggs; - - foreach(l, sstmt->lockingClause) - { - transformLockingClause(pstate, qry, - (LockingClause *) lfirst(l), false); - } - - assign_query_collations(pstate, qry); - - /* this must be done after collations, for reliable comparison of exprs */ - if (pstate->p_hasAggs || qry->groupClause || qry->groupingSets || qry->havingQual) - parseCheckAggregates(pstate, qry); - - return qry; + return list_make1(tle); } @@ -3250,6 +3129,8 @@ transformCreateTableAsStmt(ParseState *pstate, CreateTableAsStmt *stmt) /* additional work needed for CREATE MATERIALIZED VIEW */ if (stmt->objtype == OBJECT_MATVIEW) { + ObjectAddress temp_object; + /* * Prohibit a data-modifying CTE in the query used to create a * materialized view. It's not sufficiently clear what the user would @@ -3265,10 +3146,12 @@ transformCreateTableAsStmt(ParseState *pstate, CreateTableAsStmt *stmt) * creation query. It would be hard to refresh data or incrementally * maintain it if a source disappeared. 
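+ *
+ * (With the errdetail added below, the failure might read, roughly:
+ * ERROR:  materialized views must not use temporary objects
+ * DETAIL:  This view depends on temporary table foo.
+ * with the exact object description supplied by
+ * getObjectDescription().)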
*/ - if (isQueryUsingTempRelation(query)) + if (query_uses_temp_object(query, &temp_object)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("materialized views must not use temporary tables or views"))); + errmsg("materialized views must not use temporary objects"), + errdetail("This view depends on temporary %s.", + getObjectDescription(&temp_object, false)))); /* * A materialized view would either need to save parameters for use in diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 0b5652071d119..28f4e11e30ff2 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -120,6 +120,7 @@ typedef struct SelectLimit typedef struct GroupClause { bool distinct; + bool all; List *list; } GroupClause; @@ -154,7 +155,6 @@ static void base_yyerror(YYLTYPE *yylloc, core_yyscan_t yyscanner, const char *msg); static RawStmt *makeRawStmt(Node *stmt, int stmt_location); static void updateRawStmtEnd(RawStmt *rs, int end_location); -static void updatePreparableStmtEnd(Node *n, int end_location); static Node *makeColumnRef(char *colname, List *indirection, int location, core_yyscan_t yyscanner); static Node *makeTypeCast(Node *arg, TypeName *typename, int location); @@ -178,13 +178,13 @@ static void insertSelectOptions(SelectStmt *stmt, SelectLimit *limitClause, WithClause *withClause, core_yyscan_t yyscanner); -static Node *makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg, int location); +static Node *makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg); static Node *doNegate(Node *n, int location); static void doNegateFloat(Float *v); static Node *makeAndExpr(Node *lexpr, Node *rexpr, int location); static Node *makeOrExpr(Node *lexpr, Node *rexpr, int location); static Node *makeNotExpr(Node *expr, int location); -static Node *makeAArrayExpr(List *elements, int location); +static Node *makeAArrayExpr(List *elements, int location, int end_location); static Node *makeSQLValueFunction(SQLValueFunctionOp op, int32 typmod, int location); static Node *makeXmlExpr(XmlExprOp op, char *name, List *named_args, @@ -202,6 +202,10 @@ static void processCASbits(int cas_bits, int location, const char *constrType, bool *not_valid, bool *no_inherit, core_yyscan_t yyscanner); static PartitionStrategy parsePartitionStrategy(char *strategy, int location, core_yyscan_t yyscanner); +static void preprocess_pub_all_objtype_list(List *all_objects_list, + bool *all_tables, + bool *all_sequences, + core_yyscan_t yyscanner); static void preprocess_pubobj_list(List *pubobjspec_list, core_yyscan_t yyscanner); static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); @@ -258,8 +262,10 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); PartitionElem *partelem; PartitionSpec *partspec; PartitionBoundSpec *partboundspec; + SinglePartitionSpec *singlepartspec; RoleSpec *rolespec; PublicationObjSpec *publicationobjectspec; + PublicationAllObjSpec *publicationallobjectspec; struct SelectLimit *selectlimit; SetQuantifier setquantifier; struct GroupClause *groupclause; @@ -303,7 +309,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); SecLabelStmt SelectStmt TransactionStmt TransactionStmtLegacy TruncateStmt UnlistenStmt UpdateStmt VacuumStmt VariableResetStmt VariableSetStmt VariableShowStmt - ViewStmt CheckPointStmt CreateConversionStmt + ViewStmt WaitStmt CheckPointStmt CreateConversionStmt DeallocateStmt PrepareStmt ExecuteStmt DropOwnedStmt ReassignOwnedStmt 
AlterTSConfigurationStmt AlterTSDictionaryStmt @@ -319,6 +325,12 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type opt_qualified_name %type opt_concurrently %type opt_drop_behavior +%type opt_utility_option_list +%type opt_wait_with_clause +%type utility_option_list +%type utility_option_elem +%type utility_option_name +%type utility_option_arg %type alter_column_default opclass_item opclass_drop alter_using %type add_drop opt_asc_desc opt_nulls_order @@ -339,10 +351,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); create_extension_opt_item alter_extension_opt_item %type opt_lock lock_type cast_context -%type utility_option_name -%type utility_option_elem -%type utility_option_list -%type utility_option_arg %type drop_option %type opt_or_replace opt_no opt_grant_grant_option @@ -446,7 +454,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); transform_element_list transform_type_list TriggerTransitions TriggerReferencing vacuum_relation_list opt_vacuum_relation_list - drop_option_list pub_obj_list + drop_option_list pub_obj_list pub_all_obj_type_list %type returning_clause %type returning_option @@ -523,7 +531,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type def_elem reloption_elem old_aggr_elem operator_def_elem %type def_arg columnElem where_clause where_or_current_clause a_expr b_expr c_expr AexprConst indirection_el opt_slice_bound - columnref in_expr having_clause func_table xmltable array_expr + columnref having_clause func_table xmltable array_expr OptWhereClause operator_def_arg %type opt_column_and_period_list %type rowsfrom_item rowsfrom_list opt_col_def_list @@ -557,7 +565,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type generic_option_list alter_generic_option_list %type reindex_target_relation reindex_target_all -%type opt_reindex_option_list %type copy_generic_opt_arg copy_generic_opt_arg_list_item %type copy_generic_opt_elem @@ -585,6 +592,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type var_value zone_value %type auth_ident RoleSpec opt_granted_by %type PublicationObjSpec +%type PublicationAllObjSpec %type unreserved_keyword type_func_name_keyword %type col_name_keyword reserved_keyword @@ -632,7 +640,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type window_clause window_definition_list opt_partition_clause %type window_definition over_clause window_specification opt_frame_clause frame_extent frame_bound -%type opt_window_exclusion_clause +%type null_treatment opt_window_exclusion_clause %type opt_existing_window_name %type opt_if_not_exists %type opt_unique_null_treatment @@ -641,6 +649,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type part_elem %type part_params %type PartitionBoundSpec +%type SinglePartitionSpec +%type partitions_list %type hash_partbound %type hash_partbound_elem @@ -672,7 +682,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); json_object_constructor_null_clause_opt json_array_constructor_null_clause_opt - /* * Non-keyword token types. These are hard-wired into the "flex" lexer. 
* They must be listed first so that their numeric codes do not depend on @@ -730,7 +739,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); HANDLER HAVING HEADER_P HOLD HOUR_P - IDENTITY_P IF_P ILIKE IMMEDIATE IMMUTABLE IMPLICIT_P IMPORT_P IN_P INCLUDE + IDENTITY_P IF_P IGNORE_P ILIKE IMMEDIATE IMMUTABLE IMPLICIT_P IMPORT_P IN_P INCLUDE INCLUDING INCREMENT INDENT INDEX INDEXES INHERIT INHERITS INITIALLY INLINE_P INNER_P INOUT INPUT_P INSENSITIVE INSERT INSTEAD INT_P INTEGER INTERSECT INTERVAL INTO INVOKER IS ISNULL ISOLATION @@ -742,7 +751,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); LABEL LANGUAGE LARGE_P LAST_P LATERAL_P LEADING LEAKPROOF LEAST LEFT LEVEL LIKE LIMIT LISTEN LOAD LOCAL - LOCALTIME LOCALTIMESTAMP LOCATION LOCK_P LOCKED LOGGED + LOCALTIME LOCALTIMESTAMP LOCATION LOCK_P LOCKED LOGGED LSN_P MAPPING MATCH MATCHED MATERIALIZED MAXVALUE MERGE MERGE_ACTION METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE @@ -756,7 +765,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); ORDER ORDINALITY OTHERS OUT_P OUTER_P OVER OVERLAPS OVERLAY OVERRIDING OWNED OWNER - PARALLEL PARAMETER PARSER PARTIAL PARTITION PASSING PASSWORD PATH + PARALLEL PARAMETER PARSER PARTIAL PARTITION PARTITIONS PASSING PASSWORD PATH PERIOD PLACING PLAN PLANS POLICY POSITION PRECEDING PRECISION PRESERVE PREPARE PREPARED PRIMARY PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROCEDURES PROGRAM PUBLICATION @@ -765,13 +774,13 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); RANGE READ REAL REASSIGN RECURSIVE REF_P REFERENCES REFERENCING REFRESH REINDEX RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA - RESET RESTART RESTRICT RETURN RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROLLUP + RESET RESPECT_P RESTART RESTRICT RETURN RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROLLUP ROUTINE ROUTINES ROW ROWS RULE SAVEPOINT SCALAR SCHEMA SCHEMAS SCROLL SEARCH SECOND_P SECURITY SELECT SEQUENCE SEQUENCES SERIALIZABLE SERVER SESSION SESSION_USER SET SETS SETOF SHARE SHOW - SIMILAR SIMPLE SKIP SMALLINT SNAPSHOT SOME SOURCE SQL_P STABLE STANDALONE_P + SIMILAR SIMPLE SKIP SMALLINT SNAPSHOT SOME SPLIT SOURCE SQL_P STABLE STANDALONE_P START STATEMENT STATISTICS STDIN STDOUT STORAGE STORED STRICT_P STRING_P STRIP_P SUBSCRIPTION SUBSTRING SUPPORT SYMMETRIC SYSID SYSTEM_P SYSTEM_USER @@ -786,7 +795,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); VACUUM VALID VALIDATE VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING VERBOSE VERSION_P VIEW VIEWS VIRTUAL VOLATILE - WHEN WHERE WHITESPACE_P WINDOW WITH WITHIN WITHOUT WORK WRAPPER WRITE + WAIT WHEN WHERE WHITESPACE_P WINDOW WITH WITHIN WITHOUT WORK WRAPPER WRITE XML_P XMLATTRIBUTES XMLCONCAT XMLELEMENT XMLEXISTS XMLFOREST XMLNAMESPACES XMLPARSE XMLPI XMLROOT XMLSERIALIZE XMLTABLE @@ -1114,6 +1123,7 @@ stmt: | VariableSetStmt | VariableShowStmt | ViewStmt + | WaitStmt | /*EMPTY*/ { $$ = NULL; } ; @@ -1142,6 +1152,41 @@ opt_drop_behavior: | /* EMPTY */ { $$ = DROP_RESTRICT; /* default */ } ; +opt_utility_option_list: + '(' utility_option_list ')' { $$ = $2; } + | /* EMPTY */ { $$ = NULL; } + ; + +utility_option_list: + utility_option_elem + { + $$ = list_make1($1); + } + | utility_option_list ',' utility_option_elem + { + $$ = lappend($1, $3); + } + ; + +utility_option_elem: + utility_option_name utility_option_arg + { + $$ = makeDefElem($1, $2, @1); + } + ; + +utility_option_name: + NonReservedWord { $$ = $1; } + | 
analyze_keyword { $$ = "analyze"; } + | FORMAT_LA { $$ = "format"; } + ; + +utility_option_arg: + opt_boolean_or_string { $$ = (Node *) makeString($1); } + | NumericOnly { $$ = (Node *) $1; } + | /* EMPTY */ { $$ = NULL; } + ; + /***************************************************************************** * * CALL statement @@ -1675,6 +1720,26 @@ generic_set: n->location = @3; $$ = n; } + | var_name TO NULL_P + { + VariableSetStmt *n = makeNode(VariableSetStmt); + + n->kind = VAR_SET_VALUE; + n->name = $1; + n->args = list_make1(makeNullAConst(@3)); + n->location = @3; + $$ = n; + } + | var_name '=' NULL_P + { + VariableSetStmt *n = makeNode(VariableSetStmt); + + n->kind = VAR_SET_VALUE; + n->name = $1; + n->args = list_make1(makeNullAConst(@3)); + n->location = @3; + $$ = n; + } | var_name TO DEFAULT { VariableSetStmt *n = makeNode(VariableSetStmt); @@ -2029,11 +2094,12 @@ constraints_set_mode: * Checkpoint statement */ CheckPointStmt: - CHECKPOINT + CHECKPOINT opt_utility_option_list { CheckPointStmt *n = makeNode(CheckPointStmt); $$ = (Node *) n; + n->options = $2; } ; @@ -2322,6 +2388,23 @@ alter_table_cmds: | alter_table_cmds ',' alter_table_cmd { $$ = lappend($1, $3); } ; +partitions_list: + SinglePartitionSpec { $$ = list_make1($1); } + | partitions_list ',' SinglePartitionSpec { $$ = lappend($1, $3); } + ; + +SinglePartitionSpec: + PARTITION qualified_name PartitionBoundSpec + { + SinglePartitionSpec *n = makeNode(SinglePartitionSpec); + + n->name = $2; + n->bound = $3; + + $$ = n; + } + ; + partition_cmd: /* ALTER TABLE ATTACH PARTITION FOR VALUES */ ATTACH PARTITION qualified_name PartitionBoundSpec @@ -2332,6 +2415,7 @@ partition_cmd: n->subtype = AT_AttachPartition; cmd->name = $3; cmd->bound = $4; + cmd->partlist = NIL; cmd->concurrent = false; n->def = (Node *) cmd; @@ -2346,6 +2430,7 @@ partition_cmd: n->subtype = AT_DetachPartition; cmd->name = $3; cmd->bound = NULL; + cmd->partlist = NIL; cmd->concurrent = $4; n->def = (Node *) cmd; @@ -2359,6 +2444,35 @@ partition_cmd: n->subtype = AT_DetachPartitionFinalize; cmd->name = $3; cmd->bound = NULL; + cmd->partlist = NIL; + cmd->concurrent = false; + n->def = (Node *) cmd; + $$ = (Node *) n; + } + /* ALTER TABLE SPLIT PARTITION INTO () */ + | SPLIT PARTITION qualified_name INTO '(' partitions_list ')' + { + AlterTableCmd *n = makeNode(AlterTableCmd); + PartitionCmd *cmd = makeNode(PartitionCmd); + + n->subtype = AT_SplitPartition; + cmd->name = $3; + cmd->bound = NULL; + cmd->partlist = $6; + cmd->concurrent = false; + n->def = (Node *) cmd; + $$ = (Node *) n; + } + /* ALTER TABLE MERGE PARTITIONS () INTO */ + | MERGE PARTITIONS '(' qualified_name_list ')' INTO qualified_name + { + AlterTableCmd *n = makeNode(AlterTableCmd); + PartitionCmd *cmd = makeNode(PartitionCmd); + + n->subtype = AT_MergePartitions; + cmd->name = $7; + cmd->bound = NULL; + cmd->partlist = $4; cmd->concurrent = false; n->def = (Node *) cmd; $$ = (Node *) n; @@ -2375,6 +2489,7 @@ index_partition_cmd: n->subtype = AT_AttachPartition; cmd->name = $3; cmd->bound = NULL; + cmd->partlist = NIL; cmd->concurrent = false; n->def = (Node *) cmd; @@ -2669,6 +2784,12 @@ alter_table_cmd: c->alterDeferrability = true; if ($4 & CAS_NO_INHERIT) c->alterInheritability = true; + /* handle unsupported case with specific error message */ + if ($4 & CAS_NOT_VALID) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraints cannot be altered to be NOT VALID"), + parser_errposition(@4)); processCASbits($4, @4, "FOREIGN KEY", &c->deferrable, 
&c->initdeferred, @@ -3360,7 +3481,7 @@ ClosePortalStmt: * COPY ( query ) TO file [WITH] [(options)] * * where 'query' can be one of: - * { SELECT | UPDATE | INSERT | DELETE } + * { SELECT | UPDATE | INSERT | DELETE | MERGE } * * and 'file' can be one of: * { PROGRAM 'command' | STDIN | STDOUT | 'filename' } @@ -3401,6 +3522,7 @@ CopyStmt: COPY opt_binary qualified_name opt_column_list ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("WHERE clause not allowed with COPY TO"), + errhint("Try the COPY (SELECT ... WHERE ...) TO variant."), parser_errposition(@11))); n->options = NIL; @@ -3417,7 +3539,6 @@ CopyStmt: COPY opt_binary qualified_name opt_column_list { CopyStmt *n = makeNode(CopyStmt); - updatePreparableStmtEnd($3, @4); n->relation = NULL; n->query = $3; n->attlist = NIL; @@ -4486,19 +4607,19 @@ OptWhereClause: key_actions: key_update { - KeyActions *n = palloc(sizeof(KeyActions)); + KeyActions *n = palloc_object(KeyActions); n->updateAction = $1; - n->deleteAction = palloc(sizeof(KeyAction)); + n->deleteAction = palloc_object(KeyAction); n->deleteAction->action = FKCONSTR_ACTION_NOACTION; n->deleteAction->cols = NIL; $$ = n; } | key_delete { - KeyActions *n = palloc(sizeof(KeyActions)); + KeyActions *n = palloc_object(KeyActions); - n->updateAction = palloc(sizeof(KeyAction)); + n->updateAction = palloc_object(KeyAction); n->updateAction->action = FKCONSTR_ACTION_NOACTION; n->updateAction->cols = NIL; n->deleteAction = $1; @@ -4506,7 +4627,7 @@ key_actions: } | key_update key_delete { - KeyActions *n = palloc(sizeof(KeyActions)); + KeyActions *n = palloc_object(KeyActions); n->updateAction = $1; n->deleteAction = $2; @@ -4514,7 +4635,7 @@ key_actions: } | key_delete key_update { - KeyActions *n = palloc(sizeof(KeyActions)); + KeyActions *n = palloc_object(KeyActions); n->updateAction = $2; n->deleteAction = $1; @@ -4522,12 +4643,12 @@ key_actions: } | /*EMPTY*/ { - KeyActions *n = palloc(sizeof(KeyActions)); + KeyActions *n = palloc_object(KeyActions); - n->updateAction = palloc(sizeof(KeyAction)); + n->updateAction = palloc_object(KeyAction); n->updateAction->action = FKCONSTR_ACTION_NOACTION; n->updateAction->cols = NIL; - n->deleteAction = palloc(sizeof(KeyAction)); + n->deleteAction = palloc_object(KeyAction); n->deleteAction->action = FKCONSTR_ACTION_NOACTION; n->deleteAction->cols = NIL; $$ = n; @@ -4555,7 +4676,7 @@ key_delete: ON DELETE_P key_action key_action: NO ACTION { - KeyAction *n = palloc(sizeof(KeyAction)); + KeyAction *n = palloc_object(KeyAction); n->action = FKCONSTR_ACTION_NOACTION; n->cols = NIL; @@ -4563,7 +4684,7 @@ key_action: } | RESTRICT { - KeyAction *n = palloc(sizeof(KeyAction)); + KeyAction *n = palloc_object(KeyAction); n->action = FKCONSTR_ACTION_RESTRICT; n->cols = NIL; @@ -4571,7 +4692,7 @@ key_action: } | CASCADE { - KeyAction *n = palloc(sizeof(KeyAction)); + KeyAction *n = palloc_object(KeyAction); n->action = FKCONSTR_ACTION_CASCADE; n->cols = NIL; @@ -4579,7 +4700,7 @@ key_action: } | SET NULL_P opt_column_list { - KeyAction *n = palloc(sizeof(KeyAction)); + KeyAction *n = palloc_object(KeyAction); n->action = FKCONSTR_ACTION_SETNULL; n->cols = $3; @@ -4587,7 +4708,7 @@ key_action: } | SET DEFAULT opt_column_list { - KeyAction *n = palloc(sizeof(KeyAction)); + KeyAction *n = palloc_object(KeyAction); n->action = FKCONSTR_ACTION_SETDEFAULT; n->cols = $3; @@ -5784,7 +5905,7 @@ import_qualification_type: import_qualification: import_qualification_type '(' relation_expr_list ')' { - ImportQual *n = (ImportQual *) palloc(sizeof(ImportQual)); 
+ ImportQual *n = palloc_object(ImportQual); n->type = $1; n->table_names = $3; @@ -5792,7 +5913,7 @@ import_qualification: } | /*EMPTY*/ { - ImportQual *n = (ImportQual *) palloc(sizeof(ImportQual)); + ImportQual *n = palloc_object(ImportQual); n->type = FDW_IMPORT_SCHEMA_ALL; n->table_names = NIL; $$ = n; @@ -6037,6 +6158,26 @@ CreateTrigStmt: EXECUTE FUNCTION_or_PROCEDURE func_name '(' TriggerFuncArgs ')' { CreateTrigStmt *n = makeNode(CreateTrigStmt); + bool dummy; + + if (($11 & CAS_NOT_VALID) != 0) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraint triggers cannot be marked %s", + "NOT VALID"), + parser_errposition(@11)); + if (($11 & CAS_NO_INHERIT) != 0) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraint triggers cannot be marked %s", + "NO INHERIT"), + parser_errposition(@11)); + if (($11 & CAS_NOT_ENFORCED) != 0) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraint triggers cannot be marked %s", + "NOT ENFORCED"), + parser_errposition(@11)); n->replace = $2; if (n->replace) /* not supported, see CreateTrigger */ @@ -6056,7 +6197,7 @@ CreateTrigStmt: n->whenClause = $15; n->transitionRels = NIL; processCASbits($11, @11, "TRIGGER", - &n->deferrable, &n->initdeferred, NULL, + &n->deferrable, &n->initdeferred, &dummy, NULL, NULL, yyscanner); n->constrrel = $10; $$ = (Node *) n; @@ -7479,6 +7620,8 @@ fetch_args: cursor_name n->portalname = $1; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_NONE; $$ = (Node *) n; } | from_in cursor_name @@ -7488,6 +7631,19 @@ fetch_args: cursor_name n->portalname = $2; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_NONE; + $$ = (Node *) n; + } + | SignedIconst opt_from_in cursor_name + { + FetchStmt *n = makeNode(FetchStmt); + + n->portalname = $3; + n->direction = FETCH_FORWARD; + n->howMany = $1; + n->location = @1; + n->direction_keyword = FETCH_KEYWORD_NONE; $$ = (Node *) n; } | NEXT opt_from_in cursor_name @@ -7497,6 +7653,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_NEXT; $$ = (Node *) n; } | PRIOR opt_from_in cursor_name @@ -7506,6 +7664,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_BACKWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_PRIOR; $$ = (Node *) n; } | FIRST_P opt_from_in cursor_name @@ -7515,6 +7675,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_ABSOLUTE; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_FIRST; $$ = (Node *) n; } | LAST_P opt_from_in cursor_name @@ -7524,6 +7686,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_ABSOLUTE; n->howMany = -1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_LAST; $$ = (Node *) n; } | ABSOLUTE_P SignedIconst opt_from_in cursor_name @@ -7533,6 +7697,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_ABSOLUTE; n->howMany = $2; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_ABSOLUTE; $$ = (Node *) n; } | RELATIVE_P SignedIconst opt_from_in cursor_name @@ -7542,15 +7708,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_RELATIVE; n->howMany = $2; - $$ = (Node *) n; - } - | SignedIconst opt_from_in cursor_name - { - FetchStmt *n = makeNode(FetchStmt); - - n->portalname = $3; - n->direction = FETCH_FORWARD; - 
n->howMany = $1; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_RELATIVE; $$ = (Node *) n; } | ALL opt_from_in cursor_name @@ -7560,6 +7719,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_FORWARD; n->howMany = FETCH_ALL; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_ALL; $$ = (Node *) n; } | FORWARD opt_from_in cursor_name @@ -7569,6 +7730,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_FORWARD; $$ = (Node *) n; } | FORWARD SignedIconst opt_from_in cursor_name @@ -7578,6 +7741,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_FORWARD; n->howMany = $2; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_FORWARD; $$ = (Node *) n; } | FORWARD ALL opt_from_in cursor_name @@ -7587,6 +7752,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_FORWARD; n->howMany = FETCH_ALL; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_FORWARD_ALL; $$ = (Node *) n; } | BACKWARD opt_from_in cursor_name @@ -7596,6 +7763,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_BACKWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_BACKWARD; $$ = (Node *) n; } | BACKWARD SignedIconst opt_from_in cursor_name @@ -7605,6 +7774,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_BACKWARD; n->howMany = $2; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_BACKWARD; $$ = (Node *) n; } | BACKWARD ALL opt_from_in cursor_name @@ -7614,6 +7785,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_BACKWARD; n->howMany = FETCH_ALL; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_BACKWARD_ALL; $$ = (Node *) n; } ; @@ -7793,7 +7966,7 @@ parameter_name: privilege_target: qualified_name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_TABLE; @@ -7802,7 +7975,7 @@ privilege_target: } | TABLE qualified_name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_TABLE; @@ -7811,7 +7984,7 @@ privilege_target: } | SEQUENCE qualified_name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_SEQUENCE; @@ -7820,7 +7993,7 @@ privilege_target: } | FOREIGN DATA_P WRAPPER name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_FDW; @@ -7829,7 +8002,7 @@ privilege_target: } | FOREIGN SERVER name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_FOREIGN_SERVER; @@ -7838,7 +8011,7 @@ privilege_target: } | FUNCTION function_with_argtypes_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_FUNCTION; @@ -7847,7 +8020,7 @@ privilege_target: } | PROCEDURE function_with_argtypes_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_PROCEDURE; @@ -7856,7 +8029,7 @@ 
privilege_target: } | ROUTINE function_with_argtypes_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_ROUTINE; @@ -7865,7 +8038,7 @@ privilege_target: } | DATABASE name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_DATABASE; @@ -7874,7 +8047,7 @@ privilege_target: } | DOMAIN_P any_name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_DOMAIN; @@ -7883,7 +8056,7 @@ privilege_target: } | LANGUAGE name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_LANGUAGE; @@ -7892,7 +8065,7 @@ privilege_target: } | LARGE_P OBJECT_P NumericOnly_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_LARGEOBJECT; @@ -7901,7 +8074,7 @@ privilege_target: } | PARAMETER parameter_name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_PARAMETER_ACL; n->objs = $2; @@ -7909,7 +8082,7 @@ privilege_target: } | SCHEMA name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_SCHEMA; @@ -7918,7 +8091,7 @@ privilege_target: } | TABLESPACE name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_TABLESPACE; @@ -7927,7 +8100,7 @@ privilege_target: } | TYPE_P any_name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_OBJECT; n->objtype = OBJECT_TYPE; @@ -7936,7 +8109,7 @@ privilege_target: } | ALL TABLES IN_P SCHEMA name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_ALL_IN_SCHEMA; n->objtype = OBJECT_TABLE; @@ -7945,7 +8118,7 @@ privilege_target: } | ALL SEQUENCES IN_P SCHEMA name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_ALL_IN_SCHEMA; n->objtype = OBJECT_SEQUENCE; @@ -7954,7 +8127,7 @@ privilege_target: } | ALL FUNCTIONS IN_P SCHEMA name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_ALL_IN_SCHEMA; n->objtype = OBJECT_FUNCTION; @@ -7963,7 +8136,7 @@ privilege_target: } | ALL PROCEDURES IN_P SCHEMA name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_ALL_IN_SCHEMA; n->objtype = OBJECT_PROCEDURE; @@ -7972,7 +8145,7 @@ privilege_target: } | ALL ROUTINES IN_P SCHEMA name_list { - PrivTarget *n = (PrivTarget *) palloc(sizeof(PrivTarget)); + PrivTarget *n = palloc_object(PrivTarget); n->targtype = ACL_TARGET_ALL_IN_SCHEMA; n->objtype = OBJECT_ROUTINE; @@ -9291,7 +9464,7 @@ DropTransformStmt: DROP TRANSFORM opt_if_exists FOR Typename LANGUAGE name opt_d 
@@ -9291,7 +9464,7 @@ DropTransformStmt: DROP TRANSFORM opt_if_exists FOR Typename LANGUAGE name opt_d
 *****************************************************************************/
 
 ReindexStmt:
-			REINDEX opt_reindex_option_list reindex_target_relation opt_concurrently qualified_name
+			REINDEX opt_utility_option_list reindex_target_relation opt_concurrently qualified_name
 				{
 					ReindexStmt *n = makeNode(ReindexStmt);
 
@@ -9304,7 +9477,7 @@ ReindexStmt:
 						n->params = lappend(n->params,
 											makeDefElem("concurrently", NULL, @4));
 					$$ = (Node *) n;
 				}
-			| REINDEX opt_reindex_option_list SCHEMA opt_concurrently name
+			| REINDEX opt_utility_option_list SCHEMA opt_concurrently name
 				{
 					ReindexStmt *n = makeNode(ReindexStmt);
 
@@ -9317,7 +9490,7 @@ ReindexStmt:
 						n->params = lappend(n->params,
 											makeDefElem("concurrently", NULL, @4));
 					$$ = (Node *) n;
 				}
-			| REINDEX opt_reindex_option_list reindex_target_all opt_concurrently opt_single_name
+			| REINDEX opt_utility_option_list reindex_target_all opt_concurrently opt_single_name
 				{
 					ReindexStmt *n = makeNode(ReindexStmt);
 
@@ -9339,10 +9512,6 @@ reindex_target_all:
 			SYSTEM_P				{ $$ = REINDEX_OBJECT_SYSTEM; }
 			| DATABASE				{ $$ = REINDEX_OBJECT_DATABASE; }
 		;
-opt_reindex_option_list:
-			'(' utility_option_list ')'				{ $$ = $2; }
-			| /* EMPTY */							{ $$ = NULL; }
-		;
 
 /*****************************************************************************
 *
@@ -10614,7 +10783,12 @@ AlterOwnerStmt: ALTER AGGREGATE aggregate_with_argtypes OWNER TO RoleSpec
 *
 * CREATE PUBLICATION name [WITH options]
 *
- * CREATE PUBLICATION FOR ALL TABLES [WITH options]
+ * CREATE PUBLICATION FOR ALL pub_all_obj_type [, ...] [WITH options]
+ *
+ * pub_all_obj_type is one of:
+ *
+ *		TABLES
+ *		SEQUENCES
 *
 * CREATE PUBLICATION FOR pub_obj [, ...] [WITH options]
 *
@@ -10634,13 +10808,15 @@ CreatePublicationStmt:
 					n->options = $4;
 					$$ = (Node *) n;
 				}
-			| CREATE PUBLICATION name FOR ALL TABLES opt_definition
+			| CREATE PUBLICATION name FOR pub_all_obj_type_list opt_definition
 				{
 					CreatePublicationStmt *n = makeNode(CreatePublicationStmt);
 
 					n->pubname = $3;
-					n->options = $7;
-					n->for_all_tables = true;
+					preprocess_pub_all_objtype_list($5, &n->for_all_tables,
+													&n->for_all_sequences,
+													yyscanner);
+					n->options = $6;
 					$$ = (Node *) n;
 				}
 			| CREATE PUBLICATION name FOR pub_obj_list opt_definition
@@ -10752,6 +10928,28 @@ pub_obj_list:	PublicationObjSpec
 					{ $$ = lappend($1, $3); }
 		;
 
+PublicationAllObjSpec:
+			ALL TABLES
+				{
+					$$ = makeNode(PublicationAllObjSpec);
+					$$->pubobjtype = PUBLICATION_ALL_TABLES;
+					$$->location = @1;
+				}
+			| ALL SEQUENCES
+				{
+					$$ = makeNode(PublicationAllObjSpec);
+					$$->pubobjtype = PUBLICATION_ALL_SEQUENCES;
+					$$->location = @1;
+				}
+		;
+
+pub_all_obj_type_list:	PublicationAllObjSpec
+					{ $$ = list_make1($1); }
+			| pub_all_obj_type_list ',' PublicationAllObjSpec
+					{ $$ = lappend($1, $3); }
+		;
+
+
 /*****************************************************************************
 *
 * ALTER PUBLICATION name SET ( options )
@@ -10861,11 +11059,20 @@ AlterSubscriptionStmt:
 					AlterSubscriptionStmt *n =
 						makeNode(AlterSubscriptionStmt);
 
-					n->kind = ALTER_SUBSCRIPTION_REFRESH;
+					n->kind = ALTER_SUBSCRIPTION_REFRESH_PUBLICATION;
 					n->subname = $3;
 					n->options = $6;
 					$$ = (Node *) n;
 				}
+			| ALTER SUBSCRIPTION name REFRESH SEQUENCES
+				{
+					AlterSubscriptionStmt *n =
+						makeNode(AlterSubscriptionStmt);
+
+					n->kind = ALTER_SUBSCRIPTION_REFRESH_SEQUENCES;
+					n->subname = $3;
+					$$ = (Node *) n;
+				}
 			| ALTER SUBSCRIPTION name ADD_P PUBLICATION name_list opt_definition
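The new publication grammar builds PublicationAllObjSpec nodes whose definition is not shown in this excerpt. A sketch reconstructed purely from the actions above (member layout, enum tag, and field order are assumptions; the real definitions belong in parsenodes.h in this patch series):

/* Sketch only -- inferred from the grammar actions setting pubobjtype
 * and location above. */
typedef enum PublicationAllObjType
{
    PUBLICATION_ALL_TABLES,     /* FOR ALL TABLES */
    PUBLICATION_ALL_SEQUENCES,  /* FOR ALL SEQUENCES */
} PublicationAllObjType;

typedef struct PublicationAllObjSpec
{
    NodeTag     type;
    PublicationAllObjType pubobjtype;   /* which ALL-object clause */
    int         location;               /* token location, or -1 */
} PublicationAllObjSpec;

preprocess_pub_all_objtype_list() (added later in this diff) folds a list of these into the two booleans on CreatePublicationStmt.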
@@ -11629,7 +11836,7 @@ AlterDomainStmt:
 				{
 					AlterDomainStmt *n = makeNode(AlterDomainStmt);
 
-					n->subtype = 'T';
+					n->subtype = AD_AlterDefault;
 					n->typeName = $3;
 					n->def = $4;
 					$$ = (Node *) n;
@@ -11639,7 +11846,7 @@ AlterDomainStmt:
 				{
 					AlterDomainStmt *n = makeNode(AlterDomainStmt);
 
-					n->subtype = 'N';
+					n->subtype = AD_DropNotNull;
 					n->typeName = $3;
 					$$ = (Node *) n;
 				}
@@ -11648,7 +11855,7 @@ AlterDomainStmt:
 				{
 					AlterDomainStmt *n = makeNode(AlterDomainStmt);
 
-					n->subtype = 'O';
+					n->subtype = AD_SetNotNull;
 					n->typeName = $3;
 					$$ = (Node *) n;
 				}
@@ -11657,7 +11864,7 @@ AlterDomainStmt:
 				{
 					AlterDomainStmt *n = makeNode(AlterDomainStmt);
 
-					n->subtype = 'C';
+					n->subtype = AD_AddConstraint;
 					n->typeName = $3;
 					n->def = $5;
 					$$ = (Node *) n;
@@ -11667,7 +11874,7 @@ AlterDomainStmt:
 				{
 					AlterDomainStmt *n = makeNode(AlterDomainStmt);
 
-					n->subtype = 'X';
+					n->subtype = AD_DropConstraint;
 					n->typeName = $3;
 					n->name = $6;
 					n->behavior = $7;
@@ -11679,7 +11886,7 @@ AlterDomainStmt:
 				{
 					AlterDomainStmt *n = makeNode(AlterDomainStmt);
 
-					n->subtype = 'X';
+					n->subtype = AD_DropConstraint;
 					n->typeName = $3;
 					n->name = $8;
 					n->behavior = $9;
@@ -11691,7 +11898,7 @@ AlterDomainStmt:
 				{
 					AlterDomainStmt *n = makeNode(AlterDomainStmt);
 
-					n->subtype = 'V';
+					n->subtype = AD_ValidateConstraint;
 					n->typeName = $3;
 					n->name = $6;
 					$$ = (Node *) n;
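The bare subtype characters become a named enum. A sketch of what that enum plausibly looks like; the tag name is an assumption, and keeping the historical character codes as the enum values is a guess made so that existing switch statements and stored values stay compatible -- check parsenodes.h in the actual patch:

/* Sketch only -- member names taken from the hunks above, values assumed. */
typedef enum AlterDomainType
{
    AD_AlterDefault = 'T',          /* SET|DROP DEFAULT */
    AD_DropNotNull = 'N',           /* DROP NOT NULL */
    AD_SetNotNull = 'O',            /* SET NOT NULL */
    AD_AddConstraint = 'C',         /* ADD CONSTRAINT */
    AD_DropConstraint = 'X',        /* DROP CONSTRAINT */
    AD_ValidateConstraint = 'V',    /* VALIDATE CONSTRAINT */
} AlterDomainType;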
@@ -11840,13 +12047,13 @@ ClusterStmt:
 					n->params = $3;
 					$$ = (Node *) n;
 				}
-			| CLUSTER '(' utility_option_list ')'
+			| CLUSTER opt_utility_option_list
 				{
 					ClusterStmt *n = makeNode(ClusterStmt);
 
 					n->relation = NULL;
 					n->indexname = NULL;
-					n->params = $3;
+					n->params = $2;
 					$$ = (Node *) n;
 				}
 			/* unparenthesized VERBOSE kept for pre-14 compatibility */
@@ -11856,21 +12063,18 @@ ClusterStmt:
 
 					n->relation = $3;
 					n->indexname = $4;
-					n->params = NIL;
 					if ($2)
-						n->params = lappend(n->params, makeDefElem("verbose", NULL, @2));
+						n->params = list_make1(makeDefElem("verbose", NULL, @2));
 					$$ = (Node *) n;
 				}
 			/* unparenthesized VERBOSE kept for pre-17 compatibility */
-			| CLUSTER opt_verbose
+			| CLUSTER VERBOSE
 				{
 					ClusterStmt *n = makeNode(ClusterStmt);
 
 					n->relation = NULL;
 					n->indexname = NULL;
-					n->params = NIL;
-					if ($2)
-						n->params = lappend(n->params, makeDefElem("verbose", NULL, @2));
+					n->params = list_make1(makeDefElem("verbose", NULL, @2));
 					$$ = (Node *) n;
 				}
 			/* kept for pre-8.3 compatibility */
@@ -11880,9 +12084,8 @@ ClusterStmt:
 
 					n->relation = $5;
 					n->indexname = $3;
-					n->params = NIL;
 					if ($2)
-						n->params = lappend(n->params, makeDefElem("verbose", NULL, @2));
+						n->params = list_make1(makeDefElem("verbose", NULL, @2));
 					$$ = (Node *) n;
 				}
 		;
@@ -11933,64 +12136,31 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose opt_analyze opt_vacuum_relati
 				}
 		;
 
-AnalyzeStmt: analyze_keyword opt_verbose opt_vacuum_relation_list
+AnalyzeStmt: analyze_keyword opt_utility_option_list opt_vacuum_relation_list
 				{
 					VacuumStmt *n = makeNode(VacuumStmt);
 
-					n->options = NIL;
-					if ($2)
-						n->options = lappend(n->options,
-											 makeDefElem("verbose", NULL, @2));
+					n->options = $2;
 					n->rels = $3;
 					n->is_vacuumcmd = false;
 					$$ = (Node *) n;
 				}
-			| analyze_keyword '(' utility_option_list ')' opt_vacuum_relation_list
+			| analyze_keyword VERBOSE opt_vacuum_relation_list
 				{
 					VacuumStmt *n = makeNode(VacuumStmt);
 
-					n->options = $3;
-					n->rels = $5;
+					n->options = list_make1(makeDefElem("verbose", NULL, @2));
+					n->rels = $3;
 					n->is_vacuumcmd = false;
 					$$ = (Node *) n;
 				}
 		;
 
-utility_option_list:
-			utility_option_elem
-				{
-					$$ = list_make1($1);
-				}
-			| utility_option_list ',' utility_option_elem
-				{
-					$$ = lappend($1, $3);
-				}
-		;
-
 analyze_keyword:
 			ANALYZE
 			| ANALYSE /* British */
 		;
 
-utility_option_elem:
-			utility_option_name utility_option_arg
-				{
-					$$ = makeDefElem($1, $2, @1);
-				}
-		;
-
-utility_option_name:
-			NonReservedWord							{ $$ = $1; }
-			| analyze_keyword						{ $$ = "analyze"; }
-			| FORMAT_LA								{ $$ = "format"; }
-		;
-
-utility_option_arg:
-			opt_boolean_or_string					{ $$ = (Node *) makeString($1); }
-			| NumericOnly							{ $$ = (Node *) $1; }
-			| /* EMPTY */							{ $$ = NULL; }
-		;
-
 opt_analyze:
 			analyze_keyword							{ $$ = true; }
 			| /*EMPTY*/								{ $$ = false; }
@@ -12240,7 +12410,6 @@ InsertStmt:
 			$5->onConflictClause = $6;
 			$5->returningClause = $7;
 			$5->withClause = $1;
-			$5->stmt_location = @$;
 			$$ = (Node *) $5;
 		}
 	;
@@ -12431,7 +12600,6 @@ DeleteStmt: opt_with_clause DELETE_P FROM relation_expr_opt_alias
 					n->whereClause = $6;
 					n->returningClause = $7;
 					n->withClause = $1;
-					n->stmt_location = @$;
 					$$ = (Node *) n;
 				}
 		;
@@ -12506,7 +12674,6 @@ UpdateStmt: opt_with_clause UPDATE relation_expr_opt_alias
 					n->whereClause = $7;
 					n->returningClause = $8;
 					n->withClause = $1;
-					n->stmt_location = @$;
 					$$ = (Node *) n;
 				}
 		;
@@ -12584,7 +12751,6 @@ MergeStmt:
 					m->joinCondition = $8;
 					m->mergeWhenClauses = $9;
 					m->returningClause = $10;
-					m->stmt_location = @$;
 
 					$$ = (Node *) m;
 				}
@@ -12825,20 +12991,7 @@ SelectStmt: select_no_parens			%prec UMINUS
 		;
 
 select_with_parens:
-			'(' select_no_parens ')'
-				{
-					SelectStmt *n = (SelectStmt *) $2;
-
-					/*
-					 * As SelectStmt's location starts at the SELECT keyword,
-					 * we need to track the length of the SelectStmt within
-					 * parentheses to be able to extract the relevant part
-					 * of the query.  Without this, the RawStmt's length would
-					 * be used and would include the closing parenthesis.
-					 */
-					n->stmt_len = @3 - @2;
-					$$ = $2;
-				}
+			'(' select_no_parens ')'		{ $$ = $2; }
 			| '(' select_with_parens ')'	{ $$ = $2; }
 		;
@@ -12958,9 +13111,9 @@ simple_select:
 					n->whereClause = $6;
 					n->groupClause = ($7)->list;
 					n->groupDistinct = ($7)->distinct;
+					n->groupByAll = ($7)->all;
 					n->havingClause = $8;
 					n->windowClause = $9;
-					n->stmt_location = @1;
 					$$ = (Node *) n;
 				}
 			| SELECT distinct_clause target_list
@@ -12976,9 +13129,9 @@ simple_select:
 					n->whereClause = $6;
 					n->groupClause = ($7)->list;
 					n->groupDistinct = ($7)->distinct;
+					n->groupByAll = ($7)->all;
 					n->havingClause = $8;
 					n->windowClause = $9;
-					n->stmt_location = @1;
 					$$ = (Node *) n;
 				}
 			| values_clause							{ $$ = $1; }
@@ -12999,20 +13152,19 @@ simple_select:
 					n->targetList = list_make1(rt);
 					n->fromClause = list_make1($2);
-					n->stmt_location = @1;
 					$$ = (Node *) n;
 				}
 			| select_clause UNION set_quantifier select_clause
 				{
-					$$ = makeSetOp(SETOP_UNION, $3 == SET_QUANTIFIER_ALL, $1, $4, @1);
+					$$ = makeSetOp(SETOP_UNION, $3 == SET_QUANTIFIER_ALL, $1, $4);
 				}
 			| select_clause INTERSECT set_quantifier select_clause
 				{
-					$$ = makeSetOp(SETOP_INTERSECT, $3 == SET_QUANTIFIER_ALL, $1, $4, @1);
+					$$ = makeSetOp(SETOP_INTERSECT, $3 == SET_QUANTIFIER_ALL, $1, $4);
 				}
 			| select_clause EXCEPT set_quantifier select_clause
 				{
-					$$ = makeSetOp(SETOP_EXCEPT, $3 == SET_QUANTIFIER_ALL, $1, $4, @1);
+					$$ = makeSetOp(SETOP_EXCEPT, $3 == SET_QUANTIFIER_ALL, $1, $4);
 				}
 		;
@@ -13293,7 +13445,7 @@ select_limit:
 				}
 			| offset_clause
 				{
-					SelectLimit *n = (SelectLimit *) palloc(sizeof(SelectLimit));
+					SelectLimit *n = palloc_object(SelectLimit);
 
 					n->limitOffset = $1;
 					n->limitCount = NULL;
@@ -13313,7 +13465,7 @@ opt_select_limit:
 limit_clause:
 			LIMIT select_limit_value
 				{
-					SelectLimit *n = (SelectLimit *) palloc(sizeof(SelectLimit));
+					SelectLimit *n = palloc_object(SelectLimit);
 
 					n->limitOffset = NULL;
 					n->limitCount = $2;
@@ -13341,7 +13493,7 @@ limit_clause:
 			 */
 			| FETCH first_or_next select_fetch_first_value row_or_rows ONLY
 				{
-					SelectLimit *n = (SelectLimit *) palloc(sizeof(SelectLimit));
+					SelectLimit *n = palloc_object(SelectLimit);
 
 					n->limitOffset = NULL;
 					n->limitCount = $3;
@@ -13353,7 +13505,7 @@ limit_clause:
 				}
 			| FETCH first_or_next select_fetch_first_value row_or_rows WITH TIES
 				{
-					SelectLimit *n = (SelectLimit *) palloc(sizeof(SelectLimit));
+					SelectLimit *n = palloc_object(SelectLimit);
 
 					n->limitOffset = NULL;
 					n->limitCount = $3;
@@ -13365,7 +13517,7 @@ limit_clause:
 				}
 			| FETCH first_or_next row_or_rows ONLY
 				{
-					SelectLimit *n = (SelectLimit *) palloc(sizeof(SelectLimit));
+					SelectLimit *n = palloc_object(SelectLimit);
 
 					n->limitOffset = NULL;
 					n->limitCount = makeIntConst(1, -1);
@@ -13377,7 +13529,7 @@ limit_clause:
 				}
 			| FETCH first_or_next row_or_rows WITH TIES
 				{
-					SelectLimit *n = (SelectLimit *) palloc(sizeof(SelectLimit));
+					SelectLimit *n = palloc_object(SelectLimit);
 
 					n->limitOffset = NULL;
 					n->limitCount = makeIntConst(1, -1);
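The group_clause production that follows hands its result around in a small struct private to gram.y. A sketch of that struct with the new 'all' flag; the existing members are inferred from the actions reading ($7)->list and ($7)->distinct, so treat the exact layout as an assumption:

/* Sketch only -- gram.y-local helper struct, reconstructed from usage. */
typedef struct GroupClause
{
    bool        distinct;   /* GROUP BY DISTINCT */
    bool        all;        /* GROUP BY ALL (list will be NIL) */
    List       *list;       /* explicit grouping expressions */
} GroupClause;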
@@ -13472,17 +13624,27 @@ first_or_next: FIRST_P								{ $$ = 0; }
 group_clause:
 			GROUP_P BY set_quantifier group_by_list
 				{
-					GroupClause *n = (GroupClause *) palloc(sizeof(GroupClause));
+					GroupClause *n = palloc_object(GroupClause);
 
 					n->distinct = $3 == SET_QUANTIFIER_DISTINCT;
+					n->all = false;
 					n->list = $4;
 					$$ = n;
 				}
+			| GROUP_P BY ALL
+				{
+					GroupClause *n = palloc_object(GroupClause);
+
+					n->distinct = false;
+					n->all = true;
+					n->list = NIL;
+					$$ = n;
+				}
 			| /*EMPTY*/
 				{
-					GroupClause *n = (GroupClause *) palloc(sizeof(GroupClause));
+					GroupClause *n = palloc_object(GroupClause);
 
 					n->distinct = false;
+					n->all = false;
 					n->list = NIL;
 					$$ = n;
 				}
@@ -13590,7 +13752,6 @@ values_clause:
 				{
 					SelectStmt *n = makeNode(SelectStmt);
 
-					n->stmt_location = @1;
 					n->valuesLists = list_make1($3);
 					$$ = (Node *) n;
 				}
@@ -15287,49 +15448,50 @@ a_expr:		c_expr									{ $$ = $1; }
 												 (Node *) list_make2($5, $7),
 												 @2);
 				}
-			| a_expr IN_P in_expr
+			| a_expr IN_P select_with_parens
 				{
-					/* in_expr returns a SubLink or a list of a_exprs */
-					if (IsA($3, SubLink))
-					{
-						/* generate foo = ANY (subquery) */
-						SubLink    *n = (SubLink *) $3;
+					/* generate foo = ANY (subquery) */
+					SubLink    *n = makeNode(SubLink);
-						n->subLinkType = ANY_SUBLINK;
-						n->subLinkId = 0;
-						n->testexpr = $1;
-						n->operName = NIL;	/* show it's IN not = ANY */
-						n->location = @2;
-						$$ = (Node *) n;
-					}
-					else
-					{
-						/* generate scalar IN expression */
-						$$ = (Node *) makeSimpleA_Expr(AEXPR_IN, "=", $1, $3, @2);
-					}
+					n->subselect = $3;
+					n->subLinkType = ANY_SUBLINK;
+					n->subLinkId = 0;
+					n->testexpr = $1;
+					n->operName = NIL;	/* show it's IN not = ANY */
+					n->location = @2;
+					$$ = (Node *) n;
 				}
-			| a_expr NOT_LA IN_P in_expr						%prec NOT_LA
+			| a_expr IN_P '(' expr_list ')'
 				{
-					/* in_expr returns a SubLink or a list of a_exprs */
-					if (IsA($4, SubLink))
-					{
-						/* generate NOT (foo = ANY (subquery)) */
-						/* Make an = ANY node */
-						SubLink    *n = (SubLink *) $4;
-
-						n->subLinkType = ANY_SUBLINK;
-						n->subLinkId = 0;
-						n->testexpr = $1;
-						n->operName = NIL;	/* show it's IN not = ANY */
-						n->location = @2;
-						/* Stick a NOT on top; must have same parse location */
-						$$ = makeNotExpr((Node *) n, @2);
-					}
-					else
-					{
-						/* generate scalar NOT IN expression */
-						$$ = (Node *) makeSimpleA_Expr(AEXPR_IN, "<>", $1, $4, @2);
-					}
+					/* generate scalar IN expression */
+					A_Expr	   *n = makeSimpleA_Expr(AEXPR_IN, "=", $1, (Node *) $4, @2);
+
+					n->rexpr_list_start = @3;
+					n->rexpr_list_end = @5;
+					$$ = (Node *) n;
+				}
+			| a_expr NOT_LA IN_P select_with_parens			%prec NOT_LA
+				{
+					/* generate NOT (foo = ANY (subquery)) */
+					SubLink    *n = makeNode(SubLink);
+
+					n->subselect = $4;
+					n->subLinkType = ANY_SUBLINK;
+					n->subLinkId = 0;
+					n->testexpr = $1;
+					n->operName = NIL;	/* show it's IN not = ANY */
+					n->location = @2;
+					/* Stick a NOT on top; must have same parse location */
+					$$ = makeNotExpr((Node *) n, @2);
+				}
+			| a_expr NOT_LA IN_P '(' expr_list ')'
+				{
+					/* generate scalar NOT IN expression */
+					A_Expr	   *n = makeSimpleA_Expr(AEXPR_IN, "<>", $1, (Node *) $5, @2);
+
+					n->rexpr_list_start = @4;
+					n->rexpr_list_end = @6;
+					$$ = (Node *) n;
 				}
 			| a_expr subquery_Op sub_type select_with_parens	%prec Op
 				{
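Splitting in_expr into separate subquery and expression-list productions lets the scalar IN cases record the extent of the parenthesized list. The two new A_Expr fields are not shown in this excerpt; a sketch of what they plausibly are (field names from the actions above, the type and surrounding members are assumptions against parsenodes.h):

/* Sketch only -- the rest of A_Expr (kind, name, lexpr, rexpr, location)
 * is elided; the two new members record the '(' and ')' token locations
 * so later code (e.g. transformAExprIn above in parse_expr.c) can carry
 * them over to the ArrayExpr it builds. */
typedef struct A_Expr_new_fields_sketch
{
    int         rexpr_list_start;   /* location of the '(' of an IN list */
    int         rexpr_list_end;     /* location of the matching ')' */
} A_Expr_new_fields_sketch;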
@@ -15760,7 +15922,7 @@ func_application: func_name '(' ')'
 * (Note that many of the special SQL functions wouldn't actually make any
 * sense as functional index entries, but we ignore that consideration here.)
 */
-func_expr: func_application within_group_clause filter_clause over_clause
+func_expr: func_application within_group_clause filter_clause null_treatment over_clause
 				{
 					FuncCall   *n = (FuncCall *) $1;
 
@@ -15793,7 +15955,8 @@ func_expr: func_application within_group_clause filter_clause over_clause
 						n->agg_within_group = true;
 					}
 					n->agg_filter = $3;
-					n->over = $4;
+					n->ignore_nulls = $4;
+					n->over = $5;
 					$$ = (Node *) n;
 				}
 			| json_aggregate_func filter_clause over_clause
@@ -16371,6 +16534,26 @@ xml_passing_mech:
 			| BY VALUE_P
 		;
 
+/*****************************************************************************
+ *
+ *	WAIT FOR LSN
+ *
+ *****************************************************************************/
+
+WaitStmt:
+			WAIT FOR LSN_P Sconst opt_wait_with_clause
+				{
+					WaitStmt   *n = makeNode(WaitStmt);
+					n->lsn_literal = $4;
+					n->options = $5;
+					$$ = (Node *) n;
+				}
+		;
+
+opt_wait_with_clause:
+			WITH '(' utility_option_list ')'	{ $$ = $3; }
+			| /*EMPTY*/							{ $$ = NIL; }
+		;
 
 /*
 * Aggregate decoration clauses
@@ -16389,6 +16572,12 @@ filter_clause:
 /*
 * Window Definitions
 */
+null_treatment:
+			IGNORE_P NULLS_P						{ $$ = PARSER_IGNORE_NULLS; }
+			| RESPECT_P NULLS_P						{ $$ = PARSER_RESPECT_NULLS; }
+			| /*EMPTY*/								{ $$ = NO_NULLTREATMENT; }
+		;
+
 window_clause:
 			WINDOW window_definition_list			{ $$ = $2; }
 			| /*EMPTY*/								{ $$ = NIL; }
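Sketches of the two parse nodes implied by the new productions above. Both are assumptions reconstructed from the grammar actions alone; the authoritative definitions would live in parsenodes.h in this patch series:

/* Sketch only -- tag names and layout assumed. */
typedef enum NullTreatment
{
    NO_NULLTREATMENT = 0,   /* no RESPECT/IGNORE NULLS clause given */
    PARSER_IGNORE_NULLS,    /* IGNORE NULLS */
    PARSER_RESPECT_NULLS,   /* RESPECT NULLS */
} NullTreatment;

typedef struct WaitStmt
{
    NodeTag     type;
    char       *lsn_literal;    /* target LSN, as the Sconst string */
    List       *options;        /* DefElem list from WITH ( ... ) */
} WaitStmt;

Note that WAIT FOR LSN reuses the generic utility_option_list for its WITH clause, consistent with the opt_utility_option_list consolidation earlier in this patch.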
@@ -16764,15 +16953,15 @@ type_list:	Typename							{ $$ = list_make1($1); }
 array_expr: '[' expr_list ']'
 				{
-					$$ = makeAArrayExpr($2, @1);
+					$$ = makeAArrayExpr($2, @1, @3);
 				}
 			| '[' array_expr_list ']'
 				{
-					$$ = makeAArrayExpr($2, @1);
+					$$ = makeAArrayExpr($2, @1, @3);
 				}
 			| '[' ']'
 				{
-					$$ = makeAArrayExpr(NIL, @1);
+					$$ = makeAArrayExpr(NIL, @1, @2);
 				}
 		;
@@ -16894,17 +17083,6 @@ trim_list:	a_expr FROM expr_list
 					{ $$ = lappend($3, $1); }
 			| expr_list								{ $$ = $1; }
 		;
 
-in_expr:	select_with_parens
-				{
-					SubLink    *n = makeNode(SubLink);
-
-					n->subselect = $1;
-					/* other fields will be filled later */
-					$$ = (Node *) n;
-				}
-			| '(' expr_list ')'						{ $$ = (Node *) $2; }
-		;
-
 /*
 * Define SQL-style CASE clause.
 * - Full specification
@@ -17597,6 +17775,7 @@ PLpgSQL_Expr: opt_distinct_clause opt_target_list
 					n->whereClause = $4;
 					n->groupClause = ($5)->list;
 					n->groupDistinct = ($5)->distinct;
+					n->groupByAll = ($5)->all;
 					n->havingClause = $6;
 					n->windowClause = $7;
 					n->sortClause = $8;
@@ -17826,6 +18005,7 @@ unreserved_keyword:
 			| HOUR_P
 			| IDENTITY_P
 			| IF_P
+			| IGNORE_P
 			| IMMEDIATE
 			| IMMUTABLE
 			| IMPLICIT_P
@@ -17861,6 +18041,7 @@ unreserved_keyword:
 			| LOCK_P
 			| LOCKED
 			| LOGGED
+			| LSN_P
 			| MAPPING
 			| MATCH
 			| MATCHED
@@ -17909,6 +18090,7 @@ unreserved_keyword:
 			| PARSER
 			| PARTIAL
 			| PARTITION
+			| PARTITIONS
 			| PASSING
 			| PASSWORD
 			| PATH
@@ -17944,6 +18126,7 @@ unreserved_keyword:
 			| REPLACE
 			| REPLICA
 			| RESET
+			| RESPECT_P
 			| RESTART
 			| RESTRICT
 			| RETURN
@@ -17977,6 +18160,7 @@ unreserved_keyword:
 			| SKIP
 			| SNAPSHOT
 			| SOURCE
+			| SPLIT
 			| SQL_P
 			| STABLE
 			| STANDALONE_P
@@ -18030,6 +18214,7 @@ unreserved_keyword:
 			| VIEWS
 			| VIRTUAL
 			| VOLATILE
+			| WAIT
 			| WHITESPACE_P
 			| WITHIN
 			| WITHOUT
@@ -18476,6 +18661,7 @@ bare_label_keyword:
 			| LOCK_P
 			| LOCKED
 			| LOGGED
+			| LSN_P
 			| MAPPING
 			| MATCH
 			| MATCHED
@@ -18536,6 +18722,7 @@ bare_label_keyword:
 			| PARSER
 			| PARTIAL
 			| PARTITION
+			| PARTITIONS
 			| PASSING
 			| PASSWORD
 			| PATH
@@ -18616,6 +18803,7 @@ bare_label_keyword:
 			| SNAPSHOT
 			| SOME
 			| SOURCE
+			| SPLIT
 			| SQL_P
 			| STABLE
 			| STANDALONE_P
@@ -18687,6 +18875,7 @@ bare_label_keyword:
 			| VIEWS
 			| VIRTUAL
 			| VOLATILE
+			| WAIT
 			| WHEN
 			| WHITESPACE_P
 			| WORK
@@ -18748,47 +18937,6 @@ updateRawStmtEnd(RawStmt *rs, int end_location)
 	rs->stmt_len = end_location - rs->stmt_location;
 }
 
-/*
- * Adjust a PreparableStmt to reflect that it doesn't run to the end of the
- * string.
- */
-static void
-updatePreparableStmtEnd(Node *n, int end_location)
-{
-	if (IsA(n, SelectStmt))
-	{
-		SelectStmt *stmt = (SelectStmt *) n;
-
-		stmt->stmt_len = end_location - stmt->stmt_location;
-	}
-	else if (IsA(n, InsertStmt))
-	{
-		InsertStmt *stmt = (InsertStmt *) n;
-
-		stmt->stmt_len = end_location - stmt->stmt_location;
-	}
-	else if (IsA(n, UpdateStmt))
-	{
-		UpdateStmt *stmt = (UpdateStmt *) n;
-
-		stmt->stmt_len = end_location - stmt->stmt_location;
-	}
-	else if (IsA(n, DeleteStmt))
-	{
-		DeleteStmt *stmt = (DeleteStmt *) n;
-
-		stmt->stmt_len = end_location - stmt->stmt_location;
-	}
-	else if (IsA(n, MergeStmt))
-	{
-		MergeStmt  *stmt = (MergeStmt *) n;
-
-		stmt->stmt_len = end_location - stmt->stmt_location;
-	}
-	else
-		elog(ERROR, "unexpected node type %d", (int) n->type);
-}
-
 static Node *
 makeColumnRef(char *colname, List *indirection,
 			  int location, core_yyscan_t yyscanner)
@@ -19167,14 +19315,11 @@ insertSelectOptions(SelectStmt *stmt,
 					 errmsg("multiple WITH clauses not allowed"),
 					 parser_errposition(exprLocation((Node *) withClause))));
 		stmt->withClause = withClause;
-
-		/* Update SelectStmt's location to the start of the WITH clause */
-		stmt->stmt_location = withClause->location;
 	}
 }
 
 static Node *
-makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg, int location)
+makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg)
 {
 	SelectStmt *n = makeNode(SelectStmt);
 
@@ -19182,7 +19327,6 @@ makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg, int location)
 	n->all = all;
 	n->larg = (SelectStmt *) larg;
 	n->rarg = (SelectStmt *) rarg;
-	n->stmt_location = location;
 
 	return (Node *) n;
 }
@@ -19300,12 +19444,14 @@ makeNotExpr(Node *expr, int location)
 }
 
 static Node *
-makeAArrayExpr(List *elements, int location)
+makeAArrayExpr(List *elements, int location, int location_end)
 {
 	A_ArrayExpr *n = makeNode(A_ArrayExpr);
 
 	n->elements = elements;
 	n->location = location;
+	n->list_start = location;
+	n->list_end = location_end;
 
 	return (Node *) n;
 }
@@ -19638,6 +19784,47 @@ parsePartitionStrategy(char *strategy, int location, core_yyscan_t yyscanner)
 }
 
+/*
+ * Process all_objects_list to set all_tables and/or all_sequences.  Also
+ * check whether a pub_object_type has been specified more than once.
+ */
+static void
+preprocess_pub_all_objtype_list(List *all_objects_list, bool *all_tables,
+								bool *all_sequences, core_yyscan_t yyscanner)
+{
+	if (!all_objects_list)
+		return;
+
+	*all_tables = false;
+	*all_sequences = false;
+
+	foreach_ptr(PublicationAllObjSpec, obj, all_objects_list)
+	{
+		if (obj->pubobjtype == PUBLICATION_ALL_TABLES)
+		{
+			if (*all_tables)
+				ereport(ERROR,
+						errcode(ERRCODE_SYNTAX_ERROR),
+						errmsg("invalid publication object list"),
+						errdetail("ALL TABLES can be specified only once."),
+						parser_errposition(obj->location));
+
+			*all_tables = true;
+		}
+		else if (obj->pubobjtype == PUBLICATION_ALL_SEQUENCES)
+		{
+			if (*all_sequences)
+				ereport(ERROR,
+						errcode(ERRCODE_SYNTAX_ERROR),
+						errmsg("invalid publication object list"),
+						errdetail("ALL SEQUENCES can be specified only once."),
+						parser_errposition(obj->location));
+
+			*all_sequences = true;
+		}
+	}
+}
+
 /*
 * Process pubobjspec_list to check for errors in any of the objects and
 * convert PUBLICATIONOBJ_CONTINUATION into appropriate PublicationObjSpecType.
diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c
index 0ac8966e30ff3..b8340557b3454 100644
--- a/src/backend/parser/parse_agg.c
+++ b/src/backend/parser/parse_agg.c
@@ -38,6 +38,8 @@ typedef struct
 	ParseState *pstate;
 	int			min_varlevel;
 	int			min_agglevel;
+	int			min_ctelevel;
+	RangeTblEntry *min_cte;
 	int			sublevels_up;
 } check_agg_arguments_context;
 
@@ -58,7 +60,8 @@ typedef struct
 static int	check_agg_arguments(ParseState *pstate,
 								List *directargs,
 								List *args,
-								Expr *filter);
+								Expr *filter,
+								int agglocation);
 static bool check_agg_arguments_walker(Node *node,
 									   check_agg_arguments_context *context);
 static Node *substitute_grouped_columns(Node *node, ParseState *pstate, Query *qry,
@@ -339,7 +342,8 @@ check_agglevels_and_constraints(ParseState *pstate, Node *expr)
 		min_varlevel = check_agg_arguments(pstate,
 										   directargs,
 										   args,
-										   filter);
+										   filter,
+										   location);
 
 		*p_levelsup = min_varlevel;
 
@@ -641,7 +645,8 @@ static int
 check_agg_arguments(ParseState *pstate,
 					List *directargs,
 					List *args,
-					Expr *filter)
+					Expr *filter,
+					int agglocation)
 {
 	int			agglevel;
 	check_agg_arguments_context context;
@@ -649,6 +654,8 @@ check_agg_arguments(ParseState *pstate,
 	context.pstate = pstate;
 	context.min_varlevel = -1;	/* signifies nothing found yet */
 	context.min_agglevel = -1;
+	context.min_ctelevel = -1;
+	context.min_cte = NULL;
 	context.sublevels_up = 0;
 	(void) check_agg_arguments_walker((Node *) args, &context);
 
@@ -686,6 +693,20 @@ check_agg_arguments(ParseState *pstate,
 							 parser_errposition(pstate, aggloc)));
 	}
 
+	/*
+	 * If there's a non-local CTE that's below the aggregate's semantic level,
+	 * complain.  It's not quite clear what we should do to fix up such a case
+	 * (treating the CTE reference like a Var seems wrong), and it's also
+	 * unclear whether there is a real-world use for such cases.
+	 */
+	if (context.min_ctelevel >= 0 && context.min_ctelevel < agglevel)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("outer-level aggregate cannot use a nested CTE"),
+				 errdetail("CTE \"%s\" is below the aggregate's semantic level.",
+						   context.min_cte->eref->aliasname),
+				 parser_errposition(pstate, agglocation)));
+
 	/*
 	 * Now check for vars/aggs in the direct arguments, and throw error if
 	 * needed.  Note that we allow a Var of the agg's semantic level, but not
@@ -699,6 +720,7 @@ check_agg_arguments(ParseState *pstate,
 	{
 		context.min_varlevel = -1;
 		context.min_agglevel = -1;
+		context.min_ctelevel = -1;
 		(void) check_agg_arguments_walker((Node *) directargs, &context);
 		if (context.min_varlevel >= 0 && context.min_varlevel < agglevel)
 			ereport(ERROR,
@@ -714,6 +736,13 @@ check_agg_arguments(ParseState *pstate,
 							 parser_errposition(pstate,
 												locate_agg_of_level((Node *) directargs,
 																	context.min_agglevel))));
+		if (context.min_ctelevel >= 0 && context.min_ctelevel < agglevel)
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("outer-level aggregate cannot use a nested CTE"),
+					 errdetail("CTE \"%s\" is below the aggregate's semantic level.",
+							   context.min_cte->eref->aliasname),
+					 parser_errposition(pstate, agglocation)));
 	}
 	return agglevel;
 }
@@ -791,6 +820,30 @@ check_agg_arguments_walker(Node *node,
 								 parser_errposition(context->pstate,
 													((WindowFunc *) node)->location)));
 	}
+
+	if (IsA(node, RangeTblEntry))
+	{
+		RangeTblEntry *rte = (RangeTblEntry *) node;
+
+		if (rte->rtekind == RTE_CTE)
+		{
+			int			ctelevelsup = rte->ctelevelsup;
+
+			/* convert levelsup to frame of reference of original query */
+			ctelevelsup -= context->sublevels_up;
+			/* ignore local CTEs of subqueries */
+			if (ctelevelsup >= 0)
+			{
+				if (context->min_ctelevel < 0 ||
+					context->min_ctelevel > ctelevelsup)
+				{
+					context->min_ctelevel = ctelevelsup;
+					context->min_cte = rte;
+				}
+			}
+		}
+		return false;			/* allow range_table_walker to continue */
+	}
 	if (IsA(node, Query))
 	{
 		/* Recurse into subselects */
@@ -800,7 +853,7 @@ check_agg_arguments_walker(Node *node,
 		result = query_tree_walker((Query *) node,
 								   check_agg_arguments_walker,
 								   context,
-								   0);
+								   QTW_EXAMINE_RTES_BEFORE);
 		context->sublevels_up--;
 		return result;
 	}
diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c
index 9f20a70ce13cf..57609e2d55c40 100644
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@@ -733,7 +733,7 @@ transformRangeTableFunc(ParseState *pstate, RangeTableFunc *rtf)
 	tf->ordinalitycol = -1;
 
 	/* Process column specs */
-	names = palloc(sizeof(char *) * list_length(rtf->columns));
+	names = palloc_array(char *, list_length(rtf->columns));
 
 	colno = 0;
 	foreach(col, rtf->columns)
@@ -1573,7 +1573,7 @@ transformFromClauseItem(ParseState *pstate, Node *n,
 		{
 			ParseNamespaceItem *jnsitem;
 
-			jnsitem = (ParseNamespaceItem *) palloc(sizeof(ParseNamespaceItem));
+			jnsitem = palloc_object(ParseNamespaceItem);
 			jnsitem->p_names = j->join_using_alias;
 			jnsitem->p_rte = nsitem->p_rte;
 			jnsitem->p_rtindex = nsitem->p_rtindex;
@@ -2598,6 +2598,9 @@ transformGroupingSet(List **flatresult,
 * GROUP BY items will be added to the targetlist (as resjunk columns)
 * if not already present, so the targetlist must be passed by reference.
 *
+ * If GROUP BY ALL is specified, the groupClause will be inferred to be all
+ * non-aggregate, non-window expressions in the targetlist.
+ *
 * This is also used for window PARTITION BY clauses (which act almost the
 * same, but are always interpreted per SQL99 rules).
 *
@@ -2622,6 +2625,7 @@ transformGroupingSet(List **flatresult,
 *
 * pstate		ParseState
 * grouplist	clause to transform
+ * groupByAll	is this a GROUP BY ALL statement?
 * groupingSets reference to list to contain the grouping set tree
 * targetlist	reference to TargetEntry list
 * sortClause	ORDER BY clause (SortGroupClause nodes)
@@ -2629,7 +2633,8 @@ transformGroupingSet(List **flatresult,
 * useSQL99		SQL99 rather than SQL92 syntax
 */
 List *
-transformGroupClause(ParseState *pstate, List *grouplist, List **groupingSets,
+transformGroupClause(ParseState *pstate, List *grouplist, bool groupByAll,
+					 List **groupingSets,
 					 List **targetlist, List *sortClause,
 					 ParseExprKind exprKind, bool useSQL99)
 {
@@ -2640,6 +2645,63 @@ transformGroupClause(ParseState *pstate, List *grouplist, List **groupingSets,
 	bool		hasGroupingSets = false;
 	Bitmapset  *seen_local = NULL;
 
+	/* Handle GROUP BY ALL */
+	if (groupByAll)
+	{
+		/* There cannot have been any explicit grouplist items */
+		Assert(grouplist == NIL);
+
+		/* Iterate over targets, adding acceptable ones to the result list */
+		foreach_ptr(TargetEntry, tle, *targetlist)
+		{
+			/* Ignore junk TLEs */
+			if (tle->resjunk)
+				continue;
+
+			/*
+			 * TLEs containing aggregates are not okay to add to GROUP BY
+			 * (compare checkTargetlistEntrySQL92).  But the SQL standard
+			 * directs us to skip them, so it's fine.
+			 */
+			if (pstate->p_hasAggs &&
+				contain_aggs_of_level((Node *) tle->expr, 0))
+				continue;
+
+			/*
+			 * Likewise, TLEs containing window functions are not okay to add
+			 * to GROUP BY.  At this writing, the SQL standard is silent on
+			 * what to do with them, but by analogy to aggregates we'll just
+			 * skip them.
+			 */
+			if (pstate->p_hasWindowFuncs &&
+				contain_windowfuncs((Node *) tle->expr))
+				continue;
+
+			/*
+			 * Otherwise, add the TLE to the result using default sort/group
+			 * semantics.  We specify the parse location as the TLE's
+			 * location, despite the comment for addTargetToGroupList
+			 * discouraging that.  The only other thing we could point to is
+			 * the ALL keyword, which seems unhelpful when there are multiple
+			 * TLEs.
+			 */
+			result = addTargetToGroupList(pstate, tle,
+										  result, *targetlist,
+										  exprLocation((Node *) tle->expr));
+		}
+
+		/* If we found any acceptable targets, we're done */
+		if (result != NIL)
+			return result;
+
+		/*
+		 * Otherwise, the SQL standard says to treat it like "GROUP BY ()".
+		 * Build a representation of that, and let the rest of this function
+		 * handle it.
+		 */
+		grouplist = list_make1(makeGroupingSet(GROUPING_SET_EMPTY, NIL, -1));
+	}
+
 	/*
 	 * Recursively flatten implicit RowExprs.  (Technically this is only
 	 * needed for GROUP BY, per the syntax rules for grouping sets, but we do
 	 * it
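The GROUP BY ALL path above leans on three long-standing helpers. Declarations as I understand them from the tree (contain_aggs_of_level and contain_windowfuncs from rewrite/rewriteManip.h, makeGroupingSet from nodes/makefuncs.h); treat the exact headers as assumptions:

/* Presumed declarations of the helpers used by the GROUP BY ALL code:   */
extern bool contain_aggs_of_level(Node *node, int levelsup);  /* any aggs at this query level? */
extern bool contain_windowfuncs(Node *node);                  /* any window functions? */
extern GroupingSet *makeGroupingSet(GroupingSetKind kind, List *content,
									int location);            /* builds GROUP BY () here */

So the inference is: every non-junk, non-aggregate, non-window target expression becomes an implicit grouping column; if nothing qualifies, the clause degenerates to the empty grouping set, exactly as if "GROUP BY ()" had been written.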
@@ -2818,6 +2880,7 @@ transformWindowDefinitions(ParseState *pstate,
 											 true /* force SQL99 rules */ );
 		partitionClause = transformGroupClause(pstate,
 											   windef->partitionClause,
+											   false /* not GROUP BY ALL */ ,
 											   NULL,
 											   targetlist,
 											   orderClause,
@@ -3214,24 +3277,32 @@ resolve_unique_index_expr(ParseState *pstate, InferClause *infer,
 		 * Raw grammar re-uses CREATE INDEX infrastructure for unique index
 		 * inference clause, and so will accept opclasses by name and so on.
 		 *
-		 * Make no attempt to match ASC or DESC ordering or NULLS FIRST/NULLS
-		 * LAST ordering, since those are not significant for inference
-		 * purposes (any unique index matching the inference specification in
-		 * other regards is accepted indifferently).  Actively reject this as
-		 * wrong-headed.
+		 * Make no attempt to match ASC or DESC ordering, NULLS FIRST/NULLS
+		 * LAST ordering or opclass options, since those are not significant
+		 * for inference purposes (any unique index matching the inference
+		 * specification in other regards is accepted indifferently).  Actively
+		 * reject this as wrong-headed.
 		 */
 		if (ielem->ordering != SORTBY_DEFAULT)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
-					 errmsg("ASC/DESC is not allowed in ON CONFLICT clause"),
+					 errmsg("%s is not allowed in ON CONFLICT clause",
+							"ASC/DESC"),
 					 parser_errposition(pstate,
 										exprLocation((Node *) infer))));
 		if (ielem->nulls_ordering != SORTBY_NULLS_DEFAULT)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
-					 errmsg("NULLS FIRST/LAST is not allowed in ON CONFLICT clause"),
+					 errmsg("%s is not allowed in ON CONFLICT clause",
+							"NULLS FIRST/LAST"),
 					 parser_errposition(pstate,
 										exprLocation((Node *) infer))));
+		if (ielem->opclassopts)
+			ereport(ERROR,
+					errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+					errmsg("operator class options are not allowed in ON CONFLICT clause"),
+					parser_errposition(pstate,
+									   exprLocation((Node *) infer)));
 
 		if (!ielem->expr)
 		{
diff --git a/src/backend/parser/parse_coerce.c b/src/backend/parser/parse_coerce.c
index 0b5b81c7f27ee..78b1e366ad707 100644
--- a/src/backend/parser/parse_coerce.c
+++ b/src/backend/parser/parse_coerce.c
@@ -14,6 +14,7 @@
 */
 #include "postgres.h"
 
+#include "access/htup_details.h"
 #include "catalog/pg_cast.h"
 #include "catalog/pg_class.h"
 #include "catalog/pg_inherits.h"
diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c
index 1f8e2d54673dd..6b8fa15fca33b 100644
--- a/src/backend/parser/parse_expr.c
+++ b/src/backend/parser/parse_expr.c
@@ -15,9 +15,9 @@
 
 #include "postgres.h"
 
+#include "access/htup_details.h"
 #include "catalog/pg_aggregate.h"
 #include "catalog/pg_type.h"
-#include "commands/dbcommands.h"
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
@@ -94,7 +94,8 @@ static Node *transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func);
 static void transformJsonPassingArgs(ParseState *pstate, const char *constructName,
 									 JsonFormatType format, List *args,
 									 List **passing_values, List **passing_names);
-static JsonBehavior *transformJsonBehavior(ParseState *pstate, JsonBehavior *behavior,
+static JsonBehavior *transformJsonBehavior(ParseState *pstate, JsonExpr *jsexpr,
+										   JsonBehavior *behavior,
 										   JsonBehaviorType default_behavior,
 										   JsonReturning *returning);
 static Node *GetJsonBehaviorConst(JsonBehaviorType btype, int location);
@@ -326,7 +327,7 @@ transformExprRecurse(ParseState *pstate, Node *expr)
 		case T_CaseTestExpr:
 		case T_Var:
 			{
-				result = (Node *) expr;
+				result = expr;
 				break;
 			}
 
@@ -1223,6 +1224,8 @@ transformAExprIn(ParseState *pstate, A_Expr *a)
 		newa->element_typeid = scalar_type;
 		newa->elements = aexprs;
 		newa->multidims = false;
+		newa->list_start = a->rexpr_list_start;
+		newa->list_end = a->rexpr_list_end;
 		newa->location = -1;
 
 		result = (Node *) make_scalar_array_op(pstate,
@@ -2165,6 +2168,8 @@ transformArrayExpr(ParseState *pstate, A_ArrayExpr *a,
 	/* array_collid will be set by parse_collate.c */
 	newa->element_typeid = element_type;
 	newa->elements = newcoercedelems;
+	newa->list_start = a->list_start;
+	newa->list_end = a->list_end;
 	newa->location = a->location;
 
 	return (Node *) newa;
@@ -2901,7 +2906,7 @@ make_row_comparison_op(ParseState *pstate, List *opname,
 	 * operators, and see which interpretations (cmptypes) exist for each
 	 * operator.
 	 */
-	opinfo_lists = (List **) palloc(nopers * sizeof(List *));
+	opinfo_lists = palloc_array(List *, nopers);
 	cmptypes = NULL;
 	i = 0;
 	foreach(l, opexprs)
@@ -3236,7 +3241,7 @@ getJsonEncodingConst(JsonFormat *format)
 {
 	JsonEncoding encoding;
 	const char *enc;
-	Name		encname = palloc(sizeof(NameData));
+	Name		encname = palloc_object(NameData);
 
 	if (!format ||
 		format->format_type == JS_FORMAT_DEFAULT ||
@@ -4074,7 +4079,7 @@ transformJsonParseArg(ParseState *pstate, Node *jsexpr, JsonFormat *format,
 		if (*exprtype == UNKNOWNOID || typcategory == TYPCATEGORY_STRING)
 		{
-			expr = coerce_to_target_type(pstate, (Node *) expr, *exprtype,
+			expr = coerce_to_target_type(pstate, expr, *exprtype,
 										 TEXTOID, -1,
 										 COERCION_IMPLICIT,
 										 COERCE_IMPLICIT_CAST, -1);
@@ -4280,6 +4285,9 @@ transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func)
 {
 	JsonExpr   *jsexpr;
 	Node	   *path_spec;
+	Oid			pathspec_type;
+	int			pathspec_loc;
+	Node	   *coerced_path_spec;
 	const char *func_name = NULL;
 	JsonFormatType default_format;
@@ -4495,17 +4503,21 @@ transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func)
 		jsexpr->format = func->context_item->format;
 
 	path_spec = transformExprRecurse(pstate, func->pathspec);
-	path_spec = coerce_to_target_type(pstate, path_spec, exprType(path_spec),
-									  JSONPATHOID, -1,
-									  COERCION_EXPLICIT, COERCE_IMPLICIT_CAST,
-									  exprLocation(path_spec));
-	if (path_spec == NULL)
+	pathspec_type = exprType(path_spec);
+	pathspec_loc = exprLocation(path_spec);
+	coerced_path_spec = coerce_to_target_type(pstate, path_spec,
+											  pathspec_type,
+											  JSONPATHOID, -1,
+											  COERCION_EXPLICIT,
+											  COERCE_IMPLICIT_CAST,
+											  pathspec_loc);
+	if (coerced_path_spec == NULL)
 		ereport(ERROR,
 				(errcode(ERRCODE_DATATYPE_MISMATCH),
 				 errmsg("JSON path expression must be of type %s, not of type %s",
-						"jsonpath", format_type_be(exprType(path_spec))),
-				 parser_errposition(pstate, exprLocation(path_spec))));
-	jsexpr->path_spec = path_spec;
+						"jsonpath", format_type_be(pathspec_type)),
+				 parser_errposition(pstate, pathspec_loc)));
+	jsexpr->path_spec = coerced_path_spec;
 
 	/* Transform and coerce the PASSING arguments to jsonb. */
 	transformJsonPassingArgs(pstate, func_name,
@@ -4525,13 +4537,16 @@ transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func)
 		{
 			jsexpr->returning->typid = BOOLOID;
 			jsexpr->returning->typmod = -1;
+			jsexpr->collation = InvalidOid;
 		}
 
 		/* JSON_TABLE() COLUMNS can specify a non-boolean type. */
 		if (jsexpr->returning->typid != BOOLOID)
 			jsexpr->use_json_coercion = true;
 
-		jsexpr->on_error = transformJsonBehavior(pstate, func->on_error,
+		jsexpr->on_error = transformJsonBehavior(pstate,
+												 jsexpr,
+												 func->on_error,
 												 JSON_BEHAVIOR_FALSE,
 												 jsexpr->returning);
 		break;
@@ -4546,6 +4561,8 @@ transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func)
 			ret->typmod = -1;
 		}
 
+		jsexpr->collation = get_typcollation(jsexpr->returning->typid);
+
 		/*
 		 * Keep quotes on scalar strings by default, omitting them only if
 		 * OMIT QUOTES is specified.
@@ -4562,11 +4579,15 @@ transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func)
 			jsexpr->use_json_coercion = true;
 
 		/* Assume NULL ON EMPTY when ON EMPTY is not specified. */
-		jsexpr->on_empty = transformJsonBehavior(pstate, func->on_empty,
+		jsexpr->on_empty = transformJsonBehavior(pstate,
+												 jsexpr,
+												 func->on_empty,
 												 JSON_BEHAVIOR_NULL,
 												 jsexpr->returning);
 		/* Assume NULL ON ERROR when ON ERROR is not specified. */
-		jsexpr->on_error = transformJsonBehavior(pstate, func->on_error,
+		jsexpr->on_error = transformJsonBehavior(pstate,
+												 jsexpr,
+												 func->on_error,
 												 JSON_BEHAVIOR_NULL,
 												 jsexpr->returning);
 		break;
@@ -4578,6 +4599,7 @@ transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func)
 			jsexpr->returning->typid = TEXTOID;
 			jsexpr->returning->typmod = -1;
 		}
+		jsexpr->collation = get_typcollation(jsexpr->returning->typid);
 
 		/*
 		 * Override whatever transformJsonOutput() set these to, which
@@ -4603,11 +4625,15 @@ transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func)
 		}
 
 		/* Assume NULL ON EMPTY when ON EMPTY is not specified. */
-		jsexpr->on_empty = transformJsonBehavior(pstate, func->on_empty,
+		jsexpr->on_empty = transformJsonBehavior(pstate,
+												 jsexpr,
+												 func->on_empty,
 												 JSON_BEHAVIOR_NULL,
 												 jsexpr->returning);
 		/* Assume NULL ON ERROR when ON ERROR is not specified. */
-		jsexpr->on_error = transformJsonBehavior(pstate, func->on_error,
+		jsexpr->on_error = transformJsonBehavior(pstate,
+												 jsexpr,
+												 func->on_error,
 												 JSON_BEHAVIOR_NULL,
 												 jsexpr->returning);
 		break;
@@ -4618,6 +4644,7 @@ transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func)
 			jsexpr->returning->typid = exprType(jsexpr->formatted_expr);
 			jsexpr->returning->typmod = -1;
 		}
+		jsexpr->collation = get_typcollation(jsexpr->returning->typid);
 
 		/*
 		 * Assume EMPTY ARRAY ON ERROR when ON ERROR is not specified.
@@ -4625,7 +4652,9 @@ transformJsonFuncExpr(ParseState *pstate, JsonFuncExpr *func)
 		 * ON EMPTY cannot be specified at the top level but it can be for
 		 * the individual columns.
 		 */
-		jsexpr->on_error = transformJsonBehavior(pstate, func->on_error,
+		jsexpr->on_error = transformJsonBehavior(pstate,
+												 jsexpr,
+												 func->on_error,
 												 JSON_BEHAVIOR_EMPTY_ARRAY,
 												 jsexpr->returning);
 		break;
@@ -4701,7 +4730,8 @@ ValidJsonBehaviorDefaultExpr(Node *expr, void *context)
 * Transform a JSON BEHAVIOR clause.
 */
 static JsonBehavior *
-transformJsonBehavior(ParseState *pstate, JsonBehavior *behavior,
+transformJsonBehavior(ParseState *pstate, JsonExpr *jsexpr,
+					  JsonBehavior *behavior,
 					  JsonBehaviorType default_behavior,
 					  JsonReturning *returning)
 {
@@ -4716,7 +4746,11 @@ transformJsonBehavior(ParseState *pstate, JsonBehavior *behavior,
 		location = behavior->location;
 		if (btype == JSON_BEHAVIOR_DEFAULT)
 		{
+			Oid			targetcoll = jsexpr->collation;
+			Oid			exprcoll;
+
 			expr = transformExprRecurse(pstate, behavior->expr);
+
 			if (!ValidJsonBehaviorDefaultExpr(expr, NULL))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATATYPE_MISMATCH),
@@ -4732,6 +4766,24 @@ transformJsonBehavior(ParseState *pstate, JsonBehavior *behavior,
 						(errcode(ERRCODE_DATATYPE_MISMATCH),
 						 errmsg("DEFAULT expression must not return a set"),
 						 parser_errposition(pstate, exprLocation(expr))));
+
+			/*
+			 * Reject a DEFAULT expression whose collation differs from the
+			 * enclosing JSON expression's result collation
+			 * (jsexpr->collation), as chosen by the RETURNING clause.
+			 */
+			exprcoll = exprCollation(expr);
+			if (!OidIsValid(exprcoll))
+				exprcoll = get_typcollation(exprType(expr));
+			if (OidIsValid(targetcoll) && OidIsValid(exprcoll) &&
+				targetcoll != exprcoll)
+				ereport(ERROR,
+						errcode(ERRCODE_COLLATION_MISMATCH),
+						errmsg("collation of DEFAULT expression conflicts with RETURNING clause"),
+						errdetail("\"%s\" versus \"%s\"",
+								  get_collation_name(exprcoll),
+								  get_collation_name(targetcoll)),
+						parser_errposition(pstate, exprLocation(expr)));
 		}
 	}
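The collation bookkeeping above uses three long-standing backend helpers; their declarations, shown for orientation (nodes/nodeFuncs.h and utils/lsyscache.h, to my knowledge):

extern Oid	exprCollation(const Node *expr);	/* collation assigned to an expression node */
extern Oid	get_typcollation(Oid typid);		/* a type's default collation, or InvalidOid */
extern char *get_collation_name(Oid colloid);	/* for the error's detail text */

The fallback to get_typcollation() matters because the DEFAULT expression may not have been through collation assignment yet, so exprCollation() can legitimately return InvalidOid there.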
diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c
index 583bbbf232f04..778d69c6f3c20 100644
--- a/src/backend/parser/parse_func.c
+++ b/src/backend/parser/parse_func.c
@@ -42,6 +42,8 @@ typedef enum
 	FUNCLOOKUP_AMBIGUOUS,
 } FuncLookupError;
 
+static int	func_lookup_failure_details(int fgc_flags, List *argnames,
+										bool proc_call);
 static void unify_hypothetical_args(ParseState *pstate,
 									List *fargs, int numAggregatedArgs,
 									Oid *actual_arg_types, Oid *declared_arg_types);
@@ -98,6 +100,7 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 	bool		agg_star = (fn ? fn->agg_star : false);
 	bool		agg_distinct = (fn ? fn->agg_distinct : false);
 	bool		func_variadic = (fn ? fn->func_variadic : false);
+	int			ignore_nulls = (fn ? fn->ignore_nulls : NO_NULLTREATMENT);
 	CoercionForm funcformat = (fn ? fn->funcformat : COERCE_EXPLICIT_CALL);
 	bool		could_be_projection;
 	Oid			rettype;
@@ -115,6 +118,7 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 	int			nvargs;
 	Oid			vatype;
 	FuncDetailCode fdresult;
+	int			fgc_flags;
 	char		aggkind = 0;
 	ParseCallbackState pcbstate;
 
@@ -266,6 +270,7 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 	fdresult = func_get_detail(funcname, fargs, argnames, nargs,
 							   actual_arg_types,
 							   !func_variadic, true, proc_call,
+							   &fgc_flags,
 							   &funcid, &rettype, &retset,
 							   &nvargs, &vatype,
 							   &declared_arg_types, &argdefaults);
@@ -514,6 +519,13 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 					 errmsg("%s is not an ordered-set aggregate, so it cannot have WITHIN GROUP",
 							NameListToString(funcname)),
 					 parser_errposition(pstate, location)));
+
+			/* It also can't treat nulls as a window function */
+			if (ignore_nulls != NO_NULLTREATMENT)
+				ereport(ERROR,
+						(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+						 errmsg("aggregate functions do not accept RESPECT/IGNORE NULLS"),
+						 parser_errposition(pstate, location)));
 		}
 	}
 	else if (fdresult == FUNCDETAIL_WINDOWFUNC)
@@ -563,8 +575,8 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 					 errmsg("procedure %s is not unique",
 							func_signature_string(funcname, nargs, argnames,
 												  actual_arg_types)),
-					 errhint("Could not choose a best candidate procedure. "
-							 "You might need to add explicit type casts."),
+					 errdetail("Could not choose a best candidate procedure."),
+					 errhint("You might need to add explicit type casts."),
 					 parser_errposition(pstate, location)));
 		else
 			ereport(ERROR,
@@ -572,8 +584,8 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 					 errmsg("function %s is not unique",
 							func_signature_string(funcname, nargs, argnames,
 												  actual_arg_types)),
-					 errhint("Could not choose a best candidate function. "
-							 "You might need to add explicit type casts."),
+					 errdetail("Could not choose a best candidate function."),
+					 errhint("You might need to add explicit type casts."),
 					 parser_errposition(pstate, location)));
 	}
 	else
@@ -601,7 +613,9 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 
 		/*
 		 * No function, and no column either.  Since we're dealing with
-		 * function notation, report "function does not exist".
+		 * function notation, report "function/procedure does not exist".
+		 * Depending on what was returned in fgc_flags, we can add some color
+		 * to that with detail or hint messages.
 		 */
 		if (list_length(agg_order) > 1 && !agg_within_group)
 		{
@@ -611,8 +625,8 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 					 errmsg("function %s does not exist",
 							func_signature_string(funcname, nargs, argnames,
 												  actual_arg_types)),
-					 errhint("No aggregate function matches the given name and argument types. "
-							 "Perhaps you misplaced ORDER BY; ORDER BY must appear "
+					 errdetail("No aggregate function matches the given name and argument types."),
+					 errhint("Perhaps you misplaced ORDER BY; ORDER BY must appear "
 							 "after all regular arguments of the aggregate."),
 					 parser_errposition(pstate, location)));
 		}
@@ -622,8 +636,8 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 					 errmsg("procedure %s does not exist",
 							func_signature_string(funcname, nargs, argnames,
 												  actual_arg_types)),
-					 errhint("No procedure matches the given name and argument types. "
-							 "You might need to add explicit type casts."),
+					 func_lookup_failure_details(fgc_flags, argnames,
+												 proc_call),
 					 parser_errposition(pstate, location)));
 		else
 			ereport(ERROR,
@@ -631,8 +645,8 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 					 errmsg("function %s does not exist",
 							func_signature_string(funcname, nargs, argnames,
 												  actual_arg_types)),
-					 errhint("No function matches the given name and argument types. "
-							 "You might need to add explicit type casts."),
+					 func_lookup_failure_details(fgc_flags, argnames,
+												 proc_call),
 					 parser_errposition(pstate, location)));
 	}
 
@@ -834,6 +848,7 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 		wfunc->winstar = agg_star;
 		wfunc->winagg = (fdresult == FUNCDETAIL_AGGREGATE);
 		wfunc->aggfilter = agg_filter;
+		wfunc->ignore_nulls = ignore_nulls;
 		wfunc->runCondition = NIL;
 		wfunc->location = location;
 
@@ -905,6 +920,104 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs,
 	return retval;
 }
 
+/*
+ * Interpret the fgc_flags and issue a suitable detail or hint message.
+ *
+ * Helper function to reduce code duplication while throwing a
+ * function-not-found error.
+ */
+static int
+func_lookup_failure_details(int fgc_flags, List *argnames, bool proc_call)
+{
+	/*
+	 * If not FGC_NAME_VISIBLE, we shouldn't raise the question of whether the
+	 * arguments are wrong.  If the function name was not schema-qualified,
+	 * it's helpful to distinguish between doesn't-exist-anywhere and
+	 * not-in-search-path; but if it was, there's really nothing to add to the
+	 * basic "function/procedure %s does not exist" message.
+	 *
+	 * Note: we passed missing_ok = false to FuncnameGetCandidates, so there's
+	 * no need to consider FGC_SCHEMA_EXISTS here: we'd have already thrown an
+	 * error if an explicitly-given schema doesn't exist.
+	 */
+	if (!(fgc_flags & FGC_NAME_VISIBLE))
+	{
+		if (fgc_flags & FGC_SCHEMA_GIVEN)
+			return 0;			/* schema-qualified name */
+		else if (!(fgc_flags & FGC_NAME_EXISTS))
+		{
+			if (proc_call)
+				return errdetail("There is no procedure of that name.");
+			else
+				return errdetail("There is no function of that name.");
+		}
+		else
+		{
+			if (proc_call)
+				return errdetail("A procedure of that name exists, but it is not in the search_path.");
+			else
+				return errdetail("A function of that name exists, but it is not in the search_path.");
+		}
+	}
+
+	/*
+	 * Next, complain if nothing had the right number of arguments.  (This
+	 * takes precedence over wrong-argnames cases because we won't even look
+	 * at the argnames unless there's a workable number of arguments.)
+	 */
+	if (!(fgc_flags & FGC_ARGCOUNT_MATCH))
+	{
+		if (proc_call)
+			return errdetail("No procedure of that name accepts the given number of arguments.");
+		else
+			return errdetail("No function of that name accepts the given number of arguments.");
+	}
+
+	/*
+	 * If there are argnames, and we failed to match them, again we should
+	 * mention that and not bring up the argument types.
+	 */
+	if (argnames != NIL && !(fgc_flags & FGC_ARGNAMES_MATCH))
+	{
+		if (proc_call)
+			return errdetail("No procedure of that name accepts the given argument names.");
+		else
+			return errdetail("No function of that name accepts the given argument names.");
+	}
+
+	/*
+	 * We could have matched all the given argnames and still not have had a
+	 * valid call, either because of improper use of mixed notation, or
+	 * because of missing arguments, or because the user misused VARIADIC.  The
+	 * rules about named-argument matching are finicky enough that it's worth
+	 * trying to be specific about the problem.  (The messages here are chosen
+	 * with full knowledge of the steps that namespace.c uses while checking a
+	 * potential match.)
+	 */
+	if (argnames != NIL && !(fgc_flags & FGC_ARGNAMES_NONDUP))
+		return errdetail("In the closest available match, "
+						 "an argument was specified both positionally and by name.");
+
+	if (argnames != NIL && !(fgc_flags & FGC_ARGNAMES_ALL))
+		return errdetail("In the closest available match, "
+						 "not all required arguments were supplied.");
+
+	if (argnames != NIL && !(fgc_flags & FGC_ARGNAMES_VALID))
+		return errhint("This call would be correct if the variadic array were labeled VARIADIC and placed last.");
+
+	if (fgc_flags & FGC_VARIADIC_FAIL)
+		return errhint("The VARIADIC parameter must be placed last, even when using argument names.");
+
+	/*
+	 * Otherwise, the problem must be incorrect argument types.
+	 */
+	if (proc_call)
+		(void) errdetail("No procedure of that name accepts the given argument types.");
+	else
+		(void) errdetail("No function of that name accepts the given argument types.");
+	return errhint("You might need to add explicit type casts.");
+}
+
 /* func_match_argtypes()
 *
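The FGC_* flags consumed above (and by oper_lookup_failure_details later in this diff) are not defined in this excerpt. A sketch using exactly the names the code references, with illustrative bit assignments that are pure assumption -- the authoritative definitions would sit next to FuncnameGetCandidates(), presumably in catalog/namespace.h in this patch series:

/* Sketch only -- names from the code above, bit values assumed. */
#define FGC_SCHEMA_GIVEN	0x0001	/* name was schema-qualified */
#define FGC_SCHEMA_EXISTS	0x0002	/* the given schema exists */
#define FGC_NAME_EXISTS		0x0004	/* an object of that name exists somewhere */
#define FGC_NAME_VISIBLE	0x0008	/* ... and is visible in the search_path */
#define FGC_ARGCOUNT_MATCH	0x0010	/* some candidate takes this many arguments */
#define FGC_ARGNAMES_MATCH	0x0020	/* the given argument names matched a candidate */
#define FGC_ARGNAMES_NONDUP	0x0040	/* no argument given both positionally and by name */
#define FGC_ARGNAMES_ALL	0x0080	/* all required arguments were supplied */
#define FGC_ARGNAMES_VALID	0x0100	/* named VARIADIC usage was valid */
#define FGC_VARIADIC_FAIL	0x0200	/* VARIADIC parameter was misplaced */

The helper reads them strictly in this order so the reported cause is the earliest gate the lookup failed, which is why, e.g., wrong argument names are never reported when the argument count was already wrong.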
+ * + * On success, return values *funcid through *true_typeids receive info about + * the function. If argdefaults isn't NULL, *argdefaults receives a list of + * any default argument expressions that need to be added to the given + * arguments. * * When processing a named- or mixed-notation call (ie, fargnames isn't NIL), * the returned true_typeids and argdefaults are ordered according to the @@ -1400,6 +1518,7 @@ func_get_detail(List *funcname, bool expand_variadic, bool expand_defaults, bool include_out_arguments, + int *fgc_flags, /* return value */ Oid *funcid, /* return value */ Oid *rettype, /* return value */ bool *retset, /* return value */ @@ -1424,7 +1543,8 @@ func_get_detail(List *funcname, /* Get list of possible candidates from namespace search */ raw_candidates = FuncnameGetCandidates(funcname, nargs, fargnames, expand_variadic, expand_defaults, - include_out_arguments, false); + include_out_arguments, false, + fgc_flags); /* * Quickly check if there is an exact match to the input datatypes (there @@ -1594,7 +1714,10 @@ func_get_detail(List *funcname, */ if (fargnames != NIL && !expand_variadic && nargs > 0 && best_candidate->argnumbers[nargs - 1] != nargs - 1) + { + *fgc_flags |= FGC_VARIADIC_FAIL; return FUNCDETAIL_NOTFOUND; + } *funcid = best_candidate->oid; *nvargs = best_candidate->nvargs; @@ -2053,6 +2176,7 @@ LookupFuncNameInternal(ObjectType objtype, List *funcname, { Oid result = InvalidOid; FuncCandidateList clist; + int fgc_flags; /* NULL argtypes allowed for nullary functions only */ Assert(argtypes != NULL || nargs == 0); @@ -2062,7 +2186,8 @@ LookupFuncNameInternal(ObjectType objtype, List *funcname, /* Get list of candidate objects */ clist = FuncnameGetCandidates(funcname, nargs, NIL, false, false, - include_out_arguments, missing_ok); + include_out_arguments, missing_ok, + &fgc_flags); /* Scan list for a match to the arg types (if specified) and the objtype */ for (; clist != NULL; clist = clist->next) diff --git a/src/backend/parser/parse_node.c b/src/backend/parser/parse_node.c index d6feb16aef375..cafeb87217ceb 100644 --- a/src/backend/parser/parse_node.c +++ b/src/backend/parser/parse_node.c @@ -40,7 +40,7 @@ make_parsestate(ParseState *parentParseState) { ParseState *pstate; - pstate = palloc0(sizeof(ParseState)); + pstate = palloc0_object(ParseState); pstate->parentParseState = parentParseState; @@ -408,7 +408,7 @@ make_const(ParseState *pstate, A_Const *aconst) typeid = INT8OID; typelen = sizeof(int64); - typebyval = FLOAT8PASSBYVAL; /* int8 and float8 alike */ + typebyval = true; } } else diff --git a/src/backend/parser/parse_oper.c b/src/backend/parser/parse_oper.c index 0c4337563cf35..7bd7a336fd6fd 100644 --- a/src/backend/parser/parse_oper.c +++ b/src/backend/parser/parse_oper.c @@ -72,7 +72,8 @@ static FuncDetailCode oper_select_candidate(int nargs, Oid *operOid); static void op_error(ParseState *pstate, List *op, Oid arg1, Oid arg2, - FuncDetailCode fdresult, int location); + FuncDetailCode fdresult, int fgc_flags, int location); +static int oper_lookup_failure_details(int fgc_flags, bool is_unary_op); static bool make_oper_cache_key(ParseState *pstate, OprCacheKey *key, List *opname, Oid ltypeId, Oid rtypeId, int location); @@ -373,6 +374,7 @@ oper(ParseState *pstate, List *opname, Oid ltypeId, Oid rtypeId, Oid operOid; OprCacheKey key; bool key_ok; + int fgc_flags = 0; FuncDetailCode fdresult = FUNCDETAIL_NOTFOUND; HeapTuple tup = NULL; @@ -404,7 +406,7 @@ oper(ParseState *pstate, List *opname, Oid ltypeId, Oid rtypeId, FuncCandidateList 
clist; /* Get binary operators of given name */ - clist = OpernameGetCandidates(opname, 'b', false); + clist = OpernameGetCandidates(opname, 'b', false, &fgc_flags); /* No operators found? Then fail... */ if (clist != NULL) @@ -434,7 +436,8 @@ oper(ParseState *pstate, List *opname, Oid ltypeId, Oid rtypeId, make_oper_cache_entry(&key, operOid); } else if (!noError) - op_error(pstate, opname, ltypeId, rtypeId, fdresult, location); + op_error(pstate, opname, ltypeId, rtypeId, + fdresult, fgc_flags, location); return (Operator) tup; } @@ -520,6 +523,7 @@ left_oper(ParseState *pstate, List *op, Oid arg, bool noError, int location) Oid operOid; OprCacheKey key; bool key_ok; + int fgc_flags = 0; FuncDetailCode fdresult = FUNCDETAIL_NOTFOUND; HeapTuple tup = NULL; @@ -551,7 +555,7 @@ left_oper(ParseState *pstate, List *op, Oid arg, bool noError, int location) FuncCandidateList clist; /* Get prefix operators of given name */ - clist = OpernameGetCandidates(op, 'l', false); + clist = OpernameGetCandidates(op, 'l', false, &fgc_flags); /* No operators found? Then fail... */ if (clist != NULL) @@ -585,7 +589,8 @@ left_oper(ParseState *pstate, List *op, Oid arg, bool noError, int location) make_oper_cache_entry(&key, operOid); } else if (!noError) - op_error(pstate, op, InvalidOid, arg, fdresult, location); + op_error(pstate, op, InvalidOid, arg, + fdresult, fgc_flags, location); return (Operator) tup; } @@ -621,29 +626,67 @@ op_signature_string(List *op, Oid arg1, Oid arg2) static void op_error(ParseState *pstate, List *op, Oid arg1, Oid arg2, - FuncDetailCode fdresult, int location) + FuncDetailCode fdresult, int fgc_flags, int location) { if (fdresult == FUNCDETAIL_MULTIPLE) ereport(ERROR, (errcode(ERRCODE_AMBIGUOUS_FUNCTION), errmsg("operator is not unique: %s", op_signature_string(op, arg1, arg2)), - errhint("Could not choose a best candidate operator. " - "You might need to add explicit type casts."), + errdetail("Could not choose a best candidate operator."), + errhint("You might need to add explicit type casts."), parser_errposition(pstate, location))); else ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION), errmsg("operator does not exist: %s", op_signature_string(op, arg1, arg2)), - (!arg1 || !arg2) ? - errhint("No operator matches the given name and argument type. " - "You might need to add an explicit type cast.") : - errhint("No operator matches the given name and argument types. " - "You might need to add explicit type casts."), + oper_lookup_failure_details(fgc_flags, (!arg1 || !arg2)), parser_errposition(pstate, location))); } +/* + * Interpret the fgc_flags and issue a suitable detail or hint message. + */ +static int +oper_lookup_failure_details(int fgc_flags, bool is_unary_op) +{ + /* + * If not FGC_NAME_VISIBLE, we shouldn't raise the question of whether the + * arguments are wrong. If the operator name was not schema-qualified, + * it's helpful to distinguish between doesn't-exist-anywhere and + * not-in-search-path; but if it was, there's really nothing to add to the + * basic "operator does not exist" message. + * + * Note: we passed missing_ok = false to OpernameGetCandidates, so there's + * no need to consider FGC_SCHEMA_EXISTS here: we'd have already thrown an + * error if an explicitly-given schema doesn't exist. 
+ */ + if (!(fgc_flags & FGC_NAME_VISIBLE)) + { + if (fgc_flags & FGC_SCHEMA_GIVEN) + return 0; /* schema-qualified name */ + else if (!(fgc_flags & FGC_NAME_EXISTS)) + return errdetail("There is no operator of that name."); + else + return errdetail("An operator of that name exists, but it is not in the search_path."); + } + + /* + * Otherwise, the problem must be incorrect argument type(s). + */ + if (is_unary_op) + { + (void) errdetail("No operator of that name accepts the given argument type."); + return errhint("You might need to add an explicit type cast."); + } + else + { + (void) errdetail("No operator of that name accepts the given argument types."); + return errhint("You might need to add explicit type casts."); + } +} + /* * make_op() * Operator expression construction. diff --git a/src/backend/parser/parse_param.c b/src/backend/parser/parse_param.c index 930921626b6d5..772f3e3c1d8fc 100644 --- a/src/backend/parser/parse_param.c +++ b/src/backend/parser/parse_param.c @@ -68,7 +68,7 @@ void setup_parse_fixed_parameters(ParseState *pstate, const Oid *paramTypes, int numParams) { - FixedParamState *parstate = palloc(sizeof(FixedParamState)); + FixedParamState *parstate = palloc_object(FixedParamState); parstate->paramTypes = paramTypes; parstate->numParams = numParams; @@ -84,7 +84,7 @@ void setup_parse_variable_parameters(ParseState *pstate, Oid **paramTypes, int *numParams) { - VarParamState *parstate = palloc(sizeof(VarParamState)); + VarParamState *parstate = palloc_object(VarParamState); parstate->paramTypes = paramTypes; parstate->numParams = numParams; diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index 04ecf64b1fc25..dd64f45478ab4 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -18,11 +18,9 @@ #include "access/htup_details.h" #include "access/relation.h" -#include "access/sysattr.h" #include "access/table.h" #include "catalog/heap.h" #include "catalog/namespace.h" -#include "catalog/pg_type.h" #include "funcapi.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" @@ -33,7 +31,6 @@ #include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/lsyscache.h" -#include "utils/rel.h" #include "utils/syscache.h" #include "utils/varlena.h" @@ -103,7 +100,6 @@ static void expandTupleDesc(TupleDesc tupdesc, Alias *eref, static int specialAttNum(const char *attname); static bool rte_visible_if_lateral(ParseState *pstate, RangeTblEntry *rte); static bool rte_visible_if_qualified(ParseState *pstate, RangeTblEntry *rte); -static bool isQueryUsingTempRelation_walker(Node *node, void *context); /* @@ -968,7 +964,7 @@ searchRangeTableForCol(ParseState *pstate, const char *alias, const char *colnam int location) { ParseState *orig_pstate = pstate; - FuzzyAttrMatchState *fuzzystate = palloc(sizeof(FuzzyAttrMatchState)); + FuzzyAttrMatchState *fuzzystate = palloc_object(FuzzyAttrMatchState); fuzzystate->distance = MAX_FUZZY_DISTANCE + 1; fuzzystate->rfirst = NULL; @@ -1340,7 +1336,7 @@ buildNSItemFromTupleDesc(RangeTblEntry *rte, Index rtindex, } /* ... and build the nsitem */ - nsitem = (ParseNamespaceItem *) palloc(sizeof(ParseNamespaceItem)); + nsitem = palloc_object(ParseNamespaceItem); nsitem->p_names = rte->eref; nsitem->p_rte = rte; nsitem->p_rtindex = rtindex; @@ -1404,7 +1400,7 @@ buildNSItemFromLists(RangeTblEntry *rte, Index rtindex, } /* ... 
and build the nsitem */ - nsitem = (ParseNamespaceItem *) palloc(sizeof(ParseNamespaceItem)); + nsitem = palloc_object(ParseNamespaceItem); nsitem->p_names = rte->eref; nsitem->p_rte = rte; nsitem->p_rtindex = rtindex; @@ -1795,7 +1791,7 @@ addRangeTableEntryForFunction(ParseState *pstate, rte->eref = eref; /* Process each function ... */ - functupdescs = (TupleDesc *) palloc(nfuncs * sizeof(TupleDesc)); + functupdescs = palloc_array(TupleDesc, nfuncs); totalatts = 0; funcno = 0; @@ -2306,7 +2302,7 @@ addRangeTableEntryForJoin(ParseState *pstate, * Build a ParseNamespaceItem, but don't add it to the pstate's namespace * list --- caller must do that if appropriate. */ - nsitem = (ParseNamespaceItem *) palloc(sizeof(ParseNamespaceItem)); + nsitem = palloc_object(ParseNamespaceItem); nsitem->p_names = rte->eref; nsitem->p_rte = rte; nsitem->p_perminfo = NULL; @@ -3489,13 +3485,13 @@ get_rte_attribute_is_dropped(RangeTblEntry *rte, AttrNumber attnum) if (tupdesc) { /* Composite data type, e.g. a table's row type */ - Form_pg_attribute att_tup; + CompactAttribute *att; Assert(tupdesc); Assert(attnum - atts_done <= tupdesc->natts); - att_tup = TupleDescAttr(tupdesc, - attnum - atts_done - 1); - return att_tup->attisdropped; + att = TupleDescCompactAttr(tupdesc, + attnum - atts_done - 1); + return att->attisdropped; } /* Otherwise, it can't have any dropped columns */ return false; @@ -3922,53 +3918,6 @@ rte_visible_if_qualified(ParseState *pstate, RangeTblEntry *rte) } -/* - * Examine a fully-parsed query, and return true iff any relation underlying - * the query is a temporary relation (table, view, or materialized view). - */ -bool -isQueryUsingTempRelation(Query *query) -{ - return isQueryUsingTempRelation_walker((Node *) query, NULL); -} - -static bool -isQueryUsingTempRelation_walker(Node *node, void *context) -{ - if (node == NULL) - return false; - - if (IsA(node, Query)) - { - Query *query = (Query *) node; - ListCell *rtable; - - foreach(rtable, query->rtable) - { - RangeTblEntry *rte = lfirst(rtable); - - if (rte->rtekind == RTE_RELATION) - { - Relation rel = table_open(rte->relid, AccessShareLock); - char relpersistence = rel->rd_rel->relpersistence; - - table_close(rel, AccessShareLock); - if (relpersistence == RELPERSISTENCE_TEMP) - return true; - } - } - - return query_tree_walker(query, - isQueryUsingTempRelation_walker, - context, - QTW_IGNORE_JOINALIASES); - } - - return expression_tree_walker(node, - isQueryUsingTempRelation_walker, - context); -} - /* * addRTEPermissionInfo * Creates RTEPermissionInfo for a given RTE and adds it into the diff --git a/src/backend/parser/parse_target.c b/src/backend/parser/parse_target.c index 4aba0d9d4d5cc..905c975d83b56 100644 --- a/src/backend/parser/parse_target.c +++ b/src/backend/parser/parse_target.c @@ -16,7 +16,6 @@ #include "catalog/namespace.h" #include "catalog/pg_type.h" -#include "commands/dbcommands.h" #include "funcapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" diff --git a/src/backend/parser/parse_type.c b/src/backend/parser/parse_type.c index 7713bdc6af0a9..9d7aa6967dad9 100644 --- a/src/backend/parser/parse_type.c +++ b/src/backend/parser/parse_type.c @@ -369,7 +369,7 @@ typenameTypeMod(ParseState *pstate, const TypeName *typeName, Type typ) * Currently, we allow simple numeric constants, string literals, and * identifiers; possibly this list could be extended. 
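 	 *
 	 * For example, in "numeric(10,2)" the typmods are the integer constants
 	 * 10 and 2; a string literal or identifier typmod would arise from a
 	 * user-defined type, e.g. the hypothetical "mytype('fast')".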
*/ - datums = (Datum *) palloc(list_length(typeName->typmods) * sizeof(Datum)); + datums = palloc_array(Datum, list_length(typeName->typmods)); n = 0; foreach(l, typeName->typmods) { @@ -382,7 +382,7 @@ typenameTypeMod(ParseState *pstate, const TypeName *typeName, Type typ) if (IsA(&ac->val, Integer)) { - cstr = psprintf("%ld", (long) intVal(&ac->val)); + cstr = psprintf("%d", intVal(&ac->val)); } else if (IsA(&ac->val, Float)) { diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 62015431fdf1a..2b7b084f2162c 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -32,6 +32,7 @@ #include "catalog/heap.h" #include "catalog/index.h" #include "catalog/namespace.h" +#include "catalog/partition.h" #include "catalog/pg_am.h" #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" @@ -58,6 +59,8 @@ #include "parser/parse_type.h" #include "parser/parse_utilcmd.h" #include "parser/parser.h" +#include "partitioning/partbounds.h" +#include "partitioning/partdesc.h" #include "rewrite/rewriteManip.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -134,7 +137,7 @@ static void transformConstraintAttrs(CreateStmtContext *cxt, List *constraintList); static void transformColumnType(CreateStmtContext *cxt, ColumnDef *column); static void setSchemaName(const char *context_schema, char **stmt_schema_name); -static void transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd); +static void transformPartitionCmd(CreateStmtContext *cxt, PartitionBoundSpec *bound); static List *transformPartitionRangeBounds(ParseState *pstate, List *blist, Relation parent); static void validateInfiniteBounds(ParseState *pstate, List *blist); @@ -1279,6 +1282,28 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla lst = RelationGetNotNullConstraints(RelationGetRelid(relation), false, true); cxt->nnconstraints = list_concat(cxt->nnconstraints, lst); + + /* Copy comments on not-null constraints */ + if (table_like_clause->options & CREATE_TABLE_LIKE_COMMENTS) + { + foreach_node(Constraint, nnconstr, lst) + { + if ((comment = GetComment(get_relation_constraint_oid(RelationGetRelid(relation), + nnconstr->conname, false), + ConstraintRelationId, + 0)) != NULL) + { + CommentStmt *stmt = makeNode(CommentStmt); + + stmt->objtype = OBJECT_TABCONSTRAINT; + stmt->object = (Node *) list_make3(makeString(cxt->relation->schemaname), + makeString(cxt->relation->relname), + makeString(nnconstr->conname)); + stmt->comment = comment; + cxt->alist = lappend(cxt->alist, stmt); + } + } + } } /* @@ -1439,7 +1464,6 @@ expandTableLikeClause(RangeVar *heapRel, TableLikeClause *table_like_clause) char *ccname = constr->check[ccnum].ccname; char *ccbin = constr->check[ccnum].ccbin; bool ccenforced = constr->check[ccnum].ccenforced; - bool ccvalid = constr->check[ccnum].ccvalid; bool ccnoinherit = constr->check[ccnum].ccnoinherit; Node *ccbin_node; bool found_whole_row; @@ -1470,7 +1494,7 @@ expandTableLikeClause(RangeVar *heapRel, TableLikeClause *table_like_clause) n->conname = pstrdup(ccname); n->location = -1; n->is_enforced = ccenforced; - n->initially_valid = ccvalid; + n->initially_valid = ccenforced; /* sic */ n->is_no_inherit = ccnoinherit; n->raw_expr = NULL; n->cooked_expr = nodeToString(ccbin_node); @@ -2548,7 +2572,7 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) } /* Close the index relation but keep the lock */ - relation_close(index_rel, NoLock); + index_close(index_rel, NoLock); 
 	index->indexOid = index_oid;
 }
@@ -3488,6 +3512,288 @@ transformRuleStmt(RuleStmt *stmt, const char *queryString,
 }
 
+/*
+ * checkPartition
+ *		Check whether partRelOid is a leaf partition of the parent table (rel).
+ * isMerge: true indicates the operation is "ALTER TABLE ... MERGE PARTITIONS";
+ * false indicates the operation is "ALTER TABLE ... SPLIT PARTITION".
+ */
+static void
+checkPartition(Relation rel, Oid partRelOid, bool isMerge)
+{
+	Relation	partRel;
+
+	partRel = table_open(partRelOid, NoLock);
+
+	if (partRel->rd_rel->relkind != RELKIND_RELATION)
+		ereport(ERROR,
+				errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				errmsg("\"%s\" is not a table", RelationGetRelationName(partRel)),
+				isMerge
+				? errhint("ALTER TABLE ... MERGE PARTITIONS can only merge partitions that do not have sub-partitions.")
+				: errhint("ALTER TABLE ... SPLIT PARTITION can only split partitions that do not have sub-partitions."));
+
+	if (!partRel->rd_rel->relispartition)
+		ereport(ERROR,
+				errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				errmsg("\"%s\" is not a partition of partitioned table \"%s\"",
+					   RelationGetRelationName(partRel), RelationGetRelationName(rel)),
+				isMerge
+				? errhint("ALTER TABLE ... MERGE PARTITIONS can only merge partitions that do not have sub-partitions.")
+				: errhint("ALTER TABLE ... SPLIT PARTITION can only split partitions that do not have sub-partitions."));
+
+	if (get_partition_parent(partRelOid, false) != RelationGetRelid(rel))
+		ereport(ERROR,
+				errcode(ERRCODE_UNDEFINED_TABLE),
+				errmsg("relation \"%s\" is not a partition of relation \"%s\"",
+					   RelationGetRelationName(partRel), RelationGetRelationName(rel)),
+				isMerge
+				? errhint("ALTER TABLE ... MERGE PARTITIONS can only merge partitions that do not have sub-partitions.")
+				: errhint("ALTER TABLE ... SPLIT PARTITION can only split partitions that do not have sub-partitions."));
+
+	table_close(partRel, NoLock);
+}
+
+/*
+ * transformPartitionCmdForSplit -
+ *		analyze the ALTER TABLE ... SPLIT PARTITION command
+ *
+ * For each new partition, sps->bound is set to the transformed value of bound.
+ * Also checks the bounds of the new partitions.
+ */
+static void
+transformPartitionCmdForSplit(CreateStmtContext *cxt, PartitionCmd *partcmd)
+{
+	Relation	parent = cxt->rel;
+	PartitionKey key;
+	char		strategy;
+	Oid			splitPartOid;
+	Oid			defaultPartOid;
+	int			default_index = -1;
+	bool		isSplitPartDefault;
+	ListCell   *listptr,
+			   *listptr2;
+	List	   *splitlist;
+
+	splitlist = partcmd->partlist;
+	key = RelationGetPartitionKey(parent);
+	strategy = get_partition_strategy(key);
+	defaultPartOid = get_default_oid_from_partdesc(RelationGetPartitionDesc(parent, true));
+
+	/* Transform partition bounds for all partitions in the list: */
+	foreach_node(SinglePartitionSpec, sps, splitlist)
+	{
+		cxt->partbound = NULL;
+		transformPartitionCmd(cxt, sps->bound);
+		/* Assign the transformed value of the partition bound. */
+		sps->bound = cxt->partbound;
+	}
+
+	/*
+	 * Open and lock the partition, checking ownership along the way.  We
+	 * need to use AccessExclusiveLock here because this split partition will
+	 * be detached, then dropped in ATExecSplitPartition.
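+	 *
+	 * (For illustration, a hypothetical command handled here:
+	 *   ALTER TABLE t SPLIT PARTITION p INTO
+	 *     (PARTITION p1 FOR VALUES FROM (0) TO (5),
+	 *      PARTITION p2 FOR VALUES FROM (5) TO (10));
+	 * where p is the partition being locked.)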
+	 */
+	splitPartOid = RangeVarGetRelidExtended(partcmd->name, AccessExclusiveLock,
+											0, RangeVarCallbackOwnsRelation,
+											NULL);
+
+	checkPartition(parent, splitPartOid, false);
+
+	switch (strategy)
+	{
+		case PARTITION_STRATEGY_LIST:
+		case PARTITION_STRATEGY_RANGE:
+			{
+				foreach_node(SinglePartitionSpec, sps, splitlist)
+				{
+					if (sps->bound->is_default)
+					{
+						if (default_index != -1)
+							ereport(ERROR,
+									errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+									errmsg("only one of the new partitions can be DEFAULT"),
+									parser_errposition(cxt->pstate, sps->name->location));
+
+						default_index = foreach_current_index(sps);
+					}
+				}
+			}
+			break;
+
+		case PARTITION_STRATEGY_HASH:
+			ereport(ERROR,
+					errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					errmsg("partition of hash-partitioned table cannot be split"));
+			break;
+
+		default:
+			elog(ERROR, "unexpected partition strategy: %d",
+				 (int) key->strategy);
+			break;
+	}
+
+	/* isSplitPartDefault: is the partition being split the DEFAULT partition? */
+	isSplitPartDefault = (defaultPartOid == splitPartOid);
+
+	if (isSplitPartDefault && default_index == -1)
+		ereport(ERROR,
+				errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				errmsg("cannot split DEFAULT partition \"%s\"",
+					   get_rel_name(splitPartOid)),
+				errhint("To split the DEFAULT partition, one of the new partitions must be DEFAULT."),
+				parser_errposition(cxt->pstate, ((SinglePartitionSpec *) linitial(splitlist))->name->location));
+
+	/*
+	 * If the partition being split is not the DEFAULT partition, but the
+	 * DEFAULT partition exists, then none of the resulting split partitions
+	 * can be the DEFAULT.
+	 */
+	if (!isSplitPartDefault && (default_index != -1) && OidIsValid(defaultPartOid))
+	{
+		SinglePartitionSpec *spsDef =
+			(SinglePartitionSpec *) list_nth(splitlist, default_index);
+
+		ereport(ERROR,
+				errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				errmsg("cannot split non-DEFAULT partition \"%s\"",
+					   get_rel_name(splitPartOid)),
+				errdetail("The new partition cannot be DEFAULT because DEFAULT partition \"%s\" already exists.",
+						  get_rel_name(defaultPartOid)),
+				parser_errposition(cxt->pstate, spsDef->name->location));
+	}
+
+	foreach(listptr, splitlist)
+	{
+		Oid			nspid;
+		SinglePartitionSpec *sps = (SinglePartitionSpec *) lfirst(listptr);
+		RangeVar   *name = sps->name;
+
+		nspid = RangeVarGetCreationNamespace(sps->name);
+
+		/* Partitions in the list should have different names. */
+		for_each_cell(listptr2, splitlist, lnext(splitlist, listptr))
+		{
+			Oid			nspid2;
+			SinglePartitionSpec *sps2 = (SinglePartitionSpec *) lfirst(listptr2);
+			RangeVar   *name2 = sps2->name;
+
+			if (equal(name, name2))
+				ereport(ERROR,
+						errcode(ERRCODE_DUPLICATE_TABLE),
+						errmsg("partition with name \"%s\" is already used", name->relname),
+						parser_errposition(cxt->pstate, name2->location));
+
+			nspid2 = RangeVarGetCreationNamespace(sps2->name);
+
+			if (nspid2 == nspid && strcmp(name->relname, name2->relname) == 0)
+				ereport(ERROR,
+						errcode(ERRCODE_DUPLICATE_TABLE),
+						errmsg("partition with name \"%s\" is already used", name->relname),
+						parser_errposition(cxt->pstate, name2->location));
+		}
+	}
+
+	/* Then we should check partitions with transformed bounds. */
+	check_partitions_for_split(parent, splitPartOid, splitlist, cxt->pstate);
+}
+
+
+/*
+ * transformPartitionCmdForMerge -
+ *		analyze the ALTER TABLE ... MERGE PARTITIONS command
+ *
+ * Does simple checks on the partitions to be merged and calculates the bound
+ * of the resulting partition.
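+ *
+ * For example, a hypothetical command handled here:
+ *   ALTER TABLE t MERGE PARTITIONS (p1, p2) INTO p12;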
+ */
+static void
+transformPartitionCmdForMerge(CreateStmtContext *cxt, PartitionCmd *partcmd)
+{
+	Oid			defaultPartOid;
+	Oid			partOid;
+	Relation	parent = cxt->rel;
+	PartitionKey key;
+	char		strategy;
+	ListCell   *listptr,
+			   *listptr2;
+	bool		isDefaultPart = false;
+	List	   *partOids = NIL;
+
+	key = RelationGetPartitionKey(parent);
+	strategy = get_partition_strategy(key);
+
+	if (strategy == PARTITION_STRATEGY_HASH)
+		ereport(ERROR,
+				errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				errmsg("partition of hash-partitioned table cannot be merged"));
+
+	/* Does the partitioned table (parent) have a default partition? */
+	defaultPartOid = get_default_oid_from_partdesc(RelationGetPartitionDesc(parent, true));
+
+	foreach(listptr, partcmd->partlist)
+	{
+		RangeVar   *name = (RangeVar *) lfirst(listptr);
+
+		/* Partitions in the list should have different names. */
+		for_each_cell(listptr2, partcmd->partlist, lnext(partcmd->partlist, listptr))
+		{
+			RangeVar   *name2 = (RangeVar *) lfirst(listptr2);
+
+			if (equal(name, name2))
+				ereport(ERROR,
+						errcode(ERRCODE_DUPLICATE_TABLE),
+						errmsg("partition with name \"%s\" is already used", name->relname),
+						parser_errposition(cxt->pstate, name2->location));
+		}
+
+		/*
+		 * Search for the DEFAULT partition in the list.  Open and lock
+		 * partitions before calculating the boundary of the resulting
+		 * partition; we also check ownership along the way.  We need to use
+		 * AccessExclusiveLock here, because these merged partitions will be
+		 * detached and then dropped in ATExecMergePartitions.
+		 */
+		partOid = RangeVarGetRelidExtended(name, AccessExclusiveLock, 0,
+										   RangeVarCallbackOwnsRelation,
+										   NULL);
+		/* Is the current partition a DEFAULT partition? */
+		if (partOid == defaultPartOid)
+			isDefaultPart = true;
+
+		/*
+		 * Extended check because the same partition can have different names
+		 * (for example, "part_name" and "public.part_name").
+		 */
+		foreach(listptr2, partOids)
+		{
+			Oid			curOid = lfirst_oid(listptr2);
+
+			if (curOid == partOid)
+				ereport(ERROR,
+						errcode(ERRCODE_DUPLICATE_TABLE),
+						errmsg("partition with name \"%s\" is already used", name->relname),
+						parser_errposition(cxt->pstate, name->location));
+		}
+
+		checkPartition(parent, partOid, true);
+
+		partOids = lappend_oid(partOids, partOid);
+	}
+
+	/* Allocate the bound of the resulting partition. */
+	Assert(partcmd->bound == NULL);
+	partcmd->bound = makeNode(PartitionBoundSpec);
+
+	/* Fill the partition bound.
*/ + partcmd->bound->strategy = strategy; + partcmd->bound->location = -1; + partcmd->bound->is_default = isDefaultPart; + if (!isDefaultPart) + calculate_partition_bound_for_merge(parent, partcmd->partlist, + partOids, partcmd->bound, + cxt->pstate); +} + /* * transformAlterTableStmt - * parse analysis for ALTER TABLE @@ -3757,20 +4063,48 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, { PartitionCmd *partcmd = (PartitionCmd *) cmd->def; - transformPartitionCmd(&cxt, partcmd); - /* assign transformed value of the partition bound */ + transformPartitionCmd(&cxt, partcmd->bound); + /* assign the transformed value of the partition bound */ partcmd->bound = cxt.partbound; } newcmds = lappend(newcmds, cmd); break; + case AT_MergePartitions: + { + PartitionCmd *partcmd = (PartitionCmd *) cmd->def; + + if (list_length(partcmd->partlist) < 2) + ereport(ERROR, + errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("list of partitions to be merged should include at least two partitions")); + + transformPartitionCmdForMerge(&cxt, partcmd); + newcmds = lappend(newcmds, cmd); + break; + } + + case AT_SplitPartition: + { + PartitionCmd *partcmd = (PartitionCmd *) cmd->def; + + if (list_length(partcmd->partlist) < 2) + ereport(ERROR, + errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("list of new partitions should contain at least two partitions")); + + transformPartitionCmdForSplit(&cxt, partcmd); + newcmds = lappend(newcmds, cmd); + break; + } + default: /* - * Currently, we shouldn't actually get here for subcommand - * types that don't require transformation; but if we do, just - * emit them unchanged. + * Currently, we shouldn't actually get here for the + * subcommand types that don't require transformation; but if + * we do, just emit them unchanged. */ newcmds = lappend(newcmds, cmd); break; @@ -4195,13 +4529,13 @@ setSchemaName(const char *context_schema, char **stmt_schema_name) /* * transformPartitionCmd - * Analyze the ATTACH/DETACH PARTITION command + * Analyze the ATTACH/DETACH/SPLIT PARTITION command * - * In case of the ATTACH PARTITION command, cxt->partbound is set to the - * transformed value of cmd->bound. + * In case of the ATTACH/SPLIT PARTITION command, cxt->partbound is set to the + * transformed value of bound. */ static void -transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd) +transformPartitionCmd(CreateStmtContext *cxt, PartitionBoundSpec *bound) { Relation parentRel = cxt->rel; @@ -4210,9 +4544,9 @@ transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd) case RELKIND_PARTITIONED_TABLE: /* transform the partition bound, if any */ Assert(RelationGetPartitionKey(parentRel) != NULL); - if (cmd->bound != NULL) + if (bound != NULL) cxt->partbound = transformPartitionBound(cxt->pstate, parentRel, - cmd->bound); + bound); break; case RELKIND_PARTITIONED_INDEX: @@ -4220,7 +4554,7 @@ transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd) * A partitioned index cannot have a partition bound set. ALTER * INDEX prevents that with its grammar, but not ALTER TABLE. */ - if (cmd->bound != NULL) + if (bound != NULL) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("\"%s\" is not a partitioned table", diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c index 33a040506b47f..a3679f8e86ce5 100644 --- a/src/backend/parser/parser.c +++ b/src/backend/parser/parser.c @@ -339,7 +339,7 @@ hexval(unsigned char c) /* is Unicode code point acceptable? 
*/ static void -check_unicode_value(pg_wchar c) +check_unicode_value(char32_t c) { if (!is_valid_unicode_codepoint(c)) ereport(ERROR, @@ -376,7 +376,7 @@ str_udeescape(const char *str, char escape, char *new, *out; size_t new_len; - pg_wchar pair_first = 0; + char16_t pair_first = 0; ScannerCallbackState scbstate; /* @@ -420,7 +420,7 @@ str_udeescape(const char *str, char escape, isxdigit((unsigned char) in[3]) && isxdigit((unsigned char) in[4])) { - pg_wchar unicode; + char32_t unicode; unicode = (hexval(in[1]) << 12) + (hexval(in[2]) << 8) + @@ -457,7 +457,7 @@ str_udeescape(const char *str, char escape, isxdigit((unsigned char) in[6]) && isxdigit((unsigned char) in[7])) { - pg_wchar unicode; + char32_t unicode; unicode = (hexval(in[2]) << 20) + (hexval(in[3]) << 16) + diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index 08990831fe81a..a67815339b7ca 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -121,7 +121,7 @@ static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner); static char *litbufdup(core_yyscan_t yyscanner); static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner); static int process_integer_literal(const char *token, YYSTYPE *lval, int base); -static void addunicode(pg_wchar c, yyscan_t yyscanner); +static void addunicode(char32_t c, yyscan_t yyscanner); #define yyerror(msg) scanner_yyerror(msg, yyscanner) @@ -640,7 +640,7 @@ other . addlit(yytext, yyleng, yyscanner); } {xeunicode} { - pg_wchar c = strtoul(yytext + 2, NULL, 16); + char32_t c = strtoul(yytext + 2, NULL, 16); /* * For consistency with other productions, issue any @@ -668,7 +668,7 @@ other . POP_YYLLOC(); } {xeunicode} { - pg_wchar c = strtoul(yytext + 2, NULL, 16); + char32_t c = strtoul(yytext + 2, NULL, 16); /* Remember start of overall string token ... */ PUSH_YYLLOC(); @@ -1376,7 +1376,7 @@ process_integer_literal(const char *token, YYSTYPE *lval, int base) } static void -addunicode(pg_wchar c, core_yyscan_t yyscanner) +addunicode(char32_t c, core_yyscan_t yyscanner) { ScannerCallbackState scbstate; char buf[MAX_UNICODE_EQUIVALENT_STRING + 1]; diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c index 2feb2b6cf5a96..d63cb865260b6 100644 --- a/src/backend/parser/scansup.c +++ b/src/backend/parser/scansup.c @@ -18,6 +18,7 @@ #include "mb/pg_wchar.h" #include "parser/scansup.h" +#include "utils/pg_locale.h" /* @@ -46,35 +47,22 @@ char * downcase_identifier(const char *ident, int len, bool warn, bool truncate) { char *result; - int i; - bool enc_is_single_byte; - - result = palloc(len + 1); - enc_is_single_byte = pg_database_encoding_max_length() == 1; + size_t needed pg_attribute_unused(); /* - * SQL99 specifies Unicode-aware case normalization, which we don't yet - * have the infrastructure for. Instead we use tolower() to provide a - * locale-aware translation. However, there are some locales where this - * is not right either (eg, Turkish may do strange things with 'i' and - * 'I'). Our current compromise is to use tolower() for characters with - * the high bit set, as long as they aren't part of a multi-byte - * character, and use an ASCII-only downcasing for 7-bit characters. + * Preserves string length. + * + * NB: if we decide to support Unicode-aware identifier case folding, then + * we need to account for a change in string length. 
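+	 *
+	 * (For example, full Unicode case folding maps U+0130, LATIN CAPITAL
+	 * LETTER I WITH DOT ABOVE, to the two-codepoint sequence U+0069 U+0307,
+	 * so a folded identifier could get longer than its input.)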
*/ - for (i = 0; i < len; i++) - { - unsigned char ch = (unsigned char) ident[i]; + result = palloc(len + 1); - if (ch >= 'A' && ch <= 'Z') - ch += 'a' - 'A'; - else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch)) - ch = tolower(ch); - result[i] = (char) ch; - } - result[i] = '\0'; + needed = pg_downcase_ident(result, len + 1, ident, len); + Assert(needed == len); + Assert(result[len] == '\0'); - if (i >= NAMEDATALEN && truncate) - truncate_identifier(result, i, warn); + if (len >= NAMEDATALEN && truncate) + truncate_identifier(result, len, warn); return result; } diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 4bdc2941efb21..16b0adc172c5f 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -17,6 +17,7 @@ #include "access/relation.h" #include "access/table.h" #include "access/tableam.h" +#include "catalog/namespace.h" #include "catalog/partition.h" #include "catalog/pg_inherits.h" #include "catalog/pg_type.h" @@ -319,7 +320,7 @@ partition_bounds_create(PartitionBoundSpec **boundspecs, int nparts, * Initialize mapping array with invalid values, this is filled within * each sub-routine below depending on the bound type. */ - *mapping = (int *) palloc(sizeof(int) * nparts); + *mapping = palloc_array(int, nparts); for (i = 0; i < nparts; i++) (*mapping)[i] = -1; @@ -353,15 +354,13 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts, int greatest_modulus; Datum *boundDatums; - boundinfo = (PartitionBoundInfoData *) - palloc0(sizeof(PartitionBoundInfoData)); + boundinfo = palloc0_object(PartitionBoundInfoData); boundinfo->strategy = key->strategy; /* No special hash partitions. */ boundinfo->null_index = -1; boundinfo->default_index = -1; - hbounds = (PartitionHashBound *) - palloc(nparts * sizeof(PartitionHashBound)); + hbounds = palloc_array(PartitionHashBound, nparts); /* Convert from node to the internal representation */ for (i = 0; i < nparts; i++) @@ -384,7 +383,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts, greatest_modulus = hbounds[nparts - 1].modulus; boundinfo->ndatums = nparts; - boundinfo->datums = (Datum **) palloc0(nparts * sizeof(Datum *)); + boundinfo->datums = palloc0_array(Datum *, nparts); boundinfo->kind = NULL; boundinfo->interleaved_parts = NULL; boundinfo->nindexes = greatest_modulus; @@ -472,8 +471,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts, int null_index = -1; Datum *boundDatums; - boundinfo = (PartitionBoundInfoData *) - palloc0(sizeof(PartitionBoundInfoData)); + boundinfo = palloc0_object(PartitionBoundInfoData); boundinfo->strategy = key->strategy; /* Will be set correctly below. */ boundinfo->null_index = -1; @@ -533,7 +531,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts, qsort_partition_list_value_cmp, key); boundinfo->ndatums = ndatums; - boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); + boundinfo->datums = palloc0_array(Datum *, ndatums); boundinfo->kind = NULL; boundinfo->interleaved_parts = NULL; boundinfo->nindexes = ndatums; @@ -690,16 +688,14 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts, Datum *boundDatums; PartitionRangeDatumKind *boundKinds; - boundinfo = (PartitionBoundInfoData *) - palloc0(sizeof(PartitionBoundInfoData)); + boundinfo = palloc0_object(PartitionBoundInfoData); boundinfo->strategy = key->strategy; /* There is no special null-accepting range partition. */ boundinfo->null_index = -1; /* Will be set correctly below. 
*/ boundinfo->default_index = -1; - all_bounds = (PartitionRangeBound **) - palloc0(2 * nparts * sizeof(PartitionRangeBound *)); + all_bounds = palloc0_array(PartitionRangeBound *, 2 * nparts); /* Create a unified list of range bounds across all the partitions. */ ndatums = 0; @@ -803,10 +799,8 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts, * bound. */ boundinfo->ndatums = ndatums; - boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); - boundinfo->kind = (PartitionRangeDatumKind **) - palloc(ndatums * - sizeof(PartitionRangeDatumKind *)); + boundinfo->datums = palloc0_array(Datum *, ndatums); + boundinfo->kind = palloc0_array(PartitionRangeDatumKind *, ndatums); boundinfo->interleaved_parts = NULL; /* @@ -814,7 +808,7 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts, * element of the indexes[] array. */ boundinfo->nindexes = ndatums + 1; - boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int)); + boundinfo->indexes = palloc_array(int, (ndatums + 1)); /* * In the loop below, to save from allocating a series of small arrays, @@ -824,8 +818,7 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts, */ partnatts = key->partnatts; boundDatums = (Datum *) palloc(ndatums * partnatts * sizeof(Datum)); - boundKinds = (PartitionRangeDatumKind *) palloc(ndatums * partnatts * - sizeof(PartitionRangeDatumKind)); + boundKinds = palloc_array(PartitionRangeDatumKind, ndatums * partnatts); for (i = 0; i < ndatums; i++) { @@ -1007,11 +1000,8 @@ partition_bounds_copy(PartitionBoundInfo src, int ndatums; int nindexes; int partnatts; - bool hash_part; - int natts; - Datum *boundDatums; - dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData)); + dest = (PartitionBoundInfo) palloc_object(PartitionBoundInfoData); dest->strategy = src->strategy; ndatums = dest->ndatums = src->ndatums; @@ -1021,9 +1011,9 @@ partition_bounds_copy(PartitionBoundInfo src, /* List partitioned tables have only a single partition key. */ Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1); - dest->datums = (Datum **) palloc(sizeof(Datum *) * ndatums); + dest->datums = palloc_array(Datum *, ndatums); - if (src->kind != NULL) + if (src->kind != NULL && ndatums > 0) { PartitionRangeDatumKind *boundKinds; @@ -1058,40 +1048,44 @@ partition_bounds_copy(PartitionBoundInfo src, * For hash partitioning, datums array will have two elements - modulus * and remainder. */ - hash_part = (key->strategy == PARTITION_STRATEGY_HASH); - natts = hash_part ? 2 : partnatts; - boundDatums = palloc(ndatums * natts * sizeof(Datum)); - - for (i = 0; i < ndatums; i++) + if (ndatums > 0) { - int j; - - dest->datums[i] = &boundDatums[i * natts]; + bool hash_part = (key->strategy == PARTITION_STRATEGY_HASH); + int natts = hash_part ? 
2 : partnatts; + Datum *boundDatums = palloc(ndatums * natts * sizeof(Datum)); - for (j = 0; j < natts; j++) + for (i = 0; i < ndatums; i++) { - bool byval; - int typlen; + int j; - if (hash_part) - { - typlen = sizeof(int32); /* Always int4 */ - byval = true; /* int4 is pass-by-value */ - } - else + dest->datums[i] = &boundDatums[i * natts]; + + for (j = 0; j < natts; j++) { - byval = key->parttypbyval[j]; - typlen = key->parttyplen[j]; - } + if (dest->kind == NULL || + dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) + { + bool byval; + int typlen; - if (dest->kind == NULL || - dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) - dest->datums[i][j] = datumCopy(src->datums[i][j], - byval, typlen); + if (hash_part) + { + typlen = sizeof(int32); /* Always int4 */ + byval = true; /* int4 is pass-by-value */ + } + else + { + byval = key->parttypbyval[j]; + typlen = key->parttyplen[j]; + } + dest->datums[i][j] = datumCopy(src->datums[i][j], + byval, typlen); + } + } } } - dest->indexes = (int *) palloc(sizeof(int) * nindexes); + dest->indexes = palloc_array(int, nindexes); memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes); dest->null_index = src->null_index; @@ -1814,10 +1808,10 @@ init_partition_map(RelOptInfo *rel, PartitionMap *map) int i; map->nparts = nparts; - map->merged_indexes = (int *) palloc(sizeof(int) * nparts); - map->merged = (bool *) palloc(sizeof(bool) * nparts); + map->merged_indexes = palloc_array(int, nparts); + map->merged = palloc_array(bool, nparts); map->did_remapping = false; - map->old_indexes = (int *) palloc(sizeof(int) * nparts); + map->old_indexes = palloc_array(int, nparts); for (i = 0; i < nparts; i++) { map->merged_indexes[i] = map->old_indexes[i] = -1; @@ -2392,7 +2386,7 @@ fix_merged_indexes(PartitionMap *outer_map, PartitionMap *inner_map, Assert(nmerged > 0); - new_indexes = (int *) palloc(sizeof(int) * nmerged); + new_indexes = palloc_array(int, nmerged); for (i = 0; i < nmerged; i++) new_indexes[i] = -1; @@ -2452,8 +2446,8 @@ generate_matching_part_pairs(RelOptInfo *outer_rel, RelOptInfo *inner_rel, Assert(*outer_parts == NIL); Assert(*inner_parts == NIL); - outer_indexes = (int *) palloc(sizeof(int) * nmerged); - inner_indexes = (int *) palloc(sizeof(int) * nmerged); + outer_indexes = palloc_array(int, nmerged); + inner_indexes = palloc_array(int, nmerged); for (i = 0; i < nmerged; i++) outer_indexes[i] = inner_indexes[i] = -1; @@ -2524,11 +2518,11 @@ build_merged_partition_bounds(char strategy, List *merged_datums, int pos; ListCell *lc; - merged_bounds = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData)); + merged_bounds = palloc_object(PartitionBoundInfoData); merged_bounds->strategy = strategy; merged_bounds->ndatums = ndatums; - merged_bounds->datums = (Datum **) palloc(sizeof(Datum *) * ndatums); + merged_bounds->datums = palloc_array(Datum *, ndatums); pos = 0; foreach(lc, merged_datums) merged_bounds->datums[pos++] = (Datum *) lfirst(lc); @@ -2536,8 +2530,7 @@ build_merged_partition_bounds(char strategy, List *merged_datums, if (strategy == PARTITION_STRATEGY_RANGE) { Assert(list_length(merged_kinds) == ndatums); - merged_bounds->kind = (PartitionRangeDatumKind **) - palloc(sizeof(PartitionRangeDatumKind *) * ndatums); + merged_bounds->kind = palloc_array(PartitionRangeDatumKind *, ndatums); pos = 0; foreach(lc, merged_kinds) merged_bounds->kind[pos++] = (PartitionRangeDatumKind *) lfirst(lc); @@ -2558,7 +2551,7 @@ build_merged_partition_bounds(char strategy, List *merged_datums, Assert(list_length(merged_indexes) == ndatums); 
 	merged_bounds->nindexes = ndatums;
-	merged_bounds->indexes = (int *) palloc(sizeof(int) * ndatums);
+	merged_bounds->indexes = palloc_array(int, ndatums);
 	pos = 0;
 	foreach(lc, merged_indexes)
 		merged_bounds->indexes[pos++] = lfirst_int(lc);
@@ -3433,11 +3426,10 @@ make_one_partition_rbound(PartitionKey key, int index, List *datums, bool lower)
 
 	Assert(datums != NIL);
 
-	bound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound));
+	bound = palloc0_object(PartitionRangeBound);
 	bound->index = index;
-	bound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum));
-	bound->kind = (PartitionRangeDatumKind *) palloc0(key->partnatts *
-													  sizeof(PartitionRangeDatumKind));
+	bound->datums = palloc0_array(Datum, key->partnatts);
+	bound->kind = palloc0_array(PartitionRangeDatumKind, key->partnatts);
 	bound->lower = lower;
 
 	i = 0;
@@ -3554,8 +3546,8 @@ partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc,
  */
 int32
 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation,
-						   Datum *rb_datums, PartitionRangeDatumKind *rb_kind,
-						   Datum *tuple_datums, int n_tuple_datums)
+						   const Datum *rb_datums, PartitionRangeDatumKind *rb_kind,
+						   const Datum *tuple_datums, int n_tuple_datums)
 {
 	int			i;
 	int32		cmpval = -1;
@@ -3694,7 +3686,7 @@ partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
 int
 partition_range_datum_bsearch(FmgrInfo *partsupfunc, Oid *partcollation,
 							  PartitionBoundInfo boundinfo,
-							  int nvalues, Datum *values, bool *is_equal)
+							  int nvalues, const Datum *values, bool *is_equal)
 {
 	int			lo,
 				hi,
@@ -4977,3 +4969,907 @@ satisfies_hash_partition(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(rowHash % modulus == remainder);
 }
+
+/*
+ * check_two_partitions_bounds_range
+ *
+ * (function for BY RANGE partitioning)
+ *
+ * This is a helper function for check_partitions_for_split() and
+ * calculate_partition_bound_for_merge().  This function compares the upper
+ * bound of first_bound and the lower bound of second_bound.  These bounds
+ * should be equal except when "defaultPart == true" (this means that one of
+ * the split partitions is DEFAULT).  In this case, the upper bound of
+ * first_bound can be less than the lower bound of second_bound because the
+ * space between these bounds will be included in the DEFAULT partition.
+ *
+ * parent: partitioned table
+ * first_name: name of the first partition
+ * first_bound: bound of the first partition
+ * second_name: name of the second partition
+ * second_bound: bound of the second partition
+ * defaultPart: true if one of the new partitions is DEFAULT
+ * is_merge: true indicates the operation is MERGE PARTITIONS;
+ * false indicates the operation is SPLIT PARTITION.
+ * pstate: pointer to ParseState struct for determining error position
+ */
+static void
+check_two_partitions_bounds_range(Relation parent,
+								  RangeVar *first_name,
+								  PartitionBoundSpec *first_bound,
+								  RangeVar *second_name,
+								  PartitionBoundSpec *second_bound,
+								  bool defaultPart,
+								  bool is_merge,
+								  ParseState *pstate)
+{
+	PartitionKey key = RelationGetPartitionKey(parent);
+	PartitionRangeBound *first_upper;
+	PartitionRangeBound *second_lower;
+	int			cmpval;
+
+	Assert(key->strategy == PARTITION_STRATEGY_RANGE);
+
+	first_upper = make_one_partition_rbound(key, -1, first_bound->upperdatums, false);
+	second_lower = make_one_partition_rbound(key, -1, second_bound->lowerdatums, true);
+
+	/*
+	 * The lower1 argument of partition_rbound_cmp() is set to false so that
+	 * a lower bound is compared correctly against an upper bound.
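+	 *
+	 * (Hypothetical illustration: FOR VALUES FROM (0) TO (10) followed by
+	 * FOR VALUES FROM (10) TO (20) passes this check; with defaultPart ==
+	 * true, FROM (0) TO (10) followed by FROM (15) TO (20) is also accepted,
+	 * the gap between 10 and 15 being covered by the DEFAULT partition.)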
+	 */
+	cmpval = partition_rbound_cmp(key->partnatts,
+								  key->partsupfunc,
+								  key->partcollation,
+								  second_lower->datums, second_lower->kind,
+								  false, first_upper);
+	if ((!defaultPart && cmpval) || (defaultPart && cmpval < 0))
+	{
+		PartitionRangeDatum *datum = linitial(second_bound->lowerdatums);
+
+		if (is_merge)
+			ereport(ERROR,
+					errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					errmsg("cannot merge partition \"%s\" together with partition \"%s\"",
+						   second_name->relname, first_name->relname),
+					errdetail("The lower bound of partition \"%s\" is not equal to the upper bound of partition \"%s\".",
+							  second_name->relname, first_name->relname),
+					errhint("ALTER TABLE ... MERGE PARTITIONS requires the partition bounds to be adjacent."),
+					parser_errposition(pstate, datum->location));
+		else
+			ereport(ERROR,
+					errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					errmsg("cannot split into partition \"%s\" together with partition \"%s\"",
+						   second_name->relname, first_name->relname),
+					errdetail("The lower bound of partition \"%s\" is not equal to the upper bound of partition \"%s\".",
+							  second_name->relname, first_name->relname),
+					errhint("ALTER TABLE ... SPLIT PARTITION requires the partition bounds to be adjacent."),
+					parser_errposition(pstate, datum->location));
+	}
+}
+
+/*
+ * get_partition_bound_spec
+ *
+ * Returns the PartitionBoundSpec for the partition with the given OID partOid.
+ */
+static PartitionBoundSpec *
+get_partition_bound_spec(Oid partOid)
+{
+	HeapTuple	tuple;
+	Datum		datum;
+	bool		isnull;
+	PartitionBoundSpec *boundspec = NULL;
+
+	/* Try fetching the tuple from the catcache, for speed. */
+	tuple = SearchSysCache1(RELOID, partOid);
+	if (!HeapTupleIsValid(tuple))
+		elog(ERROR, "cache lookup failed for relation %u", partOid);
+
+	datum = SysCacheGetAttr(RELOID, tuple,
+							Anum_pg_class_relpartbound,
+							&isnull);
+	if (isnull)
+		elog(ERROR, "partition bound for relation %u is null",
+			 partOid);
+
+	boundspec = stringToNode(TextDatumGetCString(datum));
+
+	if (!IsA(boundspec, PartitionBoundSpec))
+		elog(ERROR, "expected PartitionBoundSpec for relation %u",
+			 partOid);
+
+	ReleaseSysCache(tuple);
+	return boundspec;
+}
+
+/*
+ * calculate_partition_bound_for_merge
+ *
+ * Calculates the bound of the merged partition "spec" by using the bounds of
+ * the partitions to be merged.
+ *
+ * parent: partitioned table
+ * partNames: names of partitions to be merged
+ * partOids: Oids of partitions to be merged
+ * spec (out): bounds specification of the merged partition
+ * pstate: pointer to ParseState struct for determining error position
+ */
+void
+calculate_partition_bound_for_merge(Relation parent,
+									List *partNames,
+									List *partOids,
+									PartitionBoundSpec *spec,
+									ParseState *pstate)
+{
+	PartitionKey key = RelationGetPartitionKey(parent);
+	PartitionBoundSpec *bound;
+
+	Assert(!spec->is_default);
+
+	switch (key->strategy)
+	{
+		case PARTITION_STRATEGY_RANGE:
+			{
+				int			i;
+				PartitionRangeBound **lower_bounds;
+				int			nparts = list_length(partOids);
+				List	   *bounds = NIL;
+
+				lower_bounds = palloc0_array(PartitionRangeBound *, nparts);
+
+				/*
+				 * Create an array of lower bounds and a list of
+				 * PartitionBoundSpec.
+				 */
+				foreach_oid(partoid, partOids)
+				{
+					bound = get_partition_bound_spec(partoid);
+					i = foreach_current_index(partoid);
+
+					lower_bounds[i] = make_one_partition_rbound(key, i, bound->lowerdatums, true);
+					bounds = lappend(bounds, bound);
+				}
+
+				/* Sort the array of lower bounds.
*/ + qsort_arg(lower_bounds, nparts, sizeof(PartitionRangeBound *), + qsort_partition_rbound_cmp, key); + + /* Ranges of partitions should be adjacent. */ + for (i = 1; i < nparts; i++) + { + int index = lower_bounds[i]->index; + int prev_index = lower_bounds[i - 1]->index; + + check_two_partitions_bounds_range(parent, + (RangeVar *) list_nth(partNames, prev_index), + (PartitionBoundSpec *) list_nth(bounds, prev_index), + (RangeVar *) list_nth(partNames, index), + (PartitionBoundSpec *) list_nth(bounds, index), + false, + true, + pstate); + } + + /* + * The lower bound of the first partition is the lower bound + * of the merged partition. + */ + spec->lowerdatums = + ((PartitionBoundSpec *) list_nth(bounds, lower_bounds[0]->index))->lowerdatums; + + /* + * The upper bound of the last partition is the upper bound of + * the merged partition. + */ + spec->upperdatums = + ((PartitionBoundSpec *) list_nth(bounds, lower_bounds[nparts - 1]->index))->upperdatums; + + pfree(lower_bounds); + list_free(bounds); + break; + } + + case PARTITION_STRATEGY_LIST: + { + /* Consolidate bounds for all partitions in the list. */ + foreach_oid(partoid, partOids) + { + bound = get_partition_bound_spec(partoid); + spec->listdatums = list_concat(spec->listdatums, bound->listdatums); + } + break; + } + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } +} + +/* + * partitions_listdatum_intersection + * + * (function for BY LIST partitioning) + * + * Function compares lists of values for different partitions. + * Return a list that contains *one* cell that is present in both list1 and + * list2. The returned list is freshly allocated via palloc(), but the + * cells themselves point to the same objects as the cells of the + * input lists. + * + * Currently, there is no need to collect all common partition datums from the + * two lists. + */ +static List * +partitions_listdatum_intersection(FmgrInfo *partsupfunc, Oid *partcollation, + const List *list1, const List *list2) +{ + List *result = NIL; + + if (list1 == NIL || list2 == NIL) + return result; + + foreach_node(Const, val1, list1) + { + bool isnull1 = val1->constisnull; + + foreach_node(Const, val2, list2) + { + if (val2->constisnull) + { + if (isnull1) + { + result = lappend(result, val1); + return result; + } + continue; + } + else if (isnull1) + continue; + + /* Compare two datum values. */ + if (DatumGetInt32(FunctionCall2Coll(&partsupfunc[0], + partcollation[0], + val1->constvalue, + val2->constvalue)) == 0) + { + result = lappend(result, val1); + return result; + } + } + } + + return result; +} + +/* + * check_partitions_not_overlap_list + * + * (function for BY LIST partitioning) + * + * This is a helper function for check_partitions_for_split(). + * Checks that the values of the new partitions do not overlap. 
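+ *
+ * (Hypothetical example: splitting into p1 FOR VALUES IN (1, 2) and
+ * p2 FOR VALUES IN (2, 3) is rejected here because the value 2 would
+ * belong to both new partitions.)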
+ * + * parent: partitioned table + * parts: array of SinglePartitionSpec structs with info about split partitions + * nparts: size of array "parts" + */ +static void +check_partitions_not_overlap_list(Relation parent, + SinglePartitionSpec **parts, + int nparts, + ParseState *pstate) +{ + PartitionKey key PG_USED_FOR_ASSERTS_ONLY = RelationGetPartitionKey(parent); + int i, + j; + SinglePartitionSpec *sps1, + *sps2; + List *overlap; + + Assert(key->strategy == PARTITION_STRATEGY_LIST); + + for (i = 0; i < nparts; i++) + { + sps1 = parts[i]; + + for (j = i + 1; j < nparts; j++) + { + sps2 = parts[j]; + + overlap = partitions_listdatum_intersection(&key->partsupfunc[0], + key->partcollation, + sps1->bound->listdatums, + sps2->bound->listdatums); + if (list_length(overlap) > 0) + { + Const *val = (Const *) linitial_node(Const, overlap); + + ereport(ERROR, + errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("new partition \"%s\" would overlap with another new partition \"%s\"", + sps1->name->relname, sps2->name->relname), + parser_errposition(pstate, exprLocation((Node *) val))); + } + } + } +} + +/* + * check_partition_bounds_for_split_range + * + * (function for BY RANGE partitioning) + * + * Checks that bounds of new partition "spec" are inside bounds of split + * partition (with Oid splitPartOid). If first=true (this means that "spec" is + * the first of the new partitions), then the lower bound of "spec" should be + * equal (or greater than or equal in case defaultPart=true) to the lower + * bound of the split partition. If last=true (this means that "spec" is the + * last of the new partitions), then the upper bound of "spec" should be + * equal (or less than or equal in case defaultPart=true) to the upper bound + * of the split partition. + * + * parent: partitioned table + * relname: name of the new partition + * spec: bounds specification of the new partition + * splitPartOid: split partition Oid + * first: true iff the new partition "spec" is the first of the + * new partitions + * last: true iff the new partition "spec" is the last of the + * new partitions + * defaultPart: true iff new partitions contain the DEFAULT partition + * pstate: pointer to ParseState struct to determine error position + */ +static void +check_partition_bounds_for_split_range(Relation parent, + char *relname, + PartitionBoundSpec *spec, + Oid splitPartOid, + bool first, + bool last, + bool defaultPart, + ParseState *pstate) +{ + PartitionKey key = RelationGetPartitionKey(parent); + PartitionRangeBound *lower, + *upper; + int cmpval; + + Assert(key->strategy == PARTITION_STRATEGY_RANGE); + Assert(spec->strategy == PARTITION_STRATEGY_RANGE); + + lower = make_one_partition_rbound(key, -1, spec->lowerdatums, true); + upper = make_one_partition_rbound(key, -1, spec->upperdatums, false); + + /* + * First, check if the resulting range would be empty with the specified + * lower and upper bounds. partition_rbound_cmp cannot return zero here, + * since the lower-bound flags are different. + */ + cmpval = partition_rbound_cmp(key->partnatts, + key->partsupfunc, + key->partcollation, + lower->datums, lower->kind, + true, upper); + Assert(cmpval != 0); + if (cmpval > 0) + { + /* Point to the problematic key in the lower datums list. 
		 */
+		PartitionRangeDatum *datum = list_nth(spec->lowerdatums, cmpval - 1);
+
+		ereport(ERROR,
+				errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				errmsg("empty range bound specified for partition \"%s\"",
+					   relname),
+				errdetail("Specified lower bound %s is greater than or equal to upper bound %s.",
+						  get_range_partbound_string(spec->lowerdatums),
+						  get_range_partbound_string(spec->upperdatums)),
+				parser_errposition(pstate, exprLocation((Node *) datum)));
+	}
+
+	/*
+	 * Need to check the first and last partitions (from the set of new
+	 * partitions).
+	 */
+	if (first || last)
+	{
+		PartitionBoundSpec *split_spec = get_partition_bound_spec(splitPartOid);
+		PartitionRangeDatum *datum;
+
+		if (first)
+		{
+			PartitionRangeBound *split_lower;
+
+			split_lower = make_one_partition_rbound(key, -1, split_spec->lowerdatums, true);
+
+			cmpval = partition_rbound_cmp(key->partnatts,
+										  key->partsupfunc,
+										  key->partcollation,
+										  lower->datums, lower->kind,
+										  true, split_lower);
+			if (cmpval != 0)
+				datum = list_nth(spec->lowerdatums, abs(cmpval) - 1);
+
+			/*
+			 * The lower bound of "spec" must equal the lower bound of the
+			 * split partition.  However, if one of the new partitions is
+			 * DEFAULT, then it is OK for the new partition's lower bound to
+			 * be greater than that of the split partition.
+			 */
+			if (!defaultPart)
+			{
+				if (cmpval != 0)
+					ereport(ERROR,
+							errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+							errmsg("lower bound of partition \"%s\" is not equal to lower bound of split partition \"%s\"",
+								   relname,
+								   get_rel_name(splitPartOid)),
+							errhint("%s requires that the combined bounds of the new partitions exactly match the bounds of the split partition.",
+									"ALTER TABLE ... SPLIT PARTITION"),
+							parser_errposition(pstate, exprLocation((Node *) datum)));
+			}
+			else if (cmpval < 0)
+				ereport(ERROR,
+						errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+						errmsg("lower bound of partition \"%s\" is less than lower bound of split partition \"%s\"",
+							   relname,
+							   get_rel_name(splitPartOid)),
+						errhint("%s requires that the combined bounds of the new partitions exactly match the bounds of the split partition.",
+								"ALTER TABLE ...
SPLIT PARTITION"),
+						parser_errposition(pstate, exprLocation((Node *) datum)));
+		}
+		else
+		{
+			PartitionRangeBound *split_upper;
+
+			split_upper = make_one_partition_rbound(key, -1, split_spec->upperdatums, false);
+
+			cmpval = partition_rbound_cmp(key->partnatts,
+										  key->partsupfunc,
+										  key->partcollation,
+										  upper->datums, upper->kind,
+										  false, split_upper);
+			if (cmpval != 0)
+				datum = list_nth(spec->upperdatums, abs(cmpval) - 1);
+
+			/*
+			 * The upper bound of "spec" must equal the upper bound of the
+			 * split partition.  However, if one of the new partitions is
+			 * DEFAULT, then it is OK for the new partition's upper bound to
+			 * be less than that of the split partition.
+			 */
+			if (!defaultPart)
+			{
+				if (cmpval != 0)
+					ereport(ERROR,
+							errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+							errmsg("upper bound of partition \"%s\" is not equal to upper bound of split partition \"%s\"",
+								   relname,
+								   get_rel_name(splitPartOid)),
+							errhint("%s requires that the combined bounds of the new partitions exactly match the bounds of the split partition.",
+									"ALTER TABLE ... SPLIT PARTITION"),
+							parser_errposition(pstate, exprLocation((Node *) datum)));
+			}
+			else if (cmpval > 0)
+				ereport(ERROR,
+						errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+						errmsg("upper bound of partition \"%s\" is greater than upper bound of split partition \"%s\"",
+							   relname,
+							   get_rel_name(splitPartOid)),
+						errhint("%s requires that the combined bounds of the new partitions exactly match the bounds of the split partition.",
+								"ALTER TABLE ... SPLIT PARTITION"),
+						parser_errposition(pstate, exprLocation((Node *) datum)));
+		}
+	}
+}
+
+/*
+ * check_partition_bounds_for_split_list
+ *
+ * (function for BY LIST partitioning)
+ *
+ * Checks that the bounds of the new partition are inside the bounds of the
+ * split partition (with Oid splitPartOid).
+ *
+ * parent: partitioned table
+ * relname: name of the new partition
+ * spec: bounds specification of the new partition
+ * splitPartOid: split partition Oid
+ * pstate: pointer to ParseState struct for determining error position
+ */
+static void
+check_partition_bounds_for_split_list(Relation parent, char *relname,
+									  PartitionBoundSpec *spec,
+									  Oid splitPartOid,
+									  ParseState *pstate)
+{
+	PartitionKey key = RelationGetPartitionKey(parent);
+	PartitionDesc partdesc = RelationGetPartitionDesc(parent, false);
+	PartitionBoundInfo boundinfo = partdesc->boundinfo;
+	int			with = -1;
+	bool		overlap = false;
+	int			overlap_location = -1;
+
+	Assert(key->strategy == PARTITION_STRATEGY_LIST);
+	Assert(spec->strategy == PARTITION_STRATEGY_LIST);
+	Assert(boundinfo && boundinfo->strategy == PARTITION_STRATEGY_LIST);
+
+	/*
+	 * Search for each value of the new partition "spec" in the existing
+	 * partitions.  All of them should be in the split partition (with Oid
+	 * splitPartOid).
+	 */
+	foreach_node(Const, val, spec->listdatums)
+	{
+		overlap_location = exprLocation((Node *) val);
+		if (!val->constisnull)
+		{
+			int			offset;
+			bool		equal;
+
+			offset = partition_list_bsearch(&key->partsupfunc[0],
+											key->partcollation,
+											boundinfo,
+											val->constvalue,
+											&equal);
+			if (offset >= 0 && equal)
+			{
+				with = boundinfo->indexes[offset];
+				if (partdesc->oids[with] != splitPartOid)
+				{
+					overlap = true;
+					break;
+				}
+			}
+			else
+				ereport(ERROR,
+						errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+						errmsg("new partition \"%s\" cannot have this value because split partition \"%s\" does not have it",
+							   relname,
+							   get_rel_name(splitPartOid)),
+						parser_errposition(pstate, overlap_location));
+		}
+		else if (partition_bound_accepts_nulls(boundinfo))
+		{
+			with = boundinfo->null_index;
+			if (partdesc->oids[with] != splitPartOid)
+			{
+				overlap = true;
+				break;
+			}
+		}
+		else
+			ereport(ERROR,
+					errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					errmsg("new partition \"%s\" cannot have NULL value because split partition \"%s\" does not have it",
+						   relname,
+						   get_rel_name(splitPartOid)),
+					parser_errposition(pstate, overlap_location));
+	}
+
+	if (overlap)
+	{
+		Assert(with >= 0);
+		ereport(ERROR,
+				errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				errmsg("new partition \"%s\" would overlap with another (not split) partition \"%s\"",
+					   relname, get_rel_name(partdesc->oids[with])),
+				parser_errposition(pstate, overlap_location));
+	}
+}
+
+/*
+ * find_value_in_new_partitions_list
+ *
+ * (function for BY LIST partitioning)
+ *
+ * Returns true iff any of the new partitions contains the value
+ * "value".
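+ *
+ * (Hypothetical example: when splitting a partition FOR VALUES IN (1, 2, 3)
+ * into p1 FOR VALUES IN (1) and p2 FOR VALUES IN (2, 3), this returns true
+ * for each of the values 1, 2 and 3.)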
+ *
+ * partsupfunc: information about the comparison function associated with
+ * the partition key
+ * partcollation: partitioning collation
+ * parts: pointer to an array with new partition descriptions
+ * nparts: number of new partitions
+ * value: the value that we are looking for
+ * isnull: true if the value that we are looking for is NULL
+ */
+static bool
+find_value_in_new_partitions_list(FmgrInfo *partsupfunc,
+								  Oid *partcollation,
+								  SinglePartitionSpec **parts,
+								  int nparts,
+								  Datum value,
+								  bool isnull)
+{
+	for (int i = 0; i < nparts; i++)
+	{
+		SinglePartitionSpec *sps = parts[i];
+
+		foreach_node(Const, val, sps->bound->listdatums)
+		{
+			if (isnull && val->constisnull)
+				return true;
+
+			if (!isnull && !val->constisnull)
+			{
+				if (DatumGetInt32(FunctionCall2Coll(&partsupfunc[0],
+													partcollation[0],
+													val->constvalue,
+													value)) == 0)
+					return true;
+			}
+		}
+	}
+	return false;
+}
+
+/*
+ * check_parent_values_in_new_partitions
+ *
+ * (function for BY LIST partitioning)
+ *
+ * Checks that all values of the split partition (with Oid partOid) are
+ * contained in the new partitions.
+ *
+ * parent: partitioned table
+ * partOid: split partition Oid
+ * parts: pointer to an array with new partition descriptions
+ * nparts: number of new partitions
+ * pstate: pointer to ParseState struct for determining error position
+ */
+static void
+check_parent_values_in_new_partitions(Relation parent,
+									  Oid partOid,
+									  SinglePartitionSpec **parts,
+									  int nparts,
+									  ParseState *pstate)
+{
+	PartitionKey key = RelationGetPartitionKey(parent);
+	PartitionDesc partdesc = RelationGetPartitionDesc(parent, false);
+	PartitionBoundInfo boundinfo = partdesc->boundinfo;
+	int			i;
+	bool		found = true;
+	Datum		datum = PointerGetDatum(NULL);
+
+	Assert(key->strategy == PARTITION_STRATEGY_LIST);
+
+	/*
+	 * Special processing for the NULL value.  Search for a NULL value if the
+	 * split partition (partOid) contains it.
+	 */
+	if (partition_bound_accepts_nulls(boundinfo) &&
+		partdesc->oids[boundinfo->null_index] == partOid)
+	{
+		if (!find_value_in_new_partitions_list(&key->partsupfunc[0],
+											   key->partcollation, parts, nparts, datum, true))
+			found = false;
+	}
+
+	if (!found)
+		ereport(ERROR,
+				errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				errmsg("combined bounds of the new partitions do not contain the value (%s), but split partition \"%s\" does",
+					   "NULL",
+					   get_rel_name(partOid)),
+				errhint("%s requires that the combined bounds of the new partitions exactly match the bounds of the split partition.",
+						"ALTER TABLE ... SPLIT PARTITION"));
+
+	/*
+	 * Search for all values of the split partition with partOid in the
+	 * PartitionDesc of the partitioned table.
+	 */
+	for (i = 0; i < boundinfo->ndatums; i++)
+	{
+		if (partdesc->oids[boundinfo->indexes[i]] == partOid)
+		{
+			/* We found a value that the split partition contains. */
+			datum = boundinfo->datums[i][0];
+			if (!find_value_in_new_partitions_list(&key->partsupfunc[0],
+												   key->partcollation, parts, nparts, datum, false))
+			{
+				found = false;
+				break;
+			}
+		}
+	}
+
+	if (!found)
+	{
+		Const	   *notFoundVal;
+
+		/*
+		 * Make a Const for getting the string representation of the missing
+		 * value.
+		 */
+		notFoundVal = makeConst(key->parttypid[0],
+								key->parttypmod[0],
+								key->parttypcoll[0],
+								key->parttyplen[0],
+								datum,
+								false,	/* isnull */
+								key->parttypbyval[0]);
+
+		ereport(ERROR,
+				errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				errmsg("combined bounds of the new partitions do not contain value (%s), but split partition \"%s\" does",
+					   deparse_expression((Node *) notFoundVal, NIL, false, false),
+					   get_rel_name(partOid)),
+				errhint("%s requires that the combined bounds of the new partitions exactly match the bound of the split partition",
+						"ALTER TABLE ... SPLIT PARTITION"));
+	}
+}
+
+/*
+ * check_partitions_for_split
+ *
+ * Checks new partitions for the SPLIT PARTITION command:
+ * 1. The bounds of the new partitions must not overlap with each other or
+ * with existing partitions.
+ * 2. If the new or existing partitions include the DEFAULT partition, the
+ * new partitions can have any bounds inside the split partition bound
+ * (there can be gaps between partition bounds).
+ * 3. If neither the new partitions nor the partitioned table include the
+ * DEFAULT partition, the following should be true: the combined bounds of
+ * the new partitions should be equal to the bound of the split partition.
+ *
+ * parent: partitioned table
+ * splitPartOid: split partition Oid
+ * partlist: list of new partitions after partition split
+ * pstate: pointer to ParseState struct to determine error position
+ */
+void
+check_partitions_for_split(Relation parent,
+						   Oid splitPartOid,
+						   List *partlist,
+						   ParseState *pstate)
+{
+	PartitionKey key;
+	char		strategy;
+	Oid			defaultPartOid;
+	bool		isSplitPartDefault;
+	bool		createDefaultPart = false;
+	int			default_index = -1;
+	int			i;
+	SinglePartitionSpec **new_parts;
+	SinglePartitionSpec *spsPrev = NULL;
+
+	/*
+	 * nparts counts the number of new partitions, excluding the default
+	 * partition.
+	 */
+	int			nparts = 0;
+
+	key = RelationGetPartitionKey(parent);
+	strategy = get_partition_strategy(key);
+
+	defaultPartOid =
+		get_default_oid_from_partdesc(RelationGetPartitionDesc(parent, true));
+
+	Assert(strategy == PARTITION_STRATEGY_RANGE ||
+		   strategy == PARTITION_STRATEGY_LIST);
+
+	/*
+	 * Make an array new_parts with the new partitions, excluding the DEFAULT
+	 * partition.
+	 */
+	new_parts = palloc0_array(SinglePartitionSpec *, list_length(partlist));
+
+	/* isSplitPartDefault flag: is the split partition a DEFAULT partition? */
+	isSplitPartDefault = (defaultPartOid == splitPartOid);
+
+	foreach_node(SinglePartitionSpec, sps, partlist)
+	{
+		if (sps->bound->is_default)
+			default_index = foreach_current_index(sps);
+		else
+			new_parts[nparts++] = sps;
+	}
+
+	/* An indicator that the DEFAULT partition will be created. */
+	if (default_index != -1)
+	{
+		createDefaultPart = true;
+		Assert(nparts == list_length(partlist) - 1);
+	}
+
+	if (strategy == PARTITION_STRATEGY_RANGE)
+	{
+		PartitionRangeBound **lower_bounds;
+		SinglePartitionSpec **tmp_new_parts;
+
+		/*
+		 * To simplify the check for ranges of new partitions, we need to sort
+		 * all partitions in ascending order of their bounds (we compare the
+		 * lower bound only).
+		 */
+		lower_bounds = palloc0_array(PartitionRangeBound *, nparts);
+
+		/* Create an array of lower bounds. */
+		for (i = 0; i < nparts; i++)
+		{
+			lower_bounds[i] = make_one_partition_rbound(key, i,
+														new_parts[i]->bound->lowerdatums, true);
+		}
+
+		/* Sort the array of lower bounds.
*/ + qsort_arg(lower_bounds, nparts, sizeof(PartitionRangeBound *), + qsort_partition_rbound_cmp, (void *) key); + + /* Reorder the array of partitions. */ + tmp_new_parts = new_parts; + new_parts = palloc0_array(SinglePartitionSpec *, nparts); + for (i = 0; i < nparts; i++) + new_parts[i] = tmp_new_parts[lower_bounds[i]->index]; + + pfree(tmp_new_parts); + pfree(lower_bounds); + } + + for (i = 0; i < nparts; i++) + { + SinglePartitionSpec *sps = new_parts[i]; + + if (isSplitPartDefault) + { + /* + * When the split partition is the DEFAULT partition, we can use + * any free ranges - as when creating a new partition. + */ + check_new_partition_bound(sps->name->relname, parent, sps->bound, + pstate); + } + else + { + /* + * Checks that the bounds of the current partition are inside the + * bounds of the split partition. For range partitioning: checks + * that the upper bound of the previous partition is equal to the + * lower bound of the current partition. For list partitioning: + * checks that the split partition contains all values of the + * current partition. + */ + if (strategy == PARTITION_STRATEGY_RANGE) + { + bool first = (i == 0); + bool last = (i == (nparts - 1)); + + check_partition_bounds_for_split_range(parent, sps->name->relname, sps->bound, + splitPartOid, first, last, + createDefaultPart, pstate); + } + else + check_partition_bounds_for_split_list(parent, sps->name->relname, + sps->bound, splitPartOid, pstate); + } + + /* Ranges of new partitions should not overlap. */ + if (strategy == PARTITION_STRATEGY_RANGE && spsPrev) + check_two_partitions_bounds_range(parent, spsPrev->name, spsPrev->bound, + sps->name, sps->bound, + createDefaultPart, + false, + pstate); + + spsPrev = sps; + } + + if (strategy == PARTITION_STRATEGY_LIST) + { + /* Values of new partitions should not overlap. */ + check_partitions_not_overlap_list(parent, new_parts, nparts, + pstate); + + /* + * Need to check that all values of the split partition are contained + * in the new partitions. Skip this check if the DEFAULT partition + * exists. 
+ */ + if (!createDefaultPart) + check_parent_values_in_new_partitions(parent, splitPartOid, + new_parts, nparts, pstate); + } + + pfree(new_parts); +} diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c index 328b4d450e451..985f48fc34dad 100644 --- a/src/backend/partitioning/partdesc.c +++ b/src/backend/partitioning/partdesc.c @@ -37,7 +37,7 @@ typedef struct PartitionDirectoryData MemoryContext pdir_mcxt; HTAB *pdir_hash; bool omit_detached; -} PartitionDirectoryData; +} PartitionDirectoryData; typedef struct PartitionDirectoryEntry { @@ -426,7 +426,7 @@ CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached) PartitionDirectory pdir; HASHCTL ctl; - pdir = palloc(sizeof(PartitionDirectoryData)); + pdir = palloc_object(PartitionDirectoryData); pdir->pdir_mcxt = mcxt; ctl.keysize = sizeof(Oid); diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 48a35f763e906..0227a2c92815d 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -179,13 +179,13 @@ static List *get_steps_using_prefix_recurse(GeneratePruningStepsContext *context List *step_exprs, List *step_cmpfns); static PruneStepResult *get_matching_hash_bounds(PartitionPruneContext *context, - StrategyNumber opstrategy, Datum *values, int nvalues, + StrategyNumber opstrategy, const Datum *values, int nvalues, FmgrInfo *partsupfunc, Bitmapset *nullkeys); static PruneStepResult *get_matching_list_bounds(PartitionPruneContext *context, StrategyNumber opstrategy, Datum value, int nvalues, FmgrInfo *partsupfunc, Bitmapset *nullkeys); static PruneStepResult *get_matching_range_bounds(PartitionPruneContext *context, - StrategyNumber opstrategy, Datum *values, int nvalues, + StrategyNumber opstrategy, const Datum *values, int nvalues, FmgrInfo *partsupfunc, Bitmapset *nullkeys); static Bitmapset *pull_exec_paramids(Expr *expr); static bool pull_exec_paramids_walker(Node *node, Bitmapset **context); @@ -246,7 +246,7 @@ make_partition_pruneinfo(PlannerInfo *root, RelOptInfo *parentrel, * that zero can represent an un-filled array entry. */ allpartrelids = NIL; - relid_subplan_map = palloc0(sizeof(int) * root->simple_rel_array_size); + relid_subplan_map = palloc0_array(int, root->simple_rel_array_size); i = 1; foreach(lc, subpaths) @@ -465,7 +465,7 @@ make_partitionedrel_pruneinfo(PlannerInfo *root, RelOptInfo *parentrel, * In this phase we discover whether runtime pruning is needed at all; if * not, we can avoid doing further work. 
*/ - relid_subpart_map = palloc0(sizeof(int) * root->simple_rel_array_size); + relid_subpart_map = palloc0_array(int, root->simple_rel_array_size); i = 1; rti = -1; @@ -818,9 +818,8 @@ prune_append_rel_partitions(RelOptInfo *rel) context.boundinfo = rel->boundinfo; context.partcollation = rel->part_scheme->partcollation; context.partsupfunc = rel->part_scheme->partsupfunc; - context.stepcmpfuncs = (FmgrInfo *) palloc0(sizeof(FmgrInfo) * - context.partnatts * - list_length(pruning_steps)); + context.stepcmpfuncs = palloc0_array(FmgrInfo, + context.partnatts * list_length(pruning_steps)); context.ppccontext = CurrentMemoryContext; /* These are not valid when being called from the planner */ @@ -1890,7 +1889,7 @@ match_clause_to_partition_key(GeneratePruningStepsContext *context, return PARTCLAUSE_MATCH_STEPS; } - partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo)); + partclause = palloc_object(PartClauseInfo); partclause->keyno = partkeyidx; /* Do pruning with the Boolean equality operator. */ partclause->opno = BooleanEqualOperator; @@ -2147,7 +2146,7 @@ match_clause_to_partition_key(GeneratePruningStepsContext *context, /* * Build the clause, passing the negator if applicable. */ - partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo)); + partclause = palloc_object(PartClauseInfo); partclause->keyno = partkeyidx; if (is_opne_listp) { @@ -2690,10 +2689,10 @@ get_steps_using_prefix_recurse(GeneratePruningStepsContext *context, */ static PruneStepResult * get_matching_hash_bounds(PartitionPruneContext *context, - StrategyNumber opstrategy, Datum *values, int nvalues, + StrategyNumber opstrategy, const Datum *values, int nvalues, FmgrInfo *partsupfunc, Bitmapset *nullkeys) { - PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PruneStepResult *result = palloc0_object(PruneStepResult); PartitionBoundInfo boundinfo = context->boundinfo; int *partindices = boundinfo->indexes; int partnatts = context->partnatts; @@ -2770,7 +2769,7 @@ get_matching_list_bounds(PartitionPruneContext *context, StrategyNumber opstrategy, Datum value, int nvalues, FmgrInfo *partsupfunc, Bitmapset *nullkeys) { - PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PruneStepResult *result = palloc0_object(PruneStepResult); PartitionBoundInfo boundinfo = context->boundinfo; int off, minoff, @@ -2978,10 +2977,10 @@ get_matching_list_bounds(PartitionPruneContext *context, */ static PruneStepResult * get_matching_range_bounds(PartitionPruneContext *context, - StrategyNumber opstrategy, Datum *values, int nvalues, + StrategyNumber opstrategy, const Datum *values, int nvalues, FmgrInfo *partsupfunc, Bitmapset *nullkeys) { - PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PruneStepResult *result = palloc0_object(PruneStepResult); PartitionBoundInfo boundinfo = context->boundinfo; Oid *partcollation = context->partcollation; int partnatts = context->partnatts; @@ -3504,7 +3503,7 @@ perform_pruning_base_step(PartitionPruneContext *context, { PruneStepResult *result; - result = (PruneStepResult *) palloc(sizeof(PruneStepResult)); + result = palloc_object(PruneStepResult); result->bound_offsets = NULL; result->scan_default = false; result->scan_null = false; @@ -3593,7 +3592,7 @@ perform_pruning_combine_step(PartitionPruneContext *context, PartitionPruneStepCombine *cstep, PruneStepResult **step_results) { - PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PruneStepResult *result = 
palloc0_object(PruneStepResult); bool firststep; ListCell *lc1; diff --git a/src/backend/port/Makefile b/src/backend/port/Makefile index 47338d9922957..8613ac01aff6d 100644 --- a/src/backend/port/Makefile +++ b/src/backend/port/Makefile @@ -22,7 +22,6 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global OBJS = \ - $(TAS) \ atomics.o \ pg_sema.o \ pg_shmem.o @@ -33,16 +32,5 @@ endif include $(top_srcdir)/src/backend/common.mk -tas.o: tas.s -ifeq ($(SUN_STUDIO_CC), yes) -# preprocess assembler file with cpp - $(CC) $(CFLAGS) -c -P $< - mv $*.i $*_cpp.s - $(CC) $(CFLAGS) -c $*_cpp.s -o $@ -else - $(CC) $(CFLAGS) -c $< -endif - clean: - rm -f tas_cpp.s $(MAKE) -C win32 clean diff --git a/src/backend/port/posix_sema.c b/src/backend/port/posix_sema.c index 269c7460817ec..d7fb0c0c4da0a 100644 --- a/src/backend/port/posix_sema.c +++ b/src/backend/port/posix_sema.c @@ -215,12 +215,8 @@ PGReserveSemaphores(int maxSemas) elog(PANIC, "out of memory"); #else - /* - * We must use ShmemAllocUnlocked(), since the spinlock protecting - * ShmemAlloc() won't be ready yet. - */ sharedSemas = (PGSemaphore) - ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas)); + ShmemAlloc(PGSemaphoreShmemSize(maxSemas)); #endif numSems = 0; diff --git a/src/backend/port/sysv_sema.c b/src/backend/port/sysv_sema.c index 423b2b4f9d6d1..9faaeeefc7908 100644 --- a/src/backend/port/sysv_sema.c +++ b/src/backend/port/sysv_sema.c @@ -69,7 +69,7 @@ static int nextSemaNumber; /* next free sem num in last sema set */ static IpcSemaphoreId InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, - int numSems); + int numSems, bool retry_ok); static void IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value); static void IpcSemaphoreKill(IpcSemaphoreId semId); @@ -88,9 +88,13 @@ static void ReleaseSemaphores(int status, Datum arg); * If we fail with a failure code other than collision-with-existing-set, * print out an error and abort. Other types of errors suggest nonrecoverable * problems. + * + * Unfortunately, it's sometimes hard to tell whether errors are + * nonrecoverable. Our caller keeps track of whether continuing to retry + * is sane or not; if not, we abort on failure regardless of the errno. */ static IpcSemaphoreId -InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems) +InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems, bool retry_ok) { int semId; @@ -101,16 +105,27 @@ InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems) int saved_errno = errno; /* - * Fail quietly if error indicates a collision with existing set. One - * would expect EEXIST, given that we said IPC_EXCL, but perhaps we - * could get a permission violation instead? Also, EIDRM might occur - * if an old set is slated for destruction but not gone yet. + * Fail quietly if error suggests a collision with an existing set and + * our caller has not lost patience. + * + * One would expect EEXIST, given that we said IPC_EXCL, but perhaps + * we could get a permission violation instead. On some platforms + * EINVAL will be reported if the existing set has too few semaphores. + * Also, EIDRM might occur if an old set is slated for destruction but + * not gone yet. + * + * EINVAL is the key reason why we need the caller-level loop limit, + * as it can also mean that the platform's SEMMSL is less than + * numSems, and that condition can't be fixed by trying another key. 
*/ - if (saved_errno == EEXIST || saved_errno == EACCES + if (retry_ok && + (saved_errno == EEXIST + || saved_errno == EACCES + || saved_errno == EINVAL #ifdef EIDRM - || saved_errno == EIDRM + || saved_errno == EIDRM #endif - ) + )) return -1; /* @@ -207,17 +222,22 @@ IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum) static IpcSemaphoreId IpcSemaphoreCreate(int numSems) { + int num_tries = 0; IpcSemaphoreId semId; union semun semun; PGSemaphoreData mysema; /* Loop till we find a free IPC key */ - for (nextSemaKey++;; nextSemaKey++) + for (nextSemaKey++;; nextSemaKey++, num_tries++) { pid_t creatorPID; - /* Try to create new semaphore set */ - semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1); + /* + * Try to create new semaphore set. Give up after trying 1000 + * distinct IPC keys. + */ + semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1, + num_tries < 1000); if (semId >= 0) break; /* successful create */ @@ -254,7 +274,7 @@ IpcSemaphoreCreate(int numSems) /* * Now try again to create the sema set. */ - semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1); + semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1, true); if (semId >= 0) break; /* successful create */ @@ -323,12 +343,8 @@ PGReserveSemaphores(int maxSemas) errmsg("could not stat data directory \"%s\": %m", DataDir))); - /* - * We must use ShmemAllocUnlocked(), since the spinlock protecting - * ShmemAlloc() won't be ready yet. - */ sharedSemas = (PGSemaphore) - ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas)); + ShmemAlloc(PGSemaphoreShmemSize(maxSemas)); numSharedSemas = 0; maxSharedSemas = maxSemas; diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 197926d44f6bc..298ceb3e218f3 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -206,7 +206,7 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) */ if (shmctl(shmid, IPC_RMID, NULL) < 0) elog(LOG, "shmctl(%d, %d, 0) failed: %m", - (int) shmid, IPC_RMID); + shmid, IPC_RMID); } } diff --git a/src/backend/port/tas/sunstudio_sparc.s b/src/backend/port/tas/sunstudio_sparc.s deleted file mode 100644 index 8e0a0965b64ea..0000000000000 --- a/src/backend/port/tas/sunstudio_sparc.s +++ /dev/null @@ -1,53 +0,0 @@ -!------------------------------------------------------------------------- -! -! sunstudio_sparc.s -! compare and swap for Sun Studio on Sparc -! -! Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group -! Portions Copyright (c) 1994, Regents of the University of California -! -! IDENTIFICATION -! src/backend/port/tas/sunstudio_sparc.s -! -!------------------------------------------------------------------------- - -! Fortunately the Sun compiler can process cpp conditionals with -P - -! '/' is the comment for x86, while '!' is the comment for Sparc - -#if defined(__sparcv9) || defined(__sparc) - - .section ".text" - .align 8 - .skip 24 - .align 4 - - .global pg_atomic_cas -pg_atomic_cas: - - ! "cas" only works on sparcv9 and sparcv8plus chips, and - ! requires a compiler targeting these CPUs. It will fail - ! on a compiler targeting sparcv8, and of course will not - ! be understood by a sparcv8 CPU. gcc continues to use - ! "ldstub" because it targets sparcv7. - ! - ! There is actually a trick for embedding "cas" in a - ! sparcv8-targeted compiler, but it can only be run - ! on a sparcv8plus/v9 cpus: - ! - ! http://cvs.opensolaris.org/source/xref/on/usr/src/lib/libc/sparc/threads/sparc.il - ! - ! 
NB: We're assuming we're running on a TSO system here - solaris - ! userland luckily always has done so. - -#if defined(__sparcv9) || defined(__sparcv8plus) - cas [%o0],%o2,%o1 -#else - ldstub [%o0],%o1 -#endif - mov %o1,%o0 - retl - nop - .type pg_atomic_cas,2 - .size pg_atomic_cas,(.-pg_atomic_cas) -#endif diff --git a/src/backend/port/tas/sunstudio_x86.s b/src/backend/port/tas/sunstudio_x86.s deleted file mode 100644 index 0111ffde45c29..0000000000000 --- a/src/backend/port/tas/sunstudio_x86.s +++ /dev/null @@ -1,43 +0,0 @@ -/------------------------------------------------------------------------- -/ -/ sunstudio_x86.s -/ compare and swap for Sun Studio on x86 -/ -/ Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group -/ Portions Copyright (c) 1994, Regents of the University of California -/ -/ IDENTIFICATION -/ src/backend/port/tas/sunstudio_x86.s -/ -/------------------------------------------------------------------------- - -/ Fortunately the Sun compiler can process cpp conditionals with -P - -/ '/' is the comment for x86, while '!' is the comment for Sparc - - .file "tas.s" - -#if defined(__amd64) - .code64 -#endif - - .globl pg_atomic_cas - .type pg_atomic_cas, @function - - .section .text, "ax" - .align 16 - -pg_atomic_cas: -#if defined(__amd64) - movl %edx,%eax - lock - cmpxchgl %esi,(%rdi) -#else - movl 4(%esp), %edx - movl 8(%esp), %ecx - movl 12(%esp), %eax - lock - cmpxchgl %ecx, (%edx) -#endif - ret - .size pg_atomic_cas, . - pg_atomic_cas diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 981be42e3afc8..1bd3924e35ebc 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -77,7 +77,6 @@ #include "catalog/namespace.h" #include "catalog/pg_database.h" #include "catalog/pg_namespace.h" -#include "commands/dbcommands.h" #include "commands/vacuum.h" #include "common/int.h" #include "lib/ilist.h" @@ -134,6 +133,7 @@ double autovacuum_vac_cost_delay; int autovacuum_vac_cost_limit; int Log_autovacuum_min_duration = 600000; +int Log_autoanalyze_min_duration = 600000; /* the minimum allowed time between two awakenings of the launcher */ #define MIN_AUTOVAC_SLEEPTIME 100.0 /* milliseconds */ @@ -310,6 +310,16 @@ static AutoVacuumShmemStruct *AutoVacuumShmem; static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList); static MemoryContext DatabaseListCxt = NULL; +/* + * Dummy pointer to persuade Valgrind that we've not leaked the array of + * avl_dbase structs. Make it global to ensure the compiler doesn't + * optimize it away. + */ +#ifdef USE_VALGRIND +extern avl_dbase *avl_dbase_array; +avl_dbase *avl_dbase_array; +#endif + /* Pointer to my own WorkerInfo, valid on each worker */ static WorkerInfo MyWorkerInfo = NULL; @@ -562,10 +572,10 @@ AutoVacLauncherMain(const void *startup_data, size_t startup_data_len) /* * Create the initial database list. The invariant we want this list to - * keep is that it's ordered by decreasing next_time. As soon as an entry - * is updated to a higher time, it will be moved to the front (which is - * correct because the only operation is to add autovacuum_naptime to the - * entry, and time always increases). + * keep is that it's ordered by decreasing next_worker. As soon as an + * entry is updated to a higher time, it will be moved to the front (which + * is correct because the only operation is to add autovacuum_naptime to + * the entry, and time always increases). 
*/ rebuild_database_list(InvalidOid); @@ -1020,6 +1030,10 @@ rebuild_database_list(Oid newdb) /* put all the hash elements into an array */ dbary = palloc(nelems * sizeof(avl_dbase)); + /* keep Valgrind quiet */ +#ifdef USE_VALGRIND + avl_dbase_array = dbary; +#endif i = 0; hash_seq_init(&seq, dbhash); @@ -1851,7 +1865,7 @@ get_database_list(void) */ oldcxt = MemoryContextSwitchTo(resultcxt); - avdb = (avw_dbase *) palloc(sizeof(avw_dbase)); + avdb = palloc_object(avw_dbase); avdb->adw_datid = pgdatabase->oid; avdb->adw_name = pstrdup(NameStr(pgdatabase->datname)); @@ -1922,8 +1936,8 @@ do_autovacuum(void) /* * Compute the multixact age for which freezing is urgent. This is - * normally autovacuum_multixact_freeze_max_age, but may be less if we are - * short of multixact member space. + * normally autovacuum_multixact_freeze_max_age, but may be less if + * multixact members are bloated. */ effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); @@ -2234,6 +2248,12 @@ do_autovacuum(void) get_namespace_name(classForm->relnamespace), NameStr(classForm->relname)))); + /* + * Deletion might involve TOAST table access, so ensure we have a + * valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + object.classId = RelationRelationId; object.objectId = relid; object.objectSubId = 0; @@ -2246,6 +2266,7 @@ do_autovacuum(void) * To commit the deletion, end current transaction and start a new * one. Note this also releases the locks we took. */ + PopActiveSnapshot(); CommitTransactionCommand(); StartTransactionCommand(); @@ -2535,7 +2556,10 @@ do_autovacuum(void) workitem->avw_active = true; LWLockRelease(AutovacuumLock); + PushActiveSnapshot(GetTransactionSnapshot()); perform_work_item(workitem); + if (ActiveSnapshotSet()) /* transaction could have aborted */ + PopActiveSnapshot(); /* * Check for config changes before acquiring lock for further jobs. @@ -2558,8 +2582,18 @@ do_autovacuum(void) /* * We leak table_toast_map here (among other things), but since we're - * going away soon, it's not a problem. + * going away soon, it's not a problem normally. But when using Valgrind, + * release some stuff to reduce complaints about leaked storage. */ +#ifdef USE_VALGRIND + hash_destroy(table_toast_map); + FreeTupleDesc(pg_class_desc); + if (bstrategy) + pfree(bstrategy); +#endif + + /* Run the rest in xact context, mainly to avoid Valgrind leak warnings */ + MemoryContextSwitchTo(TopTransactionContext); /* * Update pg_database.datfrozenxid, and truncate pg_xact if possible. We @@ -2719,7 +2753,7 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) if (relopts == NULL) return NULL; - av = palloc(sizeof(AutoVacOpts)); + av = palloc_object(AutoVacOpts); memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts)); pfree(relopts); @@ -2784,7 +2818,8 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, int freeze_table_age; int multixact_freeze_min_age; int multixact_freeze_table_age; - int log_min_duration; + int log_vacuum_min_duration; + int log_analyze_min_duration; /* * Calculate the vacuum cost parameters and the freeze ages. If there @@ -2794,10 +2829,15 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, */ /* -1 in autovac setting means use log_autovacuum_min_duration */ - log_min_duration = (avopts && avopts->log_min_duration >= 0) - ? avopts->log_min_duration + log_vacuum_min_duration = (avopts && avopts->log_vacuum_min_duration >= 0) + ? 
avopts->log_vacuum_min_duration : Log_autovacuum_min_duration; + /* -1 in autovac setting means use log_autoanalyze_min_duration */ + log_analyze_min_duration = (avopts && avopts->log_analyze_min_duration >= 0) + ? avopts->log_analyze_min_duration + : Log_autoanalyze_min_duration; + /* these do not have autovacuum-specific settings */ freeze_min_age = (avopts && avopts->freeze_min_age >= 0) ? avopts->freeze_min_age @@ -2817,7 +2857,7 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, ? avopts->multixact_freeze_table_age : default_multixact_freeze_table_age; - tab = palloc(sizeof(autovac_table)); + tab = palloc_object(autovac_table); tab->at_relid = relid; tab->at_sharedrel = classForm->relisshared; @@ -2847,7 +2887,8 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, tab->at_params.multixact_freeze_min_age = multixact_freeze_min_age; tab->at_params.multixact_freeze_table_age = multixact_freeze_table_age; tab->at_params.is_wraparound = wraparound; - tab->at_params.log_min_duration = log_min_duration; + tab->at_params.log_vacuum_min_duration = log_vacuum_min_duration; + tab->at_params.log_analyze_min_duration = log_analyze_min_duration; tab->at_params.toast_parent = InvalidOid; /* @@ -3077,7 +3118,7 @@ relation_needs_vacanalyze(Oid relid, * vacuuming only, so don't vacuum (or analyze) anything that's not being * forced. */ - if (PointerIsValid(tabentry) && AutoVacuumingActive()) + if (tabentry && AutoVacuumingActive()) { float4 pcnt_unfrozen = 1; float4 reltuples = classForm->reltuples; @@ -3117,11 +3158,6 @@ relation_needs_vacanalyze(Oid relid, vac_ins_scale_factor * reltuples * pcnt_unfrozen; anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples; - /* - * Note that we don't need to take special consideration for stat - * reset, because if that happens, the last vacuum and analyze counts - * will be reset too. - */ if (vac_ins_base_thresh >= 0) elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), ins: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)", NameStr(classForm->relname), @@ -3183,7 +3219,7 @@ autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy) rel_list = list_make1(rel); MemoryContextSwitchTo(old_context); - vacuum(rel_list, &tab->at_params, bstrategy, vac_context, true); + vacuum(rel_list, tab->at_params, bstrategy, vac_context, true); MemoryContextDelete(vac_context); } diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 116ddf7b835f1..8e1068969aecc 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -131,7 +131,10 @@ static const struct "ParallelApplyWorkerMain", ParallelApplyWorkerMain }, { - "TablesyncWorkerMain", TablesyncWorkerMain + "TableSyncWorkerMain", TableSyncWorkerMain + }, + { + "SequenceSyncWorkerMain", SequenceSyncWorkerMain } }; @@ -613,6 +616,7 @@ ResetBackgroundWorkerCrashTimes(void) * resetting. */ rw->rw_crashed_at = 0; + rw->rw_pid = 0; /* * If there was anyone waiting for it, they're history. 
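The "-1 means inherit the server-wide GUC" convention used for the per-table log thresholds in table_recheck_autovac() above reduces to a small resolution rule. Here is a minimal standalone sketch of that rule, with hypothetical simplified types and illustrative GUC values rather than the backend code:

#include <stdio.h>

/* Hypothetical stand-in for the relevant part of AutoVacOpts. */
typedef struct TableOpts
{
	int		log_vacuum_min_duration;	/* -1 means "use the GUC" */
	int		log_analyze_min_duration;	/* -1 means "use the GUC" */
} TableOpts;

/* Stand-ins for the server-wide GUC defaults (600000 ms in the patch). */
static const int guc_log_autovacuum_min_duration = 600000;
static const int guc_log_autoanalyze_min_duration = 600000;

/* The per-table reloption wins only when it is explicitly set (>= 0). */
static int
resolve_threshold(const TableOpts *opts, int table_val, int guc_val)
{
	return (opts != NULL && table_val >= 0) ? table_val : guc_val;
}

int
main(void)
{
	TableOpts	opts = {.log_vacuum_min_duration = 0,	/* log every vacuum */
						.log_analyze_min_duration = -1};	/* inherit GUC */

	printf("vacuum threshold: %d ms\n",
		   resolve_threshold(&opts, opts.log_vacuum_min_duration,
							 guc_log_autovacuum_min_duration));
	printf("analyze threshold: %d ms\n",
		   resolve_threshold(&opts, opts.log_analyze_min_duration,
							 guc_log_autoanalyze_min_duration));
	return 0;
}

Splitting the single log_min_duration into separate vacuum and analyze thresholds, as the hunks above do, keeps this resolution rule unchanged while letting the two operations be tuned independently.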
@@ -1128,7 +1132,7 @@ RegisterDynamicBackgroundWorker(BackgroundWorker *worker,
 	 */
 	if (success && handle)
 	{
-		*handle = palloc(sizeof(BackgroundWorkerHandle));
+		*handle = palloc_object(BackgroundWorkerHandle);
 		(*handle)->slot = slotno;
 		(*handle)->generation = generation;
 	}
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fda91ffd1ce2d..2eac8ac30d328 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -42,6 +42,8 @@
 #include "access/xlog.h"
 #include "access/xlog_internal.h"
 #include "access/xlogrecovery.h"
+#include "catalog/pg_authid.h"
+#include "commands/defrem.h"
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -61,6 +63,7 @@
 #include "storage/shmem.h"
 #include "storage/smgr.h"
 #include "storage/spin.h"
+#include "utils/acl.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/resowner.h"
@@ -127,6 +130,13 @@ typedef struct
 	int			num_requests;	/* current # of requests */
 	int			max_requests;	/* allocated array size */
+
+	int			head;			/* index of the oldest request in the ring
+								 * buffer */
+	int			tail;			/* index of the slot where the next request
+								 * will be inserted */
+
+	/* The ring buffer of pending checkpointer requests */
 	CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
 } CheckpointerShmemStruct;
@@ -135,6 +145,12 @@ static CheckpointerShmemStruct *CheckpointerShmem;
 /* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */
 #define WRITES_PER_ABSORB 1000
+/* Maximum number of checkpointer requests to process in one batch */
+#define CKPT_REQ_BATCH_SIZE 10000
+
+/* Max number of requests the checkpointer request queue can hold */
+#define MAX_CHECKPOINT_REQUESTS 10000000
+
 /*
  * GUC parameters
 */
@@ -161,7 +177,7 @@ static pg_time_t last_xlog_switch_time;
 static void ProcessCheckpointerInterrupts(void);
 static void CheckArchiveTimeout(void);
 static bool IsCheckpointOnSchedule(double progress);
-static bool ImmediateCheckpointRequested(void);
+static bool FastCheckpointRequested(void);
 static bool CompactCheckpointerRequestQueue(void);
 static void UpdateSharedMemoryConfig(void);
@@ -543,6 +559,12 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len)
 				break;
 		}
+		/*
+		 * Disable logical decoding if someone requested it.  See comments
+		 * atop logicalctl.c.
+		 */
+		DisableLogicalDecodingIfNecessary();
+
 		/* Check for archive_timeout and switch xlog files if necessary. */
 		CheckArchiveTimeout();
@@ -734,12 +756,12 @@ CheckArchiveTimeout(void)
 }
 /*
- * Returns true if an immediate checkpoint request is pending.  (Note that
- * this does not check the *current* checkpoint's IMMEDIATE flag, but whether
- * there is one pending behind it.)
+ * Returns true if a fast checkpoint request is pending.  (Note that this does
+ * not check the *current* checkpoint's FAST flag, but whether there is one
+ * pending behind it.)
 */
 static bool
-ImmediateCheckpointRequested(void)
+FastCheckpointRequested(void)
 {
 	volatile CheckpointerShmemStruct *cps = CheckpointerShmem;
@@ -747,7 +769,7 @@ ImmediateCheckpointRequested(void)
 	 * We don't need to acquire the ckpt_lck in this case because we're only
 	 * looking at a single flag bit.
 	 */
-	if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE)
+	if (cps->ckpt_flags & CHECKPOINT_FAST)
 		return true;
 	return false;
 }
@@ -760,7 +782,7 @@ ImmediateCheckpointRequested(void)
 * checkpoint_completion_target.
* * The checkpoint request flags should be passed in; currently the only one - * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. + * examined is CHECKPOINT_FAST, which disables delays between writes. * * 'progress' is an estimate of how much of the work has been done, as a * fraction between 0.0 meaning none, and 1.0 meaning all done. @@ -778,10 +800,10 @@ CheckpointWriteDelay(int flags, double progress) * Perform the usual duties and take a nap, unless we're behind schedule, * in which case we just try to catch up as quickly as possible. */ - if (!(flags & CHECKPOINT_IMMEDIATE) && + if (!(flags & CHECKPOINT_FAST) && !ShutdownXLOGPending && !ShutdownRequestPending && - !ImmediateCheckpointRequested() && + !FastCheckpointRequested() && IsCheckpointOnSchedule(progress)) { if (ConfigReloadPending) @@ -937,11 +959,14 @@ CheckpointerShmemSize(void) Size size; /* - * Currently, the size of the requests[] array is arbitrarily set equal to - * NBuffers. This may prove too large or small ... + * The size of the requests[] array is arbitrarily set equal to NBuffers. + * But there is a cap of MAX_CHECKPOINT_REQUESTS to prevent accumulating + * too many checkpoint requests in the ring buffer. */ size = offsetof(CheckpointerShmemStruct, requests); - size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest))); + size = add_size(size, mul_size(Min(NBuffers, + MAX_CHECKPOINT_REQUESTS), + sizeof(CheckpointerRequest))); return size; } @@ -970,12 +995,65 @@ CheckpointerShmemInit(void) */ MemSet(CheckpointerShmem, 0, size); SpinLockInit(&CheckpointerShmem->ckpt_lck); - CheckpointerShmem->max_requests = NBuffers; + CheckpointerShmem->max_requests = Min(NBuffers, MAX_CHECKPOINT_REQUESTS); + CheckpointerShmem->head = CheckpointerShmem->tail = 0; ConditionVariableInit(&CheckpointerShmem->start_cv); ConditionVariableInit(&CheckpointerShmem->done_cv); } } +/* + * ExecCheckpoint + * Primary entry point for manual CHECKPOINT commands + * + * This is mainly a wrapper for RequestCheckpoint(). + */ +void +ExecCheckpoint(ParseState *pstate, CheckPointStmt *stmt) +{ + bool fast = true; + bool unlogged = false; + + foreach_ptr(DefElem, opt, stmt->options) + { + if (strcmp(opt->defname, "mode") == 0) + { + char *mode = defGetString(opt); + + if (strcmp(mode, "spread") == 0) + fast = false; + else if (strcmp(mode, "fast") != 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized value for %s option \"%s\": \"%s\"", + "CHECKPOINT", "mode", mode), + parser_errposition(pstate, opt->location))); + } + else if (strcmp(opt->defname, "flush_unlogged") == 0) + unlogged = defGetBoolean(opt); + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized %s option \"%s\"", + "CHECKPOINT", opt->defname), + parser_errposition(pstate, opt->location))); + } + + if (!has_privs_of_role(GetUserId(), ROLE_PG_CHECKPOINT)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + /* translator: %s is name of an SQL command (e.g., CHECKPOINT) */ + errmsg("permission denied to execute %s command", + "CHECKPOINT"), + errdetail("Only roles with privileges of the \"%s\" role may execute this command.", + "pg_checkpoint"))); + + RequestCheckpoint(CHECKPOINT_WAIT | + (fast ? CHECKPOINT_FAST : 0) | + (unlogged ? CHECKPOINT_FLUSH_UNLOGGED : 0) | + (RecoveryInProgress() ? 
0 : CHECKPOINT_FORCE)); +} + /* * RequestCheckpoint * Called in backend processes to request a checkpoint @@ -983,11 +1061,11 @@ CheckpointerShmemInit(void) * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. - * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, + * CHECKPOINT_FAST: finish the checkpoint ASAP, * ignoring checkpoint_completion_target parameter. * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or - * CHECKPOINT_END_OF_RECOVERY). + * CHECKPOINT_END_OF_RECOVERY, and the CHECKPOINT command). * CHECKPOINT_WAIT: wait for completion before returning (otherwise, * just signal checkpointer to do it, and return). * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. @@ -1009,7 +1087,7 @@ RequestCheckpoint(int flags) * There's no point in doing slow checkpoints in a standalone backend, * because there's no other backends the checkpoint could disrupt. */ - CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); + CreateCheckPoint(flags | CHECKPOINT_FAST); /* Free all smgr objects, as CheckpointerMain() normally would. */ smgrdestroyall(); @@ -1148,6 +1226,7 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) { CheckpointerRequest *request; bool too_full; + int insert_pos; if (!IsUnderPostmaster) return false; /* probably shouldn't even get here */ @@ -1171,10 +1250,14 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) } /* OK, insert request */ - request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; + insert_pos = CheckpointerShmem->tail; + request = &CheckpointerShmem->requests[insert_pos]; request->ftag = *ftag; request->type = type; + CheckpointerShmem->tail = (CheckpointerShmem->tail + 1) % CheckpointerShmem->max_requests; + CheckpointerShmem->num_requests++; + /* If queue is more than half full, nudge the checkpointer to empty it */ too_full = (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests / 2); @@ -1216,12 +1299,16 @@ CompactCheckpointerRequestQueue(void) struct CheckpointerSlotMapping { CheckpointerRequest request; - int slot; + int ring_idx; }; - int n, - preserve_count; + int n; int num_skipped = 0; + int head; + int max_requests; + int num_requests; + int read_idx, + write_idx; HASHCTL ctl; HTAB *htab; bool *skip_slot; @@ -1233,8 +1320,13 @@ CompactCheckpointerRequestQueue(void) if (CritSectionCount > 0) return false; + max_requests = CheckpointerShmem->max_requests; + num_requests = CheckpointerShmem->num_requests; + /* Initialize skip_slot array */ - skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); + skip_slot = palloc0_array(bool, max_requests); + + head = CheckpointerShmem->head; /* Initialize temporary hash table */ ctl.keysize = sizeof(CheckpointerRequest); @@ -1258,7 +1350,8 @@ CompactCheckpointerRequestQueue(void) * away preceding entries that would end up being canceled anyhow), but * it's not clear that the extra complexity would buy us anything. */ - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = head; + for (n = 0; n < num_requests; n++) { CheckpointerRequest *request; struct CheckpointerSlotMapping *slotmap; @@ -1271,16 +1364,19 @@ CompactCheckpointerRequestQueue(void) * CheckpointerShmemInit. Note also that RelFileLocator had better * contain no pad bytes. 
*/ - request = &CheckpointerShmem->requests[n]; + request = &CheckpointerShmem->requests[read_idx]; slotmap = hash_search(htab, request, HASH_ENTER, &found); if (found) { /* Duplicate, so mark the previous occurrence as skippable */ - skip_slot[slotmap->slot] = true; + skip_slot[slotmap->ring_idx] = true; num_skipped++; } /* Remember slot containing latest occurrence of this request value */ - slotmap->slot = n; + slotmap->ring_idx = read_idx; + + /* Move to the next request in the ring buffer */ + read_idx = (read_idx + 1) % max_requests; } /* Done with the hash table. */ @@ -1294,17 +1390,34 @@ CompactCheckpointerRequestQueue(void) } /* We found some duplicates; remove them. */ - preserve_count = 0; - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = write_idx = head; + for (n = 0; n < num_requests; n++) { - if (skip_slot[n]) - continue; - CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n]; + /* If this slot is NOT skipped, keep it */ + if (!skip_slot[read_idx]) + { + /* If the read and write positions are different, copy the request */ + if (write_idx != read_idx) + CheckpointerShmem->requests[write_idx] = + CheckpointerShmem->requests[read_idx]; + + /* Advance the write position */ + write_idx = (write_idx + 1) % max_requests; + } + + read_idx = (read_idx + 1) % max_requests; } + + /* + * Update ring buffer state: head remains the same, tail moves, count + * decreases + */ + CheckpointerShmem->tail = write_idx; + CheckpointerShmem->num_requests -= num_skipped; + ereport(DEBUG1, (errmsg_internal("compacted fsync request queue from %d entries to %d entries", - CheckpointerShmem->num_requests, preserve_count))); - CheckpointerShmem->num_requests = preserve_count; + num_requests, CheckpointerShmem->num_requests))); /* Cleanup. */ pfree(skip_slot); @@ -1325,40 +1438,64 @@ AbsorbSyncRequests(void) { CheckpointerRequest *requests = NULL; CheckpointerRequest *request; - int n; + int n, + i; + bool loop; if (!AmCheckpointerProcess()) return; - LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); - - /* - * We try to avoid holding the lock for a long time by copying the request - * array, and processing the requests after releasing the lock. - * - * Once we have cleared the requests from shared memory, we have to PANIC - * if we then fail to absorb them (eg, because our hashtable runs out of - * memory). This is because the system cannot run safely if we are unable - * to fsync what we have been told to fsync. Fortunately, the hashtable - * is so small that the problem is quite unlikely to arise in practice. - */ - n = CheckpointerShmem->num_requests; - if (n > 0) + do { - requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); - } + LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); + + /*--- + * We try to avoid holding the lock for a long time by: + * 1. Copying the request array and processing the requests after + * releasing the lock; + * 2. Processing not the whole queue, but only batches of + * CKPT_REQ_BATCH_SIZE at once. + * + * Once we have cleared the requests from shared memory, we must + * PANIC if we then fail to absorb them (e.g., because our hashtable + * runs out of memory). This is because the system cannot run safely + * if we are unable to fsync what we have been told to fsync. + * Fortunately, the hashtable is so small that the problem is quite + * unlikely to arise in practice. 
+		 *
+		 * Note: The maximum possible size of a ring buffer is
+		 * MAX_CHECKPOINT_REQUESTS entries, which fits into the maximum
+		 * palloc allocation size of 1 GB.  Our maximum batch size,
+		 * CKPT_REQ_BATCH_SIZE, is even smaller.
+		 */
+		n = Min(CheckpointerShmem->num_requests, CKPT_REQ_BATCH_SIZE);
+		if (n > 0)
+		{
+			if (!requests)
+				requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest));
 
-	START_CRIT_SECTION();
+			for (i = 0; i < n; i++)
+			{
+				requests[i] = CheckpointerShmem->requests[CheckpointerShmem->head];
+				CheckpointerShmem->head = (CheckpointerShmem->head + 1) % CheckpointerShmem->max_requests;
+			}
 
-	CheckpointerShmem->num_requests = 0;
+			CheckpointerShmem->num_requests -= n;
 
-	LWLockRelease(CheckpointerCommLock);
+		}
+
+		START_CRIT_SECTION();
+
+		/* Are there any requests in the queue? If so, keep going. */
+		loop = CheckpointerShmem->num_requests != 0;
+
+		LWLockRelease(CheckpointerCommLock);
 
-	for (request = requests; n > 0; request++, n--)
-		RememberSyncRequest(&request->ftag, request->type);
+		for (request = requests; n > 0; request++, n--)
+			RememberSyncRequest(&request->ftag, request->type);
 
-	END_CRIT_SECTION();
+		END_CRIT_SECTION();
+	} while (loop);
 
 	if (requests)
 		pfree(requests);
@@ -1404,3 +1541,16 @@ FirstCallSinceLastCheckpoint(void)
 
 	return FirstCall;
 }
+
+/*
+ * Wake up the checkpointer process.
+ */
+void
+WakeupCheckpointer(void)
+{
+	volatile PROC_HDR *procglobal = ProcGlobal;
+	ProcNumber	checkpointerProc = procglobal->checkpointerProc;
+
+	if (checkpointerProc != INVALID_PROC_NUMBER)
+		SetLatch(&GetPGProcByNumber(checkpointerProc)->procLatch);
+}
diff --git a/src/backend/postmaster/interrupt.c b/src/backend/postmaster/interrupt.c
index 0ae9bf906ec18..ba63b84dfc535 100644
--- a/src/backend/postmaster/interrupt.c
+++ b/src/backend/postmaster/interrupt.c
@@ -94,9 +94,8 @@ SignalHandlerForCrashExit(SIGNAL_ARGS)
 * shut down and exit.
 *
 * Typically, this handler would be used for SIGTERM, but some processes use
- * other signals. In particular, the checkpointer exits on SIGUSR2, and the WAL
- * writer and the logical replication parallel apply worker exits on either
- * SIGINT or SIGTERM.
+ * other signals. In particular, the checkpointer and parallel apply worker
+ * exit on SIGUSR2, and the WAL writer exits on either SIGINT or SIGTERM.
 *
 * ShutdownRequestPending should be checked at a convenient place within the
 * main loop, or else the main loop should call ProcessMainLoopInterrupts.
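The checkpointer hunks above replace a compact-in-place linear array with a fixed-capacity ring buffer: head is the oldest pending request, tail is the next insertion slot, and both wrap modulo max_requests. A minimal standalone sketch of that discipline follows (simplified: int payload instead of CheckpointerRequest, capacity 4, no locking; not the backend code):

#include <stdbool.h>
#include <stdio.h>

#define MAX_REQUESTS 4

typedef struct RingQueue
{
	int		requests[MAX_REQUESTS];
	int		head;			/* index of the oldest request */
	int		tail;			/* index where the next request is stored */
	int		num_requests;	/* current number of queued requests */
} RingQueue;

/* Append at tail, as ForwardSyncRequest() does in the patch. */
static bool
ring_push(RingQueue *q, int req)
{
	if (q->num_requests >= MAX_REQUESTS)
		return false;		/* full; a caller would compact or absorb */
	q->requests[q->tail] = req;
	q->tail = (q->tail + 1) % MAX_REQUESTS;
	q->num_requests++;
	return true;
}

/* Drain from head, as the batched AbsorbSyncRequests() does. */
static bool
ring_pop(RingQueue *q, int *req)
{
	if (q->num_requests == 0)
		return false;
	*req = q->requests[q->head];
	q->head = (q->head + 1) % MAX_REQUESTS;
	q->num_requests--;
	return true;
}

int
main(void)
{
	RingQueue	q = {0};
	int			req;

	for (int i = 1; i <= 6; i++)	/* the last two pushes are rejected */
		if (!ring_push(&q, i))
			printf("queue full, dropped %d\n", i);
	while (ring_pop(&q, &req))		/* drains 1..4 in FIFO order */
		printf("absorbed %d\n", req);
	return 0;
}

The payoff in the patch is that AbsorbSyncRequests() can drain bounded batches from head while ForwardSyncRequest() keeps appending at tail, instead of copying and truncating the whole array under the lock.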
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index bf6b55ee83048..98f7c4848c9a1 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -101,7 +101,9 @@ typedef struct struct InjectionPointsCtl *ActiveInjectionPoints; #endif int NamedLWLockTrancheRequests; - NamedLWLockTranche *NamedLWLockTrancheArray; + NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray; + char **LWLockTrancheNames; + int *LWLockCounter; LWLockPadded *MainLWLockArray; slock_t *ProcStructLock; PROC_HDR *ProcGlobal; @@ -177,34 +179,10 @@ typedef struct } child_process_kind; static child_process_kind child_process_kinds[] = { - [B_INVALID] = {"invalid", NULL, false}, - - [B_BACKEND] = {"backend", BackendMain, true}, - [B_DEAD_END_BACKEND] = {"dead-end backend", BackendMain, true}, - [B_AUTOVAC_LAUNCHER] = {"autovacuum launcher", AutoVacLauncherMain, true}, - [B_AUTOVAC_WORKER] = {"autovacuum worker", AutoVacWorkerMain, true}, - [B_BG_WORKER] = {"bgworker", BackgroundWorkerMain, true}, - - /* - * WAL senders start their life as regular backend processes, and change - * their type after authenticating the client for replication. We list it - * here for PostmasterChildName() but cannot launch them directly. - */ - [B_WAL_SENDER] = {"wal sender", NULL, true}, - [B_SLOTSYNC_WORKER] = {"slot sync worker", ReplSlotSyncWorkerMain, true}, - - [B_STANDALONE_BACKEND] = {"standalone backend", NULL, false}, - - [B_ARCHIVER] = {"archiver", PgArchiverMain, true}, - [B_BG_WRITER] = {"bgwriter", BackgroundWriterMain, true}, - [B_CHECKPOINTER] = {"checkpointer", CheckpointerMain, true}, - [B_IO_WORKER] = {"io_worker", IoWorkerMain, true}, - [B_STARTUP] = {"startup", StartupProcessMain, true}, - [B_WAL_RECEIVER] = {"wal_receiver", WalReceiverMain, true}, - [B_WAL_SUMMARIZER] = {"wal_summarizer", WalSummarizerMain, true}, - [B_WAL_WRITER] = {"wal_writer", WalWriterMain, true}, - - [B_LOGGER] = {"syslogger", SysLoggerMain, false}, +#define PG_PROCTYPE(bktype, description, main_func, shmem_attach) \ + [bktype] = {description, main_func, shmem_attach}, +#include "postmaster/proctypelist.h" +#undef PG_PROCTYPE }; const char * @@ -227,7 +205,7 @@ PostmasterChildName(BackendType child_type) */ pid_t postmaster_child_launch(BackendType child_type, int child_slot, - const void *startup_data, size_t startup_data_len, + void *startup_data, size_t startup_data_len, ClientSocket *client_sock) { pid_t pid; @@ -280,7 +258,7 @@ postmaster_child_launch(BackendType child_type, int child_slot, MyPMChildSlot = child_slot; if (client_sock) { - MyClientSocket = palloc(sizeof(ClientSocket)); + MyClientSocket = palloc_object(ClientSocket); memcpy(MyClientSocket, client_sock, sizeof(ClientSocket)); } @@ -760,7 +738,9 @@ save_backend_variables(BackendParameters *param, #endif param->NamedLWLockTrancheRequests = NamedLWLockTrancheRequests; - param->NamedLWLockTrancheArray = NamedLWLockTrancheArray; + param->NamedLWLockTrancheRequestArray = NamedLWLockTrancheRequestArray; + param->LWLockTrancheNames = LWLockTrancheNames; + param->LWLockCounter = LWLockCounter; param->MainLWLockArray = MainLWLockArray; param->ProcStructLock = ProcStructLock; param->ProcGlobal = ProcGlobal; @@ -1020,7 +1000,9 @@ restore_backend_variables(BackendParameters *param) #endif NamedLWLockTrancheRequests = param->NamedLWLockTrancheRequests; - NamedLWLockTrancheArray = param->NamedLWLockTrancheArray; + NamedLWLockTrancheRequestArray = param->NamedLWLockTrancheRequestArray; + LWLockTrancheNames = 
param->LWLockTrancheNames; + LWLockCounter = param->LWLockCounter; MainLWLockArray = param->MainLWLockArray; ProcStructLock = param->ProcStructLock; ProcGlobal = param->ProcGlobal; diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 7e622ae4bd2a7..3a65d841725e2 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -185,8 +185,8 @@ PgArchShmemInit(void) /* * PgArchCanRestart * - * Return true and archiver is allowed to restart if enough time has - * passed since it was launched last to reach PGARCH_RESTART_INTERVAL. + * Return true, indicating archiver is allowed to restart, if enough time has + * passed since it was last launched to reach PGARCH_RESTART_INTERVAL. * Otherwise return false. * * This is a safety valve to protect against continuous respawn attempts if the @@ -201,15 +201,18 @@ PgArchCanRestart(void) time_t curtime = time(NULL); /* - * Return false and don't restart archiver if too soon since last archiver - * start. + * If first time through, or time somehow went backwards, always update + * last_pgarch_start_time to match the current clock and allow archiver + * start. Otherwise allow it only once enough time has elapsed. */ - if ((unsigned int) (curtime - last_pgarch_start_time) < - (unsigned int) PGARCH_RESTART_INTERVAL) - return false; - - last_pgarch_start_time = curtime; - return true; + if (last_pgarch_start_time == 0 || + curtime < last_pgarch_start_time || + curtime - last_pgarch_start_time >= PGARCH_RESTART_INTERVAL) + { + last_pgarch_start_time = curtime; + return true; + } + return false; } @@ -254,7 +257,7 @@ PgArchiverMain(const void *startup_data, size_t startup_data_len) PgArch->pgprocno = MyProcNumber; /* Create workspace for pgarch_readyXlog() */ - arch_files = palloc(sizeof(struct arch_files_state)); + arch_files = palloc_object(struct arch_files_state); arch_files->arch_files_size = 0; /* Initialize our max-heap for prioritizing files to archive. */ @@ -332,7 +335,8 @@ pgarch_MainLoop(void) * SIGUSR2 arrives. However, that means a random SIGTERM would * disable archiving indefinitely, which doesn't seem like a good * idea. If more than 60 seconds pass since SIGTERM, exit anyway, so - * that the postmaster can start a new archiver if needed. + * that the postmaster can start a new archiver if needed. Also exit + * if time unexpectedly goes backward. */ if (ShutdownRequestPending) { @@ -340,8 +344,8 @@ pgarch_MainLoop(void) if (last_sigterm_time == 0) last_sigterm_time = curtime; - else if ((unsigned int) (curtime - last_sigterm_time) >= - (unsigned int) 60) + else if (curtime < last_sigterm_time || + curtime - last_sigterm_time >= 60) break; } @@ -718,15 +722,15 @@ pgarch_readyXlog(char *xlog) /* * Store the file in our max-heap if it has a high enough priority. */ - if (arch_files->arch_heap->bh_size < NUM_FILES_PER_DIRECTORY_SCAN) + if (binaryheap_size(arch_files->arch_heap) < NUM_FILES_PER_DIRECTORY_SCAN) { /* If the heap isn't full yet, quickly add it. */ - arch_file = arch_files->arch_filenames[arch_files->arch_heap->bh_size]; + arch_file = arch_files->arch_filenames[binaryheap_size(arch_files->arch_heap)]; strcpy(arch_file, basename); binaryheap_add_unordered(arch_files->arch_heap, CStringGetDatum(arch_file)); /* If we just filled the heap, make it a valid one. 
*/ - if (arch_files->arch_heap->bh_size == NUM_FILES_PER_DIRECTORY_SCAN) + if (binaryheap_size(arch_files->arch_heap) == NUM_FILES_PER_DIRECTORY_SCAN) binaryheap_build(arch_files->arch_heap); } else if (ready_file_comparator(binaryheap_first(arch_files->arch_heap), @@ -744,21 +748,21 @@ pgarch_readyXlog(char *xlog) FreeDir(rldir); /* If no files were found, simply return. */ - if (arch_files->arch_heap->bh_size == 0) + if (binaryheap_empty(arch_files->arch_heap)) return false; /* * If we didn't fill the heap, we didn't make it a valid one. Do that * now. */ - if (arch_files->arch_heap->bh_size < NUM_FILES_PER_DIRECTORY_SCAN) + if (binaryheap_size(arch_files->arch_heap) < NUM_FILES_PER_DIRECTORY_SCAN) binaryheap_build(arch_files->arch_heap); /* * Fill arch_files array with the files to archive in ascending order of * priority. */ - arch_files->arch_files_size = arch_files->arch_heap->bh_size; + arch_files->arch_files_size = binaryheap_size(arch_files->arch_heap); for (int i = 0; i < arch_files->arch_files_size; i++) arch_files->arch_files[i] = DatumGetCString(binaryheap_remove_first(arch_files->arch_heap)); @@ -941,7 +945,7 @@ LoadArchiveLibrary(void) ereport(ERROR, (errmsg("archive modules must register an archive callback"))); - archive_module_state = (ArchiveModuleState *) palloc0(sizeof(ArchiveModuleState)); + archive_module_state = palloc0_object(ArchiveModuleState); if (ArchiveCallbacks->startup_cb != NULL) ArchiveCallbacks->startup_cb(archive_module_state); diff --git a/src/backend/postmaster/pmchild.c b/src/backend/postmaster/pmchild.c index cde1d23a4ca8b..584bb58c8abaf 100644 --- a/src/backend/postmaster/pmchild.c +++ b/src/backend/postmaster/pmchild.c @@ -59,6 +59,17 @@ NON_EXEC_STATIC int num_pmchild_slots = 0; */ dlist_head ActiveChildList; +/* + * Dummy pointer to persuade Valgrind that we've not leaked the array of + * PMChild structs. Make it global to ensure the compiler doesn't + * optimize it away. + */ +#ifdef USE_VALGRIND +extern PMChild *pmchild_array; +PMChild *pmchild_array; +#endif + + /* * MaxLivePostmasterChildren * @@ -125,8 +136,13 @@ InitPostmasterChildSlots(void) for (int i = 0; i < BACKEND_NUM_TYPES; i++) num_pmchild_slots += pmchild_pools[i].size; - /* Initialize them */ + /* Allocate enough slots, and make sure Valgrind doesn't complain */ slots = palloc(num_pmchild_slots * sizeof(PMChild)); +#ifdef USE_VALGRIND + pmchild_array = slots; +#endif + + /* Initialize them */ slotno = 0; for (int btype = 0; btype < BACKEND_NUM_TYPES; btype++) { diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 490f7ce36645b..cf44a67718718 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -854,6 +854,9 @@ PostmasterMain(int argc, char *argv[]) if (summarize_wal && wal_level == WAL_LEVEL_MINIMAL) ereport(ERROR, (errmsg("WAL cannot be summarized when \"wal_level\" is \"minimal\""))); + if (sync_replication_slots && wal_level == WAL_LEVEL_MINIMAL) + ereport(ERROR, + (errmsg("replication slot synchronization (\"sync_replication_slots\" = on) requires \"wal_level\" to be \"replica\" or \"logical\""))); /* * Other one-time internal sanity checks can go here, if they are fast. 
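Several hunks in this patch (PgArchCanRestart(), pgarch_MainLoop(), and DetermineSleepTime() just below) replace unsigned-subtraction tests with explicit guards so that a system clock jumping backwards cannot stall a restart or produce a bogus interval. The pattern reduces to a small standalone sketch (the interval value is illustrative; not the backend code):

#include <stdbool.h>
#include <time.h>

#define RESTART_INTERVAL 10		/* seconds; illustrative only */

static time_t last_start_time = 0;

/*
 * Allow the action on first use, whenever the clock appears to have gone
 * backwards, or once RESTART_INTERVAL seconds have elapsed; refuse otherwise.
 */
static bool
can_restart(void)
{
	time_t		curtime = time(NULL);

	if (last_start_time == 0 ||
		curtime < last_start_time ||	/* time went backwards */
		curtime - last_start_time >= RESTART_INTERVAL)
	{
		last_start_time = curtime;
		return true;
	}
	return false;
}

int
main(void)
{
	/* First call always succeeds; an immediate retry is throttled. */
	return (can_restart() && !can_restart()) ? 0 : 1;
}

The key point is that the comparison is done on signed time_t values before any subtraction, so a backwards jump is detected rather than wrapping into a huge unsigned difference.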
@@ -877,7 +880,7 @@ PostmasterMain(int argc, char *argv[])
 	/* For debugging: display postmaster environment */
 	if (message_level_is_interesting(DEBUG3))
 	{
-#if !defined(WIN32) || defined(_MSC_VER)
+#if !defined(WIN32)
 		extern char **environ;
 #endif
 		char	  **p;
@@ -1554,13 +1557,21 @@ DetermineSleepTime(void)
 {
 	if (AbortStartTime != 0)
 	{
+		time_t		curtime = time(NULL);
 		int			seconds;
 
-		/* time left to abort; clamp to 0 in case it already expired */
-		seconds = SIGKILL_CHILDREN_AFTER_SECS -
-			(time(NULL) - AbortStartTime);
+		/*
+		 * time left to abort; clamp to 0 if it already expired, or if
+		 * time goes backwards
+		 */
+		if (curtime < AbortStartTime ||
+			curtime - AbortStartTime >= SIGKILL_CHILDREN_AFTER_SECS)
+			seconds = 0;
+		else
+			seconds = SIGKILL_CHILDREN_AFTER_SECS -
+				(curtime - AbortStartTime);
 
-		return Max(seconds * 1000, 0);
+		return seconds * 1000;
 	}
 	else
 		return 60 * 1000;
@@ -2630,6 +2641,13 @@ CleanupBackend(PMChild *bp,
 	}
 	bp = NULL;
 
+	/*
+	 * In a crash case, exit immediately without resetting background worker
+	 * state.  However, if restart_after_crash is enabled, the background
+	 * worker state (e.g., rw_pid) still needs to be reset so the worker can
+	 * restart after crash recovery.  This reset is handled in
+	 * ResetBackgroundWorkerCrashTimes(), not here.
+	 */
 	if (crashed)
 	{
 		HandleChildCrash(bp_pid, exitstatus, procname);
@@ -3373,7 +3391,7 @@ LaunchMissingBackgroundProcesses(void)
 		Shutdown <= SmartShutdown)
 	{
 		WalReceiverPMChild = StartChildProcess(B_WAL_RECEIVER);
-		if (WalReceiverPMChild != 0)
+		if (WalReceiverPMChild != NULL)
 			WalReceiverRequested = false;
 		/* else leave the flag set, so we'll try again later */
 	}
@@ -4337,15 +4355,15 @@ maybe_start_bgworkers(void)
 static bool
 maybe_reap_io_worker(int pid)
 {
-	for (int id = 0; id < MAX_IO_WORKERS; ++id)
+	for (int i = 0; i < MAX_IO_WORKERS; ++i)
 	{
-		if (io_worker_children[id] &&
-			io_worker_children[id]->pid == pid)
+		if (io_worker_children[i] &&
+			io_worker_children[i]->pid == pid)
 		{
-			ReleasePostmasterChildSlot(io_worker_children[id]);
+			ReleasePostmasterChildSlot(io_worker_children[i]);
 			--io_worker_count;
-			io_worker_children[id] = NULL;
+			io_worker_children[i] = NULL;
 			return true;
 		}
 	}
@@ -4389,22 +4407,22 @@ maybe_adjust_io_workers(void)
 	while (io_worker_count < io_workers)
 	{
 		PMChild    *child;
-		int			id;
+		int			i;
 
 		/* find unused entry in io_worker_children array */
-		for (id = 0; id < MAX_IO_WORKERS; ++id)
+		for (i = 0; i < MAX_IO_WORKERS; ++i)
 		{
-			if (io_worker_children[id] == NULL)
+			if (io_worker_children[i] == NULL)
 				break;
 		}
-		if (id == MAX_IO_WORKERS)
-			elog(ERROR, "could not find a free IO worker ID");
+		if (i == MAX_IO_WORKERS)
+			elog(ERROR, "could not find a free IO worker slot");
 
		/* Try to launch one.
*/ child = StartChildProcess(B_IO_WORKER); if (child != NULL) { - io_worker_children[id] = child; + io_worker_children[i] = child; ++io_worker_count; } else @@ -4415,11 +4433,11 @@ maybe_adjust_io_workers(void) if (io_worker_count > io_workers) { /* ask the IO worker in the highest slot to exit */ - for (int id = MAX_IO_WORKERS - 1; id >= 0; --id) + for (int i = MAX_IO_WORKERS - 1; i >= 0; --i) { - if (io_worker_children[id] != NULL) + if (io_worker_children[i] != NULL) { - kill(io_worker_children[id]->pid, SIGUSR2); + kill(io_worker_children[i]->pid, SIGUSR2); break; } } @@ -4544,7 +4562,7 @@ pgwin32_register_deadchild_callback(HANDLE procHandle, DWORD procId) { win32_deadchild_waitinfo *childinfo; - childinfo = palloc(sizeof(win32_deadchild_waitinfo)); + childinfo = palloc_object(win32_deadchild_waitinfo); childinfo->procHandle = procHandle; childinfo->procId = procId; diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c index 50c2edec1f611..526ad053a458f 100644 --- a/src/backend/postmaster/syslogger.c +++ b/src/backend/postmaster/syslogger.c @@ -960,7 +960,7 @@ process_pipe_input(char *logbuffer, int *bytes_in_logbuffer) * Need a free slot, but there isn't one in the list, * so create a new one and extend the list with it. */ - free_slot = palloc(sizeof(save_buffer)); + free_slot = palloc_object(save_buffer); buffer_list = lappend(buffer_list, free_slot); buffer_lists[p.pid % NBUFFER_LISTS] = buffer_list; } diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 0fec4f1f871ce..e7e4d652f9720 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -342,7 +342,7 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len) * If we discover that WAL summarization is not enabled, just exit. */ current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact); - if (XLogRecPtrIsInvalid(current_lsn)) + if (!XLogRecPtrIsValid(current_lsn)) proc_exit(0); /* @@ -379,13 +379,13 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len) * only have to do this once per timeline switch, we probably wouldn't * save any significant amount of work in practice. */ - if (current_tli != latest_tli && XLogRecPtrIsInvalid(switch_lsn)) + if (current_tli != latest_tli && !XLogRecPtrIsValid(switch_lsn)) { List *tles = readTimeLineHistory(latest_tli); switch_lsn = tliSwitchPoint(current_tli, tles, &switch_tli); ereport(DEBUG1, - errmsg_internal("switch point from TLI %u to TLI %u is at %X/%X", + errmsg_internal("switch point from TLI %u to TLI %u is at %X/%08X", current_tli, switch_tli, LSN_FORMAT_ARGS(switch_lsn))); } @@ -394,7 +394,7 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len) * on this timeline. Switch to the next timeline and go around again, * backing up to the exact switch point if we passed it. */ - if (!XLogRecPtrIsInvalid(switch_lsn) && current_lsn >= switch_lsn) + if (XLogRecPtrIsValid(switch_lsn) && current_lsn >= switch_lsn) { /* Restart summarization from switch point.
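
Several hunks in this patch swap explicit "(Type *) palloc(sizeof(Type))" calls for the palloc_object()/palloc0_object() wrappers. Assuming they follow the usual shape of such macros (the real definitions live in utils/palloc.h), the point is that the cast and the sizeof are derived from a single type name and so cannot silently disagree:

/* Sketch of the wrapper macros; illustrative, not copied from the tree. */
#define palloc_object(type)		((type *) palloc(sizeof(type)))
#define palloc0_object(type)	((type *) palloc0(sizeof(type)))

/*
 * Before: childinfo = palloc(sizeof(win32_deadchild_waitinfo));
 * After:  childinfo = palloc_object(win32_deadchild_waitinfo);
 *
 * A hand-written "(Foo *) palloc(sizeof(Bar))" compiles without complaint;
 * the macro form makes that kind of mismatch impossible.
 */
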
*/ current_tli = switch_tli; @@ -419,7 +419,7 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len) end_of_summary_lsn = SummarizeWAL(current_tli, current_lsn, exact, switch_lsn, latest_lsn); - Assert(!XLogRecPtrIsInvalid(end_of_summary_lsn)); + Assert(XLogRecPtrIsValid(end_of_summary_lsn)); Assert(end_of_summary_lsn >= current_lsn); /* @@ -644,7 +644,7 @@ WakeupWalSummarizer(void) if (WalSummarizerCtl == NULL) return; - LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); + LWLockAcquire(WALSummarizerLock, LW_SHARED); pgprocno = WalSummarizerCtl->summarizer_pgprocno; LWLockRelease(WALSummarizerLock); @@ -685,7 +685,7 @@ WaitForWalSummarization(XLogRecPtr lsn) /* * If the LSN summarized on disk has reached the target value, stop. */ - LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); + LWLockAcquire(WALSummarizerLock, LW_SHARED); summarized_lsn = WalSummarizerCtl->summarized_lsn; pending_lsn = WalSummarizerCtl->pending_lsn; LWLockRelease(WALSummarizerLock); @@ -741,7 +741,7 @@ WaitForWalSummarization(XLogRecPtr lsn) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("WAL summarization is not progressing"), - errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.", + errdetail("Summarization is needed through %X/%08X, but is stuck at %X/%08X on disk and %X/%08X in memory.", LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(summarized_lsn), LSN_FORMAT_ARGS(pending_lsn)))); @@ -755,12 +755,12 @@ WaitForWalSummarization(XLogRecPtr lsn) current_time) / 1000; ereport(WARNING, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg_plural("still waiting for WAL summarization through %X/%X after %ld second", - "still waiting for WAL summarization through %X/%X after %ld seconds", + errmsg_plural("still waiting for WAL summarization through %X/%08X after %ld second", + "still waiting for WAL summarization through %X/%08X after %ld seconds", elapsed_seconds, LSN_FORMAT_ARGS(lsn), elapsed_seconds), - errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.", + errdetail("Summarization has reached %X/%08X on disk and %X/%08X in memory.", LSN_FORMAT_ARGS(summarized_lsn), LSN_FORMAT_ARGS(pending_lsn)))); } @@ -920,10 +920,9 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, bool fast_forward = true; /* Initialize private data for xlogreader. */ - private_data = (SummarizerReadLocalXLogPrivate *) - palloc0(sizeof(SummarizerReadLocalXLogPrivate)); + private_data = palloc0_object(SummarizerReadLocalXLogPrivate); private_data->tli = tli; - private_data->historic = !XLogRecPtrIsInvalid(switch_lsn); + private_data->historic = XLogRecPtrIsValid(switch_lsn); private_data->read_upto = maximum_lsn; /* Create xlogreader. 
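
The pervasive %X/%X to %X/%08X message changes in this file are purely cosmetic: an LSN is a 64-bit position printed as two 32-bit hex halves, and zero-padding the low half to eight digits keeps printed LSNs a fixed width so nearby values align and order lexically. A sketch, assuming the conventional definition of LSN_FORMAT_ARGS:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;

/* Conventional split of an LSN into its high and low 32-bit halves. */
#define LSN_FORMAT_ARGS(lsn) ((uint32_t) ((lsn) >> 32)), ((uint32_t) (lsn))

int
main(void)
{
	XLogRecPtr	lsn = UINT64_C(0x200000028);

	printf("%X/%X\n", LSN_FORMAT_ARGS(lsn));	/* "2/28": old style */
	printf("%X/%08X\n", LSN_FORMAT_ARGS(lsn));	/* "2/00000028": new style */
	return 0;
}
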
*/ @@ -971,7 +970,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, else { summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn); - if (XLogRecPtrIsInvalid(summary_start_lsn)) + if (!XLogRecPtrIsValid(summary_start_lsn)) { /* * If we hit end-of-WAL while trying to find the next valid @@ -981,7 +980,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, if (private_data->end_of_wal) { ereport(DEBUG1, - errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X", + errmsg_internal("could not read WAL from timeline %u at %X/%08X: end of WAL at %X/%08X", tli, LSN_FORMAT_ARGS(start_lsn), LSN_FORMAT_ARGS(private_data->read_upto))); @@ -1000,8 +999,8 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, } else ereport(ERROR, - (errmsg("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(start_lsn)))); + errmsg("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(start_lsn))); } /* We shouldn't go backward. */ @@ -1034,7 +1033,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, * able to read a complete record. */ ereport(DEBUG1, - errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X", + errmsg_internal("could not read WAL from timeline %u at %X/%08X: end of WAL at %X/%08X", tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr), LSN_FORMAT_ARGS(private_data->read_upto))); @@ -1045,20 +1044,20 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, if (errormsg) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL from timeline %u at %X/%X: %s", + errmsg("could not read WAL from timeline %u at %X/%08X: %s", tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr), errormsg))); else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL from timeline %u at %X/%X", + errmsg("could not read WAL from timeline %u at %X/%08X", tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr)))); } /* We shouldn't go backward. */ Assert(summary_start_lsn <= xlogreader->EndRecPtr); - if (!XLogRecPtrIsInvalid(switch_lsn) && + if (XLogRecPtrIsValid(switch_lsn) && xlogreader->ReadRecPtr >= switch_lsn) { /* @@ -1180,7 +1179,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, * If we have a switch LSN and have reached it, stop before reading * the next record. */ - if (!XLogRecPtrIsInvalid(switch_lsn) && + if (XLogRecPtrIsValid(switch_lsn) && xlogreader->EndRecPtr >= switch_lsn) break; } @@ -1222,7 +1221,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, /* Tell the user what we did. */ ereport(DEBUG1, - errmsg_internal("summarized WAL on TLI %u from %X/%X to %X/%X", + errmsg_internal("summarized WAL on TLI %u from %X/%08X to %X/%08X", tli, LSN_FORMAT_ARGS(summary_start_lsn), LSN_FORMAT_ARGS(summary_end_lsn))); @@ -1234,7 +1233,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, /* If we skipped a non-zero amount of WAL, log a debug message. */ if (summary_end_lsn > summary_start_lsn && fast_forward) ereport(DEBUG1, - errmsg_internal("skipped summarizing WAL on TLI %u from %X/%X to %X/%X", + errmsg_internal("skipped summarizing WAL on TLI %u from %X/%08X to %X/%08X", tli, LSN_FORMAT_ARGS(summary_start_lsn), LSN_FORMAT_ARGS(summary_end_lsn))); @@ -1580,7 +1579,7 @@ summarizer_read_local_xlog_page(XLogReaderState *state, /* Debugging output. 
*/ ereport(DEBUG1, - errmsg_internal("timeline %u became historic, can read up to %X/%X", + errmsg_internal("timeline %u became historic, can read up to %X/%08X", private_data->tli, LSN_FORMAT_ARGS(private_data->read_upto))); } @@ -1723,7 +1722,7 @@ MaybeRemoveOldWalSummaries(void) * If the WAL doesn't exist any more, we can remove it if the file * modification time is old enough. */ - if (XLogRecPtrIsInvalid(oldest_lsn) || ws->end_lsn <= oldest_lsn) + if (!XLogRecPtrIsValid(oldest_lsn) || ws->end_lsn <= oldest_lsn) RemoveWalSummaryIfOlderThan(ws, cutoff_time); /* diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c index 77d1ce28168b2..847abcc35b321 100644 --- a/src/backend/regex/regc_locale.c +++ b/src/backend/regex/regc_locale.c @@ -453,7 +453,7 @@ range(struct vars *v, /* context */ for (c = a; c <= b; c++) { - cc = pg_wc_tolower(c); + cc = regc_wc_tolower(c); if (cc != c && (before(cc, a) || before(b, cc))) { @@ -464,7 +464,7 @@ range(struct vars *v, /* context */ } addchr(cv, cc); } - cc = pg_wc_toupper(c); + cc = regc_wc_toupper(c); if (cc != c && (before(cc, a) || before(b, cc))) { @@ -562,7 +562,7 @@ lookupcclass(struct vars *v, /* context (for returning errors) */ * Must include case counterparts if "cases" is true. * * The returned cvec might be either a transient cvec gotten from getcvec(), - * or a permanently cached one from pg_ctype_get_cache(). This is okay + * or a permanently cached one from regc_ctype_get_cache(). This is okay * because callers are not supposed to explicitly free the result either way. */ static struct cvec * @@ -584,7 +584,7 @@ cclasscvec(struct vars *v, /* context */ /* * Now compute the character class contents. For classes that are based * on the behavior of a <wctype.h> or <ctype.h> function, we use - * pg_ctype_get_cache so that we can cache the results. Other classes + * regc_ctype_get_cache so that we can cache the results. Other classes * have definitions that are hard-wired here, and for those we just * construct a transient cvec on the fly.
* @@ -594,16 +594,16 @@ cclasscvec(struct vars *v, /* context */ switch (cclasscode) { case CC_PRINT: - cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode); + cv = regc_ctype_get_cache(regc_wc_isprint, cclasscode); break; case CC_ALNUM: - cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode); + cv = regc_ctype_get_cache(regc_wc_isalnum, cclasscode); break; case CC_ALPHA: - cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode); + cv = regc_ctype_get_cache(regc_wc_isalpha, cclasscode); break; case CC_WORD: - cv = pg_ctype_get_cache(pg_wc_isword, cclasscode); + cv = regc_ctype_get_cache(regc_wc_isword, cclasscode); break; case CC_ASCII: /* hard-wired meaning */ @@ -624,10 +624,10 @@ cclasscvec(struct vars *v, /* context */ addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: - cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode); + cv = regc_ctype_get_cache(regc_wc_isdigit, cclasscode); break; case CC_PUNCT: - cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode); + cv = regc_ctype_get_cache(regc_wc_ispunct, cclasscode); break; case CC_XDIGIT: @@ -645,16 +645,16 @@ cclasscvec(struct vars *v, /* context */ } break; case CC_SPACE: - cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode); + cv = regc_ctype_get_cache(regc_wc_isspace, cclasscode); break; case CC_LOWER: - cv = pg_ctype_get_cache(pg_wc_islower, cclasscode); + cv = regc_ctype_get_cache(regc_wc_islower, cclasscode); break; case CC_UPPER: - cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode); + cv = regc_ctype_get_cache(regc_wc_isupper, cclasscode); break; case CC_GRAPH: - cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode); + cv = regc_ctype_get_cache(regc_wc_isgraph, cclasscode); break; } @@ -679,29 +679,29 @@ cclass_column_index(struct colormap *cm, chr c) * Note: we should not see requests to consider cclasses that are not * treated as locale-specific by cclasscvec(), above. 
*/ - if (cm->classbits[CC_PRINT] && pg_wc_isprint(c)) + if (cm->classbits[CC_PRINT] && regc_wc_isprint(c)) colnum |= cm->classbits[CC_PRINT]; - if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c)) + if (cm->classbits[CC_ALNUM] && regc_wc_isalnum(c)) colnum |= cm->classbits[CC_ALNUM]; - if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c)) + if (cm->classbits[CC_ALPHA] && regc_wc_isalpha(c)) colnum |= cm->classbits[CC_ALPHA]; - if (cm->classbits[CC_WORD] && pg_wc_isword(c)) + if (cm->classbits[CC_WORD] && regc_wc_isword(c)) colnum |= cm->classbits[CC_WORD]; assert(cm->classbits[CC_ASCII] == 0); assert(cm->classbits[CC_BLANK] == 0); assert(cm->classbits[CC_CNTRL] == 0); - if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c)) + if (cm->classbits[CC_DIGIT] && regc_wc_isdigit(c)) colnum |= cm->classbits[CC_DIGIT]; - if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c)) + if (cm->classbits[CC_PUNCT] && regc_wc_ispunct(c)) colnum |= cm->classbits[CC_PUNCT]; assert(cm->classbits[CC_XDIGIT] == 0); - if (cm->classbits[CC_SPACE] && pg_wc_isspace(c)) + if (cm->classbits[CC_SPACE] && regc_wc_isspace(c)) colnum |= cm->classbits[CC_SPACE]; - if (cm->classbits[CC_LOWER] && pg_wc_islower(c)) + if (cm->classbits[CC_LOWER] && regc_wc_islower(c)) colnum |= cm->classbits[CC_LOWER]; - if (cm->classbits[CC_UPPER] && pg_wc_isupper(c)) + if (cm->classbits[CC_UPPER] && regc_wc_isupper(c)) colnum |= cm->classbits[CC_UPPER]; - if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c)) + if (cm->classbits[CC_GRAPH] && regc_wc_isgraph(c)) colnum |= cm->classbits[CC_GRAPH]; return colnum; @@ -721,8 +721,8 @@ allcases(struct vars *v, /* context */ chr lc, uc; - lc = pg_wc_tolower(c); - uc = pg_wc_toupper(c); + lc = regc_wc_tolower(c); + uc = regc_wc_toupper(c); cv = getcvec(v, 2, 0); addchr(cv, lc); @@ -760,7 +760,7 @@ casecmp(const chr *x, const chr *y, /* strings to compare */ { for (; len > 0; len--, x++, y++) { - if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y))) + if ((*x != *y) && (regc_wc_tolower(*x) != regc_wc_tolower(*y))) return 1; } return 0; } diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 78193cfb964e5..bb0e3f1d13920 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -19,203 +19,10 @@ #include "common/unicode_case.h" #include "common/unicode_category.h" #include "utils/pg_locale.h" +#include "utils/pg_locale_c.h" -/* - * For the libc provider, to provide as much functionality as possible on a - * variety of platforms without going so far as to implement everything from - * scratch, we use several implementation strategies depending on the - * situation: - * - * 1. In C/POSIX collations, we use hard-wired code. We can't depend on - * the <ctype.h> functions since those will obey LC_CTYPE. Note that these - * collations don't give a fig about multibyte characters. - * - * 2. When working in UTF8 encoding, we use the <wctype.h> functions. - * This assumes that every platform uses Unicode codepoints directly - * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption - * even for non-UTF8 encodings, which may be a problem.) On some platforms - * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. - * - * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar - * values up to 255, and punt for values above that. This is 100% correct - * only in single-byte encodings such as LATINn.
However, non-Unicode - * multibyte encodings are mostly Far Eastern character sets for which the - * properties being tested here aren't very relevant for higher code values - * anyway. The difficulty with using the <wctype.h> functions with - * non-Unicode multibyte encodings is that we can have no certainty that - * the platform's wchar_t representation matches what we do in pg_wchar - * conversions. - * - * As a special case, in the "default" collation, (2) and (3) force ASCII - * letters to follow ASCII upcase/downcase rules, while in a non-default - * collation we just let the library functions do what they will. The case - * where this matters is treatment of I/i in Turkish, and the behavior is - * meant to match the upper()/lower() SQL functions. - * - * We store the active collation setting in static variables. In principle - * it could be passed down to here via the regex library's "struct vars" data - * structure; but that would require somewhat invasive changes in the regex - * library, and right now there's no real benefit to be gained from that. - * - * NB: the coding here assumes pg_wchar is an unsigned type. - */ - -typedef enum -{ - PG_REGEX_STRATEGY_C, /* C locale (encoding independent) */ - PG_REGEX_STRATEGY_BUILTIN, /* built-in Unicode semantics */ - PG_REGEX_STRATEGY_LIBC_WIDE, /* Use locale_t <wctype.h> functions */ - PG_REGEX_STRATEGY_LIBC_1BYTE, /* Use locale_t <ctype.h> functions */ - PG_REGEX_STRATEGY_ICU, /* Use ICU uchar.h functions */ -} PG_Locale_Strategy; - -static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; -/* - * Hard-wired character properties for C locale - */ -#define PG_ISDIGIT 0x01 -#define PG_ISALPHA 0x02 -#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA) -#define PG_ISUPPER 0x04 -#define PG_ISLOWER 0x08 -#define PG_ISGRAPH 0x10 -#define PG_ISPRINT 0x20 -#define PG_ISPUNCT 0x40 -#define PG_ISSPACE 0x80 - -static const unsigned char pg_char_properties[128] = { - /* NUL */ 0, - /* ^A */ 0, - /* ^B */ 0, - /* ^C */ 0, - /* ^D */ 0, - /* ^E */ 0, - /* ^F */ 0, - /* ^G */ 0, - /* ^H */ 0, - /* ^I */ PG_ISSPACE, - /* ^J */ PG_ISSPACE, - /* ^K */ PG_ISSPACE, - /* ^L */ PG_ISSPACE, - /* ^M */ PG_ISSPACE, - /* ^N */ 0, - /* ^O */ 0, - /* ^P */ 0, - /* ^Q */ 0, - /* ^R */ 0, - /* ^S */ 0, - /* ^T */ 0, - /* ^U */ 0, - /* ^V */ 0, - /* ^W */ 0, - /* ^X */ 0, - /* ^Y */ 0, - /* ^Z */ 0, - /* ^[ */ 0, - /* ^\ */ 0, - /* ^] */ 0, - /* ^^ */ 0, - /* ^_ */ 0, - /* */ PG_ISPRINT | PG_ISSPACE, - /* ! */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* " */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* # */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* $ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* % */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* & */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* ' */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* ( */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* ) */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* * */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* + */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* , */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* - */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* .
*/ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* / */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* 0 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* 1 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* 2 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* 3 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* 4 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* 5 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* 6 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* 7 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* 8 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* 9 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, - /* : */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* ; */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* < */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* = */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* > */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* ? */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* @ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* A */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* B */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* C */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* D */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* E */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* F */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* G */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* H */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* I */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* J */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* K */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* L */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* M */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* N */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* O */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* P */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* Q */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* R */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* S */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* T */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* U */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* V */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* W */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* X */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* Y */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* Z */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, - /* [ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* \ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* ] */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* ^ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* _ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* ` */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* a */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* b */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* c */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* d */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* e */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* f */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* g */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* h */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* i */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* j */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* k */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* l */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* m */ PG_ISALPHA | PG_ISLOWER | 
PG_ISGRAPH | PG_ISPRINT, - /* n */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* o */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* p */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* q */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* r */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* s */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* t */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* u */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* v */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* w */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* x */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* y */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* z */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, - /* { */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* | */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* } */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* ~ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, - /* DEL */ 0 -}; - /* * pg_set_regex_collation: set collation for these functions to obey @@ -228,7 +35,6 @@ void pg_set_regex_collation(Oid collation) { pg_locale_t locale = 0; - PG_Locale_Strategy strategy; if (!OidIsValid(collation)) { @@ -242,377 +48,144 @@ pg_set_regex_collation(Oid collation) errhint("Use the COLLATE clause to set the collation explicitly."))); } - if (collation == C_COLLATION_OID) - { - /* - * Some callers expect regexes to work for C_COLLATION_OID before - * catalog access is available, so we can't call - * pg_newlocale_from_collation(). - */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; - } - else - { - locale = pg_newlocale_from_collation(collation); - - if (!locale->deterministic) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("nondeterministic collations are not supported for regular expressions"))); + locale = pg_newlocale_from_collation(collation); - if (locale->ctype_is_c) - { - /* - * C/POSIX collations use this path regardless of database - * encoding - */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; - } - else if (locale->provider == COLLPROVIDER_BUILTIN) - { - Assert(GetDatabaseEncoding() == PG_UTF8); - strategy = PG_REGEX_STRATEGY_BUILTIN; - } -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - { - strategy = PG_REGEX_STRATEGY_ICU; - } -#endif - else - { - Assert(locale->provider == COLLPROVIDER_LIBC); - if (GetDatabaseEncoding() == PG_UTF8) - strategy = PG_REGEX_STRATEGY_LIBC_WIDE; - else - strategy = PG_REGEX_STRATEGY_LIBC_1BYTE; - } - } + if (!locale->deterministic) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("nondeterministic collations are not supported for regular expressions"))); - pg_regex_strategy = strategy; pg_regex_locale = locale; } +/* + * The following functions overlap with those defined in pg_locale.c. XXX: + * consider refactor. 
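
The rewrite that begins here collapses each five-way strategy switch into two cases: hard-wired ASCII tables when the collation's ctype is C, and otherwise a call through a method table hung off the pg_locale_t. A hedged sketch of that dispatch shape, with type and field names simplified from the patch:

typedef unsigned int pg_wchar;
typedef struct pg_locale *pg_locale_t;

struct ctype_methods
{
	int			(*wc_isdigit) (pg_wchar c, pg_locale_t locale);
	pg_wchar	(*wc_toupper) (pg_wchar c, pg_locale_t locale);
	/* ... one slot per classification/case-mapping primitive ... */
};

struct pg_locale
{
	int			ctype_is_c;		/* C/POSIX ctype: use hard-wired ASCII */
	const struct ctype_methods *ctype;	/* provider-specific callbacks */
};

static int
wc_isdigit(pg_wchar c, pg_locale_t locale)
{
	if (locale->ctype_is_c)
		return c >= '0' && c <= '9';	/* stand-in for the ASCII table */
	return locale->ctype->wc_isdigit(c, locale);
}
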
+ */ + static int -pg_wc_isdigit(pg_wchar c) +regc_wc_isdigit(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISDIGIT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isdigit(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISDIGIT)); + else + return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale); } static int -pg_wc_isalpha(pg_wchar c) +regc_wc_isalpha(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALPHA)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalpha(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalpha(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALPHA)); + else + return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale); } static int -pg_wc_isalnum(pg_wchar c) +regc_wc_isalnum(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALNUM)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalnum(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALNUM)); + else + return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale); } static int -pg_wc_isword(pg_wchar c) +regc_wc_isword(pg_wchar c) { /* We define word characters as alnum class plus underscore */ if (c == CHR('_')) return 1; - return pg_wc_isalnum(c); + return regc_wc_isalnum(c); } static int -pg_wc_isupper(pg_wchar c) +regc_wc_isupper(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISUPPER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isupper(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - 
isupper_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isupper(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISUPPER)); + else + return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale); } static int -pg_wc_islower(pg_wchar c) +regc_wc_islower(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISLOWER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_islower(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - islower_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_islower(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISLOWER)); + else + return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale); } static int -pg_wc_isgraph(pg_wchar c) +regc_wc_isgraph(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISGRAPH)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isgraph(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isgraph(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISGRAPH)); + else + return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale); } static int -pg_wc_isprint(pg_wchar c) +regc_wc_isprint(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPRINT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isprint(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswprint_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isprint_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isprint(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPRINT)); + else + return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale); } static int -pg_wc_ispunct(pg_wchar c) +regc_wc_ispunct(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPUNCT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswpunct_l((wint_t) c, 
pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_ispunct(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPUNCT)); + else + return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale); } static int -pg_wc_isspace(pg_wchar c) +regc_wc_isspace(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISSPACE)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isspace(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswspace_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isspace_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isspace(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISSPACE)); + else + return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale); } static pg_wchar -pg_wc_toupper(pg_wchar c) +regc_wc_toupper(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_uppercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - if (c <= (pg_wchar) UCHAR_MAX) - return toupper_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_toupper(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale); } static pg_wchar -pg_wc_tolower(pg_wchar c) +regc_wc_tolower(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_lowercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return 
pg_ascii_tolower((unsigned char) c); - if (c <= (pg_wchar) UCHAR_MAX) - return tolower_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_tolower(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale); } @@ -629,11 +202,11 @@ pg_wc_tolower(pg_wchar c) * the main regex code expects us to return a failure indication instead. */ -typedef int (*pg_wc_probefunc) (pg_wchar c); +typedef int (*regc_wc_probefunc) (pg_wchar c); typedef struct pg_ctype_cache { - pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */ + regc_wc_probefunc probefunc; /* regc_wc_isalpha or a sibling */ pg_locale_t locale; /* locale this entry is for */ struct cvec cv; /* cache entry contents */ struct pg_ctype_cache *next; /* chain link */ @@ -682,14 +255,14 @@ store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs) } /* - * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all + * Given a probe function (e.g., regc_wc_isalpha) get a struct cvec for all * chrs satisfying the probe function. The active collation is the one * previously set by pg_set_regex_collation. Return NULL if out of memory. * * Note that the result must not be freed or modified by caller. */ static struct cvec * -pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) +regc_ctype_get_cache(regc_wc_probefunc probefunc, int cclasscode) { pg_ctype_cache *pcc; pg_wchar max_chr; @@ -738,37 +311,27 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) * would always be true for production values of MAX_SIMPLE_CHR, but it's * useful to allow it to be small for testing purposes.) 
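
In regc_ctype_get_cache() below, the probe range now follows from just two facts, whether the ctype is C and whether the database encoding is UTF-8, rather than from the old five-way strategy enum. A condensed sketch of that selection, ignoring the small-MAX_SIMPLE_CHR testing case handled by the #if blocks:

#define MAX_SIMPLE_CHR 0x2FFFF	/* illustrative production-style value */

static unsigned int
probe_limit(int ctype_is_c, int encoding_is_utf8)
{
	if (ctype_is_c)
		return 127;				/* ASCII only; nothing above can match */
	if (encoding_is_utf8)
		return MAX_SIMPLE_CHR;	/* probe the full simple-chr range */
	return 255;					/* other encodings: single bytes only */
}
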
*/ - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: #if MAX_SIMPLE_CHR >= 127 - max_chr = (pg_wchar) 127; - pcc->cv.cclasscode = -1; + max_chr = (pg_wchar) 127; + pcc->cv.cclasscode = -1; #else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; + max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif - break; - case PG_REGEX_STRATEGY_BUILTIN: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_WIDE: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_1BYTE: + } + else if (GetDatabaseEncoding() == PG_UTF8) + { + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + } + else + { #if MAX_SIMPLE_CHR >= UCHAR_MAX - max_chr = (pg_wchar) UCHAR_MAX; - pcc->cv.cclasscode = -1; + max_chr = (pg_wchar) UCHAR_MAX; + pcc->cv.cclasscode = -1; #else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; + max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif - break; - case PG_REGEX_STRATEGY_ICU: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - default: - Assert(false); - max_chr = 0; /* can't get here, but keep compiler quiet */ - break; } /* diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 15b264e50f1a7..3e18e4a78a202 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -249,18 +249,18 @@ static struct cvec *getcvec(struct vars *v, int nchrs, int nranges); static void freecvec(struct cvec *cv); /* === regc_pg_locale.c === */ -static int pg_wc_isdigit(pg_wchar c); -static int pg_wc_isalpha(pg_wchar c); -static int pg_wc_isalnum(pg_wchar c); -static int pg_wc_isword(pg_wchar c); -static int pg_wc_isupper(pg_wchar c); -static int pg_wc_islower(pg_wchar c); -static int pg_wc_isgraph(pg_wchar c); -static int pg_wc_isprint(pg_wchar c); -static int pg_wc_ispunct(pg_wchar c); -static int pg_wc_isspace(pg_wchar c); -static pg_wchar pg_wc_toupper(pg_wchar c); -static pg_wchar pg_wc_tolower(pg_wchar c); +static int regc_wc_isdigit(pg_wchar c); +static int regc_wc_isalpha(pg_wchar c); +static int regc_wc_isalnum(pg_wchar c); +static int regc_wc_isword(pg_wchar c); +static int regc_wc_isupper(pg_wchar c); +static int regc_wc_islower(pg_wchar c); +static int regc_wc_isgraph(pg_wchar c); +static int regc_wc_isprint(pg_wchar c); +static int regc_wc_ispunct(pg_wchar c); +static int regc_wc_isspace(pg_wchar c); +static pg_wchar regc_wc_toupper(pg_wchar c); +static pg_wchar regc_wc_tolower(pg_wchar c); /* === regc_locale.c === */ static chr element(struct vars *v, const chr *startp, const chr *endp); diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index 7b4ddf7a8f52f..5ddc9e812e70d 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -211,7 +211,7 @@ libpqrcv_connect(const char *conninfo, bool replication, bool logical, Assert(i < lengthof(keys)); - conn = palloc0(sizeof(WalReceiverConn)); + conn = palloc0_object(WalReceiverConn); conn->streamConn = libpqsrv_connect_params(keys, vals, /* expand_dbname = */ true, @@ -232,6 +232,9 @@ libpqrcv_connect(const char *conninfo, bool replication, bool logical, errhint("Target server's authentication method must be changed, or set password_required=false in the subscription parameters."))); } + PQsetNoticeReceiver(conn->streamConn, libpqsrv_notice_receiver, + "received message via replication"); + /* * Set always-secure search path for the cases where the connection is * used to run SQL queries, so malicious users can't get 
control. @@ -418,31 +421,22 @@ libpqrcv_identify_system(WalReceiverConn *conn, TimeLineID *primary_tli) "IDENTIFY_SYSTEM", WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive database system identifier and timeline ID from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } /* * IDENTIFY_SYSTEM returns 3 columns in 9.3 and earlier, and 4 columns in * 9.4 and onwards. */ if (PQnfields(res) < 3 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Could not identify system: got %d rows and %d fields, expected %d rows and %d or more fields.", - ntuples, nfields, 1, 3))); - } + PQntuples(res), PQnfields(res), 1, 3))); primary_sysid = pstrdup(PQgetvalue(res, 0, 0)); *primary_tli = pg_strtoint32(PQgetvalue(res, 0, 1)); PQclear(res); @@ -534,7 +528,7 @@ libpqrcv_startstreaming(WalReceiverConn *conn, if (options->logical) appendStringInfoString(&cmd, " LOGICAL"); - appendStringInfo(&cmd, " %X/%X", LSN_FORMAT_ARGS(options->startpoint)); + appendStringInfo(&cmd, " %X/%08X", LSN_FORMAT_ARGS(options->startpoint)); /* * Additional options are different depending on if we are doing logical @@ -604,13 +598,10 @@ libpqrcv_startstreaming(WalReceiverConn *conn, return false; } else if (PQresultStatus(res) != PGRES_COPY_BOTH) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not start WAL streaming: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } PQclear(res); return true; } @@ -718,26 +709,17 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, cmd, WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive timeline history file from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } if (PQnfields(res) != 2 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Expected 1 tuple with 2 fields, got %d tuples with %d fields.", - ntuples, nfields))); - } + PQntuples(res), PQnfields(res)))); *filename = pstrdup(PQgetvalue(res, 0, 0)); *len = PQgetlength(res, 0, 1); @@ -841,13 +823,10 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, return -1; } else - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive data from WAL stream: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } } if (rawlen < -1) ereport(ERROR, @@ -971,13 +950,10 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, pfree(cmd.data); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not create replication slot \"%s\": %s", slotname, pchomp(PQerrorMessage(conn->streamConn))))); - } if (lsn) *lsn = DatumGetLSN(DirectFunctionCall1Coll(pg_lsn_in, InvalidOid, @@ -1126,7 +1102,7 @@ libpqrcv_exec(WalReceiverConn *conn, const char *query, const int nRetTypes, const Oid *retTypes) { PGresult *pgres = NULL; - WalRcvExecResult *walres = palloc0(sizeof(WalRcvExecResult)); + WalRcvExecResult *walres = palloc0_object(WalRcvExecResult); char 
*diag_sqlstate; if (MyDatabaseId == InvalidOid) diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile index 1e08bbbd4eb15..455768a57f0f3 100644 --- a/src/backend/replication/logical/Makefile +++ b/src/backend/replication/logical/Makefile @@ -20,14 +20,17 @@ OBJS = \ decode.o \ launcher.o \ logical.o \ + logicalctl.o \ logicalfuncs.o \ message.o \ origin.o \ proto.o \ relation.o \ reorderbuffer.o \ + sequencesync.o \ slotsync.o \ snapbuild.o \ + syncutils.o \ tablesync.o \ worker.o diff --git a/src/backend/replication/logical/applyparallelworker.c b/src/backend/replication/logical/applyparallelworker.c index d25085d351535..a4aafcf5b6ee7 100644 --- a/src/backend/replication/logical/applyparallelworker.c +++ b/src/backend/replication/logical/applyparallelworker.c @@ -299,7 +299,7 @@ pa_can_start(void) * STREAM START message, and it doesn't seem worth sending the extra eight * bytes with the STREAM START to enable parallelism for this case. */ - if (!XLogRecPtrIsInvalid(MySubscription->skiplsn)) + if (XLogRecPtrIsValid(MySubscription->skiplsn)) return false; /* @@ -425,7 +425,7 @@ pa_launch_parallel_worker(void) */ oldcontext = MemoryContextSwitchTo(ApplyContext); - winfo = (ParallelApplyWorkerInfo *) palloc0(sizeof(ParallelApplyWorkerInfo)); + winfo = palloc0_object(ParallelApplyWorkerInfo); /* Setup shared memory. */ if (!pa_setup_dsm(winfo)) @@ -441,7 +441,8 @@ pa_launch_parallel_worker(void) MySubscription->name, MyLogicalRepWorker->userid, InvalidOid, - dsm_segment_handle(winfo->dsm_seg)); + dsm_segment_handle(winfo->dsm_seg), + false); if (launched) { @@ -639,7 +640,7 @@ pa_detach_all_error_mq(void) * Check if there are any pending spooled messages. */ static bool -pa_has_spooled_message_pending() +pa_has_spooled_message_pending(void) { PartialFileSetState fileset_state; @@ -777,10 +778,10 @@ LogicalParallelApplyLoop(shm_mq_handle *mqh) /* * The first byte of messages sent from leader apply worker to - * parallel apply workers can only be 'w'. + * parallel apply workers can only be PqReplMsg_WALData. */ c = pq_getmsgbyte(&s); - if (c != 'w') + if (c != PqReplMsg_WALData) elog(ERROR, "unexpected message \"%c\"", c); /* @@ -869,10 +870,17 @@ ParallelApplyWorkerMain(Datum main_arg) InitializingApplyWorker = true; - /* Setup signal handling. */ + /* + * Setup signal handling. + * + * Note: We intentionally used SIGUSR2 to trigger a graceful shutdown + * initiated by the leader apply worker. This helps to differentiate it + * from the case where we abort the current transaction and exit on + * receiving SIGTERM. + */ pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGINT, SignalHandlerForShutdownRequest); pqsignal(SIGTERM, die); + pqsignal(SIGUSR2, SignalHandlerForShutdownRequest); BackgroundWorkerUnblockSignals(); /* @@ -962,7 +970,7 @@ ParallelApplyWorkerMain(Datum main_arg) * the subscription relation state. */ CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP, - invalidate_syncing_table_states, + InvalidateSyncingRelStates, (Datum) 0); set_apply_error_context_origin(originname); @@ -971,9 +979,9 @@ ParallelApplyWorkerMain(Datum main_arg) /* * The parallel apply worker must not get here because the parallel apply - * worker will only stop when it receives a SIGTERM or SIGINT from the - * leader, or when there is an error. None of these cases will allow the - * code to reach here. + * worker will only stop when it receives a SIGTERM or SIGUSR2 from the + * leader, or SIGINT from itself, or when there is an error. 
None of these + * cases will allow the code to reach here. */ Assert(false); } @@ -1006,7 +1014,7 @@ ProcessParallelApplyMessage(StringInfo msg) switch (msgtype) { - case 'E': /* ErrorResponse */ + case PqMsg_ErrorResponse: { ErrorData edata; @@ -1043,11 +1051,11 @@ ProcessParallelApplyMessage(StringInfo msg) /* * Don't need to do anything about NoticeResponse and - * NotifyResponse as the logical replication worker doesn't need - * to send messages to the client. + * NotificationResponse as the logical replication worker doesn't + * need to send messages to the client. */ - case 'N': - case 'A': + case PqMsg_NoticeResponse: + case PqMsg_NotificationResponse: break; default: @@ -1632,7 +1640,7 @@ pa_xact_finish(ParallelApplyWorkerInfo *winfo, XLogRecPtr remote_lsn) */ pa_wait_for_xact_finish(winfo); - if (!XLogRecPtrIsInvalid(remote_lsn)) + if (XLogRecPtrIsValid(remote_lsn)) store_flush_position(remote_lsn, winfo->shared->last_commit_end); pa_free_worker(winfo); diff --git a/src/backend/replication/logical/conflict.c b/src/backend/replication/logical/conflict.c index 97c4e26b58654..166955922650f 100644 --- a/src/backend/replication/logical/conflict.c +++ b/src/backend/replication/logical/conflict.c @@ -29,6 +29,7 @@ static const char *const ConflictTypeNames[] = { [CT_UPDATE_EXISTS] = "update_exists", [CT_UPDATE_MISSING] = "update_missing", [CT_DELETE_ORIGIN_DIFFERS] = "delete_origin_differs", + [CT_UPDATE_DELETED] = "update_deleted", [CT_DELETE_MISSING] = "delete_missing", [CT_MULTIPLE_UNIQUE_CONFLICTS] = "multiple_unique_conflicts" }; @@ -54,7 +55,7 @@ static char *build_index_value_desc(EState *estate, Relation localrel, /* * Get the xmin and commit timestamp data (origin and timestamp) associated - * with the provided local tuple. + * with the provided local row. * * Return true if the commit timestamp data was found, false otherwise. */ @@ -88,12 +89,12 @@ GetTupleTransactionInfo(TupleTableSlot *localslot, TransactionId *xmin, * This function is used to report a conflict while applying replication * changes. * - * 'searchslot' should contain the tuple used to search the local tuple to be + * 'searchslot' should contain the tuple used to search the local row to be * updated or deleted. * * 'remoteslot' should contain the remote new tuple, if any. * - * conflicttuples is a list of local tuples that caused the conflict and the + * conflicttuples is a list of local rows that caused the conflict and the * conflict related information. See ConflictTupleInfo. * * The caller must ensure that all the indexes passed in ConflictTupleInfo are @@ -176,6 +177,7 @@ errcode_apply_conflict(ConflictType type) case CT_UPDATE_ORIGIN_DIFFERS: case CT_UPDATE_MISSING: case CT_DELETE_ORIGIN_DIFFERS: + case CT_UPDATE_DELETED: case CT_DELETE_MISSING: return errcode(ERRCODE_T_R_SERIALIZATION_FAILURE); } @@ -189,9 +191,9 @@ errcode_apply_conflict(ConflictType type) * * The DETAIL line comprises of two parts: * 1. Explanation of the conflict type, including the origin and commit - * timestamp of the existing local tuple. - * 2. Display of conflicting key, existing local tuple, remote new tuple, and - * replica identity columns, if any. The remote old tuple is excluded as its + * timestamp of the existing local row. + * 2. Display of conflicting key, existing local row, remote new row, and + * replica identity columns, if any. The remote old row is excluded as its * information is covered in the replica identity columns. 
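
The applyparallelworker.c hunks above also replace bare protocol byte literals ('w', 'E', 'N', 'A') with named constants. Assuming definitions that simply name the wire-protocol message bytes, the effect on readability looks like this:

/* First-byte constants for protocol messages; values are the wire bytes. */
#define PqMsg_ErrorResponse			'E'
#define PqMsg_NoticeResponse		'N'
#define PqMsg_NotificationResponse	'A'
#define PqReplMsg_WALData			'w'

static void
dispatch(char msgtype)
{
	switch (msgtype)
	{
		case PqMsg_ErrorResponse:	/* was: case 'E' */
			/* re-throw the error reported by the parallel apply worker */
			break;
		case PqMsg_NoticeResponse:	/* was: case 'N' */
		case PqMsg_NotificationResponse:	/* was: case 'A' */
			/* workers have no client to forward these to */
			break;
		default:
			break;
	}
}
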
*/ static void @@ -261,6 +263,26 @@ errdetail_apply_conflict(EState *estate, ResultRelInfo *relinfo, break; + case CT_UPDATE_DELETED: + if (localts) + { + if (localorigin == InvalidRepOriginId) + appendStringInfo(&err_detail, _("The row to be updated was deleted locally in transaction %u at %s."), + localxmin, timestamptz_to_str(localts)); + else if (replorigin_by_oid(localorigin, true, &origin_name)) + appendStringInfo(&err_detail, _("The row to be updated was deleted by a different origin \"%s\" in transaction %u at %s."), + origin_name, localxmin, timestamptz_to_str(localts)); + + /* The origin that modified this row has been removed. */ + else + appendStringInfo(&err_detail, _("The row to be updated was deleted by a non-existent origin in transaction %u at %s."), + localxmin, timestamptz_to_str(localts)); + } + else + appendStringInfo(&err_detail, _("The row to be updated was deleted.")); + + break; + case CT_UPDATE_MISSING: appendStringInfoString(&err_detail, _("Could not find the row to be updated.")); break; @@ -291,7 +313,7 @@ errdetail_apply_conflict(EState *estate, ResultRelInfo *relinfo, localslot, remoteslot, indexoid); /* - * Next, append the key values, existing local tuple, remote tuple and + * Next, append the key values, existing local row, remote row, and * replica identity columns after the message. */ if (val_desc) @@ -309,7 +331,7 @@ errdetail_apply_conflict(EState *estate, ResultRelInfo *relinfo, /* * Helper function to build the additional details for conflicting key, - * existing local tuple, remote tuple, and replica identity columns. + * existing local row, remote row, and replica identity columns. * * If the return value is NULL, it indicates that the current user lacks * permissions to view the columns involved. @@ -351,7 +373,7 @@ build_tuple_value_details(EState *estate, ResultRelInfo *relinfo, { /* * The 'modifiedCols' only applies to the new tuple, hence we pass - * NULL for the existing local tuple. + * NULL for the existing local row. */ desc = ExecBuildSlotValueDescription(relid, localslot, tupdesc, NULL, 64); @@ -361,12 +383,12 @@ build_tuple_value_details(EState *estate, ResultRelInfo *relinfo, if (tuple_value.len > 0) { appendStringInfoString(&tuple_value, "; "); - appendStringInfo(&tuple_value, _("existing local tuple %s"), + appendStringInfo(&tuple_value, _("existing local row %s"), desc); } else { - appendStringInfo(&tuple_value, _("Existing local tuple %s"), + appendStringInfo(&tuple_value, _("Existing local row %s"), desc); } } @@ -393,11 +415,11 @@ build_tuple_value_details(EState *estate, ResultRelInfo *relinfo, if (tuple_value.len > 0) { appendStringInfoString(&tuple_value, "; "); - appendStringInfo(&tuple_value, _("remote tuple %s"), desc); + appendStringInfo(&tuple_value, _("remote row %s"), desc); } else { - appendStringInfo(&tuple_value, _("Remote tuple %s"), desc); + appendStringInfo(&tuple_value, _("Remote row %s"), desc); } } } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index cc03f0706e9c8..a1df8e1d6460b 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -149,39 +149,34 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) * can restart from there. 
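
The new XLOG_LOGICAL_DECODING_STATUS_CHANGE case below pulls its boolean payload out of the record with memcpy() instead of a pointer cast, the standard idiom for record data that carries no alignment guarantee. Generically:

#include <stdbool.h>
#include <string.h>

/* Sketch: extract a fixed-size value from a possibly unaligned buffer. */
static bool
read_status_payload(const char *rec_data)
{
	bool		logical_decoding;

	/* memcpy rather than "*(bool *) rec_data"; safe for any alignment. */
	memcpy(&logical_decoding, rec_data, sizeof(bool));
	return logical_decoding;
}
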
*/ break; - case XLOG_PARAMETER_CHANGE: + case XLOG_LOGICAL_DECODING_STATUS_CHANGE: { - xl_parameter_change *xlrec = - (xl_parameter_change *) XLogRecGetData(buf->record); + bool logical_decoding; + + memcpy(&logical_decoding, XLogRecGetData(buf->record), sizeof(bool)); /* - * If wal_level on the primary is reduced to less than - * logical, we want to prevent existing logical slots from - * being used. Existing logical slots on the standby get - * invalidated when this WAL record is replayed; and further, - * slot creation fails when wal_level is not sufficient; but - * all these operations are not synchronized, so a logical - * slot may creep in while the wal_level is being reduced. - * Hence this extra check. + * Error out as we should not decode this WAL record. + * + * Logical decoding is disabled, and existing logical slots on + * the standby are invalidated when this WAL record is + * replayed. No logical decoder can process this WAL record + * until replay completes, and by then the slots are already + * invalidated. Furthermore, no new logical slots can be + * created while logical decoding is disabled. This cannot + * occur on the primary either, since it will not restart + * with wal_level < replica if any logical slots exist. */ - if (xlrec->wal_level < WAL_LEVEL_LOGICAL) - { - /* - * This can occur only on a standby, as a primary would - * not allow to restart after changing wal_level < logical - * if there is pre-existing logical slot. - */ - Assert(RecoveryInProgress()); - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("logical decoding on standby requires \"wal_level\" >= \"logical\" on the primary"))); - } + elog(ERROR, "unexpected logical decoding status change %d", + logical_decoding); + break; } case XLOG_NOOP: case XLOG_NEXTOID: case XLOG_SWITCH: case XLOG_BACKUP_END: + case XLOG_PARAMETER_CHANGE: case XLOG_RESTORE_POINT: case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: @@ -521,18 +516,9 @@ heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) /* * Inplace updates are only ever performed on catalog tuples and - * can, per definition, not change tuple visibility. Inplace - * updates don't affect storage or interpretation of table rows, - * so they don't affect logicalrep_write_tuple() outcomes. Hence, - * we don't process invalidations from the original operation. If - * inplace updates did affect those things, invalidations wouldn't - * make it work, since there are no snapshot-specific versions of - * inplace-updated values. Since we also don't decode catalog - * tuples, we're not interested in the record's contents. - * - * WAL contains likely-unnecessary commit-time invals from the - * CacheInvalidateHeapTuple() call in - * heap_inplace_update_and_unlock(). Excess invalidation is safe. + * can, per definition, not change tuple visibility. Since we + * also don't decode catalog tuples, we're not interested in the + * record's contents.
*/ break; diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 10677da56b2b6..3991e1495d4c5 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -32,6 +32,7 @@ #include "postmaster/interrupt.h" #include "replication/logicallauncher.h" #include "replication/origin.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "storage/ipc.h" @@ -42,6 +43,7 @@ #include "utils/memutils.h" #include "utils/pg_lsn.h" #include "utils/snapmgr.h" +#include "utils/syscache.h" /* max sleep time between cycles (3min) */ #define DEFAULT_NAPTIME_PER_CYCLE 180000L @@ -91,7 +93,6 @@ static dshash_table *last_start_times = NULL; static bool on_commit_launcher_wakeup = false; -static void ApplyLauncherWakeup(void); static void logicalrep_launcher_onexit(int code, Datum arg); static void logicalrep_worker_onexit(int code, Datum arg); static void logicalrep_worker_detach(void); @@ -100,6 +101,10 @@ static int logicalrep_pa_worker_count(Oid subid); static void logicalrep_launcher_attach_dshmem(void); static void ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time); static TimestampTz ApplyLauncherGetWorkerStartTime(Oid subid); +static void compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin); +static bool acquire_conflict_slot_if_exists(void); +static void update_conflict_slot_xmin(TransactionId new_xmin); +static void init_conflict_slot_xmin(void); /* @@ -142,12 +147,14 @@ get_subscription_list(void) */ oldcxt = MemoryContextSwitchTo(resultcxt); - sub = (Subscription *) palloc0(sizeof(Subscription)); + sub = palloc0_object(Subscription); sub->oid = subform->oid; sub->dbid = subform->subdbid; sub->owner = subform->subowner; sub->enabled = subform->subenabled; sub->name = pstrdup(NameStr(subform->subname)); + sub->retaindeadtuples = subform->subretaindeadtuples; + sub->retentionactive = subform->subretentionactive; /* We don't fill fields we are not interested in. */ res = lappend(res, sub); @@ -175,12 +182,14 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, uint16 generation, BackgroundWorkerHandle *handle) { - BgwHandleStatus status; - int rc; + bool result = false; + bool dropped_latch = false; for (;;) { + BgwHandleStatus status; pid_t pid; + int rc; CHECK_FOR_INTERRUPTS(); @@ -189,8 +198,9 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, /* Worker either died or has started. Return false if died. */ if (!worker->in_use || worker->proc) { + result = worker->in_use; LWLockRelease(LogicalRepWorkerLock); - return worker->in_use; + break; } LWLockRelease(LogicalRepWorkerLock); @@ -205,7 +215,7 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, if (generation == worker->generation) logicalrep_worker_cleanup(worker); LWLockRelease(LogicalRepWorkerLock); - return false; + break; /* result is already false */ } /* @@ -220,25 +230,42 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, { ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); + dropped_latch = true; } } + + /* + * If we had to clear a latch event in order to wait, be sure to restore + * it before exiting. Otherwise caller may miss events. + */ + if (dropped_latch) + SetLatch(MyLatch); + + return result; } /* - * Walks the workers array and searches for one that matches given - * subscription id and relid. + * Walks the workers array and searches for one that matches given worker type, + * subscription id, and relation id. 
* - * We are only interested in the leader apply worker or table sync worker. + * For both apply workers and sequencesync workers, the relid should be set to + * InvalidOid, as these workers handle changes across all tables and sequences + * respectively, rather than targeting a specific relation. For tablesync + * workers, the relid should be set to the OID of the relation being + * synchronized. */ LogicalRepWorker * -logicalrep_worker_find(Oid subid, Oid relid, bool only_running) +logicalrep_worker_find(LogicalRepWorkerType wtype, Oid subid, Oid relid, + bool only_running) { int i; LogicalRepWorker *res = NULL; + /* relid must be valid only for table sync workers */ + Assert((wtype == WORKERTYPE_TABLESYNC) == OidIsValid(relid)); Assert(LWLockHeldByMe(LogicalRepWorkerLock)); - /* Search for attached worker for a given subscription id. */ + /* Search for an attached worker that matches the specified criteria. */ for (i = 0; i < max_logical_replication_workers; i++) { LogicalRepWorker *w = &LogicalRepCtx->workers[i]; @@ -248,7 +275,7 @@ logicalrep_worker_find(Oid subid, Oid relid, bool only_running) continue; if (w->in_use && w->subid == subid && w->relid == relid && - (!only_running || w->proc)) + w->type == wtype && (!only_running || w->proc)) { res = w; break; @@ -296,7 +323,8 @@ logicalrep_workers_find(Oid subid, bool only_running, bool acquire_lock) bool logicalrep_worker_launch(LogicalRepWorkerType wtype, Oid dbid, Oid subid, const char *subname, Oid userid, - Oid relid, dsm_handle subworker_dsm) + Oid relid, dsm_handle subworker_dsm, + bool retain_dead_tuples) { BackgroundWorker bgw; BackgroundWorkerHandle *bgw_handle; @@ -308,6 +336,7 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, int nparallelapplyworkers; TimestampTz now; bool is_tablesync_worker = (wtype == WORKERTYPE_TABLESYNC); + bool is_sequencesync_worker = (wtype == WORKERTYPE_SEQUENCESYNC); bool is_parallel_apply_worker = (wtype == WORKERTYPE_PARALLEL_APPLY); /*---------- @@ -315,10 +344,13 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, * - must be valid worker type * - tablesync workers are only ones to have relid * - parallel apply worker is the only kind of subworker + * - The replication slot used in conflict detection is created when + * retain_dead_tuples is enabled */ Assert(wtype != WORKERTYPE_UNKNOWN); Assert(is_tablesync_worker == OidIsValid(relid)); Assert(is_parallel_apply_worker == (subworker_dsm != DSM_HANDLE_INVALID)); + Assert(!retain_dead_tuples || MyReplicationSlot); ereport(DEBUG1, (errmsg_internal("starting logical replication worker for subscription \"%s\"", @@ -328,7 +360,7 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, if (max_active_replication_origins == 0) ereport(ERROR, (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), - errmsg("cannot start logical replication workers when \"max_active_replication_origins\"=0"))); + errmsg("cannot start logical replication workers when \"max_active_replication_origins\" is 0"))); /* * We need to do the modification of the shared memory under lock so that @@ -393,7 +425,8 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, * sync worker limit per subscription. So, just return silently as we * might get here because of an otherwise harmless race condition. 
*/ - if (is_tablesync_worker && nsyncworkers >= max_sync_workers_per_subscription) + if ((is_tablesync_worker || is_sequencesync_worker) && + nsyncworkers >= max_sync_workers_per_subscription) { LWLockRelease(LogicalRepWorkerLock); return false; } @@ -441,11 +474,15 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, worker->stream_fileset = NULL; worker->leader_pid = is_parallel_apply_worker ? MyProcPid : InvalidPid; worker->parallel_apply = is_parallel_apply_worker; + worker->oldest_nonremovable_xid = retain_dead_tuples + ? MyReplicationSlot->data.xmin + : InvalidTransactionId; worker->last_lsn = InvalidXLogRecPtr; TIMESTAMP_NOBEGIN(worker->last_send_time); TIMESTAMP_NOBEGIN(worker->last_recv_time); worker->reply_lsn = InvalidXLogRecPtr; TIMESTAMP_NOBEGIN(worker->reply_time); + worker->last_seqsync_start_time = 0; /* Before releasing lock, remember generation for future identification. */ generation = worker->generation; @@ -479,8 +516,16 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, memcpy(bgw.bgw_extra, &subworker_dsm, sizeof(dsm_handle)); break; + case WORKERTYPE_SEQUENCESYNC: + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "SequenceSyncWorkerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication sequencesync worker for subscription %u", + subid); + snprintf(bgw.bgw_type, BGW_MAXLEN, "logical replication sequencesync worker"); + break; + case WORKERTYPE_TABLESYNC: - snprintf(bgw.bgw_function_name, BGW_MAXLEN, "TablesyncWorkerMain"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "TableSyncWorkerMain"); snprintf(bgw.bgw_name, BGW_MAXLEN, "logical replication tablesync worker for subscription %u sync %u", subid, @@ -600,16 +645,20 @@ logicalrep_worker_stop_internal(LogicalRepWorker *worker, int signo) } /* - * Stop the logical replication worker for subid/relid, if any. + * Stop the logical replication worker that matches the specified worker type, + * subscription id, and relation id. */ void -logicalrep_worker_stop(Oid subid, Oid relid) +logicalrep_worker_stop(LogicalRepWorkerType wtype, Oid subid, Oid relid) { LogicalRepWorker *worker; + /* relid must be valid only for table sync workers */ + Assert((wtype == WORKERTYPE_TABLESYNC) == OidIsValid(relid)); + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - worker = logicalrep_worker_find(subid, relid, false); + worker = logicalrep_worker_find(wtype, subid, relid, false); if (worker) { @@ -623,7 +672,7 @@ logicalrep_worker_stop(Oid subid, Oid relid) /* * Stop the given logical replication parallel apply worker. * - * Node that the function sends SIGINT instead of SIGTERM to the parallel apply + * Note that the function sends SIGUSR2 instead of SIGTERM to the parallel apply * worker so that the worker exits cleanly. */ void @@ -661,22 +710,26 @@ logicalrep_pa_worker_stop(ParallelApplyWorkerInfo *winfo) * Only stop the worker if the generation matches and the worker is alive. */ if (worker->generation == generation && worker->proc) - logicalrep_worker_stop_internal(worker, SIGINT); + logicalrep_worker_stop_internal(worker, SIGUSR2); LWLockRelease(LogicalRepWorkerLock); } /* - * Wake up (using latch) any logical replication worker for specified sub/rel. + * Wake up (using latch) any logical replication worker that matches the + * specified worker type, subscription id, and relation id.
*/ void -logicalrep_worker_wakeup(Oid subid, Oid relid) +logicalrep_worker_wakeup(LogicalRepWorkerType wtype, Oid subid, Oid relid) { LogicalRepWorker *worker; + /* relid must be valid only for table sync workers */ + Assert((wtype == WORKERTYPE_TABLESYNC) == OidIsValid(relid)); + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - worker = logicalrep_worker_find(subid, relid, true); + worker = logicalrep_worker_find(wtype, subid, relid, true); if (worker) logicalrep_worker_wakeup_ptr(worker); @@ -766,6 +819,8 @@ logicalrep_worker_detach(void) } LWLockRelease(LogicalRepWorkerLock); + + list_free(workers); } /* Block concurrent access. */ @@ -806,6 +861,33 @@ logicalrep_launcher_onexit(int code, Datum arg) LogicalRepCtx->launcher_pid = 0; } +/* + * Reset the last_seqsync_start_time of the sequencesync worker in the + * subscription's apply worker. + * + * Note that this value is not stored in the sequencesync worker, because that + * has finished already and is about to exit. + */ +void +logicalrep_reset_seqsync_start_time(void) +{ + LogicalRepWorker *worker; + + /* + * The apply worker can't access last_seqsync_start_time concurrently, so + * it is okay to use SHARED lock here. See ProcessSequencesForSync(). + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + worker = logicalrep_worker_find(WORKERTYPE_APPLY, + MyLogicalRepWorker->subid, InvalidOid, + true); + if (worker) + worker->last_seqsync_start_time = 0; + + LWLockRelease(LogicalRepWorkerLock); +} + /* * Cleanup function. * @@ -854,7 +936,7 @@ logicalrep_sync_worker_count(Oid subid) { LogicalRepWorker *w = &LogicalRepCtx->workers[i]; - if (isTablesyncWorker(w) && w->subid == subid) + if (w->subid == subid && (isTableSyncWorker(w) || isSequenceSyncWorker(w))) res++; } @@ -1016,7 +1098,7 @@ logicalrep_launcher_attach_dshmem(void) last_start_times_dsa = dsa_attach(LogicalRepCtx->last_start_dsa); dsa_pin_mapping(last_start_times_dsa); last_start_times = dshash_attach(last_start_times_dsa, &dsh_params, - LogicalRepCtx->last_start_dsh, 0); + LogicalRepCtx->last_start_dsh, NULL); } MemoryContextSwitchTo(oldcontext); @@ -1105,7 +1187,10 @@ ApplyLauncherWakeupAtCommit(void) on_commit_launcher_wakeup = true; } -static void +/* + * Wakeup the launcher immediately. + */ +void ApplyLauncherWakeup(void) { if (LogicalRepCtx->launcher_pid != 0) @@ -1137,6 +1222,12 @@ ApplyLauncherMain(Datum main_arg) */ BackgroundWorkerInitializeConnection(NULL, NULL, 0); + /* + * Acquire the conflict detection slot at startup to ensure it can be + * dropped if no longer needed after a restart. + */ + acquire_conflict_slot_if_exists(); + /* Enter main loop */ for (;;) { @@ -1146,6 +1237,9 @@ ApplyLauncherMain(Datum main_arg) MemoryContext subctx; MemoryContext oldctx; long wait_time = DEFAULT_NAPTIME_PER_CYCLE; + bool can_update_xmin = true; + bool retain_dead_tuples = false; + TransactionId xmin = InvalidTransactionId; CHECK_FOR_INTERRUPTS(); @@ -1155,7 +1249,14 @@ ApplyLauncherMain(Datum main_arg) ALLOCSET_DEFAULT_SIZES); oldctx = MemoryContextSwitchTo(subctx); - /* Start any missing workers for enabled subscriptions. */ + /* + * Start any missing workers for enabled subscriptions. + * + * Also, during the iteration through all subscriptions, we compute + * the minimum XID required to protect deleted tuples for conflict + * detection if one of the subscriptions enables the + * retain_dead_tuples option.
+ */ sublist = get_subscription_list(); foreach(lc, sublist) { @@ -1165,15 +1266,87 @@ ApplyLauncherMain(Datum main_arg) TimestampTz now; long elapsed; + if (sub->retaindeadtuples) + { + retain_dead_tuples = true; + + /* + * Create a replication slot to retain information necessary + * for conflict detection such as dead tuples, commit + * timestamps, and origins. + * + * The slot is created before starting the apply worker to + * prevent it from unnecessarily maintaining its + * oldest_nonremovable_xid. + * + * The slot is created even for a disabled subscription to + * ensure that conflict-related information is available when + * applying remote changes that occurred before the + * subscription was enabled. + */ + CreateConflictDetectionSlot(); + + if (sub->retentionactive) + { + /* + * Can't advance xmin of the slot unless all the + * subscriptions actively retaining dead tuples are + * enabled. This is required to ensure that we don't + * advance the xmin of CONFLICT_DETECTION_SLOT if one of + * the subscriptions is not enabled. Otherwise, we won't + * be able to detect conflicts reliably for such a + * subscription even though it has set the + * retain_dead_tuples option. + */ + can_update_xmin &= sub->enabled; + + /* + * Initialize the slot once the subscription activates + * retention. + */ + if (!TransactionIdIsValid(MyReplicationSlot->data.xmin)) + init_conflict_slot_xmin(); + } + } + if (!sub->enabled) continue; LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - w = logicalrep_worker_find(sub->oid, InvalidOid, false); - LWLockRelease(LogicalRepWorkerLock); + w = logicalrep_worker_find(WORKERTYPE_APPLY, sub->oid, InvalidOid, + false); if (w != NULL) - continue; /* worker is running already */ + { + /* + * Compute the minimum xmin required to protect dead tuples + * needed for conflict detection among all running apply + * workers. This computation is performed while holding + * LogicalRepWorkerLock to prevent accessing invalid worker + * data, in scenarios where a worker might exit and reset its + * state concurrently. + */ + if (sub->retaindeadtuples && + sub->retentionactive && + can_update_xmin) + compute_min_nonremovable_xid(w, &xmin); + + LWLockRelease(LogicalRepWorkerLock); + + /* worker is running already */ + continue; + } + + LWLockRelease(LogicalRepWorkerLock); + + /* + * Can't advance xmin of the slot unless all the workers + * corresponding to subscriptions actively retaining dead tuples + * are running, so we disable further computation of the minimum + * nonremovable xid. + */ + if (sub->retaindeadtuples && sub->retentionactive) + can_update_xmin = false; /* * If the worker is eligible to start now, launch it. Otherwise, @@ -1194,10 +1367,23 @@ ApplyLauncherMain(Datum main_arg) (elapsed = TimestampDifferenceMilliseconds(last_start, now)) >= wal_retrieve_retry_interval) { ApplyLauncherSetWorkerStartTime(sub->oid, now); - logicalrep_worker_launch(WORKERTYPE_APPLY, - sub->dbid, sub->oid, sub->name, - sub->owner, InvalidOid, - DSM_HANDLE_INVALID); + if (!logicalrep_worker_launch(WORKERTYPE_APPLY, + sub->dbid, sub->oid, sub->name, + sub->owner, InvalidOid, + DSM_HANDLE_INVALID, + sub->retaindeadtuples && + sub->retentionactive)) + { + /* + * We get here either if we failed to launch a worker + * (perhaps for resource-exhaustion reasons) or if we + * launched one but it immediately quit. Either way, it + * seems appropriate to try again after + * wal_retrieve_retry_interval.
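For reference, compute_min_nonremovable_xid() (defined later in this hunk) folds each worker's oldest_nonremovable_xid into the running minimum using TransactionIdPrecedes(), which orders XIDs modulo 2^32. Here is a standalone sketch of that reduction under a simplified comparison; the real TransactionIdPrecedes() additionally special-cases permanent XIDs below FirstNormalTransactionId.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t TransactionId;
    #define InvalidTransactionId ((TransactionId) 0)

    /* Wraparound-aware comparison, modelled on TransactionIdPrecedes(). */
    static bool
    xid_precedes(TransactionId a, TransactionId b)
    {
        return (int32_t) (a - b) < 0;
    }

    int
    main(void)
    {
        /* oldest_nonremovable_xid values gathered from the apply workers;
         * InvalidTransactionId marks a worker that stopped retention. */
        TransactionId worker_xids[] = {4000000005u, 12u, 4000000001u};
        TransactionId xmin = InvalidTransactionId;

        for (int i = 0; i < 3; i++)
        {
            TransactionId x = worker_xids[i];

            if (x == InvalidTransactionId)
                continue;
            if (xmin == InvalidTransactionId || xid_precedes(x, xmin))
                xmin = x;
        }

        /* 4000000001 precedes 12 modulo 2^32, so it wins despite being larger */
        printf("computed slot xmin: %u\n", xmin);
        return 0;
    }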
+ */ + wait_time = Min(wait_time, + wal_retrieve_retry_interval); + } } else { @@ -1206,6 +1392,25 @@ ApplyLauncherMain(Datum main_arg) } } + /* + * Drop the CONFLICT_DETECTION_SLOT if there is no subscription + * that requires us to retain dead tuples. Otherwise, if required, + * advance the slot's xmin to protect dead tuples required for + * conflict detection. + * + * Additionally, if all apply workers for subscriptions with + * retain_dead_tuples enabled have requested to stop retention, the + * slot's xmin will be set to InvalidTransactionId, allowing the + * removal of dead tuples. + */ + if (MyReplicationSlot) + { + if (!retain_dead_tuples) + ReplicationSlotDropAcquired(); + else if (can_update_xmin) + update_conflict_slot_xmin(xmin); + } + /* Switch back to original memory context. */ MemoryContextSwitchTo(oldctx); /* Clean the temporary memory. */ @@ -1233,6 +1438,146 @@ ApplyLauncherMain(Datum main_arg) /* Not reachable */ } +/* + * Determine the minimum non-removable transaction ID across all apply workers + * for subscriptions that have retain_dead_tuples enabled. Store the result + * in *xmin. + */ +static void +compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin) +{ + TransactionId nonremovable_xid; + + Assert(worker != NULL); + + /* + * The replication slot for conflict detection must be created before the + * worker starts. + */ + Assert(MyReplicationSlot); + + SpinLockAcquire(&worker->relmutex); + nonremovable_xid = worker->oldest_nonremovable_xid; + SpinLockRelease(&worker->relmutex); + + /* + * Return if the apply worker has stopped retention concurrently. + * + * Although this function is invoked only when retentionactive is true, + * the apply worker might stop retention after the launcher fetches the + * retentionactive flag. + */ + if (!TransactionIdIsValid(nonremovable_xid)) + return; + + if (!TransactionIdIsValid(*xmin) || + TransactionIdPrecedes(nonremovable_xid, *xmin)) + *xmin = nonremovable_xid; +} + +/* + * Acquire the replication slot used to retain information for conflict + * detection, if it exists. + * + * Return true if successfully acquired, otherwise return false. + */ +static bool +acquire_conflict_slot_if_exists(void) +{ + if (!SearchNamedReplicationSlot(CONFLICT_DETECTION_SLOT, true)) + return false; + + ReplicationSlotAcquire(CONFLICT_DETECTION_SLOT, true, false); + return true; +} + +/* + * Update the xmin of the replication slot used to retain information required + * for conflict detection. + */ +static void +update_conflict_slot_xmin(TransactionId new_xmin) +{ + Assert(MyReplicationSlot); + Assert(!TransactionIdIsValid(new_xmin) || + TransactionIdPrecedesOrEquals(MyReplicationSlot->data.xmin, new_xmin)); + + /* Return if the xmin value of the slot cannot be updated */ + if (TransactionIdEquals(MyReplicationSlot->data.xmin, new_xmin)) + return; + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = new_xmin; + MyReplicationSlot->data.xmin = new_xmin; + SpinLockRelease(&MyReplicationSlot->mutex); + + elog(DEBUG1, "updated xmin: %u", MyReplicationSlot->data.xmin); + + ReplicationSlotMarkDirty(); + ReplicationSlotsComputeRequiredXmin(false); + + /* + * Like PhysicalConfirmReceivedLocation(), do not save slot information + * each time. This is acceptable because all concurrent transactions on + * the publisher that require the data preceding the slot's xmin should + * have already been applied and flushed on the subscriber before the xmin + * is advanced.
So, even if the slot's xmin regresses after a restart, it + * will be advanced again in the next cycle. Therefore, no data required + * for conflict detection will be prematurely removed. + */ + return; +} + +/* + * Initialize the xmin for the conflict detection slot. + */ +static void +init_conflict_slot_xmin(void) +{ + TransactionId xmin_horizon; + + /* Replication slot must exist but shouldn't be initialized. */ + Assert(MyReplicationSlot && + !TransactionIdIsValid(MyReplicationSlot->data.xmin)); + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + xmin_horizon = GetOldestSafeDecodingTransactionId(false); + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = xmin_horizon; + MyReplicationSlot->data.xmin = xmin_horizon; + SpinLockRelease(&MyReplicationSlot->mutex); + + ReplicationSlotsComputeRequiredXmin(true); + + LWLockRelease(ProcArrayLock); + + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); +} + +/* + * Create and acquire the replication slot used to retain information for + * conflict detection, if not yet. + */ +void +CreateConflictDetectionSlot(void) +{ + /* Exit early, if the replication slot is already created and acquired */ + if (MyReplicationSlot) + return; + + ereport(LOG, + errmsg("creating replication conflict detection slot")); + + ReplicationSlotCreate(CONFLICT_DETECTION_SLOT, false, RS_PERSISTENT, false, + false, false); + + init_conflict_slot_xmin(); +} + /* * Is current process the logical replication launcher? */ @@ -1305,7 +1650,7 @@ pg_stat_get_subscription(PG_FUNCTION_ARGS) worker_pid = worker.proc->pid; values[0] = ObjectIdGetDatum(worker.subid); - if (isTablesyncWorker(&worker)) + if (isTableSyncWorker(&worker)) values[1] = ObjectIdGetDatum(worker.relid); else nulls[1] = true; @@ -1316,7 +1661,7 @@ pg_stat_get_subscription(PG_FUNCTION_ARGS) else nulls[3] = true; - if (XLogRecPtrIsInvalid(worker.last_lsn)) + if (!XLogRecPtrIsValid(worker.last_lsn)) nulls[4] = true; else values[4] = LSNGetDatum(worker.last_lsn); @@ -1328,7 +1673,7 @@ pg_stat_get_subscription(PG_FUNCTION_ARGS) nulls[6] = true; else values[6] = TimestampTzGetDatum(worker.last_recv_time); - if (XLogRecPtrIsInvalid(worker.reply_lsn)) + if (!XLogRecPtrIsValid(worker.reply_lsn)) nulls[7] = true; else values[7] = LSNGetDatum(worker.reply_lsn); @@ -1345,6 +1690,9 @@ pg_stat_get_subscription(PG_FUNCTION_ARGS) case WORKERTYPE_PARALLEL_APPLY: values[9] = CStringGetTextDatum("parallel apply"); break; + case WORKERTYPE_SEQUENCESYNC: + values[9] = CStringGetTextDatum("sequence synchronization"); + break; case WORKERTYPE_TABLESYNC: values[9] = CStringGetTextDatum("table synchronization"); break; diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 1d56d0c4ef314..c8858e0661676 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -29,6 +29,7 @@ #include "postgres.h" #include "access/xact.h" +#include "access/xlog_internal.h" #include "access/xlogutils.h" #include "fmgr.h" #include "miscadmin.h" @@ -41,6 +42,7 @@ #include "storage/proc.h" #include "storage/procarray.h" #include "utils/builtins.h" +#include "utils/injection_point.h" #include "utils/inval.h" #include "utils/memutils.h" @@ -115,31 +117,20 @@ CheckLogicalDecodingRequirements(void) * needs the same check. 
*/ - if (wal_level < WAL_LEVEL_LOGICAL) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("logical decoding requires \"wal_level\" >= \"logical\""))); - if (MyDatabaseId == InvalidOid) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("logical decoding requires a database connection"))); - if (RecoveryInProgress()) - { - /* - * This check may have race conditions, but whenever - * XLOG_PARAMETER_CHANGE indicates that wal_level has changed, we - * verify that there are no existing logical replication slots. And to - * avoid races around creating a new slot, - * CheckLogicalDecodingRequirements() is called once before creating - * the slot, and once when logical decoding is initially starting up. - */ - if (GetActiveWalLevelOnStandby() < WAL_LEVEL_LOGICAL) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("logical decoding on standby requires \"wal_level\" >= \"logical\" on the primary"))); - } + /* CheckSlotRequirements() has already checked if wal_level >= 'replica' */ + Assert(wal_level >= WAL_LEVEL_REPLICA); + + /* Check if logical decoding is available on standby */ + if (RecoveryInProgress() && !IsLogicalDecodingEnabled()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding on standby requires \"effective_wal_level\" >= \"logical\" on the primary"), + errhint("Set \"wal_level\" >= \"logical\" or create at least one logical slot when \"wal_level\" = \"replica\"."))); } /* @@ -170,7 +161,7 @@ StartupDecodingContext(List *output_plugin_options, "Logical decoding context", ALLOCSET_DEFAULT_SIZES); old_context = MemoryContextSwitchTo(context); - ctx = palloc0(sizeof(LogicalDecodingContext)); + ctx = palloc0_object(LogicalDecodingContext); ctx->context = context; @@ -386,7 +377,7 @@ CreateInitDecodingContext(const char *plugin, slot->data.plugin = plugin_name; SpinLockRelease(&slot->mutex); - if (XLogRecPtrIsInvalid(restart_lsn)) + if (!XLogRecPtrIsValid(restart_lsn)) ReplicationSlotReserveWal(); else { @@ -544,9 +535,9 @@ CreateDecodingContext(XLogRecPtr start_lsn, /* slot must be valid to allow decoding */ Assert(slot->data.invalidated == RS_INVAL_NONE); - Assert(slot->data.restart_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(slot->data.restart_lsn)); - if (start_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(start_lsn)) { /* continue from last position */ start_lsn = slot->data.confirmed_flush; @@ -565,7 +556,7 @@ CreateDecodingContext(XLogRecPtr start_lsn, * kinds of client errors; so the client may wish to check that * confirmed_flush_lsn matches its expectations. */ - elog(LOG, "%X/%X has been already streamed, forwarding to %X/%X", + elog(LOG, "%X/%08X has been already streamed, forwarding to %X/%08X", LSN_FORMAT_ARGS(start_lsn), LSN_FORMAT_ARGS(slot->data.confirmed_flush)); @@ -608,7 +599,7 @@ CreateDecodingContext(XLogRecPtr start_lsn, ereport(LOG, (errmsg("starting logical decoding for slot \"%s\"", NameStr(slot->data.name)), - errdetail("Streaming transactions committing after %X/%X, reading WAL from %X/%X.", + errdetail("Streaming transactions committing after %X/%08X, reading WAL from %X/%08X.", LSN_FORMAT_ARGS(slot->data.confirmed_flush), LSN_FORMAT_ARGS(slot->data.restart_lsn)))); @@ -635,7 +626,7 @@ DecodingContextFindStartpoint(LogicalDecodingContext *ctx) /* Initialize from where to start reading WAL. 
*/ XLogBeginRead(ctx->reader, slot->data.restart_lsn); - elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X", + elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%08X", LSN_FORMAT_ARGS(slot->data.restart_lsn)); /* Wait for a consistent starting point */ @@ -755,8 +746,8 @@ output_plugin_error_callback(void *arg) LogicalErrorCallbackState *state = (LogicalErrorCallbackState *) arg; /* not all callbacks have an associated LSN */ - if (state->report_location != InvalidXLogRecPtr) - errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X", + if (XLogRecPtrIsValid(state->report_location)) + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%08X", NameStr(state->ctx->slot->data.name), NameStr(state->ctx->slot->data.plugin), state->callback_name, @@ -1709,7 +1700,7 @@ LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) * Only increase if the previous values have been applied, otherwise we * might never end up updating if the receiver acks too slowly. */ - else if (slot->candidate_xmin_lsn == InvalidXLogRecPtr) + else if (!XLogRecPtrIsValid(slot->candidate_xmin_lsn)) { slot->candidate_catalog_xmin = xmin; slot->candidate_xmin_lsn = current_lsn; @@ -1723,7 +1714,7 @@ LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) SpinLockRelease(&slot->mutex); if (got_new_xmin) - elog(DEBUG1, "got new catalog xmin %u at %X/%X", xmin, + elog(DEBUG1, "got new catalog xmin %u at %X/%08X", xmin, LSN_FORMAT_ARGS(current_lsn)); /* candidate already valid with the current flush position, apply */ @@ -1747,8 +1738,8 @@ LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart slot = MyReplicationSlot; Assert(slot != NULL); - Assert(restart_lsn != InvalidXLogRecPtr); - Assert(current_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(restart_lsn)); + Assert(XLogRecPtrIsValid(current_lsn)); SpinLockAcquire(&slot->mutex); @@ -1777,13 +1768,13 @@ LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart * might never end up updating if the receiver acks too slowly. A missed * value here will just cause some extra effort after reconnecting. */ - else if (slot->candidate_restart_valid == InvalidXLogRecPtr) + else if (!XLogRecPtrIsValid(slot->candidate_restart_valid)) { slot->candidate_restart_valid = current_lsn; slot->candidate_restart_lsn = restart_lsn; SpinLockRelease(&slot->mutex); - elog(DEBUG1, "got new restart lsn %X/%X at %X/%X", + elog(DEBUG1, "got new restart lsn %X/%08X at %X/%08X", LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(current_lsn)); } @@ -1798,7 +1789,7 @@ LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart confirmed_flush = slot->data.confirmed_flush; SpinLockRelease(&slot->mutex); - elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X", + elog(DEBUG1, "failed to increase restart lsn: proposed %X/%08X, after %X/%08X, current candidate %X/%08X, current after %X/%08X, flushed up to %X/%08X", LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(current_lsn), LSN_FORMAT_ARGS(candidate_restart_lsn), @@ -1817,17 +1808,21 @@ LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart void LogicalConfirmReceivedLocation(XLogRecPtr lsn) { - Assert(lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(lsn)); /* Do an unlocked check for candidate_lsn first. 
*/ - if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr || - MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(MyReplicationSlot->candidate_xmin_lsn) || + XLogRecPtrIsValid(MyReplicationSlot->candidate_restart_valid)) { bool updated_xmin = false; bool updated_restart = false; + XLogRecPtr restart_lsn pg_attribute_unused(); SpinLockAcquire(&MyReplicationSlot->mutex); + /* remember the old restart lsn */ + restart_lsn = MyReplicationSlot->data.restart_lsn; + /* * Prevent moving the confirmed_flush backwards, as this could lead to * data duplication issues caused by replicating already replicated @@ -1843,7 +1838,7 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) MyReplicationSlot->data.confirmed_flush = lsn; /* if we're past the location required for bumping xmin, do so */ - if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr && + if (XLogRecPtrIsValid(MyReplicationSlot->candidate_xmin_lsn) && MyReplicationSlot->candidate_xmin_lsn <= lsn) { /* @@ -1865,10 +1860,10 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) } } - if (MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr && + if (XLogRecPtrIsValid(MyReplicationSlot->candidate_restart_valid) && MyReplicationSlot->candidate_restart_valid <= lsn) { - Assert(MyReplicationSlot->candidate_restart_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(MyReplicationSlot->candidate_restart_lsn)); MyReplicationSlot->data.restart_lsn = MyReplicationSlot->candidate_restart_lsn; MyReplicationSlot->candidate_restart_lsn = InvalidXLogRecPtr; @@ -1881,6 +1876,18 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) /* first write new xmin to disk, so we know what's up after a crash */ if (updated_xmin || updated_restart) { +#ifdef USE_INJECTION_POINTS + XLogSegNo seg1, + seg2; + + XLByteToSeg(restart_lsn, seg1, wal_segment_size); + XLByteToSeg(MyReplicationSlot->data.restart_lsn, seg2, wal_segment_size); + + /* trigger injection point, but only if segment changes */ + if (seg1 != seg2) + INJECTION_POINT("logical-replication-slot-advance-segment", NULL); +#endif + ReplicationSlotMarkDirty(); ReplicationSlotSave(); elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart); @@ -1937,10 +1944,11 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) PgStat_StatReplSlotEntry repSlotStat; /* Nothing to do if we don't have any replication stats to be sent. 
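A note on the message changes in this hunk: the format strings move from "%X/%X" to "%X/%08X", so the low 32 bits of an LSN always print as eight hex digits. The standalone illustration below re-declares LSN_FORMAT_ARGS for the sketch; it mirrors the backend macro that splits a 64-bit LSN into its high and low halves.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t XLogRecPtr;

    /* Split an LSN into high and low 32-bit halves, as the backend macro does. */
    #define LSN_FORMAT_ARGS(lsn) ((uint32_t) ((lsn) >> 32)), ((uint32_t) (lsn))

    int
    main(void)
    {
        XLogRecPtr  lsn = 0x10000002AULL;

        printf("%X/%X\n", LSN_FORMAT_ARGS(lsn));    /* "1/2A": width is lost */
        printf("%X/%08X\n", LSN_FORMAT_ARGS(lsn));  /* "1/0000002A": fixed width */
        return 0;
    }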
*/ - if (rb->spillBytes <= 0 && rb->streamBytes <= 0 && rb->totalBytes <= 0) + if (rb->spillBytes <= 0 && rb->streamBytes <= 0 && rb->totalBytes <= 0 && + rb->memExceededCount <= 0) return; - elog(DEBUG2, "UpdateDecodingStats: updating stats %p %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64, + elog(DEBUG2, "UpdateDecodingStats: updating stats %p %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64, rb, rb->spillTxns, rb->spillCount, @@ -1948,6 +1956,7 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) rb->streamTxns, rb->streamCount, rb->streamBytes, + rb->memExceededCount, rb->totalTxns, rb->totalBytes); @@ -1957,6 +1966,7 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) repSlotStat.stream_txns = rb->streamTxns; repSlotStat.stream_count = rb->streamCount; repSlotStat.stream_bytes = rb->streamBytes; + repSlotStat.mem_exceeded_count = rb->memExceededCount; repSlotStat.total_txns = rb->totalTxns; repSlotStat.total_bytes = rb->totalBytes; @@ -1968,6 +1978,7 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) rb->streamTxns = 0; rb->streamCount = 0; rb->streamBytes = 0; + rb->memExceededCount = 0; rb->totalTxns = 0; rb->totalBytes = 0; } @@ -2064,10 +2075,10 @@ LogicalSlotAdvanceAndCheckSnapState(XLogRecPtr moveto, bool *found_consistent_snapshot) { LogicalDecodingContext *ctx; - ResourceOwner old_resowner = CurrentResourceOwner; + ResourceOwner old_resowner PG_USED_FOR_ASSERTS_ONLY = CurrentResourceOwner; XLogRecPtr retlsn; - Assert(moveto != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(moveto)); if (found_consistent_snapshot) *found_consistent_snapshot = false; @@ -2123,22 +2134,25 @@ LogicalSlotAdvanceAndCheckSnapState(XLogRecPtr moveto, * might still have critical updates to do. */ if (record) + { LogicalDecodingProcessRecord(ctx, ctx->reader); + /* + * We used to have bugs where logical decoding would fail to + * preserve the resource owner. That's important here, so + * verify that that doesn't happen anymore. XXX this could be + * removed once it's been battle-tested. + */ + Assert(CurrentResourceOwner == old_resowner); + } + CHECK_FOR_INTERRUPTS(); } if (found_consistent_snapshot && DecodingContextReady(ctx)) *found_consistent_snapshot = true; - /* - * Logical decoding could have clobbered CurrentResourceOwner during - * transaction management, so restore the executor's value. (This is - * a kluge, but it's not worth cleaning up right now.) - */ - CurrentResourceOwner = old_resowner; - - if (ctx->reader->EndRecPtr != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(ctx->reader->EndRecPtr)) { LogicalConfirmReceivedLocation(moveto); diff --git a/src/backend/replication/logical/logicalctl.c b/src/backend/replication/logical/logicalctl.c new file mode 100644 index 0000000000000..5a0ddf37b8b58 --- /dev/null +++ b/src/backend/replication/logical/logicalctl.c @@ -0,0 +1,639 @@ +/*------------------------------------------------------------------------- + * logicalctl.c + * Functionality to control logical decoding status online. + * + * This module enables dynamic control of logical decoding availability. + * Logical decoding becomes active under two conditions: when the wal_level + * parameter is set to 'logical', or when at least one valid logical replication + * slot exists with wal_level set to 'replica'. The system disables logical + * decoding when neither condition is met. 
Therefore, the dynamic control + * of logical decoding availability is required only when wal_level is set + * to 'replica'. Logical decoding is always enabled when wal_level='logical' + * and always disabled when wal_level='minimal'. + * + * The core concept of dynamically enabling and disabling logical decoding + * is to separately control two aspects: writing information required for + * logical decoding to WAL records, and using logical decoding itself. During + * activation, we first enable logical WAL writing while keeping logical + * decoding disabled. This change is reflected in the read-only + * effective_wal_level GUC parameter. Once we ensure that all processes have + * updated to the latest effective_wal_level value, we then enable logical + * decoding. Deactivation follows a similar careful, multi-step process + * in reverse order. + * + * While activation occurs synchronously right after creating the first + * logical slot, deactivation happens asynchronously through the checkpointer + * process. This design avoids a race condition at the end of recovery; see + * the comments in UpdateLogicalDecodingStatusEndOfRecovery() for details. + * Asynchronous deactivation also avoids excessive toggling of the logical + * decoding status in workloads that repeatedly create and drop a single + * logical slot. On the other hand, this lazy approach can delay changes + * to effective_wal_level and the disabling of logical decoding, especially + * when the checkpointer is busy with other tasks. We chose this lazy approach + * in all deactivation paths to keep the implementation simple, even though + * laziness is strictly required only for end-of-recovery cases. Future work + * might address this limitation either by using a dedicated worker instead + * of the checkpointer, or by implementing synchronous waiting during slot + * drops if workloads are significantly affected by the lazy deactivation + * of logical decoding. + * + * Standby servers use the primary server's effective_wal_level and logical + * decoding status. Unlike normal activation and deactivation, these + * are updated simultaneously without status change coordination, solely by + * replaying XLOG_LOGICAL_DECODING_STATUS_CHANGE records. The local wal_level + * setting has no effect during this time. Upon promotion, we update the + * logical decoding status based on local conditions: the wal_level value and + * the presence of logical slots. + * + * In the future, we could extend support to include automatic transitions + * of effective_wal_level between 'minimal' and 'logical' WAL levels. However, + * this enhancement would require additional coordination mechanisms and + * careful implementation of operations such as terminating walsenders and + * archiver processes, with close attention to the sequence of steps needed + * to keep the system stable during these transitions.
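To make the activation/deactivation ordering described above concrete, here is a toy, single-process model of the invariant this module maintains: logical decoding is never enabled unless logical-info WAL logging is already enabled. It is purely illustrative and elides the shared memory, LWLocks, WAL record, and procsignal barriers the real code uses.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct ctl
    {
        bool xlog_logical_info;     /* extra info is being written to WAL */
        bool decoding_enabled;      /* decoding may rely on that info */
    };

    static void
    enable(struct ctl *c)
    {
        c->xlog_logical_info = true;    /* phase 1: start logging extra info */
        /* real code: EmitProcSignalBarrier() + WaitForProcSignalBarrier() */
        c->decoding_enabled = true;     /* phase 2: decoding may now start */
    }

    static void
    disable(struct ctl *c)
    {
        c->decoding_enabled = false;    /* reverse order on the way down */
        c->xlog_logical_info = false;
    }

    int
    main(void)
    {
        struct ctl  c = {false, false};

        enable(&c);
        /* invariant: decoding never enabled without logical-info logging */
        assert(!c.decoding_enabled || c.xlog_logical_info);
        disable(&c);
        assert(!c.decoding_enabled || c.xlog_logical_info);
        printf("ordering invariant holds\n");
        return 0;
    }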
+ * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/replication/logical/logicalctl.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xloginsert.h" +#include "catalog/pg_control.h" +#include "miscadmin.h" +#include "replication/slot.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/injection_point.h" + +/* + * Struct for controlling the logical decoding status. + * + * This struct is protected by LogicalDecodingControlLock. + */ +typedef struct LogicalDecodingCtlData +{ + /* + * This is the authoritative value used by all processes to determine + * whether to write additional information required by logical decoding to + * WAL. Since this information could be checked frequently, each process + * caches this value in XLogLogicalInfo for better performance. + */ + bool xlog_logical_info; + + /* True if logical decoding is available in the system */ + bool logical_decoding_enabled; + + /* True if logical decoding might need to be disabled */ + bool pending_disable; +} LogicalDecodingCtlData; + +static LogicalDecodingCtlData *LogicalDecodingCtl = NULL; + +/* + * A process-local cache of LogicalDecodingCtl->xlog_logical_info. This is + * initialized at process startup, and updated when processing the process + * barrier signal in ProcessBarrierUpdateXLogLogicalInfo(). If the process + * is in an XID-assigned transaction, the cache update is delayed until the + * transaction ends. See the comments for XLogLogicalInfoUpdatePending for details. + */ +bool XLogLogicalInfo = false; + +/* + * When receiving the PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO signal, if + * an XID is assigned to the current transaction, the process sets this flag and + * delays the XLogLogicalInfo update until the transaction ends. This ensures + * that the XLogLogicalInfo value (typically accessed via XLogLogicalInfoActive) + * remains consistent throughout the transaction. + */ +static bool XLogLogicalInfoUpdatePending = false; + +static void update_xlog_logical_info(void); +static void abort_logical_decoding_activation(int code, Datum arg); +static void write_logical_decoding_status_update_record(bool status); + +Size +LogicalDecodingCtlShmemSize(void) +{ + return sizeof(LogicalDecodingCtlData); +} + +void +LogicalDecodingCtlShmemInit(void) +{ + bool found; + + LogicalDecodingCtl = ShmemInitStruct("Logical decoding control", + LogicalDecodingCtlShmemSize(), + &found); + + if (!found) + MemSet(LogicalDecodingCtl, 0, LogicalDecodingCtlShmemSize()); +} + +/* + * Initialize the logical decoding status in shmem at server startup. This + * must be called ONCE during postmaster or standalone-backend startup. + */ +void +StartupLogicalDecodingStatus(bool last_status) +{ + /* Logical decoding is always disabled when 'minimal' WAL level */ + if (wal_level == WAL_LEVEL_MINIMAL) + return; + + /* + * Set the initial logical decoding status based on the last status. If + * logical decoding was enabled before the last shutdown, it remains + * enabled as we might have set wal_level='logical' or have at least one + * logical slot. + */ + LogicalDecodingCtl->xlog_logical_info = last_status; + LogicalDecodingCtl->logical_decoding_enabled = last_status; +} + +/* + * Update the XLogLogicalInfo cache. 
+ */ +static inline void +update_xlog_logical_info(void) +{ + XLogLogicalInfo = IsXLogLogicalInfoEnabled(); +} + +/* + * Initialize XLogLogicalInfo backend-private cache. This routine is called + * during process initialization. + */ +void +InitializeProcessXLogLogicalInfo(void) +{ + update_xlog_logical_info(); +} + +/* + * This routine is called when we are told to update XLogLogicalInfo + * by a ProcSignalBarrier. + */ +bool +ProcessBarrierUpdateXLogLogicalInfo(void) +{ + if (GetTopTransactionIdIfAny() != InvalidTransactionId) + { + /* Delay updating XLogLogicalInfo until the transaction end */ + XLogLogicalInfoUpdatePending = true; + } + else + update_xlog_logical_info(); + + return true; +} + +/* + * Check the shared memory state and return true if logical decoding is + * enabled on the system. + */ +bool +IsLogicalDecodingEnabled(void) +{ + bool enabled; + + LWLockAcquire(LogicalDecodingControlLock, LW_SHARED); + enabled = LogicalDecodingCtl->logical_decoding_enabled; + LWLockRelease(LogicalDecodingControlLock); + + return enabled; +} + +/* + * Returns true if logical WAL logging is enabled based on the shared memory + * status. + */ +bool +IsXLogLogicalInfoEnabled(void) +{ + bool xlog_logical_info; + + LWLockAcquire(LogicalDecodingControlLock, LW_SHARED); + xlog_logical_info = LogicalDecodingCtl->xlog_logical_info; + LWLockRelease(LogicalDecodingControlLock); + + return xlog_logical_info; +} + +/* + * Reset the local cache at end of the transaction. + */ +void +AtEOXact_LogicalCtl(void) +{ + /* Update the local cache if there is a pending update */ + if (XLogLogicalInfoUpdatePending) + { + update_xlog_logical_info(); + XLogLogicalInfoUpdatePending = false; + } +} + +/* + * Writes an XLOG_LOGICAL_DECODING_STATUS_CHANGE WAL record with the given + * status. + */ +static void +write_logical_decoding_status_update_record(bool status) +{ + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData(&status, sizeof(bool)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_LOGICAL_DECODING_STATUS_CHANGE); + XLogFlush(recptr); +} + +/* + * A PG_ENSURE_ERROR_CLEANUP callback for activating logical decoding, resetting + * the shared flags to revert the logical decoding activation process. + */ +static void +abort_logical_decoding_activation(int code, Datum arg) +{ + Assert(MyReplicationSlot); + Assert(!LogicalDecodingCtl->logical_decoding_enabled); + + elog(DEBUG1, "aborting logical decoding activation process"); + + /* + * Abort the change to xlog_logical_info. We don't need to check + * CheckLogicalSlotExists() as we're still holding a logical slot. + */ + LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE); + LogicalDecodingCtl->xlog_logical_info = false; + LWLockRelease(LogicalDecodingControlLock); + + /* + * Some processes might have already started logical info WAL logging, so + * tell all running processes to update their caches. We don't need to + * wait for all processes to disable xlog_logical_info locally as it's + * always safe to write logical information to WAL records, even when not + * strictly required. + */ + EmitProcSignalBarrier(PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO); +} + +/* + * Enable logical decoding if disabled. + * + * If this function is called during recovery, it simply returns without + * action since the logical decoding status change is not allowed during + * this time. The logical decoding status depends on the status on the primary. 
The caller should use CheckLogicalDecodingRequirements() before calling this + * function to make sure that the logical decoding status can be modified. + * + * Note that there is no interlock between logical decoding activation + * and slot creation. To ensure that logical decoding is enabled, the caller + * needs to call this function after creating a logical slot and before + * initializing the logical decoding context. + */ +void +EnsureLogicalDecodingEnabled(void) +{ + Assert(MyReplicationSlot); + Assert(wal_level >= WAL_LEVEL_REPLICA); + + /* Logical decoding is always enabled */ + if (wal_level >= WAL_LEVEL_LOGICAL) + return; + + if (RecoveryInProgress()) + { + /* + * CheckLogicalDecodingRequirements() must have already errored out if + * logical decoding is not enabled since we cannot change the logical + * decoding status during recovery. + */ + Assert(IsLogicalDecodingEnabled()); + return; + } + + /* + * Ensure that the activation process is aborted if there is an + * interruption during the wait. + */ + PG_ENSURE_ERROR_CLEANUP(abort_logical_decoding_activation, (Datum) 0); + { + EnableLogicalDecoding(); + } + PG_END_ENSURE_ERROR_CLEANUP(abort_logical_decoding_activation, (Datum) 0); +} + +/* + * A workhorse function to enable logical decoding. + */ +void +EnableLogicalDecoding(void) +{ + bool in_recovery; + + LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE); + + /* Return if it is already enabled */ + if (LogicalDecodingCtl->logical_decoding_enabled) + { + LogicalDecodingCtl->pending_disable = false; + LWLockRelease(LogicalDecodingControlLock); + return; + } + + /* + * Set logical info WAL logging in shmem. All processes started after this + * point will include the information required by logical decoding in WAL + * records. + */ + LogicalDecodingCtl->xlog_logical_info = true; + + LWLockRelease(LogicalDecodingControlLock); + + /* + * Tell all running processes to reflect the xlog_logical_info update, and + * wait. This ensures that all running processes have enabled logical + * information WAL logging. + */ + WaitForProcSignalBarrier( + EmitProcSignalBarrier(PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO)); + + INJECTION_POINT("logical-decoding-activation", NULL); + + in_recovery = RecoveryInProgress(); + + /* + * There could be some transactions that might have started with the old + * status, but we don't need to wait for these transactions to complete as + * long as they have valid XIDs. These transactions will appear in the + * xl_running_xacts record and therefore the snapshot builder will not try + * to decode the transaction during the logical decoding initialization. + * + * There is a theoretical case where a transaction decides whether to + * include logical-info in WAL records before getting an XID. In this + * case, the transaction won't appear in xl_running_xacts. + * + * For operations that do not require an XID assignment, the process + * starts including logical-info immediately upon receiving the signal + * (barrier). If such an operation checks the effective_wal_level multiple + * times within a single execution, the resulting WAL records might be + * inconsistent (i.e., logical-info is included in some records but not in + * others). However, this is harmless because logical decoding generally + * ignores WAL records that are not associated with an assigned XID. + * + * One might think we need to wait for all running transactions, including + * those without XIDs and read-only transactions, to finish before + * enabling logical decoding.
However, such a requirement would force the + * slot creation to wait for a potentially very long time due to + * long-running read queries, which is practically unacceptable. + */ + + START_CRIT_SECTION(); + + /* + * We enable logical decoding first, followed by writing the WAL record. + * This sequence ensures logical decoding becomes available on the primary + * first. + */ + LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE); + + LogicalDecodingCtl->logical_decoding_enabled = true; + + if (!in_recovery) + write_logical_decoding_status_update_record(true); + + LogicalDecodingCtl->pending_disable = false; + + LWLockRelease(LogicalDecodingControlLock); + + END_CRIT_SECTION(); + + if (!in_recovery) + ereport(LOG, + errmsg("logical decoding is enabled upon creating a new logical replication slot")); +} + +/* + * Initiate a request for disabling logical decoding. + * + * Note that this function does not verify whether logical slots exist. The + * checkpointer will verify if logical decoding should actually be disabled. + */ +void +RequestDisableLogicalDecoding(void) +{ + if (wal_level != WAL_LEVEL_REPLICA) + return; + + /* + * It's possible that we might not actually need to disable logical + * decoding if someone creates a new logical slot concurrently. We set the + * flag anyway and the checkpointer will check it and disable logical + * decoding if necessary. + */ + LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE); + LogicalDecodingCtl->pending_disable = true; + LWLockRelease(LogicalDecodingControlLock); + + WakeupCheckpointer(); + + elog(DEBUG1, "requested disabling logical decoding"); +} + +/* + * Disable logical decoding if necessary. + * + * This function disables logical decoding upon a request initiated by + * RequestDisableLogicalDecoding(). Otherwise, it performs no action. + */ +void +DisableLogicalDecodingIfNecessary(void) +{ + bool pending_disable; + + if (wal_level != WAL_LEVEL_REPLICA) + return; + + /* + * Sanity check as we cannot disable logical decoding while holding a + * logical slot. + */ + Assert(!MyReplicationSlot); + + if (RecoveryInProgress()) + return; + + LWLockAcquire(LogicalDecodingControlLock, LW_SHARED); + pending_disable = LogicalDecodingCtl->pending_disable; + LWLockRelease(LogicalDecodingControlLock); + + /* Quick return if no pending disable request */ + if (!pending_disable) + return; + + DisableLogicalDecoding(); +} + +/* + * A workhorse function to disable logical decoding. + */ +void +DisableLogicalDecoding(void) +{ + bool in_recovery = RecoveryInProgress(); + + LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE); + + /* + * Check if we can disable logical decoding. + * + * Skip CheckLogicalSlotExists() check during recovery because the + * existing slots will be invalidated after disabling logical decoding. + */ + if (!LogicalDecodingCtl->logical_decoding_enabled || + (!in_recovery && CheckLogicalSlotExists())) + { + LogicalDecodingCtl->pending_disable = false; + LWLockRelease(LogicalDecodingControlLock); + return; + } + + START_CRIT_SECTION(); + + /* + * We need to disable logical decoding first and then disable logical + * information WAL logging in order to ensure that no logical decoding + * processes WAL records with insufficient information. 
+ */ + LogicalDecodingCtl->logical_decoding_enabled = false; + + /* Write the WAL to disable logical decoding on standbys too */ + if (!in_recovery) + write_logical_decoding_status_update_record(false); + + /* Now disable logical information WAL logging */ + LogicalDecodingCtl->xlog_logical_info = false; + LogicalDecodingCtl->pending_disable = false; + + END_CRIT_SECTION(); + + if (!in_recovery) + ereport(LOG, + errmsg("logical decoding is disabled because there are no valid logical replication slots")); + + LWLockRelease(LogicalDecodingControlLock); + + /* + * Tell all running processes to reflect the xlog_logical_info update. + * Unlike when enabling logical decoding, we don't need to wait for all + * processes to complete it in this case. We already disabled logical + * decoding and it's always safe to write logical information to WAL + * records, even when not strictly required. Therefore, we don't need to + * wait for all running transactions to finish either. + */ + EmitProcSignalBarrier(PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO); +} + +/* + * Updates the logical decoding status at end of recovery, and ensures that + * all running processes have the updated XLogLogicalInfo status. This + * function must be called before accepting writes. + */ +void +UpdateLogicalDecodingStatusEndOfRecovery(void) +{ + bool new_status = false; + + Assert(RecoveryInProgress()); + + /* + * With 'minimal' WAL level, there are no logical replication slots during + * recovery. Logical decoding is always disabled, so there is no need to + * synchronize XLogLogicalInfo. + */ + if (wal_level == WAL_LEVEL_MINIMAL) + { + Assert(!IsXLogLogicalInfoEnabled() && !IsLogicalDecodingEnabled()); + return; + } + + LWLockAcquire(LogicalDecodingControlLock, LW_EXCLUSIVE); + + if (wal_level == WAL_LEVEL_LOGICAL || CheckLogicalSlotExists()) + new_status = true; + + /* + * When recovery ends, we need to either enable or disable logical + * decoding based on the wal_level setting and the presence of logical + * slots. We need to note that concurrent slot creation and deletion could + * happen but WAL writes are still not permitted until recovery fully + * completes. Here's how we handle concurrent toggling of logical + * decoding: + * + * For 'enable' case, if there's a concurrent disable request before + * recovery fully completes, the checkpointer will handle it after + * recovery is done. This means there might be a brief period after + * recovery where logical decoding remains enabled even with no logical + * replication slots present. This temporary state is not new - it can + * already occur due to the checkpointer's asynchronous deactivation + * process. + * + * For 'disable' case, backend cannot create logical replication slots + * during recovery (see checks in CheckLogicalDecodingRequirements()), + * which prevents a race condition between disabling logical decoding and + * concurrent slot creation. + */ + if (new_status != LogicalDecodingCtl->logical_decoding_enabled) + { + /* + * Update both the logical decoding status and logical WAL logging + * status. Unlike toggling these status during non-recovery, we don't + * need to worry about the operation order as WAL writes are still not + * permitted. + */ + LogicalDecodingCtl->xlog_logical_info = new_status; + LogicalDecodingCtl->logical_decoding_enabled = new_status; + + elog(DEBUG1, + "update logical decoding status to %d at the end of recovery", + new_status); + + /* + * Now that we updated the logical decoding status, clear the pending + * disable flag. 
It's possible that a concurrent process drops the + * last logical slot and initiates the pending disable again. The + * checkpointer process will check it. + */ + LogicalDecodingCtl->pending_disable = false; + + LWLockRelease(LogicalDecodingControlLock); + + write_logical_decoding_status_update_record(new_status); + } + else + LWLockRelease(LogicalDecodingControlLock); + + /* + * Ensure all running processes have the updated status. We don't need to + * wait for running transactions to finish as we don't accept any writes + * yet. On the other hand, we need to wait for synchronizing + * XLogLogicalInfo even if we've not updated the status above, as the + * status may have been turned on and off during recovery, leaving running + * processes with different status values in their local caches. + */ + if (IsUnderPostmaster) + WaitForProcSignalBarrier( + EmitProcSignalBarrier(PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO)); + + INJECTION_POINT("startup-logical-decoding-status-change-end-of-recovery", NULL); +} diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c index ca53caac2f2f5..cf77ee28dfe70 100644 --- a/src/backend/replication/logical/logicalfuncs.c +++ b/src/backend/replication/logical/logicalfuncs.c @@ -107,7 +107,7 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin XLogRecPtr end_of_wal; XLogRecPtr wait_for_wal_lsn; LogicalDecodingContext *ctx; - ResourceOwner old_resowner = CurrentResourceOwner; + ResourceOwner old_resowner PG_USED_FOR_ASSERTS_ONLY = CurrentResourceOwner; ArrayType *arr; Size ndim; List *options = NIL; @@ -129,7 +129,7 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin upto_lsn = PG_GETARG_LSN(1); if (PG_ARGISNULL(2)) - upto_nchanges = InvalidXLogRecPtr; + upto_nchanges = 0; else upto_nchanges = PG_GETARG_INT32(2); @@ -140,7 +140,7 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin arr = PG_GETARG_ARRAYTYPE_P(3); /* state to write output to */ - p = palloc0(sizeof(DecodingOutputState)); + p = palloc0_object(DecodingOutputState); p->binary_output = binary; @@ -229,7 +229,7 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin * Wait for specified streaming replication standby servers (if any) * to confirm receipt of WAL up to wait_for_wal_lsn. */ - if (XLogRecPtrIsInvalid(upto_lsn)) + if (!XLogRecPtrIsValid(upto_lsn)) wait_for_wal_lsn = end_of_wal; else wait_for_wal_lsn = Min(upto_lsn, end_of_wal); @@ -263,10 +263,20 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin * store the description into our tuplestore. */ if (record != NULL) + { LogicalDecodingProcessRecord(ctx, ctx->reader); + /* + * We used to have bugs where logical decoding would fail to + * preserve the resource owner. Verify that that doesn't + * happen anymore. XXX this could be removed once it's been + * battle-tested. + */ + Assert(CurrentResourceOwner == old_resowner); + } + /* check limits */ - if (upto_lsn != InvalidXLogRecPtr && + if (XLogRecPtrIsValid(upto_lsn) && upto_lsn <= ctx->reader->EndRecPtr) break; if (upto_nchanges != 0 && @@ -275,18 +285,11 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin CHECK_FOR_INTERRUPTS(); } - /* - * Logical decoding could have clobbered CurrentResourceOwner during - * transaction management, so restore the executor's value. (This is - * a kluge, but it's not worth cleaning up right now.
- */ - CurrentResourceOwner = old_resowner; - /* * Next time, start where we left off. (Hunting things, the family * business..) */ - if (ctx->reader->EndRecPtr != InvalidXLogRecPtr && confirm) + if (XLogRecPtrIsValid(ctx->reader->EndRecPtr) && confirm) { LogicalConfirmReceivedLocation(ctx->reader->EndRecPtr); diff --git a/src/backend/replication/logical/meson.build b/src/backend/replication/logical/meson.build index 6f19614c79d8f..928b503addf54 100644 --- a/src/backend/replication/logical/meson.build +++ b/src/backend/replication/logical/meson.build @@ -6,14 +6,17 @@ backend_sources += files( 'decode.c', 'launcher.c', 'logical.c', + 'logicalctl.c', 'logicalfuncs.c', 'message.c', 'origin.c', 'proto.c', 'relation.c', 'reorderbuffer.c', + 'sequencesync.c', 'slotsync.c', 'snapbuild.c', + 'syncutils.c', 'tablesync.c', 'worker.c', ) diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index a17bacf88e7f3..2380f369578e0 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -789,14 +789,6 @@ StartupReplicationOrigin(void) readBytes = read(fd, &disk_state, sizeof(disk_state)); - /* no further data */ - if (readBytes == sizeof(crc)) - { - /* not pretty, but simple ... */ - file_crc = *(pg_crc32c *) &disk_state; - break; - } - if (readBytes < 0) { ereport(PANIC, @@ -805,6 +797,13 @@ StartupReplicationOrigin(void) path))); } + /* no further data */ + if (readBytes == sizeof(crc)) + { + memcpy(&file_crc, &disk_state, sizeof(file_crc)); + break; + } + if (readBytes != sizeof(disk_state)) { ereport(PANIC, @@ -826,9 +825,9 @@ StartupReplicationOrigin(void) last_state++; ereport(LOG, - (errmsg("recovered replication state of node %d to %X/%X", - disk_state.roident, - LSN_FORMAT_ARGS(disk_state.remote_lsn)))); + errmsg("recovered replication state of node %d to %X/%08X", + disk_state.roident, + LSN_FORMAT_ARGS(disk_state.remote_lsn))); } /* now check checksum */ @@ -984,8 +983,8 @@ replorigin_advance(RepOriginId node, /* initialize new slot */ LWLockAcquire(&free_state->lock, LW_EXCLUSIVE); replication_state = free_state; - Assert(replication_state->remote_lsn == InvalidXLogRecPtr); - Assert(replication_state->local_lsn == InvalidXLogRecPtr); + Assert(!XLogRecPtrIsValid(replication_state->remote_lsn)); + Assert(!XLogRecPtrIsValid(replication_state->local_lsn)); replication_state->roident = node; } @@ -1020,7 +1019,7 @@ replorigin_advance(RepOriginId node, */ if (go_backward || replication_state->remote_lsn < remote_commit) replication_state->remote_lsn = remote_commit; - if (local_commit != InvalidXLogRecPtr && + if (XLogRecPtrIsValid(local_commit) && (go_backward || replication_state->local_lsn < local_commit)) replication_state->local_lsn = local_commit; LWLockRelease(&replication_state->lock); @@ -1064,7 +1063,7 @@ replorigin_get_progress(RepOriginId node, bool flush) LWLockRelease(ReplicationOriginLock); - if (flush && local_lsn != InvalidXLogRecPtr) + if (flush && XLogRecPtrIsValid(local_lsn)) XLogFlush(local_lsn); return remote_lsn; @@ -1167,6 +1166,14 @@ replorigin_session_setup(RepOriginId node, int acquired_by) curstate->roident, curstate->acquired_by))); } + else if (curstate->acquired_by != acquired_by) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("could not find replication state slot for replication origin with OID %u which was acquired by %d", + node, acquired_by))); + } + /* ok, found slot */ session_replication_state = curstate; break; @@ -1181,10 +1188,16 @@ 
replorigin_session_setup(RepOriginId node, int acquired_by) errhint("Increase \"max_active_replication_origins\" and try again."))); else if (session_replication_state == NULL) { + if (acquired_by) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use PID %d for inactive replication origin with ID %d", + acquired_by, node))); + /* initialize new slot */ session_replication_state = &replication_states[free_slot]; - Assert(session_replication_state->remote_lsn == InvalidXLogRecPtr); - Assert(session_replication_state->local_lsn == InvalidXLogRecPtr); + Assert(!XLogRecPtrIsValid(session_replication_state->remote_lsn)); + Assert(!XLogRecPtrIsValid(session_replication_state->local_lsn)); session_replication_state->roident = node; } @@ -1193,9 +1206,8 @@ replorigin_session_setup(RepOriginId node, int acquired_by) if (acquired_by == 0) session_replication_state->acquired_by = MyProcPid; - else if (session_replication_state->acquired_by != acquired_by) - elog(ERROR, "could not find replication state slot for replication origin with OID %u which was acquired by %d", - node, acquired_by); + else + Assert(session_replication_state->acquired_by == acquired_by); LWLockRelease(ReplicationOriginLock); @@ -1269,7 +1281,7 @@ replorigin_session_get_progress(bool flush) local_lsn = session_replication_state->local_lsn; LWLockRelease(&session_replication_state->lock); - if (flush && local_lsn != InvalidXLogRecPtr) + if (flush && XLogRecPtrIsValid(local_lsn)) XLogFlush(local_lsn); return remote_lsn; @@ -1374,12 +1386,14 @@ pg_replication_origin_session_setup(PG_FUNCTION_ARGS) { char *name; RepOriginId origin; + int pid; replorigin_check_prerequisites(true, false); name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); origin = replorigin_by_name(name, false); - replorigin_session_setup(origin, 0); + pid = PG_GETARG_INT32(1); + replorigin_session_setup(origin, pid); replorigin_session_origin = origin; @@ -1439,7 +1453,7 @@ pg_replication_origin_session_progress(PG_FUNCTION_ARGS) remote_lsn = replorigin_session_get_progress(flush); - if (remote_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(remote_lsn)) PG_RETURN_NULL(); PG_RETURN_LSN(remote_lsn); @@ -1528,7 +1542,7 @@ pg_replication_origin_progress(PG_FUNCTION_ARGS) remote_lsn = replorigin_get_progress(roident, flush); - if (remote_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(remote_lsn)) PG_RETURN_NULL(); PG_RETURN_LSN(remote_lsn); diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c index 1a352b542dc56..27ad74fd759b6 100644 --- a/src/backend/replication/logical/proto.c +++ b/src/backend/replication/logical/proto.c @@ -52,7 +52,7 @@ logicalrep_write_begin(StringInfo out, ReorderBufferTXN *txn) /* fixed fields */ pq_sendint64(out, txn->final_lsn); - pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint64(out, txn->commit_time); pq_sendint32(out, txn->xid); } @@ -64,7 +64,7 @@ logicalrep_read_begin(StringInfo in, LogicalRepBeginData *begin_data) { /* read fields */ begin_data->final_lsn = pq_getmsgint64(in); - if (begin_data->final_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(begin_data->final_lsn)) elog(ERROR, "final_lsn not set in begin message"); begin_data->committime = pq_getmsgint64(in); begin_data->xid = pq_getmsgint(in, 4); @@ -88,7 +88,7 @@ logicalrep_write_commit(StringInfo out, ReorderBufferTXN *txn, /* send fields */ pq_sendint64(out, commit_lsn); pq_sendint64(out, txn->end_lsn); - pq_sendint64(out, txn->xact_time.commit_time); + 
pq_sendint64(out, txn->commit_time); } /* @@ -120,7 +120,7 @@ logicalrep_write_begin_prepare(StringInfo out, ReorderBufferTXN *txn) /* fixed fields */ pq_sendint64(out, txn->final_lsn); pq_sendint64(out, txn->end_lsn); - pq_sendint64(out, txn->xact_time.prepare_time); + pq_sendint64(out, txn->prepare_time); pq_sendint32(out, txn->xid); /* send gid */ @@ -135,10 +135,10 @@ logicalrep_read_begin_prepare(StringInfo in, LogicalRepPreparedTxnData *begin_da { /* read fields */ begin_data->prepare_lsn = pq_getmsgint64(in); - if (begin_data->prepare_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(begin_data->prepare_lsn)) elog(ERROR, "prepare_lsn not set in begin prepare message"); begin_data->end_lsn = pq_getmsgint64(in); - if (begin_data->end_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(begin_data->end_lsn)) elog(ERROR, "end_lsn not set in begin prepare message"); begin_data->prepare_time = pq_getmsgint64(in); begin_data->xid = pq_getmsgint(in, 4); @@ -173,7 +173,7 @@ logicalrep_write_prepare_common(StringInfo out, LogicalRepMsgType type, /* send fields */ pq_sendint64(out, prepare_lsn); pq_sendint64(out, txn->end_lsn); - pq_sendint64(out, txn->xact_time.prepare_time); + pq_sendint64(out, txn->prepare_time); pq_sendint32(out, txn->xid); /* send gid */ @@ -207,10 +207,10 @@ logicalrep_read_prepare_common(StringInfo in, char *msgtype, /* read fields */ prepare_data->prepare_lsn = pq_getmsgint64(in); - if (prepare_data->prepare_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(prepare_data->prepare_lsn)) elog(ERROR, "prepare_lsn is not set in %s message", msgtype); prepare_data->end_lsn = pq_getmsgint64(in); - if (prepare_data->end_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(prepare_data->end_lsn)) elog(ERROR, "end_lsn is not set in %s message", msgtype); prepare_data->prepare_time = pq_getmsgint64(in); prepare_data->xid = pq_getmsgint(in, 4); @@ -253,7 +253,7 @@ logicalrep_write_commit_prepared(StringInfo out, ReorderBufferTXN *txn, /* send fields */ pq_sendint64(out, commit_lsn); pq_sendint64(out, txn->end_lsn); - pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint64(out, txn->commit_time); pq_sendint32(out, txn->xid); /* send gid */ @@ -274,10 +274,10 @@ logicalrep_read_commit_prepared(StringInfo in, LogicalRepCommitPreparedTxnData * /* read fields */ prepare_data->commit_lsn = pq_getmsgint64(in); - if (prepare_data->commit_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(prepare_data->commit_lsn)) elog(ERROR, "commit_lsn is not set in commit prepared message"); prepare_data->end_lsn = pq_getmsgint64(in); - if (prepare_data->end_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(prepare_data->end_lsn)) elog(ERROR, "end_lsn is not set in commit prepared message"); prepare_data->commit_time = pq_getmsgint64(in); prepare_data->xid = pq_getmsgint(in, 4); @@ -311,7 +311,7 @@ logicalrep_write_rollback_prepared(StringInfo out, ReorderBufferTXN *txn, pq_sendint64(out, prepare_end_lsn); pq_sendint64(out, txn->end_lsn); pq_sendint64(out, prepare_time); - pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint64(out, txn->commit_time); pq_sendint32(out, txn->xid); /* send gid */ @@ -333,10 +333,10 @@ logicalrep_read_rollback_prepared(StringInfo in, /* read fields */ rollback_data->prepare_end_lsn = pq_getmsgint64(in); - if (rollback_data->prepare_end_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(rollback_data->prepare_end_lsn)) elog(ERROR, "prepare_end_lsn is not set in rollback prepared message"); rollback_data->rollback_end_lsn = pq_getmsgint64(in); - if 
(rollback_data->rollback_end_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(rollback_data->rollback_end_lsn)) elog(ERROR, "rollback_end_lsn is not set in rollback prepared message"); rollback_data->prepare_time = pq_getmsgint64(in); rollback_data->rollback_time = pq_getmsgint64(in); @@ -697,7 +697,7 @@ logicalrep_write_rel(StringInfo out, TransactionId xid, Relation rel, LogicalRepRelation * logicalrep_read_rel(StringInfo in) { - LogicalRepRelation *rel = palloc(sizeof(LogicalRepRelation)); + LogicalRepRelation *rel = palloc_object(LogicalRepRelation); rel->remoteid = pq_getmsgint(in, 4); @@ -708,6 +708,9 @@ logicalrep_read_rel(StringInfo in) /* Read the replica identity. */ rel->replident = pq_getmsgbyte(in); + /* relkind is not sent */ + rel->relkind = 0; + /* Get attribute description */ logicalrep_read_attrs(in, rel); @@ -809,7 +812,7 @@ logicalrep_write_tuple(StringInfo out, Relation rel, TupleTableSlot *slot, continue; } - if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(values[i])) + if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(values[i]))) { /* * Unchanged toasted datum. (Note that we don't promise to detect @@ -868,7 +871,7 @@ logicalrep_read_tuple(StringInfo in, LogicalRepTupleData *tuple) /* Allocate space for per-column values; zero out unused StringInfoDatas */ tuple->colvalues = (StringInfoData *) palloc0(natts * sizeof(StringInfoData)); - tuple->colstatus = (char *) palloc(natts * sizeof(char)); + tuple->colstatus = palloc_array(char, natts); tuple->ncols = natts; /* Read the data */ @@ -991,8 +994,8 @@ logicalrep_read_attrs(StringInfo in, LogicalRepRelation *rel) Bitmapset *attkeys = NULL; natts = pq_getmsgint(in, 2); - attnames = palloc(natts * sizeof(char *)); - atttyps = palloc(natts * sizeof(Oid)); + attnames = palloc_array(char *, natts); + atttyps = palloc_array(Oid, natts); /* read the attributes */ for (i = 0; i < natts; i++) @@ -1119,7 +1122,7 @@ logicalrep_write_stream_commit(StringInfo out, ReorderBufferTXN *txn, /* send fields */ pq_sendint64(out, commit_lsn); pq_sendint64(out, txn->end_lsn); - pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint64(out, txn->commit_time); } /* diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index f59046ad620da..2c8485b881f5c 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -188,14 +188,25 @@ logicalrep_relmap_update(LogicalRepRelation *remoterel) entry->remoterel.nspname = pstrdup(remoterel->nspname); entry->remoterel.relname = pstrdup(remoterel->relname); entry->remoterel.natts = remoterel->natts; - entry->remoterel.attnames = palloc(remoterel->natts * sizeof(char *)); - entry->remoterel.atttyps = palloc(remoterel->natts * sizeof(Oid)); + entry->remoterel.attnames = palloc_array(char *, remoterel->natts); + entry->remoterel.atttyps = palloc_array(Oid, remoterel->natts); for (i = 0; i < remoterel->natts; i++) { entry->remoterel.attnames[i] = pstrdup(remoterel->attnames[i]); entry->remoterel.atttyps[i] = remoterel->atttyps[i]; } entry->remoterel.replident = remoterel->replident; + + /* + * XXX The walsender currently does not transmit the relkind of the remote + * relation when replicating changes. Since we support replicating only + * table changes at present, we default to initializing relkind as + * RELKIND_RELATION. This is needed in CheckSubscriptionRelkind() to check + * if the publisher and subscriber relation kinds are compatible. 
+ */ + entry->remoterel.relkind = + (remoterel->relkind == 0) ? RELKIND_RELATION : remoterel->relkind; + entry->remoterel.attkeys = bms_copy(remoterel->attkeys); MemoryContextSwitchTo(oldctx); } @@ -238,6 +249,7 @@ logicalrep_get_attrs_str(LogicalRepRelation *remoterel, Bitmapset *atts) { attcnt++; if (attcnt > 1) + /* translator: This is a separator in a list of entity names. */ appendStringInfoString(&attsbuf, _(", ")); appendStringInfo(&attsbuf, _("\"%s\""), remoterel->attnames[i]); @@ -425,6 +437,7 @@ logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode) /* Check for supported relkind. */ CheckSubscriptionRelkind(entry->localrel->rd_rel->relkind, + remoterel->relkind, remoterel->nspname, remoterel->relname); /* @@ -691,8 +704,8 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root, entry->remoterel.nspname = pstrdup(remoterel->nspname); entry->remoterel.relname = pstrdup(remoterel->relname); entry->remoterel.natts = remoterel->natts; - entry->remoterel.attnames = palloc(remoterel->natts * sizeof(char *)); - entry->remoterel.atttyps = palloc(remoterel->natts * sizeof(Oid)); + entry->remoterel.attnames = palloc_array(char *, remoterel->natts); + entry->remoterel.atttyps = palloc_array(Oid, remoterel->natts); for (i = 0; i < remoterel->natts; i++) { entry->remoterel.attnames[i] = pstrdup(remoterel->attnames[i]); diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 676551118753d..f18c6fb52b570 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -109,10 +109,22 @@ #include "storage/procarray.h" #include "storage/sinval.h" #include "utils/builtins.h" +#include "utils/inval.h" #include "utils/memutils.h" #include "utils/rel.h" #include "utils/relfilenumbermap.h" +/* + * Each transaction has an 8MB limit for invalidation messages distributed from + * other transactions. This limit is set considering scenarios with many + * concurrent logical decoding operations. When the distributed invalidation + * messages reach this threshold, the transaction is marked as + * RBTXN_DISTR_INVAL_OVERFLOWED to invalidate the complete cache as we have lost + * some inval messages and hence don't know what needs to be invalidated. 
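+ * + * Illustrative arithmetic (assuming sizeof(SharedInvalidationMessage) is + * 16 bytes): the limit below works out to (8 * 1024 * 1024) / 16 = 524288 + * distributed inval messages per transaction before the overflow flag is + * set.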
+ */ +#define MAX_DISTR_INVAL_MSG_PER_TXN \ + ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage)) + /* entry for a hash table we use to map from xid to our transaction state */ typedef struct ReorderBufferTXNByIdEnt { @@ -378,6 +390,7 @@ ReorderBufferAllocate(void) buffer->streamTxns = 0; buffer->streamCount = 0; buffer->streamBytes = 0; + buffer->memExceededCount = 0; buffer->totalTxns = 0; buffer->totalBytes = 0; @@ -472,6 +485,12 @@ ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) txn->invalidations = NULL; } + if (txn->invalidations_distributed) + { + pfree(txn->invalidations_distributed); + txn->invalidations_distributed = NULL; + } + /* Reset the toast hash */ ReorderBufferToastReset(rb, txn); @@ -682,7 +701,7 @@ ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, { /* initialize the new entry, if creation was requested */ Assert(ent != NULL); - Assert(lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(lsn)); ent->txn = ReorderBufferAllocTXN(rb); ent->txn->xid = xid; @@ -830,7 +849,7 @@ ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, change->lsn = lsn; change->txn = txn; - Assert(InvalidXLogRecPtr != lsn); + Assert(XLogRecPtrIsValid(lsn)); dlist_push_tail(&txn->changes, &change->node); txn->nentries++; txn->nentries_mem++; @@ -947,14 +966,14 @@ AssertTXNLsnOrder(ReorderBuffer *rb) iter.cur); /* start LSN must be set */ - Assert(cur_txn->first_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(cur_txn->first_lsn)); /* If there is an end LSN, it must be higher than start LSN */ - if (cur_txn->end_lsn != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(cur_txn->end_lsn)) Assert(cur_txn->first_lsn <= cur_txn->end_lsn); /* Current initial LSN must be strictly higher than previous */ - if (prev_first_lsn != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(prev_first_lsn)) Assert(prev_first_lsn < cur_txn->first_lsn); /* known-as-subtxn txns must not be listed */ @@ -971,10 +990,10 @@ AssertTXNLsnOrder(ReorderBuffer *rb) /* base snapshot (and its LSN) must be set */ Assert(cur_txn->base_snapshot != NULL); - Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(cur_txn->base_snapshot_lsn)); /* current LSN must be strictly higher than previous */ - if (prev_base_snap_lsn != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(prev_base_snap_lsn)) Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn); /* known-as-subtxn txns must not be listed */ @@ -1003,11 +1022,11 @@ AssertChangeLsnOrder(ReorderBufferTXN *txn) cur_change = dlist_container(ReorderBufferChange, node, iter.cur); - Assert(txn->first_lsn != InvalidXLogRecPtr); - Assert(cur_change->lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(txn->first_lsn)); + Assert(XLogRecPtrIsValid(cur_change->lsn)); Assert(txn->first_lsn <= cur_change->lsn); - if (txn->end_lsn != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(txn->end_lsn)) Assert(cur_change->lsn <= txn->end_lsn); Assert(prev_lsn <= cur_change->lsn); @@ -1034,7 +1053,7 @@ ReorderBufferGetOldestTXN(ReorderBuffer *rb) txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn); Assert(!rbtxn_is_known_subxact(txn)); - Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(txn->first_lsn)); return txn; } @@ -1397,7 +1416,7 @@ ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) int32 off; /* nothing there anymore */ - if (state->heap->bh_size == 0) + if (binaryheap_empty(state->heap)) return NULL; off = DatumGetInt32(binaryheap_first(state->heap)); @@ 
-2197,6 +2216,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, { bool using_subtxn; MemoryContext ccxt = CurrentMemoryContext; + ResourceOwner cowner = CurrentResourceOwner; ReorderBufferIterTXNState *volatile iterstate = NULL; volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr; ReorderBufferChange *volatile specinsert = NULL; @@ -2256,7 +2276,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, * We can't call start stream callback before processing first * change. */ - if (prev_lsn == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(prev_lsn)) { if (streaming) { @@ -2271,7 +2291,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, * subtransactions. The changes may have the same LSN due to * MULTI_INSERT xlog records. */ - Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn); + Assert(!XLogRecPtrIsValid(prev_lsn) || prev_lsn <= change->lsn); prev_lsn = change->lsn; @@ -2581,7 +2601,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, if (++changes_count >= CHANGES_THRESHOLD) { - rb->update_progress_txn(rb, txn, change->lsn); + rb->update_progress_txn(rb, txn, prev_lsn); changes_count = 0; } } @@ -2661,10 +2681,24 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, AbortCurrentTransaction(); /* make sure there's no cache pollution */ - ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + if (rbtxn_distr_inval_overflowed(txn)) + { + Assert(txn->ninvalidations_distributed == 0); + InvalidateSystemCaches(); + } + else + { + ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed, + txn->invalidations_distributed); + } if (using_subtxn) + { RollbackAndReleaseCurrentSubTransaction(); + MemoryContextSwitchTo(ccxt); + CurrentResourceOwner = cowner; + } /* * We are here due to one of the four reasons: 1. Decoding an @@ -2710,11 +2744,24 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, AbortCurrentTransaction(); /* make sure there's no cache pollution */ - ReorderBufferExecuteInvalidations(txn->ninvalidations, - txn->invalidations); + if (rbtxn_distr_inval_overflowed(txn)) + { + Assert(txn->ninvalidations_distributed == 0); + InvalidateSystemCaches(); + } + else + { + ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed, + txn->invalidations_distributed); + } if (using_subtxn) + { RollbackAndReleaseCurrentSubTransaction(); + MemoryContextSwitchTo(ccxt); + CurrentResourceOwner = cowner; + } /* * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent @@ -2784,7 +2831,7 @@ ReorderBufferReplay(ReorderBufferTXN *txn, txn->final_lsn = commit_lsn; txn->end_lsn = end_lsn; - txn->xact_time.commit_time = commit_time; + txn->commit_time = commit_time; txn->origin_id = origin_id; txn->origin_lsn = origin_lsn; @@ -2876,7 +2923,7 @@ ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid, */ txn->final_lsn = prepare_lsn; txn->end_lsn = end_lsn; - txn->xact_time.prepare_time = prepare_time; + txn->prepare_time = prepare_time; txn->origin_id = origin_id; txn->origin_lsn = origin_lsn; @@ -2928,12 +2975,12 @@ ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid, * have been updated in it by now. 
*/ Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED); - Assert(txn->final_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(txn->final_lsn)); txn->gid = pstrdup(gid); ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn, - txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn); + txn->prepare_time, txn->origin_id, txn->origin_lsn); /* * Send a prepare if not already done so. This might occur if we have @@ -2972,7 +3019,7 @@ ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid, * be later used for rollback. */ prepare_end_lsn = txn->end_lsn; - prepare_time = txn->xact_time.prepare_time; + prepare_time = txn->prepare_time; /* add the gid in the txn */ txn->gid = pstrdup(gid); @@ -2994,7 +3041,7 @@ ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid, */ Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == (RBTXN_IS_PREPARED | RBTXN_SKIPPED_PREPARE)); - Assert(txn->final_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(txn->final_lsn)); /* * By this time the txn has the prepare record information and it is @@ -3004,12 +3051,12 @@ ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid, * prepared after the restart. */ ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn, - txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn); + txn->prepare_time, txn->origin_id, txn->origin_lsn); } txn->final_lsn = commit_lsn; txn->end_lsn = end_lsn; - txn->xact_time.commit_time = commit_time; + txn->commit_time = commit_time; txn->origin_id = origin_id; txn->origin_lsn = origin_lsn; @@ -3049,7 +3096,7 @@ ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, if (txn == NULL) return; - txn->xact_time.abort_time = abort_time; + txn->abort_time = abort_time; /* For streamed transactions notify the remote node about the abort. */ if (rbtxn_is_streamed(txn)) @@ -3060,7 +3107,8 @@ ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, * We might have decoded changes for this transaction that could load * the cache as per the current transaction's view (consider DDL's * happened in this transaction). We don't want the decoding of future - * transactions to use those cache entries so execute invalidations. + * transactions to use those cache entries so execute only the inval + * messages in this transaction. */ if (txn->ninvalidations > 0) ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, @@ -3147,9 +3195,10 @@ ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) txn->final_lsn = lsn; /* - * Process cache invalidation messages if there are any. Even if we're not - * interested in the transaction's contents, it could have manipulated the - * catalog and we need to update the caches according to that. + * Process only cache invalidation messages in this transaction if there + * are any. Even if we're not interested in the transaction's contents, it + * could have manipulated the catalog and we need to update the caches + * according to that. 
*/ if (txn->base_snapshot != NULL && txn->ninvalidations > 0) ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, @@ -3205,6 +3254,8 @@ ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations, SharedInvalidationMessage *invalidations) { bool use_subtxn = IsTransactionOrTransactionBlock(); + MemoryContext ccxt = CurrentMemoryContext; + ResourceOwner cowner = CurrentResourceOwner; int i; if (use_subtxn) @@ -3223,7 +3274,11 @@ ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations, LocalExecuteInvalidationMessage(&invalidations[i]); if (use_subtxn) + { RollbackAndReleaseCurrentSubTransaction(); + MemoryContextSwitchTo(ccxt); + CurrentResourceOwner = cowner; + } } /* @@ -3421,6 +3476,55 @@ ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, txn->ntuplecids++; } +/* + * Add new invalidation messages to the reorder buffer queue. + */ +static void +ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferChange *change; + + change = ReorderBufferAllocChange(rb); + change->action = REORDER_BUFFER_CHANGE_INVALIDATION; + change->data.inval.ninvalidations = nmsgs; + change->data.inval.invalidations = palloc_array(SharedInvalidationMessage, nmsgs); + memcpy(change->data.inval.invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + + ReorderBufferQueueChange(rb, xid, lsn, change, false); +} + +/* + * A helper function for ReorderBufferAddInvalidations() and + * ReorderBufferAddDistributedInvalidations() to accumulate the invalidation + * messages to the **invals_out. + */ +static void +ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out, + uint32 *ninvals_out, + SharedInvalidationMessage *msgs_new, + Size nmsgs_new) +{ + if (*ninvals_out == 0) + { + *ninvals_out = nmsgs_new; + *invals_out = palloc_array(SharedInvalidationMessage, nmsgs_new); + memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new); + } + else + { + /* Enlarge the array of inval messages */ + *invals_out = (SharedInvalidationMessage *) + repalloc(*invals_out, sizeof(SharedInvalidationMessage) * + (*ninvals_out + nmsgs_new)); + memcpy(*invals_out + *ninvals_out, msgs_new, + nmsgs_new * sizeof(SharedInvalidationMessage)); + *ninvals_out += nmsgs_new; + } +} + /* * Accumulate the invalidations for executing them later. * @@ -3441,7 +3545,6 @@ ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, { ReorderBufferTXN *txn; MemoryContext oldcontext; - ReorderBufferChange *change; txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); @@ -3456,35 +3559,76 @@ ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, Assert(nmsgs > 0); - /* Accumulate invalidations. */ - if (txn->ninvalidations == 0) - { - txn->ninvalidations = nmsgs; - txn->invalidations = (SharedInvalidationMessage *) - palloc(sizeof(SharedInvalidationMessage) * nmsgs); - memcpy(txn->invalidations, msgs, - sizeof(SharedInvalidationMessage) * nmsgs); - } - else + ReorderBufferAccumulateInvalidations(&txn->invalidations, + &txn->ninvalidations, + msgs, nmsgs); + + ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accumulate the invalidations distributed by other committed transactions + * for executing them later. 
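+ * + * An illustrative scenario (transaction names invented): transaction A + * commits a catalog change while transaction B is still being decoded; A's + * inval messages are distributed to B so that, once B's decoding passes A's + * commit, B does not keep using cache entries that A invalidated.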
+ * + * This function is similar to ReorderBufferAddInvalidations() but stores + * the given inval messages in txn->invalidations_distributed, with an + * overflow check. + * + * This needs to be called for committed transactions to distribute their + * inval messages to in-progress transactions. + */ +void +ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferTXN *txn; + MemoryContext oldcontext; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* + * Collect all the invalidations under the top transaction, if available, + * so that we can execute them all together. See comments in + * ReorderBufferAddInvalidations(). + */ + txn = rbtxn_get_toptxn(txn); + + Assert(nmsgs > 0); + + if (!rbtxn_distr_inval_overflowed(txn)) { - txn->invalidations = (SharedInvalidationMessage *) - repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) * - (txn->ninvalidations + nmsgs)); + /* + * Check whether the transaction has enough space to store the + * distributed invalidation messages. + */ + if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN) + { + /* + * Mark the transaction as overflowed and free up the + * messages accumulated so far. + */ + txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED; - memcpy(txn->invalidations + txn->ninvalidations, msgs, - nmsgs * sizeof(SharedInvalidationMessage)); - txn->ninvalidations += nmsgs; + if (txn->invalidations_distributed) + { + pfree(txn->invalidations_distributed); + txn->invalidations_distributed = NULL; + txn->ninvalidations_distributed = 0; + } + } + else + ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed, + &txn->ninvalidations_distributed, + msgs, nmsgs); } - change = ReorderBufferAllocChange(rb); - change->action = REORDER_BUFFER_CHANGE_INVALIDATION; - change->data.inval.ninvalidations = nmsgs; - change->data.inval.invalidations = (SharedInvalidationMessage *) - palloc(sizeof(SharedInvalidationMessage) * nmsgs); - memcpy(change->data.inval.invalidations, msgs, - sizeof(SharedInvalidationMessage) * nmsgs); - - ReorderBufferQueueChange(rb, xid, lsn, change, false); + /* Queue the invalidation messages into the transaction */ + ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs); MemoryContextSwitchTo(oldcontext); } @@ -3555,8 +3699,7 @@ ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb) return NULL; /* Initialize XID array */ - xids = (TransactionId *) palloc(sizeof(TransactionId) * - dclist_count(&rb->catchange_txns)); + xids = palloc_array(TransactionId, dclist_count(&rb->catchange_txns)); dclist_foreach(iter, &rb->catchange_txns) { ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN, @@ -3753,14 +3896,26 @@ static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb) { ReorderBufferTXN *txn; + bool update_stats = true; - /* - * Bail out if debug_logical_replication_streaming is buffered and we - * haven't exceeded the memory limit. - */ - if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED && - rb->size < logical_decoding_work_mem * (Size) 1024) + if (rb->size >= logical_decoding_work_mem * (Size) 1024) + { + /* + * Update the statistics as the memory usage has reached the limit. We + * report the statistics update later in this function since we can + * update the slot statistics altogether while streaming or + * serializing transactions in most cases.
+ */ + rb->memExceededCount += 1; + } + else if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED) + { + /* + * Bail out if debug_logical_replication_streaming is buffered and we + * haven't exceeded the memory limit. + */ return; + } /* * If debug_logical_replication_streaming is immediate, loop until there's @@ -3820,8 +3975,17 @@ ReorderBufferCheckMemoryLimit(ReorderBuffer *rb) */ Assert(txn->size == 0); Assert(txn->nentries_mem == 0); + + /* + * We've reported the memExceededCount update while streaming or + * serializing the transaction. + */ + update_stats = false; } + if (update_stats) + UpdateDecodingStats((LogicalDecodingContext *) rb->private_data); + /* We must be under the memory limit now. */ Assert(rb->size < logical_decoding_work_mem * (Size) 1024); } @@ -4385,8 +4549,8 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, dlist_mutable_iter cleanup_iter; File *fd = &file->vfd; - Assert(txn->first_lsn != InvalidXLogRecPtr); - Assert(txn->final_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(txn->first_lsn)); + Assert(XLogRecPtrIsValid(txn->final_lsn)); /* free current entries, so we have memory for more */ dlist_foreach_modify(cleanup_iter, &txn->changes) @@ -4693,8 +4857,8 @@ ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn) XLogSegNo cur; XLogSegNo last; - Assert(txn->first_lsn != InvalidXLogRecPtr); - Assert(txn->final_lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(txn->first_lsn)); + Assert(XLogRecPtrIsValid(txn->final_lsn)); XLByteToSeg(txn->first_lsn, first, wal_segment_size); XLByteToSeg(txn->final_lsn, last, wal_segment_size); @@ -4787,7 +4951,7 @@ StartupReorderBuffer(void) continue; /* if it cannot be a slot, skip the directory */ - if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2)) + if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2)) continue; /* @@ -4957,9 +5121,9 @@ ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, toast_desc = RelationGetDescr(toast_rel); /* should we allocate from stack instead? 
*/ - attrs = palloc0(sizeof(Datum) * desc->natts); - isnull = palloc0(sizeof(bool) * desc->natts); - free = palloc0(sizeof(bool) * desc->natts); + attrs = palloc0_array(Datum, desc->natts); + isnull = palloc0_array(bool, desc->natts); + free = palloc0_array(bool, desc->natts); newtup = change->data.tp.newtuple; @@ -4967,7 +5131,7 @@ ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, for (natt = 0; natt < desc->natts; natt++) { - Form_pg_attribute attr = TupleDescAttr(desc, natt); + CompactAttribute *attr = TupleDescCompactAttr(desc, natt); ReorderBufferToastEnt *ent; struct varlena *varlena; @@ -4979,10 +5143,6 @@ ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, dlist_iter it; Size data_done = 0; - /* system columns aren't toasted */ - if (attr->attnum < 0) - continue; - if (attr->attisdropped) continue; @@ -5368,7 +5528,7 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) continue; /* ok, relevant, queue for apply */ - f = palloc(sizeof(RewriteMappingFile)); + f = palloc_object(RewriteMappingFile); f->lsn = f_lsn; strcpy(f->fname, mapping_de->d_name); files = lappend(files, f); diff --git a/src/backend/replication/logical/sequencesync.c b/src/backend/replication/logical/sequencesync.c new file mode 100644 index 0000000000000..019e5ec6a7dad --- /dev/null +++ b/src/backend/replication/logical/sequencesync.c @@ -0,0 +1,755 @@ +/*------------------------------------------------------------------------- + * sequencesync.c + * PostgreSQL logical replication: sequence synchronization + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/sequencesync.c + * + * NOTES + * This file contains code for sequence synchronization for + * logical replication. + * + * Sequences requiring synchronization are tracked in the pg_subscription_rel + * catalog. + * + * Sequences to be synchronized will be added with state INIT when either of + * the following commands is executed: + * CREATE SUBSCRIPTION + * ALTER SUBSCRIPTION ... REFRESH PUBLICATION + * + * Executing the following command resets all sequences in the subscription to + * state INIT, triggering re-synchronization: + * ALTER SUBSCRIPTION ... REFRESH SEQUENCES + * + * The apply worker periodically scans pg_subscription_rel for sequences in + * INIT state. When such sequences are found, it spawns a sequencesync worker + * to handle synchronization. + * + * A single sequencesync worker is responsible for synchronizing all sequences. + * It begins by retrieving the list of sequences that are flagged for + * synchronization, i.e., those in the INIT state. These sequences are then + * processed in batches, allowing multiple entries to be synchronized within a + * single transaction. The worker fetches the current sequence values and page + * LSNs from the remote publisher, updates the corresponding sequences on the + * local subscriber, and finally marks each sequence as READY upon successful + * synchronization. + * + * Sequence state transitions follow this pattern: + * INIT -> READY + * + * To avoid creating too many transactions, up to MAX_SEQUENCES_SYNC_PER_BATCH + * sequences are synchronized per transaction. The locks on the sequence + * relation will be periodically released at each transaction commit. 
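+ * + * For example (illustrative numbers): with 250 sequences in INIT state and + * MAX_SEQUENCES_SYNC_PER_BATCH = 100, the worker synchronizes them in three + * transactions covering 100, 100, and 50 sequences respectively.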
+ * + * XXX: We did not choose the launcher process to launch the sequencesync + * worker because the launcher has no database connection, which is needed + * to access the sequences requiring synchronization in the + * pg_subscription_rel system catalog. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/table.h" +#include "catalog/pg_sequence.h" +#include "catalog/pg_subscription_rel.h" +#include "commands/sequence.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "replication/logicalworker.h" +#include "replication/worker_internal.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/syscache.h" +#include "utils/usercontext.h" + +#define REMOTE_SEQ_COL_COUNT 10 + +typedef enum CopySeqResult +{ + COPYSEQ_SUCCESS, + COPYSEQ_MISMATCH, + COPYSEQ_INSUFFICIENT_PERM, + COPYSEQ_SKIPPED +} CopySeqResult; + +static List *seqinfos = NIL; + +/* + * The apply worker determines whether sequence synchronization is needed. + * + * Start a sequencesync worker if one is not already running. The active + * sequencesync worker will handle all pending sequence synchronization. If any + * sequences remain unsynchronized after it exits, a new worker can be started + * in the next iteration. + */ +void +ProcessSequencesForSync(void) +{ + LogicalRepWorker *sequencesync_worker; + int nsyncworkers; + bool has_pending_sequences; + bool started_tx; + + FetchRelationStates(NULL, &has_pending_sequences, &started_tx); + + if (started_tx) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + if (!has_pending_sequences) + return; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + /* Is there a sequencesync worker already running? */ + sequencesync_worker = logicalrep_worker_find(WORKERTYPE_SEQUENCESYNC, + MyLogicalRepWorker->subid, + InvalidOid, true); + if (sequencesync_worker) + { + LWLockRelease(LogicalRepWorkerLock); + return; + } + + /* + * Count running sync workers for this subscription, while we have the + * lock. + */ + nsyncworkers = logicalrep_sync_worker_count(MyLogicalRepWorker->subid); + LWLockRelease(LogicalRepWorkerLock); + + /* + * It is okay to read/update last_seqsync_start_time here in the apply + * worker, as we have already ensured that no sequencesync worker exists. + */ + launch_sync_worker(WORKERTYPE_SEQUENCESYNC, nsyncworkers, InvalidOid, + &MyLogicalRepWorker->last_seqsync_start_time); +} + +/* + * get_sequences_string + * + * Build a comma-separated string of schema-qualified sequence names + * for the given list of sequence indexes. + */ +static void +get_sequences_string(List *seqindexes, StringInfo buf) +{ + resetStringInfo(buf); + foreach_int(seqidx, seqindexes) + { + LogicalRepSequenceInfo *seqinfo = + (LogicalRepSequenceInfo *) list_nth(seqinfos, seqidx); + + if (buf->len > 0) + appendStringInfoString(buf, ", "); + + appendStringInfo(buf, "\"%s.%s\"", seqinfo->nspname, seqinfo->seqname); + } +} + +/* + * report_sequence_errors + * + * Report discrepancies found during sequence synchronization between + * the publisher and subscriber. Emits warnings for: + * a) mismatched definitions or concurrent rename + * b) insufficient privileges + * c) missing sequences on the publisher + * Then raises an ERROR to indicate synchronization failure.
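+ * + * A hypothetical failure report (sequence and subscription names invented) + * might look like: + * WARNING: mismatched or renamed sequences on subscriber ("public.s1", "public.s2") + * ERROR: logical replication sequence synchronization failed for subscription "sub1"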
+ */ +static void +report_sequence_errors(List *mismatched_seqs_idx, List *insuffperm_seqs_idx, + List *missing_seqs_idx) +{ + StringInfo seqstr; + + /* Quick exit if there are no errors to report */ + if (!mismatched_seqs_idx && !insuffperm_seqs_idx && !missing_seqs_idx) + return; + + seqstr = makeStringInfo(); + + if (mismatched_seqs_idx) + { + get_sequences_string(mismatched_seqs_idx, seqstr); + ereport(WARNING, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg_plural("mismatched or renamed sequence on subscriber (%s)", + "mismatched or renamed sequences on subscriber (%s)", + list_length(mismatched_seqs_idx), + seqstr->data)); + } + + if (insuffperm_seqs_idx) + { + get_sequences_string(insuffperm_seqs_idx, seqstr); + ereport(WARNING, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg_plural("insufficient privileges on sequence (%s)", + "insufficient privileges on sequences (%s)", + list_length(insuffperm_seqs_idx), + seqstr->data)); + } + + if (missing_seqs_idx) + { + get_sequences_string(missing_seqs_idx, seqstr); + ereport(WARNING, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg_plural("missing sequence on publisher (%s)", + "missing sequences on publisher (%s)", + list_length(missing_seqs_idx), + seqstr->data)); + } + + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication sequence synchronization failed for subscription \"%s\"", + MySubscription->name)); +} + +/* + * get_and_validate_seq_info + * + * Extracts remote sequence information from the tuple slot received from the + * publisher, and validates it against the corresponding local sequence + * definition. + */ +static CopySeqResult +get_and_validate_seq_info(TupleTableSlot *slot, Relation *sequence_rel, + LogicalRepSequenceInfo **seqinfo, int *seqidx) +{ + bool isnull; + int col = 0; + Oid remote_typid; + int64 remote_start; + int64 remote_increment; + int64 remote_min; + int64 remote_max; + bool remote_cycle; + CopySeqResult result = COPYSEQ_SUCCESS; + HeapTuple tup; + Form_pg_sequence local_seq; + LogicalRepSequenceInfo *seqinfo_local; + + *seqidx = DatumGetInt32(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + /* Identify the corresponding local sequence for the given index. */ + *seqinfo = seqinfo_local = + (LogicalRepSequenceInfo *) list_nth(seqinfos, *seqidx); + + seqinfo_local->last_value = DatumGetInt64(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + seqinfo_local->is_called = DatumGetBool(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + seqinfo_local->page_lsn = DatumGetLSN(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + remote_typid = DatumGetObjectId(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + remote_start = DatumGetInt64(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + remote_increment = DatumGetInt64(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + remote_min = DatumGetInt64(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + remote_max = DatumGetInt64(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + remote_cycle = DatumGetBool(slot_getattr(slot, ++col, &isnull)); + Assert(!isnull); + + /* Sanity check */ + Assert(col == REMOTE_SEQ_COL_COUNT); + + seqinfo_local->found_on_pub = true; + + *sequence_rel = try_table_open(seqinfo_local->localrelid, RowExclusiveLock); + + /* Sequence was concurrently dropped? 
*/ + if (!*sequence_rel) + return COPYSEQ_SKIPPED; + + tup = SearchSysCache1(SEQRELID, ObjectIdGetDatum(seqinfo_local->localrelid)); + + /* Sequence was concurrently dropped? */ + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for sequence %u", + seqinfo_local->localrelid); + + local_seq = (Form_pg_sequence) GETSTRUCT(tup); + + /* Sequence parameters for remote/local are the same? */ + if (local_seq->seqtypid != remote_typid || + local_seq->seqstart != remote_start || + local_seq->seqincrement != remote_increment || + local_seq->seqmin != remote_min || + local_seq->seqmax != remote_max || + local_seq->seqcycle != remote_cycle) + result = COPYSEQ_MISMATCH; + + /* Sequence was concurrently renamed? */ + if (strcmp(seqinfo_local->nspname, + get_namespace_name(RelationGetNamespace(*sequence_rel))) || + strcmp(seqinfo_local->seqname, RelationGetRelationName(*sequence_rel))) + result = COPYSEQ_MISMATCH; + + ReleaseSysCache(tup); + return result; +} + +/* + * Apply remote sequence state to local sequence and mark it as + * synchronized (READY). + */ +static CopySeqResult +copy_sequence(LogicalRepSequenceInfo *seqinfo, Oid seqowner) +{ + UserContext ucxt; + AclResult aclresult; + bool run_as_owner = MySubscription->runasowner; + Oid seqoid = seqinfo->localrelid; + + /* + * If the user did not opt to run as the owner of the subscription + * ('run_as_owner'), then copy the sequence as the owner of the sequence. + */ + if (!run_as_owner) + SwitchToUntrustedUser(seqowner, &ucxt); + + aclresult = pg_class_aclcheck(seqoid, GetUserId(), ACL_UPDATE); + + if (aclresult != ACLCHECK_OK) + { + if (!run_as_owner) + RestoreUserContext(&ucxt); + + return COPYSEQ_INSUFFICIENT_PERM; + } + + /* + * The log counter (log_cnt) tracks how many sequence values are still + * unused locally. It is only relevant to the local node and managed + * internally by nextval() when allocating new ranges. Since log_cnt does + * not affect the visible sequence state (like last_value or is_called) + * and is only used for local caching, it need not be copied to the + * subscriber during synchronization. + */ + SetSequence(seqoid, seqinfo->last_value, seqinfo->is_called); + + if (!run_as_owner) + RestoreUserContext(&ucxt); + + /* + * Record the remote sequence's LSN in pg_subscription_rel and mark the + * sequence as READY. + */ + UpdateSubscriptionRelState(MySubscription->oid, seqoid, SUBREL_STATE_READY, + seqinfo->page_lsn, false); + + return COPYSEQ_SUCCESS; +} + +/* + * Copy existing data of sequences from the publisher. 
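+ * + * Each batch runs in its own transaction: the worker builds a VALUES list + * for up to MAX_SEQUENCES_SYNC_PER_BATCH pending sequences, fetches their + * state from the publisher in a single query, applies it locally, and + * commits before starting the next batch.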
+ */ +static void +copy_sequences(WalReceiverConn *conn) +{ + int cur_batch_base_index = 0; + int n_seqinfos = list_length(seqinfos); + List *mismatched_seqs_idx = NIL; + List *missing_seqs_idx = NIL; + List *insuffperm_seqs_idx = NIL; + StringInfo seqstr = makeStringInfo(); + StringInfo cmd = makeStringInfo(); + MemoryContext oldctx; + +#define MAX_SEQUENCES_SYNC_PER_BATCH 100 + + elog(DEBUG1, + "logical replication sequence synchronization for subscription \"%s\" - total unsynchronized: %d", + MySubscription->name, n_seqinfos); + + while (cur_batch_base_index < n_seqinfos) + { + Oid seqRow[REMOTE_SEQ_COL_COUNT] = {INT8OID, INT8OID, + BOOLOID, LSNOID, OIDOID, INT8OID, INT8OID, INT8OID, INT8OID, BOOLOID}; + int batch_size = 0; + int batch_succeeded_count = 0; + int batch_mismatched_count = 0; + int batch_skipped_count = 0; + int batch_insuffperm_count = 0; + int batch_missing_count; + Relation sequence_rel; + + WalRcvExecResult *res; + TupleTableSlot *slot; + + StartTransactionCommand(); + + for (int idx = cur_batch_base_index; idx < n_seqinfos; idx++) + { + char *nspname_literal; + char *seqname_literal; + + LogicalRepSequenceInfo *seqinfo = + (LogicalRepSequenceInfo *) list_nth(seqinfos, idx); + + if (seqstr->len > 0) + appendStringInfoString(seqstr, ", "); + + nspname_literal = quote_literal_cstr(seqinfo->nspname); + seqname_literal = quote_literal_cstr(seqinfo->seqname); + + appendStringInfo(seqstr, "(%s, %s, %d)", + nspname_literal, seqname_literal, idx); + + if (++batch_size == MAX_SEQUENCES_SYNC_PER_BATCH) + break; + } + + /* + * We deliberately avoid acquiring a local lock on the sequence before + * querying the publisher to prevent potential distributed deadlocks + * in bi-directional replication setups. + * + * Example scenario: + * + * - On each node, a background worker acquires a lock on a sequence + * as part of a sync operation. + * + * - Concurrently, a user transaction attempts to alter the same + * sequence, waiting on the background worker's lock. + * + * - Meanwhile, a query from the other node tries to access metadata + * that depends on the completion of the alter operation. + * + * - This creates a circular wait across nodes: + * + * Node-1: Query -> waits on Alter -> waits on Sync Worker + * + * Node-2: Query -> waits on Alter -> waits on Sync Worker + * + * Since each node only sees part of the wait graph, the deadlock may + * go undetected, leading to indefinite blocking. + * + * Note: Each entry in VALUES includes an index 'seqidx' that + * represents the sequence's position in the local 'seqinfos' list. + * This index is propagated to the query results and later used to + * directly map the fetched publisher sequence rows back to their + * corresponding local entries without relying on result order or name + * matching. 
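+ * + * For illustration (hypothetical sequence names), with two pending + * sequences the generated clause would read: + * ( VALUES ('public', 's1', 0), ('public', 's2', 1) ) AS s (schname, seqname, seqidx)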
+ */ + appendStringInfo(cmd, + "SELECT s.seqidx, ps.*, seq.seqtypid,\n" + " seq.seqstart, seq.seqincrement, seq.seqmin,\n" + " seq.seqmax, seq.seqcycle\n" + "FROM ( VALUES %s ) AS s (schname, seqname, seqidx)\n" + "JOIN pg_namespace n ON n.nspname = s.schname\n" + "JOIN pg_class c ON c.relnamespace = n.oid AND c.relname = s.seqname\n" + "JOIN pg_sequence seq ON seq.seqrelid = c.oid\n" + "JOIN LATERAL pg_get_sequence_data(seq.seqrelid) AS ps ON true\n", + seqstr->data); + + res = walrcv_exec(conn, cmd->data, lengthof(seqRow), seqRow); + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not fetch sequence information from the publisher: %s", + res->err)); + + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + CopySeqResult sync_status; + LogicalRepSequenceInfo *seqinfo; + int seqidx; + + CHECK_FOR_INTERRUPTS(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + sync_status = get_and_validate_seq_info(slot, &sequence_rel, + &seqinfo, &seqidx); + if (sync_status == COPYSEQ_SUCCESS) + sync_status = copy_sequence(seqinfo, + sequence_rel->rd_rel->relowner); + + switch (sync_status) + { + case COPYSEQ_SUCCESS: + elog(DEBUG1, + "logical replication synchronization for subscription \"%s\", sequence \"%s.%s\" has finished", + MySubscription->name, seqinfo->nspname, + seqinfo->seqname); + batch_succeeded_count++; + break; + case COPYSEQ_MISMATCH: + + /* + * Remember mismatched sequences in a long-lived memory + * context since these will be used after the transaction + * is committed. + */ + oldctx = MemoryContextSwitchTo(ApplyContext); + mismatched_seqs_idx = lappend_int(mismatched_seqs_idx, + seqidx); + MemoryContextSwitchTo(oldctx); + batch_mismatched_count++; + break; + case COPYSEQ_INSUFFICIENT_PERM: + + /* + * Remember sequences with insufficient privileges in a + * long-lived memory context since these will be used + * after the transaction is committed. 
+ */ + oldctx = MemoryContextSwitchTo(ApplyContext); + insuffperm_seqs_idx = lappend_int(insuffperm_seqs_idx, + seqidx); + MemoryContextSwitchTo(oldctx); + batch_insuffperm_count++; + break; + case COPYSEQ_SKIPPED: + ereport(LOG, + errmsg("skip synchronization of sequence \"%s.%s\" because it has been dropped concurrently", + seqinfo->nspname, + seqinfo->seqname)); + batch_skipped_count++; + break; + } + + if (sequence_rel) + table_close(sequence_rel, NoLock); + } + + ExecDropSingleTupleTableSlot(slot); + walrcv_clear_result(res); + resetStringInfo(seqstr); + resetStringInfo(cmd); + + batch_missing_count = batch_size - (batch_succeeded_count + + batch_mismatched_count + + batch_insuffperm_count + + batch_skipped_count); + + elog(DEBUG1, + "logical replication sequence synchronization for subscription \"%s\" - batch #%d = %d attempted, %d succeeded, %d mismatched, %d insufficient permission, %d missing from publisher, %d skipped", + MySubscription->name, + (cur_batch_base_index / MAX_SEQUENCES_SYNC_PER_BATCH) + 1, + batch_size, batch_succeeded_count, batch_mismatched_count, + batch_insuffperm_count, batch_missing_count, batch_skipped_count); + + /* Commit this batch, and prepare for next batch */ + CommitTransactionCommand(); + + if (batch_missing_count) + { + for (int idx = cur_batch_base_index; idx < cur_batch_base_index + batch_size; idx++) + { + LogicalRepSequenceInfo *seqinfo = + (LogicalRepSequenceInfo *) list_nth(seqinfos, idx); + + /* If the sequence was not found on publisher, record it */ + if (!seqinfo->found_on_pub) + missing_seqs_idx = lappend_int(missing_seqs_idx, idx); + } + } + + /* + * cur_batch_base_index is not incremented sequentially because some + * sequences may be missing, and the number of fetched rows may not + * match the batch size. + */ + cur_batch_base_index += batch_size; + } + + /* Report mismatches, permission issues, or missing sequences */ + report_sequence_errors(mismatched_seqs_idx, insuffperm_seqs_idx, + missing_seqs_idx); +} + +/* + * Identifies sequences that require synchronization and initiates the + * synchronization process. + */ +static void +LogicalRepSyncSequences(void) +{ + char *err; + bool must_use_password; + Relation rel; + HeapTuple tup; + ScanKeyData skey[2]; + SysScanDesc scan; + Oid subid = MyLogicalRepWorker->subid; + StringInfoData app_name; + + StartTransactionCommand(); + + rel = table_open(SubscriptionRelRelationId, AccessShareLock); + + ScanKeyInit(&skey[0], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(subid)); + + ScanKeyInit(&skey[1], + Anum_pg_subscription_rel_srsubstate, + BTEqualStrategyNumber, F_CHAREQ, + CharGetDatum(SUBREL_STATE_INIT)); + + scan = systable_beginscan(rel, InvalidOid, false, + NULL, 2, skey); + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_subscription_rel subrel; + LogicalRepSequenceInfo *seq; + Relation sequence_rel; + MemoryContext oldctx; + + CHECK_FOR_INTERRUPTS(); + + subrel = (Form_pg_subscription_rel) GETSTRUCT(tup); + + sequence_rel = try_table_open(subrel->srrelid, RowExclusiveLock); + + /* Skip if sequence was dropped concurrently */ + if (!sequence_rel) + continue; + + /* Skip if the relation is not a sequence */ + if (sequence_rel->rd_rel->relkind != RELKIND_SEQUENCE) + { + table_close(sequence_rel, NoLock); + continue; + } + + /* + * Worker needs to process sequences across transaction boundary, so + * allocate them under long-lived context. 
+ */ + oldctx = MemoryContextSwitchTo(ApplyContext); + + seq = palloc0_object(LogicalRepSequenceInfo); + seq->localrelid = subrel->srrelid; + seq->nspname = get_namespace_name(RelationGetNamespace(sequence_rel)); + seq->seqname = pstrdup(RelationGetRelationName(sequence_rel)); + seqinfos = lappend(seqinfos, seq); + + MemoryContextSwitchTo(oldctx); + + table_close(sequence_rel, NoLock); + } + + /* Cleanup */ + systable_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + /* + * Exit early if no catalog entries were found, likely due to concurrent + * drops. + */ + if (!seqinfos) + return; + + /* Is the use of a password mandatory? */ + must_use_password = MySubscription->passwordrequired && + !MySubscription->ownersuperuser; + + initStringInfo(&app_name); + appendStringInfo(&app_name, "pg_%u_sequence_sync_" UINT64_FORMAT, + MySubscription->oid, GetSystemIdentifier()); + + /* + * Establish the connection to the publisher for sequence synchronization. + */ + LogRepWorkerWalRcvConn = + walrcv_connect(MySubscription->conninfo, true, true, + must_use_password, + app_name.data, &err); + if (LogRepWorkerWalRcvConn == NULL) + ereport(ERROR, + errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("sequencesync worker for subscription \"%s\" could not connect to the publisher: %s", + MySubscription->name, err)); + + pfree(app_name.data); + + copy_sequences(LogRepWorkerWalRcvConn); +} + +/* + * Execute the initial sync with error handling. Disable the subscription if + * required. + * + * Note that we don't handle FATAL errors, which are probably due to system + * resource errors and are not repeatable. + */ +static void +start_sequence_sync(void) +{ + Assert(am_sequencesync_worker()); + + PG_TRY(); + { + /* Call initial sync. */ + LogicalRepSyncSequences(); + } + PG_CATCH(); + { + if (MySubscription->disableonerr) + DisableSubscriptionAndExit(); + else + { + /* + * Report that the worker failed during sequence synchronization. + * Abort the current transaction so that the stats message is sent + * in an idle state. + */ + AbortOutOfAnyTransaction(); + pgstat_report_subscription_error(MySubscription->oid, + WORKERTYPE_SEQUENCESYNC); + + PG_RE_THROW(); + } + } + PG_END_TRY(); +} + +/* Logical Replication sequencesync worker entry point */ +void +SequenceSyncWorkerMain(Datum main_arg) +{ + int worker_slot = DatumGetInt32(main_arg); + + SetupApplyOrSyncWorker(worker_slot); + + start_sequence_sync(); + + FinishSyncWorker(); +} diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 656e66e0ae0a1..2aea776352d94 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -39,6 +39,13 @@ * the last cycle. Refer to the comments above wait_for_slot_activity() for * more details. * + * If the SQL function pg_sync_replication_slots() is used to sync the slots, + * and the slots are not yet ready to be synced and are therefore marked + * RS_TEMPORARY for any of the reasons mentioned above, the SQL function + * waits and retries until the slots are marked RS_PERSISTENT (which means + * sync-ready). Refer to the comments in SyncReplicationSlots() for more + * details. + * * Any standby synchronized slots will be dropped if they no longer need * to be synchronized. See comment atop drop_local_obsolete_slots() for more * details.
@@ -52,7 +59,6 @@
 #include "access/xlog_internal.h"
 #include "access/xlogrecovery.h"
 #include "catalog/pg_database.h"
-#include "commands/dbcommands.h"
 #include "libpq/pqsignal.h"
 #include "pgstat.h"
 #include "postmaster/interrupt.h"
@@ -65,6 +71,7 @@
 #include "storage/procarray.h"
 #include "tcop/tcopprot.h"
 #include "utils/builtins.h"
+#include "utils/memutils.h"
 #include "utils/pg_lsn.h"
 #include "utils/ps_status.h"
 #include "utils/timeout.h"
@@ -72,11 +79,14 @@
 /*
  * Struct for sharing information to control slot synchronization.
  *
- * The slot sync worker's pid is needed by the startup process to shut it
- * down during promotion. The startup process shuts down the slot sync worker
- * and also sets stopSignaled=true to handle the race condition when the
+ * The 'pid' is either the slot sync worker's pid or the pid of the backend
+ * running the SQL function pg_sync_replication_slots(). When the startup
+ * process sets 'stopSignaled' during promotion, it uses this 'pid' to wake
+ * up the synchronizing process so that it can immediately stop its
+ * synchronizing work on seeing 'stopSignaled' set.
+ * Setting 'stopSignaled' also handles the race condition when the
 * postmaster has not noticed the promotion yet and thus may end up restarting
- * the slot sync worker. If stopSignaled is set, the worker will exit in such a
+ * the slot sync worker. If 'stopSignaled' is set, the worker will exit in such a
 * case. The SQL function pg_sync_replication_slots() will also error out if
 * this flag is set. Note that we don't need to reset this variable as after
 * promotion the slot sync worker won't be restarted because the pmState
@@ -149,6 +159,35 @@ typedef struct RemoteSlot
 
 static void slotsync_failure_callback(int code, Datum arg);
 static void update_synced_slots_inactive_since(void);
 
+/*
+ * Update slot sync skip stats. The caller must have acquired the slot.
+ */
+static void
+update_slotsync_skip_stats(SlotSyncSkipReason skip_reason)
+{
+	ReplicationSlot *slot;
+
+	Assert(MyReplicationSlot);
+
+	slot = MyReplicationSlot;
+
+	/*
+	 * Update the slot sync related stats in pg_stat_replication_slots when a
+	 * slot sync is skipped.
+	 */
+	if (skip_reason != SS_SKIP_NONE)
+		pgstat_report_replslotsync(slot);
+
+	/* Update the slot sync skip reason */
+	if (slot->slotsync_skip_reason != skip_reason)
+	{
+		SpinLockAcquire(&slot->mutex);
+		slot->slotsync_skip_reason = skip_reason;
+		SpinLockRelease(&slot->mutex);
+	}
+}
+
 /*
  * If necessary, update the local synced slot's metadata based on the data
  * from the remote slot.
@@ -171,6 +210,7 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 	ReplicationSlot *slot = MyReplicationSlot;
 	bool		updated_xmin_or_lsn = false;
 	bool		updated_config = false;
+	SlotSyncSkipReason skip_reason = SS_SKIP_NONE;
 
 	Assert(slot->data.invalidated == RS_INVAL_NONE);
 
@@ -188,6 +228,9 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 		TransactionIdPrecedes(remote_slot->catalog_xmin,
 							  slot->data.catalog_xmin))
 	{
+		/* Update slot sync skip stats */
+		update_slotsync_skip_stats(SS_SKIP_WAL_OR_ROWS_REMOVED);
+
 		/*
 		 * This can happen in following situations:
 		 *
@@ -211,9 +254,9 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 		 * impact the users, so we used DEBUG1 level to log the message.
 		 */
 		ereport(slot->data.persistency == RS_TEMPORARY ?
LOG : DEBUG1, - errmsg("could not synchronize replication slot \"%s\" because remote slot precedes local slot", + errmsg("could not synchronize replication slot \"%s\"", remote_slot->name), - errdetail("The remote slot has LSN %X/%X and catalog xmin %u, but the local slot has LSN %X/%X and catalog xmin %u.", + errdetail("Synchronization could lead to data loss, because the remote slot needs WAL at LSN %X/%08X and catalog xmin %u, but the standby has LSN %X/%08X and catalog xmin %u.", LSN_FORMAT_ARGS(remote_slot->restart_lsn), remote_slot->catalog_xmin, LSN_FORMAT_ARGS(slot->data.restart_lsn), @@ -275,14 +318,24 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, ereport(ERROR, errmsg_internal("synchronized confirmed_flush for slot \"%s\" differs from remote slot", remote_slot->name), - errdetail_internal("Remote slot has LSN %X/%X but local slot has LSN %X/%X.", + errdetail_internal("Remote slot has LSN %X/%08X but local slot has LSN %X/%08X.", LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), LSN_FORMAT_ARGS(slot->data.confirmed_flush))); + + /* + * If we can't reach a consistent snapshot, the slot won't be + * persisted. See update_and_persist_local_synced_slot(). + */ + if (found_consistent_snapshot && !(*found_consistent_snapshot)) + skip_reason = SS_SKIP_NO_CONSISTENT_SNAPSHOT; } updated_xmin_or_lsn = true; } + /* Update slot sync skip stats */ + update_slotsync_skip_stats(skip_reason); + if (remote_dbid != slot->data.database || remote_slot->two_phase != slot->data.two_phase || remote_slot->failover != slot->data.failover || @@ -494,7 +547,7 @@ reserve_wal_for_local_slot(XLogRecPtr restart_lsn) ReplicationSlot *slot = MyReplicationSlot; Assert(slot != NULL); - Assert(XLogRecPtrIsInvalid(slot->data.restart_lsn)); + Assert(!XLogRecPtrIsValid(slot->data.restart_lsn)); while (true) { @@ -554,16 +607,21 @@ reserve_wal_for_local_slot(XLogRecPtr restart_lsn) * local ones, then update the LSNs and persist the local synced slot for * future synchronization; otherwise, do nothing. * + * *slot_persistence_pending is set to true if any of the slots fail to + * persist. + * * Return true if the slot is marked as RS_PERSISTENT (sync-ready), otherwise * false. */ static bool -update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid) +update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, + bool *slot_persistence_pending) { ReplicationSlot *slot = MyReplicationSlot; bool found_consistent_snapshot = false; bool remote_slot_precedes = false; + /* Slotsync skip stats are handled in function update_local_synced_slot() */ (void) update_local_synced_slot(remote_slot, remote_dbid, &found_consistent_snapshot, &remote_slot_precedes); @@ -581,7 +639,13 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid) * current location when recreating the slot in the next cycle. It may * take more time to create such a slot. Therefore, we keep this slot * and attempt the synchronization in the next cycle. + * + * We also update the slot_persistence_pending parameter, so the SQL + * function can retry. 
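+	 * (The retry loop is in SyncReplicationSlots(); see that function for
+	 * how slot_persistence_pending drives the retries.)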
*/ + if (slot_persistence_pending) + *slot_persistence_pending = true; + return false; } @@ -593,9 +657,13 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid) { ereport(LOG, errmsg("could not synchronize replication slot \"%s\"", remote_slot->name), - errdetail("Logical decoding could not find consistent point from local slot's LSN %X/%X.", + errdetail("Synchronization could lead to data loss, because the standby could not build a consistent snapshot to decode WALs at LSN %X/%08X.", LSN_FORMAT_ARGS(slot->data.restart_lsn))); + /* Set this, so that SQL function can retry */ + if (slot_persistence_pending) + *slot_persistence_pending = true; + return false; } @@ -619,37 +687,19 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid) * updated. The slot is then persisted and is considered as sync-ready for * periodic syncs. * + * *slot_persistence_pending is set to true if any of the slots fail to + * persist. + * * Returns TRUE if the local slot is updated. */ static bool -synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid) +synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid, + bool *slot_persistence_pending) { ReplicationSlot *slot; - XLogRecPtr latestFlushPtr; + XLogRecPtr latestFlushPtr = GetStandbyFlushRecPtr(NULL); bool slot_updated = false; - /* - * Make sure that concerned WAL is received and flushed before syncing - * slot to target lsn received from the primary server. - */ - latestFlushPtr = GetStandbyFlushRecPtr(NULL); - if (remote_slot->confirmed_lsn > latestFlushPtr) - { - /* - * Can get here only if GUC 'synchronized_standby_slots' on the - * primary server was not configured correctly. - */ - ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR, - errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("skipping slot synchronization because the received slot sync" - " LSN %X/%X for slot \"%s\" is ahead of the standby position %X/%X", - LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), - remote_slot->name, - LSN_FORMAT_ARGS(latestFlushPtr))); - - return false; - } - /* Search for the named slot */ if ((slot = SearchNamedReplicationSlot(remote_slot->name, true))) { @@ -708,15 +758,46 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid) /* Skip the sync of an invalidated slot */ if (slot->data.invalidated != RS_INVAL_NONE) { + update_slotsync_skip_stats(SS_SKIP_INVALID); + ReplicationSlotRelease(); return slot_updated; } + /* + * Make sure that concerned WAL is received and flushed before syncing + * slot to target lsn received from the primary server. + * + * Report statistics only after the slot has been acquired, ensuring + * it cannot be dropped during the reporting process. + */ + if (remote_slot->confirmed_lsn > latestFlushPtr) + { + update_slotsync_skip_stats(SS_SKIP_WAL_NOT_FLUSHED); + + /* + * Can get here only if GUC 'synchronized_standby_slots' on the + * primary server was not configured correctly. + */ + ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("skipping slot synchronization because the received slot sync" + " LSN %X/%08X for slot \"%s\" is ahead of the standby position %X/%08X", + LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), + remote_slot->name, + LSN_FORMAT_ARGS(latestFlushPtr))); + + ReplicationSlotRelease(); + + return slot_updated; + } + /* Slot not ready yet, let's attempt to make it sync-ready now. 
*/ if (slot->data.persistency == RS_TEMPORARY) { slot_updated = update_and_persist_local_synced_slot(remote_slot, - remote_dbid); + remote_dbid, + slot_persistence_pending); } /* Slot ready for sync, so sync it. */ @@ -733,7 +814,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid) ereport(ERROR, errmsg_internal("cannot synchronize local slot \"%s\"", remote_slot->name), - errdetail_internal("Local slot's start streaming location LSN(%X/%X) is ahead of remote slot's LSN(%X/%X).", + errdetail_internal("Local slot's start streaming location LSN(%X/%08X) is ahead of remote slot's LSN(%X/%08X).", LSN_FORMAT_ARGS(slot->data.confirmed_flush), LSN_FORMAT_ARGS(remote_slot->confirmed_lsn))); @@ -785,7 +866,36 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid) ReplicationSlotsComputeRequiredXmin(true); LWLockRelease(ProcArrayLock); - update_and_persist_local_synced_slot(remote_slot, remote_dbid); + /* + * Make sure that concerned WAL is received and flushed before syncing + * slot to target lsn received from the primary server. + * + * Report statistics only after the slot has been acquired, ensuring + * it cannot be dropped during the reporting process. + */ + if (remote_slot->confirmed_lsn > latestFlushPtr) + { + update_slotsync_skip_stats(SS_SKIP_WAL_NOT_FLUSHED); + + /* + * Can get here only if GUC 'synchronized_standby_slots' on the + * primary server was not configured correctly. + */ + ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("skipping slot synchronization because the received slot sync" + " LSN %X/%08X for slot \"%s\" is ahead of the standby position %X/%08X", + LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), + remote_slot->name, + LSN_FORMAT_ARGS(latestFlushPtr))); + + ReplicationSlotRelease(); + + return false; + } + + update_and_persist_local_synced_slot(remote_slot, remote_dbid, + slot_persistence_pending); slot_updated = true; } @@ -796,15 +906,16 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid) } /* - * Synchronize slots. + * Fetch remote slots. * - * Gets the failover logical slots info from the primary server and updates - * the slots locally. Creates the slots if not present on the standby. + * If slot_names is NIL, fetches all failover logical slots from the + * primary server, otherwise fetches only the ones with names in slot_names. * - * Returns TRUE if any of the slots gets updated in this sync-cycle. + * Returns a list of remote slot information structures, or NIL if none + * are found. */ -static bool -synchronize_slots(WalReceiverConn *wrconn) +static List * +fetch_remote_slots(WalReceiverConn *wrconn, List *slot_names) { #define SLOTSYNC_COLUMN_COUNT 10 Oid slotRow[SLOTSYNC_COLUMN_COUNT] = {TEXTOID, TEXTOID, LSNOID, @@ -813,34 +924,50 @@ synchronize_slots(WalReceiverConn *wrconn) WalRcvExecResult *res; TupleTableSlot *tupslot; List *remote_slot_list = NIL; - bool some_slot_updated = false; - bool started_tx = false; - const char *query = "SELECT slot_name, plugin, confirmed_flush_lsn," - " restart_lsn, catalog_xmin, two_phase, two_phase_at, failover," - " database, invalidation_reason" - " FROM pg_catalog.pg_replication_slots" - " WHERE failover and NOT temporary"; - - /* The syscache access in walrcv_exec() needs a transaction env. 
*/ - if (!IsTransactionState()) + StringInfoData query; + + initStringInfo(&query); + appendStringInfoString(&query, + "SELECT slot_name, plugin, confirmed_flush_lsn," + " restart_lsn, catalog_xmin, two_phase," + " two_phase_at, failover," + " database, invalidation_reason" + " FROM pg_catalog.pg_replication_slots" + " WHERE failover and NOT temporary"); + + if (slot_names != NIL) { - StartTransactionCommand(); - started_tx = true; + bool first_slot = true; + + /* + * Construct the query to fetch only the specified slots + */ + appendStringInfoString(&query, " AND slot_name IN ("); + + foreach_ptr(char, slot_name, slot_names) + { + if (!first_slot) + appendStringInfoString(&query, ", "); + + appendStringInfo(&query, "%s", quote_literal_cstr(slot_name)); + first_slot = false; + } + appendStringInfoChar(&query, ')'); } /* Execute the query */ - res = walrcv_exec(wrconn, query, SLOTSYNC_COLUMN_COUNT, slotRow); + res = walrcv_exec(wrconn, query.data, SLOTSYNC_COLUMN_COUNT, slotRow); + pfree(query.data); if (res->status != WALRCV_OK_TUPLES) ereport(ERROR, errmsg("could not fetch failover logical slots info from the primary server: %s", res->err)); - /* Construct the remote_slot tuple and synchronize each slot locally */ tupslot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); while (tuplestore_gettupleslot(res->tuplestore, true, false, tupslot)) { bool isnull; - RemoteSlot *remote_slot = palloc0(sizeof(RemoteSlot)); + RemoteSlot *remote_slot = palloc0_object(RemoteSlot); Datum d; int col = 0; @@ -900,8 +1027,8 @@ synchronize_slots(WalReceiverConn *wrconn) * pg_replication_slots view, then we can avoid fetching RS_EPHEMERAL * slots in the first place. */ - if ((XLogRecPtrIsInvalid(remote_slot->restart_lsn) || - XLogRecPtrIsInvalid(remote_slot->confirmed_lsn) || + if ((!XLogRecPtrIsValid(remote_slot->restart_lsn) || + !XLogRecPtrIsValid(remote_slot->confirmed_lsn) || !TransactionIdIsValid(remote_slot->catalog_xmin)) && remote_slot->invalidated == RS_INVAL_NONE) pfree(remote_slot); @@ -912,6 +1039,29 @@ synchronize_slots(WalReceiverConn *wrconn) ExecClearTuple(tupslot); } + walrcv_clear_result(res); + + return remote_slot_list; +} + +/* + * Synchronize slots. + * + * This function takes a list of remote slots and synchronizes them locally. It + * creates the slots if not present on the standby and updates existing ones. + * + * If slot_persistence_pending is not NULL, it will be set to true if one or + * more slots could not be persisted. This allows callers such as + * SyncReplicationSlots() to retry those slots. + * + * Returns TRUE if any of the slots gets updated in this sync-cycle. + */ +static bool +synchronize_slots(WalReceiverConn *wrconn, List *remote_slot_list, + bool *slot_persistence_pending) +{ + bool some_slot_updated = false; + /* Drop local slots that no longer need to be synced. 
 	 */
 	drop_local_obsolete_slots(remote_slot_list);
 
@@ -927,19 +1077,12 @@
 		 */
 		LockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
 
-		some_slot_updated |= synchronize_one_slot(remote_slot, remote_dbid);
+		some_slot_updated |= synchronize_one_slot(remote_slot, remote_dbid,
+												  slot_persistence_pending);
 
 		UnlockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
 	}
 
-	/* We are done, free remote_slot_list elements */
-	list_free_deep(remote_slot_list);
-
-	walrcv_clear_result(res);
-
-	if (started_tx)
-		CommitTransactionCommand();
-
 	return some_slot_updated;
 }
 
@@ -1058,15 +1201,17 @@ bool
 ValidateSlotSyncParams(int elevel)
 {
 	/*
-	 * Logical slot sync/creation requires wal_level >= logical.
-	 *
-	 * Since altering the wal_level requires a server restart, so error out in
-	 * this case regardless of elevel provided by caller.
+	 * Logical slot sync/creation requires logical decoding to be enabled.
 	 */
-	if (wal_level < WAL_LEVEL_LOGICAL)
-		ereport(ERROR,
+	if (!IsLogicalDecodingEnabled())
+	{
+		ereport(elevel,
 				errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				errmsg("replication slot synchronization requires \"wal_level\" >= \"logical\""));
+				errmsg("replication slot synchronization requires \"effective_wal_level\" >= \"logical\" on the primary"),
+				errhint("To enable logical decoding on the primary, set \"wal_level\" >= \"logical\" or create at least one logical slot when \"wal_level\" = \"replica\"."));
+
+		return false;
+	}
 
 	/*
 	 * A physical replication slot(primary_slot_name) is required on the
@@ -1116,10 +1261,10 @@
 }
 
 /*
- * Re-read the config file.
+ * Re-read the config file for slot synchronization.
  *
- * Exit if any of the slot sync GUCs have changed. The postmaster will
- * restart it.
+ * Exit or throw an error if relevant GUCs have changed, depending on whether
+ * this is called from the slot sync worker or from the SQL function
+ * pg_sync_replication_slots().
 */
 static void
 slotsync_reread_config(void)
@@ -1130,8 +1275,11 @@ slotsync_reread_config(void)
 	bool		old_hot_standby_feedback = hot_standby_feedback;
 	bool		conninfo_changed;
 	bool		primary_slotname_changed;
+	bool		is_slotsync_worker = AmLogicalSlotSyncWorkerProcess();
+	bool		parameter_changed = false;
 
-	Assert(sync_replication_slots);
+	if (is_slotsync_worker)
+		Assert(sync_replication_slots);
 
 	ConfigReloadPending = false;
 	ProcessConfigFile(PGC_SIGHUP);
@@ -1143,44 +1291,85 @@
 
 	if (old_sync_replication_slots != sync_replication_slots)
 	{
-		ereport(LOG,
-		/* translator: %s is a GUC variable name */
-				errmsg("replication slot synchronization worker will shut down because \"%s\" is disabled", "sync_replication_slots"));
-		proc_exit(0);
-	}
+		if (is_slotsync_worker)
+		{
+			ereport(LOG,
+			/* translator: %s is a GUC variable name */
+					errmsg("replication slot synchronization worker will stop because \"%s\" is disabled",
+						   "sync_replication_slots"));
 
-	if (conninfo_changed ||
-		primary_slotname_changed ||
-		(old_hot_standby_feedback != hot_standby_feedback))
+			proc_exit(0);
+		}
+
+		parameter_changed = true;
+	}
+	else
 	{
-		ereport(LOG,
-				errmsg("replication slot synchronization worker will restart because of a parameter change"));
+		if (conninfo_changed ||
+			primary_slotname_changed ||
+			(old_hot_standby_feedback != hot_standby_feedback))
+		{
 
-		/*
-		 * Reset the last-start time for this worker so that the postmaster
-		 * can restart it without waiting for SLOTSYNC_RESTART_INTERVAL_SEC.
-		 */
-		SlotSyncCtx->last_start_time = 0;
+			if (is_slotsync_worker)
+			{
+				ereport(LOG,
+						errmsg("replication slot synchronization worker will restart because of a parameter change"));
 
-		proc_exit(0);
+				/*
+				 * Reset the last-start time for this worker so that the
+				 * postmaster can restart it without waiting for
+				 * SLOTSYNC_RESTART_INTERVAL_SEC.
+				 */
+				SlotSyncCtx->last_start_time = 0;
+
+				proc_exit(0);
+			}
+
+			parameter_changed = true;
+		}
+	}
+
+	/*
+	 * If we reach here with a parameter change, we must be running inside
+	 * the SQL function; emit an error in that case.
+	 */
+	if (parameter_changed)
+	{
+		Assert(!is_slotsync_worker);
+		ereport(ERROR,
+				errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				errmsg("replication slot synchronization will stop because of a parameter change"));
 	}
 }
 
 /*
- * Interrupt handler for main loop of slot sync worker.
+ * Interrupt handler for the process performing slot synchronization.
 */
 static void
-ProcessSlotSyncInterrupts(WalReceiverConn *wrconn)
+ProcessSlotSyncInterrupts(void)
 {
 	CHECK_FOR_INTERRUPTS();
 
-	if (ShutdownRequestPending)
+	if (SlotSyncCtx->stopSignaled)
 	{
-		ereport(LOG,
-				errmsg("replication slot synchronization worker is shutting down on receiving SIGINT"));
+		if (AmLogicalSlotSyncWorkerProcess())
+		{
+			ereport(LOG,
+					errmsg("replication slot synchronization worker will stop because promotion has been triggered"));
 
-		proc_exit(0);
+			proc_exit(0);
+		}
+		else
+		{
+			/*
+			 * For the backend executing the SQL function
+			 * pg_sync_replication_slots().
+			 */
+			ereport(ERROR,
+					errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					errmsg("replication slot synchronization will stop because promotion has been triggered"));
+		}
 	}
 
 	if (ConfigReloadPending)
@@ -1283,29 +1472,14 @@ wait_for_slot_activity(bool some_slot_updated)
 }
 
 /*
- * Emit an error if a promotion or a concurrent sync call is in progress.
+ * Emit an error if a concurrent sync call is in progress.
 * Otherwise, advertise that a sync is in progress.
 */
 static void
-check_and_set_sync_info(pid_t worker_pid)
+check_and_set_sync_info(pid_t sync_process_pid)
 {
 	SpinLockAcquire(&SlotSyncCtx->mutex);
 
-	/* The worker pid must not be already assigned in SlotSyncCtx */
-	Assert(worker_pid == InvalidPid || SlotSyncCtx->pid == InvalidPid);
-
-	/*
-	 * Emit an error if startup process signaled the slot sync machinery to
-	 * stop. See comments atop SlotSyncCtxStruct.
-	 */
-	if (SlotSyncCtx->stopSignaled)
-	{
-		SpinLockRelease(&SlotSyncCtx->mutex);
-		ereport(ERROR,
-				errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				errmsg("cannot synchronize replication slots when standby promotion is ongoing"));
-	}
-
 	if (SlotSyncCtx->syncing)
 	{
 		SpinLockRelease(&SlotSyncCtx->mutex);
@@ -1314,13 +1488,16 @@
 				errmsg("cannot synchronize replication slots concurrently"));
 	}
 
+	/* The pid must not be already assigned in SlotSyncCtx */
+	Assert(SlotSyncCtx->pid == InvalidPid);
+
 	SlotSyncCtx->syncing = true;
 
 	/*
 	 * Advertise the required PID so that the startup process can kill the
-	 * slot sync worker on promotion.
+	 * slot sync process on promotion.
 	 */
-	SlotSyncCtx->pid = worker_pid;
+	SlotSyncCtx->pid = sync_process_pid;
 
 	SpinLockRelease(&SlotSyncCtx->mutex);
 
@@ -1331,20 +1508,24 @@
 	 * Reset syncing flag.
 	 */
 static void
-reset_syncing_flag()
+reset_syncing_flag(void)
 {
 	SpinLockAcquire(&SlotSyncCtx->mutex);
 	SlotSyncCtx->syncing = false;
+	SlotSyncCtx->pid = InvalidPid;
 	SpinLockRelease(&SlotSyncCtx->mutex);
 
 	syncing_slots = false;
-};
+}
 
 /*
 * The main loop of our worker process.
 *
 * It connects to the primary server, fetches logical failover slots
 * information periodically in order to create and sync the slots.
+ *
+ * Note: If any changes are made here, check if the corresponding SQL
+ * function logic in SyncReplicationSlots() also needs to be changed.
 */
 void
 ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len)
@@ -1409,7 +1590,7 @@ ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len)
 
 	/* Setup signal handling */
 	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGINT, SignalHandlerForShutdownRequest);
+	pqsignal(SIGINT, StatementCancelHandler);
 	pqsignal(SIGTERM, die);
 	pqsignal(SIGFPE, FloatExceptionHandler);
 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
@@ -1477,7 +1658,6 @@ ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len)
 	 */
 	wrconn = walrcv_connect(PrimaryConnInfo, false, false, false,
 							app_name.data, &err);
-	pfree(app_name.data);
 
 	if (!wrconn)
 		ereport(ERROR,
@@ -1485,6 +1665,8 @@ ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len)
 				errmsg("synchronization worker \"%s\" could not connect to the primary server: %s",
 					   app_name.data, err));
 
+	pfree(app_name.data);
+
 	/*
 	 * Register the disconnection callback.
 	 *
@@ -1505,17 +1687,35 @@ ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len)
 	for (;;)
 	{
 		bool		some_slot_updated = false;
+		bool		started_tx = false;
+		List	   *remote_slots;
 
-		ProcessSlotSyncInterrupts(wrconn);
+		ProcessSlotSyncInterrupts();
 
-		some_slot_updated = synchronize_slots(wrconn);
+		/*
+		 * The syscache access in fetch_remote_slots() needs a transaction
+		 * env.
+		 */
+		if (!IsTransactionState())
+		{
+			StartTransactionCommand();
+			started_tx = true;
+		}
+
+		remote_slots = fetch_remote_slots(wrconn, NIL);
+		some_slot_updated = synchronize_slots(wrconn, remote_slots, NULL);
+		list_free_deep(remote_slots);
+
+		if (started_tx)
+			CommitTransactionCommand();
 
 		wait_for_slot_activity(some_slot_updated);
 	}
 
 	/*
 	 * The slot sync worker can't get here because it will only stop when it
-	 * receives a SIGINT from the startup process, or when there is an error.
+	 * receives a stop request from the startup process, or when there is an
+	 * error.
 	 */
 	Assert(false);
 }
@@ -1541,7 +1741,7 @@ update_synced_slots_inactive_since(void)
 	if (!StandbyMode)
 		return;
 
-	/* The slot sync worker or SQL function mustn't be running by now */
+	/* The slot sync worker or the SQL function mustn't be running by now */
 	Assert((SlotSyncCtx->pid == InvalidPid) && !SlotSyncCtx->syncing);
 
 	LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
@@ -1570,16 +1770,18 @@ update_synced_slots_inactive_since(void)
 }
 
 /*
- * Shut down the slot sync worker.
+ * Shut down slot synchronization.
 *
- * This function sends signal to shutdown slot sync worker, if required. It
- * also waits till the slot sync worker has exited or
+ * This function sets stopSignaled=true and wakes up the slot sync process
+ * (either the worker or the backend running the SQL function
+ * pg_sync_replication_slots()) so that the worker can exit or the SQL
+ * function can finish. It also waits until the slot sync worker has exited or
 * pg_sync_replication_slots() has finished.
*/ void ShutDownSlotSync(void) { - pid_t worker_pid; + pid_t sync_process_pid; SpinLockAcquire(&SlotSyncCtx->mutex); @@ -1596,12 +1798,16 @@ ShutDownSlotSync(void) return; } - worker_pid = SlotSyncCtx->pid; + sync_process_pid = SlotSyncCtx->pid; SpinLockRelease(&SlotSyncCtx->mutex); - if (worker_pid != InvalidPid) - kill(worker_pid, SIGINT); + /* + * Signal process doing slotsync, if any. The process will stop upon + * detecting that the stopSignaled flag is set to true. + */ + if (sync_process_pid != InvalidPid) + kill(sync_process_pid, SIGUSR1); /* Wait for slot sync to end */ for (;;) @@ -1636,8 +1842,9 @@ ShutDownSlotSync(void) /* * SlotSyncWorkerCanRestart * - * Returns true if enough time (SLOTSYNC_RESTART_INTERVAL_SEC) has passed - * since it was launched last. Otherwise returns false. + * Return true, indicating worker is allowed to restart, if enough time has + * passed since it was last launched to reach SLOTSYNC_RESTART_INTERVAL_SEC. + * Otherwise return false. * * This is a safety valve to protect against continuous respawn attempts if the * worker is dying immediately at launch. Note that since we will retry to @@ -1649,14 +1856,19 @@ SlotSyncWorkerCanRestart(void) { time_t curtime = time(NULL); - /* Return false if too soon since last start. */ - if ((unsigned int) (curtime - SlotSyncCtx->last_start_time) < - (unsigned int) SLOTSYNC_RESTART_INTERVAL_SEC) - return false; - - SlotSyncCtx->last_start_time = curtime; - - return true; + /* + * If first time through, or time somehow went backwards, always update + * last_start_time to match the current clock and allow worker start. + * Otherwise allow it only once enough time has elapsed. + */ + if (SlotSyncCtx->last_start_time == 0 || + curtime < SlotSyncCtx->last_start_time || + curtime - SlotSyncCtx->last_start_time >= SLOTSYNC_RESTART_INTERVAL_SEC) + { + SlotSyncCtx->last_start_time = curtime; + return true; + } + return false; } /* @@ -1735,20 +1947,98 @@ slotsync_failure_callback(int code, Datum arg) walrcv_disconnect(wrconn); } +/* + * Helper function to extract slot names from a list of remote slots + */ +static List * +extract_slot_names(List *remote_slots) +{ + List *slot_names = NIL; + + foreach_ptr(RemoteSlot, remote_slot, remote_slots) + { + char *slot_name; + + slot_name = pstrdup(remote_slot->name); + slot_names = lappend(slot_names, slot_name); + } + + return slot_names; +} + /* * Synchronize the failover enabled replication slots using the specified * primary server connection. + * + * Repeatedly fetches and updates replication slot information from the + * primary until all slots are at least "sync ready". + * + * Exits early if promotion is triggered or certain critical + * configuration parameters have changed. */ void SyncReplicationSlots(WalReceiverConn *wrconn) { PG_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn)); { - check_and_set_sync_info(InvalidPid); + List *remote_slots = NIL; + List *slot_names = NIL; /* List of slot names to track */ + + check_and_set_sync_info(MyProcPid); + + /* Check for interrupts and config changes */ + ProcessSlotSyncInterrupts(); validate_remote_info(wrconn); - synchronize_slots(wrconn); + /* Retry until all the slots are sync-ready */ + for (;;) + { + bool slot_persistence_pending = false; + bool some_slot_updated = false; + + /* Check for interrupts and config changes */ + ProcessSlotSyncInterrupts(); + + /* We must be in a valid transaction state */ + Assert(IsTransactionState()); + + /* + * Fetch remote slot info for the given slot_names. 
If slot_names
+			 * is NIL, fetch all failover-enabled slots. Note that we reuse
+			 * slot_names from the first iteration; re-fetching all failover
+			 * slots each time could cause an endless loop. Instead of
+			 * reprocessing only the pending slots in each iteration, it's
+			 * better to process all the slots received in the first
+			 * iteration. This ensures that by the time we're done, all slots
+			 * reflect the latest values.
+			 */
+			remote_slots = fetch_remote_slots(wrconn, slot_names);
+
+			/* Attempt to synchronize slots */
+			some_slot_updated = synchronize_slots(wrconn, remote_slots,
+												  &slot_persistence_pending);
+
+			/*
+			 * If slot_persistence_pending is true, extract slot names for
+			 * future iterations (only needed if we haven't done it yet).
+			 */
+			if (slot_names == NIL && slot_persistence_pending)
+				slot_names = extract_slot_names(remote_slots);
+
+			/* Free the current remote_slots list */
+			list_free_deep(remote_slots);
+
+			/* Done if all slots are persisted, i.e., they are sync-ready */
+			if (!slot_persistence_pending)
+				break;
+
+			/* Wait before retrying */
+			wait_for_slot_activity(some_slot_updated);
+		}
+
+		if (slot_names)
+			list_free_deep(slot_names);
 
 		/* Cleanup the synced temporary slots */
 		ReplicationSlotCleanup(true);
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 0d7bddbe4ed4e..d6ab1e017eb9e 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -199,7 +199,7 @@ AllocateSnapshotBuilder(ReorderBuffer *reorder,
 									  ALLOCSET_DEFAULT_SIZES);
 	oldcontext = MemoryContextSwitchTo(context);
 
-	builder = palloc0(sizeof(SnapBuild));
+	builder = palloc0_object(SnapBuild);
 
 	builder->state = SNAPBUILD_START;
 	builder->context = context;
@@ -486,8 +486,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder)
 	MyProc->xmin = snap->xmin;
 
 	/* allocate in transaction context */
-	newxip = (TransactionId *)
-		palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount());
+	newxip = palloc_array(TransactionId, GetMaxSnapshotXidCount());
 
 	/*
 	 * snapbuild.c builds transactions in an "inverted" manner, which means it
@@ -774,7 +773,7 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact
 		if (rbtxn_is_prepared(txn))
 			continue;
 
-		elog(DEBUG2, "adding a new snapshot and invalidations to %u at %X/%X",
+		elog(DEBUG2, "adding a new snapshot and invalidations to %u at %X/%08X",
 			 txn->xid, LSN_FORMAT_ARGS(lsn));
 
 		/*
@@ -794,6 +793,13 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact
 		 * contents built by the current transaction even after its decoding,
 		 * which should have been invalidated due to concurrent catalog
 		 * changing transaction.
+		 *
+		 * Distribute only the invalidation messages generated by the current
+		 * committed transaction. Invalidation messages received from other
+		 * transactions would have already been propagated to the relevant
+		 * in-progress transactions. This transaction would have processed
+		 * those invalidations, ensuring that subsequent transactions observe
+		 * a consistent cache state.
*/ if (txn->xid != xid) { @@ -807,8 +813,9 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact { Assert(msgs != NULL); - ReorderBufferAddInvalidations(builder->reorder, txn->xid, lsn, - ninvalidations, msgs); + ReorderBufferAddDistributedInvalidations(builder->reorder, + txn->xid, lsn, + ninvalidations, msgs); } } } @@ -1202,7 +1209,7 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact * oldest ongoing txn might have started when we didn't yet serialize * anything because we hadn't reached a consistent state yet. */ - if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr) + if (txn != NULL && XLogRecPtrIsValid(txn->restart_decoding_lsn)) LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn); /* @@ -1210,8 +1217,8 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact * we have one. */ else if (txn == NULL && - builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr && - builder->last_serialized_snapshot != InvalidXLogRecPtr) + XLogRecPtrIsValid(builder->reorder->current_restart_decoding_lsn) && + XLogRecPtrIsValid(builder->last_serialized_snapshot)) LogicalIncreaseRestartDecodingForSlot(lsn, builder->last_serialized_snapshot); } @@ -1263,10 +1270,10 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->initial_xmin_horizon)) { ereport(DEBUG1, - (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low", - LSN_FORMAT_ARGS(lsn)), - errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", - builder->initial_xmin_horizon, running->oldestRunningXid))); + errmsg_internal("skipping snapshot at %X/%08X while building logical decoding snapshot, xmin horizon too low", + LSN_FORMAT_ARGS(lsn)), + errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", + builder->initial_xmin_horizon, running->oldestRunningXid)); SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon); @@ -1285,7 +1292,7 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn */ if (running->oldestRunningXid == running->nextXid) { - if (builder->start_decoding_at == InvalidXLogRecPtr || + if (!XLogRecPtrIsValid(builder->start_decoding_at) || builder->start_decoding_at <= lsn) /* can decode everything after this */ builder->start_decoding_at = lsn + 1; @@ -1302,9 +1309,9 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->next_phase_at = InvalidTransactionId; ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("There are no running transactions."))); + errmsg("logical decoding found consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no running transactions.")); return false; } @@ -1351,10 +1358,10 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn Assert(TransactionIdIsNormal(builder->xmax)); ereport(LOG, - (errmsg("logical decoding found initial starting point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("Waiting for transactions (approximately %d) older than %u to end.", - running->xcnt, running->nextXid))); + errmsg("logical decoding found initial starting point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid)); SnapBuildWaitSnapshot(running, running->nextXid); } @@ -1375,10 +1382,10 @@ 
SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->next_phase_at = running->nextXid; ereport(LOG, - (errmsg("logical decoding found initial consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("Waiting for transactions (approximately %d) older than %u to end.", - running->xcnt, running->nextXid))); + errmsg("logical decoding found initial consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid)); SnapBuildWaitSnapshot(running, running->nextXid); } @@ -1399,9 +1406,9 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->next_phase_at = InvalidTransactionId; ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("There are no old transactions anymore."))); + errmsg("logical decoding found consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no old transactions anymore.")); } /* @@ -1501,8 +1508,8 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) struct stat stat_buf; Size sz; - Assert(lsn != InvalidXLogRecPtr); - Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr || + Assert(XLogRecPtrIsValid(lsn)); + Assert(!XLogRecPtrIsValid(builder->last_serialized_snapshot) || builder->last_serialized_snapshot <= lsn); /* @@ -1905,9 +1912,9 @@ SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) Assert(builder->state == SNAPBUILD_CONSISTENT); ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("Logical decoding will begin using saved snapshot."))); + errmsg("logical decoding found consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Logical decoding will begin using saved snapshot.")); return true; snapshot_not_interesting: @@ -2021,7 +2028,7 @@ CheckPointSnapBuild(void) lsn = ((uint64) hi) << 32 | lo; /* check whether we still need it */ - if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + if (lsn < cutoff || !XLogRecPtrIsValid(cutoff)) { elog(DEBUG1, "removing snapbuild snapshot %s", path); diff --git a/src/backend/replication/logical/syncutils.c b/src/backend/replication/logical/syncutils.c new file mode 100644 index 0000000000000..332819804bef6 --- /dev/null +++ b/src/backend/replication/logical/syncutils.c @@ -0,0 +1,280 @@ +/*------------------------------------------------------------------------- + * syncutils.c + * PostgreSQL logical replication: common synchronization code + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/syncutils.c + * + * NOTES + * This file contains code common for synchronization workers. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/pg_subscription_rel.h" +#include "pgstat.h" +#include "replication/logicallauncher.h" +#include "replication/worker_internal.h" +#include "storage/ipc.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + +/* + * Enum for phases of the subscription relations state. + * + * SYNC_RELATIONS_STATE_NEEDS_REBUILD indicates that the subscription relations + * state is no longer valid, and the subscription relations should be rebuilt. + * + * SYNC_RELATIONS_STATE_REBUILD_STARTED indicates that the subscription + * relations state is being rebuilt. 
+ * + * SYNC_RELATIONS_STATE_VALID indicates that the subscription relation state is + * up-to-date and valid. + */ +typedef enum +{ + SYNC_RELATIONS_STATE_NEEDS_REBUILD, + SYNC_RELATIONS_STATE_REBUILD_STARTED, + SYNC_RELATIONS_STATE_VALID, +} SyncingRelationsState; + +static SyncingRelationsState relation_states_validity = SYNC_RELATIONS_STATE_NEEDS_REBUILD; + +/* + * Exit routine for synchronization worker. + */ +pg_noreturn void +FinishSyncWorker(void) +{ + Assert(am_sequencesync_worker() || am_tablesync_worker()); + + /* + * Commit any outstanding transaction. This is the usual case, unless + * there was nothing to do for the table. + */ + if (IsTransactionState()) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + /* And flush all writes. */ + XLogFlush(GetXLogWriteRecPtr()); + + if (am_sequencesync_worker()) + { + ereport(LOG, + errmsg("logical replication sequence synchronization worker for subscription \"%s\" has finished", + MySubscription->name)); + + /* + * Reset last_seqsync_start_time, so that next time a sequencesync + * worker is needed it can be started promptly. + */ + logicalrep_reset_seqsync_start_time(); + } + else + { + StartTransactionCommand(); + ereport(LOG, + errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has finished", + MySubscription->name, + get_rel_name(MyLogicalRepWorker->relid))); + CommitTransactionCommand(); + + /* Find the leader apply worker and signal it. */ + logicalrep_worker_wakeup(WORKERTYPE_APPLY, MyLogicalRepWorker->subid, + InvalidOid); + } + + /* Stop gracefully */ + proc_exit(0); +} + +/* + * Callback from syscache invalidation. + */ +void +InvalidateSyncingRelStates(Datum arg, int cacheid, uint32 hashvalue) +{ + relation_states_validity = SYNC_RELATIONS_STATE_NEEDS_REBUILD; +} + +/* + * Attempt to launch a sync worker for one or more sequences or a table, if + * a worker slot is available and the retry interval has elapsed. + * + * wtype: sync worker type. + * nsyncworkers: Number of currently running sync workers for the subscription. + * relid: InvalidOid for sequencesync worker, actual relid for tablesync + * worker. + * last_start_time: Pointer to the last start time of the worker. + */ +void +launch_sync_worker(LogicalRepWorkerType wtype, int nsyncworkers, Oid relid, + TimestampTz *last_start_time) +{ + TimestampTz now; + + Assert((wtype == WORKERTYPE_TABLESYNC && OidIsValid(relid)) || + (wtype == WORKERTYPE_SEQUENCESYNC && !OidIsValid(relid))); + + /* If there is a free sync worker slot, start a new sync worker */ + if (nsyncworkers >= max_sync_workers_per_subscription) + return; + + now = GetCurrentTimestamp(); + + if (!(*last_start_time) || + TimestampDifferenceExceeds(*last_start_time, now, + wal_retrieve_retry_interval)) + { + /* + * Set the last_start_time even if we fail to start the worker, so + * that we won't retry until wal_retrieve_retry_interval has elapsed. + */ + *last_start_time = now; + (void) logicalrep_worker_launch(wtype, + MyLogicalRepWorker->dbid, + MySubscription->oid, + MySubscription->name, + MyLogicalRepWorker->userid, + relid, DSM_HANDLE_INVALID, false); + } +} + +/* + * Process possible state change(s) of relations that are being synchronized + * and start new tablesync workers for the newly added tables. Also, start a + * new sequencesync worker for the newly added sequences. 
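+ *
+ * The action taken depends on MyLogicalRepWorker->type; see the switch
+ * statement below.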
+ */ +void +ProcessSyncingRelations(XLogRecPtr current_lsn) +{ + switch (MyLogicalRepWorker->type) + { + case WORKERTYPE_PARALLEL_APPLY: + + /* + * Skip for parallel apply workers because they only operate on + * tables that are in a READY state. See pa_can_start() and + * should_apply_changes_for_rel(). + */ + break; + + case WORKERTYPE_TABLESYNC: + ProcessSyncingTablesForSync(current_lsn); + break; + + case WORKERTYPE_APPLY: + ProcessSyncingTablesForApply(current_lsn); + ProcessSequencesForSync(); + break; + + case WORKERTYPE_SEQUENCESYNC: + /* Should never happen. */ + elog(ERROR, "sequence synchronization worker is not expected to process relations"); + break; + + case WORKERTYPE_UNKNOWN: + /* Should never happen. */ + elog(ERROR, "Unknown worker type"); + } +} + +/* + * Common code to fetch the up-to-date sync state info for tables and sequences. + * + * The pg_subscription_rel catalog is shared by tables and sequences. Changes + * to either sequences or tables can affect the validity of relation states, so + * we identify non-READY tables and non-READY sequences together to ensure + * consistency. + * + * has_pending_subtables: true if the subscription has one or more tables that + * are not in READY state, otherwise false. + * has_pending_subsequences: true if the subscription has one or more sequences + * that are not in READY state, otherwise false. + */ +void +FetchRelationStates(bool *has_pending_subtables, + bool *has_pending_subsequences, + bool *started_tx) +{ + /* + * has_subtables and has_subsequences_non_ready are declared as static, + * since the same value can be used until the system table is invalidated. + */ + static bool has_subtables = false; + static bool has_subsequences_non_ready = false; + + *started_tx = false; + + if (relation_states_validity != SYNC_RELATIONS_STATE_VALID) + { + MemoryContext oldctx; + List *rstates; + SubscriptionRelState *rstate; + + relation_states_validity = SYNC_RELATIONS_STATE_REBUILD_STARTED; + has_subsequences_non_ready = false; + + /* Clean the old lists. */ + list_free_deep(table_states_not_ready); + table_states_not_ready = NIL; + + if (!IsTransactionState()) + { + StartTransactionCommand(); + *started_tx = true; + } + + /* Fetch tables and sequences that are in non-READY state. */ + rstates = GetSubscriptionRelations(MySubscription->oid, true, true, + true); + + /* Allocate the tracking info in a permanent memory context. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + foreach_ptr(SubscriptionRelState, subrel, rstates) + { + if (get_rel_relkind(subrel->relid) == RELKIND_SEQUENCE) + has_subsequences_non_ready = true; + else + { + rstate = palloc_object(SubscriptionRelState); + memcpy(rstate, subrel, sizeof(SubscriptionRelState)); + table_states_not_ready = lappend(table_states_not_ready, + rstate); + } + } + MemoryContextSwitchTo(oldctx); + + /* + * Does the subscription have tables? + * + * If there were not-READY tables found then we know it does. But if + * table_states_not_ready was empty we still need to check again to + * see if there are 0 tables. + */ + has_subtables = (table_states_not_ready != NIL) || + HasSubscriptionTables(MySubscription->oid); + + /* + * If the subscription relation cache has been invalidated since we + * entered this routine, we still use and return the relations we just + * finished constructing, to avoid infinite loops, but we leave the + * table states marked as stale so that we'll rebuild it again on next + * access. Otherwise, we mark the table states as valid. 
+ */ + if (relation_states_validity == SYNC_RELATIONS_STATE_REBUILD_STARTED) + relation_states_validity = SYNC_RELATIONS_STATE_VALID; + } + + if (has_pending_subtables) + *has_pending_subtables = has_subtables; + + if (has_pending_subsequences) + *has_pending_subsequences = has_subsequences_non_ready; +} diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index 8e1e8762f6258..2522e372036ff 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -117,58 +117,15 @@ #include "utils/array.h" #include "utils/builtins.h" #include "utils/lsyscache.h" -#include "utils/memutils.h" #include "utils/rls.h" #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/usercontext.h" -typedef enum -{ - SYNC_TABLE_STATE_NEEDS_REBUILD, - SYNC_TABLE_STATE_REBUILD_STARTED, - SYNC_TABLE_STATE_VALID, -} SyncingTablesState; - -static SyncingTablesState table_states_validity = SYNC_TABLE_STATE_NEEDS_REBUILD; -static List *table_states_not_ready = NIL; -static bool FetchTableStates(bool *started_tx); +List *table_states_not_ready = NIL; static StringInfo copybuf = NULL; -/* - * Exit routine for synchronization worker. - */ -pg_noreturn static void -finish_sync_worker(void) -{ - /* - * Commit any outstanding transaction. This is the usual case, unless - * there was nothing to do for the table. - */ - if (IsTransactionState()) - { - CommitTransactionCommand(); - pgstat_report_stat(true); - } - - /* And flush all writes. */ - XLogFlush(GetXLogWriteRecPtr()); - - StartTransactionCommand(); - ereport(LOG, - (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has finished", - MySubscription->name, - get_rel_name(MyLogicalRepWorker->relid)))); - CommitTransactionCommand(); - - /* Find the leader apply worker and signal it. */ - logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid); - - /* Stop gracefully */ - proc_exit(0); -} - /* * Wait until the relation sync state is set in the catalog to the expected * one; return true when it happens. @@ -180,7 +137,7 @@ finish_sync_worker(void) * CATCHUP state to SYNCDONE. */ static bool -wait_for_relation_state_change(Oid relid, char expected_state) +wait_for_table_state_change(Oid relid, char expected_state) { char state; @@ -203,7 +160,8 @@ wait_for_relation_state_change(Oid relid, char expected_state) /* Check if the sync worker is still running and bail if not. */ LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - worker = logicalrep_worker_find(MyLogicalRepWorker->subid, relid, + worker = logicalrep_worker_find(WORKERTYPE_TABLESYNC, + MyLogicalRepWorker->subid, relid, false); LWLockRelease(LogicalRepWorkerLock); if (!worker) @@ -250,8 +208,9 @@ wait_for_worker_state_change(char expected_state) * waiting. */ LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - worker = logicalrep_worker_find(MyLogicalRepWorker->subid, - InvalidOid, false); + worker = logicalrep_worker_find(WORKERTYPE_APPLY, + MyLogicalRepWorker->subid, InvalidOid, + false); if (worker && worker->proc) logicalrep_worker_wakeup_ptr(worker); LWLockRelease(LogicalRepWorkerLock); @@ -273,15 +232,6 @@ wait_for_worker_state_change(char expected_state) return false; } -/* - * Callback from syscache invalidation. - */ -void -invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue) -{ - table_states_validity = SYNC_TABLE_STATE_NEEDS_REBUILD; -} - /* * Handle table synchronization cooperation from the synchronization * worker. 
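As background for the tablesync functions below (a summary of the state machine documented atop tablesync.c, included here for the reader's convenience; it is not text from this patch): a subscribed table progresses through INIT ('i') -> DATASYNC ('d') -> FINISHEDCOPY ('f') -> SYNCWAIT ('w') -> CATCHUP ('c') -> SYNCDONE ('s') -> READY ('r'), where the tablesync worker drives the transitions up to SYNCDONE and the apply worker performs the final SYNCDONE to READY transition.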
@@ -290,8 +240,8 @@ invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue)
 * predetermined synchronization point in the WAL stream, mark the table as
 * SYNCDONE and finish.
 */
-static void
-process_syncing_tables_for_sync(XLogRecPtr current_lsn)
+void
+ProcessSyncingTablesForSync(XLogRecPtr current_lsn)
 {
 	SpinLockAcquire(&MyLogicalRepWorker->relmutex);
 
@@ -316,7 +266,8 @@ process_syncing_tables_for_sync(XLogRecPtr current_lsn)
 		UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
 								   MyLogicalRepWorker->relid,
 								   MyLogicalRepWorker->relstate,
-								   MyLogicalRepWorker->relstate_lsn);
+								   MyLogicalRepWorker->relstate_lsn,
+								   false);
 
 		/*
 		 * End streaming so that LogRepWorkerWalRcvConn can be used to drop
@@ -348,9 +299,9 @@ process_syncing_tables_for_sync(XLogRecPtr current_lsn)
 
 		/*
 		 * Start a new transaction to clean up the tablesync origin tracking.
-		 * This transaction will be ended within the finish_sync_worker().
-		 * Now, even, if we fail to remove this here, the apply worker will
-		 * ensure to clean it up afterward.
+		 * This transaction will be ended within FinishSyncWorker(). Even if
+		 * we fail to remove the origin here, the apply worker will ensure
+		 * it is cleaned up afterward.
 		 *
 		 * We need to do this after the table state is set to SYNCDONE.
 		 * Otherwise, if an error occurs while performing the database
@@ -386,7 +337,7 @@ process_syncing_tables_for_sync(XLogRecPtr current_lsn)
 		 */
 		replorigin_drop_by_name(originname, true, false);
 
-		finish_sync_worker();
+		FinishSyncWorker();
 	}
 	else
 		SpinLockRelease(&MyLogicalRepWorker->relmutex);
@@ -413,8 +364,8 @@ process_syncing_tables_for_sync(XLogRecPtr current_lsn)
 * If the synchronization position is reached (SYNCDONE), then the table can
 * be marked as READY and is no longer tracked.
 */
-static void
-process_syncing_tables_for_apply(XLogRecPtr current_lsn)
+void
+ProcessSyncingTablesForApply(XLogRecPtr current_lsn)
 {
 	struct tablesync_start_time_mapping
 	{
@@ -423,13 +374,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
 	};
 	static HTAB *last_start_times = NULL;
 	ListCell   *lc;
-	bool		started_tx = false;
+	bool		started_tx;
 	bool		should_exit = false;
+	Relation	rel = NULL;
 
 	Assert(!IsTransactionState());
 
 	/* We need up-to-date sync state info for subscription tables here. */
-	FetchTableStates(&started_tx);
+	FetchRelationStates(NULL, NULL, &started_tx);
 
 	/*
 	 * Prepare a hash table for tracking last start times of workers, to avoid
@@ -463,6 +415,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
 	{
 		SubscriptionRelState *rstate = (SubscriptionRelState *) lfirst(lc);
 
+		if (!started_tx)
+		{
+			StartTransactionCommand();
+			started_tx = true;
+		}
+
+		Assert(get_rel_relkind(rstate->relid) != RELKIND_SEQUENCE);
+
 		if (rstate->state == SUBREL_STATE_SYNCDONE)
 		{
 			/*
@@ -476,11 +436,6 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
 			rstate->state = SUBREL_STATE_READY;
 			rstate->lsn = current_lsn;
-			if (!started_tx)
-			{
-				StartTransactionCommand();
-				started_tx = true;
-			}
 
 			/*
 			 * Remove the tablesync origin tracking if exists.
@@ -492,7 +447,17 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
 			 * worker to remove the origin tracking as if there is any
 			 * error while dropping we won't restart it to drop the
 			 * origin. So passing missing_ok = true.
+			 *
+			 * Lock the subscription and origin in the same order as we
+			 * are doing during DDL commands to avoid deadlocks. See
+			 * AlterSubscription_refresh.
*/ + LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, + 0, AccessShareLock); + + if (!rel) + rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid, rstate->relid, originname, @@ -504,7 +469,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) */ UpdateSubscriptionRelState(MyLogicalRepWorker->subid, rstate->relid, rstate->state, - rstate->lsn); + rstate->lsn, true); } } else @@ -516,7 +481,8 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) */ LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - syncworker = logicalrep_worker_find(MyLogicalRepWorker->subid, + syncworker = logicalrep_worker_find(WORKERTYPE_TABLESYNC, + MyLogicalRepWorker->subid, rstate->relid, false); if (syncworker) @@ -555,7 +521,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) * This is required to avoid any undetected deadlocks * due to any existing lock as deadlock detector won't * be able to detect the waits on the latch. + * + * Also close any tables prior to the commit. */ + if (rel) + { + table_close(rel, NoLock); + rel = NULL; + } CommitTransactionCommand(); pgstat_report_stat(false); } @@ -567,8 +540,8 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) StartTransactionCommand(); started_tx = true; - wait_for_relation_state_change(rstate->relid, - SUBREL_STATE_SYNCDONE); + wait_for_table_state_change(rstate->relid, + SUBREL_STATE_SYNCDONE); } else LWLockRelease(LogicalRepWorkerLock); @@ -582,41 +555,28 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) */ int nsyncworkers = logicalrep_sync_worker_count(MyLogicalRepWorker->subid); + struct tablesync_start_time_mapping *hentry; + bool found; /* Now safe to release the LWLock */ LWLockRelease(LogicalRepWorkerLock); - /* - * If there are free sync worker slot(s), start a new sync - * worker for the table. - */ - if (nsyncworkers < max_sync_workers_per_subscription) - { - TimestampTz now = GetCurrentTimestamp(); - struct tablesync_start_time_mapping *hentry; - bool found; - - hentry = hash_search(last_start_times, &rstate->relid, - HASH_ENTER, &found); + hentry = hash_search(last_start_times, &rstate->relid, + HASH_ENTER, &found); + if (!found) + hentry->last_start_time = 0; - if (!found || - TimestampDifferenceExceeds(hentry->last_start_time, now, - wal_retrieve_retry_interval)) - { - logicalrep_worker_launch(WORKERTYPE_TABLESYNC, - MyLogicalRepWorker->dbid, - MySubscription->oid, - MySubscription->name, - MyLogicalRepWorker->userid, - rstate->relid, - DSM_HANDLE_INVALID); - hentry->last_start_time = now; - } - } + launch_sync_worker(WORKERTYPE_TABLESYNC, nsyncworkers, + rstate->relid, &hentry->last_start_time); } } } + /* Close table if opened */ + if (rel) + table_close(rel, NoLock); + + if (started_tx) { /* @@ -659,37 +619,6 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) } } -/* - * Process possible state change(s) of tables that are being synchronized. - */ -void -process_syncing_tables(XLogRecPtr current_lsn) -{ - switch (MyLogicalRepWorker->type) - { - case WORKERTYPE_PARALLEL_APPLY: - - /* - * Skip for parallel apply workers because they only operate on - * tables that are in a READY state. See pa_can_start() and - * should_apply_changes_for_rel(). - */ - break; - - case WORKERTYPE_TABLESYNC: - process_syncing_tables_for_sync(current_lsn); - break; - - case WORKERTYPE_APPLY: - process_syncing_tables_for_apply(current_lsn); - break; - - case WORKERTYPE_UNKNOWN: - /* Should never happen. 
*/ - elog(ERROR, "Unknown worker type"); - } -} - /* * Create list of columns for COPY based on logical relation mapping. */ @@ -893,7 +822,7 @@ fetch_remote_table_info(char *nspname, char *relname, LogicalRepRelation *lrel, /* * We don't support the case where the column list is different for * the same table when combining publications. See comments atop - * fetch_table_list. So there should be only one row returned. + * fetch_relation_list. So there should be only one row returned. * Although we already checked this when creating the subscription, we * still need to check here in case the column list was changed after * creating the subscription and before the sync worker is started. @@ -1139,8 +1068,9 @@ copy_table(Relation rel) /* Start copy on the publisher. */ initStringInfo(&cmd); - /* Regular table with no row filter or generated columns */ - if (lrel.relkind == RELKIND_RELATION && qual == NIL && !gencol_published) + /* Regular or partitioned table with no row filter or generated columns */ + if ((lrel.relkind == RELKIND_RELATION || lrel.relkind == RELKIND_PARTITIONED_TABLE) + && qual == NIL && !gencol_published) { appendStringInfo(&cmd, "COPY %s", quote_qualified_identifier(lrel.nspname, lrel.relname)); @@ -1326,7 +1256,7 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) case SUBREL_STATE_SYNCDONE: case SUBREL_STATE_READY: case SUBREL_STATE_UNKNOWN: - finish_sync_worker(); /* doesn't return */ + FinishSyncWorker(); /* doesn't return */ } /* Calculate the name of the tablesync slot. */ @@ -1403,12 +1333,27 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) MyLogicalRepWorker->relstate_lsn = InvalidXLogRecPtr; SpinLockRelease(&MyLogicalRepWorker->relmutex); - /* Update the state and make it visible to others. */ + /* + * Update the state, create the replication origin, and make them visible + * to others. + */ StartTransactionCommand(); UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, MyLogicalRepWorker->relstate, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); + + /* + * Create the replication origin in a separate transaction from the one + * that sets up the origin in shared memory. This prevents the risk that + * changes to the origin in shared memory cannot be rolled back if the + * transaction aborts. + */ + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + originid = replorigin_create(originname); + CommitTransactionCommand(); pgstat_report_stat(true); @@ -1448,41 +1393,25 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) CRS_USE_SNAPSHOT, origin_startpos); /* - * Setup replication origin tracking. The purpose of doing this before the - * copy is to avoid doing the copy again due to any error in setting up - * origin tracking. + * Advance the origin to the LSN got from walrcv_create_slot and then set + * up the origin. The advancement is WAL logged for the purpose of + * recovery. Locks are to prevent the replication origin from vanishing + * while advancing. + * + * The purpose of doing these before the copy is to avoid doing the copy + * again due to any error in advancing or setting up origin tracking. */ - originid = replorigin_by_name(originname, true); - if (!OidIsValid(originid)) - { - /* - * Origin tracking does not exist, so create it now. - * - * Then advance to the LSN got from walrcv_create_slot. This is WAL - * logged for the purpose of recovery. Locks are to prevent the - * replication origin from vanishing while advancing. 
- */ - originid = replorigin_create(originname); + LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + replorigin_advance(originid, *origin_startpos, InvalidXLogRecPtr, + true /* go backward */ , true /* WAL log */ ); + UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); - LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); - replorigin_advance(originid, *origin_startpos, InvalidXLogRecPtr, - true /* go backward */ , true /* WAL log */ ); - UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); - - replorigin_session_setup(originid, 0); - replorigin_session_origin = originid; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("replication origin \"%s\" already exists", - originname))); - } + replorigin_session_setup(originid, 0); + replorigin_session_origin = originid; /* - * Make sure that the copy command runs as the table owner, unless the - * user has opted out of that behaviour. + * If the user did not opt to run as the owner of the subscription + * ('run_as_owner'), then copy the table as the owner of the table. */ run_as_owner = MySubscription->runasowner; if (!run_as_owner) @@ -1541,14 +1470,15 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, SUBREL_STATE_FINISHEDCOPY, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); CommitTransactionCommand(); copy_table_done: elog(DEBUG1, - "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%X", + "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%08X", originname, LSN_FORMAT_ARGS(*origin_startpos)); /* @@ -1567,77 +1497,6 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) return slotname; } -/* - * Common code to fetch the up-to-date sync state info into the static lists. - * - * Returns true if subscription has 1 or more tables, else false. - * - * Note: If this function started the transaction (indicated by the parameter) - * then it is the caller's responsibility to commit it. - */ -static bool -FetchTableStates(bool *started_tx) -{ - static bool has_subrels = false; - - *started_tx = false; - - if (table_states_validity != SYNC_TABLE_STATE_VALID) - { - MemoryContext oldctx; - List *rstates; - ListCell *lc; - SubscriptionRelState *rstate; - - table_states_validity = SYNC_TABLE_STATE_REBUILD_STARTED; - - /* Clean the old lists. */ - list_free_deep(table_states_not_ready); - table_states_not_ready = NIL; - - if (!IsTransactionState()) - { - StartTransactionCommand(); - *started_tx = true; - } - - /* Fetch all non-ready tables. */ - rstates = GetSubscriptionRelations(MySubscription->oid, true); - - /* Allocate the tracking info in a permanent memory context. */ - oldctx = MemoryContextSwitchTo(CacheMemoryContext); - foreach(lc, rstates) - { - rstate = palloc(sizeof(SubscriptionRelState)); - memcpy(rstate, lfirst(lc), sizeof(SubscriptionRelState)); - table_states_not_ready = lappend(table_states_not_ready, rstate); - } - MemoryContextSwitchTo(oldctx); - - /* - * Does the subscription have tables? - * - * If there were not-READY relations found then we know it does. But - * if table_states_not_ready was empty we still need to check again to - * see if there are 0 tables. 
- */ - has_subrels = (table_states_not_ready != NIL) || - HasSubscriptionRelations(MySubscription->oid); - - /* - * If the subscription relation cache has been invalidated since we - * entered this routine, we still use and return the relations we just - * finished constructing, to avoid infinite loops, but we leave the - * table states marked as stale so that we'll rebuild it again on next - * access. Otherwise, we mark the table states as valid. - */ - if (table_states_validity == SYNC_TABLE_STATE_REBUILD_STARTED) - table_states_validity = SYNC_TABLE_STATE_VALID; - } - - return has_subrels; -} - /* * Execute the initial sync with error handling. Disable the subscription, * if it's required. @@ -1670,7 +1529,8 @@ start_table_sync(XLogRecPtr *origin_startpos, char **slotname) * idle state. */ AbortOutOfAnyTransaction(); - pgstat_report_subscription_error(MySubscription->oid, false); + pgstat_report_subscription_error(MySubscription->oid, + WORKERTYPE_TABLESYNC); PG_RE_THROW(); } @@ -1689,7 +1549,7 @@ start_table_sync(XLogRecPtr *origin_startpos, char **slotname) * and starts streaming to catchup with apply worker. */ static void -run_tablesync_worker() +run_tablesync_worker(void) { char originname[NAMEDATALEN]; XLogRecPtr origin_startpos = InvalidXLogRecPtr; @@ -1715,7 +1575,7 @@ run_tablesync_worker() /* Logical Replication Tablesync worker entry point */ void -TablesyncWorkerMain(Datum main_arg) +TableSyncWorkerMain(Datum main_arg) { int worker_slot = DatumGetInt32(main_arg); @@ -1723,7 +1583,7 @@ TablesyncWorkerMain(Datum main_arg) run_tablesync_worker(); - finish_sync_worker(); + FinishSyncWorker(); } /* @@ -1737,11 +1597,11 @@ TablesyncWorkerMain(Datum main_arg) bool AllTablesyncsReady(void) { - bool started_tx = false; - bool has_subrels = false; + bool started_tx; + bool has_tables; /* We need up-to-date sync state info for subscription tables here. */ - has_subrels = FetchTableStates(&started_tx); + FetchRelationStates(&has_tables, NULL, &started_tx); if (started_tx) { @@ -1753,7 +1613,33 @@ AllTablesyncsReady(void) * Return false when there are no tables in subscription or not all tables * are in ready state; true otherwise. */ - return has_subrels && (table_states_not_ready == NIL); + return has_tables && (table_states_not_ready == NIL); +} + +/* + * Return whether the subscription currently has any tables. + * + * Note: Unlike HasSubscriptionTables(), this function relies on cached + * information for subscription tables. Additionally, it should not be + * invoked outside of apply or tablesync workers, as MySubscription must be + * initialized first. + */ +bool +HasSubscriptionTablesCached(void) +{ + bool started_tx; + bool has_tables; + + /* We need up-to-date subscription tables info here */ + FetchRelationStates(&has_tables, NULL, &started_tx); + + if (started_tx) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + return has_tables; } /* diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 4151a4b2a96ba..718408bb599b4 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -91,7 +91,7 @@ * behave as if two_phase = off. When the apply worker detects that all * tablesyncs have become READY (while the tri-state was PENDING) it will * restart the apply worker process. This happens in - * process_syncing_tables_for_apply. + * ProcessSyncingTablesForApply. 
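 *
 * As an illustration: while the tri-state is PENDING the apply worker
 * connects with two_phase disabled, so a transaction PREPAREd on the
 * publisher is not applied at PREPARE time; its changes arrive, and are
 * applied, only when the COMMIT PREPARED is decoded, as a plain
 * transaction.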
* * When the (re-started) apply worker finds that all tablesyncs are READY for a * two_phase tri-state of PENDING it start streaming messages with the @@ -109,13 +109,6 @@ * If ever a user needs to be aware of the tri-state value, they can fetch it * from the pg_subscription catalog (see column subtwophasestate). * - * We don't allow to toggle two_phase option of a subscription because it can - * lead to an inconsistent replica. Consider, initially, it was on and we have - * received some prepare then we turn it off, now at commit time the server - * will send the entire transaction data along with the commit. With some more - * analysis, we can allow changing this option from off to on but not sure if - * that alone would be useful. - * * Finally, to avoid problems mentioned in previous paragraphs from any * subsequent (not READY) tablesyncs (need to toggle two_phase option from 'on' * to 'off' and then again back to 'on') there is a restriction for @@ -139,6 +132,113 @@ * failover = true when creating the subscription. Enabling failover allows us * to smoothly transition to the promoted standby, ensuring that we can * subscribe to the new primary without losing any data. + * + * RETAIN DEAD TUPLES + * ---------------------- + * Each apply worker that enabled retain_dead_tuples option maintains a + * non-removable transaction ID (oldest_nonremovable_xid) in shared memory to + * prevent dead rows from being removed prematurely when the apply worker still + * needs them to detect update_deleted conflicts. Additionally, this helps to + * retain the required commit_ts module information, which further helps to + * detect update_origin_differs and delete_origin_differs conflicts reliably, as + * otherwise, vacuum freeze could remove the required information. + * + * The logical replication launcher manages an internal replication slot named + * "pg_conflict_detection". It asynchronously aggregates the non-removable + * transaction ID from all apply workers to determine the appropriate xmin for + * the slot, thereby retaining necessary tuples. + * + * The non-removable transaction ID in the apply worker is advanced to the + * oldest running transaction ID once all concurrent transactions on the + * publisher have been applied and flushed locally. The process involves: + * + * - RDT_GET_CANDIDATE_XID: + * Call GetOldestActiveTransactionId() to take oldestRunningXid as the + * candidate xid. + * + * - RDT_REQUEST_PUBLISHER_STATUS: + * Send a message to the walsender requesting the publisher status, which + * includes the latest WAL write position and information about transactions + * that are in the commit phase. + * + * - RDT_WAIT_FOR_PUBLISHER_STATUS: + * Wait for the status from the walsender. After receiving the first status, + * do not proceed if there are concurrent remote transactions that are still + * in the commit phase. These transactions might have been assigned an + * earlier commit timestamp but have not yet written the commit WAL record. + * Continue to request the publisher status (RDT_REQUEST_PUBLISHER_STATUS) + * until all these transactions have completed. + * + * - RDT_WAIT_FOR_LOCAL_FLUSH: + * Advance the non-removable transaction ID if the current flush location has + * reached or surpassed the last received WAL position. + * + * - RDT_STOP_CONFLICT_INFO_RETENTION: + * This phase is required only when max_retention_duration is defined. 
We
+ * enter this phase if the wait time in either the
+ * RDT_WAIT_FOR_PUBLISHER_STATUS or RDT_WAIT_FOR_LOCAL_FLUSH phase exceeds
+ * the configured max_retention_duration. In this phase,
+ * pg_subscription.subretentionactive is updated to false within a new
+ * transaction, and oldest_nonremovable_xid is set to InvalidTransactionId.
+ *
+ * - RDT_RESUME_CONFLICT_INFO_RETENTION:
+ * This phase is required only when max_retention_duration is defined. We
+ * enter this phase if the retention was previously stopped, and the time
+ * required to advance the non-removable transaction ID in the
+ * RDT_WAIT_FOR_LOCAL_FLUSH phase has decreased to within acceptable limits
+ * (or if max_retention_duration is set to 0). During this phase,
+ * pg_subscription.subretentionactive is updated to true within a new
+ * transaction, and the worker will be restarted.
+ *
+ * The overall state progression is: GET_CANDIDATE_XID ->
+ * REQUEST_PUBLISHER_STATUS -> WAIT_FOR_PUBLISHER_STATUS -> (loop to
+ * REQUEST_PUBLISHER_STATUS till concurrent remote transactions end) ->
+ * WAIT_FOR_LOCAL_FLUSH -> loop back to GET_CANDIDATE_XID.
+ *
+ * Retaining the dead tuples for this period is sufficient for ensuring
+ * eventual consistency using the last-update-wins strategy, as dead tuples
+ * are useful for detecting conflicts only during the application of
+ * concurrent transactions from remote nodes. After applying and flushing
+ * all remote transactions that occurred concurrently with the tuple DELETE,
+ * any subsequent UPDATE from a remote node should have a later timestamp.
+ * In such cases, it is acceptable to detect an update_missing scenario and
+ * convert the UPDATE to an INSERT when applying it. But, for concurrent
+ * remote transactions with earlier timestamps than the DELETE, detecting
+ * update_deleted is necessary, as the UPDATEs in remote transactions should
+ * be ignored if their timestamp is earlier than that of the dead tuples.
+ *
+ * Note that advancing the non-removable transaction ID is not supported if
+ * the publisher is also a physical standby. This is because the logical
+ * walsender on the standby can only get the WAL replay position, but there
+ * may be more WAL that is being replicated from the primary, and that WAL
+ * could have earlier commit timestamps.
+ *
+ * Similarly, when the publisher has subscribed to another publisher,
+ * information necessary for conflict detection cannot be retained for
+ * changes from origins other than the publisher. This is because the
+ * publisher lacks the information on concurrent transactions of other
+ * publishers to which it subscribes. As the information on concurrent
+ * transactions is unavailable beyond the subscriber's immediate publishers,
+ * the non-removable transaction ID might be advanced prematurely before
+ * changes from other origins have been fully applied.
+ *
+ * XXX Retaining information for changes from other origins might be
+ * possible by requesting the subscription on that origin to enable
+ * retain_dead_tuples and fetching the conflict detection slot.xmin along
+ * with the publisher's status. In the RDT_WAIT_FOR_PUBLISHER_STATUS phase,
+ * the apply worker could wait for the remote slot's xmin to reach the
+ * oldest active transaction ID, ensuring that all transactions from other
+ * origins have been applied on the publisher, thereby getting the latest
+ * WAL position that includes all concurrent changes. However, this approach
+ * may impact performance, so it might not be worth the effort.
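+ *
+ * As a concrete (hypothetical) walk-through of that progression: suppose
+ * GetOldestActiveTransactionId() returns 750 on the subscriber. The worker
+ * records candidate_xid = 750 and requests the publisher status, learning,
+ * say, remote_nextxid = 1000 and remote_oldestxid = 990. It keeps
+ * re-requesting the status until remote_oldestxid reaches 1000, then waits
+ * until the local flush position passes the remote_lsn reported at that
+ * point, and finally publishes oldest_nonremovable_xid = 750.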
+ * + * XXX It seems feasible to get the latest commit's WAL location from the + * publisher and wait till that is applied. However, we can't do that + * because commit timestamps can regress as a commit with a later LSN is not + * guaranteed to have a later timestamp than those with earlier LSNs. Having + * said that, even if that is possible, it won't improve performance much as + * the apply always lag and moves slowly as compared with the transactions + * on the publisher. *------------------------------------------------------------------------- */ @@ -147,6 +247,7 @@ #include #include +#include "access/commit_ts.h" #include "access/table.h" #include "access/tableam.h" #include "access/twophase.h" @@ -155,6 +256,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_subscription.h" #include "catalog/pg_subscription_rel.h" +#include "commands/subscriptioncmds.h" #include "commands/tablecmds.h" #include "commands/trigger.h" #include "executor/executor.h" @@ -173,15 +275,16 @@ #include "replication/logicalrelation.h" #include "replication/logicalworker.h" #include "replication/origin.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "rewrite/rewriteHandler.h" #include "storage/buffile.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/procarray.h" #include "tcop/tcopprot.h" #include "utils/acl.h" -#include "utils/dynahash.h" #include "utils/guc.h" #include "utils/inval.h" #include "utils/lsyscache.h" @@ -275,6 +378,83 @@ typedef enum TRANS_PARALLEL_APPLY, } TransApplyAction; +/* + * The phases involved in advancing the non-removable transaction ID. + * + * See comments atop worker.c for details of the transition between these + * phases. + */ +typedef enum +{ + RDT_GET_CANDIDATE_XID, + RDT_REQUEST_PUBLISHER_STATUS, + RDT_WAIT_FOR_PUBLISHER_STATUS, + RDT_WAIT_FOR_LOCAL_FLUSH, + RDT_STOP_CONFLICT_INFO_RETENTION, + RDT_RESUME_CONFLICT_INFO_RETENTION, +} RetainDeadTuplesPhase; + +/* + * Critical information for managing phase transitions within the + * RetainDeadTuplesPhase. + */ +typedef struct RetainDeadTuplesData +{ + RetainDeadTuplesPhase phase; /* current phase */ + XLogRecPtr remote_lsn; /* WAL write position on the publisher */ + + /* + * Oldest transaction ID that was in the commit phase on the publisher. + * Use FullTransactionId to prevent issues with transaction ID wraparound, + * where a new remote_oldestxid could falsely appear to originate from the + * past and block advancement. + */ + FullTransactionId remote_oldestxid; + + /* + * Next transaction ID to be assigned on the publisher. Use + * FullTransactionId for consistency and to allow straightforward + * comparisons with remote_oldestxid. + */ + FullTransactionId remote_nextxid; + + TimestampTz reply_time; /* when the publisher responds with status */ + + /* + * Publisher transaction ID that must be awaited to complete before + * entering the final phase (RDT_WAIT_FOR_LOCAL_FLUSH). Use + * FullTransactionId for the same reason as remote_nextxid. + */ + FullTransactionId remote_wait_for; + + TransactionId candidate_xid; /* candidate for the non-removable + * transaction ID */ + TimestampTz flushpos_update_time; /* when the remote flush position was + * updated in final phase + * (RDT_WAIT_FOR_LOCAL_FLUSH) */ + + long table_sync_wait_time; /* time spent waiting for table sync + * to finish */ + + /* + * The following fields are used to determine the timing for the next + * round of transaction ID advancement. 
+ */ + TimestampTz last_recv_time; /* when the last message was received */ + TimestampTz candidate_xid_time; /* when the candidate_xid is decided */ + int xid_advance_interval; /* how much time (ms) to wait before + * attempting to advance the + * non-removable transaction ID */ +} RetainDeadTuplesData; + +/* + * The minimum (100ms) and maximum (3 minutes) intervals for advancing + * non-removable transaction IDs. The maximum interval is a bit arbitrary but + * is sufficient to not cause any undue network traffic. + */ +#define MIN_XID_ADVANCE_INTERVAL 100 +#define MAX_XID_ADVANCE_INTERVAL 180000 + /* errcontext tracker */ static ApplyErrorCallbackArg apply_error_callback_arg = { @@ -334,16 +514,23 @@ bool InitializingApplyWorker = false; * by the user. */ static XLogRecPtr skip_xact_finish_lsn = InvalidXLogRecPtr; -#define is_skipping_changes() (unlikely(!XLogRecPtrIsInvalid(skip_xact_finish_lsn))) +#define is_skipping_changes() (unlikely(XLogRecPtrIsValid(skip_xact_finish_lsn))) /* BufFile handle of the current streaming file */ static BufFile *stream_fd = NULL; +/* + * The remote WAL position that has been applied and flushed locally. We record + * and use this information both while sending feedback to the server and + * advancing oldest_nonremovable_xid. + */ +static XLogRecPtr last_flushpos = InvalidXLogRecPtr; + typedef struct SubXactInfo { TransactionId xid; /* XID of the subxact */ int fileno; /* file number in the buffile */ - off_t offset; /* offset in the file */ + pgoff_t offset; /* offset in the file */ } SubXactInfo; /* Sub-transaction data for the current streaming transaction */ @@ -379,6 +566,26 @@ static void stream_close_file(void); static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); +static void maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data, + bool status_received); +static bool can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data); +static void process_rdt_phase_transition(RetainDeadTuplesData *rdt_data, + bool status_received); +static void get_candidate_xid(RetainDeadTuplesData *rdt_data); +static void request_publisher_status(RetainDeadTuplesData *rdt_data); +static void wait_for_publisher_status(RetainDeadTuplesData *rdt_data, + bool status_received); +static void wait_for_local_flush(RetainDeadTuplesData *rdt_data); +static bool should_stop_conflict_info_retention(RetainDeadTuplesData *rdt_data); +static void stop_conflict_info_retention(RetainDeadTuplesData *rdt_data); +static void resume_conflict_info_retention(RetainDeadTuplesData *rdt_data); +static bool update_retention_status(bool active); +static void reset_retention_data_fields(RetainDeadTuplesData *rdt_data); +static void adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data, + bool new_xid_found); + +static void apply_worker_exit(void); + static void apply_handle_commit_internal(LogicalRepCommitData *commit_data); static void apply_handle_insert_internal(ApplyExecutionData *edata, ResultRelInfo *relinfo, @@ -397,6 +604,12 @@ static bool FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel Oid localidxoid, TupleTableSlot *remoteslot, TupleTableSlot **localslot); +static bool FindDeletedTupleInLocalRel(Relation localrel, + Oid localidxoid, + TupleTableSlot *remoteslot, + TransactionId *delete_xid, + RepOriginId *delete_origin, + TimestampTz *delete_time); static void apply_handle_tuple_routing(ApplyExecutionData *edata, TupleTableSlot *remoteslot, LogicalRepTupleData *newtup, @@ -489,6 +702,11 @@ 
should_apply_changes_for_rel(LogicalRepRelMapEntry *rel) (rel->state == SUBREL_STATE_SYNCDONE && rel->statelsn <= remote_final_lsn)); + case WORKERTYPE_SEQUENCESYNC: + /* Should never happen. */ + elog(ERROR, "sequence synchronization worker is not expected to apply changes"); + break; + case WORKERTYPE_UNKNOWN: /* Should never happen. */ elog(ERROR, "Unknown worker type"); @@ -657,7 +875,7 @@ create_edata_for_relation(LogicalRepRelMapEntry *rel) List *perminfos = NIL; ResultRelInfo *resultRelInfo; - edata = (ApplyExecutionData *) palloc0(sizeof(ApplyExecutionData)); + edata = palloc0_object(ApplyExecutionData); edata->targetRel = rel; edata->estate = estate = CreateExecutorState(); @@ -762,9 +980,10 @@ slot_fill_defaults(LogicalRepRelMapEntry *rel, EState *estate, Assert(rel->attrmap->maplen == num_phys_attrs); for (attnum = 0; attnum < num_phys_attrs; attnum++) { + CompactAttribute *cattr = TupleDescCompactAttr(desc, attnum); Expr *defexpr; - if (TupleDescAttr(desc, attnum)->attisdropped || TupleDescAttr(desc, attnum)->attgenerated) + if (cattr->attisdropped || cattr->attgenerated) continue; if (rel->attrmap->attnums[attnum] >= 0) @@ -1023,14 +1242,17 @@ apply_handle_commit(StringInfo s) if (commit_data.commit_lsn != remote_final_lsn) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg_internal("incorrect commit LSN %X/%X in commit message (expected %X/%X)", + errmsg_internal("incorrect commit LSN %X/%08X in commit message (expected %X/%08X)", LSN_FORMAT_ARGS(commit_data.commit_lsn), LSN_FORMAT_ARGS(remote_final_lsn)))); apply_handle_commit_internal(&commit_data); - /* Process any tables that are being synchronized in parallel. */ - process_syncing_tables(commit_data.end_lsn); + /* + * Process any tables that are being synchronized in parallel, as well as + * any newly added tables or sequences. + */ + ProcessSyncingRelations(commit_data.end_lsn); pgstat_report_activity(STATE_IDLE, NULL); reset_apply_error_context_info(); @@ -1115,7 +1337,7 @@ apply_handle_prepare(StringInfo s) if (prepare_data.prepare_lsn != remote_final_lsn) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg_internal("incorrect prepare LSN %X/%X in prepare message (expected %X/%X)", + errmsg_internal("incorrect prepare LSN %X/%08X in prepare message (expected %X/%08X)", LSN_FORMAT_ARGS(prepare_data.prepare_lsn), LSN_FORMAT_ARGS(remote_final_lsn)))); @@ -1151,8 +1373,11 @@ apply_handle_prepare(StringInfo s) in_remote_transaction = false; - /* Process any tables that are being synchronized in parallel. */ - process_syncing_tables(prepare_data.end_lsn); + /* + * Process any tables that are being synchronized in parallel, as well as + * any newly added tables or sequences. + */ + ProcessSyncingRelations(prepare_data.end_lsn); /* * Since we have already prepared the transaction, in a case where the @@ -1207,8 +1432,11 @@ apply_handle_commit_prepared(StringInfo s) store_flush_position(prepare_data.end_lsn, XactLastCommitEnd); in_remote_transaction = false; - /* Process any tables that are being synchronized in parallel. */ - process_syncing_tables(prepare_data.end_lsn); + /* + * Process any tables that are being synchronized in parallel, as well as + * any newly added tables or sequences. 
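+ *
+ * A plausible sketch of that dispatcher (the actual
+ * ProcessSyncingRelations() is defined in the sync code elsewhere in this
+ * patch; the branches shown here follow the worker types used below):
+ *
+ *   switch (MyLogicalRepWorker->type)
+ *   {
+ *       case WORKERTYPE_TABLESYNC:
+ *           ProcessSyncingTablesForSync(current_lsn);
+ *           break;
+ *       case WORKERTYPE_APPLY:
+ *           ProcessSyncingTablesForApply(current_lsn);
+ *           break;
+ *       ...
+ *   }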
+ */ + ProcessSyncingRelations(prepare_data.end_lsn); clear_subscription_skip_lsn(prepare_data.end_lsn); @@ -1273,8 +1501,11 @@ apply_handle_rollback_prepared(StringInfo s) store_flush_position(rollback_data.rollback_end_lsn, InvalidXLogRecPtr); in_remote_transaction = false; - /* Process any tables that are being synchronized in parallel. */ - process_syncing_tables(rollback_data.rollback_end_lsn); + /* + * Process any tables that are being synchronized in parallel, as well as + * any newly added tables or sequences. + */ + ProcessSyncingRelations(rollback_data.rollback_end_lsn); pgstat_report_activity(STATE_IDLE, NULL); reset_apply_error_context_info(); @@ -1408,8 +1639,11 @@ apply_handle_stream_prepare(StringInfo s) pgstat_report_stat(false); - /* Process any tables that are being synchronized in parallel. */ - process_syncing_tables(prepare_data.end_lsn); + /* + * Process any tables that are being synchronized in parallel, as well as + * any newly added tables or sequences. + */ + ProcessSyncingRelations(prepare_data.end_lsn); /* * Similar to prepare case, the subskiplsn could be left in a case of @@ -1468,7 +1702,7 @@ stream_start_internal(TransactionId xid, bool first_segment) oldctx = MemoryContextSwitchTo(ApplyContext); - MyLogicalRepWorker->stream_fileset = palloc(sizeof(FileSet)); + MyLogicalRepWorker->stream_fileset = palloc_object(FileSet); FileSetInit(MyLogicalRepWorker->stream_fileset); MemoryContextSwitchTo(oldctx); @@ -1603,7 +1837,8 @@ apply_handle_stream_start(StringInfo s) * Signal the leader apply worker, as it may be waiting for * us. */ - logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid); + logicalrep_worker_wakeup(WORKERTYPE_APPLY, + MyLogicalRepWorker->subid, InvalidOid); } parallel_stream_nchanges = 0; @@ -1991,12 +2226,12 @@ apply_handle_stream_abort(StringInfo s) */ static void ensure_last_message(FileSet *stream_fileset, TransactionId xid, int fileno, - off_t offset) + pgoff_t offset) { char path[MAXPGPATH]; BufFile *fd; int last_fileno; - off_t last_offset; + pgoff_t last_offset; Assert(!IsTransactionState()); @@ -2031,7 +2266,7 @@ apply_spooled_messages(FileSet *stream_fileset, TransactionId xid, MemoryContext oldcxt; ResourceOwner oldowner; int fileno; - off_t offset; + pgoff_t offset; if (!am_parallel_apply_worker()) maybe_start_skipping_changes(lsn); @@ -2250,8 +2485,11 @@ apply_handle_stream_commit(StringInfo s) break; } - /* Process any tables that are being synchronized in parallel. */ - process_syncing_tables(commit_data.end_lsn); + /* + * Process any tables that are being synchronized in parallel, as well as + * any newly added tables or sequences. + */ + ProcessSyncingRelations(commit_data.end_lsn); pgstat_report_activity(STATE_IDLE, NULL); @@ -2620,7 +2858,7 @@ apply_handle_update(StringInfo s) target_perminfo = list_nth(estate->es_rteperminfos, 0); for (int i = 0; i < remoteslot->tts_tupleDescriptor->natts; i++) { - Form_pg_attribute att = TupleDescAttr(remoteslot->tts_tupleDescriptor, i); + CompactAttribute *att = TupleDescCompactAttr(remoteslot->tts_tupleDescriptor, i); int remoteattnum = rel->attrmap->attnums[i]; if (!att->attisdropped && remoteattnum >= 0) @@ -2733,17 +2971,31 @@ apply_handle_update_internal(ApplyExecutionData *edata, } else { + ConflictType type; TupleTableSlot *newslot = localslot; + /* + * Detecting whether the tuple was recently deleted or never existed + * is crucial to avoid misleading the user during conflict handling. 
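+ *
+ * For example (timestamps hypothetical): if the local row was deleted at
+ * t = 120 by a different origin and a remote UPDATE committed at t = 100
+ * arrives afterwards, finding the dead tuple lets us report update_deleted
+ * (so the older UPDATE can be ignored) rather than the misleading
+ * update_missing.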
+ */ + if (FindDeletedTupleInLocalRel(localrel, localindexoid, remoteslot, + &conflicttuple.xmin, + &conflicttuple.origin, + &conflicttuple.ts) && + conflicttuple.origin != replorigin_session_origin) + type = CT_UPDATE_DELETED; + else + type = CT_UPDATE_MISSING; + /* Store the new tuple for conflict reporting */ slot_store_data(newslot, relmapentry, newtup); /* - * The tuple to be updated could not be found. Do nothing except for - * emitting a log message. + * The tuple to be updated could not be found or was deleted. Do + * nothing except for emitting a log message. */ - ReportApplyConflict(estate, relinfo, LOG, CT_UPDATE_MISSING, - remoteslot, newslot, list_make1(&conflicttuple)); + ReportApplyConflict(estate, relinfo, LOG, type, remoteslot, newslot, + list_make1(&conflicttuple)); } /* Cleanup. */ @@ -2963,6 +3215,135 @@ FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel, return found; } +/* + * Determine whether the index can reliably locate the deleted tuple in the + * local relation. + * + * An index may exclude deleted tuples if it was re-indexed or re-created during + * change application. Therefore, an index is considered usable only if the + * conflict detection slot.xmin (conflict_detection_xmin) is greater than the + * index tuple's xmin. This ensures that any tuples deleted prior to the index + * creation or re-indexing are not relevant for conflict detection in the + * current apply worker. + * + * Note that indexes may also be excluded if they were modified by other DDL + * operations, such as ALTER INDEX. However, this is acceptable, as the + * likelihood of such DDL changes coinciding with the need to scan dead + * tuples for the update_deleted is low. + */ +static bool +IsIndexUsableForFindingDeletedTuple(Oid localindexoid, + TransactionId conflict_detection_xmin) +{ + HeapTuple index_tuple; + TransactionId index_xmin; + + index_tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(localindexoid)); + + if (!HeapTupleIsValid(index_tuple)) /* should not happen */ + elog(ERROR, "cache lookup failed for index %u", localindexoid); + + /* + * No need to check for a frozen transaction ID, as + * TransactionIdPrecedes() manages it internally, treating it as falling + * behind the conflict_detection_xmin. + */ + index_xmin = HeapTupleHeaderGetXmin(index_tuple->t_data); + + ReleaseSysCache(index_tuple); + + return TransactionIdPrecedes(index_xmin, conflict_detection_xmin); +} + +/* + * Attempts to locate a deleted tuple in the local relation that matches the + * values of the tuple received from the publication side (in 'remoteslot'). + * The search is performed using either the replica identity index, primary + * key, other available index, or a sequential scan if necessary. + * + * Returns true if the deleted tuple is found. If found, the transaction ID, + * origin, and commit timestamp of the deletion are stored in '*delete_xid', + * '*delete_origin', and '*delete_time' respectively. + */ +static bool +FindDeletedTupleInLocalRel(Relation localrel, Oid localidxoid, + TupleTableSlot *remoteslot, + TransactionId *delete_xid, RepOriginId *delete_origin, + TimestampTz *delete_time) +{ + TransactionId oldestxmin; + + /* + * Return false if either dead tuples are not retained or commit timestamp + * data is not available. 
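+ * (In other words, both the subscription's retain_dead_tuples option and
+ * the track_commit_timestamp GUC must be enabled for this detection to be
+ * attempted.)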
+ */ + if (!MySubscription->retaindeadtuples || !track_commit_timestamp) + return false; + + /* + * For conflict detection, we use the leader worker's + * oldest_nonremovable_xid value instead of invoking + * GetOldestNonRemovableTransactionId() or using the conflict detection + * slot's xmin. The oldest_nonremovable_xid acts as a threshold to + * identify tuples that were recently deleted. These deleted tuples are no + * longer visible to concurrent transactions. However, if a remote update + * matches such a tuple, we log an update_deleted conflict. + * + * While GetOldestNonRemovableTransactionId() and slot.xmin may return + * transaction IDs older than oldest_nonremovable_xid, for our current + * purpose, it is acceptable to treat tuples deleted by transactions prior + * to oldest_nonremovable_xid as update_missing conflicts. + */ + if (am_leader_apply_worker()) + { + oldestxmin = MyLogicalRepWorker->oldest_nonremovable_xid; + } + else + { + LogicalRepWorker *leader; + + /* + * Obtain the information from the leader apply worker as only the + * leader manages oldest_nonremovable_xid (see + * maybe_advance_nonremovable_xid() for details). + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + leader = logicalrep_worker_find(WORKERTYPE_APPLY, + MyLogicalRepWorker->subid, InvalidOid, + false); + if (!leader) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not detect conflict as the leader apply worker has exited"))); + } + + SpinLockAcquire(&leader->relmutex); + oldestxmin = leader->oldest_nonremovable_xid; + SpinLockRelease(&leader->relmutex); + LWLockRelease(LogicalRepWorkerLock); + } + + /* + * Return false if the leader apply worker has stopped retaining + * information for detecting conflicts. This implies that update_deleted + * can no longer be reliably detected. + */ + if (!TransactionIdIsValid(oldestxmin)) + return false; + + if (OidIsValid(localidxoid) && + IsIndexUsableForFindingDeletedTuple(localidxoid, oldestxmin)) + return RelationFindDeletedTupleInfoByIndex(localrel, localidxoid, + remoteslot, oldestxmin, + delete_xid, delete_origin, + delete_time); + else + return RelationFindDeletedTupleInfoSeq(localrel, remoteslot, + oldestxmin, delete_xid, + delete_origin, delete_time); +} + /* * This handles insert, update, delete on a partitioned table. */ @@ -3012,6 +3393,7 @@ apply_handle_tuple_routing(ApplyExecutionData *edata, * at CREATE/ALTER SUBSCRIPTION would be insufficient. */ CheckSubscriptionRelkind(partrel->rd_rel->relkind, + relmapentry->remoterel.relkind, get_namespace_name(RelationGetNamespace(partrel)), RelationGetRelationName(partrel)); @@ -3081,18 +3463,35 @@ apply_handle_tuple_routing(ApplyExecutionData *edata, remoteslot_part, &localslot); if (!found) { + ConflictType type; TupleTableSlot *newslot = localslot; + /* + * Detecting whether the tuple was recently deleted or + * never existed is crucial to avoid misleading the user + * during conflict handling. + */ + if (FindDeletedTupleInLocalRel(partrel, + part_entry->localindexoid, + remoteslot_part, + &conflicttuple.xmin, + &conflicttuple.origin, + &conflicttuple.ts) && + conflicttuple.origin != replorigin_session_origin) + type = CT_UPDATE_DELETED; + else + type = CT_UPDATE_MISSING; + /* Store the new tuple for conflict reporting */ slot_store_data(newslot, part_entry, newtup); /* - * The tuple to be updated could not be found. Do nothing - * except for emitting a log message. + * The tuple to be updated could not be found or was + * deleted. 
Do nothing except for emitting a log message. */ ReportApplyConflict(estate, partrelinfo, LOG, - CT_UPDATE_MISSING, remoteslot_part, - newslot, list_make1(&conflicttuple)); + type, remoteslot_part, newslot, + list_make1(&conflicttuple)); return; } @@ -3191,6 +3590,7 @@ apply_handle_tuple_routing(ApplyExecutionData *edata, /* Check that new partition also has supported relkind. */ CheckSubscriptionRelkind(partrel_new->rd_rel->relkind, + relmapentry->remoterel.relkind, get_namespace_name(RelationGetNamespace(partrel_new)), RelationGetRelationName(partrel_new)); @@ -3551,7 +3951,7 @@ store_flush_position(XLogRecPtr remote_lsn, XLogRecPtr local_lsn) MemoryContextSwitchTo(ApplyContext); /* Track commit lsn */ - flushpos = (FlushPosition *) palloc(sizeof(FlushPosition)); + flushpos = palloc_object(FlushPosition); flushpos->local_end = local_lsn; flushpos->remote_end = remote_lsn; @@ -3584,6 +3984,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received) bool ping_sent = false; TimeLineID tli; ErrorContextCallback errcallback; + RetainDeadTuplesData rdt_data = {0}; /* * Init the ApplyMessageContext which we clean up after each replication @@ -3662,6 +4063,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) last_recv_timestamp = GetCurrentTimestamp(); ping_sent = false; + rdt_data.last_recv_time = last_recv_timestamp; + /* Ensure we are reading the data into our memory context. */ MemoryContextSwitchTo(ApplyMessageContext); @@ -3669,7 +4072,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received) c = pq_getmsgbyte(&s); - if (c == 'w') + if (c == PqReplMsg_WALData) { XLogRecPtr start_lsn; XLogRecPtr end_lsn; @@ -3688,8 +4091,10 @@ LogicalRepApplyLoop(XLogRecPtr last_received) UpdateWorkerStats(last_received, send_time, false); apply_dispatch(&s); + + maybe_advance_nonremovable_xid(&rdt_data, false); } - else if (c == 'k') + else if (c == PqReplMsg_Keepalive) { XLogRecPtr end_lsn; TimestampTz timestamp; @@ -3703,8 +4108,31 @@ LogicalRepApplyLoop(XLogRecPtr last_received) last_received = end_lsn; send_feedback(last_received, reply_requested, false); + + maybe_advance_nonremovable_xid(&rdt_data, false); + UpdateWorkerStats(last_received, timestamp, true); } + else if (c == PqReplMsg_PrimaryStatusUpdate) + { + rdt_data.remote_lsn = pq_getmsgint64(&s); + rdt_data.remote_oldestxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s)); + rdt_data.remote_nextxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s)); + rdt_data.reply_time = pq_getmsgint64(&s); + + /* + * This should never happen, see + * ProcessStandbyPSRequestMessage. But if it happens + * due to a bug, we don't want to proceed as it can + * incorrectly advance oldest_nonremovable_xid. + */ + if (!XLogRecPtrIsValid(rdt_data.remote_lsn)) + elog(ERROR, "cannot get the latest WAL position from the publisher"); + + maybe_advance_nonremovable_xid(&rdt_data, true); + + UpdateWorkerStats(last_received, rdt_data.reply_time, false); + } /* other message types are purposefully ignored */ MemoryContextReset(ApplyMessageContext); @@ -3717,6 +4145,11 @@ LogicalRepApplyLoop(XLogRecPtr last_received) /* confirm all writes so far */ send_feedback(last_received, false, false); + /* Reset the timestamp if no message was received */ + rdt_data.last_recv_time = 0; + + maybe_advance_nonremovable_xid(&rdt_data, false); + if (!in_remote_transaction && !in_streamed_transaction) { /* @@ -3727,8 +4160,11 @@ LogicalRepApplyLoop(XLogRecPtr last_received) AcceptInvalidationMessages(); maybe_reread_subscription(); - /* Process any table synchronization changes. 
*/ - process_syncing_tables(last_received); + /* + * Process any relations that are being synchronized in parallel + * and any newly added tables or sequences. + */ + ProcessSyncingRelations(last_received); } /* Cleanup the memory. */ @@ -3751,6 +4187,20 @@ LogicalRepApplyLoop(XLogRecPtr last_received) else wait_time = NAPTIME_PER_CYCLE; + /* + * Ensure to wake up when it's possible to advance the non-removable + * transaction ID, or when the retention duration may have exceeded + * max_retention_duration. + */ + if (MySubscription->retentionactive) + { + if (rdt_data.phase == RDT_GET_CANDIDATE_XID && + rdt_data.xid_advance_interval) + wait_time = Min(wait_time, rdt_data.xid_advance_interval); + else if (MySubscription->maxretention > 0) + wait_time = Min(wait_time, MySubscription->maxretention); + } + rc = WaitLatchOrSocket(MyLatch, WL_SOCKET_READABLE | WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, @@ -3814,6 +4264,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) send_feedback(last_received, requestReply, requestReply); + maybe_advance_nonremovable_xid(&rdt_data, false); + /* * Force reporting to ensure long idle periods don't lead to * arbitrarily delayed stats. Stats can only be reported outside @@ -3849,7 +4301,6 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) static XLogRecPtr last_recvpos = InvalidXLogRecPtr; static XLogRecPtr last_writepos = InvalidXLogRecPtr; - static XLogRecPtr last_flushpos = InvalidXLogRecPtr; XLogRecPtr writepos; XLogRecPtr flushpos; @@ -3903,14 +4354,14 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) else resetStringInfo(reply_message); - pq_sendbyte(reply_message, 'r'); + pq_sendbyte(reply_message, PqReplMsg_StandbyStatusUpdate); pq_sendint64(reply_message, recvpos); /* write */ pq_sendint64(reply_message, flushpos); /* flush */ pq_sendint64(reply_message, writepos); /* apply */ pq_sendint64(reply_message, now); /* sendTime */ pq_sendbyte(reply_message, requestReply); /* replyRequested */ - elog(DEBUG2, "sending feedback (force %d) to recv %X/%X, write %X/%X, flush %X/%X", + elog(DEBUG2, "sending feedback (force %d) to recv %X/%08X, write %X/%08X, flush %X/%08X", force, LSN_FORMAT_ARGS(recvpos), LSN_FORMAT_ARGS(writepos), @@ -3927,6 +4378,625 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) last_flushpos = flushpos; } +/* + * Attempt to advance the non-removable transaction ID. + * + * See comments atop worker.c for details. + */ +static void +maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + if (!can_advance_nonremovable_xid(rdt_data)) + return; + + process_rdt_phase_transition(rdt_data, status_received); +} + +/* + * Preliminary check to determine if advancing the non-removable transaction ID + * is allowed. + */ +static bool +can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data) +{ + /* + * It is sufficient to manage non-removable transaction ID for a + * subscription by the main apply worker to detect update_deleted reliably + * even for table sync or parallel apply workers. + */ + if (!am_leader_apply_worker()) + return false; + + /* No need to advance if retaining dead tuples is not required */ + if (!MySubscription->retaindeadtuples) + return false; + + return true; +} + +/* + * Process phase transitions during the non-removable transaction ID + * advancement. See comments atop worker.c for details of the transition. 
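+ *
+ * Note that the individual phase handlers advance rdt_data->phase and then
+ * call back into this function, so several phases can complete within a
+ * single invocation (e.g. RDT_GET_CANDIDATE_XID typically falls straight
+ * through to RDT_REQUEST_PUBLISHER_STATUS).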
+ */ +static void +process_rdt_phase_transition(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + switch (rdt_data->phase) + { + case RDT_GET_CANDIDATE_XID: + get_candidate_xid(rdt_data); + break; + case RDT_REQUEST_PUBLISHER_STATUS: + request_publisher_status(rdt_data); + break; + case RDT_WAIT_FOR_PUBLISHER_STATUS: + wait_for_publisher_status(rdt_data, status_received); + break; + case RDT_WAIT_FOR_LOCAL_FLUSH: + wait_for_local_flush(rdt_data); + break; + case RDT_STOP_CONFLICT_INFO_RETENTION: + stop_conflict_info_retention(rdt_data); + break; + case RDT_RESUME_CONFLICT_INFO_RETENTION: + resume_conflict_info_retention(rdt_data); + break; + } +} + +/* + * Workhorse for the RDT_GET_CANDIDATE_XID phase. + */ +static void +get_candidate_xid(RetainDeadTuplesData *rdt_data) +{ + TransactionId oldest_running_xid; + TimestampTz now; + + /* + * Use last_recv_time when applying changes in the loop to avoid + * unnecessary system time retrieval. If last_recv_time is not available, + * obtain the current timestamp. + */ + now = rdt_data->last_recv_time ? rdt_data->last_recv_time : GetCurrentTimestamp(); + + /* + * Compute the candidate_xid and request the publisher status at most once + * per xid_advance_interval. Refer to adjust_xid_advance_interval() for + * details on how this value is dynamically adjusted. This is to avoid + * using CPU and network resources without making much progress. + */ + if (!TimestampDifferenceExceeds(rdt_data->candidate_xid_time, now, + rdt_data->xid_advance_interval)) + return; + + /* + * Immediately update the timer, even if the function returns later + * without setting candidate_xid due to inactivity on the subscriber. This + * avoids frequent calls to GetOldestActiveTransactionId. + */ + rdt_data->candidate_xid_time = now; + + /* + * Consider transactions in the current database, as only dead tuples from + * this database are required for conflict detection. + */ + oldest_running_xid = GetOldestActiveTransactionId(false, false); + + /* + * Oldest active transaction ID (oldest_running_xid) can't be behind any + * of its previously computed value. + */ + Assert(TransactionIdPrecedesOrEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)); + + /* Return if the oldest_nonremovable_xid cannot be advanced */ + if (TransactionIdEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)) + { + adjust_xid_advance_interval(rdt_data, false); + return; + } + + adjust_xid_advance_interval(rdt_data, true); + + rdt_data->candidate_xid = oldest_running_xid; + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_REQUEST_PUBLISHER_STATUS phase. + */ +static void +request_publisher_status(RetainDeadTuplesData *rdt_data) +{ + static StringInfo request_message = NULL; + + if (!request_message) + { + MemoryContext oldctx = MemoryContextSwitchTo(ApplyContext); + + request_message = makeStringInfo(); + MemoryContextSwitchTo(oldctx); + } + else + resetStringInfo(request_message); + + /* + * Send the current time to update the remote walsender's latest reply + * message received time. 
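+ *
+ * The resulting request is tiny, consisting only of:
+ *
+ *   byte    PqReplMsg_PrimaryStatusRequest
+ *   int64   current timestamp on the subscriber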
+ */ + pq_sendbyte(request_message, PqReplMsg_PrimaryStatusRequest); + pq_sendint64(request_message, GetCurrentTimestamp()); + + elog(DEBUG2, "sending publisher status request message"); + + /* Send a request for the publisher status */ + walrcv_send(LogRepWorkerWalRcvConn, + request_message->data, request_message->len); + + rdt_data->phase = RDT_WAIT_FOR_PUBLISHER_STATUS; + + /* + * Skip calling maybe_advance_nonremovable_xid() since further transition + * is possible only once we receive the publisher status message. + */ +} + +/* + * Workhorse for the RDT_WAIT_FOR_PUBLISHER_STATUS phase. + */ +static void +wait_for_publisher_status(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + /* + * Return if we have requested but not yet received the publisher status. + */ + if (!status_received) + return; + + /* + * We don't need to maintain oldest_nonremovable_xid if we decide to stop + * retaining conflict information for this worker. + */ + if (should_stop_conflict_info_retention(rdt_data)) + { + rdt_data->phase = RDT_STOP_CONFLICT_INFO_RETENTION; + return; + } + + if (!FullTransactionIdIsValid(rdt_data->remote_wait_for)) + rdt_data->remote_wait_for = rdt_data->remote_nextxid; + + /* + * Check if all remote concurrent transactions that were active at the + * first status request have now completed. If completed, proceed to the + * next phase; otherwise, continue checking the publisher status until + * these transactions finish. + * + * It's possible that transactions in the commit phase during the last + * cycle have now finished committing, but remote_oldestxid remains older + * than remote_wait_for. This can happen if some old transaction came in + * the commit phase when we requested status in this cycle. We do not + * handle this case explicitly as it's rare and the benefit doesn't + * justify the required complexity. Tracking would require either caching + * all xids at the publisher or sending them to subscribers. The condition + * will resolve naturally once the remaining transactions are finished. + * + * Directly advancing the non-removable transaction ID is possible if + * there are no activities on the publisher since the last advancement + * cycle. However, it requires maintaining two fields, last_remote_nextxid + * and last_remote_lsn, within the structure for comparison with the + * current cycle's values. Considering the minimal cost of continuing in + * RDT_WAIT_FOR_LOCAL_FLUSH without awaiting changes, we opted not to + * advance the transaction ID here. + */ + if (FullTransactionIdPrecedesOrEquals(rdt_data->remote_wait_for, + rdt_data->remote_oldestxid)) + rdt_data->phase = RDT_WAIT_FOR_LOCAL_FLUSH; + else + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_WAIT_FOR_LOCAL_FLUSH phase. + */ +static void +wait_for_local_flush(RetainDeadTuplesData *rdt_data) +{ + Assert(XLogRecPtrIsValid(rdt_data->remote_lsn) && + TransactionIdIsValid(rdt_data->candidate_xid)); + + /* + * We expect the publisher and subscriber clocks to be in sync using time + * sync service like NTP. Otherwise, we will advance this worker's + * oldest_nonremovable_xid prematurely, leading to the removal of rows + * required to detect update_deleted reliably. 
This check primarily + * addresses scenarios where the publisher's clock falls behind; if the + * publisher's clock is ahead, subsequent transactions will naturally bear + * later commit timestamps, conforming to the design outlined atop + * worker.c. + * + * XXX Consider waiting for the publisher's clock to catch up with the + * subscriber's before proceeding to the next phase. + */ + if (TimestampDifferenceExceeds(rdt_data->reply_time, + rdt_data->candidate_xid_time, 0)) + ereport(ERROR, + errmsg_internal("oldest_nonremovable_xid transaction ID could be advanced prematurely"), + errdetail_internal("The clock on the publisher is behind that of the subscriber.")); + + /* + * Do not attempt to advance the non-removable transaction ID when table + * sync is in progress. During this time, changes from a single + * transaction may be applied by multiple table sync workers corresponding + * to the target tables. So, it's necessary for all table sync workers to + * apply and flush the corresponding changes before advancing the + * transaction ID, otherwise, dead tuples that are still needed for + * conflict detection in table sync workers could be removed prematurely. + * However, confirming the apply and flush progress across all table sync + * workers is complex and not worth the effort, so we simply return if not + * all tables are in the READY state. + * + * Advancing the transaction ID is necessary even when no tables are + * currently subscribed, to avoid retaining dead tuples unnecessarily. + * While it might seem safe to skip all phases and directly assign + * candidate_xid to oldest_nonremovable_xid during the + * RDT_GET_CANDIDATE_XID phase in such cases, this is unsafe. If users + * concurrently add tables to the subscription, the apply worker may not + * process invalidations in time. Consequently, + * HasSubscriptionTablesCached() might miss the new tables, leading to + * premature advancement of oldest_nonremovable_xid. + * + * Performing the check during RDT_WAIT_FOR_LOCAL_FLUSH is safe, as + * invalidations are guaranteed to be processed before applying changes + * from newly added tables while waiting for the local flush to reach + * remote_lsn. + * + * Additionally, even if we check for subscription tables during + * RDT_GET_CANDIDATE_XID, they might be dropped before reaching + * RDT_WAIT_FOR_LOCAL_FLUSH. Therefore, it's still necessary to verify + * subscription tables at this stage to prevent unnecessary tuple + * retention. + */ + if (HasSubscriptionTablesCached() && !AllTablesyncsReady()) + { + TimestampTz now; + + now = rdt_data->last_recv_time + ? rdt_data->last_recv_time : GetCurrentTimestamp(); + + /* + * Record the time spent waiting for table sync, it is needed for the + * timeout check in should_stop_conflict_info_retention(). + */ + rdt_data->table_sync_wait_time = + TimestampDifferenceMilliseconds(rdt_data->candidate_xid_time, now); + + return; + } + + /* + * We don't need to maintain oldest_nonremovable_xid if we decide to stop + * retaining conflict information for this worker. + */ + if (should_stop_conflict_info_retention(rdt_data)) + { + rdt_data->phase = RDT_STOP_CONFLICT_INFO_RETENTION; + return; + } + + /* + * Update and check the remote flush position if we are applying changes + * in a loop. This is done at most once per WalWriterDelay to avoid + * performing costly operations in get_flush_position() too frequently + * during change application. 
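+ *
+ * (With the default WalWriterDelay of 200ms, for instance, this means
+ * get_flush_position() is consulted at most five times per second while
+ * data is steadily being received.)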
+ */ + if (last_flushpos < rdt_data->remote_lsn && rdt_data->last_recv_time && + TimestampDifferenceExceeds(rdt_data->flushpos_update_time, + rdt_data->last_recv_time, WalWriterDelay)) + { + XLogRecPtr writepos; + XLogRecPtr flushpos; + bool have_pending_txes; + + /* Fetch the latest remote flush position */ + get_flush_position(&writepos, &flushpos, &have_pending_txes); + + if (flushpos > last_flushpos) + last_flushpos = flushpos; + + rdt_data->flushpos_update_time = rdt_data->last_recv_time; + } + + /* Return to wait for the changes to be applied */ + if (last_flushpos < rdt_data->remote_lsn) + return; + + /* + * Reaching this point implies should_stop_conflict_info_retention() + * returned false earlier, meaning that the most recent duration for + * advancing the non-removable transaction ID is within the + * max_retention_duration or max_retention_duration is set to 0. + * + * Therefore, if conflict info retention was previously stopped due to a + * timeout, it is now safe to resume retention. + */ + if (!MySubscription->retentionactive) + { + rdt_data->phase = RDT_RESUME_CONFLICT_INFO_RETENTION; + return; + } + + /* + * Reaching here means the remote WAL position has been received, and all + * transactions up to that position on the publisher have been applied and + * flushed locally. So, we can advance the non-removable transaction ID. + */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->oldest_nonremovable_xid = rdt_data->candidate_xid; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + elog(DEBUG2, "confirmed flush up to remote lsn %X/%08X: new oldest_nonremovable_xid %u", + LSN_FORMAT_ARGS(rdt_data->remote_lsn), + rdt_data->candidate_xid); + + /* Notify launcher to update the xmin of the conflict slot */ + ApplyLauncherWakeup(); + + reset_retention_data_fields(rdt_data); + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Check whether conflict information retention should be stopped due to + * exceeding the maximum wait time (max_retention_duration). + * + * If retention should be stopped, return true. Otherwise, return false. + */ +static bool +should_stop_conflict_info_retention(RetainDeadTuplesData *rdt_data) +{ + TimestampTz now; + + Assert(TransactionIdIsValid(rdt_data->candidate_xid)); + Assert(rdt_data->phase == RDT_WAIT_FOR_PUBLISHER_STATUS || + rdt_data->phase == RDT_WAIT_FOR_LOCAL_FLUSH); + + if (!MySubscription->maxretention) + return false; + + /* + * Use last_recv_time when applying changes in the loop to avoid + * unnecessary system time retrieval. If last_recv_time is not available, + * obtain the current timestamp. + */ + now = rdt_data->last_recv_time ? rdt_data->last_recv_time : GetCurrentTimestamp(); + + /* + * Return early if the wait time has not exceeded the configured maximum + * (max_retention_duration). Time spent waiting for table synchronization + * is excluded from this calculation, as it occurs infrequently. + */ + if (!TimestampDifferenceExceeds(rdt_data->candidate_xid_time, now, + MySubscription->maxretention + + rdt_data->table_sync_wait_time)) + return false; + + return true; +} + +/* + * Workhorse for the RDT_STOP_CONFLICT_INFO_RETENTION phase. + */ +static void +stop_conflict_info_retention(RetainDeadTuplesData *rdt_data) +{ + /* Stop retention if not yet */ + if (MySubscription->retentionactive) + { + /* + * If the retention status cannot be updated (e.g., due to active + * transaction), skip further processing to avoid inconsistent + * retention behavior. 
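+ * (update_retention_status() declines to act while a transaction is
+ * already in progress, in which case this phase simply retries on the
+ * next call.)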
+		 */
+		if (!update_retention_status(false))
+			return;
+
+		SpinLockAcquire(&MyLogicalRepWorker->relmutex);
+		MyLogicalRepWorker->oldest_nonremovable_xid = InvalidTransactionId;
+		SpinLockRelease(&MyLogicalRepWorker->relmutex);
+
+		ereport(LOG,
+				errmsg("logical replication worker for subscription \"%s\" has stopped retaining the information for detecting conflicts",
+					   MySubscription->name),
+				errdetail("Retention is stopped because the apply process has not caught up with the publisher within the configured max_retention_duration."));
+	}
+
+	Assert(!TransactionIdIsValid(MyLogicalRepWorker->oldest_nonremovable_xid));
+
+	/*
+	 * If retention has been stopped, reset to the initial phase to retry
+	 * resuming retention. This reset is required to recalculate the current
+	 * wait time and resume retention if the time falls within
+	 * max_retention_duration.
+	 */
+	reset_retention_data_fields(rdt_data);
+}
+
+/*
+ * Workhorse for the RDT_RESUME_CONFLICT_INFO_RETENTION phase.
+ */
+static void
+resume_conflict_info_retention(RetainDeadTuplesData *rdt_data)
+{
+	/* We can't resume retention without updating retention status. */
+	if (!update_retention_status(true))
+		return;
+
+	ereport(LOG,
+			errmsg("logical replication worker for subscription \"%s\" will resume retaining the information for detecting conflicts",
+				   MySubscription->name),
+			MySubscription->maxretention
+			? errdetail("Retention is re-enabled because the apply process has caught up with the publisher within the configured max_retention_duration.")
+			: errdetail("Retention is re-enabled because max_retention_duration has been set to unlimited."));
+
+	/*
+	 * Restart the worker to let the launcher initialize
+	 * oldest_nonremovable_xid at startup.
+	 *
+	 * While it's technically possible to derive this value on-the-fly using
+	 * the conflict detection slot's xmin, doing so risks a race condition:
+	 * the launcher might clean slot.xmin just after retention resumes. This
+	 * would make oldest_nonremovable_xid unreliable, especially during xid
+	 * wraparound.
+	 *
+	 * Although this could be prevented by introducing heavyweight locking,
+	 * the complexity it would bring doesn't seem worthwhile given how rarely
+	 * retention is resumed.
+	 */
+	apply_worker_exit();
+}
+
+/*
+ * Updates pg_subscription.subretentionactive to the given value within a
+ * new transaction.
+ *
+ * If already inside an active transaction, skips the update and returns
+ * false.
+ *
+ * Returns true if the update is successfully performed.
+ */
+static bool
+update_retention_status(bool active)
+{
+	/*
+	 * Do not update the catalog during an active transaction. The transaction
+	 * may have been started during change application, leading to a possible
+	 * rollback of the catalog update if the application subsequently fails.
+	 */
+	if (IsTransactionState())
+		return false;
+
+	StartTransactionCommand();
+
+	/*
+	 * Updating pg_subscription might involve TOAST table access, so ensure we
+	 * have a valid snapshot.
+	 */
+	PushActiveSnapshot(GetTransactionSnapshot());
+
+	/* Update pg_subscription.subretentionactive */
+	UpdateDeadTupleRetentionStatus(MySubscription->oid, active);
+
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+
+	/* Notify launcher to update the conflict slot */
+	ApplyLauncherWakeup();
+
+	MySubscription->retentionactive = active;
+
+	return true;
+}
+
+/*
+ * Reset all data fields of RetainDeadTuplesData except those used to
+ * determine the timing for the next round of transaction ID advancement. We
+ * can even use flushpos_update_time in the next round to decide whether to get
+ * the latest flush position.
+ */
+static void
+reset_retention_data_fields(RetainDeadTuplesData *rdt_data)
+{
+	rdt_data->phase = RDT_GET_CANDIDATE_XID;
+	rdt_data->remote_lsn = InvalidXLogRecPtr;
+	rdt_data->remote_oldestxid = InvalidFullTransactionId;
+	rdt_data->remote_nextxid = InvalidFullTransactionId;
+	rdt_data->reply_time = 0;
+	rdt_data->remote_wait_for = InvalidFullTransactionId;
+	rdt_data->candidate_xid = InvalidTransactionId;
+	rdt_data->table_sync_wait_time = 0;
+}
+
+/*
+ * Adjust the interval for advancing non-removable transaction IDs.
+ *
+ * If there is no activity on the node or retention has been stopped, we
+ * progressively double the interval used to advance the non-removable
+ * transaction ID. This helps conserve CPU and network resources when there's
+ * little benefit to frequent updates.
+ *
+ * The interval is capped by the lowest of the following:
+ * - wal_receiver_status_interval (if set and retention is active),
+ * - a default maximum of 3 minutes,
+ * - max_retention_duration (if retention is active).
+ *
+ * This ensures the interval never exceeds the retention boundary, even if
+ * other limits are higher. Once activity resumes on the node and retention is
+ * active, the interval is reset to the lesser of 100ms and
+ * max_retention_duration, allowing timely advancement of the non-removable
+ * transaction ID.
+ *
+ * XXX The use of wal_receiver_status_interval is a bit arbitrary, so we can
+ * consider another interval or a separate GUC if the need arises.
+ */
+static void
+adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data, bool new_xid_found)
+{
+	if (rdt_data->xid_advance_interval && !new_xid_found)
+	{
+		int			max_interval = wal_receiver_status_interval
+			? wal_receiver_status_interval * 1000
+			: MAX_XID_ADVANCE_INTERVAL;
+
+		/*
+		 * No new transaction ID has been assigned since the last check, so
+		 * double the interval, but not beyond the maximum allowable value.
+		 */
+		rdt_data->xid_advance_interval = Min(rdt_data->xid_advance_interval * 2,
+											 max_interval);
+	}
+	else if (rdt_data->xid_advance_interval &&
+			 !MySubscription->retentionactive)
+	{
+		/*
+		 * Retention has been stopped, so double the interval, capped at a
+		 * maximum of 3 minutes. The wal_receiver_status_interval is
+		 * intentionally not used as an upper bound, since the likelihood of
+		 * retention resuming is lower than that of general activity resuming.
+		 */
+		rdt_data->xid_advance_interval = Min(rdt_data->xid_advance_interval * 2,
+											 MAX_XID_ADVANCE_INTERVAL);
+	}
+	else
+	{
+		/*
+		 * A new transaction ID was found or the interval is not yet
+		 * initialized, so set the interval to the minimum value.
+		 */
+		rdt_data->xid_advance_interval = MIN_XID_ADVANCE_INTERVAL;
+	}
+
+	/*
+	 * Ensure the wait time remains within the maximum retention time limit
+	 * when retention is active.
+	 */
+	if (MySubscription->retentionactive)
+		rdt_data->xid_advance_interval = Min(rdt_data->xid_advance_interval,
+											 MySubscription->maxretention);
+}
+
 /*
  * Exit routine for apply workers due to subscription parameter changes.
  */
@@ -4185,7 +5255,7 @@ subxact_info_read(Oid subid, TransactionId xid)
 	len = sizeof(SubXactInfo) * subxact_data.nsubxacts;
 
 	/* we keep the maximum as a power of 2 */
-	subxact_data.nsubxacts_max = 1 << my_log2(subxact_data.nsubxacts);
+	subxact_data.nsubxacts_max = 1 << pg_ceil_log2_32(subxact_data.nsubxacts);
 
 	/*
 	 * Allocate subxact information in the logical streaming context.
We need @@ -4491,7 +5561,7 @@ set_stream_options(WalRcvStreamOptions *options, * Cleanup the memory for subxacts and reset the related variables. */ static inline void -cleanup_subxact_info() +cleanup_subxact_info(void) { if (subxact_data.subxacts) pfree(subxact_data.subxacts); @@ -4536,7 +5606,8 @@ start_apply(XLogRecPtr origin_startpos) * idle state. */ AbortOutOfAnyTransaction(); - pgstat_report_subscription_error(MySubscription->oid, !am_tablesync_worker()); + pgstat_report_subscription_error(MySubscription->oid, + MyLogicalRepWorker->type); PG_RE_THROW(); } @@ -4550,7 +5621,7 @@ start_apply(XLogRecPtr origin_startpos) * It sets up replication origin, streaming options and then starts streaming. */ static void -run_apply_worker() +run_apply_worker(void) { char originname[NAMEDATALEN]; XLogRecPtr origin_startpos = InvalidXLogRecPtr; @@ -4626,8 +5697,16 @@ run_apply_worker() walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); StartTransactionCommand(); + + /* + * Updating pg_subscription might involve TOAST table access, so + * ensure we have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + UpdateTwoPhaseState(MySubscription->oid, LOGICALREP_TWOPHASE_STATE_ENABLED); MySubscription->twophasestate = LOGICALREP_TWOPHASE_STATE_ENABLED; + PopActiveSnapshot(); CommitTransactionCommand(); } else @@ -4648,8 +5727,8 @@ run_apply_worker() } /* - * Common initialization for leader apply worker, parallel apply worker and - * tablesync worker. + * Common initialization for leader apply worker, parallel apply worker, + * tablesync worker and sequencesync worker. * * Initialize the database connection, in-memory subscription and necessary * config options. @@ -4681,6 +5760,13 @@ InitializeLogRepWorker(void) StartTransactionCommand(); oldctx = MemoryContextSwitchTo(ApplyContext); + /* + * Lock the subscription to prevent it from being concurrently dropped, + * then re-verify its existence. After the initialization, the worker will + * be terminated gracefully if the subscription is dropped. + */ + LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, 0, + AccessShareLock); MySubscription = GetSubscription(MyLogicalRepWorker->subid, true); if (!MySubscription) { @@ -4707,6 +5793,31 @@ InitializeLogRepWorker(void) apply_worker_exit(); } + /* + * Restart the worker if retain_dead_tuples was enabled during startup. + * + * At this point, the replication slot used for conflict detection might + * not exist yet, or could be dropped soon if the launcher perceives + * retain_dead_tuples as disabled. To avoid unnecessary tracking of + * oldest_nonremovable_xid when the slot is absent or at risk of being + * dropped, a restart is initiated. + * + * The oldest_nonremovable_xid should be initialized only when the + * subscription's retention is active before launching the worker. See + * logicalrep_worker_launch. 
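	 *
	 * [Editorial aside, not part of the patch] The snapshot-handling pattern
	 * added at several places in this patch, including around the
	 * UpdateTwoPhaseState() call above, has one shape; a sketch using the
	 * backend APIs named in the diff:
	 *
	 *    StartTransactionCommand();
	 *    // catalog updates may detoast, which requires an active snapshot
	 *    PushActiveSnapshot(GetTransactionSnapshot());
	 *    ... update the catalog row ...
	 *    PopActiveSnapshot();
	 *    CommitTransactionCommand();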
+ */ + if (am_leader_apply_worker() && + MySubscription->retaindeadtuples && + MySubscription->retentionactive && + !TransactionIdIsValid(MyLogicalRepWorker->oldest_nonremovable_xid)) + { + ereport(LOG, + errmsg("logical replication worker for subscription \"%s\" will restart because the option %s was enabled during startup", + MySubscription->name, "retain_dead_tuples")); + + apply_worker_exit(); + } + /* Setup synchronous commit according to the user's wishes */ SetConfigOption("synchronous_commit", MySubscription->synccommit, PGC_BACKEND, PGC_S_OVERRIDE); @@ -4725,15 +5836,36 @@ InitializeLogRepWorker(void) if (am_tablesync_worker()) ereport(LOG, - (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has started", - MySubscription->name, - get_rel_name(MyLogicalRepWorker->relid)))); + errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has started", + MySubscription->name, + get_rel_name(MyLogicalRepWorker->relid))); + else if (am_sequencesync_worker()) + ereport(LOG, + errmsg("logical replication sequence synchronization worker for subscription \"%s\" has started", + MySubscription->name)); else ereport(LOG, - (errmsg("logical replication apply worker for subscription \"%s\" has started", - MySubscription->name))); + errmsg("logical replication apply worker for subscription \"%s\" has started", + MySubscription->name)); CommitTransactionCommand(); + + /* + * Register a callback to reset the origin state before aborting any + * pending transaction during shutdown (see ShutdownPostgres()). This will + * avoid origin advancement for an incomplete transaction which could + * otherwise lead to its loss as such a transaction won't be sent by the + * server again. + * + * Note that even a LOG or DEBUG statement placed after setting the origin + * state may process a shutdown signal before committing the current apply + * operation. So, it is important to register such a callback here. + * + * Register this callback here to ensure that all types of logical + * replication workers that set up origins and apply remote transactions + * are protected. + */ + before_shmem_exit(replorigin_reset, (Datum) 0); } /* @@ -4747,14 +5879,16 @@ replorigin_reset(int code, Datum arg) replorigin_session_origin_timestamp = 0; } -/* Common function to setup the leader apply or tablesync worker. */ +/* + * Common function to setup the leader apply, tablesync and sequencesync worker. + */ void SetupApplyOrSyncWorker(int worker_slot) { /* Attach to slot */ logicalrep_worker_attach(worker_slot); - Assert(am_tablesync_worker() || am_leader_apply_worker()); + Assert(am_tablesync_worker() || am_sequencesync_worker() || am_leader_apply_worker()); /* Setup signal handling */ pqsignal(SIGHUP, SignalHandlerForConfigReload); @@ -4775,19 +5909,6 @@ SetupApplyOrSyncWorker(int worker_slot) InitializeLogRepWorker(); - /* - * Register a callback to reset the origin state before aborting any - * pending transaction during shutdown (see ShutdownPostgres()). This will - * avoid origin advancement for an in-complete transaction which could - * otherwise lead to its loss as such a transaction won't be sent by the - * server again. - * - * Note that even a LOG or DEBUG statement placed after setting the origin - * state may process a shutdown signal before committing the current apply - * operation. So, it is important to register such a callback here. 
- */ - before_shmem_exit(replorigin_reset, (Datum) 0); - /* Connect to the origin and start the replication. */ elog(DEBUG1, "connecting to publisher using connection string \"%s\"", MySubscription->conninfo); @@ -4797,7 +5918,7 @@ SetupApplyOrSyncWorker(int worker_slot) * the subscription relation state. */ CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP, - invalidate_syncing_table_states, + InvalidateSyncingRelStates, (Datum) 0); } @@ -4837,13 +5958,24 @@ DisableSubscriptionAndExit(void) RESUME_INTERRUPTS(); - /* Report the worker failed during either table synchronization or apply */ + /* + * Report the worker failed during sequence synchronization, table + * synchronization, or apply. + */ pgstat_report_subscription_error(MyLogicalRepWorker->subid, - !am_tablesync_worker()); + MyLogicalRepWorker->type); /* Disable the subscription */ StartTransactionCommand(); + + /* + * Updating pg_subscription might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + DisableSubscription(MySubscription->oid); + PopActiveSnapshot(); CommitTransactionCommand(); /* Ensure we remove no-longer-useful entry for worker's start time */ @@ -4855,6 +5987,15 @@ DisableSubscriptionAndExit(void) errmsg("subscription \"%s\" has been disabled because of an error", MySubscription->name)); + /* + * Skip the track_commit_timestamp check when disabling the worker due to + * an error, as verifying commit timestamps is unnecessary in this + * context. + */ + CheckSubDeadTupleRetention(false, true, WARNING, + MySubscription->retaindeadtuples, + MySubscription->retentionactive, false); + proc_exit(0); } @@ -4892,7 +6033,7 @@ maybe_start_skipping_changes(XLogRecPtr finish_lsn) * function is called for every remote transaction and we assume that * skipping the transaction is not used often. */ - if (likely(XLogRecPtrIsInvalid(MySubscription->skiplsn) || + if (likely(!XLogRecPtrIsValid(MySubscription->skiplsn) || MySubscription->skiplsn != finish_lsn)) return; @@ -4900,7 +6041,7 @@ maybe_start_skipping_changes(XLogRecPtr finish_lsn) skip_xact_finish_lsn = finish_lsn; ereport(LOG, - errmsg("logical replication starts skipping transaction at LSN %X/%X", + errmsg("logical replication starts skipping transaction at LSN %X/%08X", LSN_FORMAT_ARGS(skip_xact_finish_lsn))); } @@ -4914,8 +6055,8 @@ stop_skipping_changes(void) return; ereport(LOG, - (errmsg("logical replication completed skipping transaction at LSN %X/%X", - LSN_FORMAT_ARGS(skip_xact_finish_lsn)))); + errmsg("logical replication completed skipping transaction at LSN %X/%08X", + LSN_FORMAT_ARGS(skip_xact_finish_lsn))); /* Stop skipping changes */ skip_xact_finish_lsn = InvalidXLogRecPtr; @@ -4938,7 +6079,7 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) XLogRecPtr myskiplsn = MySubscription->skiplsn; bool started_tx = false; - if (likely(XLogRecPtrIsInvalid(myskiplsn)) || am_parallel_apply_worker()) + if (likely(!XLogRecPtrIsValid(myskiplsn)) || am_parallel_apply_worker()) return; if (!IsTransactionState()) @@ -4947,6 +6088,12 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) started_tx = true; } + /* + * Updating pg_subscription might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + /* * Protect subskiplsn of pg_subscription from being concurrently updated * while clearing it. 
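[Editorial illustration, not part of the patch] The pervasive change from
"%X/%X" to "%X/%08X" in the surrounding hunks zero-pads the low half of an
LSN, so the printed form has a fixed width and cannot lose leading zeros. A
standalone demonstration:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t	lsn = ((uint64_t) 0x1 << 32) | 0x2345;	/* hypothetical LSN */
	uint32_t	hi = (uint32_t) (lsn >> 32);
	uint32_t	lo = (uint32_t) lsn;

	printf("%X/%X\n", hi, lo);		/* prints 1/2345     (variable width) */
	printf("%X/%08X\n", hi, lo);	/* prints 1/00002345 (fixed width)    */
	return 0;
}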
@@ -4997,7 +6144,7 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) if (myskiplsn != finish_lsn) ereport(WARNING, errmsg("skip-LSN of subscription \"%s\" cleared", MySubscription->name), - errdetail("Remote transaction's finish WAL location (LSN) %X/%X did not match skip-LSN %X/%X.", + errdetail("Remote transaction's finish WAL location (LSN) %X/%08X did not match skip-LSN %X/%08X.", LSN_FORMAT_ARGS(finish_lsn), LSN_FORMAT_ARGS(myskiplsn))); } @@ -5005,6 +6152,8 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) heap_freetuple(tup); table_close(rel, NoLock); + PopActiveSnapshot(); + if (started_tx) CommitTransactionCommand(); } @@ -5026,13 +6175,13 @@ apply_error_callback(void *arg) errcontext("processing remote data for replication origin \"%s\" during message type \"%s\"", errarg->origin_name, logicalrep_message_type(errarg->command)); - else if (XLogRecPtrIsInvalid(errarg->finish_lsn)) + else if (!XLogRecPtrIsValid(errarg->finish_lsn)) errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->remote_xid); else - errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u, finished at %X/%X", + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u, finished at %X/%08X", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->remote_xid, @@ -5042,7 +6191,7 @@ apply_error_callback(void *arg) { if (errarg->remote_attnum < 0) { - if (XLogRecPtrIsInvalid(errarg->finish_lsn)) + if (!XLogRecPtrIsValid(errarg->finish_lsn)) errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u", errarg->origin_name, logicalrep_message_type(errarg->command), @@ -5050,7 +6199,7 @@ apply_error_callback(void *arg) errarg->rel->remoterel.relname, errarg->remote_xid); else - errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u, finished at %X/%X", + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u, finished at %X/%08X", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->rel->remoterel.nspname, @@ -5060,7 +6209,7 @@ apply_error_callback(void *arg) } else { - if (XLogRecPtrIsInvalid(errarg->finish_lsn)) + if (!XLogRecPtrIsValid(errarg->finish_lsn)) errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u", errarg->origin_name, logicalrep_message_type(errarg->command), @@ -5069,7 +6218,7 @@ apply_error_callback(void *arg) errarg->rel->remoterel.attnames[errarg->remote_attnum], errarg->remote_xid); else - errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u, finished at %X/%X", + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u, finished at %X/%08X", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->rel->remoterel.nspname, diff --git a/src/backend/replication/pgoutput/pgoutput.c 
b/src/backend/replication/pgoutput/pgoutput.c index 693a766e6d75f..787998abb8a2f 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -235,6 +235,7 @@ static bool get_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid); static void init_tuple_slot(PGOutputData *data, Relation relation, RelationSyncEntry *entry); +static void pgoutput_memory_context_reset(void *arg); /* row filter routines */ static EState *create_estate_for_relation(Relation rel); @@ -297,10 +298,12 @@ parse_output_parameters(List *options, PGOutputData *data) bool two_phase_option_given = false; bool origin_option_given = false; + /* Initialize optional parameters to defaults */ data->binary = false; data->streaming = LOGICALREP_STREAM_OFF; data->messages = false; data->two_phase = false; + data->publish_no_origin = false; foreach(lc, options) { @@ -424,6 +427,19 @@ parse_output_parameters(List *options, PGOutputData *data) errmsg("option \"%s\" missing", "publication_names")); } +/* + * Memory context reset callback of PGOutputData->context. + */ +static void +pgoutput_memory_context_reset(void *arg) +{ + if (RelationSyncCache) + { + hash_destroy(RelationSyncCache); + RelationSyncCache = NULL; + } +} + /* * Initialize this plugin */ @@ -431,8 +447,9 @@ static void pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init) { - PGOutputData *data = palloc0(sizeof(PGOutputData)); + PGOutputData *data = palloc0_object(PGOutputData); static bool publication_callback_registered = false; + MemoryContextCallback *mcallback; /* Create our memory context for private allocations. */ data->context = AllocSetContextCreate(ctx->context, @@ -447,6 +464,14 @@ pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, "logical replication publication list context", ALLOCSET_SMALL_SIZES); + /* + * Ensure to cleanup RelationSyncCache even when logical decoding invoked + * via SQL interface ends up with an error. + */ + mcallback = palloc0_object(MemoryContextCallback); + mcallback->func = pgoutput_memory_context_reset; + MemoryContextRegisterResetCallback(ctx->context, mcallback); + ctx->output_plugin_private = data; /* This plugin uses binary protocol. */ @@ -1044,7 +1069,7 @@ check_and_init_gencol(PGOutputData *data, List *publications, /* Check if there is any generated column present. */ for (int i = 0; i < desc->natts; i++) { - Form_pg_attribute att = TupleDescAttr(desc, i); + CompactAttribute *att = TupleDescCompactAttr(desc, i); if (att->attgenerated) { @@ -1112,9 +1137,9 @@ pgoutput_column_list_init(PGOutputData *data, List *publications, * * Note that we don't support the case where the column list is different * for the same table when combining publications. See comments atop - * fetch_table_list. But one can later change the publication so we still - * need to check all the given publication-table mappings and report an - * error if any publications have a different column list. + * fetch_relation_list. But one can later change the publication so we + * still need to check all the given publication-table mappings and report + * an error if any publications have a different column list. */ foreach(lc, publications) { @@ -1372,8 +1397,8 @@ pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot, * VARTAG_INDIRECT. See ReorderBufferToastReplace. 
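	 *
	 * [Editorial aside, not part of the patch] The change below only makes
	 * the pointer conversion explicit: VARATT_IS_EXTERNAL_ONDISK() inspects a
	 * varlena header, so the Datum must first be turned into a pointer, e.g.:
	 *
	 *    Datum    d = new_slot->tts_values[i];
	 *    struct varlena *p = (struct varlena *) DatumGetPointer(d);
	 *    // p can now be tested for an on-disk TOAST pointer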
*/ if (att->attlen == -1 && - VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) && - !VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i])) + VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(new_slot->tts_values[i])) && + !VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(old_slot->tts_values[i]))) { if (!tmp_new_slot) { @@ -1758,11 +1783,7 @@ pgoutput_origin_filter(LogicalDecodingContext *ctx, static void pgoutput_shutdown(LogicalDecodingContext *ctx) { - if (RelationSyncCache) - { - hash_destroy(RelationSyncCache); - RelationSyncCache = NULL; - } + pgoutput_memory_context_reset(NULL); } /* @@ -1789,7 +1810,7 @@ LoadPublications(List *pubnames) else ereport(WARNING, errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("skipped loading publication: %s", pubname), + errmsg("skipped loading publication \"%s\"", pubname), errdetail("The publication does not exist at this point in the WAL."), errhint("Create the publication if it does not exist.")); } @@ -1886,7 +1907,7 @@ pgoutput_stream_abort(struct LogicalDecodingContext *ctx, OutputPluginPrepareWrite(ctx, true); logicalrep_write_stream_abort(ctx->out, toptxn->xid, txn->xid, abort_lsn, - txn->xact_time.abort_time, write_abort_info); + txn->abort_time, write_abort_info); OutputPluginWrite(ctx, true); diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index 7440aae5a1a7e..8a649199ec69c 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -279,7 +279,7 @@ alter_replication_slot: ; /* - * START_REPLICATION [SLOT slot] [PHYSICAL] %X/%X [TIMELINE %u] + * START_REPLICATION [SLOT slot] [PHYSICAL] %X/%08X [TIMELINE %u] */ start_replication: K_START_REPLICATION opt_slot opt_physical RECPTR opt_timeline @@ -295,7 +295,7 @@ start_replication: } ; -/* START_REPLICATION SLOT slot LOGICAL %X/%X options */ +/* START_REPLICATION SLOT slot LOGICAL %X/%08X options */ start_logical_replication: K_START_REPLICATION K_SLOT IDENT K_LOGICAL RECPTR plugin_options { diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index 014ea8d25c6b7..b6930e2865953 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -155,7 +155,7 @@ UPLOAD_MANIFEST { return K_UPLOAD_MANIFEST; } {hexdigit}+\/{hexdigit}+ { uint32 hi, lo; - if (sscanf(yytext, "%X/%X", &hi, &lo) != 2) + if (sscanf(yytext, "%X/%08X", &hi, &lo) != 2) replication_yyerror(NULL, yyscanner, "invalid streaming start location"); yylval->recptr = ((uint64) hi) << 32 | lo; return RECPTR; diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 600b87fa9cb65..58c41d4551678 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -47,6 +47,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/interrupt.h" +#include "replication/logicallauncher.h" #include "replication/slotsync.h" #include "replication/slot.h" #include "replication/walsender_private.h" @@ -154,7 +155,7 @@ int max_replication_slots = 10; /* the maximum number of replication * Invalidate replication slots that have remained idle longer than this * duration; '0' disables it. 
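 *
 * [Editorial illustration, not part of the patch] With the unit now seconds
 * rather than minutes, and assuming the GUC accepts the usual time-unit
 * suffixes, a configuration such as
 *
 *    idle_replication_slot_timeout = '1h'    # postgresql.conf
 *
 * invalidates slots idle for more than an hour, while '0' keeps them forever.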
*/ -int idle_replication_slot_timeout_mins = 0; +int idle_replication_slot_timeout_secs = 0; /* * This GUC lists streaming replication standby server slot names that @@ -172,6 +173,7 @@ static SyncStandbySlotsConfigData *synchronized_standby_slots_config; static XLogRecPtr ss_oldest_flush_lsn = InvalidXLogRecPtr; static void ReplicationSlotShmemExit(int code, Datum arg); +static bool IsSlotForConflictCheck(const char *name); static void ReplicationSlotDropPtr(ReplicationSlot *slot); /* internal persistency functions */ @@ -258,31 +260,72 @@ ReplicationSlotShmemExit(int code, Datum arg) /* * Check whether the passed slot name is valid and report errors at elevel. * + * See comments for ReplicationSlotValidateNameInternal(). + */ +bool +ReplicationSlotValidateName(const char *name, bool allow_reserved_name, + int elevel) +{ + int err_code; + char *err_msg = NULL; + char *err_hint = NULL; + + if (!ReplicationSlotValidateNameInternal(name, allow_reserved_name, + &err_code, &err_msg, &err_hint)) + { + /* + * Use errmsg_internal() and errhint_internal() instead of errmsg() + * and errhint(), since the messages from + * ReplicationSlotValidateNameInternal() are already translated. This + * avoids double translation. + */ + ereport(elevel, + errcode(err_code), + errmsg_internal("%s", err_msg), + (err_hint != NULL) ? errhint_internal("%s", err_hint) : 0); + + pfree(err_msg); + if (err_hint != NULL) + pfree(err_hint); + return false; + } + + return true; +} + +/* + * Check whether the passed slot name is valid. + * + * An error will be reported for a reserved replication slot name if + * allow_reserved_name is set to false. + * * Slot names may consist out of [a-z0-9_]{1,NAMEDATALEN-1} which should allow * the name to be used as a directory name on every supported OS. * - * Returns whether the directory name is valid or not if elevel < ERROR. + * Returns true if the slot name is valid. Otherwise, returns false and stores + * the error code, error message, and optional hint in err_code, err_msg, and + * err_hint, respectively. The caller is responsible for freeing err_msg and + * err_hint, which are palloc'd. 
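 *
 * [Editorial sketch, not part of the patch] A caller that only wants a
 * yes/no answer, without raising an error, might use it like this:
 *
 *    int   code;
 *    char *msg = NULL;
 *    char *hint = NULL;
 *
 *    if (!ReplicationSlotValidateNameInternal(name, false,
 *                                             &code, &msg, &hint))
 *    {
 *        // consume msg/hint, then free the palloc'd strings
 *        pfree(msg);
 *        if (hint)
 *            pfree(hint);
 *    }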
*/ bool -ReplicationSlotValidateName(const char *name, int elevel) +ReplicationSlotValidateNameInternal(const char *name, bool allow_reserved_name, + int *err_code, char **err_msg, char **err_hint) { const char *cp; if (strlen(name) == 0) { - ereport(elevel, - (errcode(ERRCODE_INVALID_NAME), - errmsg("replication slot name \"%s\" is too short", - name))); + *err_code = ERRCODE_INVALID_NAME; + *err_msg = psprintf(_("replication slot name \"%s\" is too short"), name); + *err_hint = NULL; return false; } if (strlen(name) >= NAMEDATALEN) { - ereport(elevel, - (errcode(ERRCODE_NAME_TOO_LONG), - errmsg("replication slot name \"%s\" is too long", - name))); + *err_code = ERRCODE_NAME_TOO_LONG; + *err_msg = psprintf(_("replication slot name \"%s\" is too long"), name); + *err_hint = NULL; return false; } @@ -292,17 +335,34 @@ ReplicationSlotValidateName(const char *name, int elevel) || (*cp >= '0' && *cp <= '9') || (*cp == '_'))) { - ereport(elevel, - (errcode(ERRCODE_INVALID_NAME), - errmsg("replication slot name \"%s\" contains invalid character", - name), - errhint("Replication slot names may only contain lower case letters, numbers, and the underscore character."))); + *err_code = ERRCODE_INVALID_NAME; + *err_msg = psprintf(_("replication slot name \"%s\" contains invalid character"), name); + *err_hint = psprintf(_("Replication slot names may only contain lower case letters, numbers, and the underscore character.")); return false; } } + + if (!allow_reserved_name && IsSlotForConflictCheck(name)) + { + *err_code = ERRCODE_RESERVED_NAME; + *err_msg = psprintf(_("replication slot name \"%s\" is reserved"), name); + *err_hint = psprintf(_("The name \"%s\" is reserved for the conflict detection slot."), + CONFLICT_DETECTION_SLOT); + return false; + } + return true; } +/* + * Return true if the replication slot name is "pg_conflict_detection". + */ +static bool +IsSlotForConflictCheck(const char *name) +{ + return (strcmp(name, CONFLICT_DETECTION_SLOT) == 0); +} + /* * Create a new replication slot and mark it as used by this backend. * @@ -330,7 +390,12 @@ ReplicationSlotCreate(const char *name, bool db_specific, Assert(MyReplicationSlot == NULL); - ReplicationSlotValidateName(name, ERROR); + /* + * The logical launcher or pg_upgrade may create or migrate an internal + * slot, so using a reserved name is allowed in these cases. + */ + ReplicationSlotValidateName(name, IsBinaryUpgrade || IsLogicalLauncher(), + ERROR); if (failover) { @@ -424,7 +489,9 @@ ReplicationSlotCreate(const char *name, bool db_specific, slot->candidate_restart_valid = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->last_saved_confirmed_flush = InvalidXLogRecPtr; + slot->last_saved_restart_lsn = InvalidXLogRecPtr; slot->inactive_since = 0; + slot->slotsync_skip_reason = SS_SKIP_NONE; /* * Create the slot on disk. We haven't actually marked the slot allocated @@ -580,6 +647,17 @@ ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid) name))); } + /* + * Do not allow users to acquire the reserved slot. This scenario may + * occur if the launcher that owns the slot has terminated unexpectedly + * due to an error, and a backend process attempts to reuse the slot. 
+ */ + if (!IsLogicalLauncher() && IsSlotForConflictCheck(name)) + ereport(ERROR, + errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("cannot acquire replication slot \"%s\"", name), + errdetail("The slot is reserved for conflict detection and can only be acquired by logical replication launcher.")); + /* * This is the slot we want; check if it's active under some other * process. In single user mode, we don't need this check. @@ -608,7 +686,7 @@ ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid) } else { - active_pid = MyProcPid; + s->active_pid = active_pid = MyProcPid; ReplicationSlotSetInactiveSince(s, 0, true); } LWLockRelease(ReplicationSlotControlLock); @@ -687,16 +765,15 @@ ReplicationSlotRelease(void) { ReplicationSlot *slot = MyReplicationSlot; char *slotname = NULL; /* keep compiler quiet */ - bool is_logical = false; /* keep compiler quiet */ + bool is_logical; TimestampTz now = 0; Assert(slot != NULL && slot->active_pid != 0); + is_logical = SlotIsLogical(slot); + if (am_walsender) - { slotname = pstrdup(NameStr(slot->data.name)); - is_logical = SlotIsLogical(slot); - } if (slot->data.persistency == RS_EPHEMERAL) { @@ -706,6 +783,14 @@ ReplicationSlotRelease(void) * data. */ ReplicationSlotDropAcquired(); + + /* + * Request to disable logical decoding, even though this slot may not + * have been the last logical slot. The checkpointer will verify if + * logical decoding should actually be disabled. + */ + if (is_logical) + RequestDisableLogicalDecoding(); } /* @@ -770,15 +855,21 @@ ReplicationSlotRelease(void) * * Cleanup only synced temporary slots if 'synced_only' is true, else * cleanup all temporary slots. + * + * If it drops the last logical slot in the cluster, requests to disable + * logical decoding. */ void ReplicationSlotCleanup(bool synced_only) { int i; + bool found_valid_logicalslot; + bool dropped_logical = false; Assert(MyReplicationSlot == NULL); restart: + found_valid_logicalslot = false; LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { @@ -788,6 +879,10 @@ ReplicationSlotCleanup(bool synced_only) continue; SpinLockAcquire(&s->mutex); + + found_valid_logicalslot |= + (SlotIsLogical(s) && s->data.invalidated == RS_INVAL_NONE); + if ((s->active_pid == MyProcPid && (!synced_only || s->data.synced))) { @@ -795,6 +890,9 @@ ReplicationSlotCleanup(bool synced_only) SpinLockRelease(&s->mutex); LWLockRelease(ReplicationSlotControlLock); /* avoid deadlock */ + if (SlotIsLogical(s)) + dropped_logical = true; + ReplicationSlotDropPtr(s); ConditionVariableBroadcast(&s->active_cv); @@ -805,6 +903,9 @@ ReplicationSlotCleanup(bool synced_only) } LWLockRelease(ReplicationSlotControlLock); + + if (dropped_logical && !found_valid_logicalslot) + RequestDisableLogicalDecoding(); } /* @@ -813,6 +914,8 @@ ReplicationSlotCleanup(bool synced_only) void ReplicationSlotDrop(const char *name, bool nowait) { + bool is_logical; + Assert(MyReplicationSlot == NULL); ReplicationSlotAcquire(name, nowait, false); @@ -827,7 +930,12 @@ ReplicationSlotDrop(const char *name, bool nowait) errmsg("cannot drop replication slot \"%s\"", name), errdetail("This replication slot is being synchronized from the primary server.")); + is_logical = SlotIsLogical(MyReplicationSlot); + ReplicationSlotDropAcquired(); + + if (is_logical) + RequestDisableLogicalDecoding(); } /* @@ -1165,22 +1273,43 @@ ReplicationSlotsComputeRequiredLSN(void) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; XLogRecPtr restart_lsn; + XLogRecPtr 
last_saved_restart_lsn;
 		bool		invalidated;
+		ReplicationSlotPersistency persistency;
 
 		if (!s->in_use)
 			continue;
 
 		SpinLockAcquire(&s->mutex);
+		persistency = s->data.persistency;
 		restart_lsn = s->data.restart_lsn;
 		invalidated = s->data.invalidated != RS_INVAL_NONE;
+		last_saved_restart_lsn = s->last_saved_restart_lsn;
 		SpinLockRelease(&s->mutex);
 
 		/* invalidated slots need not apply */
 		if (invalidated)
 			continue;
 
-		if (restart_lsn != InvalidXLogRecPtr &&
-			(min_required == InvalidXLogRecPtr ||
+		/*
+		 * For a persistent slot, use last_saved_restart_lsn to compute the
+		 * oldest LSN for removal of WAL segments. The segments between
+		 * last_saved_restart_lsn and restart_lsn might be needed by a
+		 * persistent slot in the case of a database crash. Non-persistent
+		 * slots can't survive a database crash, so we don't care about
+		 * last_saved_restart_lsn for them.
+		 */
+		if (persistency == RS_PERSISTENT)
+		{
+			if (XLogRecPtrIsValid(last_saved_restart_lsn) &&
+				restart_lsn > last_saved_restart_lsn)
+			{
+				restart_lsn = last_saved_restart_lsn;
+			}
+		}
+
+		if (XLogRecPtrIsValid(restart_lsn) &&
+			(!XLogRecPtrIsValid(min_required) ||
 			 restart_lsn < min_required))
 			min_required = restart_lsn;
 	}
@@ -1216,7 +1345,9 @@ ReplicationSlotsComputeLogicalRestartLSN(void)
 	{
 		ReplicationSlot *s;
 		XLogRecPtr	restart_lsn;
+		XLogRecPtr	last_saved_restart_lsn;
 		bool		invalidated;
+		ReplicationSlotPersistency persistency;
 
 		s = &ReplicationSlotCtl->replication_slots[i];
 
@@ -1230,18 +1361,37 @@ ReplicationSlotsComputeLogicalRestartLSN(void)
 
 		/* read once, it's ok if it increases while we're checking */
 		SpinLockAcquire(&s->mutex);
+		persistency = s->data.persistency;
 		restart_lsn = s->data.restart_lsn;
 		invalidated = s->data.invalidated != RS_INVAL_NONE;
+		last_saved_restart_lsn = s->last_saved_restart_lsn;
 		SpinLockRelease(&s->mutex);
 
 		/* invalidated slots need not apply */
 		if (invalidated)
 			continue;
 
-		if (restart_lsn == InvalidXLogRecPtr)
+		/*
+		 * For a persistent slot, use last_saved_restart_lsn to compute the
+		 * oldest LSN for removal of WAL segments. The segments between
+		 * last_saved_restart_lsn and restart_lsn might be needed by a
+		 * persistent slot in the case of a database crash. Non-persistent
+		 * slots can't survive a database crash, so we don't care about
+		 * last_saved_restart_lsn for them.
+		 */
+		if (persistency == RS_PERSISTENT)
+		{
+			if (XLogRecPtrIsValid(last_saved_restart_lsn) &&
+				restart_lsn > last_saved_restart_lsn)
+			{
+				restart_lsn = last_saved_restart_lsn;
+			}
+		}
+
+		if (!XLogRecPtrIsValid(restart_lsn))
 			continue;
 
-		if (result == InvalidXLogRecPtr ||
+		if (!XLogRecPtrIsValid(result) ||
 			restart_lsn < result)
 			result = restart_lsn;
 	}
@@ -1316,16 +1466,22 @@ ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
  *
  * This routine isn't as efficient as it could be - but we don't drop
  * databases often, especially databases with lots of slots.
+ *
+ * If it drops the last logical slot in the cluster, it requests to disable
+ * logical decoding.
  */
 void
 ReplicationSlotsDropDBSlots(Oid dboid)
 {
 	int			i;
+	bool		found_valid_logicalslot;
+	bool		dropped = false;
 
 	if (max_replication_slots <= 0)
 		return;
 
 restart:
+	found_valid_logicalslot = false;
 	LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
 	for (i = 0; i < max_replication_slots; i++)
 	{
@@ -1343,11 +1499,19 @@ ReplicationSlotsDropDBSlots(Oid dboid)
 		if (!SlotIsLogical(s))
 			continue;
 
+		/*
+		 * Check logical slots on other databases too, so that we disable
+		 * logical decoding only if no valid logical slots remain in the
+		 * cluster.
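		 *
		 * [Editorial sketch, not part of the patch] The "charge persistent
		 * slots at their last flushed position" rule used twice above, as a
		 * standalone helper with 0 standing in for InvalidXLogRecPtr:
		 *
		 *    static uint64_t
		 *    effective_restart_lsn(uint64_t restart, uint64_t last_saved,
		 *                          bool persistent)
		 *    {
		 *        // crash recovery can rewind a persistent slot to last_saved
		 *        if (persistent && last_saved != 0 && restart > last_saved)
		 *            return last_saved;
		 *        return restart;
		 *    }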
+ */ + SpinLockAcquire(&s->mutex); + found_valid_logicalslot |= (s->data.invalidated == RS_INVAL_NONE); + SpinLockRelease(&s->mutex); + /* not our database, skip */ if (s->data.database != dboid) continue; - /* NB: intentionally including invalidated slots */ + /* NB: intentionally including invalidated slots to drop */ /* acquire slot, so ReplicationSlotDropAcquired can be reused */ SpinLockAcquire(&s->mutex); @@ -1399,11 +1563,55 @@ ReplicationSlotsDropDBSlots(Oid dboid) */ LWLockRelease(ReplicationSlotControlLock); ReplicationSlotDropAcquired(); + dropped = true; goto restart; } LWLockRelease(ReplicationSlotControlLock); + + if (dropped && !found_valid_logicalslot) + RequestDisableLogicalDecoding(); } +/* + * Returns true if there is at least one in-use valid logical replication slot. + */ +bool +CheckLogicalSlotExists(void) +{ + bool found = false; + + if (max_replication_slots <= 0) + return false; + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (int i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s; + bool invalidated; + + s = &ReplicationSlotCtl->replication_slots[i]; + + /* cannot change while ReplicationSlotCtlLock is held */ + if (!s->in_use) + continue; + + if (SlotIsPhysical(s)) + continue; + + SpinLockAcquire(&s->mutex); + invalidated = s->data.invalidated != RS_INVAL_NONE; + SpinLockRelease(&s->mutex); + + if (invalidated) + continue; + + found = true; + break; + } + LWLockRelease(ReplicationSlotControlLock); + + return found; +} /* * Check whether the server's configuration supports using replication @@ -1452,62 +1660,65 @@ void ReplicationSlotReserveWal(void) { ReplicationSlot *slot = MyReplicationSlot; + XLogSegNo segno; + XLogRecPtr restart_lsn; Assert(slot != NULL); - Assert(slot->data.restart_lsn == InvalidXLogRecPtr); + Assert(!XLogRecPtrIsValid(slot->data.restart_lsn)); + Assert(!XLogRecPtrIsValid(slot->last_saved_restart_lsn)); /* - * The replication slot mechanism is used to prevent removal of required - * WAL. As there is no interlock between this routine and checkpoints, WAL - * segments could concurrently be removed when a now stale return value of - * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that - * this happens we'll just retry. + * The replication slot mechanism is used to prevent the removal of + * required WAL. + * + * Acquire an exclusive lock to prevent the checkpoint process from + * concurrently computing the minimum slot LSN (see + * CheckPointReplicationSlots). This ensures that the WAL reserved for + * replication cannot be removed during a checkpoint. + * + * The mechanism is reliable because if WAL reservation occurs first, the + * checkpoint must wait for the restart_lsn update before determining the + * minimum non-removable LSN. On the other hand, if the checkpoint happens + * first, subsequent WAL reservations will select positions at or beyond + * the redo pointer of that checkpoint. */ - while (true) - { - XLogSegNo segno; - XLogRecPtr restart_lsn; + LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); - /* - * For logical slots log a standby snapshot and start logical decoding - * at exactly that position. That allows the slot to start up more - * quickly. But on a standby we cannot do WAL writes, so just use the - * replay pointer; effectively, an attempt to create a logical slot on - * standby will cause it to wait for an xl_running_xact record to be - * logged independently on the primary, so that a snapshot can be - * built using the record. 
- * - * None of this is needed (or indeed helpful) for physical slots as - * they'll start replay at the last logged checkpoint anyway. Instead - * return the location of the last redo LSN. While that slightly - * increases the chance that we have to retry, it's where a base - * backup has to start replay at. - */ - if (SlotIsPhysical(slot)) - restart_lsn = GetRedoRecPtr(); - else if (RecoveryInProgress()) - restart_lsn = GetXLogReplayRecPtr(NULL); - else - restart_lsn = GetXLogInsertRecPtr(); + /* + * For logical slots log a standby snapshot and start logical decoding at + * exactly that position. That allows the slot to start up more quickly. + * But on a standby we cannot do WAL writes, so just use the replay + * pointer; effectively, an attempt to create a logical slot on standby + * will cause it to wait for an xl_running_xact record to be logged + * independently on the primary, so that a snapshot can be built using the + * record. + * + * None of this is needed (or indeed helpful) for physical slots as + * they'll start replay at the last logged checkpoint anyway. Instead, + * return the location of the last redo LSN, where a base backup has to + * start replay at. + */ + if (SlotIsPhysical(slot)) + restart_lsn = GetRedoRecPtr(); + else if (RecoveryInProgress()) + restart_lsn = GetXLogReplayRecPtr(NULL); + else + restart_lsn = GetXLogInsertRecPtr(); - SpinLockAcquire(&slot->mutex); - slot->data.restart_lsn = restart_lsn; - SpinLockRelease(&slot->mutex); + SpinLockAcquire(&slot->mutex); + slot->data.restart_lsn = restart_lsn; + SpinLockRelease(&slot->mutex); - /* prevent WAL removal as fast as possible */ - ReplicationSlotsComputeRequiredLSN(); + /* prevent WAL removal as fast as possible */ + ReplicationSlotsComputeRequiredLSN(); - /* - * If all required WAL is still there, great, otherwise retry. The - * slot should prevent further removal of WAL, unless there's a - * concurrent ReplicationSlotsComputeRequiredLSN() after we've written - * the new restart_lsn above, so normally we should never need to loop - * more than twice. - */ - XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size); - if (XLogGetLastRemovedSegno() < segno) - break; - } + /* Checkpoint shouldn't remove the required WAL. 
*/ + XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size); + if (XLogGetLastRemovedSegno() >= segno) + elog(ERROR, "WAL required by replication slot %s has been removed concurrently", + NameStr(slot->data.name)); + + LWLockRelease(ReplicationSlotAllocationLock); if (!RecoveryInProgress() && SlotIsLogical(slot)) { @@ -1547,8 +1758,8 @@ ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, uint64 ex = oldestLSN - restart_lsn; appendStringInfo(&err_detail, - ngettext("The slot's restart_lsn %X/%X exceeds the limit by %" PRIu64 " byte.", - "The slot's restart_lsn %X/%X exceeds the limit by %" PRIu64 " bytes.", + ngettext("The slot's restart_lsn %X/%08X exceeds the limit by %" PRIu64 " byte.", + "The slot's restart_lsn %X/%08X exceeds the limit by %" PRIu64 " bytes.", ex), LSN_FORMAT_ARGS(restart_lsn), ex); @@ -1563,18 +1774,15 @@ ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, break; case RS_INVAL_WAL_LEVEL: - appendStringInfoString(&err_detail, _("Logical decoding on standby requires \"wal_level\" >= \"logical\" on the primary server.")); + appendStringInfoString(&err_detail, _("Logical decoding on standby requires the primary server to either set \"wal_level\" >= \"logical\" or have at least one logical slot when \"wal_level\" = \"replica\".")); break; case RS_INVAL_IDLE_TIMEOUT: { - int minutes = slot_idle_seconds / SECS_PER_MINUTE; - int secs = slot_idle_seconds % SECS_PER_MINUTE; - /* translator: %s is a GUC variable name */ - appendStringInfo(&err_detail, _("The slot's idle time of %dmin %02ds exceeds the configured \"%s\" duration of %dmin."), - minutes, secs, "idle_replication_slot_timeout", - idle_replication_slot_timeout_mins); + appendStringInfo(&err_detail, _("The slot's idle time of %lds exceeds the configured \"%s\" duration of %ds."), + slot_idle_seconds, "idle_replication_slot_timeout", + idle_replication_slot_timeout_secs); /* translator: %s is a GUC variable name */ appendStringInfo(&err_hint, _("You might need to increase \"%s\"."), "idle_replication_slot_timeout"); @@ -1612,8 +1820,8 @@ ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, static inline bool CanInvalidateIdleSlot(ReplicationSlot *s) { - return (idle_replication_slot_timeout_mins != 0 && - !XLogRecPtrIsInvalid(s->data.restart_lsn) && + return (idle_replication_slot_timeout_secs != 0 && + XLogRecPtrIsValid(s->data.restart_lsn) && s->inactive_since > 0 && !(RecoveryInProgress() && s->data.synced)); } @@ -1629,17 +1837,16 @@ static ReplicationSlotInvalidationCause DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s, XLogRecPtr oldestLSN, Oid dboid, TransactionId snapshotConflictHorizon, - TransactionId initial_effective_xmin, - TransactionId initial_catalog_effective_xmin, - XLogRecPtr initial_restart_lsn, TimestampTz *inactive_since, TimestampTz now) { Assert(possible_causes != RS_INVAL_NONE); if (possible_causes & RS_INVAL_WAL_REMOVED) { - if (initial_restart_lsn != InvalidXLogRecPtr && - initial_restart_lsn < oldestLSN) + XLogRecPtr restart_lsn = s->data.restart_lsn; + + if (XLogRecPtrIsValid(restart_lsn) && + restart_lsn < oldestLSN) return RS_INVAL_WAL_REMOVED; } @@ -1649,12 +1856,15 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s, if (SlotIsLogical(s) && (dboid == InvalidOid || dboid == s->data.database)) { - if (TransactionIdIsValid(initial_effective_xmin) && - TransactionIdPrecedesOrEquals(initial_effective_xmin, + TransactionId effective_xmin = s->effective_xmin; + TransactionId catalog_effective_xmin = 
s->effective_catalog_xmin; + + if (TransactionIdIsValid(effective_xmin) && + TransactionIdPrecedesOrEquals(effective_xmin, snapshotConflictHorizon)) return RS_INVAL_HORIZON; - else if (TransactionIdIsValid(initial_catalog_effective_xmin) && - TransactionIdPrecedesOrEquals(initial_catalog_effective_xmin, + else if (TransactionIdIsValid(catalog_effective_xmin) && + TransactionIdPrecedesOrEquals(catalog_effective_xmin, snapshotConflictHorizon)) return RS_INVAL_HORIZON; } @@ -1673,9 +1883,9 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s, if (CanInvalidateIdleSlot(s)) { /* - * We simulate the invalidation due to idle_timeout as the minimum - * time idle time is one minute which makes tests take a long - * time. + * Simulate the invalidation due to idle_timeout to test the + * timeout behavior promptly, without waiting for it to trigger + * naturally. */ #ifdef USE_INJECTION_POINTS if (IS_INJECTION_POINT_ATTACHED("slot-timeout-inval")) @@ -1690,7 +1900,7 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s, * idle_replication_slot_timeout GUC. */ if (TimestampDifferenceExceedsSeconds(s->inactive_since, now, - idle_replication_slot_timeout_mins * SECS_PER_MINUTE)) + idle_replication_slot_timeout_secs)) { *inactive_since = s->inactive_since; return RS_INVAL_IDLE_TIMEOUT; @@ -1706,10 +1916,11 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s, * * Acquires the given slot and mark it invalid, if necessary and possible. * - * Returns whether ReplicationSlotControlLock was released in the interim (and - * in that case we're not holding the lock at return, otherwise we are). + * Returns true if the slot was invalidated. * - * Sets *invalidated true if the slot was invalidated. (Untouched otherwise.) + * Set *released_lock_out if ReplicationSlotControlLock was released in the + * interim (and in that case we're not holding the lock at return, otherwise + * we are). * * This is inherently racy, because we release the LWLock * for syscalls, so caller must restart if we return true. @@ -1719,15 +1930,11 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, ReplicationSlot *s, XLogRecPtr oldestLSN, Oid dboid, TransactionId snapshotConflictHorizon, - bool *invalidated) + bool *released_lock_out) { int last_signaled_pid = 0; bool released_lock = false; - bool terminated = false; - TransactionId initial_effective_xmin = InvalidTransactionId; - TransactionId initial_catalog_effective_xmin = InvalidTransactionId; - XLogRecPtr initial_restart_lsn = InvalidXLogRecPtr; - ReplicationSlotInvalidationCause invalidation_cause_prev PG_USED_FOR_ASSERTS_ONLY = RS_INVAL_NONE; + bool invalidated = false; TimestampTz inactive_since = 0; for (;;) @@ -1770,42 +1977,12 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, /* we do nothing if the slot is already invalid */ if (s->data.invalidated == RS_INVAL_NONE) - { - /* - * The slot's mutex will be released soon, and it is possible that - * those values change since the process holding the slot has been - * terminated (if any), so record them here to ensure that we - * would report the correct invalidation cause. - * - * Unlike other slot attributes, slot's inactive_since can't be - * changed until the acquired slot is released or the owning - * process is terminated. So, the inactive slot can only be - * invalidated immediately without being terminated. 
- */ - if (!terminated) - { - initial_restart_lsn = s->data.restart_lsn; - initial_effective_xmin = s->effective_xmin; - initial_catalog_effective_xmin = s->effective_catalog_xmin; - } - invalidation_cause = DetermineSlotInvalidationCause(possible_causes, s, oldestLSN, dboid, snapshotConflictHorizon, - initial_effective_xmin, - initial_catalog_effective_xmin, - initial_restart_lsn, &inactive_since, now); - } - - /* - * The invalidation cause recorded previously should not change while - * the process owning the slot (if any) has been terminated. - */ - Assert(!(invalidation_cause_prev != RS_INVAL_NONE && terminated && - invalidation_cause_prev != invalidation_cause)); /* if there's no invalidation, we're done */ if (invalidation_cause == RS_INVAL_NONE) @@ -1823,6 +2000,11 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, * If the slot can be acquired, do so and mark it invalidated * immediately. Otherwise we'll signal the owning process, below, and * retry. + * + * Note: Unlike other slot attributes, slot's inactive_since can't be + * changed until the acquired slot is released or the owning process + * is terminated. So, the inactive slot can only be invalidated + * immediately without being terminated. */ if (active_pid == 0) { @@ -1835,23 +2017,17 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, * just rely on .invalidated. */ if (invalidation_cause == RS_INVAL_WAL_REMOVED) + { s->data.restart_lsn = InvalidXLogRecPtr; + s->last_saved_restart_lsn = InvalidXLogRecPtr; + } /* Let caller know */ - *invalidated = true; + invalidated = true; } SpinLockRelease(&s->mutex); - /* - * The logical replication slots shouldn't be invalidated as GUC - * max_slot_wal_keep_size is set to -1 and - * idle_replication_slot_timeout is set to 0 during the binary - * upgrade. See check_old_cluster_for_valid_slots() where we ensure - * that no invalidated before the upgrade. - */ - Assert(!(*invalidated && SlotIsLogical(s) && IsBinaryUpgrade)); - /* * Calculate the idle time duration of the slot if slot is marked * invalidated with RS_INVAL_IDLE_TIMEOUT. @@ -1903,8 +2079,6 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, (void) kill(active_pid, SIGTERM); last_signaled_pid = active_pid; - terminated = true; - invalidation_cause_prev = invalidation_cause; } /* Wait until the slot is released. */ @@ -1915,6 +2089,14 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, * Re-acquire lock and start over; we expect to invalidate the * slot next time (unless another process acquires the slot in the * meantime). + * + * Note: It is possible for a slot to advance its restart_lsn or + * xmin values sufficiently between when we release the mutex and + * when we recheck, moving from a conflicting state to a non + * conflicting state. This is intentional and safe: if the slot + * has caught up while we're busy here, the resources we were + * concerned about (WAL segments or tuples) have not yet been + * removed, and there's no reason to invalidate the slot. 
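	 *
	 * [Editorial outline, not part of the patch] The overall shape of the
	 * loop this comment sits in, in pseudo-C:
	 *
	 *    for (;;)
	 *    {
	 *        cause = determine cause from the slot's current values;
	 *        if (cause == RS_INVAL_NONE)
	 *            break;                  // nothing to do, or slot caught up
	 *        if (slot is unowned)
	 *        {
	 *            mark it invalidated;    // under the slot's mutex
	 *            break;
	 *        }
	 *        signal the owner and wait for the slot to be released;
	 *    }                               // then re-evaluate from the top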
*/ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); continue; @@ -1949,7 +2131,8 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, Assert(released_lock == !LWLockHeldByMe(ReplicationSlotControlLock)); - return released_lock; + *released_lock_out = released_lock; + return invalidated; } /* @@ -1962,7 +2145,8 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, * - RS_INVAL_WAL_REMOVED: requires a LSN older than the given segment * - RS_INVAL_HORIZON: requires a snapshot <= the given horizon in the given * db; dboid may be InvalidOid for shared relations - * - RS_INVAL_WAL_LEVEL: is logical and wal_level is insufficient + * - RS_INVAL_WAL_LEVEL: is a logical slot and effective_wal_level is not + * logical. * - RS_INVAL_IDLE_TIMEOUT: has been idle longer than the configured * "idle_replication_slot_timeout" duration. * @@ -1970,6 +2154,9 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, * causes in a single pass, minimizing redundant iterations. The "cause" * parameter can be a MASK representing one or more of the defined causes. * + * If it invalidates the last logical slot in the cluster, it requests to + * disable logical decoding. + * * NB - this runs as part of checkpoint, so avoid raising errors if possible. */ bool @@ -1979,6 +2166,8 @@ InvalidateObsoleteReplicationSlots(uint32 possible_causes, { XLogRecPtr oldestLSN; bool invalidated = false; + bool invalidated_logical = false; + bool found_valid_logicalslot; Assert(!(possible_causes & RS_INVAL_HORIZON) || TransactionIdIsValid(snapshotConflictHorizon)); Assert(!(possible_causes & RS_INVAL_WAL_REMOVED) || oldestSegno > 0); @@ -1990,21 +2179,58 @@ InvalidateObsoleteReplicationSlots(uint32 possible_causes, XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN); restart: + found_valid_logicalslot = false; LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (int i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + bool released_lock = false; if (!s->in_use) continue; - if (InvalidatePossiblyObsoleteSlot(possible_causes, s, oldestLSN, dboid, - snapshotConflictHorizon, - &invalidated)) + /* Prevent invalidation of logical slots during binary upgrade */ + if (SlotIsLogical(s) && IsBinaryUpgrade) { - /* if the lock was released, start from scratch */ - goto restart; + SpinLockAcquire(&s->mutex); + found_valid_logicalslot |= (s->data.invalidated == RS_INVAL_NONE); + SpinLockRelease(&s->mutex); + + continue; + } + + if (InvalidatePossiblyObsoleteSlot(possible_causes, s, oldestLSN, + dboid, snapshotConflictHorizon, + &released_lock)) + { + Assert(released_lock); + + /* Remember we have invalidated a physical or logical slot */ + invalidated = true; + + /* + * Additionally, remember we have invalidated a logical slot as we + * can request disabling logical decoding later. + */ + if (SlotIsLogical(s)) + invalidated_logical = true; + } + else + { + /* + * We need to check if the slot is invalidated here since + * InvalidatePossiblyObsoleteSlot() returns false also if the slot + * is already invalidated. 
+ */ + SpinLockAcquire(&s->mutex); + found_valid_logicalslot |= + (SlotIsLogical(s) && (s->data.invalidated == RS_INVAL_NONE)); + SpinLockRelease(&s->mutex); } + + /* if the lock was released, start from scratch */ + if (released_lock) + goto restart; } LWLockRelease(ReplicationSlotControlLock); @@ -2017,6 +2243,15 @@ InvalidateObsoleteReplicationSlots(uint32 possible_causes, ReplicationSlotsComputeRequiredLSN(); } + /* + * Request the checkpointer to disable logical decoding if no valid + * logical slots remain. If called by the checkpointer during a + * checkpoint, only the request is initiated; actual deactivation is + * deferred until after the checkpoint completes. + */ + if (invalidated_logical && !found_valid_logicalslot) + RequestDisableLogicalDecoding(); + return invalidated; } /* @@ -2032,6 +2267,7 @@ void CheckPointReplicationSlots(bool is_shutdown) { int i; + bool last_saved_restart_lsn_updated = false; elog(DEBUG1, "performing replication slot checkpoint"); @@ -2041,6 +2277,12 @@ CheckPointReplicationSlots(bool is_shutdown) * acquiring a slot we cannot take the control lock - but that's OK, * because holding ReplicationSlotAllocationLock is strictly stronger, and * enough to guarantee that nobody can change the in_use bits on us. + * + * Additionally, acquiring the Allocation lock is necessary to serialize + * the slot flush process with concurrent slot WAL reservation. This + * ensures that the WAL position being reserved is either flushed to disk + * or at or beyond the redo pointer of the current checkpoint + * (see ReplicationSlotReserveWal for details). */ LWLockAcquire(ReplicationSlotAllocationLock, LW_SHARED); @@ -2076,9 +2318,23 @@ CheckPointReplicationSlots(bool is_shutdown) SpinLockRelease(&s->mutex); } + /* + * Track whether we're going to update the slot's + * last_saved_restart_lsn. We need this to know whether to recompute + * the required LSN. + */ + if (s->last_saved_restart_lsn != s->data.restart_lsn) + last_saved_restart_lsn_updated = true; + SaveSlotToPath(s, path, LOG); } LWLockRelease(ReplicationSlotAllocationLock); + + /* + * Recompute the required LSN if SaveSlotToPath() updated + * last_saved_restart_lsn for any slot.
+ */ + if (last_saved_restart_lsn_updated) + ReplicationSlotsComputeRequiredLSN(); } /* @@ -2278,6 +2534,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) pgstat_report_wait_end(); CloseTransientFile(fd); + unlink(tmppath); LWLockRelease(&slot->io_in_progress_lock); /* if write didn't set errno, assume problem is no disk space */ @@ -2298,7 +2555,9 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) pgstat_report_wait_end(); CloseTransientFile(fd); + unlink(tmppath); LWLockRelease(&slot->io_in_progress_lock); + errno = save_errno; ereport(elevel, (errcode_for_file_access(), @@ -2312,7 +2571,9 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) { int save_errno = errno; + unlink(tmppath); LWLockRelease(&slot->io_in_progress_lock); + errno = save_errno; ereport(elevel, (errcode_for_file_access(), @@ -2326,7 +2587,9 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) { int save_errno = errno; + unlink(tmppath); LWLockRelease(&slot->io_in_progress_lock); + errno = save_errno; ereport(elevel, (errcode_for_file_access(), @@ -2354,6 +2617,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) if (!slot->just_dirtied) slot->dirty = false; slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush; + slot->last_saved_restart_lsn = cp.slotdata.restart_lsn; SpinLockRelease(&slot->mutex); LWLockRelease(&slot->io_in_progress_lock); @@ -2523,19 +2787,20 @@ RestoreSlotFromDisk(const char *name) */ if (cp.slotdata.database != InvalidOid) { - if (wal_level < WAL_LEVEL_LOGICAL) + if (wal_level < WAL_LEVEL_REPLICA) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("logical replication slot \"%s\" exists, but \"wal_level\" < \"logical\"", + errmsg("logical replication slot \"%s\" exists, but \"wal_level\" < \"replica\"", NameStr(cp.slotdata.name)), - errhint("Change \"wal_level\" to be \"logical\" or higher."))); + errhint("Change \"wal_level\" to be \"replica\" or higher."))); /* * In standby mode, the hot standby must be enabled. This check is * necessary to ensure logical slots are invalidated when they become * incompatible due to insufficient wal_level. Otherwise, if the - * primary reduces wal_level < logical while hot standby is disabled, - * logical slots would remain valid even after promotion. + * primary disables logical decoding while hot standby is disabled, + * logical slots would remain valid even after promotion.
*/ if (StandbyMode && !EnableHotStandby) ereport(FATAL, @@ -2569,6 +2834,7 @@ RestoreSlotFromDisk(const char *name) slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush; + slot->last_saved_restart_lsn = cp.slotdata.restart_lsn; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; @@ -2645,53 +2911,32 @@ GetSlotInvalidationCauseName(ReplicationSlotInvalidationCause cause) static bool validate_sync_standby_slots(char *rawname, List **elemlist) { - bool ok; - /* Verify syntax and parse string into a list of identifiers */ - ok = SplitIdentifierString(rawname, ',', elemlist); - - if (!ok) + if (!SplitIdentifierString(rawname, ',', elemlist)) { GUC_check_errdetail("List syntax is invalid."); + return false; } - else if (MyProc) + + /* Iterate the list to validate each slot name */ + foreach_ptr(char, name, *elemlist) { - /* - * Check that each specified slot exist and is physical. - * - * Because we need an LWLock, we cannot do this on processes without a - * PGPROC, so we skip it there; but see comments in - * StandbySlotsHaveCaughtup() as to why that's not a problem. - */ - LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + int err_code; + char *err_msg = NULL; + char *err_hint = NULL; - foreach_ptr(char, name, *elemlist) + if (!ReplicationSlotValidateNameInternal(name, false, &err_code, + &err_msg, &err_hint)) { - ReplicationSlot *slot; - - slot = SearchNamedReplicationSlot(name, false); - - if (!slot) - { - GUC_check_errdetail("Replication slot \"%s\" does not exist.", - name); - ok = false; - break; - } - - if (!SlotIsPhysical(slot)) - { - GUC_check_errdetail("\"%s\" is not a physical replication slot.", - name); - ok = false; - break; - } + GUC_check_errcode(err_code); + GUC_check_errdetail("%s", err_msg); + if (err_hint != NULL) + GUC_check_errhint("%s", err_hint); + return false; } - - LWLockRelease(ReplicationSlotControlLock); } - return ok; + return true; } /* @@ -2826,7 +3071,7 @@ StandbySlotsHaveCaughtup(XLogRecPtr wait_for_lsn, int elevel) * Don't need to wait for the standbys to catch up if they are already * beyond the specified WAL location. */ - if (!XLogRecPtrIsInvalid(ss_oldest_flush_lsn) && + if (XLogRecPtrIsValid(ss_oldest_flush_lsn) && ss_oldest_flush_lsn >= wait_for_lsn) return true; @@ -2849,12 +3094,6 @@ StandbySlotsHaveCaughtup(XLogRecPtr wait_for_lsn, int elevel) /* * If a slot name provided in synchronized_standby_slots does not * exist, report a message and exit the loop. - * - * Though validate_sync_standby_slots (the GUC check_hook) tries to - * avoid this, it can nonetheless happen because the user can specify - * a nonexistent slot name before server startup. That function cannot - * validate such a slot during startup, as ReplicationSlotCtl is not - * initialized by then. Also, the user might have dropped one slot. 
*/ if (!slot) { @@ -2903,7 +3142,7 @@ StandbySlotsHaveCaughtup(XLogRecPtr wait_for_lsn, int elevel) break; } - if (XLogRecPtrIsInvalid(restart_lsn) || restart_lsn < wait_for_lsn) + if (!XLogRecPtrIsValid(restart_lsn) || restart_lsn < wait_for_lsn) { /* Log a message if no active_pid for this physical slot */ if (inactive) @@ -2922,7 +3161,7 @@ StandbySlotsHaveCaughtup(XLogRecPtr wait_for_lsn, int elevel) Assert(restart_lsn >= wait_for_lsn); - if (XLogRecPtrIsInvalid(min_restart_lsn) || + if (!XLogRecPtrIsValid(min_restart_lsn) || min_restart_lsn > restart_lsn) min_restart_lsn = restart_lsn; @@ -2941,7 +3180,7 @@ StandbySlotsHaveCaughtup(XLogRecPtr wait_for_lsn, int elevel) return false; /* The ss_oldest_flush_lsn must not retreat. */ - Assert(XLogRecPtrIsInvalid(ss_oldest_flush_lsn) || + Assert(!XLogRecPtrIsValid(ss_oldest_flush_lsn) || min_restart_lsn >= ss_oldest_flush_lsn); ss_oldest_flush_lsn = min_restart_lsn; @@ -2993,22 +3232,3 @@ WaitForStandbyConfirmation(XLogRecPtr wait_for_lsn) ConditionVariableCancelSleep(); } - -/* - * GUC check_hook for idle_replication_slot_timeout - * - * The value of idle_replication_slot_timeout must be set to 0 during - * a binary upgrade. See start_postmaster() in pg_upgrade for more details. - */ -bool -check_idle_replication_slot_timeout(int *newval, void **extra, GucSource source) -{ - if (IsBinaryUpgrade && *newval != 0) - { - GUC_check_errdetail("\"%s\" must be set to 0 during binary upgrade mode.", - "idle_replication_slot_timeout"); - return false; - } - - return true; -} diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 36cc2ed4e440f..70a27f83f2931 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -24,6 +24,17 @@ #include "utils/guc.h" #include "utils/pg_lsn.h" +/* + * Map SlotSyncSkipReason enum values to human-readable names. + */ +static const char *SlotSyncSkipReasonNames[] = { + [SS_SKIP_NONE] = "none", + [SS_SKIP_WAL_NOT_FLUSHED] = "wal_not_flushed", + [SS_SKIP_WAL_OR_ROWS_REMOVED] = "wal_or_rows_removed", + [SS_SKIP_NO_CONSISTENT_SNAPSHOT] = "no_consistent_snapshot", + [SS_SKIP_INVALID] = "slot_invalidated" +}; + /* * Helper function for creating a new physical replication slot with * given arguments. Note that this function doesn't release the created @@ -46,7 +57,7 @@ create_physical_replication_slot(char *name, bool immediately_reserve, if (immediately_reserve) { /* Reserve WAL as the user asked for it */ - if (XLogRecPtrIsInvalid(restart_lsn)) + if (!XLogRecPtrIsValid(restart_lsn)) ReplicationSlotReserveWal(); else MyReplicationSlot->data.restart_lsn = restart_lsn; @@ -136,6 +147,13 @@ create_logical_replication_slot(char *name, char *plugin, temporary ? RS_TEMPORARY : RS_EPHEMERAL, two_phase, failover, false); + /* + * Ensure that logical decoding is enabled before initializing the logical + * decoding context. + */ + EnsureLogicalDecodingEnabled(); + Assert(IsLogicalDecodingEnabled()); + /* * Create logical decoding context to find start point or, if we don't * need it, to 1) bump slot's restart_lsn and xmin 2) check plugin sanity.
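The SlotSyncSkipReasonNames table above uses C99 designated array initializers, so each string lands at the index of its enum value regardless of the order in which the entries are written, and any enum value accidentally left out is zero-filled (a NULL pointer) rather than silently shifted. A minimal standalone sketch of the same idiom, reusing the enum and names from this patch (the wrapping program is illustrative only, not part of the change):

    #include <stdio.h>

    typedef enum SlotSyncSkipReason
    {
        SS_SKIP_NONE = 0,
        SS_SKIP_WAL_NOT_FLUSHED,
        SS_SKIP_WAL_OR_ROWS_REMOVED,
        SS_SKIP_NO_CONSISTENT_SNAPSHOT,
        SS_SKIP_INVALID
    } SlotSyncSkipReason;

    /* Each entry is placed at the index of its enum value. */
    static const char *const SlotSyncSkipReasonNames[] = {
        [SS_SKIP_NONE] = "none",
        [SS_SKIP_WAL_NOT_FLUSHED] = "wal_not_flushed",
        [SS_SKIP_WAL_OR_ROWS_REMOVED] = "wal_or_rows_removed",
        [SS_SKIP_NO_CONSISTENT_SNAPSHOT] = "no_consistent_snapshot",
        [SS_SKIP_INVALID] = "slot_invalidated"
    };

    int
    main(void)
    {
        /* prints "wal_not_flushed" */
        printf("%s\n", SlotSyncSkipReasonNames[SS_SKIP_WAL_NOT_FLUSHED]);
        return 0;
    }

pg_get_replication_slots() indexes this table directly with the slot's slotsync_skip_reason, emitting NULL for SS_SKIP_NONE, as the hunk below shows.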
@@ -235,7 +253,7 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS) Datum pg_get_replication_slots(PG_FUNCTION_ARGS) { -#define PG_GET_REPLICATION_SLOTS_COLS 20 +#define PG_GET_REPLICATION_SLOTS_COLS 21 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; XLogRecPtr currlsn; int slotno; @@ -308,12 +326,12 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) else nulls[i++] = true; - if (slot_contents.data.restart_lsn != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(slot_contents.data.restart_lsn)) values[i++] = LSNGetDatum(slot_contents.data.restart_lsn); else nulls[i++] = true; - if (slot_contents.data.confirmed_flush != InvalidXLogRecPtr) + if (XLogRecPtrIsValid(slot_contents.data.confirmed_flush)) values[i++] = LSNGetDatum(slot_contents.data.confirmed_flush); else nulls[i++] = true; @@ -357,7 +375,7 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) * * If we do change it, save the state for safe_wal_size below. */ - if (!XLogRecPtrIsInvalid(slot_contents.data.restart_lsn)) + if (XLogRecPtrIsValid(slot_contents.data.restart_lsn)) { int pid; @@ -407,7 +425,7 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) values[i++] = BoolGetDatum(slot_contents.data.two_phase); if (slot_contents.data.two_phase && - !XLogRecPtrIsInvalid(slot_contents.data.two_phase_at)) + XLogRecPtrIsValid(slot_contents.data.two_phase_at)) values[i++] = LSNGetDatum(slot_contents.data.two_phase_at); else nulls[i++] = true; @@ -443,6 +461,11 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) values[i++] = BoolGetDatum(slot_contents.data.synced); + if (slot_contents.slotsync_skip_reason == SS_SKIP_NONE) + nulls[i++] = true; + else + values[i++] = CStringGetTextDatum(SlotSyncSkipReasonNames[slot_contents.slotsync_skip_reason]); + Assert(i == PG_GET_REPLICATION_SLOTS_COLS); tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, @@ -467,7 +490,7 @@ pg_physical_replication_slot_advance(XLogRecPtr moveto) XLogRecPtr startlsn = MyReplicationSlot->data.restart_lsn; XLogRecPtr retlsn = startlsn; - Assert(moveto != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(moveto)); if (startlsn < moveto) { @@ -523,7 +546,7 @@ pg_replication_slot_advance(PG_FUNCTION_ARGS) CheckSlotPermissions(); - if (XLogRecPtrIsInvalid(moveto)) + if (!XLogRecPtrIsValid(moveto)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid target WAL LSN"))); @@ -545,7 +568,7 @@ pg_replication_slot_advance(PG_FUNCTION_ARGS) ReplicationSlotAcquire(NameStr(*slotname), true, true); /* A slot whose restart_lsn has never been reserved cannot be advanced */ - if (XLogRecPtrIsInvalid(MyReplicationSlot->data.restart_lsn)) + if (!XLogRecPtrIsValid(MyReplicationSlot->data.restart_lsn)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("replication slot \"%s\" cannot be advanced", @@ -566,7 +589,7 @@ pg_replication_slot_advance(PG_FUNCTION_ARGS) if (moveto < minlsn) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot advance replication slot to %X/%X, minimum is %X/%X", + errmsg("cannot advance replication slot to %X/%08X, minimum is %X/%08X", LSN_FORMAT_ARGS(moveto), LSN_FORMAT_ARGS(minlsn)))); /* Do the actual slot update, depending on the slot type */ @@ -679,7 +702,7 @@ copy_replication_slot(FunctionCallInfo fcinfo, bool logical_slot) NameStr(*src_name)))); /* Copying non-reserved slot doesn't make sense */ - if (XLogRecPtrIsInvalid(src_restart_lsn)) + if (!XLogRecPtrIsValid(src_restart_lsn)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot copy a replication slot that doesn't 
reserve WAL"))); @@ -785,7 +808,7 @@ copy_replication_slot(FunctionCallInfo fcinfo, bool logical_slot) errdetail("The source replication slot was modified incompatibly during the copy operation."))); /* The source slot must have a consistent snapshot */ - if (src_islogical && XLogRecPtrIsInvalid(copy_confirmed_flush)) + if (src_islogical && !XLogRecPtrIsValid(copy_confirmed_flush)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot copy unfinished logical replication slot \"%s\"", @@ -840,7 +863,7 @@ copy_replication_slot(FunctionCallInfo fcinfo, bool logical_slot) /* All done. Set up the return values */ values[0] = NameGetDatum(dst_name); nulls[0] = false; - if (!XLogRecPtrIsInvalid(MyReplicationSlot->data.confirmed_flush)) + if (XLogRecPtrIsValid(MyReplicationSlot->data.confirmed_flush)) { values[1] = LSNGetDatum(MyReplicationSlot->data.confirmed_flush); nulls[1] = false; @@ -921,7 +944,6 @@ pg_sync_replication_slots(PG_FUNCTION_ARGS) /* Connect to the primary server. */ wrconn = walrcv_connect(PrimaryConnInfo, false, false, false, app_name.data, &err); - pfree(app_name.data); if (!wrconn) ereport(ERROR, @@ -929,6 +951,8 @@ pg_sync_replication_slots(PG_FUNCTION_ARGS) errmsg("synchronization worker \"%s\" could not connect to the primary server: %s", app_name.data, err)); + pfree(app_name.data); + SyncReplicationSlots(wrconn); walrcv_disconnect(wrconn); diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c index cc35984ad0085..298a3766d766e 100644 --- a/src/backend/replication/syncrep.c +++ b/src/backend/replication/syncrep.c @@ -258,7 +258,7 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit) { char buffer[32]; - sprintf(buffer, "waiting for %X/%X", LSN_FORMAT_ARGS(lsn)); + sprintf(buffer, "waiting for %X/%08X", LSN_FORMAT_ARGS(lsn)); set_ps_display_suffix(buffer); } @@ -493,7 +493,7 @@ SyncRepReleaseWaiters(void) if (MyWalSnd->sync_standby_priority == 0 || (MyWalSnd->state != WALSNDSTATE_STREAMING && MyWalSnd->state != WALSNDSTATE_STOPPING) || - XLogRecPtrIsInvalid(MyWalSnd->flush)) + !XLogRecPtrIsValid(MyWalSnd->flush)) { announce_next_takeover = true; return; @@ -566,7 +566,7 @@ SyncRepReleaseWaiters(void) LWLockRelease(SyncRepLock); - elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X", + elog(DEBUG3, "released %d procs up to write %X/%08X, %d procs up to flush %X/%08X, %d procs up to apply %X/%08X", numwrite, LSN_FORMAT_ARGS(writePtr), numflush, LSN_FORMAT_ARGS(flushPtr), numapply, LSN_FORMAT_ARGS(applyPtr)); @@ -676,11 +676,11 @@ SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr flush = sync_standbys[i].flush; XLogRecPtr apply = sync_standbys[i].apply; - if (XLogRecPtrIsInvalid(*writePtr) || *writePtr > write) + if (!XLogRecPtrIsValid(*writePtr) || *writePtr > write) *writePtr = write; - if (XLogRecPtrIsInvalid(*flushPtr) || *flushPtr > flush) + if (!XLogRecPtrIsValid(*flushPtr) || *flushPtr > flush) *flushPtr = flush; - if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply) + if (!XLogRecPtrIsValid(*applyPtr) || *applyPtr > apply) *applyPtr = apply; } } @@ -705,9 +705,9 @@ SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, /* Should have enough candidates, or somebody messed up */ Assert(nth > 0 && nth <= num_standbys); - write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys); - flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys); - apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys); + write_array = 
palloc_array(XLogRecPtr, num_standbys); + flush_array = palloc_array(XLogRecPtr, num_standbys); + apply_array = palloc_array(XLogRecPtr, num_standbys); for (i = 0; i < num_standbys; i++) { @@ -757,8 +757,7 @@ SyncRepGetCandidateStandbys(SyncRepStandbyData **standbys) int n; /* Create result array */ - *standbys = (SyncRepStandbyData *) - palloc(max_wal_senders * sizeof(SyncRepStandbyData)); + *standbys = palloc_array(SyncRepStandbyData, max_wal_senders); /* Quick exit if sync replication is not requested */ if (SyncRepConfig == NULL) @@ -799,7 +798,7 @@ SyncRepGetCandidateStandbys(SyncRepStandbyData **standbys) continue; /* Must have a valid flush position */ - if (XLogRecPtrIsInvalid(stby->flush)) + if (!XLogRecPtrIsValid(stby->flush)) continue; /* OK, it's a candidate */ diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l index 7dec1f869c745..02004d621e73d 100644 --- a/src/backend/replication/syncrep_scanner.l +++ b/src/backend/replication/syncrep_scanner.l @@ -157,17 +157,16 @@ syncrep_yyerror(SyncRepConfigData **syncrep_parse_result_p, char **syncrep_parse { struct yyguts_t *yyg = (struct yyguts_t *) yyscanner; /* needed for yytext * macro */ - char *syncrep_parse_error_msg = *syncrep_parse_error_msg_p; /* report only the first error in a parse operation */ - if (syncrep_parse_error_msg) + if (*syncrep_parse_error_msg_p) return; if (yytext[0]) - syncrep_parse_error_msg = psprintf("%s at or near \"%s\"", - message, yytext); + *syncrep_parse_error_msg_p = psprintf("%s at or near \"%s\"", + message, yytext); else - syncrep_parse_error_msg = psprintf("%s at end of input", - message); + *syncrep_parse_error_msg_p = psprintf("%s at end of input", + message); } void diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 8c4d0fd9aed2b..ac802ae85b48a 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -386,12 +386,12 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) { if (first_stream) ereport(LOG, - (errmsg("started streaming WAL from primary at %X/%X on timeline %u", - LSN_FORMAT_ARGS(startpoint), startpointTLI))); + errmsg("started streaming WAL from primary at %X/%08X on timeline %u", + LSN_FORMAT_ARGS(startpoint), startpointTLI)); else ereport(LOG, - (errmsg("restarted WAL streaming at %X/%X on timeline %u", - LSN_FORMAT_ARGS(startpoint), startpointTLI))); + errmsg("restarted WAL streaming at %X/%08X on timeline %u", + LSN_FORMAT_ARGS(startpoint), startpointTLI)); first_stream = false; /* Initialize LogstreamResult and buffers for processing messages */ @@ -470,7 +470,7 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) { ereport(LOG, (errmsg("replication terminated by primary server"), - errdetail("End of WAL reached on timeline %u at %X/%X.", + errdetail("End of WAL reached on timeline %u at %X/%08X.", startpointTLI, LSN_FORMAT_ARGS(LogstreamResult.Write)))); endofwal = true; @@ -711,7 +711,7 @@ WalRcvWaitForStartPosition(XLogRecPtr *startpoint, TimeLineID *startpointTLI) { char activitymsg[50]; - snprintf(activitymsg, sizeof(activitymsg), "restarting at %X/%X", + snprintf(activitymsg, sizeof(activitymsg), "restarting at %X/%08X", LSN_FORMAT_ARGS(*startpoint)); set_ps_display(activitymsg); } @@ -826,7 +826,7 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len, TimeLineID tli) switch (type) { - case 'w': /* WAL records */ + case PqReplMsg_WALData: { StringInfoData incoming_message; @@ -850,7 +850,7 @@ 
XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len, TimeLineID tli) XLogWalRcvWrite(buf, len, dataStart, tli); break; } - case 'k': /* Keepalive */ + case PqReplMsg_Keepalive: { StringInfoData incoming_message; @@ -928,7 +928,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli) start = pgstat_prepare_io_time(track_wal_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); - byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) startoff); + byteswritten = pg_pwrite(recvFile, buf, segbytes, (pgoff_t) startoff); pgstat_report_wait_end(); pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, @@ -949,8 +949,8 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli) ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to WAL segment %s " - "at offset %d, length %lu: %m", - xlogfname, startoff, (unsigned long) segbytes))); + "at offset %d, length %d: %m", + xlogfname, startoff, segbytes))); } /* Update state for write */ @@ -1014,7 +1014,7 @@ XLogWalRcvFlush(bool dying, TimeLineID tli) { char activitymsg[50]; - snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%08X", LSN_FORMAT_ARGS(LogstreamResult.Write)); set_ps_display(activitymsg); } @@ -1130,7 +1130,7 @@ XLogWalRcvSendReply(bool force, bool requestReply) applyPtr = GetXLogReplayRecPtr(NULL); resetStringInfo(&reply_message); - pq_sendbyte(&reply_message, 'r'); + pq_sendbyte(&reply_message, PqReplMsg_StandbyStatusUpdate); pq_sendint64(&reply_message, writePtr); pq_sendint64(&reply_message, flushPtr); pq_sendint64(&reply_message, applyPtr); @@ -1138,7 +1138,7 @@ XLogWalRcvSendReply(bool force, bool requestReply) pq_sendbyte(&reply_message, requestReply ? 1 : 0); /* Send it */ - elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X%s", + elog(DEBUG2, "sending write %X/%08X flush %X/%08X apply %X/%08X%s", LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr), @@ -1234,7 +1234,7 @@ XLogWalRcvSendHSFeedback(bool immed) /* Construct the message and send it. 
*/ resetStringInfo(&reply_message); - pq_sendbyte(&reply_message, 'h'); + pq_sendbyte(&reply_message, PqReplMsg_HotStandbyFeedback); pq_sendint64(&reply_message, GetCurrentTimestamp()); pq_sendint32(&reply_message, xmin); pq_sendint32(&reply_message, xmin_epoch); @@ -1450,8 +1450,8 @@ pg_stat_get_wal_receiver(PG_FUNCTION_ARGS) if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); - values = palloc0(sizeof(Datum) * tupdesc->natts); - nulls = palloc0(sizeof(bool) * tupdesc->natts); + values = palloc0_array(Datum, tupdesc->natts); + nulls = palloc0_array(bool, tupdesc->natts); /* Fetch values */ values[0] = Int32GetDatum(pid); @@ -1469,16 +1469,16 @@ pg_stat_get_wal_receiver(PG_FUNCTION_ARGS) { values[1] = CStringGetTextDatum(WalRcvGetStateString(state)); - if (XLogRecPtrIsInvalid(receive_start_lsn)) + if (!XLogRecPtrIsValid(receive_start_lsn)) nulls[2] = true; else values[2] = LSNGetDatum(receive_start_lsn); values[3] = Int32GetDatum(receive_start_tli); - if (XLogRecPtrIsInvalid(written_lsn)) + if (!XLogRecPtrIsValid(written_lsn)) nulls[4] = true; else values[4] = LSNGetDatum(written_lsn); - if (XLogRecPtrIsInvalid(flushed_lsn)) + if (!XLogRecPtrIsValid(flushed_lsn)) nulls[5] = true; else values[5] = LSNGetDatum(flushed_lsn); @@ -1491,7 +1491,7 @@ pg_stat_get_wal_receiver(PG_FUNCTION_ARGS) nulls[8] = true; else values[8] = TimestampTzGetDatum(last_receipt_time); - if (XLogRecPtrIsInvalid(latest_end_lsn)) + if (!XLogRecPtrIsValid(latest_end_lsn)) nulls[9] = true; else values[9] = LSNGetDatum(latest_end_lsn); diff --git a/src/backend/replication/walreceiverfuncs.c b/src/backend/replication/walreceiverfuncs.c index 8de2886ff0b59..822645748a71b 100644 --- a/src/backend/replication/walreceiverfuncs.c +++ b/src/backend/replication/walreceiverfuncs.c @@ -119,6 +119,20 @@ WalRcvRunning(void) return false; } +/* Return the state of the walreceiver. */ +WalRcvState +WalRcvGetState(void) +{ + WalRcvData *walrcv = WalRcv; + WalRcvState state; + + SpinLockAcquire(&walrcv->mutex); + state = walrcv->walRcvState; + SpinLockRelease(&walrcv->mutex); + + return state; +} + /* * Is walreceiver running and streaming (or at least attempting to connect, * or starting up)? @@ -301,7 +315,7 @@ RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, * If this is the first startup of walreceiver (on this timeline), * initialize flushedUpto and latestChunkStart to the starting point. 
*/ - if (walrcv->receiveStart == 0 || walrcv->receivedTLI != tli) + if (!XLogRecPtrIsValid(walrcv->receiveStart) || walrcv->receivedTLI != tli) { walrcv->flushedUpto = recptr; walrcv->receivedTLI = tli; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 9fa8beb6103d3..96cede8f45a94 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -51,6 +51,7 @@ #include "access/timeline.h" #include "access/transam.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" @@ -60,11 +61,11 @@ #include "backup/basebackup_incremental.h" #include "catalog/pg_authid.h" #include "catalog/pg_type.h" -#include "commands/dbcommands.h" #include "commands/defrem.h" #include "funcapi.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" +#include "libpq/protocol.h" #include "miscadmin.h" #include "nodes/replnodes.h" #include "pgstat.h" @@ -84,11 +85,13 @@ #include "storage/ipc.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "storage/procarray.h" #include "tcop/dest.h" #include "tcop/tcopprot.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/guc.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_lsn.h" #include "utils/pgstat_internal.h" @@ -230,6 +233,20 @@ typedef struct int write_head; int read_heads[NUM_SYNC_REP_WAIT_MODE]; WalTimeSample last_read[NUM_SYNC_REP_WAIT_MODE]; + + /* + * Overflow entries for read heads that collide with the write head. + * + * When the cyclic buffer fills (write head is about to collide with a + * read head), we save that read head's current sample here and mark it as + * using overflow (read_heads[i] = -1). This allows the write head to + * continue advancing while the overflowed mode continues lag computation + * using the saved sample. + * + * Once the standby's reported LSN advances past the overflow entry's LSN, + * we transition back to normal buffer-based tracking. 
+ */ + WalTimeSample overflowed[NUM_SYNC_REP_WAIT_MODE]; } LagTracker; static LagTracker *lag_tracker; @@ -258,6 +275,7 @@ static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); static void ProcessStandbyHSFeedbackMessage(void); +static void ProcessStandbyPSRequestMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); static void WalSndKeepalive(bool requestReply, XLogRecPtr writePtr); @@ -408,7 +426,7 @@ IdentifySystem(void) else logptr = GetFlushRecPtr(&currTLI); - snprintf(xloc, sizeof(xloc), "%X/%X", LSN_FORMAT_ARGS(logptr)); + snprintf(xloc, sizeof(xloc), "%X/%08X", LSN_FORMAT_ARGS(logptr)); if (MyDatabaseId != InvalidOid) { @@ -511,11 +529,11 @@ ReadReplicationSlot(ReadReplicationSlotCmd *cmd) i++; /* start LSN */ - if (!XLogRecPtrIsInvalid(slot_contents.data.restart_lsn)) + if (XLogRecPtrIsValid(slot_contents.data.restart_lsn)) { char xloc[64]; - snprintf(xloc, sizeof(xloc), "%X/%X", + snprintf(xloc, sizeof(xloc), "%X/%08X", LSN_FORMAT_ARGS(slot_contents.data.restart_lsn)); values[i] = CStringGetTextDatum(xloc); nulls[i] = false; @@ -523,7 +541,7 @@ ReadReplicationSlot(ReadReplicationSlotCmd *cmd) i++; /* timeline this WAL was produced on */ - if (!XLogRecPtrIsInvalid(slot_contents.data.restart_lsn)) + if (XLogRecPtrIsValid(slot_contents.data.restart_lsn)) { TimeLineID slots_position_timeline; TimeLineID current_timeline; @@ -733,13 +751,13 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: maxmsglen = PQ_LARGE_MESSAGE_LIMIT; break; - case 'c': /* CopyDone */ - case 'f': /* CopyFail */ - case 'H': /* Flush */ - case 'S': /* Sync */ + case PqMsg_CopyDone: + case PqMsg_CopyFail: + case PqMsg_Flush: + case PqMsg_Sync: maxmsglen = PQ_SMALL_MESSAGE_LIMIT; break; default: @@ -761,19 +779,19 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, /* Process the message */ switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: AppendIncrementalManifestData(ib, buf->data, buf->len); return true; - case 'c': /* CopyDone */ + case PqMsg_CopyDone: return false; - case 'H': /* Sync */ - case 'S': /* Flush */ + case PqMsg_Sync: + case PqMsg_Flush: /* Ignore these while in CopyOut mode as we do elsewhere. */ return true; - case 'f': + case PqMsg_CopyFail: ereport(ERROR, (errcode(ERRCODE_QUERY_CANCELED), errmsg("COPY from stdin failed: %s", @@ -888,16 +906,16 @@ StartReplication(StartReplicationCmd *cmd) * that's older than the switchpoint, if it's still in the same * WAL segment. 
*/ - if (!XLogRecPtrIsInvalid(switchpoint) && + if (XLogRecPtrIsValid(switchpoint) && switchpoint < cmd->startpoint) { ereport(ERROR, - (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", - LSN_FORMAT_ARGS(cmd->startpoint), - cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); + errmsg("requested starting point %X/%08X on timeline %u is not in this server's history", + LSN_FORMAT_ARGS(cmd->startpoint), + cmd->timeline), + errdetail("This server's history forked from timeline %u at %X/%08X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint))); } sendTimeLineValidUpto = switchpoint; } @@ -939,9 +957,9 @@ StartReplication(StartReplicationCmd *cmd) if (FlushPtr < cmd->startpoint) { ereport(ERROR, - (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", - LSN_FORMAT_ARGS(cmd->startpoint), - LSN_FORMAT_ARGS(FlushPtr)))); + errmsg("requested starting point %X/%08X is ahead of the WAL flush position of this server %X/%08X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr))); } /* Start streaming from the requested point */ @@ -983,7 +1001,7 @@ StartReplication(StartReplicationCmd *cmd) Datum values[2]; bool nulls[2] = {0}; - snprintf(startpos_str, sizeof(startpos_str), "%X/%X", + snprintf(startpos_str, sizeof(startpos_str), "%X/%08X", LSN_FORMAT_ARGS(sendTimeLineValidUpto)); dest = CreateDestReceiver(DestRemoteSimple); @@ -1134,8 +1152,8 @@ parseCreateReplSlotOptions(CreateReplicationSlotCmd *cmd, else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized value for CREATE_REPLICATION_SLOT option \"%s\": \"%s\"", - defel->defname, action))); + errmsg("unrecognized value for %s option \"%s\": \"%s\"", + "CREATE_REPLICATION_SLOT", defel->defname, action))); } else if (strcmp(defel->defname, "reserve_wal") == 0) { @@ -1279,6 +1297,13 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) need_full_snapshot = true; } + /* + * Ensure that logical decoding is enabled before initializing the + * logical decoding context. + */ + EnsureLogicalDecodingEnabled(); + Assert(IsLogicalDecodingEnabled()); + ctx = CreateInitDecodingContext(cmd->plugin, NIL, need_full_snapshot, InvalidXLogRecPtr, XL_ROUTINE(.page_read = logical_read_xlog_page, @@ -1324,7 +1349,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) ReplicationSlotPersist(); } - snprintf(xloc, sizeof(xloc), "%X/%X", + snprintf(xloc, sizeof(xloc), "%X/%08X", LSN_FORMAT_ARGS(MyReplicationSlot->data.confirmed_flush)); dest = CreateDestReceiver(DestRemoteSimple); @@ -1531,7 +1556,7 @@ WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xi resetStringInfo(ctx->out); - pq_sendbyte(ctx->out, 'w'); + pq_sendbyte(ctx->out, PqReplMsg_WALData); pq_sendint64(ctx->out, lsn); /* dataStart */ pq_sendint64(ctx->out, lsn); /* walEnd */ @@ -1567,7 +1592,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, tmpbuf.data, sizeof(int64)); /* output previously gathered data in a CopyData packet */ - pq_putmessage_noblock('d', ctx->out->data, ctx->out->len); + pq_putmessage_noblock(PqMsg_CopyData, ctx->out->data, ctx->out->len); CHECK_FOR_INTERRUPTS(); @@ -1809,7 +1834,7 @@ WalSndWaitForWal(XLogRecPtr loc) * receipt of WAL up to RecentFlushPtr. This is particularly interesting * if we're far behind.
*/ - if (!XLogRecPtrIsInvalid(RecentFlushPtr) && + if (XLogRecPtrIsValid(RecentFlushPtr) && !NeedToWaitForWal(loc, RecentFlushPtr, &wait_event)) return RecentFlushPtr; @@ -2289,7 +2314,8 @@ ProcessRepliesIfAny(void) switch (firstchar) { /* - * 'd' means a standby reply wrapped in a CopyData packet. + * PqMsg_CopyData means a standby reply wrapped in a CopyData + * packet. */ case PqMsg_CopyData: ProcessStandbyMessage(); @@ -2297,13 +2323,14 @@ ProcessRepliesIfAny(void) break; /* - * CopyDone means the standby requested to finish streaming. - * Reply with CopyDone, if we had not sent that already. + * PqMsg_CopyDone means the standby requested to finish + * streaming. Reply with CopyDone, if we had not sent that + * already. */ case PqMsg_CopyDone: if (!streamingDoneSending) { - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; } @@ -2312,7 +2339,8 @@ ProcessRepliesIfAny(void) break; /* - * 'X' means that the standby is closing down the socket. + * PqMsg_Terminate means that the standby is closing down the + * socket. */ case PqMsg_Terminate: proc_exit(0); @@ -2347,14 +2375,18 @@ ProcessStandbyMessage(void) switch (msgtype) { - case 'r': + case PqReplMsg_StandbyStatusUpdate: ProcessStandbyReplyMessage(); break; - case 'h': + case PqReplMsg_HotStandbyFeedback: ProcessStandbyHSFeedbackMessage(); break; + case PqReplMsg_PrimaryStatusRequest: + ProcessStandbyPSRequestMessage(); + break; + default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -2372,7 +2404,7 @@ PhysicalConfirmReceivedLocation(XLogRecPtr lsn) bool changed = false; ReplicationSlot *slot = MyReplicationSlot; - Assert(lsn != InvalidXLogRecPtr); + Assert(XLogRecPtrIsValid(lsn)); SpinLockAcquire(&slot->mutex); if (slot->data.restart_lsn != lsn) { @@ -2429,7 +2461,7 @@ ProcessStandbyReplyMessage(void) /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(replyTime)); - elog(DEBUG2, "write %X/%X flush %X/%X apply %X/%X%s reply_time %s", + elog(DEBUG2, "write %X/%08X flush %X/%08X apply %X/%08X%s reply_time %s", LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr), @@ -2494,7 +2526,7 @@ ProcessStandbyReplyMessage(void) /* * Advance our local xmin horizon when the client confirmed a flush. */ - if (MyReplicationSlot && flushPtr != InvalidXLogRecPtr) + if (MyReplicationSlot && XLogRecPtrIsValid(flushPtr)) { if (SlotIsLogical(MyReplicationSlot)) LogicalConfirmReceivedLocation(flushPtr); @@ -2701,6 +2733,71 @@ ProcessStandbyHSFeedbackMessage(void) } } +/* + * Process the request for a primary status update message. + */ +static void +ProcessStandbyPSRequestMessage(void) +{ + XLogRecPtr lsn = InvalidXLogRecPtr; + TransactionId oldestXidInCommit; + TransactionId oldestGXidInCommit; + FullTransactionId nextFullXid; + FullTransactionId fullOldestXidInCommit; + WalSnd *walsnd = MyWalSnd; + TimestampTz replyTime; + + /* + * This shouldn't happen because we don't support getting primary status + * message from standby. + */ + if (RecoveryInProgress()) + elog(ERROR, "the primary status is unavailable during recovery"); + + replyTime = pq_getmsgint64(&reply_message); + + /* + * Update shared state for this WalSender process based on reply data from + * standby. + */ + SpinLockAcquire(&walsnd->mutex); + walsnd->replyTime = replyTime; + SpinLockRelease(&walsnd->mutex); + + /* + * Consider transactions in the current database, as only these are the + * ones replicated. 
+ */ + oldestXidInCommit = GetOldestActiveTransactionId(true, false); + oldestGXidInCommit = TwoPhaseGetOldestXidInCommit(); + + /* + * Update the oldest xid for standby transmission if an older prepared + * transaction exists and is currently in commit phase. + */ + if (TransactionIdIsValid(oldestGXidInCommit) && + TransactionIdPrecedes(oldestGXidInCommit, oldestXidInCommit)) + oldestXidInCommit = oldestGXidInCommit; + + nextFullXid = ReadNextFullTransactionId(); + fullOldestXidInCommit = FullTransactionIdFromAllowableAt(nextFullXid, + oldestXidInCommit); + lsn = GetXLogWriteRecPtr(); + + elog(DEBUG2, "sending primary status"); + + /* construct the message... */ + resetStringInfo(&output_message); + pq_sendbyte(&output_message, PqReplMsg_PrimaryStatusUpdate); + pq_sendint64(&output_message, lsn); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(fullOldestXidInCommit)); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(nextFullXid)); + pq_sendint64(&output_message, GetCurrentTimestamp()); + + /* ... and send it wrapped in CopyData */ + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); +} + /* * Compute how long send/receive loops should sleep. * @@ -2984,7 +3081,7 @@ InitWalSenderSlot(void) SpinLockRelease(&walsnd->mutex); /* don't need the lock anymore */ - MyWalSnd = (WalSnd *) walsnd; + MyWalSnd = walsnd; break; } @@ -3246,12 +3343,12 @@ XLogSendPhysical(void) wal_segment_close(xlogreader); /* Send CopyDone */ - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; WalSndCaughtUp = true; - elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", + elog(DEBUG1, "walsender reached end of timeline at %X/%08X (sent up to %X/%08X)", LSN_FORMAT_ARGS(sendTimeLineValidUpto), LSN_FORMAT_ARGS(sentPtr)); return; @@ -3303,7 +3400,7 @@ XLogSendPhysical(void) * OK to read and send the slice. */ resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'w'); + pq_sendbyte(&output_message, PqReplMsg_WALData); pq_sendint64(&output_message, startptr); /* dataStart */ pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ @@ -3374,7 +3471,7 @@ XLogSendPhysical(void) memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], tmpbuf.data, sizeof(int64)); - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); sentPtr = endptr; @@ -3392,7 +3489,7 @@ XLogSendPhysical(void) { char activitymsg[50]; - snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%08X", LSN_FORMAT_ARGS(sentPtr)); set_ps_display(activitymsg); } @@ -3446,11 +3543,19 @@ XLogSendLogical(void) * If first time through in this session, initialize flushPtr. Otherwise, * we only need to update flushPtr if EndRecPtr is past it. */ - if (flushPtr == InvalidXLogRecPtr || + if (!XLogRecPtrIsValid(flushPtr) || logical_decoding_ctx->reader->EndRecPtr >= flushPtr) { + /* + * For cascading logical WAL senders, we use the replay LSN instead of + * the flush LSN, since logical decoding on a standby only processes + * WAL that has been replayed. This distinction becomes particularly + * important during shutdown, as new WAL is no longer replayed and the + * last replayed LSN marks the furthest point up to which decoding can + * proceed. 
+ */ if (am_cascading_walsender) - flushPtr = GetStandbyFlushRecPtr(NULL); + flushPtr = GetXLogReplayRecPtr(NULL); else flushPtr = GetFlushRecPtr(NULL); } @@ -3499,8 +3604,8 @@ WalSndDone(WalSndSendDataCallback send_data) * flush location if valid, write otherwise. Tools like pg_receivewal will * usually (unless in synchronous mode) return an invalid flush location. */ - replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ? - MyWalSnd->write : MyWalSnd->flush; + replicatedPtr = XLogRecPtrIsValid(MyWalSnd->flush) ? + MyWalSnd->flush : MyWalSnd->write; if (WalSndCaughtUp && sentPtr == replicatedPtr && !pq_is_send_pending()) @@ -3875,7 +3980,7 @@ WalSndGetStateString(WalSndState state) static Interval * offset_to_interval(TimeOffset offset) { - Interval *result = palloc(sizeof(Interval)); + Interval *result = palloc_object(Interval); result->month = 0; result->day = 0; @@ -3975,19 +4080,19 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS) { values[1] = CStringGetTextDatum(WalSndGetStateString(state)); - if (XLogRecPtrIsInvalid(sent_ptr)) + if (!XLogRecPtrIsValid(sent_ptr)) nulls[2] = true; values[2] = LSNGetDatum(sent_ptr); - if (XLogRecPtrIsInvalid(write)) + if (!XLogRecPtrIsValid(write)) nulls[3] = true; values[3] = LSNGetDatum(write); - if (XLogRecPtrIsInvalid(flush)) + if (!XLogRecPtrIsValid(flush)) nulls[4] = true; values[4] = LSNGetDatum(flush); - if (XLogRecPtrIsInvalid(apply)) + if (!XLogRecPtrIsValid(apply)) nulls[5] = true; values[5] = LSNGetDatum(apply); @@ -3996,7 +4101,7 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS) * which always returns an invalid flush location, as an * asynchronous standby. */ - priority = XLogRecPtrIsInvalid(flush) ? 0 : priority; + priority = XLogRecPtrIsValid(flush) ? priority : 0; if (writeLag < 0) nulls[6] = true; @@ -4066,13 +4171,13 @@ WalSndKeepalive(bool requestReply, XLogRecPtr writePtr) /* construct the message... */ resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'k'); - pq_sendint64(&output_message, XLogRecPtrIsInvalid(writePtr) ? sentPtr : writePtr); + pq_sendbyte(&output_message, PqReplMsg_Keepalive); + pq_sendint64(&output_message, XLogRecPtrIsValid(writePtr) ? writePtr : sentPtr); pq_sendint64(&output_message, GetCurrentTimestamp()); pq_sendbyte(&output_message, requestReply ? 1 : 0); /* ... and send it wrapped in CopyData */ - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); /* Set local flag */ if (requestReply) @@ -4123,7 +4228,6 @@ WalSndKeepaliveIfNecessary(void) static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) { - bool buffer_full; int new_write_head; int i; @@ -4145,25 +4249,19 @@ LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) * of space. */ new_write_head = (lag_tracker->write_head + 1) % LAG_TRACKER_BUFFER_SIZE; - buffer_full = false; for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; ++i) { + /* + * If the buffer is full, move the slowest reader to a separate + * overflow entry and free its space in the buffer so the write head + * can advance. + */ if (new_write_head == lag_tracker->read_heads[i]) - buffer_full = true; - } - - /* - * If the buffer is full, for now we just rewind by one slot and overwrite - * the last sample, as a simple (if somewhat uneven) way to lower the - * sampling rate. There may be better adaptive compaction algorithms. 
- */ - if (buffer_full) - { - new_write_head = lag_tracker->write_head; - if (lag_tracker->write_head > 0) - lag_tracker->write_head--; - else - lag_tracker->write_head = LAG_TRACKER_BUFFER_SIZE - 1; + { + lag_tracker->overflowed[i] = + lag_tracker->buffer[lag_tracker->read_heads[i]]; + lag_tracker->read_heads[i] = -1; + } } /* Store a sample at the current write head position. */ @@ -4190,6 +4288,28 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) { TimestampTz time = 0; + /* + * If 'lsn' has not passed the WAL position stored in the overflow entry, + * return the elapsed time (in microseconds) since the saved local flush + * time. If the flush time is in the future (due to clock drift), return + * -1 to treat as no valid sample. + * + * Otherwise, switch back to using the buffer to control the read head and + * compute the elapsed time. The read head is then reset to point to the + * oldest entry in the buffer. + */ + if (lag_tracker->read_heads[head] == -1) + { + if (lag_tracker->overflowed[head].lsn > lsn) + return (now >= lag_tracker->overflowed[head].time) ? + now - lag_tracker->overflowed[head].time : -1; + + time = lag_tracker->overflowed[head].time; + lag_tracker->last_read[head] = lag_tracker->overflowed[head]; + lag_tracker->read_heads[head] = + (lag_tracker->write_head + 1) % LAG_TRACKER_BUFFER_SIZE; + } + /* Read all unread samples up to this LSN or end of buffer. */ while (lag_tracker->read_heads[head] != lag_tracker->write_head && lag_tracker->buffer[lag_tracker->read_heads[head]].lsn <= lsn) diff --git a/src/backend/rewrite/rewriteDefine.c b/src/backend/rewrite/rewriteDefine.c index 8aa90b0d6fb75..a96fbdc1ddd64 100644 --- a/src/backend/rewrite/rewriteDefine.c +++ b/src/backend/rewrite/rewriteDefine.c @@ -725,10 +725,9 @@ EnableDisableRule(Relation rel, const char *rulename, /* * Change ev_enabled if it is different from the desired new state. */ - if (DatumGetChar(ruleform->ev_enabled) != - fires_when) + if (ruleform->ev_enabled != fires_when) { - ruleform->ev_enabled = CharGetDatum(fires_when); + ruleform->ev_enabled = fires_when; CatalogTupleUpdate(pg_rewrite_desc, &ruletup->t_self, ruletup); changed = true; diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index f0bce5f9ed957..0852322cc588c 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -592,7 +592,10 @@ rewriteRuleAction(Query *parsetree, } } - /* OK, it's safe to combine the CTE lists */ + /* + * OK, it's safe to combine the CTE lists. Beware that RewriteQuery + * knows we concatenate the lists in this order. + */ sub_action->cteList = list_concat(sub_action->cteList, copyObject(parsetree->cteList)); /* ... and don't forget about the associated flags */ @@ -923,8 +926,9 @@ rewriteTargetListIU(List *targetList, apply_default = true; /* - * Can only insert DEFAULT into generated columns, regardless of - * any OVERRIDING clauses. + * Can only insert DEFAULT into generated columns. (The + * OVERRIDING clause does not apply to generated columns, so we + * don't consider it here.) */ if (att_tup->attgenerated && !apply_default) { @@ -2616,7 +2620,7 @@ view_col_is_auto_updatable(RangeTblRef *rtr, TargetEntry *tle) * view_query_is_auto_updatable - test whether the specified view definition * represents an auto-updatable view. Returns NULL (if the view can be updated) * or a message string giving the reason that it cannot be. 
- + * * The returned string has not been translated; if it is shown as an error * message, the caller should apply _() to translate it. * @@ -3780,7 +3784,7 @@ rewriteTargetView(Query *parsetree, Relation view) parsetree->hasSubLinks = checkExprHasSubLink(viewqual); } else - AddQual(parsetree, (Node *) viewqual); + AddQual(parsetree, viewqual); } /* @@ -3871,9 +3875,13 @@ rewriteTargetView(Query *parsetree, Relation view) * orig_rt_length is the length of the originating query's rtable, for product * queries created by fireRules(), and 0 otherwise. This is used to skip any * already-processed VALUES RTEs from the original query. + * + * num_ctes_processed is the number of CTEs at the end of the query's cteList + * that have already been rewritten, and must not be rewritten again. */ static List * -RewriteQuery(Query *parsetree, List *rewrite_events, int orig_rt_length) +RewriteQuery(Query *parsetree, List *rewrite_events, int orig_rt_length, + int num_ctes_processed) { CmdType event = parsetree->commandType; bool instead = false; @@ -3887,17 +3895,29 @@ RewriteQuery(Query *parsetree, List *rewrite_events, int orig_rt_length) * First, recursively process any insert/update/delete/merge statements in * WITH clauses. (We have to do this first because the WITH clauses may * get copied into rule actions below.) + * + * Any new WITH clauses from rule actions are processed when we recurse + * into product queries below. However, when recursing, we must take care + * to avoid rewriting a CTE query more than once (because expanding + * generated columns in the targetlist more than once would fail). Since + * new CTEs from product queries are added to the start of the list (see + * rewriteRuleAction), we just skip the last num_ctes_processed items. */ foreach(lc1, parsetree->cteList) { CommonTableExpr *cte = lfirst_node(CommonTableExpr, lc1); Query *ctequery = castNode(Query, cte->ctequery); + int i = foreach_current_index(lc1); List *newstuff; + /* Skip already-processed CTEs at the end of the list */ + if (i >= list_length(parsetree->cteList) - num_ctes_processed) + break; + if (ctequery->commandType == CMD_SELECT) continue; - newstuff = RewriteQuery(ctequery, rewrite_events, 0); + newstuff = RewriteQuery(ctequery, rewrite_events, 0, 0); /* * Currently we can only handle unconditional, single-statement DO @@ -3957,6 +3977,7 @@ RewriteQuery(Query *parsetree, List *rewrite_events, int orig_rt_length) errmsg("multi-statement DO INSTEAD rules are not supported for data-modifying statements in WITH"))); } } + num_ctes_processed = list_length(parsetree->cteList); /* * If the statement is an insert, update, delete, or merge, adjust its @@ -4266,7 +4287,7 @@ RewriteQuery(Query *parsetree, List *rewrite_events, int orig_rt_length) RelationGetRelationName(rt_entry_relation)))); } - rev = (rewrite_event *) palloc(sizeof(rewrite_event)); + rev = palloc_object(rewrite_event); rev->relation = RelationGetRelid(rt_entry_relation); rev->event = event; rewrite_events = lappend(rewrite_events, rev); @@ -4288,7 +4309,8 @@ RewriteQuery(Query *parsetree, List *rewrite_events, int orig_rt_length) newstuff = RewriteQuery(pt, rewrite_events, pt == parsetree ? 
orig_rt_length : - product_orig_rt_length); + product_orig_rt_length, + num_ctes_processed); rewritten = list_concat(rewritten, newstuff); } @@ -4544,7 +4566,7 @@ build_generation_expression(Relation rel, int attrno) List * QueryRewrite(Query *parsetree) { - uint64 input_query_id = parsetree->queryId; + int64 input_query_id = parsetree->queryId; List *querylist; List *results; ListCell *l; @@ -4563,7 +4585,7 @@ QueryRewrite(Query *parsetree) * * Apply all non-SELECT rules possibly getting 0 or many queries */ - querylist = RewriteQuery(parsetree, NIL, 0); + querylist = RewriteQuery(parsetree, NIL, 0, 0); /* * Step 2 diff --git a/src/backend/rewrite/rewriteManip.c b/src/backend/rewrite/rewriteManip.c index cd786aa4112b5..f3c2886ed5421 100644 --- a/src/backend/rewrite/rewriteManip.c +++ b/src/backend/rewrite/rewriteManip.c @@ -542,8 +542,6 @@ offset_relid_set(Relids relids, int offset) * (identified by sublevels_up and rt_index), and change their varno fields * to 'new_index'. The varnosyn fields are changed too. Also, adjust other * nodes that contain rangetable indexes, such as RangeTblRef and JoinExpr. - * Specifying 'change_RangeTblRef' to false allows skipping RangeTblRef. - * See ChangeVarNodesExtended for details. * * NOTE: although this has the form of a walker, we cheat and modify the * nodes in-place. The given expression tree should have been copied @@ -664,17 +662,16 @@ ChangeVarNodes_walker(Node *node, ChangeVarNodes_context *context) } /* - * ChangeVarNodesExtended - similar to ChangeVarNodes, but with an additional + * ChangeVarNodesExtended - similar to ChangeVarNodes, but with an additional * 'callback' param * - * ChangeVarNodes changes a given node and all of its underlying nodes. - * This version of function additionally takes a callback, which has a - * chance to process a node before ChangeVarNodes_walker. A callback - * returns a boolean value indicating if given node should be skipped from - * further processing by ChangeVarNodes_walker. The callback is called - * only for expressions and other children nodes of a Query processed by - * a walker. Initial processing of the root Query doesn't involve the - * callback. + * ChangeVarNodes changes a given node and all of its underlying nodes. This + * version of function additionally takes a callback, which has a chance to + * process a node before ChangeVarNodes_walker. A callback returns a boolean + * value indicating if the given node should be skipped from further processing + * by ChangeVarNodes_walker. The callback is called only for expressions and + * other children nodes of a Query processed by a walker. Initial processing + * of the root Query doesn't involve the callback. */ void ChangeVarNodesExtended(Node *node, int rt_index, int new_index, @@ -1593,7 +1590,7 @@ map_variable_attnos_mutator(Node *node, var->varlevelsup == context->sublevels_up) { /* Found a matching variable, make the substitution */ - Var *newvar = (Var *) palloc(sizeof(Var)); + Var *newvar = palloc_object(Var); int attno = var->varattno; *newvar = *var; /* initially copy all fields of the Var */ @@ -1664,7 +1661,7 @@ map_variable_attnos_mutator(Node *node, context->to_rowtype != var->vartype) { ConvertRowtypeExpr *newnode; - Var *newvar = (Var *) palloc(sizeof(Var)); + Var *newvar = palloc_object(Var); /* whole-row variable, warn caller */ *(context->found_whole_row) = true; @@ -1677,7 +1674,7 @@ map_variable_attnos_mutator(Node *node, /* Var itself is changed to the requested type. 
*/ newvar->vartype = context->to_rowtype; - newnode = (ConvertRowtypeExpr *) palloc(sizeof(ConvertRowtypeExpr)); + newnode = palloc_object(ConvertRowtypeExpr); *newnode = *r; /* initially copy all fields of the CRE */ newnode->arg = (Expr *) newvar; diff --git a/src/backend/rewrite/rewriteSearchCycle.c b/src/backend/rewrite/rewriteSearchCycle.c index 19b89dee0d096..5202ef43d1068 100644 --- a/src/backend/rewrite/rewriteSearchCycle.c +++ b/src/backend/rewrite/rewriteSearchCycle.c @@ -282,8 +282,8 @@ rewriteSearchAndCycle(CommonTableExpr *cte) newrte = makeNode(RangeTblEntry); newrte->rtekind = RTE_SUBQUERY; - newrte->alias = makeAlias("*TLOCRN*", cte->ctecolnames); - newrte->eref = newrte->alias; + newrte->alias = NULL; + newrte->eref = makeAlias("*TLOCRN*", cte->ctecolnames); newsubquery = copyObject(rte1->subquery); IncrementVarSublevelsUp((Node *) newsubquery, 1, 1); newrte->subquery = newsubquery; @@ -320,7 +320,7 @@ rewriteSearchAndCycle(CommonTableExpr *cte) if (cte->search_clause->search_breadth_first) { search_col_rowexpr->args = lcons(makeConst(INT8OID, -1, InvalidOid, sizeof(int64), - Int64GetDatum(0), false, FLOAT8PASSBYVAL), + Int64GetDatum(0), false, true), search_col_rowexpr->args); search_col_rowexpr->colnames = lcons(makeString("*DEPTH*"), search_col_rowexpr->colnames); texpr = (Expr *) search_col_rowexpr; @@ -379,8 +379,8 @@ rewriteSearchAndCycle(CommonTableExpr *cte) ewcl = lappend(ewcl, makeString(cte->cycle_clause->cycle_mark_column)); ewcl = lappend(ewcl, makeString(cte->cycle_clause->cycle_path_column)); } - newrte->alias = makeAlias("*TROCRN*", ewcl); - newrte->eref = newrte->alias; + newrte->alias = NULL; + newrte->eref = makeAlias("*TROCRN*", ewcl); /* * Find the reference to the recursive CTE in the right UNION subquery's diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c index e2b811a3806ec..d3c022ae54830 100644 --- a/src/backend/snowball/dict_snowball.c +++ b/src/backend/snowball/dict_snowball.c @@ -229,7 +229,7 @@ dsnowball_init(PG_FUNCTION_ARGS) bool stoploaded = false; ListCell *l; - d = (DictSnowball *) palloc0(sizeof(DictSnowball)); + d = palloc0_object(DictSnowball); foreach(l, dictoptions) { @@ -278,7 +278,7 @@ dsnowball_lexize(PG_FUNCTION_ARGS) char *in = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); char *txt = str_tolower(in, len, DEFAULT_COLLATION_OID); - TSLexeme *res = palloc0(sizeof(TSLexeme) * 2); + TSLexeme *res = palloc0_array(TSLexeme, 2); /* * Do not pass strings exceeding 1000 bytes to the stemmer, as they're diff --git a/src/backend/statistics/attribute_stats.c b/src/backend/statistics/attribute_stats.c index ab198076401b0..06bc1a05fc14b 100644 --- a/src/backend/statistics/attribute_stats.c +++ b/src/backend/statistics/attribute_stats.c @@ -19,9 +19,9 @@ #include "access/heapam.h" #include "catalog/indexing.h" -#include "catalog/pg_collation.h" +#include "catalog/namespace.h" #include "catalog/pg_operator.h" -#include "nodes/nodeFuncs.h" +#include "nodes/makefuncs.h" #include "statistics/statistics.h" #include "statistics/stat_utils.h" #include "utils/array.h" @@ -30,9 +30,10 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" -#define DEFAULT_NULL_FRAC Float4GetDatum(0.0) -#define DEFAULT_AVG_WIDTH Int32GetDatum(0) /* unknown */ -#define DEFAULT_N_DISTINCT Float4GetDatum(0.0) /* unknown */ +/* + * Positional argument numbers, names, and types for + * attribute_statistics_update() and pg_restore_attribute_stats(). 
+ */ enum attribute_stats_argnum { @@ -80,6 +81,11 @@ static struct StatsArgInfo attarginfo[] = [NUM_ATTRIBUTE_STATS_ARGS] = {0} }; +/* + * Positional argument numbers, names, and types for + * pg_clear_attribute_stats(). + */ + enum clear_attribute_stats_argnum { C_ATTRELSCHEMA_ARG = 0, @@ -99,24 +105,9 @@ static struct StatsArgInfo cleararginfo[] = }; static bool attribute_statistics_update(FunctionCallInfo fcinfo); -static Node *get_attr_expr(Relation rel, int attnum); -static void get_attr_stat_type(Oid reloid, AttrNumber attnum, - Oid *atttypid, int32 *atttypmod, - char *atttyptype, Oid *atttypcoll, - Oid *eq_opr, Oid *lt_opr); -static bool get_elem_stat_type(Oid atttypid, char atttyptype, - Oid *elemtypid, Oid *elem_eq_opr); -static Datum text_to_stavalues(const char *staname, FmgrInfo *array_in, Datum d, - Oid typid, int32 typmod, bool *ok); -static void set_stats_slot(Datum *values, bool *nulls, bool *replaces, - int16 stakind, Oid staop, Oid stacoll, - Datum stanumbers, bool stanumbers_isnull, - Datum stavalues, bool stavalues_isnull); static void upsert_pg_statistic(Relation starel, HeapTuple oldtup, - Datum *values, bool *nulls, bool *replaces); + const Datum *values, const bool *nulls, const bool *replaces); static bool delete_pg_statistic(Oid reloid, AttrNumber attnum, bool stainherit); -static void init_empty_stats_tuple(Oid reloid, int16 attnum, bool inherited, - Datum *values, bool *nulls, bool *replaces); /* * Insert or Update Attribute Statistics @@ -143,6 +134,7 @@ attribute_statistics_update(FunctionCallInfo fcinfo) char *attname; AttrNumber attnum; bool inherited; + Oid locked_table = InvalidOid; Relation starel; HeapTuple statup; @@ -182,8 +174,6 @@ attribute_statistics_update(FunctionCallInfo fcinfo) nspname = TextDatumGetCString(PG_GETARG_DATUM(ATTRELSCHEMA_ARG)); relname = TextDatumGetCString(PG_GETARG_DATUM(ATTRELNAME_ARG)); - reloid = stats_lookup_relid(nspname, relname); - if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -191,7 +181,9 @@ attribute_statistics_update(FunctionCallInfo fcinfo) errhint("Statistics cannot be modified during recovery."))); /* lock before looking up attribute */ - stats_lock_check_privileges(reloid); + reloid = RangeVarGetRelidExtended(makeRangeVar(nspname, relname, -1), + ShareUpdateExclusiveLock, 0, + RangeVarCallbackForStats, &locked_table); /* user can specify either attname or attnum, but not both */ if (!PG_ARGISNULL(ATTNAME_ARG)) @@ -199,7 +191,7 @@ attribute_statistics_update(FunctionCallInfo fcinfo) if (!PG_ARGISNULL(ATTNUM_ARG)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("cannot specify both attname and attnum"))); + errmsg("cannot specify both \"%s\" and \"%s\"", "attname", "attnum"))); attname = TextDatumGetCString(PG_GETARG_DATUM(ATTNAME_ARG)); attnum = get_attnum(reloid, attname); /* note that this test covers attisdropped cases too: */ @@ -225,7 +217,7 @@ attribute_statistics_update(FunctionCallInfo fcinfo) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("must specify either attname or attnum"))); + errmsg("must specify either \"%s\" or \"%s\"", "attname", "attnum"))); attname = NULL; /* keep compiler quiet */ attnum = 0; } @@ -285,20 +277,21 @@ attribute_statistics_update(FunctionCallInfo fcinfo) } /* derive information from attribute */ - get_attr_stat_type(reloid, attnum, - &atttypid, &atttypmod, - &atttyptype, &atttypcoll, - &eq_opr, <_opr); + statatt_get_type(reloid, attnum, + &atttypid, &atttypmod, + &atttyptype, &atttypcoll, 
+ &eq_opr, <_opr); /* if needed, derive element type */ if (do_mcelem || do_dechist) { - if (!get_elem_stat_type(atttypid, atttyptype, - &elemtypid, &elem_eq_opr)) + if (!statatt_get_elem_type(atttypid, atttyptype, + &elemtypid, &elem_eq_opr)) { ereport(WARNING, - (errmsg("unable to determine element type of attribute \"%s\"", attname), - errdetail("Cannot set STATISTIC_KIND_MCELEM or STATISTIC_KIND_DECHIST."))); + (errmsg("could not determine element type of column \"%s\"", attname), + errdetail("Cannot set %s or %s.", + "STATISTIC_KIND_MCELEM", "STATISTIC_KIND_DECHIST"))); elemtypid = InvalidOid; elem_eq_opr = InvalidOid; @@ -313,8 +306,9 @@ attribute_statistics_update(FunctionCallInfo fcinfo) { ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not determine less-than operator for attribute \"%s\"", attname), - errdetail("Cannot set STATISTIC_KIND_HISTOGRAM or STATISTIC_KIND_CORRELATION."))); + errmsg("could not determine less-than operator for column \"%s\"", attname), + errdetail("Cannot set %s or %s.", + "STATISTIC_KIND_HISTOGRAM", "STATISTIC_KIND_CORRELATION"))); do_histogram = false; do_correlation = false; @@ -327,8 +321,9 @@ attribute_statistics_update(FunctionCallInfo fcinfo) { ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("attribute \"%s\" is not a range type", attname), - errdetail("Cannot set STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM or STATISTIC_KIND_BOUNDS_HISTOGRAM."))); + errmsg("column \"%s\" is not a range type", attname), + errdetail("Cannot set %s or %s.", + "STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM", "STATISTIC_KIND_BOUNDS_HISTOGRAM"))); do_bounds_histogram = false; do_range_length_histogram = false; @@ -339,14 +334,14 @@ attribute_statistics_update(FunctionCallInfo fcinfo) starel = table_open(StatisticRelationId, RowExclusiveLock); - statup = SearchSysCache3(STATRELATTINH, reloid, attnum, inherited); + statup = SearchSysCache3(STATRELATTINH, ObjectIdGetDatum(reloid), Int16GetDatum(attnum), BoolGetDatum(inherited)); /* initialize from existing tuple if exists */ if (HeapTupleIsValid(statup)) heap_deform_tuple(statup, RelationGetDescr(starel), values, nulls); else - init_empty_stats_tuple(reloid, attnum, inherited, values, nulls, - replaces); + statatt_init_empty_tuple(reloid, attnum, inherited, values, nulls, + replaces); /* if specified, set to argument values */ if (!PG_ARGISNULL(NULL_FRAC_ARG)) @@ -370,18 +365,18 @@ attribute_statistics_update(FunctionCallInfo fcinfo) { bool converted; Datum stanumbers = PG_GETARG_DATUM(MOST_COMMON_FREQS_ARG); - Datum stavalues = text_to_stavalues("most_common_vals", - &array_in_fn, - PG_GETARG_DATUM(MOST_COMMON_VALS_ARG), - atttypid, atttypmod, - &converted); + Datum stavalues = statatt_build_stavalues("most_common_vals", + &array_in_fn, + PG_GETARG_DATUM(MOST_COMMON_VALS_ARG), + atttypid, atttypmod, + &converted); if (converted) { - set_stats_slot(values, nulls, replaces, - STATISTIC_KIND_MCV, - eq_opr, atttypcoll, - stanumbers, false, stavalues, false); + statatt_set_slot(values, nulls, replaces, + STATISTIC_KIND_MCV, + eq_opr, atttypcoll, + stanumbers, false, stavalues, false); } else result = false; @@ -393,18 +388,18 @@ attribute_statistics_update(FunctionCallInfo fcinfo) Datum stavalues; bool converted = false; - stavalues = text_to_stavalues("histogram_bounds", - &array_in_fn, - PG_GETARG_DATUM(HISTOGRAM_BOUNDS_ARG), - atttypid, atttypmod, - &converted); + stavalues = statatt_build_stavalues("histogram_bounds", + &array_in_fn, + PG_GETARG_DATUM(HISTOGRAM_BOUNDS_ARG), + atttypid, 
atttypmod, + &converted); if (converted) { - set_stats_slot(values, nulls, replaces, - STATISTIC_KIND_HISTOGRAM, - lt_opr, atttypcoll, - 0, true, stavalues, false); + statatt_set_slot(values, nulls, replaces, + STATISTIC_KIND_HISTOGRAM, + lt_opr, atttypcoll, + 0, true, stavalues, false); } else result = false; @@ -417,10 +412,10 @@ attribute_statistics_update(FunctionCallInfo fcinfo) ArrayType *arry = construct_array_builtin(elems, 1, FLOAT4OID); Datum stanumbers = PointerGetDatum(arry); - set_stats_slot(values, nulls, replaces, - STATISTIC_KIND_CORRELATION, - lt_opr, atttypcoll, - stanumbers, false, 0, true); + statatt_set_slot(values, nulls, replaces, + STATISTIC_KIND_CORRELATION, + lt_opr, atttypcoll, + stanumbers, false, 0, true); } /* STATISTIC_KIND_MCELEM */ @@ -430,18 +425,18 @@ attribute_statistics_update(FunctionCallInfo fcinfo) bool converted = false; Datum stavalues; - stavalues = text_to_stavalues("most_common_elems", - &array_in_fn, - PG_GETARG_DATUM(MOST_COMMON_ELEMS_ARG), - elemtypid, atttypmod, - &converted); + stavalues = statatt_build_stavalues("most_common_elems", + &array_in_fn, + PG_GETARG_DATUM(MOST_COMMON_ELEMS_ARG), + elemtypid, atttypmod, + &converted); if (converted) { - set_stats_slot(values, nulls, replaces, - STATISTIC_KIND_MCELEM, - elem_eq_opr, atttypcoll, - stanumbers, false, stavalues, false); + statatt_set_slot(values, nulls, replaces, + STATISTIC_KIND_MCELEM, + elem_eq_opr, atttypcoll, + stanumbers, false, stavalues, false); } else result = false; @@ -452,10 +447,10 @@ attribute_statistics_update(FunctionCallInfo fcinfo) { Datum stanumbers = PG_GETARG_DATUM(ELEM_COUNT_HISTOGRAM_ARG); - set_stats_slot(values, nulls, replaces, - STATISTIC_KIND_DECHIST, - elem_eq_opr, atttypcoll, - stanumbers, false, 0, true); + statatt_set_slot(values, nulls, replaces, + STATISTIC_KIND_DECHIST, + elem_eq_opr, atttypcoll, + stanumbers, false, 0, true); } /* @@ -470,18 +465,18 @@ attribute_statistics_update(FunctionCallInfo fcinfo) bool converted = false; Datum stavalues; - stavalues = text_to_stavalues("range_bounds_histogram", - &array_in_fn, - PG_GETARG_DATUM(RANGE_BOUNDS_HISTOGRAM_ARG), - atttypid, atttypmod, - &converted); + stavalues = statatt_build_stavalues("range_bounds_histogram", + &array_in_fn, + PG_GETARG_DATUM(RANGE_BOUNDS_HISTOGRAM_ARG), + atttypid, atttypmod, + &converted); if (converted) { - set_stats_slot(values, nulls, replaces, - STATISTIC_KIND_BOUNDS_HISTOGRAM, - InvalidOid, InvalidOid, - 0, true, stavalues, false); + statatt_set_slot(values, nulls, replaces, + STATISTIC_KIND_BOUNDS_HISTOGRAM, + InvalidOid, InvalidOid, + 0, true, stavalues, false); } else result = false; @@ -498,17 +493,17 @@ attribute_statistics_update(FunctionCallInfo fcinfo) bool converted = false; Datum stavalues; - stavalues = text_to_stavalues("range_length_histogram", - &array_in_fn, - PG_GETARG_DATUM(RANGE_LENGTH_HISTOGRAM_ARG), - FLOAT8OID, 0, &converted); + stavalues = statatt_build_stavalues("range_length_histogram", + &array_in_fn, + PG_GETARG_DATUM(RANGE_LENGTH_HISTOGRAM_ARG), + FLOAT8OID, 0, &converted); if (converted) { - set_stats_slot(values, nulls, replaces, - STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM, - Float8LessOperator, InvalidOid, - stanumbers, false, stavalues, false); + statatt_set_slot(values, nulls, replaces, + STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM, + Float8LessOperator, InvalidOid, + stanumbers, false, stavalues, false); } else result = false; @@ -523,297 +518,12 @@ attribute_statistics_update(FunctionCallInfo fcinfo) return result; } -/* - * If this relation is an 
index and that index has expressions in it, and - * the attnum specified is known to be an expression, then we must walk - * the list attributes up to the specified attnum to get the right - * expression. - */ -static Node * -get_attr_expr(Relation rel, int attnum) -{ - List *index_exprs; - ListCell *indexpr_item; - - /* relation is not an index */ - if (rel->rd_rel->relkind != RELKIND_INDEX && - rel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) - return NULL; - - index_exprs = RelationGetIndexExpressions(rel); - - /* index has no expressions to give */ - if (index_exprs == NIL) - return NULL; - - /* - * The index attnum points directly to a relation attnum, then it's not an - * expression attribute. - */ - if (rel->rd_index->indkey.values[attnum - 1] != 0) - return NULL; - - indexpr_item = list_head(rel->rd_indexprs); - - for (int i = 0; i < attnum - 1; i++) - if (rel->rd_index->indkey.values[i] == 0) - indexpr_item = lnext(rel->rd_indexprs, indexpr_item); - - if (indexpr_item == NULL) /* shouldn't happen */ - elog(ERROR, "too few entries in indexprs list"); - - return (Node *) lfirst(indexpr_item); -} - -/* - * Derive type information from the attribute. - */ -static void -get_attr_stat_type(Oid reloid, AttrNumber attnum, - Oid *atttypid, int32 *atttypmod, - char *atttyptype, Oid *atttypcoll, - Oid *eq_opr, Oid *lt_opr) -{ - Relation rel = relation_open(reloid, AccessShareLock); - Form_pg_attribute attr; - HeapTuple atup; - Node *expr; - TypeCacheEntry *typcache; - - atup = SearchSysCache2(ATTNUM, ObjectIdGetDatum(reloid), - Int16GetDatum(attnum)); - - /* Attribute not found */ - if (!HeapTupleIsValid(atup)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("attribute %d of relation \"%s\" does not exist", - attnum, RelationGetRelationName(rel)))); - - attr = (Form_pg_attribute) GETSTRUCT(atup); - - if (attr->attisdropped) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("attribute %d of relation \"%s\" does not exist", - attnum, RelationGetRelationName(rel)))); - - expr = get_attr_expr(rel, attr->attnum); - - /* - * When analyzing an expression index, believe the expression tree's type - * not the column datatype --- the latter might be the opckeytype storage - * type of the opclass, which is not interesting for our purposes. This - * mimics the behavior of examine_attribute(). - */ - if (expr == NULL) - { - *atttypid = attr->atttypid; - *atttypmod = attr->atttypmod; - *atttypcoll = attr->attcollation; - } - else - { - *atttypid = exprType(expr); - *atttypmod = exprTypmod(expr); - - if (OidIsValid(attr->attcollation)) - *atttypcoll = attr->attcollation; - else - *atttypcoll = exprCollation(expr); - } - ReleaseSysCache(atup); - - /* - * If it's a multirange, step down to the range type, as is done by - * multirange_typanalyze(). - */ - if (type_is_multirange(*atttypid)) - *atttypid = get_multirange_range(*atttypid); - - /* finds the right operators even if atttypid is a domain */ - typcache = lookup_type_cache(*atttypid, TYPECACHE_LT_OPR | TYPECACHE_EQ_OPR); - *atttyptype = typcache->typtype; - *eq_opr = typcache->eq_opr; - *lt_opr = typcache->lt_opr; - - /* - * Special case: collation for tsvector is DEFAULT_COLLATION_OID. See - * compute_tsvector_stats(). - */ - if (*atttypid == TSVECTOROID) - *atttypcoll = DEFAULT_COLLATION_OID; - - relation_close(rel, NoLock); -} - -/* - * Derive element type information from the attribute type. 
- */ -static bool -get_elem_stat_type(Oid atttypid, char atttyptype, - Oid *elemtypid, Oid *elem_eq_opr) -{ - TypeCacheEntry *elemtypcache; - - if (atttypid == TSVECTOROID) - { - /* - * Special case: element type for tsvector is text. See - * compute_tsvector_stats(). - */ - *elemtypid = TEXTOID; - } - else - { - /* find underlying element type through any domain */ - *elemtypid = get_base_element_type(atttypid); - } - - if (!OidIsValid(*elemtypid)) - return false; - - /* finds the right operator even if elemtypid is a domain */ - elemtypcache = lookup_type_cache(*elemtypid, TYPECACHE_EQ_OPR); - if (!OidIsValid(elemtypcache->eq_opr)) - return false; - - *elem_eq_opr = elemtypcache->eq_opr; - - return true; -} - -/* - * Cast a text datum into an array with element type elemtypid. - * - * If an error is encountered, capture it and re-throw a WARNING, and set ok - * to false. If the resulting array contains NULLs, raise a WARNING and set ok - * to false. Otherwise, set ok to true. - */ -static Datum -text_to_stavalues(const char *staname, FmgrInfo *array_in, Datum d, Oid typid, - int32 typmod, bool *ok) -{ - LOCAL_FCINFO(fcinfo, 8); - char *s; - Datum result; - ErrorSaveContext escontext = {T_ErrorSaveContext}; - - escontext.details_wanted = true; - - s = TextDatumGetCString(d); - - InitFunctionCallInfoData(*fcinfo, array_in, 3, InvalidOid, - (Node *) &escontext, NULL); - - fcinfo->args[0].value = CStringGetDatum(s); - fcinfo->args[0].isnull = false; - fcinfo->args[1].value = ObjectIdGetDatum(typid); - fcinfo->args[1].isnull = false; - fcinfo->args[2].value = Int32GetDatum(typmod); - fcinfo->args[2].isnull = false; - - result = FunctionCallInvoke(fcinfo); - - pfree(s); - - if (escontext.error_occurred) - { - escontext.error_data->elevel = WARNING; - ThrowErrorData(escontext.error_data); - *ok = false; - return (Datum) 0; - } - - if (array_contains_nulls(DatumGetArrayTypeP(result))) - { - ereport(WARNING, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("\"%s\" array cannot contain NULL values", staname))); - *ok = false; - return (Datum) 0; - } - - *ok = true; - - return result; -} - -/* - * Find and update the slot with the given stakind, or use the first empty - * slot. 
- */ -static void -set_stats_slot(Datum *values, bool *nulls, bool *replaces, - int16 stakind, Oid staop, Oid stacoll, - Datum stanumbers, bool stanumbers_isnull, - Datum stavalues, bool stavalues_isnull) -{ - int slotidx; - int first_empty = -1; - AttrNumber stakind_attnum; - AttrNumber staop_attnum; - AttrNumber stacoll_attnum; - - /* find existing slot with given stakind */ - for (slotidx = 0; slotidx < STATISTIC_NUM_SLOTS; slotidx++) - { - stakind_attnum = Anum_pg_statistic_stakind1 - 1 + slotidx; - - if (first_empty < 0 && - DatumGetInt16(values[stakind_attnum]) == 0) - first_empty = slotidx; - if (DatumGetInt16(values[stakind_attnum]) == stakind) - break; - } - - if (slotidx >= STATISTIC_NUM_SLOTS && first_empty >= 0) - slotidx = first_empty; - - if (slotidx >= STATISTIC_NUM_SLOTS) - ereport(ERROR, - (errmsg("maximum number of statistics slots exceeded: %d", - slotidx + 1))); - - stakind_attnum = Anum_pg_statistic_stakind1 - 1 + slotidx; - staop_attnum = Anum_pg_statistic_staop1 - 1 + slotidx; - stacoll_attnum = Anum_pg_statistic_stacoll1 - 1 + slotidx; - - if (DatumGetInt16(values[stakind_attnum]) != stakind) - { - values[stakind_attnum] = Int16GetDatum(stakind); - replaces[stakind_attnum] = true; - } - if (DatumGetObjectId(values[staop_attnum]) != staop) - { - values[staop_attnum] = ObjectIdGetDatum(staop); - replaces[staop_attnum] = true; - } - if (DatumGetObjectId(values[stacoll_attnum]) != stacoll) - { - values[stacoll_attnum] = ObjectIdGetDatum(stacoll); - replaces[stacoll_attnum] = true; - } - if (!stanumbers_isnull) - { - values[Anum_pg_statistic_stanumbers1 - 1 + slotidx] = stanumbers; - nulls[Anum_pg_statistic_stanumbers1 - 1 + slotidx] = false; - replaces[Anum_pg_statistic_stanumbers1 - 1 + slotidx] = true; - } - if (!stavalues_isnull) - { - values[Anum_pg_statistic_stavalues1 - 1 + slotidx] = stavalues; - nulls[Anum_pg_statistic_stavalues1 - 1 + slotidx] = false; - replaces[Anum_pg_statistic_stavalues1 - 1 + slotidx] = true; - } -} - /* * Upsert the pg_statistic record. */ static void upsert_pg_statistic(Relation starel, HeapTuple oldtup, - Datum *values, bool *nulls, bool *replaces) + const Datum *values, const bool *nulls, const bool *replaces) { HeapTuple newtup; @@ -864,44 +574,6 @@ delete_pg_statistic(Oid reloid, AttrNumber attnum, bool stainherit) return result; } -/* - * Initialize values and nulls for a new stats tuple. 
- */ -static void -init_empty_stats_tuple(Oid reloid, int16 attnum, bool inherited, - Datum *values, bool *nulls, bool *replaces) -{ - memset(nulls, true, sizeof(bool) * Natts_pg_statistic); - memset(replaces, true, sizeof(bool) * Natts_pg_statistic); - - /* must initialize non-NULL attributes */ - - values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(reloid); - nulls[Anum_pg_statistic_starelid - 1] = false; - values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(attnum); - nulls[Anum_pg_statistic_staattnum - 1] = false; - values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(inherited); - nulls[Anum_pg_statistic_stainherit - 1] = false; - - values[Anum_pg_statistic_stanullfrac - 1] = DEFAULT_NULL_FRAC; - nulls[Anum_pg_statistic_stanullfrac - 1] = false; - values[Anum_pg_statistic_stawidth - 1] = DEFAULT_AVG_WIDTH; - nulls[Anum_pg_statistic_stawidth - 1] = false; - values[Anum_pg_statistic_stadistinct - 1] = DEFAULT_N_DISTINCT; - nulls[Anum_pg_statistic_stadistinct - 1] = false; - - /* initialize stakind, staop, and stacoll slots */ - for (int slotnum = 0; slotnum < STATISTIC_NUM_SLOTS; slotnum++) - { - values[Anum_pg_statistic_stakind1 + slotnum - 1] = (Datum) 0; - nulls[Anum_pg_statistic_stakind1 + slotnum - 1] = false; - values[Anum_pg_statistic_staop1 + slotnum - 1] = InvalidOid; - nulls[Anum_pg_statistic_staop1 + slotnum - 1] = false; - values[Anum_pg_statistic_stacoll1 + slotnum - 1] = InvalidOid; - nulls[Anum_pg_statistic_stacoll1 + slotnum - 1] = false; - } -} - /* * Delete statistics for the given attribute. */ @@ -914,6 +586,7 @@ pg_clear_attribute_stats(PG_FUNCTION_ARGS) char *attname; AttrNumber attnum; bool inherited; + Oid locked_table = InvalidOid; stats_check_required_arg(fcinfo, cleararginfo, C_ATTRELSCHEMA_ARG); stats_check_required_arg(fcinfo, cleararginfo, C_ATTRELNAME_ARG); @@ -923,15 +596,15 @@ pg_clear_attribute_stats(PG_FUNCTION_ARGS) nspname = TextDatumGetCString(PG_GETARG_DATUM(C_ATTRELSCHEMA_ARG)); relname = TextDatumGetCString(PG_GETARG_DATUM(C_ATTRELNAME_ARG)); - reloid = stats_lookup_relid(nspname, relname); - if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("Statistics cannot be modified during recovery."))); - stats_lock_check_privileges(reloid); + reloid = RangeVarGetRelidExtended(makeRangeVar(nspname, relname, -1), + ShareUpdateExclusiveLock, 0, + RangeVarCallbackForStats, &locked_table); attname = TextDatumGetCString(PG_GETARG_DATUM(C_ATTNAME_ARG)); attnum = get_attnum(reloid, attname); diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c index eb2fc4366b4a7..2aed867d5e7c8 100644 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -16,23 +16,17 @@ #include "access/htup_details.h" #include "catalog/pg_statistic_ext.h" #include "catalog/pg_statistic_ext_data.h" -#include "lib/stringinfo.h" #include "nodes/nodeFuncs.h" -#include "nodes/nodes.h" -#include "nodes/pathnodes.h" #include "optimizer/clauses.h" #include "optimizer/optimizer.h" #include "parser/parsetree.h" #include "statistics/extended_stats_internal.h" -#include "statistics/statistics.h" #include "utils/fmgroids.h" -#include "utils/fmgrprotos.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/selfuncs.h" #include "utils/syscache.h" #include "utils/typcache.h" -#include "varatt.h" /* size of the struct header fields (magic, type, ndeps) */ #define SizeOfHeader (3 * sizeof(uint32)) @@ -156,7 +150,7 @@ 
generate_dependencies_recurse(DependencyGenerator state, int index, static void generate_dependencies(DependencyGenerator state) { - AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k); + AttrNumber *current = palloc0_array(AttrNumber, state->k); generate_dependencies_recurse(state, 0, 0, current); @@ -177,8 +171,8 @@ DependencyGenerator_init(int n, int k) Assert((n >= k) && (k > 0)); /* allocate the DependencyGenerator state */ - state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData)); - state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber)); + state = palloc0_object(DependencyGeneratorData); + state->dependencies = palloc_array(AttrNumber, k); state->ndependencies = 0; state->current = 0; @@ -243,7 +237,7 @@ dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency) * Translate the array of indexes to regular attnums for the dependency * (we will need this to identify the columns in StatsBuildData). */ - attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber)); + attnums_dep = palloc_array(AttrNumber, k); for (i = 0; i < k; i++) attnums_dep[i] = data->attnums[dependency[i]]; @@ -408,8 +402,7 @@ statext_dependencies_build(StatsBuildData *data) /* initialize the list of dependencies */ if (dependencies == NULL) { - dependencies - = (MVDependencies *) palloc0(sizeof(MVDependencies)); + dependencies = palloc0_object(MVDependencies); dependencies->magic = STATS_DEPS_MAGIC; dependencies->type = STATS_DEPS_TYPE_BASIC; @@ -511,7 +504,7 @@ statext_dependencies_deserialize(bytea *data) VARSIZE_ANY_EXHDR(data), SizeOfHeader); /* read the MVDependencies header */ - dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies)); + dependencies = palloc0_object(MVDependencies); /* initialize pointer to the data part (skip the varlena header) */ tmp = VARDATA_ANY(data); @@ -643,91 +636,6 @@ statext_dependencies_load(Oid mvoid, bool inh) return result; } -/* - * pg_dependencies_in - input routine for type pg_dependencies. - * - * pg_dependencies is real enough to be a table column, but it has no operations - * of its own, and disallows input too - */ -Datum -pg_dependencies_in(PG_FUNCTION_ARGS) -{ - /* - * pg_node_list stores the data in binary form and parsing text input is - * not needed, so disallow this. - */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_dependencies"))); - - PG_RETURN_VOID(); /* keep compiler quiet */ -} - -/* - * pg_dependencies - output routine for type pg_dependencies. - */ -Datum -pg_dependencies_out(PG_FUNCTION_ARGS) -{ - bytea *data = PG_GETARG_BYTEA_PP(0); - MVDependencies *dependencies = statext_dependencies_deserialize(data); - int i, - j; - StringInfoData str; - - initStringInfo(&str); - appendStringInfoChar(&str, '{'); - - for (i = 0; i < dependencies->ndeps; i++) - { - MVDependency *dependency = dependencies->deps[i]; - - if (i > 0) - appendStringInfoString(&str, ", "); - - appendStringInfoChar(&str, '"'); - for (j = 0; j < dependency->nattributes; j++) - { - if (j == dependency->nattributes - 1) - appendStringInfoString(&str, " => "); - else if (j > 0) - appendStringInfoString(&str, ", "); - - appendStringInfo(&str, "%d", dependency->attributes[j]); - } - appendStringInfo(&str, "\": %f", dependency->degree); - } - - appendStringInfoChar(&str, '}'); - - PG_RETURN_CSTRING(str.data); -} - -/* - * pg_dependencies_recv - binary input routine for type pg_dependencies. 
- */ -Datum -pg_dependencies_recv(PG_FUNCTION_ARGS) -{ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_dependencies"))); - - PG_RETURN_VOID(); /* keep compiler quiet */ -} - -/* - * pg_dependencies_send - binary output routine for type pg_dependencies. - * - * Functional dependencies are serialized in a bytea value (although the type - * is named differently), so let's just send that. - */ -Datum -pg_dependencies_send(PG_FUNCTION_ARGS) -{ - return byteasend(fcinfo); -} - /* * dependency_is_compatible_clause * Determines if the clause is compatible with functional dependencies @@ -876,7 +784,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) * A boolean expression "x" can be interpreted as "x = true", so * proceed with seeing if it's a suitable Var. */ - clause_expr = (Node *) clause; + clause_expr = clause; } /* @@ -1050,7 +958,7 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses, * and mark all the corresponding clauses as estimated. */ nattrs = bms_num_members(attnums); - attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs); + attr_sel = palloc_array(Selectivity, nattrs); attidx = 0; i = -1; @@ -1303,7 +1211,7 @@ dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, N * A boolean expression "x" can be interpreted as "x = true", so * proceed with seeing if it's a suitable Var. */ - clause_expr = (Node *) clause; + clause_expr = clause; } /* @@ -1397,8 +1305,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root, if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) return 1.0; - list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * - list_length(clauses)); + list_attnums = palloc_array(AttrNumber, list_length(clauses)); /* * We allocate space as if every clause was a unique expression, although @@ -1406,7 +1313,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root, * we'll translate to attnums, and there might be duplicates. But it's * easier and cheaper to just do one allocation than repalloc later. */ - unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses)); + unique_exprs = palloc_array(Node *, list_length(clauses)); unique_exprs_cnt = 0; /* @@ -1559,8 +1466,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root, * make it just the right size, but it's likely wasteful anyway thanks to * moving the freed chunks to freelists etc. */ - func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) * - list_length(rel->statlist)); + func_dependencies = palloc_array(MVDependencies *, list_length(rel->statlist)); nfunc_dependencies = 0; total_ndeps = 0; @@ -1783,8 +1689,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root, * Work out which dependencies we can apply, starting with the * widest/strongest ones, and proceeding to smaller/weaker ones. 
*/ - dependencies = (MVDependency **) palloc(sizeof(MVDependency *) * - total_ndeps); + dependencies = palloc_array(MVDependency *, total_ndeps); ndependencies = 0; while (true) diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index a8b63ec0884a9..19778b773d20d 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -446,7 +446,7 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) Form_pg_statistic_ext staForm; List *exprs = NIL; - entry = palloc0(sizeof(StatExtEntry)); + entry = palloc0_object(StatExtEntry); staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); entry->statOid = staForm->oid; entry->schema = get_namespace_name(staForm->stxnamespace); @@ -532,7 +532,7 @@ examine_attribute(Node *expr) /* * Create the VacAttrStats struct. */ - stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats)); + stats = palloc0_object(VacAttrStats); stats->attstattarget = -1; /* @@ -613,7 +613,7 @@ examine_expression(Node *expr, int stattarget) /* * Create the VacAttrStats struct. */ - stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats)); + stats = palloc0_object(VacAttrStats); /* * We can't have statistics target specified for the expression, so we @@ -946,7 +946,7 @@ build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs) *numattrs = num; /* build attnums from the bitmapset */ - attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * num); + attnums = palloc_array(AttrNumber, num); i = 0; j = -1; while ((j = bms_next_member(attrs, j)) >= 0) @@ -986,10 +986,9 @@ build_sorted_items(StatsBuildData *data, int *nitems, { int i, j, - len, nrows; int nvalues = data->numrows * numattrs; - + Size len; SortItem *items; Datum *values; bool *isnull; @@ -997,14 +996,16 @@ build_sorted_items(StatsBuildData *data, int *nitems, int *typlen; /* Compute the total amount of memory we need (both items and values). */ - len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool)); + len = MAXALIGN(data->numrows * sizeof(SortItem)) + + nvalues * (sizeof(Datum) + sizeof(bool)); /* Allocate the memory and split it into the pieces. */ ptr = palloc0(len); /* items to sort */ items = (SortItem *) ptr; - ptr += data->numrows * sizeof(SortItem); + /* MAXALIGN ensures that the following Datums are suitably aligned */ + ptr += MAXALIGN(data->numrows * sizeof(SortItem)); /* values and null flags */ values = (Datum *) ptr; @@ -1027,7 +1028,7 @@ build_sorted_items(StatsBuildData *data, int *nitems, } /* build a local cache of typlen for all attributes */ - typlen = (int *) palloc(sizeof(int) * data->nattnums); + typlen = palloc_array(int, data->nattnums); for (i = 0; i < data->nattnums; i++) typlen[i] = get_typlen(data->stats[i]->attrtypid); @@ -1317,6 +1318,9 @@ choose_best_statistics(List *stats, char requiredkind, bool inh, * so we can't cope with system columns. * *exprs: input/output parameter collecting primitive subclauses within * the clause tree + * *leakproof: input/output parameter recording the leakproofness of the + * clause tree. This should be true initially, and will be set to false + * if any operator function used in an OpExpr is not leakproof. * * Returns false if there is something we definitively can't handle. * On true return, we can proceed to match the *exprs against statistics. 
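To make the new in/out contract concrete, a minimal sketch follows; it is illustrative only (toy types, not code from this patch) and shows how a recursive clause walk can thread a leakproof flag that starts out true and latches false as soon as any node's operator is leaky:

/*
 * Illustrative sketch only: threading an in/out "leakproof" latch
 * through a recursive walk, per the contract described above.
 */
#include <stdbool.h>
#include <stddef.h>

typedef struct Clause
{
    bool        op_leakproof;   /* is this node's operator leakproof? */
    struct Clause *left;
    struct Clause *right;
} Clause;

static bool
clause_walk(const Clause *c, bool *leakproof)
{
    if (c == NULL)
        return true;
    /* Once false, stays false: one leaky operator taints the whole tree. */
    if (*leakproof)
        *leakproof = c->op_leakproof;
    return clause_walk(c->left, leakproof) &&
           clause_walk(c->right, leakproof);
}

The caller initializes the flag to true before the walk and, as the following hunks show, falls back to per-column ACL checks only when the walk reports a leaky operator.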
@@ -1324,7 +1328,7 @@ choose_best_statistics(List *stats, char requiredkind, bool inh, static bool statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, Index relid, Bitmapset **attnums, - List **exprs) + List **exprs, bool *leakproof) { /* Look inside any binary-compatible relabeling (as in examine_variable) */ if (IsA(clause, RelabelType)) @@ -1359,7 +1363,6 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, /* (Var/Expr op Const) or (Const op Var/Expr) */ if (is_opclause(clause)) { - RangeTblEntry *rte = root->simple_rte_array[relid]; OpExpr *expr = (OpExpr *) clause; Node *clause_expr; @@ -1394,24 +1397,15 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, return false; } - /* - * If there are any securityQuals on the RTE from security barrier - * views or RLS policies, then the user may not have access to all the - * table's data, and we must check that the operator is leakproof. - * - * If the operator is leaky, then we must ignore this clause for the - * purposes of estimating with MCV lists, otherwise the operator might - * reveal values from the MCV list that the user doesn't have - * permission to see. - */ - if (rte->securityQuals != NIL && - !get_func_leakproof(get_opcode(expr->opno))) - return false; + /* Check if the operator is leakproof */ + if (*leakproof) + *leakproof = get_func_leakproof(get_opcode(expr->opno)); /* Check (Var op Const) or (Const op Var) clauses by recursing. */ if (IsA(clause_expr, Var)) return statext_is_compatible_clause_internal(root, clause_expr, - relid, attnums, exprs); + relid, attnums, + exprs, leakproof); /* Otherwise we have (Expr op Const) or (Const op Expr). */ *exprs = lappend(*exprs, clause_expr); @@ -1421,7 +1415,6 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, /* Var/Expr IN Array */ if (IsA(clause, ScalarArrayOpExpr)) { - RangeTblEntry *rte = root->simple_rte_array[relid]; ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause; Node *clause_expr; bool expronleft; @@ -1461,24 +1454,15 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, return false; } - /* - * If there are any securityQuals on the RTE from security barrier - * views or RLS policies, then the user may not have access to all the - * table's data, and we must check that the operator is leakproof. - * - * If the operator is leaky, then we must ignore this clause for the - * purposes of estimating with MCV lists, otherwise the operator might - * reveal values from the MCV list that the user doesn't have - * permission to see. - */ - if (rte->securityQuals != NIL && - !get_func_leakproof(get_opcode(expr->opno))) - return false; + /* Check if the operator is leakproof */ + if (*leakproof) + *leakproof = get_func_leakproof(get_opcode(expr->opno)); /* Check Var IN Array clauses by recursing. */ if (IsA(clause_expr, Var)) return statext_is_compatible_clause_internal(root, clause_expr, - relid, attnums, exprs); + relid, attnums, + exprs, leakproof); /* Otherwise we have Expr IN Array. */ *exprs = lappend(*exprs, clause_expr); @@ -1515,7 +1499,8 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, */ if (!statext_is_compatible_clause_internal(root, (Node *) lfirst(lc), - relid, attnums, exprs)) + relid, attnums, exprs, + leakproof)) return false; } @@ -1529,8 +1514,10 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, /* Check Var IS NULL clauses by recursing. 
*/ if (IsA(nt->arg, Var)) - return statext_is_compatible_clause_internal(root, (Node *) (nt->arg), - relid, attnums, exprs); + return statext_is_compatible_clause_internal(root, + (Node *) (nt->arg), + relid, attnums, + exprs, leakproof); /* Otherwise we have Expr IS NULL. */ *exprs = lappend(*exprs, nt->arg); @@ -1569,11 +1556,9 @@ static bool statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, Bitmapset **attnums, List **exprs) { - RangeTblEntry *rte = root->simple_rte_array[relid]; - RelOptInfo *rel = root->simple_rel_array[relid]; RestrictInfo *rinfo; int clause_relid; - Oid userid; + bool leakproof; /* * Special-case handling for bare BoolExpr AND clauses, because the @@ -1613,18 +1598,31 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, clause_relid != relid) return false; - /* Check the clause and determine what attributes it references. */ + /* + * Check the clause, determine what attributes it references, and whether + * it includes any non-leakproof operators. + */ + leakproof = true; if (!statext_is_compatible_clause_internal(root, (Node *) rinfo->clause, - relid, attnums, exprs)) + relid, attnums, exprs, + &leakproof)) return false; /* - * Check that the user has permission to read all required attributes. + * If the clause includes any non-leakproof operators, check that the user + * has permission to read all required attributes, otherwise the operators + * might reveal values from the MCV list that the user doesn't have + * permission to see. We require all rows to be selectable --- there must + * be no securityQuals from security barrier views or RLS policies. See + * similar code in examine_variable(), examine_simple_variable(), and + * statistic_proc_security_check(). + * + * Note that for an inheritance child, the permission checks are performed + * on the inheritance root parent, and whole-table select privilege on the + * parent doesn't guarantee that the user could read all columns of the + * child. Therefore we must check all referenced columns. */ - userid = OidIsValid(rel->userid) ? 
rel->userid : GetUserId(); - - /* Table-level SELECT privilege is sufficient for all columns */ - if (pg_class_aclcheck(rte->relid, userid, ACL_SELECT) != ACLCHECK_OK) + if (!leakproof) { Bitmapset *clause_attnums = NULL; int attnum = -1; @@ -1649,26 +1647,9 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, if (*exprs != NIL) pull_varattnos((Node *) *exprs, relid, &clause_attnums); - attnum = -1; - while ((attnum = bms_next_member(clause_attnums, attnum)) >= 0) - { - /* Undo the offset */ - AttrNumber attno = attnum + FirstLowInvalidHeapAttributeNumber; - - if (attno == InvalidAttrNumber) - { - /* Whole-row reference, so must have access to all columns */ - if (pg_attribute_aclcheck_all(rte->relid, userid, ACL_SELECT, - ACLMASK_ALL) != ACLCHECK_OK) - return false; - } - else - { - if (pg_attribute_aclcheck(rte->relid, attno, userid, - ACL_SELECT) != ACLCHECK_OK) - return false; - } - } + /* Must have permission to read all rows from these columns */ + if (!all_rows_selectable(root, relid, clause_attnums)) + return false; } /* If we reach here, the clause is OK */ @@ -1726,11 +1707,10 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV)) return sel; - list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * - list_length(clauses)); + list_attnums = palloc_array(Bitmapset *, list_length(clauses)); /* expressions extracted from complex expressions */ - list_exprs = (List **) palloc(sizeof(Node *) * list_length(clauses)); + list_exprs = palloc_array(List *, list_length(clauses)); /* * Pre-process the clauses list to extract the attnums and expressions @@ -2073,13 +2053,13 @@ examine_opclause_args(List *args, Node **exprp, Const **cstp, if (IsA(rightop, Const)) { - expr = (Node *) leftop; + expr = leftop; cst = (Const *) rightop; expronleft = true; } else if (IsA(leftop, Const)) { - expr = (Node *) rightop; + expr = rightop; cst = (Const *) leftop; expronleft = false; } @@ -2618,7 +2598,7 @@ make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows, } else { - result->values[idx][i] = (Datum) datum; + result->values[idx][i] = datum; result->nulls[idx][i] = false; } diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index d98cda698d941..ec650ba029f58 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -270,7 +270,7 @@ statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget) + sizeof(SortSupportData)); /* compute frequencies for values in each column */ - nfreqs = (int *) palloc0(sizeof(int) * numattrs); + nfreqs = palloc0_array(int, numattrs); freqs = build_column_frequencies(groups, ngroups, mss, nfreqs); /* @@ -294,8 +294,8 @@ statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget) /* just point to the proper place in the list */ MCVItem *item = &mcvlist->items[i]; - item->values = (Datum *) palloc(sizeof(Datum) * numattrs); - item->isnull = (bool *) palloc(sizeof(bool) * numattrs); + item->values = palloc_array(Datum, numattrs); + item->isnull = palloc_array(bool, numattrs); /* copy values for the group */ memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs); @@ -635,8 +635,8 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats) char *endptr PG_USED_FOR_ASSERTS_ONLY; /* values per dimension (and number of non-NULL values) */ - Datum **values = (Datum **) palloc0(sizeof(Datum *) * ndims); - int *counts = (int *) palloc0(sizeof(int) * ndims); + Datum 
**values = palloc0_array(Datum *, ndims); + int *counts = palloc0_array(int, ndims); /* * We'll include some rudimentary information about the attribute types @@ -646,10 +646,10 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats) * the statistics gets dropped automatically. We need to store the info * about the arrays of deduplicated values anyway. */ - info = (DimensionInfo *) palloc0(sizeof(DimensionInfo) * ndims); + info = palloc0_array(DimensionInfo, ndims); /* sort support data for all attributes included in the MCV list */ - ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims); + ssup = palloc0_array(SortSupportData, ndims); /* collect and deduplicate values for each dimension (attribute) */ for (dim = 0; dim < ndims; dim++) @@ -668,7 +668,7 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats) info[dim].typbyval = stats[dim]->attrtype->typbyval; /* allocate space for values in the attribute and collect them */ - values[dim] = (Datum *) palloc0(sizeof(Datum) * mcvlist->nitems); + values[dim] = palloc0_array(Datum, mcvlist->nitems); for (i = 0; i < mcvlist->nitems; i++) { @@ -767,7 +767,7 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats) values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i])); /* serialized length (uint32 length + data) */ - len = VARSIZE_ANY_EXHDR(values[dim][i]); + len = VARSIZE_ANY_EXHDR(DatumGetPointer(values[dim][i])); info[dim].nbytes += sizeof(uint32); /* length */ info[dim].nbytes += len; /* value (no header) */ @@ -1037,7 +1037,7 @@ statext_mcv_deserialize(bytea *data) /* pointer to the data part (skip the varlena header) */ raw = (char *) data; ptr = VARDATA_ANY(raw); - endptr = (char *) raw + VARSIZE_ANY(data); + endptr = raw + VARSIZE_ANY(data); /* get the header and perform further sanity checks */ memcpy(&mcvlist->magic, ptr, sizeof(uint32)); @@ -1134,11 +1134,11 @@ statext_mcv_deserialize(bytea *data) * original values (it might go away). 
*/ datalen = 0; /* space for by-ref data */ - map = (Datum **) palloc(ndims * sizeof(Datum *)); + map = palloc_array(Datum *, ndims); for (dim = 0; dim < ndims; dim++) { - map[dim] = (Datum *) palloc(sizeof(Datum) * info[dim].nvalues); + map[dim] = palloc_array(Datum, info[dim].nvalues); /* space needed for a copy of data for by-ref types */ datalen += info[dim].nbytes_aligned; @@ -1609,7 +1609,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, Assert(mcvlist->nitems > 0); Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS); - matches = palloc(sizeof(bool) * mcvlist->nitems); + matches = palloc_array(bool, mcvlist->nitems); memset(matches, !is_or, sizeof(bool) * mcvlist->nitems); /* @@ -2134,7 +2134,7 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat, /* build the OR-matches bitmap, if not built already */ if (*or_matches == NULL) - *or_matches = palloc0(sizeof(bool) * mcv->nitems); + *or_matches = palloc0_array(bool, mcv->nitems); /* build the match bitmap for the new clause */ new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys, diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c index 7e7a63405c8ba..58046d2bd6256 100644 --- a/src/backend/statistics/mvdistinct.c +++ b/src/backend/statistics/mvdistinct.c @@ -27,10 +27,7 @@ #include "catalog/pg_statistic_ext.h" #include "catalog/pg_statistic_ext_data.h" -#include "lib/stringinfo.h" #include "statistics/extended_stats_internal.h" -#include "statistics/statistics.h" -#include "utils/fmgrprotos.h" #include "utils/syscache.h" #include "utils/typcache.h" #include "varatt.h" @@ -113,7 +110,7 @@ statext_ndistinct_build(double totalrows, StatsBuildData *data) MVNDistinctItem *item = &result->items[itemcnt]; int j; - item->attributes = palloc(sizeof(AttrNumber) * k); + item->attributes = palloc_array(AttrNumber, k); item->nattributes = k; /* translate the indexes to attnums */ @@ -328,88 +325,6 @@ statext_ndistinct_deserialize(bytea *data) return ndistinct; } -/* - * pg_ndistinct_in - * input routine for type pg_ndistinct - * - * pg_ndistinct is real enough to be a table column, but it has no - * operations of its own, and disallows input (just like pg_node_tree). - */ -Datum -pg_ndistinct_in(PG_FUNCTION_ARGS) -{ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_ndistinct"))); - - PG_RETURN_VOID(); /* keep compiler quiet */ -} - -/* - * pg_ndistinct - * output routine for type pg_ndistinct - * - * Produces a human-readable representation of the value. - */ -Datum -pg_ndistinct_out(PG_FUNCTION_ARGS) -{ - bytea *data = PG_GETARG_BYTEA_PP(0); - MVNDistinct *ndist = statext_ndistinct_deserialize(data); - int i; - StringInfoData str; - - initStringInfo(&str); - appendStringInfoChar(&str, '{'); - - for (i = 0; i < ndist->nitems; i++) - { - int j; - MVNDistinctItem item = ndist->items[i]; - - if (i > 0) - appendStringInfoString(&str, ", "); - - for (j = 0; j < item.nattributes; j++) - { - AttrNumber attnum = item.attributes[j]; - - appendStringInfo(&str, "%s%d", (j == 0) ? 
"\"" : ", ", attnum); - } - appendStringInfo(&str, "\": %d", (int) item.ndistinct); - } - - appendStringInfoChar(&str, '}'); - - PG_RETURN_CSTRING(str.data); -} - -/* - * pg_ndistinct_recv - * binary input routine for type pg_ndistinct - */ -Datum -pg_ndistinct_recv(PG_FUNCTION_ARGS) -{ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_ndistinct"))); - - PG_RETURN_VOID(); /* keep compiler quiet */ -} - -/* - * pg_ndistinct_send - * binary output routine for type pg_ndistinct - * - * n-distinct is serialized into a bytea value, so let's send that. - */ -Datum -pg_ndistinct_send(PG_FUNCTION_ARGS) -{ - return byteasend(fcinfo); -} - /* * ndistinct_for_combination * Estimates number of distinct values in a combination of columns. @@ -444,9 +359,9 @@ ndistinct_for_combination(double totalrows, StatsBuildData *data, * using the specified column combination as dimensions. We could try to * sort in place, but it'd probably be more complex and bug-prone. */ - items = (SortItem *) palloc(numrows * sizeof(SortItem)); - values = (Datum *) palloc0(sizeof(Datum) * numrows * k); - isnull = (bool *) palloc0(sizeof(bool) * numrows * k); + items = palloc_array(SortItem, numrows); + values = palloc0_array(Datum, numrows * k); + isnull = palloc0_array(bool, numrows * k); for (i = 0; i < numrows; i++) { @@ -593,12 +508,12 @@ generator_init(int n, int k) Assert((n >= k) && (k > 0)); /* allocate the generator state as a single chunk of memory */ - state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator)); + state = palloc_object(CombinationGenerator); state->ncombinations = n_choose_k(n, k); /* pre-allocate space for all combinations */ - state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations); + state->combinations = palloc_array(int, k * state->ncombinations); state->current = 0; state->k = k; @@ -691,7 +606,7 @@ generate_combinations_recurse(CombinationGenerator *state, static void generate_combinations(CombinationGenerator *state) { - int *current = (int *) palloc0(sizeof(int) * state->k); + int *current = palloc0_array(int, state->k); generate_combinations_recurse(state, 0, 0, current); diff --git a/src/backend/statistics/relation_stats.c b/src/backend/statistics/relation_stats.c index cd3a75b621a0c..174da7d93a505 100644 --- a/src/backend/statistics/relation_stats.c +++ b/src/backend/statistics/relation_stats.c @@ -20,6 +20,7 @@ #include "access/heapam.h" #include "catalog/indexing.h" #include "catalog/namespace.h" +#include "nodes/makefuncs.h" #include "statistics/stat_utils.h" #include "utils/builtins.h" #include "utils/fmgroids.h" @@ -82,6 +83,7 @@ relation_statistics_update(FunctionCallInfo fcinfo) Datum values[4] = {0}; bool nulls[4] = {0}; int nreplaces = 0; + Oid locked_table = InvalidOid; stats_check_required_arg(fcinfo, relarginfo, RELSCHEMA_ARG); stats_check_required_arg(fcinfo, relarginfo, RELNAME_ARG); @@ -89,15 +91,15 @@ relation_statistics_update(FunctionCallInfo fcinfo) nspname = TextDatumGetCString(PG_GETARG_DATUM(RELSCHEMA_ARG)); relname = TextDatumGetCString(PG_GETARG_DATUM(RELNAME_ARG)); - reloid = stats_lookup_relid(nspname, relname); - if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("Statistics cannot be modified during recovery."))); - stats_lock_check_privileges(reloid); + reloid = RangeVarGetRelidExtended(makeRangeVar(nspname, relname, -1), + ShareUpdateExclusiveLock, 0, + RangeVarCallbackForStats, 
&locked_table); if (!PG_ARGISNULL(RELPAGES_ARG)) { @@ -112,7 +114,7 @@ relation_statistics_update(FunctionCallInfo fcinfo) { ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("reltuples cannot be < -1.0"))); + errmsg("argument \"%s\" must not be less than -1.0", "reltuples"))); result = false; } else diff --git a/src/backend/statistics/stat_utils.c b/src/backend/statistics/stat_utils.c index a9a3224efe6fd..b1c1514cc7297 100644 --- a/src/backend/statistics/stat_utils.c +++ b/src/backend/statistics/stat_utils.c @@ -16,12 +16,17 @@ #include "postgres.h" +#include "access/htup_details.h" #include "access/relation.h" #include "catalog/index.h" #include "catalog/namespace.h" +#include "catalog/pg_class.h" +#include "catalog/pg_collation.h" #include "catalog/pg_database.h" +#include "catalog/pg_statistic.h" #include "funcapi.h" #include "miscadmin.h" +#include "nodes/nodeFuncs.h" #include "statistics/stat_utils.h" #include "storage/lmgr.h" #include "utils/acl.h" @@ -29,6 +34,16 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/syscache.h" + +/* Default values assigned to new pg_statistic tuples. */ +#define DEFAULT_STATATT_NULL_FRAC Float4GetDatum(0.0) /* stanullfrac */ +#define DEFAULT_STATATT_AVG_WIDTH Int32GetDatum(0) /* stawidth, same as + * unknown */ +#define DEFAULT_STATATT_N_DISTINCT Float4GetDatum(0.0) /* stadistinct, same as + * unknown */ + +static Node *statatt_get_index_expr(Relation rel, int attnum); /* * Ensure that a given argument is not null. @@ -41,7 +56,7 @@ stats_check_required_arg(FunctionCallInfo fcinfo, if (PG_ARGISNULL(argnum)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("\"%s\" cannot be NULL", + errmsg("argument \"%s\" must not be null", arginfo[argnum].argname))); } @@ -68,7 +83,7 @@ stats_check_arg_array(FunctionCallInfo fcinfo, { ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("\"%s\" cannot be a multidimensional array", + errmsg("argument \"%s\" must not be a multidimensional array", arginfo[argnum].argname))); return false; } @@ -77,7 +92,7 @@ stats_check_arg_array(FunctionCallInfo fcinfo, { ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("\"%s\" array cannot contain NULL values", + errmsg("argument \"%s\" array must not contain null values", arginfo[argnum].argname))); return false; } @@ -108,7 +123,7 @@ stats_check_arg_pair(FunctionCallInfo fcinfo, ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("\"%s\" must be specified when \"%s\" is specified", + errmsg("argument \"%s\" must be specified when argument \"%s\" is specified", arginfo[nullarg].argname, arginfo[otherarg].argname))); @@ -119,53 +134,84 @@ stats_check_arg_pair(FunctionCallInfo fcinfo, } /* - * Lock relation in ShareUpdateExclusive mode, check privileges, and close the - * relation (but retain the lock). 
- * * A role has privileges to set statistics on the relation if any of the * following are true: * - the role owns the current database and the relation is not shared * - the role has the MAINTAIN privilege on the relation */ void -stats_lock_check_privileges(Oid reloid) +RangeVarCallbackForStats(const RangeVar *relation, + Oid relId, Oid oldRelId, void *arg) { - Relation table; - Oid table_oid = reloid; - Oid index_oid = InvalidOid; - LOCKMODE index_lockmode = NoLock; + Oid *locked_oid = (Oid *) arg; + Oid table_oid = relId; + HeapTuple tuple; + Form_pg_class form; + char relkind; /* - * For indexes, we follow the locking behavior in do_analyze_rel() and - * check_lock_if_inplace_updateable_rel(), which is to lock the table - * first in ShareUpdateExclusive mode and then the index in AccessShare - * mode. - * - * Partitioned indexes are treated differently than normal indexes in - * check_lock_if_inplace_updateable_rel(), so we take a - * ShareUpdateExclusive lock on both the partitioned table and the - * partitioned index. + * If we previously locked some other index's heap, and the name we're + * looking up no longer refers to that relation, release the now-useless + * lock. */ - switch (get_rel_relkind(reloid)) + if (relId != oldRelId && OidIsValid(*locked_oid)) { - case RELKIND_INDEX: - index_oid = reloid; - table_oid = IndexGetRelation(index_oid, false); - index_lockmode = AccessShareLock; - break; - case RELKIND_PARTITIONED_INDEX: - index_oid = reloid; - table_oid = IndexGetRelation(index_oid, false); - index_lockmode = ShareUpdateExclusiveLock; - break; - default: - break; + UnlockRelationOid(*locked_oid, ShareUpdateExclusiveLock); + *locked_oid = InvalidOid; + } + + /* If the relation does not exist, there's nothing more to do. */ + if (!OidIsValid(relId)) + return; + + /* If the relation does exist, check whether it's an index. */ + relkind = get_rel_relkind(relId); + if (relkind == RELKIND_INDEX || + relkind == RELKIND_PARTITIONED_INDEX) + table_oid = IndexGetRelation(relId, false); + + /* + * If retrying yields the same OID, there are a couple of extremely + * unlikely scenarios we need to handle. + */ + if (relId == oldRelId) + { + /* + * If a previous lookup found an index, but the current lookup did + * not, the index was dropped and the OID was reused for something + * else between lookups. In theory, we could simply drop our lock on + * the index's parent table and proceed, but in the interest of + * avoiding complexity, we just error. + */ + if (table_oid == relId && OidIsValid(*locked_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("index \"%s\" was concurrently dropped", + relation->relname))); + + /* + * If the current lookup found an index but a previous lookup either + * did not find an index or found one with a different parent + * relation, the relation was dropped and the OID was reused for an + * index between lookups. RangeVarGetRelidExtended() will have + * already locked the index at this point, so we can't just lock the + * newly discovered parent table OID without risking deadlock. As + * above, we just error in this case. 
+ */ + if (table_oid != relId && table_oid != *locked_oid) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("index \"%s\" was concurrently created", + relation->relname))); } - table = relation_open(table_oid, ShareUpdateExclusiveLock); + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(table_oid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for OID %u", table_oid); + form = (Form_pg_class) GETSTRUCT(tuple); /* the relkinds that can be used with ANALYZE */ - switch (table->rd_rel->relkind) + switch (form->relkind) { case RELKIND_RELATION: case RELKIND_MATVIEW: @@ -176,62 +222,36 @@ stats_lock_check_privileges(Oid reloid) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot modify statistics for relation \"%s\"", - RelationGetRelationName(table)), - errdetail_relkind_not_supported(table->rd_rel->relkind))); + NameStr(form->relname)), + errdetail_relkind_not_supported(form->relkind))); } - if (OidIsValid(index_oid)) - { - Relation index; - - Assert(index_lockmode != NoLock); - index = relation_open(index_oid, index_lockmode); - - Assert(index->rd_index && index->rd_index->indrelid == table_oid); - - /* retain lock on index */ - relation_close(index, NoLock); - } - - if (table->rd_rel->relisshared) + if (form->relisshared) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot modify statistics for shared relation"))); + /* Check permissions */ if (!object_ownercheck(DatabaseRelationId, MyDatabaseId, GetUserId())) { - AclResult aclresult = pg_class_aclcheck(RelationGetRelid(table), + AclResult aclresult = pg_class_aclcheck(table_oid, GetUserId(), ACL_MAINTAIN); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, - get_relkind_objtype(table->rd_rel->relkind), - NameStr(table->rd_rel->relname)); + get_relkind_objtype(form->relkind), + NameStr(form->relname)); } - /* retain lock on table */ - relation_close(table, NoLock); -} + ReleaseSysCache(tuple); -/* - * Lookup relation oid from schema and relation name. - */ -Oid -stats_lookup_relid(const char *nspname, const char *relname) -{ - Oid nspoid; - Oid reloid; - - nspoid = LookupExplicitNamespace(nspname, false); - reloid = get_relname_relid(relname, nspoid); - if (!OidIsValid(reloid)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_TABLE), - errmsg("relation \"%s.%s\" does not exist", - nspname, relname))); - - return reloid; + /* Lock heap before index to avoid deadlock. */ + if (relId != oldRelId && table_oid != relId) + { + LockRelationOid(table_oid, ShareUpdateExclusiveLock); + *locked_oid = table_oid; + } } @@ -263,7 +283,7 @@ stats_check_arg_type(const char *argname, Oid argtype, Oid expectedtype) if (argtype != expectedtype) { ereport(WARNING, - (errmsg("argument \"%s\" has type \"%s\", expected type \"%s\"", + (errmsg("argument \"%s\" has type %s, expected type %s", argname, format_type_be(argtype), format_type_be(expectedtype)))); return false; @@ -272,6 +292,50 @@ stats_check_arg_type(const char *argname, Oid argtype, Oid expectedtype) return true; } +/* + * Check if an attribute of an index is an expression, and retrieve the + * expression if that is the case. + * + * If the attnum specified is known to be an expression, then we must + * walk the list of attributes up to the specified attnum to get the right + * expression. 
+ */ +static Node * +statatt_get_index_expr(Relation rel, int attnum) +{ + List *index_exprs; + ListCell *indexpr_item; + + /* relation is not an index */ + if (rel->rd_rel->relkind != RELKIND_INDEX && + rel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) + return NULL; + + index_exprs = RelationGetIndexExpressions(rel); + + /* index has no expressions to give */ + if (index_exprs == NIL) + return NULL; + + /* + * The index's attnum points directly to a relation attnum, hence it is + * not an expression attribute. + */ + if (rel->rd_index->indkey.values[attnum - 1] != 0) + return NULL; + + indexpr_item = list_head(rel->rd_indexprs); + + for (int i = 0; i < attnum - 1; i++) + if (rel->rd_index->indkey.values[i] == 0) + indexpr_item = lnext(rel->rd_indexprs, indexpr_item); + + if (indexpr_item == NULL) /* shouldn't happen */ + elog(ERROR, "too few entries in indexprs list"); + + return (Node *) lfirst(indexpr_item); +} + /* * Translate variadic argument pairs from 'pairs_fcinfo' into a * 'positional_fcinfo' appropriate for calling relation_statistics_update() or @@ -319,11 +383,11 @@ stats_fill_fcinfo_from_arg_pairs(FunctionCallInfo pairs_fcinfo, if (argnulls[i]) ereport(ERROR, - (errmsg("name at variadic position %d is NULL", i + 1))); + (errmsg("name at variadic position %d is null", i + 1))); if (types[i] != TEXTOID) ereport(ERROR, - (errmsg("name at variadic position %d has type \"%s\", expected type \"%s\"", + (errmsg("name at variadic position %d has type %s, expected type %s", i + 1, format_type_be(types[i]), format_type_be(TEXTOID)))); @@ -357,3 +421,325 @@ stats_fill_fcinfo_from_arg_pairs(FunctionCallInfo pairs_fcinfo, return result; } + +/* + * Derive type information from a relation attribute. + * + * This is needed for setting most slot statistics for all data types. + * + * This duplicates the logic in examine_attribute() but it will not skip the + * attribute if the attstattarget is 0. + * + * This information, retrieved from pg_attribute and pg_type with some + * specific handling for index expressions, is a prerequisite to calling + * any of the other statatt_*() functions. + */ +void +statatt_get_type(Oid reloid, AttrNumber attnum, + Oid *atttypid, int32 *atttypmod, + char *atttyptype, Oid *atttypcoll, + Oid *eq_opr, Oid *lt_opr) +{ + Relation rel = relation_open(reloid, AccessShareLock); + Form_pg_attribute attr; + HeapTuple atup; + Node *expr; + TypeCacheEntry *typcache; + + atup = SearchSysCache2(ATTNUM, ObjectIdGetDatum(reloid), + Int16GetDatum(attnum)); + + /* Attribute not found */ + if (!HeapTupleIsValid(atup)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column %d of relation \"%s\" does not exist", + attnum, RelationGetRelationName(rel)))); + + attr = (Form_pg_attribute) GETSTRUCT(atup); + + if (attr->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column %d of relation \"%s\" does not exist", + attnum, RelationGetRelationName(rel)))); + + expr = statatt_get_index_expr(rel, attr->attnum); + + /* + * When analyzing an expression index, believe the expression tree's type + * not the column datatype --- the latter might be the opckeytype storage + * type of the opclass, which is not interesting for our purposes. This + * mimics the behavior of examine_attribute(). 
+ */ + if (expr == NULL) + { + *atttypid = attr->atttypid; + *atttypmod = attr->atttypmod; + *atttypcoll = attr->attcollation; + } + else + { + *atttypid = exprType(expr); + *atttypmod = exprTypmod(expr); + + if (OidIsValid(attr->attcollation)) + *atttypcoll = attr->attcollation; + else + *atttypcoll = exprCollation(expr); + } + ReleaseSysCache(atup); + + /* + * If it's a multirange, step down to the range type, as is done by + * multirange_typanalyze(). + */ + if (type_is_multirange(*atttypid)) + *atttypid = get_multirange_range(*atttypid); + + /* finds the right operators even if atttypid is a domain */ + typcache = lookup_type_cache(*atttypid, TYPECACHE_LT_OPR | TYPECACHE_EQ_OPR); + *atttyptype = typcache->typtype; + *eq_opr = typcache->eq_opr; + *lt_opr = typcache->lt_opr; + + /* + * Special case: collation for tsvector is DEFAULT_COLLATION_OID. See + * compute_tsvector_stats(). + */ + if (*atttypid == TSVECTOROID) + *atttypcoll = DEFAULT_COLLATION_OID; + + relation_close(rel, NoLock); +} + +/* + * Derive element type information from the attribute type. This information + * is needed when the given type is one that contains elements of other types. + * + * The atttypid and atttyptype should be derived from a previous call to + * statatt_get_type(). + */ +bool +statatt_get_elem_type(Oid atttypid, char atttyptype, + Oid *elemtypid, Oid *elem_eq_opr) +{ + TypeCacheEntry *elemtypcache; + + if (atttypid == TSVECTOROID) + { + /* + * Special case: element type for tsvector is text. See + * compute_tsvector_stats(). + */ + *elemtypid = TEXTOID; + } + else + { + /* find underlying element type through any domain */ + *elemtypid = get_base_element_type(atttypid); + } + + if (!OidIsValid(*elemtypid)) + return false; + + /* finds the right operator even if elemtypid is a domain */ + elemtypcache = lookup_type_cache(*elemtypid, TYPECACHE_EQ_OPR); + if (!OidIsValid(elemtypcache->eq_opr)) + return false; + + *elem_eq_opr = elemtypcache->eq_opr; + + return true; +} + +/* + * Build an array with element type elemtypid from a text datum, used as + * value of an attribute in a tuple to-be-inserted into pg_statistic. + * + * The typid and typmod should be derived from a previous call to + * statatt_get_type(). + * + * If an error is encountered, capture it and throw a WARNING, with "ok" set + * to false. If the resulting array contains NULLs, raise a WARNING and + * set "ok" to false. When the operation succeeds, set "ok" to true. 
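+ * + * A hypothetical caller sketch (variable names assumed, not taken from + * this file): + * + *		FmgrInfo	finfo; + *		bool		ok; + *		Datum		stavalues; + * + *		fmgr_info(F_ARRAY_IN, &finfo); + *		stavalues = statatt_build_stavalues("most_common_vals", &finfo, + *											text_datum, atttypid, + *											atttypmod, &ok); + * + * On failure a WARNING has already been raised, so the caller only needs + * to skip the slot when "ok" comes back false.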
+ */ +Datum +statatt_build_stavalues(const char *staname, FmgrInfo *array_in, Datum d, Oid typid, + int32 typmod, bool *ok) +{ + LOCAL_FCINFO(fcinfo, 8); + char *s; + Datum result; + ErrorSaveContext escontext = {T_ErrorSaveContext}; + + escontext.details_wanted = true; + + s = TextDatumGetCString(d); + + InitFunctionCallInfoData(*fcinfo, array_in, 3, InvalidOid, + (Node *) &escontext, NULL); + + fcinfo->args[0].value = CStringGetDatum(s); + fcinfo->args[0].isnull = false; + fcinfo->args[1].value = ObjectIdGetDatum(typid); + fcinfo->args[1].isnull = false; + fcinfo->args[2].value = Int32GetDatum(typmod); + fcinfo->args[2].isnull = false; + + result = FunctionCallInvoke(fcinfo); + + pfree(s); + + if (escontext.error_occurred) + { + escontext.error_data->elevel = WARNING; + ThrowErrorData(escontext.error_data); + *ok = false; + return (Datum) 0; + } + + if (array_contains_nulls(DatumGetArrayTypeP(result))) + { + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"%s\" array must not contain null values", staname))); + *ok = false; + return (Datum) 0; + } + + *ok = true; + + return result; +} + +/* + * Find and update the slot of a stakind, or use the first empty slot. + * + * Core statistics types expect the stakind value to be one of the + * STATISTIC_KIND_* constants defined in pg_statistic.h, but types defined + * by extensions are not restricted to those values. + * + * In the case of core statistics, the required staop is determined by the + * stakind given and will either be a hardcoded oid, or the eq/lt operator + * derived from statatt_get_type(). Likewise, types defined by extensions + * have no such restriction. + * + * The stacoll value should be either the atttypcoll derived from + * statatt_get_type(), or a hardcoded value required by that particular + * stakind. + * + * The value/null pairs for stanumbers and stavalues should be calculated + * based on the stakind, using statatt_build_stavalues() or constructed arrays. 
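+ * + * For instance, an MCV slot (an illustration, not a call made in this + * file) pairs the type's equality operator with matching frequency and + * value arrays: + * + *		statatt_set_slot(values, nulls, replaces, + *						 STATISTIC_KIND_MCV, eq_opr, atttypcoll, + *						 stanumbers, false, stavalues, false);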
+ */ +void +statatt_set_slot(Datum *values, bool *nulls, bool *replaces, + int16 stakind, Oid staop, Oid stacoll, + Datum stanumbers, bool stanumbers_isnull, + Datum stavalues, bool stavalues_isnull) +{ + int slotidx; + int first_empty = -1; + AttrNumber stakind_attnum; + AttrNumber staop_attnum; + AttrNumber stacoll_attnum; + + /* find existing slot with given stakind */ + for (slotidx = 0; slotidx < STATISTIC_NUM_SLOTS; slotidx++) + { + stakind_attnum = Anum_pg_statistic_stakind1 - 1 + slotidx; + + if (first_empty < 0 && + DatumGetInt16(values[stakind_attnum]) == 0) + first_empty = slotidx; + if (DatumGetInt16(values[stakind_attnum]) == stakind) + break; + } + + if (slotidx >= STATISTIC_NUM_SLOTS && first_empty >= 0) + slotidx = first_empty; + + if (slotidx >= STATISTIC_NUM_SLOTS) + ereport(ERROR, + (errmsg("maximum number of statistics slots exceeded: %d", + slotidx + 1))); + + stakind_attnum = Anum_pg_statistic_stakind1 - 1 + slotidx; + staop_attnum = Anum_pg_statistic_staop1 - 1 + slotidx; + stacoll_attnum = Anum_pg_statistic_stacoll1 - 1 + slotidx; + + if (DatumGetInt16(values[stakind_attnum]) != stakind) + { + values[stakind_attnum] = Int16GetDatum(stakind); + replaces[stakind_attnum] = true; + } + if (DatumGetObjectId(values[staop_attnum]) != staop) + { + values[staop_attnum] = ObjectIdGetDatum(staop); + replaces[staop_attnum] = true; + } + if (DatumGetObjectId(values[stacoll_attnum]) != stacoll) + { + values[stacoll_attnum] = ObjectIdGetDatum(stacoll); + replaces[stacoll_attnum] = true; + } + if (!stanumbers_isnull) + { + values[Anum_pg_statistic_stanumbers1 - 1 + slotidx] = stanumbers; + nulls[Anum_pg_statistic_stanumbers1 - 1 + slotidx] = false; + replaces[Anum_pg_statistic_stanumbers1 - 1 + slotidx] = true; + } + if (!stavalues_isnull) + { + values[Anum_pg_statistic_stavalues1 - 1 + slotidx] = stavalues; + nulls[Anum_pg_statistic_stavalues1 - 1 + slotidx] = false; + replaces[Anum_pg_statistic_stavalues1 - 1 + slotidx] = true; + } +} + +/* + * Initialize values and nulls for a new pg_statistic tuple. + * + * The caller is responsible for allocating the arrays where the results are + * stored, which should be of size Natts_pg_statistic. + * + * When using this routine for a tuple inserted into pg_statistic, reloid, + * attnum and inherited flags should all be set. + * + * When using this routine for a tuple that is an element of a stxdexpr + * array inserted into pg_statistic_ext_data, reloid, attnum and inherited + * should be respectively set to InvalidOid, InvalidAttrNumber and false. 
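+ * + * The expected flow (a sketch; the callers are not part of this file) is + * to call this first, apply one or more statatt_set_slot() calls, and + * only then build the pg_statistic tuple from the values/nulls arrays, + * e.g. with heap_form_tuple() for an insert or heap_modify_tuple(), using + * the replaces array, for an update.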
+ */ +void +statatt_init_empty_tuple(Oid reloid, int16 attnum, bool inherited, + Datum *values, bool *nulls, bool *replaces) +{ + memset(nulls, true, sizeof(bool) * Natts_pg_statistic); + memset(replaces, true, sizeof(bool) * Natts_pg_statistic); + + /* This must initialize non-NULL attributes */ + values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(reloid); + nulls[Anum_pg_statistic_starelid - 1] = false; + values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(attnum); + nulls[Anum_pg_statistic_staattnum - 1] = false; + values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(inherited); + nulls[Anum_pg_statistic_stainherit - 1] = false; + + values[Anum_pg_statistic_stanullfrac - 1] = DEFAULT_STATATT_NULL_FRAC; + nulls[Anum_pg_statistic_stanullfrac - 1] = false; + values[Anum_pg_statistic_stawidth - 1] = DEFAULT_STATATT_AVG_WIDTH; + nulls[Anum_pg_statistic_stawidth - 1] = false; + values[Anum_pg_statistic_stadistinct - 1] = DEFAULT_STATATT_N_DISTINCT; + nulls[Anum_pg_statistic_stadistinct - 1] = false; + + /* initialize stakind, staop, and stacoll slots */ + for (int slotnum = 0; slotnum < STATISTIC_NUM_SLOTS; slotnum++) + { + values[Anum_pg_statistic_stakind1 + slotnum - 1] = (Datum) 0; + nulls[Anum_pg_statistic_stakind1 + slotnum - 1] = false; + values[Anum_pg_statistic_staop1 + slotnum - 1] = ObjectIdGetDatum(InvalidOid); + nulls[Anum_pg_statistic_staop1 + slotnum - 1] = false; + values[Anum_pg_statistic_stacoll1 + slotnum - 1] = ObjectIdGetDatum(InvalidOid); + nulls[Anum_pg_statistic_stacoll1 + slotnum - 1] = false; + } +} diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md index f10b5c7e31ec7..72ae3b3737d51 100644 --- a/src/backend/storage/aio/README.md +++ b/src/backend/storage/aio/README.md @@ -94,7 +94,7 @@ pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0); * * In this example we're reading only a single buffer, hence the 1. */ -pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); +pgaio_io_set_handle_data_32(ioh, (uint32 *) &buffer, 1); /* * Pass the AIO handle to lower-level function. When operating on the level of @@ -119,8 +119,9 @@ pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); * e.g. due to reaching a limit on the number of unsubmitted IOs, and even * complete before smgrstartreadv() returns. 
*/ +void *page = BufferGetBlock(buffer); smgrstartreadv(ioh, operation->smgr, forknum, blkno, - BufferGetBlock(buffer), 1); + &page, 1); /* * To benefit from AIO, it is beneficial to perform other work, including diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index c64d815ebd12a..c4c2d8cc4b1a1 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -53,7 +53,7 @@ static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state); static void pgaio_io_reclaim(PgAioHandle *ioh); -static void pgaio_io_resowner_register(PgAioHandle *ioh); +static void pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner); static void pgaio_io_wait_for_free(void); static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation); static const char *pgaio_io_state_get_name(PgAioHandleState s); @@ -89,6 +89,9 @@ static const IoMethodOps *const pgaio_method_ops_table[] = { #endif }; +StaticAssertDecl(lengthof(io_method_options) == lengthof(pgaio_method_ops_table) + 1, + "io_method_options out of sync with pgaio_method_ops_table"); + /* callbacks for the configured io_method, set by assign_io_method */ const IoMethodOps *pgaio_method_ops; @@ -214,7 +217,7 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) pgaio_my_backend->handed_out_io = ioh; if (resowner) - pgaio_io_resowner_register(ioh); + pgaio_io_resowner_register(ioh, resowner); if (ret) { @@ -275,7 +278,7 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error) ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node); ioh->resowner = NULL; - switch (ioh->state) + switch ((PgAioHandleState) ioh->state) { case PGAIO_HS_IDLE: elog(ERROR, "unexpected"); @@ -403,13 +406,13 @@ pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state) } static void -pgaio_io_resowner_register(PgAioHandle *ioh) +pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner) { Assert(!ioh->resowner); - Assert(CurrentResourceOwner); + Assert(resowner); - ResourceOwnerRememberAioHandle(CurrentResourceOwner, &ioh->resowner_node); - ioh->resowner = CurrentResourceOwner; + ResourceOwnerRememberAioHandle(resowner, &ioh->resowner_node); + ioh->resowner = resowner; } /* @@ -556,6 +559,13 @@ bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state) { *state = ioh->state; + + /* + * Ensure that we don't see an earlier state of the handle than ioh->state + * due to compiler or CPU reordering. This protects both ->generation as + * directly used here, and other fields in the handle accessed in the + * caller if the handle was not reused. + */ pg_read_barrier(); return ioh->generation != ref_generation; @@ -752,7 +762,7 @@ pgaio_io_wait_for_free(void) { int reclaimed = 0; - pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs", + pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), dclist_count(&pgaio_my_backend->idle_ios)); @@ -773,7 +783,12 @@ pgaio_io_wait_for_free(void) * Note that no interrupts are processed between the state check * and the call to reclaim - that's important as otherwise an * interrupt could have already reclaimed the handle. + * + * Need to ensure that there's no reordering, in the more common + * paths, where we wait for IO, that's done by + * pgaio_io_was_recycled(). 
*/ + pg_read_barrier(); pgaio_io_reclaim(ioh); reclaimed++; } @@ -797,7 +812,7 @@ pgaio_io_wait_for_free(void) if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0) ereport(ERROR, errmsg_internal("no free IOs despite no in-flight IOs"), - errdetail_internal("%d pending, %d in-flight, %d idle IOs", + errdetail_internal("%d pending, %u in-flight, %u idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), dclist_count(&pgaio_my_backend->idle_ios))); @@ -813,7 +828,7 @@ pgaio_io_wait_for_free(void) &pgaio_my_backend->in_flight_ios); uint64 generation = ioh->generation; - switch (ioh->state) + switch ((PgAioHandleState) ioh->state) { /* should not be in in-flight list */ case PGAIO_HS_IDLE: @@ -828,7 +843,7 @@ pgaio_io_wait_for_free(void) case PGAIO_HS_COMPLETED_IO: case PGAIO_HS_SUBMITTED: pgaio_debug_io(DEBUG2, ioh, - "waiting for free io with %d in flight", + "waiting for free io with %u in flight", dclist_count(&pgaio_my_backend->in_flight_ios)); /* @@ -852,7 +867,12 @@ pgaio_io_wait_for_free(void) * check and the call to reclaim - that's important as * otherwise an interrupt could have already reclaimed the * handle. + * + * Need to ensure that there's no reordering, in the more + * common paths, where we wait for IO, that's done by + * pgaio_io_was_recycled(). */ + pg_read_barrier(); pgaio_io_reclaim(ioh); break; } @@ -1252,7 +1272,7 @@ pgaio_closing_fd(int fd) break; pgaio_debug_io(DEBUG2, ioh, - "waiting for IO before FD %d gets closed, %d in-flight IOs", + "waiting for IO before FD %d gets closed, %u in-flight IOs", fd, dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ @@ -1288,7 +1308,7 @@ pgaio_shutdown(int code, Datum arg) uint64 generation = ioh->generation; pgaio_debug_io(DEBUG2, ioh, - "waiting for IO to complete during shutdown, %d in-flight IOs", + "waiting for IO to complete during shutdown, %u in-flight IOs", dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ @@ -1301,8 +1321,8 @@ pgaio_shutdown(int code, Datum arg) void assign_io_method(int newval, void *extra) { + Assert(newval < lengthof(pgaio_method_ops_table)); Assert(pgaio_method_ops_table[newval] != NULL); - Assert(newval < lengthof(io_method_options)); pgaio_method_ops = pgaio_method_ops_table[newval]; } diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c index 0ad9795bb7e0c..03c9bba080267 100644 --- a/src/backend/storage/aio/aio_callback.c +++ b/src/backend/storage/aio/aio_callback.c @@ -256,6 +256,9 @@ pgaio_io_call_complete_shared(PgAioHandle *ioh) pgaio_result_status_string(result.status), result.id, result.error_data, result.result); result = ce->cb->complete_shared(ioh, result, cb_data); + + /* the callback should never transition to unknown */ + Assert(result.status != PGAIO_RS_UNKNOWN); } ioh->distilled_result = result; @@ -290,6 +293,7 @@ pgaio_io_call_complete_local(PgAioHandle *ioh) /* start with distilled result from shared callback */ result = ioh->distilled_result; + Assert(result.status != PGAIO_RS_UNKNOWN); for (int i = ioh->num_callbacks; i > 0; i--) { @@ -306,6 +310,9 @@ pgaio_io_call_complete_local(PgAioHandle *ioh) pgaio_result_status_string(result.status), result.id, result.error_data, result.result); result = ce->cb->complete_local(ioh, result, cb_data); + + /* the callback should never transition to unknown */ + Assert(result.status != PGAIO_RS_UNKNOWN); } /* diff --git 
a/src/backend/storage/aio/aio_funcs.c b/src/backend/storage/aio/aio_funcs.c index 584e683371a31..d7977387b8f1a 100644 --- a/src/backend/storage/aio/aio_funcs.c +++ b/src/backend/storage/aio/aio_funcs.c @@ -56,7 +56,7 @@ pg_get_aios(PG_FUNCTION_ARGS) for (uint64 i = 0; i < pgaio_ctl->io_handle_count; i++) { PgAioHandle *live_ioh = &pgaio_ctl->io_handles[i]; - uint32 ioh_id = pgaio_io_get_id(live_ioh); + int ioh_id = pgaio_io_get_id(live_ioh); Datum values[PG_GET_AIOS_COLS] = {0}; bool nulls[PG_GET_AIOS_COLS] = {0}; ProcNumber owner; @@ -152,7 +152,7 @@ pg_get_aios(PG_FUNCTION_ARGS) nulls[0] = false; /* column: IO's id */ - values[1] = ioh_id; + values[1] = Int32GetDatum(ioh_id); /* column: IO's generation */ values[2] = Int64GetDatum(start_generation); @@ -175,7 +175,7 @@ pg_get_aios(PG_FUNCTION_ARGS) values[4] = CStringGetTextDatum(pgaio_io_get_op_name(&ioh_copy)); /* columns: details about the IO's operation (offset, length) */ - switch (ioh_copy.op) + switch ((PgAioOp) ioh_copy.op) { case PGAIO_OP_INVALID: nulls[5] = true; diff --git a/src/backend/storage/aio/aio_init.c b/src/backend/storage/aio/aio_init.c index 885c3940c6626..54ab84dd6f03a 100644 --- a/src/backend/storage/aio/aio_init.c +++ b/src/backend/storage/aio/aio_init.c @@ -30,12 +30,8 @@ static Size AioCtlShmemSize(void) { - Size sz; - /* pgaio_ctl itself */ - sz = offsetof(PgAioCtl, io_handles); - - return sz; + return sizeof(PgAioCtl); } static uint32 diff --git a/src/backend/storage/aio/aio_io.c b/src/backend/storage/aio/aio_io.c index 520b5077df25a..7d11d40284ada 100644 --- a/src/backend/storage/aio/aio_io.c +++ b/src/backend/storage/aio/aio_io.c @@ -121,7 +121,7 @@ pgaio_io_perform_synchronously(PgAioHandle *ioh) START_CRIT_SECTION(); /* Perform IO. */ - switch (ioh->op) + switch ((PgAioOp) ioh->op) { case PGAIO_OP_READV: pgstat_report_wait_start(WAIT_EVENT_DATA_FILE_READ); @@ -176,7 +176,7 @@ pgaio_io_get_op_name(PgAioHandle *ioh) { Assert(ioh->op >= 0 && ioh->op < PGAIO_OP_COUNT); - switch (ioh->op) + switch ((PgAioOp) ioh->op) { case PGAIO_OP_INVALID: return "invalid"; @@ -198,7 +198,7 @@ pgaio_io_uses_fd(PgAioHandle *ioh, int fd) { Assert(ioh->state >= PGAIO_HS_DEFINED); - switch (ioh->op) + switch ((PgAioOp) ioh->op) { case PGAIO_OP_READV: return ioh->op_data.read.fd == fd; @@ -222,7 +222,7 @@ pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov) *iov = &pgaio_ctl->iovecs[ioh->iovec_off]; - switch (ioh->op) + switch ((PgAioOp) ioh->op) { case PGAIO_OP_READV: return ioh->op_data.read.iov_length; diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index c719ba2727a81..00cb017ca3c74 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -29,6 +29,9 @@ #ifdef IOMETHOD_IO_URING_ENABLED +#include +#include + #include #include "miscadmin.h" @@ -94,12 +97,32 @@ PgAioUringContext struct io_uring io_uring_ring; } PgAioUringContext; +/* + * Information about the capabilities that io_uring has. + * + * Depending on liburing and kernel version different features are + * supported. At least for the kernel a kernel version check does not suffice + * as various vendors do backport features to older kernels :(. 
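+ * + * Hence the runtime probe in pgaio_uring_check_capabilities() below: a + * successful io_uring_queue_init_mem() call both proves the feature is + * available and reports the per-ring memory requirement that we record + * in mem_init_size.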
+ */ +typedef struct PgAioUringCaps +{ + bool checked; + /* -1 if io_uring_queue_init_mem() is unsupported */ + int mem_init_size; +} PgAioUringCaps; + + /* PgAioUringContexts for all backends */ static PgAioUringContext *pgaio_uring_contexts; /* the current backend's context */ static PgAioUringContext *pgaio_my_uring_context; +static PgAioUringCaps pgaio_uring_caps = +{ + .checked = false, + .mem_init_size = -1, +}; static uint32 pgaio_uring_procs(void) @@ -111,30 +134,184 @@ pgaio_uring_procs(void) return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; } -static Size +/* + * Initializes pgaio_uring_caps, unless that's already done. + */ +static void +pgaio_uring_check_capabilities(void) +{ + if (pgaio_uring_caps.checked) + return; + + /* + * By default io_uring creates a shared memory mapping for each io_uring + * instance, leading to a large number of memory mappings. Unfortunately a + * large number of memory mappings slows things down; backend exit is + * particularly affected. To address that, newer kernels (6.5) support + * using user-provided memory for the rings; by putting the relevant + * memory into shared memory we don't need any additional mappings. + * + * To know whether this is supported, we unfortunately need to probe the + * kernel by trying to create a ring with userspace-provided memory. This + * also has a secondary benefit: We can determine precisely how much + * memory we need for each io_uring instance. + */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + { + struct io_uring test_ring; + size_t ring_size; + void *ring_ptr; + struct io_uring_params p = {0}; + int ret; + + /* + * Liburing does not yet provide an API to query how much memory a + * ring will need. So we over-estimate it here. As the memory is freed + * just below, that's a small temporary waste of memory. + * + * 1MB is more than enough for rings within io_max_concurrency's + * range. + */ + ring_size = 1024 * 1024; + + /* + * Hard to believe a system exists where 1MB would not be a multiple + * of the page size. But it's cheap to ensure... + */ + ring_size -= ring_size % sysconf(_SC_PAGESIZE); + + ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (ring_ptr == MAP_FAILED) + elog(ERROR, + "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m", + ring_size); + + ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size); + if (ret > 0) + { + pgaio_uring_caps.mem_init_size = ret; + + elog(DEBUG1, + "can use combined memory mapping for io_uring, each ring needs %d bytes", + ret); + + /* clean up the created ring, it was just for a test */ + io_uring_queue_exit(&test_ring); + } + else + { + /* + * There are different reasons for ring creation to fail, but it's + * ok to treat that just as io_uring_queue_init_mem() not being + * supported. We'll report a more detailed error in + * pgaio_uring_shmem_init().
+ */ + errno = -ret; + elog(DEBUG1, + "cannot use combined memory mapping for io_uring, ring creation failed: %m"); + + } + + if (munmap(ring_ptr, ring_size) != 0) + elog(ERROR, "munmap() failed: %m"); + } +#else + { + elog(DEBUG1, + "can't use combined memory mapping for io_uring, kernel or liburing too old"); + } +#endif + + pgaio_uring_caps.checked = true; +} + +/* + * Memory for all PgAioUringContext instances + */ +static size_t pgaio_uring_context_shmem_size(void) { return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext)); } +/* + * Memory for the combined memory used by io_uring instances. Returns 0 if + * that is not supported by kernel/liburing. + */ +static size_t +pgaio_uring_ring_shmem_size(void) +{ + size_t sz = 0; + + if (pgaio_uring_caps.mem_init_size > 0) + { + /* + * Memory for rings needs to be allocated at a page boundary, so + * reserve space. Luckily it does not need to be aligned to hugepage + * boundaries, even if huge pages are used. + */ + sz = add_size(sz, sysconf(_SC_PAGESIZE)); + sz = add_size(sz, mul_size(pgaio_uring_procs(), + pgaio_uring_caps.mem_init_size)); + } + + return sz; +} + static size_t pgaio_uring_shmem_size(void) { - return pgaio_uring_context_shmem_size(); + size_t sz; + + /* + * Kernel and liburing support for various features influences how much + * shmem we need, so perform the necessary checks. + */ + pgaio_uring_check_capabilities(); + + sz = pgaio_uring_context_shmem_size(); + sz = add_size(sz, pgaio_uring_ring_shmem_size()); + + return sz; } static void pgaio_uring_shmem_init(bool first_time) { - int TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; + int TotalProcs = pgaio_uring_procs(); bool found; + char *shmem; + size_t ring_mem_remain = 0; + char *ring_mem_next = 0; - pgaio_uring_contexts = (PgAioUringContext *) - ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found); - + /* + * We allocate memory for all PgAioUringContext instances and, if + * supported, the memory required for each of the io_uring instances, in + * one ShmemInitStruct(). + */ + shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found); if (found) return; + pgaio_uring_contexts = (PgAioUringContext *) shmem; + shmem += pgaio_uring_context_shmem_size(); + + /* if supported, handle memory alignment / sizing for io_uring memory */ + if (pgaio_uring_caps.mem_init_size > 0) + { + ring_mem_remain = pgaio_uring_ring_shmem_size(); + ring_mem_next = shmem; + + /* align to page boundary, see also pgaio_uring_ring_shmem_size() */ + ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next); + + /* account for alignment */ + ring_mem_remain -= ring_mem_next - shmem; + shmem += ring_mem_next - shmem; + + shmem += ring_mem_remain; + } + for (int contextno = 0; contextno < TotalProcs; contextno++) { PgAioUringContext *context = &pgaio_uring_contexts[contextno]; @@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time) * be worth using that - also need to evaluate if that causes * noticeable additional contention? */ - ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + + /* + * If supported (c.f. pgaio_uring_check_capabilities()), create ring + * with its data in shared memory. Otherwise fall back to io_uring + * creating a memory mapping for each ring.
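+ * + * The resulting single allocation is laid out roughly like this + * (illustrative): + * + *   [PgAioUringContext x procs][pad to page][ring 0][ring 1]... + * + * with ring_mem_next/ring_mem_remain tracking the not-yet-consumed tail + * while the rings are created.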
+ */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + if (pgaio_uring_caps.mem_init_size > 0) + { + struct io_uring_params p = {0}; + + ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain); + + ring_mem_remain -= ret; + ring_mem_next += ret; + } + else +#endif + { + ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + } + if (ret < 0) { char *hint = NULL; @@ -179,7 +377,7 @@ pgaio_uring_shmem_init(bool first_time) else if (-ret == ENOSYS) { err = ERRCODE_FEATURE_NOT_SUPPORTED; - hint = _("Kernel does not support io_uring."); + hint = _("The kernel does not support io_uring."); } /* update errno to allow %m to work */ @@ -400,9 +598,9 @@ pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation) while (true) { pgaio_debug_io(DEBUG3, ioh, - "wait_one io_gen: %llu, ref_gen: %llu, cycle %d", - (long long unsigned) ioh->generation, - (long long unsigned) ref_generation, + "wait_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64 ", cycle %d", + ioh->generation, + ref_generation, waited); if (pgaio_io_was_recycled(ioh, ref_generation, &state) || @@ -462,7 +660,7 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe) { struct iovec *iov; - switch (ioh->op) + switch ((PgAioOp) ioh->op) { case PGAIO_OP_READV: iov = &pgaio_ctl->iovecs[ioh->iovec_off]; diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index 743cccc2acd18..866a35abbd118 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -52,26 +52,25 @@ #define IO_WORKER_WAKEUP_FANOUT 2 -typedef struct AioWorkerSubmissionQueue +typedef struct PgAioWorkerSubmissionQueue { uint32 size; - uint32 mask; uint32 head; uint32 tail; - uint32 ios[FLEXIBLE_ARRAY_MEMBER]; -} AioWorkerSubmissionQueue; + int sqes[FLEXIBLE_ARRAY_MEMBER]; +} PgAioWorkerSubmissionQueue; -typedef struct AioWorkerSlot +typedef struct PgAioWorkerSlot { Latch *latch; bool in_use; -} AioWorkerSlot; +} PgAioWorkerSlot; -typedef struct AioWorkerControl +typedef struct PgAioWorkerControl { uint64 idle_worker_mask; - AioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER]; -} AioWorkerControl; + PgAioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER]; +} PgAioWorkerControl; static size_t pgaio_worker_shmem_size(void); @@ -96,8 +95,8 @@ int io_workers = 3; static int io_worker_queue_size = 64; static int MyIoWorkerId; -static AioWorkerSubmissionQueue *io_worker_submission_queue; -static AioWorkerControl *io_worker_control; +static PgAioWorkerSubmissionQueue *io_worker_submission_queue; +static PgAioWorkerControl *io_worker_control; static size_t @@ -106,15 +105,15 @@ pgaio_worker_queue_shmem_size(int *queue_size) /* Round size up to next power of two so we can make a mask. */ *queue_size = pg_nextpower2_32(io_worker_queue_size); - return offsetof(AioWorkerSubmissionQueue, ios) + - sizeof(uint32) * *queue_size; + return offsetof(PgAioWorkerSubmissionQueue, sqes) + + sizeof(int) * *queue_size; } static size_t pgaio_worker_control_shmem_size(void) { - return offsetof(AioWorkerControl, workers) + - sizeof(AioWorkerSlot) * MAX_IO_WORKERS; + return offsetof(PgAioWorkerControl, workers) + + sizeof(PgAioWorkerSlot) * MAX_IO_WORKERS; } static size_t @@ -162,7 +161,7 @@ pgaio_worker_shmem_init(bool first_time) } static int -pgaio_choose_idle_worker(void) +pgaio_worker_choose_idle(void) { int worker; @@ -172,6 +171,7 @@ pgaio_choose_idle_worker(void) /* Find the lowest bit position, and clear it. 
*/ worker = pg_rightmost_one_pos64(io_worker_control->idle_worker_mask); io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << worker); + Assert(io_worker_control->workers[worker].in_use); return worker; } @@ -179,7 +179,7 @@ pgaio_choose_idle_worker(void) static bool pgaio_worker_submission_queue_insert(PgAioHandle *ioh) { - AioWorkerSubmissionQueue *queue; + PgAioWorkerSubmissionQueue *queue; uint32 new_head; queue = io_worker_submission_queue; @@ -191,23 +191,23 @@ pgaio_worker_submission_queue_insert(PgAioHandle *ioh) return false; /* full */ } - queue->ios[queue->head] = pgaio_io_get_id(ioh); + queue->sqes[queue->head] = pgaio_io_get_id(ioh); queue->head = new_head; return true; } -static uint32 +static int pgaio_worker_submission_queue_consume(void) { - AioWorkerSubmissionQueue *queue; - uint32 result; + PgAioWorkerSubmissionQueue *queue; + int result; queue = io_worker_submission_queue; if (queue->tail == queue->head) - return UINT32_MAX; /* empty */ + return -1; /* empty */ - result = queue->ios[queue->tail]; + result = queue->sqes[queue->tail]; queue->tail = (queue->tail + 1) & (queue->size - 1); return result; @@ -240,37 +240,37 @@ pgaio_worker_needs_synchronous_execution(PgAioHandle *ioh) } static void -pgaio_worker_submit_internal(int nios, PgAioHandle *ios[]) +pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios) { PgAioHandle *synchronous_ios[PGAIO_SUBMIT_BATCH_SIZE]; int nsync = 0; Latch *wakeup = NULL; int worker; - Assert(nios <= PGAIO_SUBMIT_BATCH_SIZE); + Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE); LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE); - for (int i = 0; i < nios; ++i) + for (int i = 0; i < num_staged_ios; ++i) { - Assert(!pgaio_worker_needs_synchronous_execution(ios[i])); - if (!pgaio_worker_submission_queue_insert(ios[i])) + Assert(!pgaio_worker_needs_synchronous_execution(staged_ios[i])); + if (!pgaio_worker_submission_queue_insert(staged_ios[i])) { /* * We'll do it synchronously, but only after we've sent as many as * we can to workers, to maximize concurrency. */ - synchronous_ios[nsync++] = ios[i]; + synchronous_ios[nsync++] = staged_ios[i]; continue; } if (wakeup == NULL) { /* Choose an idle worker to wake up if we haven't already. */ - worker = pgaio_choose_idle_worker(); + worker = pgaio_worker_choose_idle(); if (worker >= 0) wakeup = io_worker_control->workers[worker].latch; - pgaio_debug_io(DEBUG4, ios[i], + pgaio_debug_io(DEBUG4, staged_ios[i], "choosing worker %d", worker); } @@ -316,6 +316,7 @@ pgaio_worker_die(int code, Datum arg) Assert(io_worker_control->workers[MyIoWorkerId].in_use); Assert(io_worker_control->workers[MyIoWorkerId].latch == MyLatch); + io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << MyIoWorkerId); io_worker_control->workers[MyIoWorkerId].in_use = false; io_worker_control->workers[MyIoWorkerId].latch = NULL; LWLockRelease(AioWorkerSubmissionQueueLock); @@ -461,9 +462,14 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) int nwakeups = 0; int worker; - /* Try to get a job to do. */ + /* + * Try to get a job to do. + * + * The lwlock acquisition also provides the necessary memory barrier + * to ensure that we don't see outdated data in the handle. + */ LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE); - if ((io_index = pgaio_worker_submission_queue_consume()) == UINT32_MAX) + if ((io_index = pgaio_worker_submission_queue_consume()) == -1) { /* * Nothing to do. Mark self idle.
@@ -483,7 +489,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) IO_WORKER_WAKEUP_FANOUT); for (int i = 0; i < nwakeups; ++i) { - if ((worker = pgaio_choose_idle_worker()) < 0) + if ((worker = pgaio_worker_choose_idle()) < 0) break; latches[nlatches++] = io_worker_control->workers[worker].latch; } @@ -493,7 +499,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) for (int i = 0; i < nlatches; ++i) SetLatch(latches[i]); - if (io_index != UINT32_MAX) + if (io_index != -1) { PgAioHandle *ioh = NULL; @@ -568,6 +574,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) } CHECK_FOR_INTERRUPTS(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } } error_context_stack = errcallback.previous; diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 0e7f5557f5cb9..f1b88d058e372 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -247,12 +247,33 @@ read_stream_start_pending_read(ReadStream *stream) Assert(stream->pinned_buffers + stream->pending_read_nblocks <= stream->max_pinned_buffers); +#ifdef USE_ASSERT_CHECKING /* We had better not be overwriting an existing pinned buffer. */ if (stream->pinned_buffers > 0) Assert(stream->next_buffer_index != stream->oldest_buffer_index); else Assert(stream->next_buffer_index == stream->oldest_buffer_index); + /* + * Pinned buffers forwarded by a preceding StartReadBuffers() call that + * had to split the operation should match the leading blocks of this + * following StartReadBuffers() call. + */ + Assert(stream->forwarded_buffers <= stream->pending_read_nblocks); + for (int i = 0; i < stream->forwarded_buffers; ++i) + Assert(BufferGetBlockNumber(stream->buffers[stream->next_buffer_index + i]) == + stream->pending_read_blocknum + i); + + /* + * Check that we've cleared the queue/overflow entries corresponding to + * the rest of the blocks covered by this read, unless it's the first go + * around and we haven't even initialized them yet. + */ + for (int i = stream->forwarded_buffers; i < stream->pending_read_nblocks; ++i) + Assert(stream->next_buffer_index + i >= stream->initialized_buffers || + stream->buffers[stream->next_buffer_index + i] == InvalidBuffer); +#endif + /* Do we need to issue read-ahead advice? */ flags = stream->read_buffers_flags; if (stream->advice_enabled) @@ -262,7 +283,7 @@ read_stream_start_pending_read(ReadStream *stream) /* * Sequential: Issue advice until the preadv() calls have caught * up with the first advice issued for this sequential region, and - * then stay of the way of the kernel's own read-ahead. + * then stay out of the way of the kernel's own read-ahead. */ if (stream->seq_until_processed != InvalidBlockNumber) flags |= READ_BUFFERS_ISSUE_ADVICE; @@ -979,6 +1000,19 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) stream->pending_read_nblocks == 0 && stream->per_buffer_data_size == 0) { + /* + * The fast path spins on one buffer entry repeatedly instead of + * rotating through the whole queue and clearing the entries behind + * it. If the buffer it starts with happened to be forwarded between + * StartReadBuffers() calls and also wrapped around the circular queue + * partway through, then a copy also exists in the overflow zone, and + * it won't clear it out as the regular path would. Do that now, so + * it doesn't need code for that. 
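+ * + * For example (hypothetical numbers): with queue_size = 16 and + * io_combine_limit = 4, a read that wrapped at index 1 also left a copy + * of that buffer at index 16 + 1 in the overflow zone; that copy is what + * gets cleared here.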
+ */ + if (stream->oldest_buffer_index < stream->io_combine_limit - 1) + stream->buffers[stream->queue_size + stream->oldest_buffer_index] = + InvalidBuffer; + stream->fast_path = true; } #endif diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index a182fcd660ccb..119f31b5d6584 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -128,11 +128,11 @@ independently. If it is necessary to lock more than one partition at a time, they must be locked in partition-number order to avoid risk of deadlock. * A separate system-wide spinlock, buffer_strategy_lock, provides mutual -exclusion for operations that access the buffer free list or select -buffers for replacement. A spinlock is used here rather than a lightweight -lock for efficiency; no other locks of any sort should be acquired while -buffer_strategy_lock is held. This is essential to allow buffer replacement -to happen in multiple backends with reasonable concurrency. +exclusion for operations that select buffers for replacement. A spinlock is +used here rather than a lightweight lock for efficiency; no other locks of any +sort should be acquired while buffer_strategy_lock is held. This is essential +to allow buffer replacement to happen in multiple backends with reasonable +concurrency. * Each buffer header contains a spinlock that must be taken when examining or changing fields of that buffer header. This allows operations such as @@ -158,18 +158,8 @@ unset by sleeping on the buffer's condition variable. Normal Buffer Replacement Strategy ---------------------------------- -There is a "free list" of buffers that are prime candidates for replacement. -In particular, buffers that are completely free (contain no valid page) are -always in this list. We could also throw buffers into this list if we -consider their pages unlikely to be needed soon; however, the current -algorithm never does that. The list is singly-linked using fields in the -buffer headers; we maintain head and tail pointers in global variables. -(Note: although the list links are in the buffer headers, they are -considered to be protected by the buffer_strategy_lock, not the buffer-header -spinlocks.) To choose a victim buffer to recycle when there are no free -buffers available, we use a simple clock-sweep algorithm, which avoids the -need to take system-wide locks during common operations. It works like -this: +To choose a victim buffer to recycle we use a simple clock-sweep algorithm. It +works like this: Each buffer header contains a usage counter, which is incremented (up to a small limit value) whenever the buffer is pinned. (This requires only the @@ -184,20 +174,14 @@ The algorithm for a process that needs to obtain a victim buffer is: 1. Obtain buffer_strategy_lock. -2. If buffer free list is nonempty, remove its head buffer. Release -buffer_strategy_lock. If the buffer is pinned or has a nonzero usage count, -it cannot be used; ignore it go back to step 1. Otherwise, pin the buffer, -and return it. +2. Select the buffer pointed to by nextVictimBuffer, and circularly advance +nextVictimBuffer for next time. Release buffer_strategy_lock. -3. Otherwise, the buffer free list is empty. Select the buffer pointed to by -nextVictimBuffer, and circularly advance nextVictimBuffer for next time. -Release buffer_strategy_lock. - -4. If the selected buffer is pinned or has a nonzero usage count, it cannot +3. If the selected buffer is pinned or has a nonzero usage count, it cannot be used. 
Decrement its usage count (if nonzero), reacquire buffer_strategy_lock, and return to step 2 to examine the next buffer. -5. Pin the selected buffer, and return. +4. Pin the selected buffer, and return. (Note that if the selected buffer is dirty, we will have to write it out before we can recycle it; if someone else pins the buffer meanwhile we will @@ -211,9 +195,9 @@ Buffer Ring Replacement Strategy When running a query that needs to access a large number of pages just once, such as VACUUM or a large sequential scan, a different strategy is used. A page that has been touched only by such a scan is unlikely to be needed -again soon, so instead of running the normal clock sweep algorithm and +again soon, so instead of running the normal clock-sweep algorithm and blowing out the entire buffer cache, a small ring of buffers is allocated -using the normal clock sweep algorithm and those buffers are reused for the +using the normal clock-sweep algorithm and those buffers are reused for the whole scan. This also implies that much of the write traffic caused by such a statement will be done by the backend itself and not pushed off onto other processes. @@ -234,7 +218,7 @@ the ring strategy effectively degrades to the normal strategy. VACUUM uses a ring like sequential scans, however, the size of this ring is controlled by the vacuum_buffer_usage_limit GUC. Dirty pages are not removed -from the ring. Instead, WAL is flushed if needed to allow reuse of the +from the ring. Instead, the WAL is flushed if needed to allow reuse of the buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's buffers were sent to the freelist, which was effectively a buffer ring of 1 buffer, resulting in excessive WAL flushing. diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index ed1dc488a42b4..6fd3a6bbac5ea 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -128,20 +128,11 @@ BufferManagerShmemInit(void) pgaio_wref_clear(&buf->io_wref); - /* - * Initially link all the buffers together as unused. Subsequent - * management of this list is done by freelist.c. - */ - buf->freeNext = i + 1; - LWLockInitialize(BufferDescriptorGetContentLock(buf), LWTRANCHE_BUFFER_CONTENT); ConditionVariableInit(BufferDescriptorGetIOCV(buf)); } - - /* Correct last entry of linked list */ - GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; } /* Init other shared buffer-management stuff */ diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index a50955d5286ca..9d256559bab9d 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -62,7 +62,7 @@ InitBufTable(int size) SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table", size, size, &info, - HASH_ELEM | HASH_BLOBS | HASH_PARTITION); + HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE); } /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f93131a645ea8..301b2ecd8828d 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -90,10 +90,32 @@ */ #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32) +/* + * This is separated out from PrivateRefCountEntry to allow for copying all + * the data members via struct assignment. + */ +typedef struct PrivateRefCountData +{ + /* + * How many times the buffer has been pinned by this backend.
+ */ + int32 refcount; +} PrivateRefCountData; + typedef struct PrivateRefCountEntry { + /* + * Note that this needs to be the same as the entry's corresponding + * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We + * store it in both places as this is used for the hashtable key and + * because it is more convenient (passing around a PrivateRefCountEntry + * suffices to identify the buffer) and faster (checking the keys array is + * faster when checking many entries; checking the entry is faster if just + * checking a single entry). + */ Buffer buffer; - int32 refcount; + + PrivateRefCountData data; } PrivateRefCountEntry; /* 64 bytes, about the size of a cache line on common systems */ @@ -194,7 +216,8 @@ static BufferDesc *PinCountWaitBuf = NULL; * * To avoid - as we used to - requiring an array with NBuffers entries to keep * track of local buffers, we use a small sequentially searched array - * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to + * (PrivateRefCountArrayKeys, with the corresponding data stored in + * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to * keep track of backend local pins. * * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all @@ -212,11 +235,13 @@ static BufferDesc *PinCountWaitBuf = NULL; * memory allocations in NewPrivateRefCountEntry() which can be important * because in some scenarios it's called with a spinlock held... */ +static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]; static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]; static HTAB *PrivateRefCountHash = NULL; static int32 PrivateRefCountOverflowed = 0; static uint32 PrivateRefCountClock = 0; -static PrivateRefCountEntry *ReservedRefCountEntry = NULL; +static int ReservedRefCountSlot = -1; +static int PrivateRefCountEntryLast = -1; static uint32 MaxProportionalPins; @@ -259,7 +284,7 @@ static void ReservePrivateRefCountEntry(void) { /* Already reserved (or freed), nothing to do */ - if (ReservedRefCountEntry != NULL) + if (ReservedRefCountSlot != -1) return; /* @@ -271,16 +296,19 @@ for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) { - PrivateRefCountEntry *res; - - res = &PrivateRefCountArray[i]; - - if (res->buffer == InvalidBuffer) + if (PrivateRefCountArrayKeys[i] == InvalidBuffer) { - ReservedRefCountEntry = res; - return; + ReservedRefCountSlot = i; + + /* + * We could return immediately, but iterating to the end of + * the array allows compiler auto-vectorization. + */ } } + + if (ReservedRefCountSlot != -1) + return; } /* @@ -292,27 +320,37 @@ * Move entry from the current clock position in the array into the * hashtable. Use that slot. */ + int victim_slot; + PrivateRefCountEntry *victim_entry; PrivateRefCountEntry *hashent; bool found; /* select victim slot */ - ReservedRefCountEntry = - &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES]; + victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES; + victim_entry = &PrivateRefCountArray[victim_slot]; + ReservedRefCountSlot = victim_slot; /* Better be used, otherwise we shouldn't get here.
*/ - Assert(ReservedRefCountEntry->buffer != InvalidBuffer); + Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer); + Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer); + Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer); /* enter victim array entry into hashtable */ hashent = hash_search(PrivateRefCountHash, - &(ReservedRefCountEntry->buffer), + &PrivateRefCountArrayKeys[victim_slot], HASH_ENTER, &found); Assert(!found); - hashent->refcount = ReservedRefCountEntry->refcount; + /* move data from the entry in the array to the hash entry */ + hashent->data = victim_entry->data; /* clear the now free array slot */ - ReservedRefCountEntry->buffer = InvalidBuffer; - ReservedRefCountEntry->refcount = 0; + PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer; + victim_entry->buffer = InvalidBuffer; + + /* clear the whole data member, just for future proofing */ + memset(&victim_entry->data, 0, sizeof(victim_entry->data)); + victim_entry->data.refcount = 0; PrivateRefCountOverflowed++; } @@ -327,45 +365,56 @@ NewPrivateRefCountEntry(Buffer buffer) PrivateRefCountEntry *res; /* only allowed to be called when a reservation has been made */ - Assert(ReservedRefCountEntry != NULL); + Assert(ReservedRefCountSlot != -1); /* use up the reserved entry */ - res = ReservedRefCountEntry; - ReservedRefCountEntry = NULL; + res = &PrivateRefCountArray[ReservedRefCountSlot]; /* and fill it */ + PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer; res->buffer = buffer; - res->refcount = 0; + res->data.refcount = 0; + + /* update cache for the next lookup */ + PrivateRefCountEntryLast = ReservedRefCountSlot; + + ReservedRefCountSlot = -1; return res; } /* - * Return the PrivateRefCount entry for the passed buffer. - * - * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if - * do_move is true, and the entry resides in the hashtable the entry is - * optimized for frequent access by moving it to the array. + * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth + * inlining. This particularly seems to be true if the compiler is capable of + * auto-vectorizing the code, as that imposes additional stack-alignment + * requirements etc. */ -static PrivateRefCountEntry * -GetPrivateRefCountEntry(Buffer buffer, bool do_move) +static pg_noinline PrivateRefCountEntry * +GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move) { PrivateRefCountEntry *res; + int match = -1; int i; - Assert(BufferIsValid(buffer)); - Assert(!BufferIsLocal(buffer)); - /* * First search for references in the array, that'll be sufficient in the * majority of cases. 
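+	 * + * The array is only REFCOUNT_ARRAY_ENTRIES entries long, so the linear + * scan is cheap, and scanning the whole array without an early return is + * what lets the compiler auto-vectorize the loop (see the match handling + * below).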
*/ for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) { - res = &PrivateRefCountArray[i]; + if (PrivateRefCountArrayKeys[i] == buffer) + { + match = i; + /* see ReservePrivateRefCountEntry() for why we don't return */ + } + } + + if (likely(match != -1)) + { + /* update cache for the next lookup */ + PrivateRefCountEntryLast = match; - if (res->buffer == buffer) - return res; + return &PrivateRefCountArray[match]; } /* @@ -397,14 +446,20 @@ ReservePrivateRefCountEntry(); /* Use up the reserved slot */ - Assert(ReservedRefCountEntry != NULL); - free = ReservedRefCountEntry; - ReservedRefCountEntry = NULL; + Assert(ReservedRefCountSlot != -1); + free = &PrivateRefCountArray[ReservedRefCountSlot]; + Assert(PrivateRefCountArrayKeys[ReservedRefCountSlot] == free->buffer); Assert(free->buffer == InvalidBuffer); /* and fill it */ free->buffer = buffer; - free->refcount = res->refcount; + free->data = res->data; + PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer; + /* update cache for the next lookup */ + PrivateRefCountEntryLast = match; + + ReservedRefCountSlot = -1; + /* delete from hashtable */ hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found); @@ -416,6 +471,43 @@ } } +/* + * Return the PrivateRefCount entry for the passed buffer. + * + * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if + * do_move is true and the entry resides in the hashtable, the entry is + * optimized for frequent access by moving it to the array. + */ +static inline PrivateRefCountEntry * +GetPrivateRefCountEntry(Buffer buffer, bool do_move) +{ + Assert(BufferIsValid(buffer)); + Assert(!BufferIsLocal(buffer)); + + /* + * It's very common to look up the same buffer repeatedly. To make that + * fast, we have a one-entry cache. + * + * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is + * faster to check PrivateRefCountArray[].buffer, as in the case of a hit + * fewer addresses are computed and fewer cachelines are accessed. Whereas + * in GetPrivateRefCountEntrySlow()'s case, checking + * PrivateRefCountArrayKeys saves a lot of memory accesses. + */ + if (likely(PrivateRefCountEntryLast != -1) && + likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer)) + { + return &PrivateRefCountArray[PrivateRefCountEntryLast]; + } + + /* + * The code for the cached lookup is small enough to be worth inlining + * into the caller. In the miss case, however, that empirically doesn't + * seem worth it. + */ + return GetPrivateRefCountEntrySlow(buffer, do_move); +} + /* * Returns how many times the passed buffer is pinned by this backend. * @@ -437,7 +529,7 @@ GetPrivateRefCount(Buffer buffer) if (ref == NULL) return 0; - return ref->data.refcount; } /* @@ -447,19 +539,21 @@ static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref) { - Assert(ref->refcount == 0); + Assert(ref->data.refcount == 0); if (ref >= &PrivateRefCountArray[0] && ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]) { ref->buffer = InvalidBuffer; + PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer; + + /* * Mark the just used entry as reserved - in many scenarios that * allows us to avoid ever having to search the array/hash for free * entries.
*/ - ReservedRefCountEntry = ref; + ReservedRefCountSlot = ref - PrivateRefCountArray; } else { @@ -512,12 +606,12 @@ static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by); -static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy); +static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, + bool skip_if_not_valid); static void PinBuffer_Locked(BufferDesc *buf); static void UnpinBuffer(BufferDesc *buf); static void UnpinBufferNoOwner(BufferDesc *buf); static void BufferSync(int flags); -static uint32 WaitBufHdrUnlocked(BufferDesc *buf); static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context); static void WaitIO(BufferDesc *buf); @@ -533,6 +627,8 @@ static inline BufferDesc *BufferAlloc(SMgrRelation smgr, static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress); static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete); static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context); +static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, + IOObject io_object, IOContext io_context); static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context); static void FindAndDropRelationBuffers(RelFileLocator rlocator, @@ -685,7 +781,6 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN BufferDesc *bufHdr; BufferTag tag; uint32 buf_state; - bool have_private_ref; Assert(BufferIsValid(recent_buffer)); @@ -713,38 +808,24 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN else { bufHdr = GetBufferDescriptor(recent_buffer - 1); - have_private_ref = GetPrivateRefCount(recent_buffer) > 0; /* - * Do we already have this buffer pinned with a private reference? If - * so, it must be valid and it is safe to check the tag without - * locking. If not, we have to lock the header first and then check. + * Is it still valid and holding the right tag? We do an unlocked tag + * comparison first, to make it unlikely that we'll increment the + * usage counter of the wrong buffer, if someone calls us with a very + * out of date recent_buffer. Then we'll check it again if we get the + * pin. */ - if (have_private_ref) - buf_state = pg_atomic_read_u32(&bufHdr->state); - else - buf_state = LockBufHdr(bufHdr); - - if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag)) + if (BufferTagsEqual(&tag, &bufHdr->tag) && + PinBuffer(bufHdr, NULL, true)) { - /* - * It's now safe to pin the buffer. We can't pin first and ask - * questions later, because it might confuse code paths like - * InvalidateBuffer() if we pinned a random non-matching buffer. - */ - if (have_private_ref) - PinBuffer(bufHdr, NULL); /* bump pin count */ - else - PinBuffer_Locked(bufHdr); /* pin for first time */ - - pgBufferUsage.shared_blks_hit++; - - return true; + if (BufferTagsEqual(&tag, &bufHdr->tag)) + { + pgBufferUsage.shared_blks_hit++; + return true; + } + UnpinBuffer(bufHdr); } - - /* If we locked the header above, now unlock. 
*/ - if (!have_private_ref) - UnlockBufHdr(bufHdr, buf_state); } return false; @@ -896,14 +977,11 @@ ExtendBufferedRelBy(BufferManagerRelation bmr, uint32 *extended_by) { Assert((bmr.rel != NULL) != (bmr.smgr != NULL)); - Assert(bmr.smgr == NULL || bmr.relpersistence != 0); + Assert(bmr.smgr == NULL || bmr.relpersistence != '\0'); Assert(extend_by > 0); - if (bmr.smgr == NULL) - { - bmr.smgr = RelationGetSmgr(bmr.rel); + if (bmr.relpersistence == '\0') bmr.relpersistence = bmr.rel->rd_rel->relpersistence; - } return ExtendBufferedRelCommon(bmr, fork, strategy, flags, extend_by, InvalidBlockNumber, @@ -932,14 +1010,11 @@ ExtendBufferedRelTo(BufferManagerRelation bmr, Buffer buffers[64]; Assert((bmr.rel != NULL) != (bmr.smgr != NULL)); - Assert(bmr.smgr == NULL || bmr.relpersistence != 0); + Assert(bmr.smgr == NULL || bmr.relpersistence != '\0'); Assert(extend_to != InvalidBlockNumber && extend_to > 0); - if (bmr.smgr == NULL) - { - bmr.smgr = RelationGetSmgr(bmr.rel); + if (bmr.relpersistence == '\0') bmr.relpersistence = bmr.rel->rd_rel->relpersistence; - } /* * If desired, create the file if it doesn't exist. If @@ -947,15 +1022,15 @@ ExtendBufferedRelTo(BufferManagerRelation bmr, * an smgrexists call. */ if ((flags & EB_CREATE_FORK_IF_NEEDED) && - (bmr.smgr->smgr_cached_nblocks[fork] == 0 || - bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) && - !smgrexists(bmr.smgr, fork)) + (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 || + BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) && + !smgrexists(BMR_GET_SMGR(bmr), fork)) { LockRelationForExtension(bmr.rel, ExclusiveLock); /* recheck, fork might have been created concurrently */ - if (!smgrexists(bmr.smgr, fork)) - smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY); + if (!smgrexists(BMR_GET_SMGR(bmr), fork)) + smgrcreate(BMR_GET_SMGR(bmr), fork, flags & EB_PERFORMING_RECOVERY); UnlockRelationForExtension(bmr.rel, ExclusiveLock); } @@ -965,13 +1040,13 @@ ExtendBufferedRelTo(BufferManagerRelation bmr, * kernel. */ if (flags & EB_CLEAR_SIZE_CACHE) - bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber; + BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber; /* * Estimate how many pages we'll need to extend by. This avoids acquiring * unnecessarily many victim buffers. */ - current_size = smgrnblocks(bmr.smgr, fork); + current_size = smgrnblocks(BMR_GET_SMGR(bmr), fork); /* * Since no-one else can be looking at the page contents yet, there is no @@ -1015,7 +1090,7 @@ ExtendBufferedRelTo(BufferManagerRelation bmr, if (buffer == InvalidBuffer) { Assert(extended_by == 0); - buffer = ReadBuffer_common(bmr.rel, bmr.smgr, bmr.relpersistence, + buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence, fork, extend_to - 1, mode, strategy); } @@ -1080,7 +1155,7 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid) * already valid.) */ if (!isLocalBuf) - LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* Set BM_VALID, terminate IO, and wake up any waiters */ if (isLocalBuf) @@ -1484,11 +1559,6 @@ StartReadBuffersImpl(ReadBuffersOperation *operation, * buffers must remain valid until WaitReadBuffers() is called, and any * forwarded buffers must also be preserved for a continuing call unless * they are explicitly released. 
- * - * Currently the I/O is only started with optional operating system advice if - * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O - * happens synchronously in WaitReadBuffers(). In future work, true I/O could - * be initiated here. */ bool StartReadBuffers(ReadBuffersOperation *operation, @@ -2014,6 +2084,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, Buffer victim_buffer; BufferDesc *victim_buf_hdr; uint32 victim_buf_state; + uint32 set_bits = 0; /* Make sure we will have room to remember the buffer pin */ ResourceOwnerEnlarge(CurrentResourceOwner); @@ -2041,7 +2112,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ buf = GetBufferDescriptor(existing_buf_id); - valid = PinBuffer(buf, strategy); + valid = PinBuffer(buf, strategy, false); /* Can release the mapping lock as soon as we've pinned it */ LWLockRelease(newPartitionLock); @@ -2099,17 +2170,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ UnpinBuffer(victim_buf_hdr); - /* - * The victim buffer we acquired previously is clean and unused, let - * it be found again quickly - */ - StrategyFreeBuffer(victim_buf_hdr); - /* remaining code should match code at top of routine */ existing_buf_hdr = GetBufferDescriptor(existing_buf_id); - valid = PinBuffer(existing_buf_hdr, strategy); + valid = PinBuffer(existing_buf_hdr, strategy, false); /* Can release the mapping lock as soon as we've pinned it */ LWLockRelease(newPartitionLock); @@ -2146,11 +2211,12 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * checkpoints, except for their "init" forks, which need to be treated * just like permanent relations. */ - victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; + set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM) - victim_buf_state |= BM_PERMANENT; + set_bits |= BM_PERMANENT; - UnlockBufHdr(victim_buf_hdr, victim_buf_state); + UnlockBufHdrExt(victim_buf_hdr, victim_buf_state, + set_bits, 0, 0); LWLockRelease(newPartitionLock); @@ -2163,8 +2229,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* - * InvalidateBuffer -- mark a shared buffer invalid and return it to the - * freelist. + * InvalidateBuffer -- mark a shared buffer invalid. * * The buffer header spinlock must be held at entry. We drop it before * returning. (This is sane because the caller must have locked the @@ -2191,9 +2256,7 @@ InvalidateBuffer(BufferDesc *buf) /* Save the original buffer tag before dropping the spinlock */ oldTag = buf->tag; - buf_state = pg_atomic_read_u32(&buf->state); - Assert(buf_state & BM_LOCKED); - UnlockBufHdr(buf, buf_state); + UnlockBufHdr(buf); /* * Need to compute the old tag's hashcode and partition lock ID. 
XXX is it @@ -2217,7 +2280,7 @@ InvalidateBuffer(BufferDesc *buf) /* If it's changed while we were waiting for lock, do nothing */ if (!BufferTagsEqual(&buf->tag, &oldTag)) { - UnlockBufHdr(buf, buf_state); + UnlockBufHdr(buf); LWLockRelease(oldPartitionLock); return; } @@ -2234,7 +2297,7 @@ InvalidateBuffer(BufferDesc *buf) */ if (BUF_STATE_GET_REFCOUNT(buf_state) != 0) { - UnlockBufHdr(buf, buf_state); + UnlockBufHdr(buf); LWLockRelease(oldPartitionLock); /* safety check: should definitely not be our *own* pin */ if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0) @@ -2249,8 +2312,11 @@ InvalidateBuffer(BufferDesc *buf) */ oldFlags = buf_state & BUF_FLAG_MASK; ClearBufferTag(&buf->tag); - buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK); - UnlockBufHdr(buf, buf_state); + + UnlockBufHdrExt(buf, buf_state, + 0, + BUF_FLAG_MASK | BUF_USAGECOUNT_MASK, + 0); /* * Remove the buffer from the lookup hashtable, if it was in there. @@ -2262,11 +2328,6 @@ InvalidateBuffer(BufferDesc *buf) * Done with mapping lock. */ LWLockRelease(oldPartitionLock); - - /* - * Insert the buffer at the head of the list of free buffers. - */ - StrategyFreeBuffer(buf); } /* @@ -2315,7 +2376,7 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr) { Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); - UnlockBufHdr(buf_hdr, buf_state); + UnlockBufHdr(buf_hdr); LWLockRelease(partition_lock); return false; @@ -2329,8 +2390,10 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr) * tag (see e.g. FlushDatabaseBuffers()). */ ClearBufferTag(&buf_hdr->tag); - buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK); - UnlockBufHdr(buf_hdr, buf_state); + UnlockBufHdrExt(buf_hdr, buf_state, + 0, + BUF_FLAG_MASK | BUF_USAGECOUNT_MASK, + 0); Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); @@ -2339,6 +2402,7 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr) LWLockRelease(partition_lock); + buf_state = pg_atomic_read_u32(&buf_hdr->state); Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID))); Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0); @@ -2355,8 +2419,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) bool from_ring; /* - * Ensure, while the spinlock's not yet held, that there's a free refcount - * entry, and a resource owner slot for the pin. + * Ensure, before we pin a victim buffer, that there's a free refcount + * entry and resource owner slot for the pin. */ ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); @@ -2365,17 +2429,12 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) again: /* - * Select a victim buffer. The buffer is returned with its header - * spinlock still held! + * Select a victim buffer. The buffer is returned pinned and owned by + * this backend. */ buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring); buf = BufferDescriptorGetBuffer(buf_hdr); - Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0); - - /* Pin the buffer and then release the buffer spinlock */ - PinBuffer_Locked(buf_hdr); - /* * We shouldn't have any other pins for this buffer. 
*/ @@ -2434,7 +2493,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) /* Read the LSN while holding buffer header lock */ buf_state = LockBufHdr(buf_hdr); lsn = BufferGetLSN(buf_hdr); - UnlockBufHdr(buf_hdr, buf_state); + UnlockBufHdr(buf_hdr); if (XLogNeedsFlush(lsn) && StrategyRejectBuffer(strategy, buf_hdr, from_ring)) @@ -2575,10 +2634,10 @@ ExtendBufferedRelCommon(BufferManagerRelation bmr, BlockNumber first_block; TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork, - bmr.smgr->smgr_rlocator.locator.spcOid, - bmr.smgr->smgr_rlocator.locator.dbOid, - bmr.smgr->smgr_rlocator.locator.relNumber, - bmr.smgr->smgr_rlocator.backend, + BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid, + BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid, + BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber, + BMR_GET_SMGR(bmr)->smgr_rlocator.backend, extend_by); if (bmr.relpersistence == RELPERSISTENCE_TEMP) @@ -2592,10 +2651,10 @@ ExtendBufferedRelCommon(BufferManagerRelation bmr, *extended_by = extend_by; TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork, - bmr.smgr->smgr_rlocator.locator.spcOid, - bmr.smgr->smgr_rlocator.locator.dbOid, - bmr.smgr->smgr_rlocator.locator.relNumber, - bmr.smgr->smgr_rlocator.backend, + BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid, + BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid, + BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber, + BMR_GET_SMGR(bmr)->smgr_rlocator.backend, *extended_by, first_block); @@ -2661,9 +2720,9 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, * kernel. */ if (flags & EB_CLEAR_SIZE_CACHE) - bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber; + BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber; - first_block = smgrnblocks(bmr.smgr, fork); + first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork); /* * Now that we have the accurate relation size, check if the caller wants @@ -2684,11 +2743,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, { BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1); - /* - * The victim buffer we acquired previously is clean and unused, - * let it be found again quickly - */ - StrategyFreeBuffer(buf_hdr); UnpinBuffer(buf_hdr); } @@ -2706,7 +2760,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("cannot extend relation %s beyond %u blocks", - relpath(bmr.smgr->smgr_rlocator, fork).str, + relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str, MaxBlockNumber))); /* @@ -2728,7 +2782,8 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, ResourceOwnerEnlarge(CurrentResourceOwner); ReservePrivateRefCountEntry(); - InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i); + InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork, + first_block + i); hash = BufTableHashCode(&tag); partition_lock = BufMappingPartitionLock(hash); @@ -2743,12 +2798,10 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, * because mdread doesn't complain about reads beyond EOF (when * zero_damaged_pages is ON) and so a previous attempt to read a block * beyond EOF could have left a "valid" zero-filled buffer. - * Unfortunately, we have also seen this case occurring because of - * buggy Linux kernels that sometimes return an lseek(SEEK_END) result - * that doesn't account for a recent write. In that situation, the - * pre-existing buffer would contain valid data that we don't want to - * overwrite. Since the legitimate cases should always have left a - * zero-filled buffer, complain if not PageIsNew. 
+	 *
+	 * This has also been observed when a relation was overwritten by an
+	 * external process. Since the legitimate cases should always have
+	 * left a zero-filled buffer, complain if not PageIsNew.
	 */
	if (existing_id >= 0)
	{
@@ -2760,15 +2813,9 @@
		 * Pin the existing buffer before releasing the partition lock,
		 * preventing it from being evicted.
		 */
-		valid = PinBuffer(existing_hdr, strategy);
+		valid = PinBuffer(existing_hdr, strategy, false);

		LWLockRelease(partition_lock);
-
-		/*
-		 * The victim buffer we acquired previously is clean and unused,
-		 * let it be found again quickly
-		 */
-		StrategyFreeBuffer(victim_buf_hdr);
		UnpinBuffer(victim_buf_hdr);

		buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
@@ -2776,10 +2823,9 @@

		if (valid && !PageIsNew((Page) buf_block))
			ereport(ERROR,
-					(errmsg("unexpected data beyond EOF in block %u of relation %s",
+					(errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
							existing_hdr->tag.blockNum,
-							relpath(bmr.smgr->smgr_rlocator, fork).str),
-					 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
+							relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));

		/*
		 * We *must* do smgr[zero]extend before succeeding, else the page
@@ -2793,15 +2839,13 @@
		 */
		do
		{
-			uint32		buf_state = LockBufHdr(existing_hdr);
-
-			buf_state &= ~BM_VALID;
-			UnlockBufHdr(existing_hdr, buf_state);
+			pg_atomic_fetch_and_u32(&existing_hdr->state, ~BM_VALID);
		} while (!StartBufferIO(existing_hdr, true, false));
	}
	else
	{
		uint32		buf_state;
+		uint32		set_bits = 0;

		buf_state = LockBufHdr(victim_buf_hdr);

@@ -2811,11 +2855,13 @@

		victim_buf_hdr->tag = tag;

-		buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+		set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
		if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
-			buf_state |= BM_PERMANENT;
+			set_bits |= BM_PERMANENT;

-		UnlockBufHdr(victim_buf_hdr, buf_state);
+		UnlockBufHdrExt(victim_buf_hdr, buf_state,
+						set_bits, 0,
+						0);

		LWLockRelease(partition_lock);

@@ -2836,7 +2882,7 @@
	 *
	 * We don't need to set checksum for all-zero pages.
	 */
-	smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
+	smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);

	/*
	 * Release the file-extension lock; it's now OK for someone else to extend
@@ -2868,7 +2914,7 @@
		}

		if (lock)
-			LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
+			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
	}
@@ -2881,14 +2927,40 @@
}

/*
- * BufferIsExclusiveLocked
+ * BufferIsLockedByMe
+ *
+ * Checks if this backend has the buffer locked in any mode.
+ *
+ * Buffer must be pinned.
+ */
+bool
+BufferIsLockedByMe(Buffer buffer)
+{
+	BufferDesc *bufHdr;
+
+	Assert(BufferIsPinned(buffer));
+
+	if (BufferIsLocal(buffer))
+	{
+		/* Content locks are not maintained for local buffers. */
+		return true;
+	}
+	else
+	{
+		bufHdr = GetBufferDescriptor(buffer - 1);
+		return LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr));
+	}
+}
+
+/*
+ * BufferIsLockedByMeInMode
  *
- * Checks if buffer is exclusive-locked.
+ * Checks if this backend has the buffer locked in the specified mode.
* * Buffer must be pinned. */ bool -BufferIsExclusiveLocked(Buffer buffer) +BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode) { BufferDesc *bufHdr; @@ -2901,9 +2973,23 @@ BufferIsExclusiveLocked(Buffer buffer) } else { + LWLockMode lw_mode; + + switch (mode) + { + case BUFFER_LOCK_EXCLUSIVE: + lw_mode = LW_EXCLUSIVE; + break; + case BUFFER_LOCK_SHARE: + lw_mode = LW_SHARED; + break; + default: + pg_unreachable(); + } + bufHdr = GetBufferDescriptor(buffer - 1); return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), - LW_EXCLUSIVE); + lw_mode); } } @@ -2932,8 +3018,7 @@ BufferIsDirty(Buffer buffer) else { bufHdr = GetBufferDescriptor(buffer - 1); - Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), - LW_EXCLUSIVE)); + Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE)); } return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY; @@ -2967,9 +3052,12 @@ MarkBufferDirty(Buffer buffer) bufHdr = GetBufferDescriptor(buffer - 1); Assert(BufferIsPinned(buffer)); - Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), - LW_EXCLUSIVE)); + Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE)); + /* + * NB: We have to wait for the buffer header spinlock to be not held, as + * TerminateBufferIO() relies on the spinlock. + */ old_buf_state = pg_atomic_read_u32(&bufHdr->state); for (;;) { @@ -3066,17 +3154,20 @@ ReleaseAndReadBuffer(Buffer buffer, * must have been done already. * * Returns true if buffer is BM_VALID, else false. This provision allows - * some callers to avoid an extra spinlock cycle. + * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is + * true, then a false return value also indicates that the buffer was + * (recently) invalid and has not been pinned. */ static bool -PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) +PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, + bool skip_if_not_valid) { Buffer b = BufferDescriptorGetBuffer(buf); bool result; PrivateRefCountEntry *ref; Assert(!BufferIsLocal(b)); - Assert(ReservedRefCountEntry != NULL); + Assert(ReservedRefCountSlot != -1); ref = GetPrivateRefCountEntry(b, true); @@ -3085,11 +3176,16 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) uint32 buf_state; uint32 old_buf_state; - ref = NewPrivateRefCountEntry(b); - old_buf_state = pg_atomic_read_u32(&buf->state); for (;;) { + if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID))) + return false; + + /* + * We're not allowed to increase the refcount while the buffer + * header spinlock is held. Wait for the lock to be released. + */ if (old_buf_state & BM_LOCKED) old_buf_state = WaitBufHdrUnlocked(buf); @@ -3119,14 +3215,7 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) { result = (buf_state & BM_VALID) != 0; - /* - * Assume that we acquired a buffer pin for the purposes of - * Valgrind buffer client checks (even in !result case) to - * keep things simple. Buffers that are unsafe to access are - * not generally guaranteed to be marked undefined or - * non-accessible in any case. - */ - VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); + TrackNewBufferPin(b); break; } } @@ -3149,11 +3238,12 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) * cannot meddle with that. 
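	 *
	 * Illustrative aside (not part of the patch; simplified): this pin
	 * guarantee is what makes the unlocked check-pin-recheck pattern in
	 * ReadRecentBuffer() above safe:
	 *
	 *     if (tag matches)              cheap, racy precheck
	 *     {
	 *         pin();                    tag/validity now can't change
	 *         if (tag still matches)
	 *             use the buffer;       recheck is authoritative
	 *         else
	 *             unpin();              lost the race, undo
	 *     }
	 *
	 * Without the pin the recheck would prove nothing, since the buffer
	 * could be evicted and reused between the check and the use.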
*/ result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0; + + Assert(ref->data.refcount > 0); + ref->data.refcount++; + ResourceOwnerRememberBuffer(CurrentResourceOwner, b); } - ref->refcount++; - Assert(ref->refcount > 0); - ResourceOwnerRememberBuffer(CurrentResourceOwner, b); return result; } @@ -3182,9 +3272,7 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) static void PinBuffer_Locked(BufferDesc *buf) { - Buffer b; - PrivateRefCountEntry *ref; - uint32 buf_state; + uint32 old_buf_state; /* * As explained, We don't expect any preexisting pins. That allows us to @@ -3192,28 +3280,16 @@ PinBuffer_Locked(BufferDesc *buf) */ Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL); - /* - * Buffer can't have a preexisting pin, so mark its page as defined to - * Valgrind (this is similar to the PinBuffer() case where the backend - * doesn't already have a buffer pin) - */ - VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); - /* * Since we hold the buffer spinlock, we can update the buffer state and * release the lock in one operation. */ - buf_state = pg_atomic_read_u32(&buf->state); - Assert(buf_state & BM_LOCKED); - buf_state += BUF_REFCOUNT_ONE; - UnlockBufHdr(buf, buf_state); + old_buf_state = pg_atomic_read_u32(&buf->state); - b = BufferDescriptorGetBuffer(buf); + UnlockBufHdrExt(buf, old_buf_state, + 0, 0, 1); - ref = NewPrivateRefCountEntry(b); - ref->refcount++; - - ResourceOwnerRememberBuffer(CurrentResourceOwner, b); + TrackNewBufferPin(BufferDescriptorGetBuffer(buf)); } /* @@ -3246,12 +3322,13 @@ WakePinCountWaiter(BufferDesc *buf) /* we just released the last pin other than the waiter's */ int wait_backend_pgprocno = buf->wait_backend_pgprocno; - buf_state &= ~BM_PIN_COUNT_WAITER; - UnlockBufHdr(buf, buf_state); + UnlockBufHdrExt(buf, buf_state, + 0, BM_PIN_COUNT_WAITER, + 0); ProcSendSignal(wait_backend_pgprocno); } else - UnlockBufHdr(buf, buf_state); + UnlockBufHdr(buf); } /* @@ -3280,11 +3357,10 @@ UnpinBufferNoOwner(BufferDesc *buf) /* not moving as we're likely deleting it soon anyway */ ref = GetPrivateRefCountEntry(b, false); Assert(ref != NULL); - Assert(ref->refcount > 0); - ref->refcount--; - if (ref->refcount == 0) + Assert(ref->data.refcount > 0); + ref->data.refcount--; + if (ref->data.refcount == 0) { - uint32 buf_state; uint32 old_buf_state; /* @@ -3296,38 +3372,50 @@ UnpinBufferNoOwner(BufferDesc *buf) */ VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ); - /* I'd better not still hold the buffer content lock */ - Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf))); - /* - * Decrement the shared reference count. - * - * Since buffer spinlock holder can update status using just write, - * it's not safe to use atomic decrement here; thus use a CAS loop. + * I'd better not still hold the buffer content lock. Can't use + * BufferIsLockedByMe(), as that asserts the buffer is pinned. 
*/ - old_buf_state = pg_atomic_read_u32(&buf->state); - for (;;) - { - if (old_buf_state & BM_LOCKED) - old_buf_state = WaitBufHdrUnlocked(buf); - - buf_state = old_buf_state; - - buf_state -= BUF_REFCOUNT_ONE; + Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf))); - if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, - buf_state)) - break; - } + /* decrement the shared reference count */ + old_buf_state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE); /* Support LockBufferForCleanup() */ - if (buf_state & BM_PIN_COUNT_WAITER) + if (old_buf_state & BM_PIN_COUNT_WAITER) WakePinCountWaiter(buf); ForgetPrivateRefCountEntry(ref); } } +/* + * Set up backend-local tracking of a buffer pinned the first time by this + * backend. + */ +inline void +TrackNewBufferPin(Buffer buf) +{ + PrivateRefCountEntry *ref; + + ref = NewPrivateRefCountEntry(buf); + ref->data.refcount++; + + ResourceOwnerRememberBuffer(CurrentResourceOwner, buf); + + /* + * This is the first pin for this page by this backend, mark its page as + * defined to valgrind. While the page contents might not actually be + * valid yet, we don't currently guarantee that such pages are marked + * undefined or non-accessible. + * + * It's not necessarily the prettiest to do this here, but otherwise we'd + * need this block of code in multiple places. + */ + VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(GetBufferDescriptor(buf - 1)), + BLCKSZ); +} + #define ST_SORT sort_checkpoint_bufferids #define ST_ELEMENT_TYPE CkptSortItem #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b) @@ -3339,10 +3427,10 @@ UnpinBufferNoOwner(BufferDesc *buf) * BufferSync -- Write out all dirty buffers in the pool. * * This is called at checkpoint time to write out all dirty shared buffers. - * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE - * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN, - * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even - * unlogged buffers, which are otherwise skipped. The remaining flags + * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is + * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN, + * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write + * even unlogged buffers, which are otherwise skipped. The remaining flags * currently have no effect here. */ static void @@ -3358,7 +3446,7 @@ BufferSync(int flags) Oid last_tsid; binaryheap *ts_heap; int i; - int mask = BM_DIRTY; + uint32 mask = BM_DIRTY; WritebackContext wb_context; /* @@ -3367,7 +3455,7 @@ BufferSync(int flags) * recovery, we write all dirty buffers. */ if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | - CHECKPOINT_FLUSH_ALL)))) + CHECKPOINT_FLUSH_UNLOGGED)))) mask |= BM_PERMANENT; /* @@ -3390,6 +3478,7 @@ BufferSync(int flags) for (buf_id = 0; buf_id < NBuffers; buf_id++) { BufferDesc *bufHdr = GetBufferDescriptor(buf_id); + uint32 set_bits = 0; /* * Header spinlock is enough to examine BM_DIRTY, see comment in @@ -3401,7 +3490,7 @@ BufferSync(int flags) { CkptSortItem *item; - buf_state |= BM_CHECKPOINT_NEEDED; + set_bits = BM_CHECKPOINT_NEEDED; item = &CkptBufferIds[num_to_scan++]; item->buf_id = buf_id; @@ -3411,7 +3500,9 @@ BufferSync(int flags) item->blockNum = bufHdr->tag.blockNum; } - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdrExt(bufHdr, buf_state, + set_bits, 0, + 0); /* Check for barrier events in case NBuffers is large. 
*/ if (ProcSignalBarrierPending) @@ -3616,7 +3707,7 @@ BufferSync(int flags) * This is called periodically by the background writer process. * * Returns true if it's appropriate for the bgwriter process to go into - * low-power hibernation mode. (This happens if the strategy clock sweep + * low-power hibernation mode. (This happens if the strategy clock-sweep * has been "lapped" and no buffer allocations have occurred recently, * or if the bgwriter has been effectively disabled by setting * bgwriter_lru_maxpages to 0.) @@ -3666,8 +3757,8 @@ BgBufferSync(WritebackContext *wb_context) uint32 new_recent_alloc; /* - * Find out where the freelist clock sweep currently is, and how many - * buffer allocations have happened since our last call. + * Find out where the clock-sweep currently is, and how many buffer + * allocations have happened since our last call. */ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); @@ -3687,8 +3778,8 @@ BgBufferSync(WritebackContext *wb_context) /* * Compute strategy_delta = how many buffers have been scanned by the - * clock sweep since last time. If first time through, assume none. Then - * see if we are still ahead of the clock sweep, and if so, how many + * clock-sweep since last time. If first time through, assume none. Then + * see if we are still ahead of the clock-sweep, and if so, how many * buffers we could scan before we'd catch up with it and "lap" it. Note: * weird-looking coding of xxx_passes comparisons are to avoid bogus * behavior when the passes counts wrap around. @@ -3950,14 +4041,14 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) else if (skip_recently_used) { /* Caller told us not to write recently-used buffers */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); return result; } if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY)) { /* It's clean, so nothing to do */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); return result; } @@ -3966,11 +4057,8 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) * buffer is clean by the time we've locked it.) */ PinBuffer_Locked(bufHdr); - LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); - FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); - - LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); tag = bufHdr->tag; @@ -4024,8 +4112,9 @@ InitBufferManagerAccess(void) MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS); memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray)); + memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys)); - hash_ctl.keysize = sizeof(int32); + hash_ctl.keysize = sizeof(Buffer); hash_ctl.entrysize = sizeof(PrivateRefCountEntry); PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl, @@ -4073,10 +4162,10 @@ CheckForBufferLeaks(void) /* check the array */ for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) { - res = &PrivateRefCountArray[i]; - - if (res->buffer != InvalidBuffer) + if (PrivateRefCountArrayKeys[i] != InvalidBuffer) { + res = &PrivateRefCountArray[i]; + s = DebugPrintBufferRefcount(res->buffer); elog(WARNING, "buffer refcount leak: %s", s); pfree(s); @@ -4329,8 +4418,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, recptr = BufferGetLSN(buf); /* To check if block content changes while flushing. 
- vadim 01/17/97 */

-	buf_state &= ~BM_JUST_DIRTIED;
-	UnlockBufHdr(buf, buf_state);
+	UnlockBufHdrExt(buf, buf_state,
+					0, BM_JUST_DIRTIED,
+					0);

	/*
	 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
@@ -4417,6 +4507,19 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
		error_context_stack = errcallback.previous;
}

+/*
+ * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
+ * before/after calling FlushBuffer().
+ */
+static void
+FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
+					IOObject io_object, IOContext io_context)
+{
+	LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
+	FlushBuffer(buf, reln, io_object, io_context);
+	LWLockRelease(BufferDescriptorGetContentLock(buf));
+}
+
/*
 * RelationGetNumberOfBlocksInFork
 *		Determines the current number of pages in the specified relation fork.
@@ -4493,7 +4596,6 @@ BufferGetLSNAtomic(Buffer buffer)
	char	   *page = BufferGetPage(buffer);
	BufferDesc *bufHdr;
	XLogRecPtr	lsn;
-	uint32		buf_state;

	/*
	 * If we don't need locking for correctness, fastpath out.
	 */
@@ -4506,9 +4608,9 @@
	Assert(BufferIsPinned(buffer));

	bufHdr = GetBufferDescriptor(buffer - 1);
-	buf_state = LockBufHdr(bufHdr);
+	LockBufHdr(bufHdr);
	lsn = PageGetLSN(page);
-	UnlockBufHdr(bufHdr, buf_state);
+	UnlockBufHdr(bufHdr);

	return lsn;
}
@@ -4550,11 +4652,9 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
	if (RelFileLocatorBackendIsTemp(rlocator))
	{
		if (rlocator.backend == MyProcNumber)
-		{
-			for (j = 0; j < nforks; j++)
-				DropRelationLocalBuffers(rlocator.locator, forkNum[j],
-										 firstDelBlock[j]);
-		}
+			DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
+									 firstDelBlock);
+
		return;
	}

@@ -4611,7 +4711,6 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
	for (i = 0; i < NBuffers; i++)
	{
		BufferDesc *bufHdr = GetBufferDescriptor(i);
-		uint32		buf_state;

		/*
		 * We can make this a tad faster by prechecking the buffer tag before
@@ -4632,7 +4731,7 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
		if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
			continue;

-		buf_state = LockBufHdr(bufHdr);
+		LockBufHdr(bufHdr);

		for (j = 0; j < nforks; j++)
		{
@@ -4645,7 +4744,7 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
			}
		}
		if (j >= nforks)
-			UnlockBufHdr(bufHdr, buf_state);
+			UnlockBufHdr(bufHdr);
	}
}

@@ -4672,7 +4771,7 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
	if (nlocators == 0)
		return;

-	rels = palloc(sizeof(SMgrRelation) * nlocators);	/* non-local relations */
+	rels = palloc_array(SMgrRelation, nlocators);	/* non-local relations */

	/* If it's a local relation, it's localbuf.c's problem.
*/ for (i = 0; i < nlocators; i++) @@ -4754,7 +4853,7 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) } pfree(block); - locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */ + locators = palloc_array(RelFileLocator, n); /* non-local relations */ for (i = 0; i < n; i++) locators[i] = rels[i]->smgr_rlocator.locator; @@ -4774,7 +4873,6 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) { RelFileLocator *rlocator = NULL; BufferDesc *bufHdr = GetBufferDescriptor(i); - uint32 buf_state; /* * As in DropRelationBuffers, an unlocked precheck should be safe and @@ -4808,11 +4906,11 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) if (rlocator == NULL) continue; - buf_state = LockBufHdr(bufHdr); + LockBufHdr(bufHdr); if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator)) InvalidateBuffer(bufHdr); /* releases spinlock */ else - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); } pfree(locators); @@ -4842,7 +4940,6 @@ FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, LWLock *bufPartitionLock; /* buffer partition lock for it */ int buf_id; BufferDesc *bufHdr; - uint32 buf_state; /* create a tag so we can lookup the buffer */ InitBufferTag(&bufTag, &rlocator, forkNum, curBlock); @@ -4867,14 +4964,14 @@ FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, * evicted by some other backend loading blocks for a different * relation after we release lock on the BufMapping table. */ - buf_state = LockBufHdr(bufHdr); + LockBufHdr(bufHdr); if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && BufTagGetForkNum(&bufHdr->tag) == forkNum && bufHdr->tag.blockNum >= firstDelBlock) InvalidateBuffer(bufHdr); /* releases spinlock */ else - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); } } @@ -4902,7 +4999,6 @@ DropDatabaseBuffers(Oid dbid) for (i = 0; i < NBuffers; i++) { BufferDesc *bufHdr = GetBufferDescriptor(i); - uint32 buf_state; /* * As in DropRelationBuffers, an unlocked precheck should be safe and @@ -4911,11 +5007,11 @@ DropDatabaseBuffers(Oid dbid) if (bufHdr->tag.dbOid != dbid) continue; - buf_state = LockBufHdr(bufHdr); + LockBufHdr(bufHdr); if (bufHdr->tag.dbOid == dbid) InvalidateBuffer(bufHdr); /* releases spinlock */ else - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); } } @@ -5008,16 +5104,20 @@ FlushRelationBuffers(Relation rel) (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) { PinBuffer_Locked(bufHdr); - LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); - FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL); - LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + FlushUnlockedBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL); UnpinBuffer(bufHdr); } else - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); } } +void +FlushAllBuffers(void) +{ + FlushAllLocalBuffers(); +} + /* --------------------------------------------------------------------- * FlushRelationsAllBuffers * @@ -5038,7 +5138,7 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) return; /* fill-in array for qsort */ - srels = palloc(sizeof(SMgrSortArray) * nrels); + srels = palloc_array(SMgrSortArray, nrels); for (i = 0; i < nrels; i++) { @@ -5105,13 +5205,11 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) { PinBuffer_Locked(bufHdr); - LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); - FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, 
IOCONTEXT_NORMAL); - LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + FlushUnlockedBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL); UnpinBuffer(bufHdr); } else - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); } pfree(srels); @@ -5333,13 +5431,11 @@ FlushDatabaseBuffers(Oid dbid) (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) { PinBuffer_Locked(bufHdr); - LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); - FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); - LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); UnpinBuffer(bufHdr); } else - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); } } @@ -5359,7 +5455,7 @@ FlushOneBuffer(Buffer buffer) bufHdr = GetBufferDescriptor(buffer - 1); - Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); + Assert(BufferIsLockedByMe(buffer)); FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); } @@ -5412,7 +5508,7 @@ IncrBufferRefCount(Buffer buffer) ref = GetPrivateRefCountEntry(buffer, true); Assert(ref != NULL); - ref->refcount++; + ref->data.refcount++; } ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer); } @@ -5450,7 +5546,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) Assert(GetPrivateRefCount(buffer) > 0); /* here, either share or exclusive lock is OK */ - Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); + Assert(BufferIsLockedByMe(buffer)); /* * This routine might get called many times on the same page, if we are @@ -5545,12 +5641,13 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * checksum here. That will happen when the page is written * sometime later in this checkpoint cycle. */ - if (!XLogRecPtrIsInvalid(lsn)) + if (XLogRecPtrIsValid(lsn)) PageSetLSN(page, lsn); } - buf_state |= BM_DIRTY | BM_JUST_DIRTIED; - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdrExt(bufHdr, buf_state, + BM_DIRTY | BM_JUST_DIRTIED, + 0, 0); if (delayChkptFlags) MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; @@ -5581,6 +5678,7 @@ UnlockBuffers(void) if (buf) { uint32 buf_state; + uint32 unset_bits = 0; buf_state = LockBufHdr(buf); @@ -5590,9 +5688,11 @@ UnlockBuffers(void) */ if ((buf_state & BM_PIN_COUNT_WAITER) != 0 && buf->wait_backend_pgprocno == MyProcNumber) - buf_state &= ~BM_PIN_COUNT_WAITER; + unset_bits = BM_PIN_COUNT_WAITER; - UnlockBufHdr(buf, buf_state); + UnlockBufHdrExt(buf, buf_state, + 0, unset_bits, + 0); PinCountWaitBuf = NULL; } @@ -5602,7 +5702,7 @@ UnlockBuffers(void) * Acquire or release the content_lock for the buffer. 
*/ void -LockBuffer(Buffer buffer, int mode) +LockBuffer(Buffer buffer, BufferLockMode mode) { BufferDesc *buf; @@ -5710,6 +5810,7 @@ LockBufferForCleanup(Buffer buffer) for (;;) { uint32 buf_state; + uint32 unset_bits = 0; /* Try to acquire lock */ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); @@ -5719,7 +5820,7 @@ LockBufferForCleanup(Buffer buffer) if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) { /* Successfully acquired exclusive lock with pincount 1 */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); /* * Emit the log message if recovery conflict on buffer pin was @@ -5742,14 +5843,15 @@ LockBufferForCleanup(Buffer buffer) /* Failed, so mark myself as waiting for pincount 1 */ if (buf_state & BM_PIN_COUNT_WAITER) { - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); elog(ERROR, "multiple backends attempting to wait for pincount 1"); } bufHdr->wait_backend_pgprocno = MyProcNumber; PinCountWaitBuf = bufHdr; - buf_state |= BM_PIN_COUNT_WAITER; - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdrExt(bufHdr, buf_state, + BM_PIN_COUNT_WAITER, 0, + 0); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); /* Wait to be signaled by UnpinBuffer() */ @@ -5798,7 +5900,7 @@ LockBufferForCleanup(Buffer buffer) SetStartupBufferPinWaitBufId(-1); } else - ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN); + ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP); /* * Remove flag marking us as waiter. Normally this will not be set @@ -5811,8 +5913,11 @@ LockBufferForCleanup(Buffer buffer) buf_state = LockBufHdr(bufHdr); if ((buf_state & BM_PIN_COUNT_WAITER) != 0 && bufHdr->wait_backend_pgprocno == MyProcNumber) - buf_state &= ~BM_PIN_COUNT_WAITER; - UnlockBufHdr(bufHdr, buf_state); + unset_bits |= BM_PIN_COUNT_WAITER; + + UnlockBufHdrExt(bufHdr, buf_state, + 0, unset_bits, + 0); PinCountWaitBuf = NULL; /* Loop back and try again */ @@ -5889,12 +5994,12 @@ ConditionalLockBufferForCleanup(Buffer buffer) if (refcount == 1) { /* Successfully acquired exclusive lock with pincount 1 */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); return true; } /* Failed, so release the lock */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); return false; } @@ -5933,8 +6038,7 @@ IsBufferCleanupOK(Buffer buffer) bufHdr = GetBufferDescriptor(buffer - 1); /* caller must hold exclusive lock on buffer */ - Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), - LW_EXCLUSIVE)); + Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE)); buf_state = LockBufHdr(bufHdr); @@ -5942,11 +6046,11 @@ IsBufferCleanupOK(Buffer buffer) if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) { /* pincount is OK. */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); return true; } - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(bufHdr); return false; } @@ -5984,7 +6088,7 @@ WaitIO(BufferDesc *buf) * clearing the wref while it's being read. */ iow = buf->io_wref; - UnlockBufHdr(buf, buf_state); + UnlockBufHdr(buf); /* no IO in progress, we don't need to wait */ if (!(buf_state & BM_IO_IN_PROGRESS)) @@ -6052,7 +6156,7 @@ StartBufferIO(BufferDesc *buf, bool forInput, bool nowait) if (!(buf_state & BM_IO_IN_PROGRESS)) break; - UnlockBufHdr(buf, buf_state); + UnlockBufHdr(buf); if (nowait) return false; WaitIO(buf); @@ -6063,12 +6167,13 @@ StartBufferIO(BufferDesc *buf, bool forInput, bool nowait) /* Check if someone else already did the I/O */ if (forInput ? 
(buf_state & BM_VALID) : !(buf_state & BM_DIRTY)) { - UnlockBufHdr(buf, buf_state); + UnlockBufHdr(buf); return false; } - buf_state |= BM_IO_IN_PROGRESS; - UnlockBufHdr(buf, buf_state); + UnlockBufHdrExt(buf, buf_state, + BM_IO_IN_PROGRESS, 0, + 0); ResourceOwnerRememberBufferIO(CurrentResourceOwner, BufferDescriptorGetBuffer(buf)); @@ -6101,28 +6206,31 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio) { uint32 buf_state; + uint32 unset_flag_bits = 0; + int refcount_change = 0; buf_state = LockBufHdr(buf); Assert(buf_state & BM_IO_IN_PROGRESS); - buf_state &= ~BM_IO_IN_PROGRESS; + unset_flag_bits |= BM_IO_IN_PROGRESS; /* Clear earlier errors, if this IO failed, it'll be marked again */ - buf_state &= ~BM_IO_ERROR; + unset_flag_bits |= BM_IO_ERROR; if (clear_dirty && !(buf_state & BM_JUST_DIRTIED)) - buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED); + unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED; if (release_aio) { /* release ownership by the AIO subsystem */ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); - buf_state -= BUF_REFCOUNT_ONE; + refcount_change = -1; pgaio_wref_clear(&buf->io_wref); } - buf_state |= set_flag_bits; - UnlockBufHdr(buf, buf_state); + buf_state = UnlockBufHdrExt(buf, buf_state, + set_flag_bits, unset_flag_bits, + refcount_change); if (forget_owner) ResourceOwnerForgetBufferIO(CurrentResourceOwner, @@ -6167,12 +6275,12 @@ AbortBufferIO(Buffer buffer) if (!(buf_state & BM_VALID)) { Assert(!(buf_state & BM_DIRTY)); - UnlockBufHdr(buf_hdr, buf_state); + UnlockBufHdr(buf_hdr); } else { Assert(buf_state & BM_DIRTY); - UnlockBufHdr(buf_hdr, buf_state); + UnlockBufHdr(buf_hdr); /* Issue notice if this is not the first failure... */ if (buf_state & BM_IO_ERROR) @@ -6201,7 +6309,7 @@ shared_buffer_write_error_callback(void *arg) /* Buffer is pinned, so we can read the tag without locking the spinlock */ if (bufHdr != NULL) - errcontext("writing block %u of relation %s", + errcontext("writing block %u of relation \"%s\"", bufHdr->tag.blockNum, relpathperm(BufTagGetRelFileLocator(&bufHdr->tag), BufTagGetForkNum(&bufHdr->tag)).str); @@ -6216,7 +6324,7 @@ local_buffer_write_error_callback(void *arg) BufferDesc *bufHdr = (BufferDesc *) arg; if (bufHdr != NULL) - errcontext("writing block %u of relation %s", + errcontext("writing block %u of relation \"%s\"", bufHdr->tag.blockNum, relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag), MyProcNumber, @@ -6256,23 +6364,41 @@ rlocator_comparator(const void *p1, const void *p2) uint32 LockBufHdr(BufferDesc *desc) { - SpinDelayStatus delayStatus; uint32 old_buf_state; Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc))); - init_local_spin_delay(&delayStatus); - while (true) { - /* set BM_LOCKED flag */ + /* + * Always try once to acquire the lock directly, without setting up + * the spin-delay infrastructure. The work necessary for that shows up + * in profiles and is rarely necessary. 
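+		 *
+		 * Illustrative aside (not part of the patch; simplified): the
+		 * overall shape is a classic test-and-test-and-set loop,
+		 *
+		 *     for (;;)
+		 *     {
+		 *         old = atomic_fetch_or(&state, LOCKED);   one attempt
+		 *         if (!(old & LOCKED))
+		 *             break;                                got the lock
+		 *         while (atomic_read(&state) & LOCKED)
+		 *             spin_delay();                         plain reads only
+		 *     }
+		 *
+		 * The atomic read-modify-write happens at most once per round;
+		 * while the lock is observed as held we issue only plain reads,
+		 * which don't bounce the cacheline between CPUs the way repeated
+		 * atomics would.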
+		 */
		old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);

-		/* if it wasn't set before we're OK */
-		if (!(old_buf_state & BM_LOCKED))
-			break;
-		perform_spin_delay(&delayStatus);
+		if (likely(!(old_buf_state & BM_LOCKED)))
+			break;				/* got lock */
+
+		/* and then spin without atomic operations until lock is released */
+		{
+			SpinDelayStatus delayStatus;
+
+			init_local_spin_delay(&delayStatus);
+
+			while (old_buf_state & BM_LOCKED)
+			{
+				perform_spin_delay(&delayStatus);
+				old_buf_state = pg_atomic_read_u32(&desc->state);
+			}
+			finish_spin_delay(&delayStatus);
+		}
+
+		/*
+		 * Retry. Of course, the lock might already have been re-acquired by
+		 * another backend by the time we attempt to take it again.
+		 */
	}
-	finish_spin_delay(&delayStatus);
+
	return old_buf_state | BM_LOCKED;
}

@@ -6283,7 +6409,7 @@ LockBufHdr(BufferDesc *desc)
 * Obviously the buffer could be locked by the time the value is returned, so
 * this is primarily useful in CAS style loops.
 */
-static uint32
+pg_noinline uint32
WaitBufHdrUnlocked(BufferDesc *buf)
{
	SpinDelayStatus delayStatus;
@@ -6375,8 +6501,8 @@ ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
static int
ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
{
-	CkptTsStatus *sa = (CkptTsStatus *) a;
-	CkptTsStatus *sb = (CkptTsStatus *) b;
+	CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
+	CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);

	/* we want a min-heap, so return 1 for the a < b */
	if (sa->progress < sb->progress)
@@ -6594,14 +6720,14 @@ EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)

	if ((buf_state & BM_VALID) == 0)
	{
-		UnlockBufHdr(desc, buf_state);
+		UnlockBufHdr(desc);
		return false;
	}

	/* Check that it's not pinned already. */
	if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
	{
-		UnlockBufHdr(desc, buf_state);
+		UnlockBufHdr(desc);
		return false;
	}

@@ -6610,10 +6736,8 @@ EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
	/* If it was dirty, try to clean it once. */
	if (buf_state & BM_DIRTY)
	{
-		LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
-		FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
+		FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
		*buffer_flushed = true;
-		LWLockRelease(BufferDescriptorGetContentLock(desc));
	}

	/* This will return false if it becomes dirty or someone else pins it. */
@@ -6688,6 +6812,8 @@ EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
		uint32		buf_state;
		bool		buffer_flushed;

+		CHECK_FOR_INTERRUPTS();
+
		buf_state = pg_atomic_read_u32(&desc->state);
		if (!(buf_state & BM_VALID))
			continue;
@@ -6738,6 +6864,8 @@ EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
		uint32		buf_state = pg_atomic_read_u32(&(desc->state));
		bool		buffer_flushed;

+		CHECK_FOR_INTERRUPTS();
+
		/* An unlocked precheck should be safe and saves some cycles. */
		if ((buf_state & BM_VALID) == 0 ||
			!BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
@@ -6753,7 +6881,7 @@ EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
		if ((buf_state & BM_VALID) == 0 ||
			!BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
		{
-			UnlockBufHdr(desc, buf_state);
+			UnlockBufHdr(desc);
			continue;
		}

@@ -6767,6 +6895,194 @@
	}
}

+/*
+ * Helper function to mark an unpinned buffer dirty; the caller must already
+ * hold the buffer header spinlock.
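+ *
+ * The sequence is: validate the buffer under the header spinlock, pin it
+ * (which releases the spinlock), take the content lock exclusively, call
+ * MarkBufferDirty(), release the content lock, and unpin. Buffers that are
+ * invalid, still pinned, or already dirty are reported back to the caller
+ * instead of being dirtied.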
+ */
+static bool
+MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
+								bool *buffer_already_dirty)
+{
+	uint32		buf_state;
+	bool		result = false;
+
+	*buffer_already_dirty = false;
+
+	buf_state = pg_atomic_read_u32(&(desc->state));
+	Assert(buf_state & BM_LOCKED);
+
+	if ((buf_state & BM_VALID) == 0)
+	{
+		UnlockBufHdr(desc);
+		return false;
+	}
+
+	/* Check that it's not pinned already. */
+	if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
+	{
+		UnlockBufHdr(desc);
+		return false;
+	}
+
+	/* Pin the buffer and then release the buffer spinlock */
+	PinBuffer_Locked(desc);
+
+	/* If it was not already dirty, mark it as dirty. */
+	if (!(buf_state & BM_DIRTY))
+	{
+		LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE);
+		MarkBufferDirty(buf);
+		result = true;
+		LWLockRelease(BufferDescriptorGetContentLock(desc));
+	}
+	else
+		*buffer_already_dirty = true;
+
+	UnpinBuffer(desc);
+
+	return result;
+}
+
+/*
+ * Try to mark the provided shared buffer as dirty.
+ *
+ * This function is intended for testing/development use only!
+ *
+ * Same as EvictUnpinnedBuffer() but with a MarkBufferDirty() call inside.
+ *
+ * The buffer_already_dirty parameter is mandatory and indicates whether the
+ * buffer could not be dirtied because it was already dirty.
+ *
+ * Returns true if the buffer has successfully been marked as dirty.
+ */
+bool
+MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
+{
+	BufferDesc *desc;
+	bool		buffer_dirtied = false;
+
+	Assert(!BufferIsLocal(buf));
+
+	/* Make sure we can pin the buffer. */
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+	ReservePrivateRefCountEntry();
+
+	desc = GetBufferDescriptor(buf - 1);
+	LockBufHdr(desc);
+
+	buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
+	/* Both cannot be true at the same time */
+	Assert(!(buffer_dirtied && *buffer_already_dirty));
+
+	return buffer_dirtied;
+}
+
+/*
+ * Try to mark all the shared buffers containing the provided relation's
+ * pages as dirty.
+ *
+ * This function is intended for testing/development use only! See
+ * MarkDirtyUnpinnedBuffer().
+ *
+ * The buffers_* parameters are mandatory and indicate the total count of
+ * buffers that:
+ * - buffers_dirtied - were dirtied
+ * - buffers_already_dirty - were already dirty
+ * - buffers_skipped - could not be dirtied for a reason other than the
+ *   buffer being already dirty.
+ */
+void
+MarkDirtyRelUnpinnedBuffers(Relation rel,
+							int32 *buffers_dirtied,
+							int32 *buffers_already_dirty,
+							int32 *buffers_skipped)
+{
+	Assert(!RelationUsesLocalBuffers(rel));
+
+	*buffers_dirtied = 0;
+	*buffers_already_dirty = 0;
+	*buffers_skipped = 0;
+
+	for (int buf = 1; buf <= NBuffers; buf++)
+	{
+		BufferDesc *desc = GetBufferDescriptor(buf - 1);
+		uint32		buf_state = pg_atomic_read_u32(&(desc->state));
+		bool		buffer_already_dirty;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* An unlocked precheck should be safe and saves some cycles. */
+		if ((buf_state & BM_VALID) == 0 ||
+			!BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
+			continue;

+		/* Make sure we can pin the buffer.
*/ + ResourceOwnerEnlarge(CurrentResourceOwner); + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(desc); + + /* recheck, could have changed without the lock */ + if ((buf_state & BM_VALID) == 0 || + !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator)) + { + UnlockBufHdr(desc); + continue; + } + + if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty)) + (*buffers_dirtied)++; + else if (buffer_already_dirty) + (*buffers_already_dirty)++; + else + (*buffers_skipped)++; + } +} + +/* + * Try to mark all the shared buffers as dirty. + * + * This function is intended for testing/development use only! See + * MarkDirtyUnpinnedBuffer(). + * + * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_* + * parameters. + */ +void +MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied, + int32 *buffers_already_dirty, + int32 *buffers_skipped) +{ + *buffers_dirtied = 0; + *buffers_already_dirty = 0; + *buffers_skipped = 0; + + for (int buf = 1; buf <= NBuffers; buf++) + { + BufferDesc *desc = GetBufferDescriptor(buf - 1); + uint32 buf_state; + bool buffer_already_dirty; + + CHECK_FOR_INTERRUPTS(); + + buf_state = pg_atomic_read_u32(&desc->state); + if (!(buf_state & BM_VALID)) + continue; + + ResourceOwnerEnlarge(CurrentResourceOwner); + ReservePrivateRefCountEntry(); + + LockBufHdr(desc); + + if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty)) + (*buffers_dirtied)++; + else if (buffer_already_dirty) + (*buffers_already_dirty)++; + else + (*buffers_skipped)++; + } +} + /* * Generic implementation of the AIO handle staging callback for readv/writev * on local/shared buffers. @@ -6851,13 +7167,15 @@ buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp) * * This pin is released again in TerminateBufferIO(). */ - buf_state += BUF_REFCOUNT_ONE; buf_hdr->io_wref = io_ref; if (is_temp) + { + buf_state += BUF_REFCOUNT_ONE; pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state); + } else - UnlockBufHdr(buf_hdr, buf_state); + UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1); /* * Ensure the content lock that prevents buffer modifications while @@ -6950,9 +7268,9 @@ buffer_readv_encode_error(PgAioResult *result, error_count > 0 ? error_count : zeroed_count; uint8 first_off; - StaticAssertStmt(PG_IOV_MAX <= 1 << READV_COUNT_BITS, + StaticAssertDecl(PG_IOV_MAX <= 1 << READV_COUNT_BITS, "PG_IOV_MAX is bigger than reserved space for error data"); - StaticAssertStmt((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS, + StaticAssertDecl((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS, "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv"); /* @@ -7315,13 +7633,15 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td, ereport(elevel, errcode(ERRCODE_DATA_CORRUPTED), - errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation %s", + errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"", affected_count, checkfail_count, first, last, rpath.str), affected_count > 1 ? 
- errdetail("Block %u held first zeroed page.", + errdetail("Block %u held the first zeroed page.", first + first_off) : 0, - errhint("See server log for details about the other %u invalid block(s).", - affected_count + checkfail_count - 1)); + errhint_plural("See server log for details about the other %d invalid block.", + "See server log for details about the other %d invalid blocks.", + affected_count + checkfail_count - 1, + affected_count + checkfail_count - 1)); return; } @@ -7334,25 +7654,25 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td, { Assert(!zeroed_any); /* can't have invalid pages when zeroing them */ affected_count = zeroed_or_error_count; - msg_one = _("invalid page in block %u of relation %s"); - msg_mult = _("%u invalid pages among blocks %u..%u of relation %s"); - det_mult = _("Block %u held first invalid page."); + msg_one = _("invalid page in block %u of relation \"%s\""); + msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\""); + det_mult = _("Block %u held the first invalid page."); hint_mult = _("See server log for the other %u invalid block(s)."); } else if (zeroed_any && !ignored_any) { affected_count = zeroed_or_error_count; - msg_one = _("invalid page in block %u of relation %s; zeroing out page"); - msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation %s"); - det_mult = _("Block %u held first zeroed page."); + msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page"); + msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\""); + det_mult = _("Block %u held the first zeroed page."); hint_mult = _("See server log for the other %u zeroed block(s)."); } else if (!zeroed_any && ignored_any) { affected_count = checkfail_count; - msg_one = _("ignoring checksum failure in block %u of relation %s"); - msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation %s"); - det_mult = _("Block %u held first ignored page."); + msg_one = _("ignoring checksum failure in block %u of relation \"%s\""); + msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\""); + det_mult = _("Block %u held the first ignored page."); hint_mult = _("See server log for the other %u ignored block(s)."); } else diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 01909be027258..28d952b353446 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -33,25 +33,17 @@ typedef struct slock_t buffer_strategy_lock; /* - * Clock sweep hand: index of next buffer to consider grabbing. Note that + * clock-sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to * get an actual buffer, it needs to be used modulo NBuffers. */ pg_atomic_uint32 nextVictimBuffer; - int firstFreeBuffer; /* Head of list of unused buffers */ - int lastFreeBuffer; /* Tail of list of unused buffers */ - - /* - * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is, - * when the list is empty) - */ - /* * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. 
*/ - uint32 completePasses; /* Complete cycles of the clock sweep */ + uint32 completePasses; /* Complete cycles of the clock-sweep */ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ /* @@ -163,34 +155,20 @@ ClockSweepTick(void) return victim; } -/* - * have_free_buffer -- a lockless check to see if there is a free buffer in - * buffer pool. - * - * If the result is true that will become stale once free buffers are moved out - * by other operations, so the caller who strictly want to use a free buffer - * should not call this. - */ -bool -have_free_buffer(void) -{ - if (StrategyControl->firstFreeBuffer >= 0) - return true; - else - return false; -} - /* * StrategyGetBuffer * * Called by the bufmgr to get the next candidate buffer to use in - * BufferAlloc(). The only hard requirement BufferAlloc() has is that + * GetVictimBuffer(). The only hard requirement GetVictimBuffer() has is that * the selected buffer must not currently be pinned by anyone. * * strategy is a BufferAccessStrategy object, or NULL for default strategy. * - * To ensure that no one else can pin the buffer before we do, we must - * return the buffer with the buffer header spinlock still held. + * It is the caller's responsibility to ensure that buffer ownership can be + * tracked via TrackNewBufferPin(). + * + * The buffer is pinned and marked as owned, using TrackNewBufferPin(), + * before returning. */ BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring) @@ -198,7 +176,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r BufferDesc *buf; int bgwprocno; int trycounter; - uint32 local_buf_state; /* to avoid repeated (de-)referencing */ *from_ring = false; @@ -249,134 +226,84 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r */ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); - /* - * First check, without acquiring the lock, whether there's buffers in the - * freelist. Since we otherwise don't require the spinlock in every - * StrategyGetBuffer() invocation, it'd be sad to acquire it here - - * uselessly in most cases. That obviously leaves a race where a buffer is - * put on the freelist but we don't see the store yet - but that's pretty - * harmless, it'll just get used during the next buffer acquisition. - * - * If there's buffers on the freelist, acquire the spinlock to pop one - * buffer of the freelist. Then check whether that buffer is usable and - * repeat if not. - * - * Note that the freeNext fields are considered to be protected by the - * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to - * manipulate them without holding the spinlock.
- */ - if (StrategyControl->firstFreeBuffer >= 0) + /* Use the "clock sweep" algorithm to find a free buffer */ + trycounter = NBuffers; + for (;;) { - while (true) - { - /* Acquire the spinlock to remove element from the freelist */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + uint32 old_buf_state; + uint32 local_buf_state; - if (StrategyControl->firstFreeBuffer < 0) - { - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - break; - } - - buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer); - Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); - - /* Unconditionally remove buffer from freelist */ - StrategyControl->firstFreeBuffer = buf->freeNext; - buf->freeNext = FREENEXT_NOT_IN_LIST; + buf = GetBufferDescriptor(ClockSweepTick()); - /* - * Release the lock so someone else can access the freelist while - * we check out this buffer. - */ - SpinLockRelease(&StrategyControl->buffer_strategy_lock); + /* + * Check whether the buffer can be used and pin it if so. Do this + * using a CAS loop, to avoid having to lock the buffer header. + */ + old_buf_state = pg_atomic_read_u32(&buf->state); + for (;;) + { + local_buf_state = old_buf_state; /* * If the buffer is pinned or has a nonzero usage_count, we cannot - * use it; discard it and retry. (This can only happen if VACUUM - * put a valid buffer in the freelist and then someone else used - * it before we got to it. It's probably impossible altogether as - * of 8.3, but we'd better check anyway.) + * use it; decrement the usage_count (unless pinned) and keep + * scanning. */ - local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 - && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0) + + if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0) { - if (strategy != NULL) - AddBufferToRing(strategy, buf); - *buf_state = local_buf_state; - return buf; + if (--trycounter == 0) + { + /* + * We've scanned all the buffers without making any state + * changes, so all the buffers are pinned (or were when we + * looked at them). We could hope that someone will free + * one eventually, but it's probably better to fail than + * to risk getting stuck in an infinite loop. + */ + elog(ERROR, "no unpinned buffers available"); + } + break; } - UnlockBufHdr(buf, local_buf_state); - } - } - /* Nothing on the freelist, so run the "clock sweep" algorithm */ - trycounter = NBuffers; - for (;;) - { - buf = GetBufferDescriptor(ClockSweepTick()); - - /* - * If the buffer is pinned or has a nonzero usage_count, we cannot use - * it; decrement the usage_count (unless pinned) and keep scanning. - */ - local_buf_state = LockBufHdr(buf); + /* See equivalent code in PinBuffer() */ + if (unlikely(local_buf_state & BM_LOCKED)) + { + old_buf_state = WaitBufHdrUnlocked(buf); + continue; + } - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0) - { if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0) { local_buf_state -= BUF_USAGECOUNT_ONE; - trycounter = NBuffers; + if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, + local_buf_state)) + { + trycounter = NBuffers; + break; + } } else { - /* Found a usable buffer */ - if (strategy != NULL) - AddBufferToRing(strategy, buf); - *buf_state = local_buf_state; - return buf; - } - } - else if (--trycounter == 0) - { - /* - * We've scanned all the buffers without making any state changes, - * so all the buffers are pinned (or were when we looked at them). 
- * We could hope that someone will free one eventually, but it's - * probably better to fail than to risk getting stuck in an - * infinite loop. - */ - UnlockBufHdr(buf, local_buf_state); - elog(ERROR, "no unpinned buffers available"); - } - UnlockBufHdr(buf, local_buf_state); - } -} + /* pin the buffer if the CAS succeeds */ + local_buf_state += BUF_REFCOUNT_ONE; -/* - * StrategyFreeBuffer: put a buffer on the freelist - */ -void -StrategyFreeBuffer(BufferDesc *buf) -{ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, + local_buf_state)) + { + /* Found a usable buffer */ + if (strategy != NULL) + AddBufferToRing(strategy, buf); + *buf_state = local_buf_state; - /* - * It is possible that we are told to put something in the freelist that - * is already in it; don't screw up the list if so. - */ - if (buf->freeNext == FREENEXT_NOT_IN_LIST) - { - buf->freeNext = StrategyControl->firstFreeBuffer; - if (buf->freeNext < 0) - StrategyControl->lastFreeBuffer = buf->buf_id; - StrategyControl->firstFreeBuffer = buf->buf_id; - } + TrackNewBufferPin(BufferDescriptorGetBuffer(buf)); - SpinLockRelease(&StrategyControl->buffer_strategy_lock); + return buf; + } + } + } + } } /* @@ -504,14 +431,7 @@ StrategyInitialize(bool init) SpinLockInit(&StrategyControl->buffer_strategy_lock); - /* - * Grab the whole linked list of free buffers for our strategy. We - * assume it was previously set up by BufferManagerShmemInit(). - */ - StrategyControl->firstFreeBuffer = 0; - StrategyControl->lastFreeBuffer = NBuffers - 1; - - /* Initialize the clock sweep pointer */ + /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); /* Clear statistics */ @@ -731,13 +651,15 @@ FreeAccessStrategy(BufferAccessStrategy strategy) * GetBufferFromRing -- returns a buffer from the ring, or NULL if the * ring is empty / not usable. * - * The bufhdr spin lock is held on the returned buffer. + * The buffer is pinned and marked as owned, using TrackNewBufferPin(), before + * returning. */ static BufferDesc * GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) { BufferDesc *buf; Buffer bufnum; + uint32 old_buf_state; uint32 local_buf_state; /* to avoid repeated (de-)referencing */ @@ -754,24 +676,49 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) if (bufnum == InvalidBuffer) return NULL; + buf = GetBufferDescriptor(bufnum - 1); + /* - * If the buffer is pinned we cannot use it under any circumstances. - * - * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, - * since our own previous usage of the ring element would have left it - * there, but it might've been decremented by clock sweep since then). A - * higher usage_count indicates someone else has touched the buffer, so we - * shouldn't re-use it. + * Check whether the buffer can be used and pin it if so. Do this using a + * CAS loop, to avoid having to lock the buffer header. */ - buf = GetBufferDescriptor(bufnum - 1); - local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 - && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1) + old_buf_state = pg_atomic_read_u32(&buf->state); + for (;;) { - *buf_state = local_buf_state; - return buf; + local_buf_state = old_buf_state; + + /* + * If the buffer is pinned we cannot use it under any circumstances. 
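Both StrategyGetBuffer() and GetBufferFromRing() now pin a victim buffer with a compare-and-swap on the buffer state word rather than by taking the buffer header spinlock. A condensed, standalone illustration of the pattern (a sketch using C11 atomics instead of PostgreSQL's pg_atomic wrappers; the constant values are assumptions, and the real code waits via WaitBufHdrUnlocked() and also inspects the usage count, as the hunks around this one show):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define BUF_REFCOUNT_ONE 1			/* refcount lives in the low bits (assumed) */
#define BM_LOCKED (1u << 30)		/* header-lock flag bit (assumed value) */

/* Try to pin a buffer without acquiring the buffer header lock. */
static bool
try_pin(_Atomic uint32_t *state)
{
	uint32_t	old = atomic_load(state);

	for (;;)
	{
		/* cannot make a pin visible while the header lock is held */
		if (old & BM_LOCKED)
		{
			old = atomic_load(state);	/* real code waits for the bit to clear */
			continue;
		}

		/* the CAS both re-checks the state and publishes the new pin */
		if (atomic_compare_exchange_weak(state, &old, old + BUF_REFCOUNT_ONE))
			return true;
		/* on failure 'old' was reloaded; loop and re-evaluate */
	}
}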
+ * + * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, + * since our own previous usage of the ring element would have left it + * there, but it might've been decremented by clock-sweep since then). + * A higher usage_count indicates someone else has touched the buffer, + * so we shouldn't re-use it. + */ + if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0 + || BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1) + break; + + /* See equivalent code in PinBuffer() */ + if (unlikely(local_buf_state & BM_LOCKED)) + { + old_buf_state = WaitBufHdrUnlocked(buf); + continue; + } + + /* pin the buffer if the CAS succeeds */ + local_buf_state += BUF_REFCOUNT_ONE; + + if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, + local_buf_state)) + { + *buf_state = local_buf_state; + + TrackNewBufferPin(BufferDescriptorGetBuffer(buf)); + return buf; + } } - UnlockBufHdr(buf, local_buf_state); /* * Tell caller to allocate a new buffer with the normal allocation diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 63101d56a074b..1bde8738a2d1b 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -25,6 +25,7 @@ #include "utils/guc_hooks.h" #include "utils/memdebug.h" #include "utils/memutils.h" +#include "utils/rel.h" #include "utils/resowner.h" @@ -43,6 +44,10 @@ typedef struct int NLocBuffer = 0; /* until buffers are initialized */ + +int allocated_localbufs = 0; +int dirtied_localbufs = 0; + BufferDesc *LocalBufferDescriptors = NULL; Block *LocalBufferBlockPointers = NULL; int32 *LocalRefCount = NULL; @@ -184,6 +189,12 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) instr_time io_start; Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); + /* + * Parallel temp table scans allow temp tables to be accessed from within + * parallel queries, but local buffers must never be flushed by a parallel + * worker; be paranoid and re-check that on every flush. + */ + Assert(!IsParallelWorker()); + Assert(LocalRefCount[-BufferDescriptorGetBuffer(bufHdr) - 1] > 0); /* @@ -229,7 +240,7 @@ GetLocalVictimBuffer(void) ResourceOwnerEnlarge(CurrentResourceOwner); /* - * Need to get a new buffer. We use a clock sweep algorithm (essentially + * Need to get a new buffer. We use a clock-sweep algorithm (essentially * the same as what freelist.c does now...)
*/ trycounter = NLocBuffer; @@ -372,7 +383,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, MemSet(buf_block, 0, BLCKSZ); } - first_block = smgrnblocks(bmr.smgr, fork); + first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork); if (extend_upto != InvalidBlockNumber) { @@ -391,7 +402,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("cannot extend relation %s beyond %u blocks", - relpath(bmr.smgr->smgr_rlocator, fork).str, + relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str, MaxBlockNumber))); for (uint32 i = 0; i < extend_by; i++) @@ -408,7 +419,8 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, /* in case we need to pin an existing buffer below */ ResourceOwnerEnlarge(CurrentResourceOwner); - InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i); + InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork, + first_block + i); hresult = (LocalBufferLookupEnt *) hash_search(LocalBufHash, &tag, HASH_ENTER, &found); @@ -456,7 +468,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, io_start = pgstat_prepare_io_time(track_io_timing); /* actually extend relation */ - smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false); + smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false); pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EXTEND, io_start, 1, extend_by * BLCKSZ); @@ -507,7 +519,10 @@ MarkLocalBufferDirty(Buffer buffer) buf_state = pg_atomic_read_u32(&bufHdr->state); if (!(buf_state & BM_DIRTY)) + { pgBufferUsage.local_blks_dirtied++; + dirtied_localbufs++; + } buf_state |= BM_DIRTY; @@ -568,6 +583,12 @@ TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bit /* Clear earlier errors, if this IO failed, it'll be marked again */ buf_state &= ~BM_IO_ERROR; + if (buf_state & BM_DIRTY) + { + Assert(dirtied_localbufs > 0); + dirtied_localbufs--; + } + if (clear_dirty) buf_state &= ~BM_DIRTY; @@ -607,6 +628,12 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) uint32 buf_state; LocalBufferLookupEnt *hresult; + if (pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY) + { + Assert(dirtied_localbufs > 0); + dirtied_localbufs--; + } + /* * It's possible that we started IO on this buffer before e.g. aborting * the transaction that created a table. We need to wait for that IO to @@ -629,7 +656,7 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) */ if (check_unreferenced && (LocalRefCount[bufid] != 0 || BUF_STATE_GET_REFCOUNT(buf_state) != 0)) - elog(ERROR, "block %u of %s is still referenced (local %u)", + elog(ERROR, "block %u of %s is still referenced (local %d)", bufHdr->tag.blockNum, relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag), MyProcNumber, @@ -660,10 +687,11 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) * See DropRelationBuffers in bufmgr.c for more notes. 
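The hunk that follows changes DropRelationLocalBuffers() to take arrays of forks, so several forks can be invalidated in a single pass over the local buffer pool, mirroring DropRelationBuffers() in bufmgr.c. A hypothetical caller (variable names illustrative) might look like:

	/* invalidate local buffers past the new EOF of two forks at once */
	ForkNumber	forks[] = {MAIN_FORKNUM, FSM_FORKNUM};
	BlockNumber firstDel[] = {new_main_nblocks, new_fsm_nblocks};

	DropRelationLocalBuffers(rel->rd_locator, forks, lengthof(forks), firstDel);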
*/ void -DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, - BlockNumber firstDelBlock) +DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, + int nforks, BlockNumber *firstDelBlock) { int i; + int j; for (i = 0; i < NLocBuffer; i++) { @@ -672,12 +700,18 @@ DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, buf_state = pg_atomic_read_u32(&bufHdr->state); - if ((buf_state & BM_TAG_VALID) && - BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && - BufTagGetForkNum(&bufHdr->tag) == forkNum && - bufHdr->tag.blockNum >= firstDelBlock) + if (!(buf_state & BM_TAG_VALID) || + !BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator)) + continue; + + for (j = 0; j < nforks; j++) { - InvalidateLocalBuffer(bufHdr, true); + if (BufTagGetForkNum(&bufHdr->tag) == forkNum[j] && + bufHdr->tag.blockNum >= firstDelBlock[j]) + { + InvalidateLocalBuffer(bufHdr, true); + break; + } } } } @@ -722,19 +756,6 @@ InitLocalBuffers(void) HASHCTL info; int i; - /* - * Parallel workers can't access data in temporary tables, because they - * have no visibility into the local buffers of their leader. This is a - * convenient, low-cost place to provide a backstop check for that. Note - * that we don't wish to prevent a parallel worker from accessing catalog - * metadata about a temp table, so checks at higher levels would be - * inappropriate. - */ - if (IsParallelWorker()) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TRANSACTION_STATE), - errmsg("cannot access temporary tables during a parallel operation"))); - /* Allocate and zero buffer headers and auxiliary arrays */ LocalBufferDescriptors = (BufferDesc *) calloc(nbufs, sizeof(BufferDesc)); LocalBufferBlockPointers = (Block *) calloc(nbufs, sizeof(Block)); @@ -925,10 +946,11 @@ GetLocalBufferStorage(void) num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); /* Buffers should be I/O aligned. */ - cur_block = (char *) - TYPEALIGN(PG_IO_ALIGN_SIZE, - MemoryContextAlloc(LocalBufferContext, - num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); + cur_block = MemoryContextAllocAligned(LocalBufferContext, + num_bufs * BLCKSZ, + PG_IO_ALIGN_SIZE, + 0); + next_buf_in_block = 0; num_bufs_in_block = num_bufs; } @@ -937,6 +959,7 @@ GetLocalBufferStorage(void) this_buf = cur_block + next_buf_in_block * BLCKSZ; next_buf_in_block++; total_bufs_allocated++; + allocated_localbufs++; /* * Caller's PinLocalBuffer() was too early for Valgrind updates, so do it @@ -1010,3 +1033,36 @@ AtProcExit_LocalBuffers(void) */ CheckForLocalBufferLeaks(); } + +/* + * Flush each temporary buffer page to disk. + * + * This is a costly operation, needed solely to let temporary tables, indexes, + * and their TOAST tables participate in a parallel query plan. + */ +void +FlushAllLocalBuffers(void) +{ + int i; + + for (i = 0; i < NLocBuffer; i++) + { + BufferDesc *bufHdr = GetLocalBufferDescriptor(i); + uint32 buf_state; + + if (LocalBufHdrGetBlock(bufHdr) == NULL) + continue; + + buf_state = pg_atomic_read_u32(&bufHdr->state); + + /* XXX only valid dirty pages need to be flushed?
*/ + if ((buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinLocalBuffer(bufHdr, false); + FlushLocalBuffer(bufHdr, NULL); + UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr)); + } + } + + Assert(dirtied_localbufs == 0); +} diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 366d70d38a195..9fc370921322c 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -92,9 +92,9 @@ struct BufFile * Position as seen by user of BufFile is (curFile, curOffset + pos). */ int curFile; /* file index (0..n) part of current pos */ - off_t curOffset; /* offset part of current pos */ - int pos; /* next read/write position in buffer */ - int nbytes; /* total # of valid bytes in buffer */ + pgoff_t curOffset; /* offset part of current pos */ + int64 pos; /* next read/write position in buffer */ + int64 nbytes; /* total # of valid bytes in buffer */ /* * XXX Should ideally use PGIOAlignedBlock, but might need a way to avoid @@ -117,7 +117,7 @@ static File MakeNewFileSetSegment(BufFile *buffile, int segment); static BufFile * makeBufFileCommon(int nfiles) { - BufFile *file = (BufFile *) palloc(sizeof(BufFile)); + BufFile *file = palloc_object(BufFile); file->numFiles = nfiles; file->isInterXact = false; @@ -140,7 +140,7 @@ makeBufFile(File firstfile) { BufFile *file = makeBufFileCommon(1); - file->files = (File *) palloc(sizeof(File)); + file->files = palloc_object(File); file->files[0] = firstfile; file->readOnly = false; file->fileset = NULL; @@ -271,7 +271,7 @@ BufFileCreateFileSet(FileSet *fileset, const char *name) file = makeBufFileCommon(1); file->fileset = fileset; file->name = pstrdup(name); - file->files = (File *) palloc(sizeof(File)); + file->files = palloc_object(File); file->files[0] = MakeNewFileSetSegment(file, 0); file->readOnly = false; @@ -297,7 +297,7 @@ BufFileOpenFileSet(FileSet *fileset, const char *name, int mode, File *files; int nfiles = 0; - files = palloc(sizeof(File) * capacity); + files = palloc_array(File, capacity); /* * We don't know how many segments there are, so we'll probe the @@ -309,7 +309,7 @@ BufFileOpenFileSet(FileSet *fileset, const char *name, int mode, if (nfiles + 1 > capacity) { capacity *= 2; - files = repalloc(files, sizeof(File) * capacity); + files = repalloc_array(files, File, capacity); } /* Try to load a segment. */ FileSetSegmentName(segment_name, name, nfiles); @@ -493,8 +493,8 @@ BufFileLoadBuffer(BufFile *file) static void BufFileDumpBuffer(BufFile *file) { - int wpos = 0; - int bytestowrite; + int64 wpos = 0; + int64 bytestowrite; File thisfile; /* @@ -503,7 +503,7 @@ BufFileDumpBuffer(BufFile *file) */ while (wpos < file->nbytes) { - off_t availbytes; + int64 availbytes; instr_time io_start; instr_time io_time; @@ -524,8 +524,8 @@ BufFileDumpBuffer(BufFile *file) bytestowrite = file->nbytes - wpos; availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset; - if ((off_t) bytestowrite > availbytes) - bytestowrite = (int) availbytes; + if (bytestowrite > availbytes) + bytestowrite = availbytes; thisfile = file->files[file->curFile]; @@ -729,7 +729,7 @@ BufFileFlush(BufFile *file) * BufFileSeek * * Like fseek(), except that target position needs two values in order to - * work when logical filesize exceeds maximum value representable by off_t. + * work when logical filesize exceeds maximum value representable by pgoff_t. * We do not support relative seeks across more than that, however. * I/O errors are reported by ereport(). 
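The off_t-to-pgoff_t conversions in this file and in fd.c below matter chiefly on Windows, where off_t is only 32 bits wide. Roughly what the portability type provides (a sketch; the real definitions live in the port headers):

#ifdef WIN32
typedef __int64 pgoff_t;		/* MSVC's off_t cannot address files past 2GB */
#else
typedef off_t pgoff_t;			/* already 64 bits on modern Unix systems */
#endif

/* with a 64-bit offset type, BufFile positions past 4GB stay exact */
StaticAssertDecl(sizeof(pgoff_t) == 8, "pgoff_t must be 64 bits");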
* @@ -737,10 +737,10 @@ BufFileFlush(BufFile *file) * impossible seek is attempted. */ int -BufFileSeek(BufFile *file, int fileno, off_t offset, int whence) +BufFileSeek(BufFile *file, int fileno, pgoff_t offset, int whence) { int newFile; - off_t newOffset; + pgoff_t newOffset; switch (whence) { @@ -754,8 +754,7 @@ BufFileSeek(BufFile *file, int fileno, off_t offset, int whence) /* * Relative seek considers only the signed offset, ignoring - * fileno. Note that large offsets (> 1 GB) risk overflow in this - * add, unless we have 64-bit off_t. + * fileno. */ newFile = file->curFile; newOffset = (file->curOffset + file->pos) + offset; @@ -795,7 +794,7 @@ BufFileSeek(BufFile *file, int fileno, off_t offset, int whence) * whether reading or writing, but buffer remains dirty if we were * writing. */ - file->pos = (int) (newOffset - file->curOffset); + file->pos = (int64) (newOffset - file->curOffset); return 0; } /* Otherwise, must reposition buffer, so flush any dirty data */ @@ -830,7 +829,7 @@ BufFileSeek(BufFile *file, int fileno, off_t offset, int whence) } void -BufFileTell(BufFile *file, int *fileno, off_t *offset) +BufFileTell(BufFile *file, int *fileno, pgoff_t *offset) { *fileno = file->curFile; *offset = file->curOffset + file->pos; @@ -852,7 +851,7 @@ BufFileSeekBlock(BufFile *file, int64 blknum) { return BufFileSeek(file, (int) (blknum / BUFFILE_SEG_SIZE), - (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ, + (pgoff_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ, SEEK_SET); } @@ -925,11 +924,11 @@ BufFileAppend(BufFile *target, BufFile *source) * and the offset. */ void -BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset) +BufFileTruncateFileSet(BufFile *file, int fileno, pgoff_t offset) { int numFiles = file->numFiles; int newFile = fileno; - off_t newOffset = file->curOffset; + pgoff_t newOffset = file->curOffset; char segment_name[MAXPGPATH]; int i; @@ -984,10 +983,10 @@ BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset) { /* No need to reset the current pos if the new pos is greater. */ if (newOffset <= file->curOffset + file->pos) - file->pos = (int) (newOffset - file->curOffset); + file->pos = (int64) newOffset - file->curOffset; /* Adjust the nbytes for the current buffer. */ - file->nbytes = (int) (newOffset - file->curOffset); + file->nbytes = (int64) newOffset - file->curOffset; } else if (newFile == file->curFile && newOffset < file->curOffset) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 0e8299dd55646..9670e809b724c 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -201,7 +201,7 @@ typedef struct vfd File nextFree; /* link to next free VFD, if in freelist */ File lruMoreRecently; /* doubly linked recency-of-use list */ File lruLessRecently; - off_t fileSize; /* current size of file (0 if not temporary) */ + pgoff_t fileSize; /* current size of file (0 if not temporary) */ char *fileName; /* name of file, or NULL for unused VFD */ /* NB: fileName is malloc'd, and must be free'd when closing the VFD */ int fileFlags; /* open(2) flags for (re)opening the file */ @@ -400,25 +400,22 @@ pg_fsync(int fd) * portable, even if it runs ok on the current system. * * We assert here that a descriptor for a file was opened with write - * permissions (either O_RDWR or O_WRONLY) and for a directory without - * write permissions (O_RDONLY). + * permissions (i.e., not O_RDONLY) and for a directory without write + * permissions (O_RDONLY). 
Notice that the assertion check is made even + * if fsync() is disabled. * - * Ignore any fstat errors and let the follow-up fsync() do its work. - * Doing this sanity check here counts for the case where fsync() is - * disabled. + * If fstat() fails, ignore it and let the follow-up fsync() complain. */ if (fstat(fd, &st) == 0) { int desc_flags = fcntl(fd, F_GETFL); - /* - * O_RDONLY is historically 0, so just make sure that for directories - * no write flags are used. - */ + desc_flags &= O_ACCMODE; + if (S_ISDIR(st.st_mode)) - Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0); + Assert(desc_flags == O_RDONLY); else - Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0); + Assert(desc_flags != O_RDONLY); } errno = 0; #endif @@ -522,7 +519,7 @@ pg_file_exists(const char *name) * offset of 0 with nbytes 0 means that the entire file should be flushed */ void -pg_flush_data(int fd, off_t offset, off_t nbytes) +pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes) { /* * Right now file flushing is primarily used to avoid making later @@ -638,7 +635,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes) * may simply not be enough address space. If so, silently fall * through to the next implementation. */ - if (nbytes <= (off_t) SSIZE_MAX) + if (nbytes <= (pgoff_t) SSIZE_MAX) p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset); else p = MAP_FAILED; @@ -700,7 +697,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes) * Truncate an open file to a given length. */ static int -pg_ftruncate(int fd, off_t length) +pg_ftruncate(int fd, pgoff_t length) { int ret; @@ -717,7 +714,7 @@ pg_ftruncate(int fd, off_t length) * Truncate a file to a given length by name. */ int -pg_truncate(const char *path, off_t length) +pg_truncate(const char *path, pgoff_t length) { int ret; #ifdef WIN32 @@ -1114,23 +1111,6 @@ BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) tryAgain: #ifdef PG_O_DIRECT_USE_F_NOCACHE - - /* - * The value we defined to stand in for O_DIRECT when simulating it with - * F_NOCACHE had better not collide with any of the standard flags. - */ - StaticAssertStmt((PG_O_DIRECT & - (O_APPEND | - O_CLOEXEC | - O_CREAT | - O_DSYNC | - O_EXCL | - O_RDWR | - O_RDONLY | - O_SYNC | - O_TRUNC | - O_WRONLY)) == 0, - "PG_O_DIRECT value collides with standard flag"); fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode); #else fd = open(fileName, fileFlags, fileMode); @@ -1529,7 +1509,7 @@ FileAccess(File file) * Called whenever a temporary file is deleted to report its size. */ static void -ReportTemporaryFileUsage(const char *path, off_t size) +ReportTemporaryFileUsage(const char *path, pgoff_t size) { pgstat_report_tempfile(size); @@ -2080,7 +2060,7 @@ FileClose(File file) * this. 
*/ int -FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info) +FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info) { Assert(FileIsValid(file)); @@ -2136,7 +2116,7 @@ FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info) } void -FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) +FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info) { int returnCode; @@ -2162,7 +2142,7 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) } ssize_t -FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, +FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info) { ssize_t returnCode; @@ -2219,7 +2199,7 @@ FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, int FileStartReadV(PgAioHandle *ioh, File file, - int iovcnt, off_t offset, + int iovcnt, pgoff_t offset, uint32 wait_event_info) { int returnCode; @@ -2244,7 +2224,7 @@ FileStartReadV(PgAioHandle *ioh, File file, } ssize_t -FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, +FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info) { ssize_t returnCode; @@ -2273,7 +2253,7 @@ FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, */ if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT)) { - off_t past_write = offset; + pgoff_t past_write = offset; for (int i = 0; i < iovcnt; ++i) past_write += iov[i].iov_len; @@ -2312,7 +2292,7 @@ FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, */ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) { - off_t past_write = offset + returnCode; + pgoff_t past_write = offset + returnCode; if (past_write > vfdP->fileSize) { @@ -2376,7 +2356,7 @@ FileSync(File file, uint32 wait_event_info) * appropriate error. */ int -FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info) +FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info) { int returnCode; ssize_t written; @@ -2421,7 +2401,7 @@ FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info) * appropriate error. */ int -FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info) +FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info) { #ifdef HAVE_POSIX_FALLOCATE int returnCode; @@ -2460,7 +2440,7 @@ FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info) return FileZero(file, offset, amount, wait_event_info); } -off_t +pgoff_t FileSize(File file) { Assert(FileIsValid(file)); @@ -2471,14 +2451,14 @@ FileSize(File file) if (FileIsNotOpen(file)) { if (FileAccess(file) < 0) - return (off_t) -1; + return (pgoff_t) -1; } return lseek(VfdCache[file].fd, 0, SEEK_END); } int -FileTruncate(File file, off_t offset, uint32 wait_event_info) +FileTruncate(File file, pgoff_t offset, uint32 wait_event_info) { int returnCode; @@ -3188,9 +3168,10 @@ GetNextTempTableSpace(void) /* * AtEOSubXact_Files * - * Take care of subtransaction commit/abort. At abort, we close temp files - * that the subtransaction may have opened. At commit, we reassign the - * files that were opened to the parent subtransaction. + * Take care of subtransaction commit/abort. At abort, we close AllocateDescs + * that the subtransaction may have opened. At commit, we reassign them to + * the parent subtransaction. (Temporary files are tracked by ResourceOwners + * instead.) 
*/ void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, diff --git a/src/backend/storage/file/fileset.c b/src/backend/storage/file/fileset.c index 64141c7cb91c9..2061aa44e773d 100644 --- a/src/backend/storage/file/fileset.c +++ b/src/backend/storage/file/fileset.c @@ -114,7 +114,8 @@ FileSetCreate(FileSet *fileset, const char *name) } /* - * Open a file that was created with FileSetCreate() */ + * Open a file that was created with FileSetCreate() + */ File FileSetOpen(FileSet *fileset, const char *name, int mode) { @@ -185,7 +186,7 @@ FileSetPath(char *path, FileSet *fileset, Oid tablespace) static Oid ChooseTablespace(const FileSet *fileset, const char *name) { - uint32 hash = hash_any((const unsigned char *) name, strlen(name)); + uint32 hash = hash_bytes((const unsigned char *) name, strlen(name)); return fileset->tablespaces[hash % fileset->ntablespaces]; } diff --git a/src/backend/storage/ipc/dsm_registry.c b/src/backend/storage/ipc/dsm_registry.c index 1d4fd31ffedbc..072f9399969d0 100644 --- a/src/backend/storage/ipc/dsm_registry.c +++ b/src/backend/storage/ipc/dsm_registry.c @@ -15,6 +15,20 @@ * current backend. This function guarantees that only one backend * initializes the segment and that all other backends just attach it. * + * A DSA can be created in or retrieved from the registry by calling + * GetNamedDSA(). As with GetNamedDSMSegment(), if a DSA with the provided + * name does not yet exist, it is created. Otherwise, GetNamedDSA() + * ensures the DSA is attached to the current backend. This function + * guarantees that only one backend initializes the DSA and that all other + * backends just attach it. + * + * A dshash table can be created in or retrieved from the registry by + * calling GetNamedDSHash(). As with GetNamedDSMSegment(), if a hash + * table with the provided name does not yet exist, it is created. + * Otherwise, GetNamedDSHash() ensures the hash table is attached to the + * current backend. This function guarantees that only one backend + * initializes the table and that all other backends just attach it. 
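A sketch of how an extension might use the registry interfaces documented above; the struct, callback, and segment name are hypothetical, but the callback signature matches the reworked GetNamedDSMSegment() below, which now forwards 'arg' to the initializer:

/* hypothetical extension state kept in a named DSM segment */
typedef struct MyExtState
{
	int			counter;
} MyExtState;

/* runs only in the backend that first creates the segment */
static void
my_ext_init(void *ptr, void *arg)
{
	MyExtState *state = (MyExtState *) ptr;

	state->counter = *(int *) arg;	/* 'arg' forwarded by the registry */
}

static void
my_ext_attach(void)
{
	bool		found;
	int			start = 0;
	MyExtState *state;

	state = GetNamedDSMSegment("my_ext", sizeof(MyExtState),
							   my_ext_init, &found, &start);
	(void) state;
}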
+ * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -26,10 +40,12 @@ #include "postgres.h" +#include "funcapi.h" #include "lib/dshash.h" #include "storage/dsm_registry.h" #include "storage/lwlock.h" #include "storage/shmem.h" +#include "utils/builtins.h" #include "utils/memutils.h" typedef struct DSMRegistryCtxStruct @@ -40,15 +56,53 @@ typedef struct DSMRegistryCtxStruct static DSMRegistryCtxStruct *DSMRegistryCtx; -typedef struct DSMRegistryEntry +typedef struct NamedDSMState { - char name[64]; dsm_handle handle; size_t size; +} NamedDSMState; + +typedef struct NamedDSAState +{ + dsa_handle handle; + int tranche; +} NamedDSAState; + +typedef struct NamedDSHState +{ + dsa_handle dsa_handle; + dshash_table_handle dsh_handle; + int tranche; +} NamedDSHState; + +typedef enum DSMREntryType +{ + DSMR_ENTRY_TYPE_DSM, + DSMR_ENTRY_TYPE_DSA, + DSMR_ENTRY_TYPE_DSH, +} DSMREntryType; + +static const char *const DSMREntryTypeNames[] = +{ + [DSMR_ENTRY_TYPE_DSM] = "segment", + [DSMR_ENTRY_TYPE_DSA] = "area", + [DSMR_ENTRY_TYPE_DSH] = "hash", +}; + +typedef struct DSMRegistryEntry +{ + char name[NAMEDATALEN]; + DSMREntryType type; + union + { + NamedDSMState dsm; + NamedDSAState dsa; + NamedDSHState dsh; + }; } DSMRegistryEntry; static const dshash_parameters dsh_params = { - offsetof(DSMRegistryEntry, handle), + offsetof(DSMRegistryEntry, type), sizeof(DSMRegistryEntry), dshash_strcmp, dshash_strhash, @@ -101,9 +155,10 @@ init_dsm_registry(void) { /* Initialize dynamic shared hash table for registry. */ dsm_registry_dsa = dsa_create(LWTRANCHE_DSM_REGISTRY_DSA); + dsm_registry_table = dshash_create(dsm_registry_dsa, &dsh_params, NULL); + dsa_pin(dsm_registry_dsa); dsa_pin_mapping(dsm_registry_dsa); - dsm_registry_table = dshash_create(dsm_registry_dsa, &dsh_params, NULL); /* Store handles in shared memory for other backends to use. */ DSMRegistryCtx->dsah = dsa_get_handle(dsm_registry_dsa); @@ -125,15 +180,19 @@ init_dsm_registry(void) * Initialize or attach a named DSM segment. * * This routine returns the address of the segment. init_callback is called to - * initialize the segment when it is first created. + * initialize the segment when it is first created. 'arg' is passed through to + * the initialization callback function. 
*/ void * GetNamedDSMSegment(const char *name, size_t size, - void (*init_callback) (void *ptr), bool *found) + void (*init_callback) (void *ptr, void *arg), + bool *found, void *arg) { DSMRegistryEntry *entry; MemoryContext oldcontext; void *ret; + NamedDSMState *state; + dsm_segment *seg; Assert(found); @@ -141,7 +200,7 @@ GetNamedDSMSegment(const char *name, size_t size, ereport(ERROR, (errmsg("DSM segment name cannot be empty"))); - if (strlen(name) >= offsetof(DSMRegistryEntry, handle)) + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) ereport(ERROR, (errmsg("DSM segment name too long"))); @@ -156,41 +215,127 @@ GetNamedDSMSegment(const char *name, size_t size, init_dsm_registry(); entry = dshash_find_or_insert(dsm_registry_table, name, found); + state = &entry->dsm; if (!(*found)) { + entry->type = DSMR_ENTRY_TYPE_DSM; + state->handle = DSM_HANDLE_INVALID; + state->size = size; + } + else if (entry->type != DSMR_ENTRY_TYPE_DSM) + ereport(ERROR, + (errmsg("requested DSM segment does not match type of existing entry"))); + else if (state->size != size) + ereport(ERROR, + (errmsg("requested DSM segment size does not match size of existing segment"))); + + if (state->handle == DSM_HANDLE_INVALID) + { + *found = false; + /* Initialize the segment. */ - dsm_segment *seg = dsm_create(size, 0); + seg = dsm_create(size, 0); + + if (init_callback) + (*init_callback) (dsm_segment_address(seg), arg); dsm_pin_segment(seg); dsm_pin_mapping(seg); - entry->handle = dsm_segment_handle(seg); - entry->size = size; - ret = dsm_segment_address(seg); - - if (init_callback) - (*init_callback) (ret); - } - else if (entry->size != size) - { - ereport(ERROR, - (errmsg("requested DSM segment size does not match size of " - "existing segment"))); + state->handle = dsm_segment_handle(seg); } else { - dsm_segment *seg = dsm_find_mapping(entry->handle); - /* If the existing segment is not already attached, attach it now. */ + seg = dsm_find_mapping(state->handle); if (seg == NULL) { - seg = dsm_attach(entry->handle); + seg = dsm_attach(state->handle); if (seg == NULL) elog(ERROR, "could not map dynamic shared memory segment"); dsm_pin_mapping(seg); } + } + + ret = dsm_segment_address(seg); + dshash_release_lock(dsm_registry_table, entry); + MemoryContextSwitchTo(oldcontext); - ret = dsm_segment_address(seg); + return ret; +} + +/* + * Initialize or attach a named DSA. + * + * This routine returns a pointer to the DSA. A new LWLock tranche ID will be + * generated if needed. Note that the lock tranche will be registered with the + * provided name. Also note that this should be called at most once for a + * given DSA in each backend. + */ +dsa_area * +GetNamedDSA(const char *name, bool *found) +{ + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dsa_area *ret; + NamedDSAState *state; + + Assert(found); + + if (!name || *name == '\0') + ereport(ERROR, + (errmsg("DSA name cannot be empty"))); + + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) + ereport(ERROR, + (errmsg("DSA name too long"))); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Connect to the registry. 
*/ + init_dsm_registry(); + + entry = dshash_find_or_insert(dsm_registry_table, name, found); + state = &entry->dsa; + if (!(*found)) + { + entry->type = DSMR_ENTRY_TYPE_DSA; + state->handle = DSA_HANDLE_INVALID; + state->tranche = -1; + } + else if (entry->type != DSMR_ENTRY_TYPE_DSA) + ereport(ERROR, + (errmsg("requested DSA does not match type of existing entry"))); + + if (state->tranche == -1) + { + *found = false; + + /* Initialize the LWLock tranche for the DSA. */ + state->tranche = LWLockNewTrancheId(name); + } + + if (state->handle == DSA_HANDLE_INVALID) + { + *found = false; + + /* Initialize the DSA. */ + ret = dsa_create(state->tranche); + dsa_pin(ret); + dsa_pin_mapping(ret); + + /* Store handle for other backends to use. */ + state->handle = dsa_get_handle(ret); + } + else if (dsa_is_attached(state->handle)) + ereport(ERROR, + (errmsg("requested DSA already attached to current process"))); + else + { + /* Attach to existing DSA. */ + ret = dsa_attach(state->handle); + dsa_pin_mapping(ret); } dshash_release_lock(dsm_registry_table, entry); @@ -198,3 +343,147 @@ GetNamedDSMSegment(const char *name, size_t size, return ret; } + +/* + * Initialize or attach a named dshash table. + * + * This routine returns the address of the table. The tranche_id member of + * params is ignored; a new LWLock tranche ID will be generated if needed. + * Note that the lock tranche will be registered with the provided name. Also + * note that this should be called at most once for a given table in each + * backend. + */ +dshash_table * +GetNamedDSHash(const char *name, const dshash_parameters *params, bool *found) +{ + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dshash_table *ret; + NamedDSHState *dsh_state; + + Assert(params); + Assert(found); + + if (!name || *name == '\0') + ereport(ERROR, + (errmsg("DSHash name cannot be empty"))); + + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) + ereport(ERROR, + (errmsg("DSHash name too long"))); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Connect to the registry. */ + init_dsm_registry(); + + entry = dshash_find_or_insert(dsm_registry_table, name, found); + dsh_state = &entry->dsh; + if (!(*found)) + { + entry->type = DSMR_ENTRY_TYPE_DSH; + dsh_state->dsa_handle = DSA_HANDLE_INVALID; + dsh_state->dsh_handle = DSHASH_HANDLE_INVALID; + dsh_state->tranche = -1; + } + else if (entry->type != DSMR_ENTRY_TYPE_DSH) + ereport(ERROR, + (errmsg("requested DSHash does not match type of existing entry"))); + + if (dsh_state->tranche == -1) + { + *found = false; + + /* Initialize the LWLock tranche for the hash table. */ + dsh_state->tranche = LWLockNewTrancheId(name); + } + + if (dsh_state->dsa_handle == DSA_HANDLE_INVALID) + { + dshash_parameters params_copy; + dsa_area *dsa; + + *found = false; + + /* Initialize the DSA for the hash table. */ + dsa = dsa_create(dsh_state->tranche); + + /* Initialize the dshash table. */ + memcpy(&params_copy, params, sizeof(dshash_parameters)); + params_copy.tranche_id = dsh_state->tranche; + ret = dshash_create(dsa, &params_copy, NULL); + + dsa_pin(dsa); + dsa_pin_mapping(dsa); + + /* Store handles for other backends to use.
*/ + dsh_state->dsa_handle = dsa_get_handle(dsa); + dsh_state->dsh_handle = dshash_get_hash_table_handle(ret); + } + else if (dsa_is_attached(dsh_state->dsa_handle)) + ereport(ERROR, + (errmsg("requested DSHash already attached to current process"))); + else + { + dsa_area *dsa; + + /* XXX: Should we verify params matches what table was created with? */ + + /* Attach to existing DSA for the hash table. */ + dsa = dsa_attach(dsh_state->dsa_handle); + dsa_pin_mapping(dsa); + + /* Attach to existing dshash table. */ + ret = dshash_attach(dsa, params, dsh_state->dsh_handle, NULL); + } + + dshash_release_lock(dsm_registry_table, entry); + MemoryContextSwitchTo(oldcontext); + + return ret; +} + +Datum +pg_get_dsm_registry_allocations(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dshash_seq_status status; + + InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + init_dsm_registry(); + MemoryContextSwitchTo(oldcontext); + + dshash_seq_init(&status, dsm_registry_table, false); + while ((entry = dshash_seq_next(&status)) != NULL) + { + Datum vals[3]; + bool nulls[3] = {0}; + + vals[0] = CStringGetTextDatum(entry->name); + vals[1] = CStringGetTextDatum(DSMREntryTypeNames[entry->type]); + + /* Be careful to only return the sizes of initialized entries. */ + if (entry->type == DSMR_ENTRY_TYPE_DSM && + entry->dsm.handle != DSM_HANDLE_INVALID) + vals[2] = Int64GetDatum(entry->dsm.size); + else if (entry->type == DSMR_ENTRY_TYPE_DSA && + entry->dsa.handle != DSA_HANDLE_INVALID) + vals[2] = Int64GetDatum(dsa_get_total_size_from_handle(entry->dsa.handle)); + else if (entry->type == DSMR_ENTRY_TYPE_DSH && + entry->dsh.dsa_handle != DSA_HANDLE_INVALID) + vals[2] = Int64GetDatum(dsa_get_total_size_from_handle(entry->dsh.dsa_handle)); + else + nulls[2] = true; + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, vals, nulls); + } + dshash_seq_term(&status); + + return (Datum) 0; +} diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c index 567739b5be93a..2704e80b3a7d9 100644 --- a/src/backend/storage/ipc/ipc.c +++ b/src/backend/storage/ipc/ipc.c @@ -399,7 +399,7 @@ cancel_before_shmem_exit(pg_on_exit_callback function, Datum arg) before_shmem_exit_list[before_shmem_exit_index - 1].arg == arg) --before_shmem_exit_index; else - elog(ERROR, "before_shmem_exit callback (%p,0x%" PRIxPTR ") is not the latest entry", + elog(ERROR, "before_shmem_exit callback (%p,0x%" PRIx64 ") is not the latest entry", function, arg); } diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 2fa045e6b0f66..adebba625e691 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -24,6 +24,7 @@ #include "access/twophase.h" #include "access/xlogprefetcher.h" #include "access/xlogrecovery.h" +#include "access/xlogwait.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -80,23 +81,12 @@ RequestAddinShmemSpace(Size size) /* * CalculateShmemSize - * Calculates the amount of shared memory and number of semaphores needed. - * - * If num_semaphores is not NULL, it will be set to the number of semaphores - * required. + * Calculates the amount of shared memory needed.
*/ Size -CalculateShmemSize(int *num_semaphores) +CalculateShmemSize(void) { Size size; - int numSemas; - - /* Compute number of semaphores we'll need */ - numSemas = ProcGlobalSemas(); - - /* Return the number of semaphores if requested by the caller */ - if (num_semaphores) - *num_semaphores = numSemas; /* * Size of the Postgres shared-memory block is estimated via moderately- @@ -108,7 +98,6 @@ CalculateShmemSize(int *num_semaphores) * during the actual allocation phase. */ size = 100000; - size = add_size(size, PGSemaphoreShmemSize(numSemas)); size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt))); size = add_size(size, dsm_estimate_size()); @@ -150,6 +139,8 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); + size = add_size(size, WaitLSNShmemSize()); + size = add_size(size, LogicalDecodingCtlShmemSize()); /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -202,12 +193,11 @@ CreateSharedMemoryAndSemaphores(void) PGShmemHeader *shim; PGShmemHeader *seghdr; Size size; - int numSemas; Assert(!IsUnderPostmaster); /* Compute the size of the shared-memory block */ - size = CalculateShmemSize(&numSemas); + size = CalculateShmemSize(); elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size); /* @@ -224,11 +214,6 @@ CreateSharedMemoryAndSemaphores(void) InitShmemAccess(seghdr); - /* - * Create semaphores - */ - PGReserveSemaphores(numSemas); - /* * Set up shared memory allocation mechanism */ @@ -343,6 +328,8 @@ CreateOrAttachShmemStructs(void) WaitEventCustomShmemInit(); InjectionPointShmemInit(); AioShmemInit(); + WaitLSNShmemInit(); + LogicalDecodingCtlShmemInit(); } /* @@ -358,12 +345,11 @@ InitializeShmemGUCs(void) Size size_b; Size size_mb; Size hp_size; - int num_semas; /* * Calculate the shared memory size and round up to the nearest megabyte. */ - size_b = CalculateShmemSize(&num_semas); + size_b = CalculateShmemSize(); size_mb = add_size(size_b, (1024 * 1024) - 1) / (1024 * 1024); sprintf(buf, "%zu", size_mb); SetConfigOption("shared_memory_size", buf, @@ -383,6 +369,6 @@ InitializeShmemGUCs(void) PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); } - sprintf(buf, "%d", num_semas); + sprintf(buf, "%d", ProcGlobalSemas()); SetConfigOption("num_os_semaphores", buf, PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); } diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index c6aefd2f688dd..beadeb5e46afa 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -187,9 +187,11 @@ WaitLatch(Latch *latch, int wakeEvents, long timeout, if (!(wakeEvents & WL_LATCH_SET)) latch = NULL; ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch); - ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, - (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), - NULL); + + if (IsUnderPostmaster) + ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, + (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), + NULL); if (WaitEventSetWait(LatchWaitSet, (wakeEvents & WL_TIMEOUT) ? 
timeout : -1, diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index e5b945a9ee39c..f3a1603204ea6 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -54,7 +54,6 @@ #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/pg_authid.h" -#include "commands/dbcommands.h" #include "miscadmin.h" #include "pgstat.h" #include "port/pg_lfind.h" @@ -62,6 +61,7 @@ #include "storage/procarray.h" #include "utils/acl.h" #include "utils/builtins.h" +#include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/snapmgr.h" @@ -1162,7 +1162,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) * Allocate a temporary array to avoid modifying the array passed as * argument. */ - xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt)); + xids = palloc_array(TransactionId, running->xcnt + running->subxcnt); /* * Add to the temp array any xids which have not already completed. @@ -1622,58 +1622,6 @@ TransactionIdIsInProgress(TransactionId xid) return false; } -/* - * TransactionIdIsActive -- is xid the top-level XID of an active backend? - * - * This differs from TransactionIdIsInProgress in that it ignores prepared - * transactions, as well as transactions running on the primary if we're in - * hot standby. Also, we ignore subtransactions since that's not needed - * for current uses. - */ -bool -TransactionIdIsActive(TransactionId xid) -{ - bool result = false; - ProcArrayStruct *arrayP = procArray; - TransactionId *other_xids = ProcGlobal->xids; - int i; - - /* - * Don't bother checking a transaction older than RecentXmin; it could not - * possibly still be running. - */ - if (TransactionIdPrecedes(xid, RecentXmin)) - return false; - - LWLockAcquire(ProcArrayLock, LW_SHARED); - - for (i = 0; i < arrayP->numProcs; i++) - { - int pgprocno = arrayP->pgprocnos[i]; - PGPROC *proc = &allProcs[pgprocno]; - TransactionId pxid; - - /* Fetch xid just once - see GetNewTransactionId */ - pxid = UINT32_ACCESS_ONCE(other_xids[i]); - - if (!TransactionIdIsValid(pxid)) - continue; - - if (proc->pid == 0) - continue; /* ignore prepared transactions */ - - if (TransactionIdEquals(pxid, xid)) - { - result = true; - break; - } - } - - LWLockRelease(ProcArrayLock); - - return result; -} - /* * Determine XID horizons. @@ -2866,8 +2814,10 @@ GetRunningTransactionData(void) * * Similar to GetSnapshotData but returns just oldestActiveXid. We include * all PGPROCs with an assigned TransactionId, even VACUUM processes. - * We look at all databases, though there is no need to include WALSender - * since this has no effect on hot standby conflicts. + * + * If allDbs is true, we look at all databases, though there is no need to + * include WALSender since this has no effect on hot standby conflicts. If + * allDbs is false, skip processes attached to other databases. * * This is never executed during recovery so there is no need to look at + * KnownAssignedXids. @@ -2875,9 +2825,12 @@ GetRunningTransactionData(void) * We don't worry about updating other counters, we want to keep this as * simple as possible and leave GetSnapshotData() as the primary code for * that bookkeeping. + * + * If inCommitOnly is true, consider only transactions that are currently in + * their commit critical section (i.e., with DELAY_CHKPT_IN_COMMIT set).
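Hypothetical call sites for the two new parameters (illustrative only; the actual callers live elsewhere in the tree):

	/* oldest xid currently inside a commit critical section, all databases */
	xid = GetOldestActiveTransactionId(true, true);

	/* oldest active xid, restricted to the current database */
	xid = GetOldestActiveTransactionId(false, false);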
*/ TransactionId -GetOldestActiveTransactionId(void) +GetOldestActiveTransactionId(bool inCommitOnly, bool allDbs) { ProcArrayStruct *arrayP = procArray; TransactionId *other_xids = ProcGlobal->xids; @@ -2904,6 +2857,8 @@ GetOldestActiveTransactionId(void) for (index = 0; index < arrayP->numProcs; index++) { TransactionId xid; + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; /* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(other_xids[index]); @@ -2911,6 +2866,13 @@ GetOldestActiveTransactionId(void) if (!TransactionIdIsNormal(xid)) continue; + if (inCommitOnly && + (proc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0) + continue; + + if (!allDbs && proc->databaseId != MyDatabaseId) + continue; + if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; @@ -3050,8 +3012,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids, int type) Assert(type != 0); /* allocate what's certainly enough result space */ - vxids = (VirtualTransactionId *) - palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); + vxids = palloc_array(VirtualTransactionId, arrayP->maxProcs); LWLockAcquire(ProcArrayLock, LW_SHARED); @@ -3331,8 +3292,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, int index; /* allocate what's certainly enough result space */ - vxids = (VirtualTransactionId *) - palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); + vxids = palloc_array(VirtualTransactionId, arrayP->maxProcs); LWLockAcquire(ProcArrayLock, LW_SHARED); diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index a9bb540b55ac2..b0b93d9609184 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -576,6 +576,9 @@ ProcessProcSignalBarrier(void) case PROCSIGNAL_BARRIER_SMGRRELEASE: processed = ProcessBarrierSmgrRelease(); break; + case PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO: + processed = ProcessBarrierUpdateXLogLogicalInfo(); + break; } /* @@ -728,7 +731,11 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) void SendCancelRequest(int backendPID, const uint8 *cancel_key, int cancel_key_len) { - Assert(backendPID != 0); + if (backendPID == 0) + { + ereport(LOG, (errmsg("invalid cancel request with PID 0"))); + return; + } /* * See if we have a matching backend. Reading the pss_pid and diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c index 2c79a649f4632..9c888d3eb7843 100644 --- a/src/backend/storage/ipc/shm_mq.c +++ b/src/backend/storage/ipc/shm_mq.c @@ -289,7 +289,7 @@ shm_mq_get_sender(shm_mq *mq) shm_mq_handle * shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle) { - shm_mq_handle *mqh = palloc(sizeof(shm_mq_handle)); + shm_mq_handle *mqh = palloc_object(shm_mq_handle); Assert(mq->mq_receiver == MyProc || mq->mq_sender == MyProc); mqh->mqh_queue = mq; diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index c9ae3b45b76b1..7b63c7dc90a31 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -65,6 +65,7 @@ #include "postgres.h" +#include "common/int.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" @@ -76,6 +77,7 @@ #include "utils/builtins.h" static void *ShmemAllocRaw(Size size, Size *allocated_size); +static void *ShmemAllocUnlocked(Size size); /* shared memory global variables */ @@ -234,7 +236,7 @@ ShmemAllocRaw(Size size, Size *allocated_size) * * We consider maxalign, rather than cachealign, sufficient here. 
*/ -void * +static void * ShmemAllocUnlocked(Size size) { Size newStart; @@ -330,8 +332,8 @@ InitShmemIndex(void) */ HTAB * ShmemInitHash(const char *name, /* table string name for shmem index */ - long init_size, /* initial table size */ - long max_size, /* max size of the table */ + int64 init_size, /* initial table size */ + int64 max_size, /* max size of the table */ HASHCTL *infoP, /* info about key and bucket size */ int hash_flags) /* info about infoP */ { @@ -494,9 +496,7 @@ add_size(Size s1, Size s2) { Size result; - result = s1 + s2; - /* We are assuming Size is an unsigned type here... */ - if (result < s1 || result < s2) + if (pg_add_size_overflow(s1, s2, &result)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("requested shared memory size overflows size_t"))); @@ -511,11 +511,7 @@ mul_size(Size s1, Size s2) { Size result; - if (s1 == 0 || s2 == 0) - return 0; - result = s1 * s2; - /* We are assuming Size is an unsigned type here... */ - if (result / s2 != s1) + if (pg_mul_size_overflow(s1, s2, &result)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("requested shared memory size overflows size_t"))); @@ -603,19 +599,16 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) InitMaterializedSRF(fcinfo, 0); max_nodes = pg_numa_get_max_node(); - nodes = palloc(sizeof(Size) * (max_nodes + 1)); + nodes = palloc_array(Size, max_nodes + 1); /* - * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while - * the OS may have different memory page sizes. + * Shared memory allocations can vary in size and may not align with OS + * memory page boundaries, while NUMA queries work on pages. * - * To correctly map between them, we need to: 1. Determine the OS memory - * page size 2. Calculate how many OS pages are used by all buffer blocks - * 3. Calculate how many OS pages are contained within each database - * block. - * - * This information is needed before calling move_pages() for NUMA memory - * node inquiry. + * To correctly map each allocation to NUMA nodes, we need to: 1. + * Determine the OS memory page size. 2. Align each allocation's start/end + * addresses to page boundaries. 3. Query NUMA node information for all + * pages spanning the allocation. */ os_page_size = pg_get_shmem_pagesize(); @@ -631,8 +624,8 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) * them using only fraction of the total pages. 
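Farther up in this file, add_size() and mul_size() now delegate overflow detection to pg_add_size_overflow() and pg_mul_size_overflow() from common/int.h. A minimal standalone equivalent of the addition check, assuming the GCC/Clang overflow builtins:

#include <stdbool.h>
#include <stddef.h>

/* returns true on overflow, mirroring the pg_*_overflow() convention */
static inline bool
add_size_overflow_sketch(size_t a, size_t b, size_t *result)
{
	return __builtin_add_overflow(a, b, result);
}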
*/ shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1; - page_ptrs = palloc0(sizeof(void *) * shm_total_page_count); - pages_status = palloc(sizeof(int) * shm_total_page_count); + page_ptrs = palloc0_array(void *, shm_total_page_count); + pages_status = palloc_array(int, shm_total_page_count); if (firstNumaTouch) elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts"); @@ -679,12 +672,10 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) */ for (i = 0; i < shm_ent_page_count; i++) { - volatile uint64 touch pg_attribute_unused(); - page_ptrs[i] = startptr + (i * os_page_size); if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, page_ptrs[i]); + pg_numa_touch_mem_if_required(page_ptrs[i]); CHECK_FOR_INTERRUPTS(); } @@ -716,7 +707,7 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) for (i = 0; i <= max_nodes; i++) { values[0] = CStringGetTextDatum(ent->key); - values[1] = i; + values[1] = Int32GetDatum(i); values[2] = Int64GetDatum(nodes[i] * os_page_size); tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c index c5748b690f408..d7a845e2c229f 100644 --- a/src/backend/storage/ipc/sinvaladt.c +++ b/src/backend/storage/ipc/sinvaladt.c @@ -331,7 +331,7 @@ CleanupInvalidationState(int status, Datum arg) ProcState *stateP; int i; - Assert(PointerIsValid(segP)); + Assert(segP); LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 7fa8d9247e097..773832c3a36cb 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -499,7 +499,7 @@ ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon, * seems OK, given that this kind of conflict should not normally be * reached, e.g. due to using a physical replication slot. */ - if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel) + if (IsLogicalDecodingEnabled() && isCatalogRel) InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid, snapshotConflictHorizon); } @@ -840,7 +840,7 @@ ResolveRecoveryConflictWithBufferPin(void) * SIGHUP signal handler, etc cannot do that because it uses the different * latch from that ProcWaitForSignal() waits on. */ - ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN); + ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP); if (got_standby_delay_timeout) SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); @@ -1285,6 +1285,7 @@ LogStandbySnapshot(void) RunningTransactions running; xl_standby_lock *locks; int nlocks; + bool logical_decoding_enabled = IsLogicalDecodingEnabled(); Assert(XLogStandbyInfoActive()); @@ -1325,13 +1326,13 @@ LogStandbySnapshot(void) * record. Fortunately this routine isn't executed frequently, and it's * only a shared lock. */ - if (wal_level < WAL_LEVEL_LOGICAL) + if (!logical_decoding_enabled) LWLockRelease(ProcArrayLock); recptr = LogCurrentRunningXacts(running); /* Release lock if we kept it longer ... 
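The loop above touches and then queries every OS page of the segment. The page-boundary arithmetic that the rewritten comment describes (align each allocation's start down and its end up, then count the pages in between) looks like this in isolation; the helper name and signature are invented for illustration:

```c
#include <stddef.h>
#include <stdint.h>

/* Illustrative helper: the page-aligned span covering [start, start+len). */
static void
os_pages_spanning(uintptr_t start, size_t len, size_t os_page_size,
				  uintptr_t *first_page, size_t *npages)
{
	uintptr_t	end = start + len;

	*first_page = start - (start % os_page_size);	/* align start down */
	if (end % os_page_size != 0)
		end += os_page_size - (end % os_page_size);	/* align end up */
	*npages = (end - *first_page) / os_page_size;
}
```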
*/ - if (wal_level >= WAL_LEVEL_LOGICAL) + if (logical_decoding_enabled) LWLockRelease(ProcArrayLock); /* GetRunningTransactionData() acquired XidGenLock, we must release it */ @@ -1376,7 +1377,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) if (xlrec.subxid_overflow) elog(DEBUG2, - "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + "snapshot of %d running transactions overflowed (lsn %X/%08X oldest xid %u latest complete %u next xid %u)", CurrRunningXacts->xcnt, LSN_FORMAT_ARGS(recptr), CurrRunningXacts->oldestRunningXid, @@ -1384,7 +1385,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) CurrRunningXacts->nextXid); else elog(DEBUG2, - "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + "snapshot of %d+%d running transaction ids (lsn %X/%08X oldest xid %u latest complete %u next xid %u)", CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, LSN_FORMAT_ARGS(recptr), CurrRunningXacts->oldestRunningXid, diff --git a/src/backend/storage/ipc/waiteventset.c b/src/backend/storage/ipc/waiteventset.c index 7c0e66900f98d..f2174e72ae68e 100644 --- a/src/backend/storage/ipc/waiteventset.c +++ b/src/backend/storage/ipc/waiteventset.c @@ -67,6 +67,7 @@ #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" +#include "port/atomics.h" #include "portability/instr_time.h" #include "postmaster/postmaster.h" #include "storage/fd.h" @@ -461,7 +462,6 @@ CreateWaitEventSet(ResourceOwner resowner, int nevents) * pending signals are serviced. */ set->handles[0] = pgwin32_signal_event; - StaticAssertStmt(WSA_INVALID_EVENT == NULL, ""); #endif return set; @@ -978,6 +978,8 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events) #endif #if defined(WAIT_USE_WIN32) +StaticAssertDecl(WSA_INVALID_EVENT == NULL, ""); + static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event) { @@ -1476,7 +1478,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, struct pollfd *cur_pollfd; /* Sleep */ - rc = poll(set->pollfds, set->nevents, (int) cur_timeout); + rc = poll(set->pollfds, set->nevents, cur_timeout); /* Check return code */ if (rc < 0) @@ -2009,7 +2011,7 @@ ResOwnerReleaseWaitEventSet(Datum res) * NB: be sure to save and restore errno around it. (That's standard practice * in most signal handlers, of course, but we used to omit it in handlers that * only set a flag.) XXX - * + * * NB: this function is called from critical sections and signal handlers so * throwing an error is not a good idea. 
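The elog() format changes just above ("%X/%X" becoming "%X/%08X") zero-pad the low half of an LSN so every LSN prints at a fixed width. LSN_FORMAT_ARGS() expands to the high and low 32 bits of the 64-bit value; a standalone illustration of the effect:

```c
#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;	/* stand-in for PostgreSQL's typedef */

/* The same high/low split performed by PostgreSQL's LSN_FORMAT_ARGS(). */
#define LSN_FORMAT_ARGS(lsn)	((uint32_t) ((lsn) >> 32)), ((uint32_t) (lsn))

int
main(void)
{
	XLogRecPtr	lsn = ((XLogRecPtr) 0x1 << 32) | 0x8;

	printf("%X/%X\n", LSN_FORMAT_ARGS(lsn));	/* old style: 1/8 */
	printf("%X/%08X\n", LSN_FORMAT_ARGS(lsn));	/* new style: 1/00000008 */
	return 0;
}
```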
* diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c index 68b76f2cc18a0..2bd872d6581f8 100644 --- a/src/backend/storage/large_object/inv_api.c +++ b/src/backend/storage/large_object/inv_api.c @@ -298,7 +298,7 @@ inv_open(Oid lobjId, int flags, MemoryContext mcxt) void inv_close(LargeObjectDesc *obj_desc) { - Assert(PointerIsValid(obj_desc)); + Assert(obj_desc); pfree(obj_desc); } @@ -344,7 +344,7 @@ inv_getsize(LargeObjectDesc *obj_desc) SysScanDesc sd; HeapTuple tuple; - Assert(PointerIsValid(obj_desc)); + Assert(obj_desc); open_lo_relation(); @@ -389,7 +389,7 @@ inv_seek(LargeObjectDesc *obj_desc, int64 offset, int whence) { int64 newoffset; - Assert(PointerIsValid(obj_desc)); + Assert(obj_desc); /* * We allow seek/tell if you have either read or write permission, so no @@ -436,7 +436,7 @@ inv_seek(LargeObjectDesc *obj_desc, int64 offset, int whence) int64 inv_tell(LargeObjectDesc *obj_desc) { - Assert(PointerIsValid(obj_desc)); + Assert(obj_desc); /* * We allow seek/tell if you have either read or write permission, so no @@ -459,7 +459,7 @@ inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes) SysScanDesc sd; HeapTuple tuple; - Assert(PointerIsValid(obj_desc)); + Assert(obj_desc); Assert(buf != NULL); if ((obj_desc->flags & IFS_RDLOCK) == 0) @@ -556,12 +556,10 @@ inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes) bool pfreeit; union { - bytea hdr; + alignas(int32) bytea hdr; /* this is to make the union big enough for a LO data chunk: */ char data[LOBLKSIZE + VARHDRSZ]; - /* ensure union is aligned well enough: */ - int32 align_it; - } workbuf; + } workbuf = {0}; char *workb = VARDATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; @@ -569,7 +567,7 @@ inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes) bool replace[Natts_pg_largeobject]; CatalogIndexState indstate; - Assert(PointerIsValid(obj_desc)); + Assert(obj_desc); Assert(buf != NULL); /* enforce writability because snapshot is probably wrong otherwise */ @@ -747,12 +745,10 @@ inv_truncate(LargeObjectDesc *obj_desc, int64 len) Form_pg_largeobject olddata; union { - bytea hdr; + alignas(int32) bytea hdr; /* this is to make the union big enough for a LO data chunk: */ char data[LOBLKSIZE + VARHDRSZ]; - /* ensure union is aligned well enough: */ - int32 align_it; - } workbuf; + } workbuf = {0}; char *workb = VARDATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; @@ -760,7 +756,7 @@ inv_truncate(LargeObjectDesc *obj_desc, int64 len) bool replace[Natts_pg_largeobject]; CatalogIndexState indstate; - Assert(PointerIsValid(obj_desc)); + Assert(obj_desc); /* enforce writability because snapshot is probably wrong otherwise */ if ((obj_desc->flags & IFS_WRLOCK) == 0) diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile index 6cbaf23b855f6..a5fbc24ddad6e 100644 --- a/src/backend/storage/lmgr/Makefile +++ b/src/backend/storage/lmgr/Makefile @@ -24,13 +24,9 @@ OBJS = \ include $(top_srcdir)/src/backend/common.mk -ifdef TAS -TASPATH = $(top_builddir)/src/backend/port/tas.o -endif - s_lock_test: s_lock.c $(top_builddir)/src/common/libpgcommon.a $(top_builddir)/src/port/libpgport.a $(CC) $(CPPFLAGS) $(CFLAGS) -DS_LOCK_TEST=1 $(srcdir)/s_lock.c \ - $(TASPATH) -L $(top_builddir)/src/common -lpgcommon \ + -L $(top_builddir)/src/common -lpgcommon \ -L $(top_builddir)/src/port -lpgport -lm -o s_lock_test lwlocknames.h: ../../../include/storage/lwlocklist.h ../../utils/activity/wait_event_names.txt 
generate-lwlocknames.pl diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c index 228303e77f7ca..59abc080ed286 100644 --- a/src/backend/storage/lmgr/condition_variable.c +++ b/src/backend/storage/lmgr/condition_variable.c @@ -18,6 +18,8 @@ #include "postgres.h" +#include + #include "miscadmin.h" #include "portability/instr_time.h" #include "storage/condition_variable.h" diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl index 4441b7cba0c5f..cd3e43c448aed 100644 --- a/src/backend/storage/lmgr/generate-lwlocknames.pl +++ b/src/backend/storage/lmgr/generate-lwlocknames.pl @@ -10,7 +10,6 @@ my $output_path = '.'; my $lastlockidx = -1; -my $continue = "\n"; GetOptions('outdir:s' => \$output_path); @@ -28,18 +27,24 @@ # -# First, record the predefined LWLocks listed in wait_event_names.txt. We'll -# cross-check those with the ones in lwlocklist.h. +# First, record the predefined LWLocks and built-in tranches listed in +# wait_event_names.txt. We'll cross-check those with the ones in lwlocklist.h. # +my @wait_event_tranches; my @wait_event_lwlocks; my $record_lwlocks = 0; +my $in_tranches = 0; while (<$wait_event_names>) { chomp; # Check for end marker. - last if /^# END OF PREDEFINED LWLOCKS/; + if (/^# END OF PREDEFINED LWLOCKS/) + { + $in_tranches = 1; + next; + } # Skip comments and empty lines. next if /^#/; @@ -55,13 +60,29 @@ # Go to the next line if we are not yet recording LWLocks. next if not $record_lwlocks; + # Stop recording if we reach another section. + last if /^Section:/; + # Record the LWLock. (my $waiteventname, my $waitevendocsentence) = split(/\t/, $_); - push(@wait_event_lwlocks, $waiteventname); + + if ($in_tranches) + { + push(@wait_event_tranches, $waiteventname); + } + else + { + push(@wait_event_lwlocks, $waiteventname); + } } +# +# While gathering the list of predefined LWLocks, cross-check the lists in +# lwlocklist.h with the wait events we just recorded. +# my $in_comment = 0; -my $i = 0; +my $lwlock_count = 0; +my $tranche_count = 0; while (<$lwlocklist>) { chomp; @@ -82,40 +103,72 @@ next; } - die "unable to parse lwlocklist.h line \"$_\"" - unless /^PG_LWLOCK\((\d+),\s+(\w+)\)$/; + # + # Gather list of predefined LWLocks and cross-check with the wait events. + # + if (/^PG_LWLOCK\((\d+),\s+(\w+)\)$/) + { + my ($lockidx, $lockname) = ($1, $2); - (my $lockidx, my $lockname) = ($1, $2); + die "lwlocklist.h not in order" if $lockidx < $lastlockidx; + die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; - die "lwlocklist.h not in order" if $lockidx < $lastlockidx; - die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; + die "$lockname defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $lwlock_count >= scalar @wait_event_lwlocks; + die "lists of predefined LWLocks do not match (first mismatch at " + . "$wait_event_lwlocks[$lwlock_count] in wait_event_names.txt and " + . "$lockname in lwlocklist.h)" + if $wait_event_lwlocks[$lwlock_count] ne $lockname; - die "$lockname defined in lwlocklist.h but missing from " - . "wait_event_names.txt" - if $i >= scalar @wait_event_lwlocks; - die "lists of predefined LWLocks do not match (first mismatch at " - . "$wait_event_lwlocks[$i] in wait_event_names.txt and $lockname in " - . 
"lwlocklist.h)" - if $wait_event_lwlocks[$i] ne $lockname; - $i++; + $lwlock_count++; - while ($lastlockidx < $lockidx - 1) + while ($lastlockidx < $lockidx - 1) + { + ++$lastlockidx; + } + $lastlockidx = $lockidx; + + # Add a "Lock" suffix to each lock name, as the C code depends on that. + printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", + $lockname . "Lock"; + + next; + } + + # + # Cross-check the built-in LWLock tranches with the wait events. + # + if (/^PG_LWLOCKTRANCHE\((\w+),\s+(\w+)\)$/) { - ++$lastlockidx; - $continue = ",\n"; + my ($tranche_id, $tranche_name) = ($1, $2); + + die "$tranche_name defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $tranche_count >= scalar @wait_event_tranches; + die + "lists of built-in LWLock tranches do not match (first mismatch at " + . "$wait_event_tranches[$tranche_count] in wait_event_names.txt and " + . "$tranche_name in lwlocklist.h)" + if $wait_event_tranches[$tranche_count] ne $tranche_name; + + $tranche_count++; + + next; } - $lastlockidx = $lockidx; - $continue = ",\n"; - # Add a "Lock" suffix to each lock name, as the C code depends on that - printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", - $lockname . "Lock"; + die "unable to parse lwlocklist.h line \"$_\""; } die - "$wait_event_lwlocks[$i] defined in wait_event_names.txt but missing from " - . "lwlocklist.h" - if $i < scalar @wait_event_lwlocks; + "$wait_event_lwlocks[$lwlock_count] defined in wait_event_names.txt but " + . " missing from lwlocklist.h" + if $lwlock_count < scalar @wait_event_lwlocks; + +die + "$wait_event_tranches[$tranche_count] defined in wait_event_names.txt but " + . "missing from lwlocklist.h" + if $tranche_count < scalar @wait_event_tranches; print $h "\n"; printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1; diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index f50962983c37b..4798eb7900379 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -55,7 +55,7 @@ typedef struct XactLockTableWaitInfo { XLTW_Oper oper; Relation rel; - ItemPointer ctid; + const ItemPointerData *ctid; } XactLockTableWaitInfo; static void XactLockTableWaitErrorCb(void *arg); @@ -559,7 +559,7 @@ UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode) * tuple. See heap_lock_tuple before using this! */ void -LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode) +LockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode) { LOCKTAG tag; @@ -579,7 +579,7 @@ LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode) * Returns true iff the lock was acquired. */ bool -ConditionalLockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode, +ConditionalLockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode, bool logLockFailure) { LOCKTAG tag; @@ -598,7 +598,7 @@ ConditionalLockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode, * UnlockTuple */ void -UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode) +UnlockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode) { LOCKTAG tag; @@ -660,7 +660,7 @@ XactLockTableDelete(TransactionId xid) * and if so wait for its parent. 
*/ void -XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, +XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper) { LOCKTAG tag; @@ -717,7 +717,10 @@ XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, * through, to avoid slowing down the normal case.) */ if (!first) + { + CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); + } first = false; xid = SubTransGetTopmostTransaction(xid); } @@ -757,7 +760,10 @@ ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure) /* See XactLockTableWait about this case */ if (!first) + { + CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); + } first = false; xid = SubTransGetTopmostTransaction(xid); } diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 86b06b9223f0b..9015ba3caf7a1 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -51,7 +51,7 @@ /* GUC variables */ int max_locks_per_xact; /* used to set the lock table size */ -bool log_lock_failure = false; +bool log_lock_failures = false; #define NLOCKENTS() \ mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) @@ -415,6 +415,7 @@ static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner); static void BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode); static void FinishStrongLockAcquire(void); static ProcWaitStatus WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner); +static void waitonlock_error_callback(void *arg); static void ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock); static void LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent); static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode, @@ -443,7 +444,7 @@ void LockManagerShmemInit(void) { HASHCTL info; - long init_table_size, + int64 init_table_size, max_table_size; bool found; @@ -589,7 +590,7 @@ proclock_hash(const void *key, Size keysize) * intermediate variable to suppress cast-pointer-to-int warnings. */ procptr = PointerGetDatum(proclocktag->myProc); - lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS; + lockhash ^= DatumGetUInt32(procptr) << LOG2_NUM_LOCK_PARTITIONS; return lockhash; } @@ -610,7 +611,7 @@ ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode) * This must match proclock_hash()! 
*/ procptr = PointerGetDatum(proclocktag->myProc); - lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS; + lockhash ^= DatumGetUInt32(procptr) << LOG2_NUM_LOCK_PARTITIONS; return lockhash; } @@ -1931,6 +1932,7 @@ static ProcWaitStatus WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner) { ProcWaitStatus result; + ErrorContextCallback waiterrcontext; TRACE_POSTGRESQL_LOCK_WAIT_START(locallock->tag.lock.locktag_field1, locallock->tag.lock.locktag_field2, @@ -1939,6 +1941,12 @@ WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner) locallock->tag.lock.locktag_type, locallock->tag.mode); + /* Setup error traceback support for ereport() */ + waiterrcontext.callback = waitonlock_error_callback; + waiterrcontext.arg = locallock; + waiterrcontext.previous = error_context_stack; + error_context_stack = &waiterrcontext; + /* adjust the process title to indicate that it's waiting */ set_ps_display_suffix("waiting"); @@ -1990,6 +1998,8 @@ WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner) /* reset ps display to remove the suffix */ set_ps_display_remove_suffix(); + error_context_stack = waiterrcontext.previous; + TRACE_POSTGRESQL_LOCK_WAIT_DONE(locallock->tag.lock.locktag_field1, locallock->tag.lock.locktag_field2, locallock->tag.lock.locktag_field3, @@ -2000,6 +2010,28 @@ WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner) return result; } +/* + * error context callback for failures in WaitOnLock + * + * We report which lock was being waited on, in the same style used in + * deadlock reports. This helps with lock timeout errors in particular. + */ +static void +waitonlock_error_callback(void *arg) +{ + LOCALLOCK *locallock = (LOCALLOCK *) arg; + const LOCKTAG *tag = &locallock->tag.lock; + LOCKMODE mode = locallock->tag.mode; + StringInfoData locktagbuf; + + initStringInfo(&locktagbuf); + DescribeLockTag(&locktagbuf, tag); + + errcontext("waiting for %s on %s", + GetLockmodeName(tag->locktag_lockmethodid, mode), + locktagbuf.data); +} + /* * Remove a proc from the wait-queue it is on (caller must know it is on one). * This is only used when the proc has failed to get the lock, so we set its @@ -3068,9 +3100,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) (MaxBackends + max_prepared_xacts + 1)); } else - vxids = (VirtualTransactionId *) - palloc0(sizeof(VirtualTransactionId) * - (MaxBackends + max_prepared_xacts + 1)); + vxids = palloc0_array(VirtualTransactionId, (MaxBackends + max_prepared_xacts + 1)); /* Compute hash code and partition lock, and look up conflicting modes. */ hashcode = LockTagHashCode(locktag); @@ -3539,9 +3569,9 @@ AtPrepare_Locks(void) * but that probably costs more cycles. */ void -PostPrepare_Locks(TransactionId xid) +PostPrepare_Locks(FullTransactionId fxid) { - PGPROC *newproc = TwoPhaseGetDummyProc(xid, false); + PGPROC *newproc = TwoPhaseGetDummyProc(fxid, false); HASH_SEQ_STATUS status; LOCALLOCK *locallock; LOCK *lock; @@ -3769,12 +3799,12 @@ GetLockStatusData(void) int el; int i; - data = (LockData *) palloc(sizeof(LockData)); + data = palloc_object(LockData); /* Guess how much space we'll need. 
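WaitOnLock() now brackets its sleep with an ErrorContextCallback, so any error raised mid-wait (a lock_timeout, say) carries a CONTEXT line naming the awaited lock. The push/pop discipline is the usual one; a sketch with invented names:

```c
/*
 * Sketch of the ErrorContextCallback push/pop pattern WaitOnLock() adopts
 * above; my_error_callback() and do_guarded_work() are illustrative.
 */
static void
my_error_callback(void *arg)
{
	/* appended as a CONTEXT: line to any error reported while pushed */
	errcontext("while working on \"%s\"", (const char *) arg);
}

static void
do_guarded_work(char *what)
{
	ErrorContextCallback errcb;

	errcb.callback = my_error_callback;
	errcb.arg = what;
	errcb.previous = error_context_stack;
	error_context_stack = &errcb;		/* push */

	/* ... work that may ereport(ERROR) ... */

	error_context_stack = errcb.previous;	/* pop before returning */
}
```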
*/ els = MaxBackends; el = 0; - data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * els); + data->locks = palloc_array(LockInstanceData, els); /* * First, we iterate through the per-backend fast-path arrays, locking @@ -3969,7 +3999,7 @@ GetBlockerStatusData(int blocked_pid) PGPROC *proc; int i; - data = (BlockedProcsData *) palloc(sizeof(BlockedProcsData)); + data = palloc_object(BlockedProcsData); /* * Guess how much space we'll need, and preallocate. Most of the time @@ -3979,9 +4009,9 @@ GetBlockerStatusData(int blocked_pid) */ data->nprocs = data->nlocks = data->npids = 0; data->maxprocs = data->maxlocks = data->maxpids = MaxBackends; - data->procs = (BlockedProcData *) palloc(sizeof(BlockedProcData) * data->maxprocs); - data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * data->maxlocks); - data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids); + data->procs = palloc_array(BlockedProcData, data->maxprocs); + data->locks = palloc_array(LockInstanceData, data->maxlocks); + data->waiter_pids = palloc_array(int, data->maxpids); /* * In order to search the ProcArray for blocked_pid and assume that that @@ -4324,11 +4354,11 @@ DumpAllLocks(void) * and PANIC anyway. */ void -lock_twophase_recover(TransactionId xid, uint16 info, +lock_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; - PGPROC *proc = TwoPhaseGetDummyProc(xid, false); + PGPROC *proc = TwoPhaseGetDummyProc(fxid, false); LOCKTAG *locktag; LOCKMODE lockmode; LOCKMETHODID lockmethodid; @@ -4505,7 +4535,7 @@ lock_twophase_recover(TransactionId xid, uint16 info, * starting up into hot standby mode. */ void -lock_twophase_standby_recover(TransactionId xid, uint16 info, +lock_twophase_standby_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; @@ -4524,7 +4554,7 @@ lock_twophase_standby_recover(TransactionId xid, uint16 info, if (lockmode == AccessExclusiveLock && locktag->locktag_type == LOCKTAG_RELATION) { - StandbyAcquireAccessExclusiveLock(xid, + StandbyAcquireAccessExclusiveLock(XidFromFullTransactionId(fxid), locktag->locktag_field1 /* dboid */ , locktag->locktag_field2 /* reloid */ ); } @@ -4537,11 +4567,11 @@ lock_twophase_standby_recover(TransactionId xid, uint16 info, * Find and release the lock indicated by the 2PC record. */ void -lock_twophase_postcommit(TransactionId xid, uint16 info, +lock_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; - PGPROC *proc = TwoPhaseGetDummyProc(xid, true); + PGPROC *proc = TwoPhaseGetDummyProc(fxid, true); LOCKTAG *locktag; LOCKMETHODID lockmethodid; LockMethod lockMethodTable; @@ -4563,10 +4593,10 @@ lock_twophase_postcommit(TransactionId xid, uint16 info, * This is actually just the same as the COMMIT case. */ void -lock_twophase_postabort(TransactionId xid, uint16 info, +lock_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - lock_twophase_postcommit(xid, info, recdata, len); + lock_twophase_postcommit(fxid, info, recdata, len); } /* diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 46f44bc45113f..b839ace57cb43 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -122,62 +122,22 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, * own tranche. 
We absorb the names of these tranches from there into * BuiltinTrancheNames here. * - * 2. There are some predefined tranches for built-in groups of locks. - * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names - * appear in BuiltinTrancheNames[] below. + * 2. There are some predefined tranches for built-in groups of locks defined + * in lwlocklist.h. We absorb the names of these tranches, too. * * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche - * or LWLockRegisterTranche. The names of these that are known in the current - * process appear in LWLockTrancheNames[]. + * or LWLockNewTrancheId. These names are stored in shared memory and can be + * accessed via LWLockTrancheNames. * * All these names are user-visible as wait event names, so choose with care * ... and do not forget to update the documentation's list of wait events. */ static const char *const BuiltinTrancheNames[] = { #define PG_LWLOCK(id, lockname) [id] = CppAsString(lockname), +#define PG_LWLOCKTRANCHE(id, lockname) [LWTRANCHE_##id] = CppAsString(lockname), #include "storage/lwlocklist.h" #undef PG_LWLOCK - [LWTRANCHE_XACT_BUFFER] = "XactBuffer", - [LWTRANCHE_COMMITTS_BUFFER] = "CommitTsBuffer", - [LWTRANCHE_SUBTRANS_BUFFER] = "SubtransBuffer", - [LWTRANCHE_MULTIXACTOFFSET_BUFFER] = "MultiXactOffsetBuffer", - [LWTRANCHE_MULTIXACTMEMBER_BUFFER] = "MultiXactMemberBuffer", - [LWTRANCHE_NOTIFY_BUFFER] = "NotifyBuffer", - [LWTRANCHE_SERIAL_BUFFER] = "SerialBuffer", - [LWTRANCHE_WAL_INSERT] = "WALInsert", - [LWTRANCHE_BUFFER_CONTENT] = "BufferContent", - [LWTRANCHE_REPLICATION_ORIGIN_STATE] = "ReplicationOriginState", - [LWTRANCHE_REPLICATION_SLOT_IO] = "ReplicationSlotIO", - [LWTRANCHE_LOCK_FASTPATH] = "LockFastPath", - [LWTRANCHE_BUFFER_MAPPING] = "BufferMapping", - [LWTRANCHE_LOCK_MANAGER] = "LockManager", - [LWTRANCHE_PREDICATE_LOCK_MANAGER] = "PredicateLockManager", - [LWTRANCHE_PARALLEL_HASH_JOIN] = "ParallelHashJoin", - [LWTRANCHE_PARALLEL_BTREE_SCAN] = "ParallelBtreeScan", - [LWTRANCHE_PARALLEL_QUERY_DSA] = "ParallelQueryDSA", - [LWTRANCHE_PER_SESSION_DSA] = "PerSessionDSA", - [LWTRANCHE_PER_SESSION_RECORD_TYPE] = "PerSessionRecordType", - [LWTRANCHE_PER_SESSION_RECORD_TYPMOD] = "PerSessionRecordTypmod", - [LWTRANCHE_SHARED_TUPLESTORE] = "SharedTupleStore", - [LWTRANCHE_SHARED_TIDBITMAP] = "SharedTidBitmap", - [LWTRANCHE_PARALLEL_APPEND] = "ParallelAppend", - [LWTRANCHE_PER_XACT_PREDICATE_LIST] = "PerXactPredicateList", - [LWTRANCHE_PGSTATS_DSA] = "PgStatsDSA", - [LWTRANCHE_PGSTATS_HASH] = "PgStatsHash", - [LWTRANCHE_PGSTATS_DATA] = "PgStatsData", - [LWTRANCHE_LAUNCHER_DSA] = "LogicalRepLauncherDSA", - [LWTRANCHE_LAUNCHER_HASH] = "LogicalRepLauncherHash", - [LWTRANCHE_DSM_REGISTRY_DSA] = "DSMRegistryDSA", - [LWTRANCHE_DSM_REGISTRY_HASH] = "DSMRegistryHash", - [LWTRANCHE_COMMITTS_SLRU] = "CommitTsSLRU", - [LWTRANCHE_MULTIXACTOFFSET_SLRU] = "MultixactOffsetSLRU", - [LWTRANCHE_MULTIXACTMEMBER_SLRU] = "MultixactMemberSLRU", - [LWTRANCHE_NOTIFY_SLRU] = "NotifySLRU", - [LWTRANCHE_SERIAL_SLRU] = "SerialSLRU", - [LWTRANCHE_SUBTRANS_SLRU] = "SubtransSLRU", - [LWTRANCHE_XACT_SLRU] = "XactSLRU", - [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", - [LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion", +#undef PG_LWLOCKTRANCHE }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == @@ -186,11 +146,12 @@ StaticAssertDecl(lengthof(BuiltinTrancheNames) == /* * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and - * stores the names of all dynamically-created 
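With the new PG_LWLOCKTRANCHE() macro, BuiltinTrancheNames[] is generated entirely by expanding lwlocklist.h, letting the long hand-maintained initializer below be deleted. A self-contained miniature of the X-macro technique (the three-entry list is invented for the demo; PostgreSQL's CppAsString() is just the # stringizer):

```c
#include <stdio.h>

/* A tiny invented lock list standing in for storage/lwlocklist.h. */
#define DEMO_LOCK_LIST	\
	PG_LWLOCK(0, ShmemIndex)	\
	PG_LWLOCK(1, OidGen)	\
	PG_LWLOCK(2, XidGen)

static const char *const demo_names[] = {
#define PG_LWLOCK(id, lockname) [id] = #lockname,
	DEMO_LOCK_LIST
#undef PG_LWLOCK
};

int
main(void)
{
	printf("%s\n", demo_names[1]);	/* prints "OidGen" */
	return 0;
}
```

Because the array and the list come from the same header, the two can never drift apart, which is the same property the reworked generate-lwlocknames.pl cross-check enforces between lwlocklist.h and wait_event_names.txt.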
tranches known to the current - * process. Any unused entries in the array will contain NULL. + * points to the shared memory locations of the names of all + * dynamically-created tranches. Backends inherit the pointer by fork from the + * postmaster (except in the EXEC_BACKEND case, where we have special measures + * to pass it down). */ -static const char **LWLockTrancheNames = NULL; -static int LWLockTrancheNamesAllocated = 0; +char **LWLockTrancheNames = NULL; /* * This points to the main array of LWLocks in shared memory. Backends inherit @@ -202,8 +163,7 @@ LWLockPadded *MainLWLockArray = NULL; /* * We use this structure to keep track of locked LWLocks for release * during error recovery. Normally, only a few will be held at once, but - * occasionally the number can be much higher; for example, the pg_buffercache - * extension locks all buffer partitions simultaneously. + * occasionally the number can be much higher. */ #define MAX_SIMUL_LWLOCKS 200 @@ -224,19 +184,24 @@ typedef struct NamedLWLockTrancheRequest int num_lwlocks; } NamedLWLockTrancheRequest; -static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; -static int NamedLWLockTrancheRequestsAllocated = 0; - /* - * NamedLWLockTrancheRequests is both the valid length of the request array, - * and the length of the shared-memory NamedLWLockTrancheArray later on. - * This variable and NamedLWLockTrancheArray are non-static so that - * postmaster.c can copy them to child processes in EXEC_BACKEND builds. + * NamedLWLockTrancheRequests is the valid length of the request array. These + * variables are non-static so that launch_backend.c can copy them to child + * processes in EXEC_BACKEND builds. */ int NamedLWLockTrancheRequests = 0; +NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; + +/* postmaster's local copy of the request array */ +static NamedLWLockTrancheRequest *LocalNamedLWLockTrancheRequestArray = NULL; + +/* shared memory counter of registered tranches */ +int *LWLockCounter = NULL; -/* points to data in shared memory: */ -NamedLWLockTranche *NamedLWLockTrancheArray = NULL; +/* backend-local counter of registered tranches */ +static int LocalLWLockCounter; + +#define MAX_NAMED_TRANCHES 256 static void InitializeLWLocks(void); static inline void LWLockReportWaitStart(LWLock *lock); @@ -432,31 +397,45 @@ Size LWLockShmemSize(void) { Size size; - int i; int numLocks = NUM_FIXED_LWLOCKS; + /* + * If re-initializing shared memory, the request array will no longer be + * accessible, so switch to the copy in postmaster's local memory. We'll + * copy it back into shared memory later when CreateLWLocks() is called + * again. + */ + if (LocalNamedLWLockTrancheRequestArray) + NamedLWLockTrancheRequestArray = LocalNamedLWLockTrancheRequestArray; + /* Calculate total number of locks needed in the main array. */ numLocks += NumLWLocksForNamedTranches(); - /* Space for the LWLock array. */ - size = mul_size(numLocks, sizeof(LWLockPadded)); + /* Space for dynamic allocation counter. */ + size = MAXALIGN(sizeof(int)); - /* Space for dynamic allocation counter, plus room for alignment. */ - size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE); + /* Space for named tranches. */ + size = add_size(size, mul_size(MAX_NAMED_TRANCHES, sizeof(char *))); + size = add_size(size, mul_size(MAX_NAMED_TRANCHES, NAMEDATALEN)); - /* space for named tranches. */ - size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche))); + /* + * Make space for named tranche requests. 
This is done for the benefit of + * EXEC_BACKEND builds, which otherwise wouldn't be able to call + * GetNamedLWLockTranche() outside postmaster. + */ + size = add_size(size, mul_size(NamedLWLockTrancheRequests, + sizeof(NamedLWLockTrancheRequest))); - /* space for name of each tranche. */ - for (i = 0; i < NamedLWLockTrancheRequests; i++) - size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1); + /* Space for the LWLock array, plus room for cache line alignment. */ + size = add_size(size, LWLOCK_PADDED_SIZE); + size = add_size(size, mul_size(numLocks, sizeof(LWLockPadded))); return size; } /* * Allocate shmem space for the main LWLock array and all tranches and - * initialize it. We also register extension LWLock tranches here. + * initialize it. */ void CreateLWLocks(void) @@ -464,35 +443,52 @@ CreateLWLocks(void) if (!IsUnderPostmaster) { Size spaceLocks = LWLockShmemSize(); - int *LWLockCounter; char *ptr; /* Allocate space */ ptr = (char *) ShmemAlloc(spaceLocks); - /* Leave room for dynamic allocation of tranches */ - ptr += sizeof(int); - - /* Ensure desired alignment of LWLock array */ - ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE; + /* Initialize the dynamic-allocation counter for tranches */ + LWLockCounter = (int *) ptr; + *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED; + ptr += MAXALIGN(sizeof(int)); - MainLWLockArray = (LWLockPadded *) ptr; + /* Initialize tranche names */ + LWLockTrancheNames = (char **) ptr; + ptr += MAX_NAMED_TRANCHES * sizeof(char *); + for (int i = 0; i < MAX_NAMED_TRANCHES; i++) + { + LWLockTrancheNames[i] = ptr; + ptr += NAMEDATALEN; + } /* - * Initialize the dynamic-allocation counter for tranches, which is - * stored just before the first LWLock. + * Move named tranche requests to shared memory. This is done for the + * benefit of EXEC_BACKEND builds, which otherwise wouldn't be able to + * call GetNamedLWLockTranche() outside postmaster. */ - LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); - *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED; + if (NamedLWLockTrancheRequests > 0) + { + /* + * Save the pointer to the request array in postmaster's local + * memory. We'll need it if we ever need to re-initialize shared + * memory after a crash. + */ + LocalNamedLWLockTrancheRequestArray = NamedLWLockTrancheRequestArray; + + memcpy(ptr, NamedLWLockTrancheRequestArray, + NamedLWLockTrancheRequests * sizeof(NamedLWLockTrancheRequest)); + NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) ptr; + ptr += NamedLWLockTrancheRequests * sizeof(NamedLWLockTrancheRequest); + } + + /* Ensure desired alignment of LWLock array */ + ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE; + MainLWLockArray = (LWLockPadded *) ptr; /* Initialize all LWLocks */ InitializeLWLocks(); } - - /* Register named extension LWLock tranches in the current process. 
*/ - for (int i = 0; i < NamedLWLockTrancheRequests; i++) - LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId, - NamedLWLockTrancheArray[i].trancheName); } /* @@ -501,7 +497,6 @@ CreateLWLocks(void) static void InitializeLWLocks(void) { - int numNamedLocks = NumLWLocksForNamedTranches(); int id; int i; int j; @@ -532,32 +527,18 @@ InitializeLWLocks(void) */ if (NamedLWLockTrancheRequests > 0) { - char *trancheNames; - - NamedLWLockTrancheArray = (NamedLWLockTranche *) - &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks]; - - trancheNames = (char *) NamedLWLockTrancheArray + - (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche)); lock = &MainLWLockArray[NUM_FIXED_LWLOCKS]; for (i = 0; i < NamedLWLockTrancheRequests; i++) { NamedLWLockTrancheRequest *request; - NamedLWLockTranche *tranche; - char *name; + int tranche; request = &NamedLWLockTrancheRequestArray[i]; - tranche = &NamedLWLockTrancheArray[i]; - - name = trancheNames; - trancheNames += strlen(request->tranche_name) + 1; - strcpy(name, request->tranche_name); - tranche->trancheId = LWLockNewTrancheId(); - tranche->trancheName = name; + tranche = LWLockNewTrancheId(request->tranche_name); for (j = 0; j < request->num_lwlocks; j++, lock++) - LWLockInitialize(&lock->lock, tranche->trancheId); + LWLockInitialize(&lock->lock, tranche); } } } @@ -609,61 +590,47 @@ GetNamedLWLockTranche(const char *tranche_name) } /* - * Allocate a new tranche ID. + * Allocate a new tranche ID with the provided name. */ int -LWLockNewTrancheId(void) +LWLockNewTrancheId(const char *name) { int result; - int *LWLockCounter; - - LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); - /* We use the ShmemLock spinlock to protect LWLockCounter */ - SpinLockAcquire(ShmemLock); - result = (*LWLockCounter)++; - SpinLockRelease(ShmemLock); - return result; -} + if (!name) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("tranche name cannot be NULL"))); -/* - * Register a dynamic tranche name in the lookup table of the current process. - * - * This routine will save a pointer to the tranche name passed as an argument, - * so the name should be allocated in a backend-lifetime context - * (shared memory, TopMemoryContext, static constant, or similar). - * - * The tranche name will be user-visible as a wait event name, so try to - * use a name that fits the style for those. - */ -void -LWLockRegisterTranche(int tranche_id, const char *tranche_name) -{ - /* This should only be called for user-defined tranches. */ - if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED) - return; + if (strlen(name) >= NAMEDATALEN) + ereport(ERROR, + (errcode(ERRCODE_NAME_TOO_LONG), + errmsg("tranche name too long"), + errdetail("LWLock tranche names must be no longer than %d bytes.", + NAMEDATALEN - 1))); - /* Convert to array index. */ - tranche_id -= LWTRANCHE_FIRST_USER_DEFINED; + /* + * We use the ShmemLock spinlock to protect LWLockCounter and + * LWLockTrancheNames. + */ + SpinLockAcquire(ShmemLock); - /* If necessary, create or enlarge array. 
*/ - if (tranche_id >= LWLockTrancheNamesAllocated) + if (*LWLockCounter - LWTRANCHE_FIRST_USER_DEFINED >= MAX_NAMED_TRANCHES) { - int newalloc; + SpinLockRelease(ShmemLock); + ereport(ERROR, + (errmsg("maximum number of tranches already registered"), + errdetail("No more than %d tranches may be registered.", + MAX_NAMED_TRANCHES))); + } - newalloc = pg_nextpower2_32(Max(8, tranche_id + 1)); + result = (*LWLockCounter)++; + LocalLWLockCounter = *LWLockCounter; + strlcpy(LWLockTrancheNames[result - LWTRANCHE_FIRST_USER_DEFINED], name, NAMEDATALEN); - if (LWLockTrancheNames == NULL) - LWLockTrancheNames = (const char **) - MemoryContextAllocZero(TopMemoryContext, - newalloc * sizeof(char *)); - else - LWLockTrancheNames = - repalloc0_array(LWLockTrancheNames, const char *, LWLockTrancheNamesAllocated, newalloc); - LWLockTrancheNamesAllocated = newalloc; - } + SpinLockRelease(ShmemLock); - LWLockTrancheNames[tranche_id] = tranche_name; + return result; } /* @@ -682,10 +649,23 @@ void RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) { NamedLWLockTrancheRequest *request; + static int NamedLWLockTrancheRequestsAllocated; if (!process_shmem_requests_in_progress) elog(FATAL, "cannot request additional LWLocks outside shmem_request_hook"); + if (!tranche_name) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("tranche name cannot be NULL"))); + + if (strlen(tranche_name) >= NAMEDATALEN) + ereport(ERROR, + (errcode(ERRCODE_NAME_TOO_LONG), + errmsg("tranche name too long"), + errdetail("LWLock tranche names must be no longer than %d bytes.", + NAMEDATALEN - 1))); + if (NamedLWLockTrancheRequestArray == NULL) { NamedLWLockTrancheRequestsAllocated = 16; @@ -706,7 +686,6 @@ RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) } request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests]; - Assert(strlen(tranche_name) + 1 <= NAMEDATALEN); strlcpy(request->tranche_name, tranche_name, NAMEDATALEN); request->num_lwlocks = num_lwlocks; NamedLWLockTrancheRequests++; @@ -718,6 +697,9 @@ RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) void LWLockInitialize(LWLock *lock, int tranche_id) { + /* verify the tranche_id is valid */ + (void) GetLWTrancheName(tranche_id); + pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK); #ifdef LOCK_DEBUG pg_atomic_init_u32(&lock->nwaiters, 0); @@ -759,15 +741,27 @@ GetLWTrancheName(uint16 trancheId) return BuiltinTrancheNames[trancheId]; /* - * It's an extension tranche, so look in LWLockTrancheNames[]. However, - * it's possible that the tranche has never been registered in the current - * process, in which case give up and return "extension". + * We only ever add new entries to LWLockTrancheNames, so most lookups can + * avoid taking the spinlock as long as the backend-local counter + * (LocalLWLockCounter) is greater than the requested tranche ID. Else, + * we need to first update the backend-local counter with ShmemLock held + * before attempting the lookup again. In practice, the latter case is + * probably rare. */ - trancheId -= LWTRANCHE_FIRST_USER_DEFINED; + if (trancheId >= LocalLWLockCounter) + { + SpinLockAcquire(ShmemLock); + LocalLWLockCounter = *LWLockCounter; + SpinLockRelease(ShmemLock); + + if (trancheId >= LocalLWLockCounter) + elog(ERROR, "tranche %d is not registered", trancheId); + } - if (trancheId >= LWLockTrancheNamesAllocated || - LWLockTrancheNames[trancheId] == NULL) - return "extension"; + /* + * It's an extension tranche, so look in LWLockTrancheNames. 
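GetLWTrancheName() keeps the common lookup lock-free: the shared counter only ever grows, so a backend-local copy can be trusted whenever it already exceeds the requested tranche ID, and it is refreshed under ShmemLock only on an apparent miss. The same idea in miniature, with a C11 atomic standing in for the spinlock-protected counter:

```c
#include <stdatomic.h>
#include <stdbool.h>

/*
 * Miniature of the monotonic-counter fast path in GetLWTrancheName().
 * A stale local cache can yield a false "unregistered" suspicion, which
 * the refresh resolves, but never a false positive.
 */
static _Atomic int shared_counter;	/* stands in for *LWLockCounter */
static int	local_counter;			/* per-process cache */

static bool
id_is_registered(int id)
{
	if (id >= local_counter)
	{
		/* apparent miss: refresh the cache and check once more */
		local_counter = atomic_load(&shared_counter);
		if (id >= local_counter)
			return false;
	}
	return true;
}
```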
+ */ + trancheId -= LWTRANCHE_FIRST_USER_DEFINED; return LWLockTrancheNames[trancheId]; } @@ -876,9 +870,13 @@ LWLockWaitListLock(LWLock *lock) while (true) { - /* always try once to acquire lock directly */ + /* + * Always try once to acquire the lock directly, without setting up + * the spin-delay infrastructure. The work necessary for that shows up + * in profiles and is rarely necessary. + */ old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED); - if (!(old_state & LW_FLAG_LOCKED)) + if (likely(!(old_state & LW_FLAG_LOCKED))) break; /* got lock */ /* and then spin without atomic operations until lock is released */ @@ -1004,7 +1002,7 @@ LWLockWakeup(LWLock *lock) else desired_state &= ~LW_FLAG_RELEASE_OK; - if (proclist_is_empty(&wakeup)) + if (proclist_is_empty(&lock->waiters)) desired_state &= ~LW_FLAG_HAS_WAITERS; desired_state &= ~LW_FLAG_LOCKED; /* release lock */ diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index d82114ffca165..149c69a187359 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -168,7 +168,7 @@ * PredicateLockRelation(Relation relation, Snapshot snapshot) * PredicateLockPage(Relation relation, BlockNumber blkno, * Snapshot snapshot) - * PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot, + * PredicateLockTID(Relation relation, const ItemPointerData *tid, Snapshot snapshot, * TransactionId tuple_xid) * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, * BlockNumber newblkno) @@ -180,7 +180,7 @@ * conflict detection (may also trigger rollback) * CheckForSerializableConflictOut(Relation relation, TransactionId xid, * Snapshot snapshot) - * CheckForSerializableConflictIn(Relation relation, ItemPointer tid, + * CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, * BlockNumber blkno) * CheckTableForSerializableConflictIn(Relation relation) * @@ -191,7 +191,7 @@ * AtPrepare_PredicateLocks(void); * PostPrepare_PredicateLocks(TransactionId xid); * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit); - * predicatelock_twophase_recover(TransactionId xid, uint16 info, + * predicatelock_twophase_recover(FullTransactionId fxid, uint16 info, * void *recdata, uint32 len); */ @@ -1145,7 +1145,7 @@ void PredicateLockShmemInit(void) { HASHCTL info; - long max_table_size; + int64 max_table_size; Size requestSize; bool found; @@ -1451,7 +1451,7 @@ GetPredicateLockStatusData(void) HASH_SEQ_STATUS seqstat; PREDICATELOCK *predlock; - data = (PredicateLockData *) palloc(sizeof(PredicateLockData)); + data = palloc_object(PredicateLockData); /* * To ensure consistency, take simultaneous locks on all partition locks @@ -1464,10 +1464,8 @@ GetPredicateLockStatusData(void) /* Get number of locks and allocate appropriately-sized arrays. */ els = hash_get_num_entries(PredicateLockHash); data->nelements = els; - data->locktags = (PREDICATELOCKTARGETTAG *) - palloc(sizeof(PREDICATELOCKTARGETTAG) * els); - data->xacts = (SERIALIZABLEXACT *) - palloc(sizeof(SERIALIZABLEXACT) * els); + data->locktags = palloc_array(PREDICATELOCKTARGETTAG, els); + data->xacts = palloc_array(SERIALIZABLEXACT, els); /* Scan through PredicateLockHash and copy contents */ @@ -2618,7 +2616,7 @@ PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot) * Skip if this is a temporary table. 
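LWLockWaitListLock()'s expanded comment above explains the fast path: one unconditional fetch-or is attempted before any spin-delay bookkeeping, because the uncontended case dominates (hence the added likely() hint). The shape of that loop, with C11 atomics standing in for pg_atomic_fetch_or_u32():

```c
#include <stdatomic.h>
#include <stdint.h>

#define DEMO_FLAG_LOCKED ((uint32_t) 1 << 0)	/* illustrative flag bit */

/* Miniature of the try-once-then-spin acquisition in LWLockWaitListLock(). */
static void
wait_list_lock(_Atomic uint32_t *state)
{
	for (;;)
	{
		/* cheap first attempt: set the bit, inspect the previous value */
		uint32_t	old = atomic_fetch_or(state, DEMO_FLAG_LOCKED);

		if (!(old & DEMO_FLAG_LOCKED))
			return;				/* bit was clear: the lock is ours */

		/* contended: spin on plain reads until the bit clears, then retry */
		while (atomic_load(state) & DEMO_FLAG_LOCKED)
			;					/* the real code adds spin-delay back-off */
	}
}
```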
*/ void -PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot, +PredicateLockTID(Relation relation, const ItemPointerData *tid, Snapshot snapshot, TransactionId tuple_xid) { PREDICATELOCKTARGETTAG tag; @@ -4333,7 +4331,7 @@ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag) * tuple itself. */ void -CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno) +CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno) { PREDICATELOCKTARGETTAG targettag; @@ -4856,7 +4854,7 @@ AtPrepare_PredicateLocks(void) * anyway. We only need to clean up our local state. */ void -PostPrepare_PredicateLocks(TransactionId xid) +PostPrepare_PredicateLocks(FullTransactionId fxid) { if (MySerializableXact == InvalidSerializableXact) return; @@ -4879,12 +4877,12 @@ PostPrepare_PredicateLocks(TransactionId xid) * commits or aborts. */ void -PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit) +PredicateLockTwoPhaseFinish(FullTransactionId fxid, bool isCommit) { SERIALIZABLEXID *sxid; SERIALIZABLEXIDTAG sxidtag; - sxidtag.xid = xid; + sxidtag.xid = XidFromFullTransactionId(fxid); LWLockAcquire(SerializableXactHashLock, LW_SHARED); sxid = (SERIALIZABLEXID *) @@ -4906,10 +4904,11 @@ PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit) * Re-acquire a predicate lock belonging to a transaction that was prepared. */ void -predicatelock_twophase_recover(TransactionId xid, uint16 info, +predicatelock_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhasePredicateRecord *record; + TransactionId xid = XidFromFullTransactionId(fxid); Assert(len == sizeof(TwoPhasePredicateRecord)); @@ -4987,7 +4986,7 @@ predicatelock_twophase_recover(TransactionId xid, uint16 info, HASH_ENTER, &found); Assert(sxid != NULL); Assert(!found); - sxid->myXact = (SERIALIZABLEXACT *) sxact; + sxid->myXact = sxact; /* * Update global xmin. Note that this is a special case compared to diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e9ef0fbfe32cb..ebc3f4ca4575f 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -36,6 +36,7 @@ #include "access/transam.h" #include "access/twophase.h" #include "access/xlogutils.h" +#include "access/xlogwait.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/autovacuum.h" @@ -60,7 +61,7 @@ int LockTimeout = 0; int IdleInTransactionSessionTimeout = 0; int TransactionTimeout = 0; int IdleSessionTimeout = 0; -bool log_lock_waits = false; +bool log_lock_waits = true; /* Pointer to this process's PGPROC struct, if any */ PGPROC *MyProc = NULL; @@ -144,6 +145,7 @@ ProcGlobalShmemSize(void) size = add_size(size, sizeof(PROC_HDR)); size = add_size(size, sizeof(slock_t)); + size = add_size(size, PGSemaphoreShmemSize(ProcGlobalSemas())); size = add_size(size, PGProcShmemSize()); size = add_size(size, FastPathLockShmemSize()); @@ -242,7 +244,7 @@ InitProcGlobal(void) MemSet(ptr, 0, requestSize); procs = (PGPROC *) ptr; - ptr = (char *) ptr + TotalProcs * sizeof(PGPROC); + ptr = ptr + TotalProcs * sizeof(PGPROC); ProcGlobal->allProcs = procs; /* XXX allProcCount isn't really all of them; it excludes prepared xacts */ @@ -256,13 +258,13 @@ InitProcGlobal(void) * how hotly they are accessed. 
*/ ProcGlobal->xids = (TransactionId *) ptr; - ptr = (char *) ptr + (TotalProcs * sizeof(*ProcGlobal->xids)); + ptr = ptr + (TotalProcs * sizeof(*ProcGlobal->xids)); ProcGlobal->subxidStates = (XidCacheStatus *) ptr; - ptr = (char *) ptr + (TotalProcs * sizeof(*ProcGlobal->subxidStates)); + ptr = ptr + (TotalProcs * sizeof(*ProcGlobal->subxidStates)); ProcGlobal->statusFlags = (uint8 *) ptr; - ptr = (char *) ptr + (TotalProcs * sizeof(*ProcGlobal->statusFlags)); + ptr = ptr + (TotalProcs * sizeof(*ProcGlobal->statusFlags)); /* make sure we didn't overflow */ Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize)); @@ -286,6 +288,9 @@ InitProcGlobal(void) /* For asserts checking we did not overflow. */ fpEndPtr = fpPtr + requestSize; + /* Reserve space for semaphores. */ + PGReserveSemaphores(ProcGlobalSemas()); + for (i = 0; i < TotalProcs; i++) { PGPROC *proc = &procs[i]; @@ -947,6 +952,11 @@ ProcKill(int code, Datum arg) */ LWLockReleaseAll(); + /* + * Clean up any pending wait for LSN. + */ + WaitLSNCleanup(); + /* Cancel any pending condition variable sleep, too */ ConditionVariableCancelSleep(); diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index dbb49ed9197d7..05376431ef290 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -191,7 +191,7 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail */ OffsetNumber PageAddItemExtended(Page page, - Item item, + const void *item, Size size, OffsetNumber offsetNumber, int flags) @@ -785,8 +785,8 @@ PageRepairFragmentation(Page page) if (totallen > (Size) (pd_special - pd_lower)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted item lengths: total %u, available space %u", - (unsigned int) totallen, pd_special - pd_lower))); + errmsg("corrupted item lengths: total %zu, available space %u", + totallen, pd_special - pd_lower))); compactify_tuples(itemidbase, nstorage, page, presorted); } @@ -1088,8 +1088,8 @@ PageIndexTupleDelete(Page page, OffsetNumber offnum) offset != MAXALIGN(offset)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted line pointer: offset = %u, size = %u", - offset, (unsigned int) size))); + errmsg("corrupted line pointer: offset = %u, size = %zu", + offset, size))); /* Amount of space to actually be deleted */ size = MAXALIGN(size); @@ -1229,8 +1229,8 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) offset != MAXALIGN(offset)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted line pointer: offset = %u, size = %u", - offset, (unsigned int) size))); + errmsg("corrupted line pointer: offset = %u, size = %zu", + offset, size))); if (nextitm < nitems && offnum == itemnos[nextitm]) { @@ -1262,8 +1262,8 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) if (totallen > (Size) (pd_special - pd_lower)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted item lengths: total %u, available space %u", - (unsigned int) totallen, pd_special - pd_lower))); + errmsg("corrupted item lengths: total %zu, available space %u", + totallen, pd_special - pd_lower))); /* * Looks good.
Overwrite the line pointers with the copy, from which we've @@ -1326,8 +1326,8 @@ PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum) offset != MAXALIGN(offset)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted line pointer: offset = %u, size = %u", - offset, (unsigned int) size))); + errmsg("corrupted line pointer: offset = %u, size = %zu", + offset, size))); /* Amount of space to actually be deleted */ size = MAXALIGN(size); @@ -1402,7 +1402,7 @@ */ bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, - Item newtup, Size newsize) + const void *newtup, Size newsize) { PageHeader phdr = (PageHeader) page; ItemId tupid; @@ -1438,8 +1438,8 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum, offset != MAXALIGN(offset)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted line pointer: offset = %u, size = %u", - offset, (unsigned int) oldsize))); + errmsg("corrupted line pointer: offset = %u, size = %d", + offset, oldsize))); /* * Determine actual change in space requirement, check for page overflow. diff --git a/src/backend/storage/page/itemptr.c b/src/backend/storage/page/itemptr.c index ad65821572194..cedb27d6cc5af 100644 --- a/src/backend/storage/page/itemptr.c +++ b/src/backend/storage/page/itemptr.c @@ -32,7 +32,7 @@ StaticAssertDecl(sizeof(ItemPointerData) == 3 * sizeof(uint16), * Asserts that the disk item pointers are both valid! */ bool -ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2) +ItemPointerEquals(const ItemPointerData *pointer1, const ItemPointerData *pointer2) { if (ItemPointerGetBlockNumber(pointer1) == ItemPointerGetBlockNumber(pointer2) && @@ -48,7 +48,7 @@ * Generic btree-style comparison for item pointers. */ int32 -ItemPointerCompare(ItemPointer arg1, ItemPointer arg2) +ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2) { /* * Use ItemPointerGet{Offset,Block}NumberNoCheck to avoid asserting diff --git a/src/backend/storage/page/meson.build b/src/backend/storage/page/meson.build index c3e4a805862a9..112f00ff36552 100644 --- a/src/backend/storage/page/meson.build +++ b/src/backend/storage/page/meson.build @@ -1,7 +1,15 @@ # Copyright (c) 2022-2025, PostgreSQL Global Development Group +checksum_backend_lib = static_library('checksum_backend_lib', + 'checksum.c', + dependencies: backend_build_deps, + kwargs: internal_lib_args, + c_args: vectorize_cflags + unroll_loops_cflags, +) + +backend_link_with += checksum_backend_lib + backend_sources += files( 'bufpage.c', - 'checksum.c', 'itemptr.c', ) diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c index b958be1571645..d43c30da48e83 100644 --- a/src/backend/storage/smgr/bulk_write.c +++ b/src/backend/storage/smgr/bulk_write.c @@ -101,7 +101,7 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal) { BulkWriteState *state; - state = palloc(sizeof(BulkWriteState)); + state = palloc_object(BulkWriteState); state->smgr = smgr; state->forknum = forknum; state->use_wal = use_wal; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2ccb0faceb5b6..71bcdeb660107 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -21,6 +21,7 @@ */ #include "postgres.h" +#include <limits.h> #include <unistd.h> #include <fcntl.h> #include <sys/file.h> @@ -65,6 +66,15 @@ * out to an unlinked old copy of a segment file that will eventually * disappear.
* + * RELSEG_SIZE must fit into BlockNumber; but since we expose its value + * as an integer GUC, it actually needs to fit in signed int. It's worth + * having a cross-check for this since configure's --with-segsize options + * could let people select insane values. + */ +StaticAssertDecl(RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX, + "RELSEG_SIZE must fit in an integer"); + +/* * File descriptors are stored in the per-fork md_seg_fds arrays inside * SMgrRelation. The length of these arrays is stored in md_num_open_segs. * Note that a fork's md_num_open_segs having a specific value does not @@ -477,7 +487,7 @@ void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) { - off_t seekpos; + pgoff_t seekpos; int nbytes; MdfdVec *v; @@ -505,9 +515,9 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) { @@ -568,7 +578,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, while (remblocks > 0) { BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); - off_t seekpos = (off_t) BLCKSZ * segstartblock; + pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock; int numblocks; if (segstartblock + remblocks > RELSEG_SIZE) @@ -597,7 +607,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, int ret; ret = FileFallocate(v->mdfd_vfd, - seekpos, (off_t) BLCKSZ * numblocks, + seekpos, (pgoff_t) BLCKSZ * numblocks, WAIT_EVENT_DATA_FILE_EXTEND); if (ret != 0) { @@ -620,7 +630,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, * whole length of the extension. 
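Every seek position in md.c now goes through pgoff_t with the cast applied before the multiply: BLCKSZ * blocknum wraps past 4 GB if evaluated in 32 bits, so the widening must happen first. The block-to-segment arithmetic in isolation (the helper is invented; the BLCKSZ and RELSEG_SIZE values are the defaults):

```c
#include <stdint.h>

typedef uint32_t BlockNumber;
typedef int64_t pgoff_t;		/* stand-in for PostgreSQL's pgoff_t */

#define BLCKSZ		8192
#define RELSEG_SIZE	131072		/* blocks per 1 GB segment at 8 kB blocks */

/* Illustrative helper: locate a block within its 1 GB segment file. */
static void
block_location(BlockNumber blocknum, uint32_t *segno, pgoff_t *seekpos)
{
	*segno = blocknum / RELSEG_SIZE;

	/* widen before multiplying; (BLCKSZ * blocknum) would wrap at 4 GB */
	*seekpos = (pgoff_t) BLCKSZ * (blocknum % RELSEG_SIZE);
}
```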
*/ ret = FileZero(v->mdfd_vfd, - seekpos, (off_t) BLCKSZ * numblocks, + seekpos, (pgoff_t) BLCKSZ * numblocks, WAIT_EVENT_DATA_FILE_EXTEND); if (ret < 0) ereport(ERROR, @@ -735,7 +745,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, while (nblocks > 0) { - off_t seekpos; + pgoff_t seekpos; MdfdVec *v; int nblocks_this_segment; @@ -744,9 +754,9 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (v == NULL) return false; - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); nblocks_this_segment = Min(nblocks, @@ -841,7 +851,7 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { struct iovec iov[PG_IOV_MAX]; int iovcnt; - off_t seekpos; + pgoff_t seekpos; int nbytes; MdfdVec *v; BlockNumber nblocks_this_segment; @@ -851,9 +861,9 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); nblocks_this_segment = Min(nblocks, @@ -976,7 +986,7 @@ mdstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) { - off_t seekpos; + pgoff_t seekpos; MdfdVec *v; BlockNumber nblocks_this_segment; struct iovec *iov; @@ -986,9 +996,9 @@ mdstartreadv(PgAioHandle *ioh, v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); nblocks_this_segment = Min(nblocks, @@ -1058,7 +1068,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { struct iovec iov[PG_IOV_MAX]; int iovcnt; - off_t seekpos; + pgoff_t seekpos; int nbytes; MdfdVec *v; BlockNumber nblocks_this_segment; @@ -1068,9 +1078,9 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); nblocks_this_segment = Min(nblocks, @@ -1163,7 +1173,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, while (nblocks > 0) { BlockNumber nflush = nblocks; - off_t seekpos; + pgoff_t seekpos; MdfdVec *v; int segnum_start, segnum_end; @@ -1192,9 +1202,9 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, Assert(nflush >= 1); Assert(nflush <= nblocks); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); + FileWriteback(v->mdfd_vfd, seekpos, (pgoff_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); nblocks -= nflush; blocknum += nflush; @@ -1338,7 +1348,7 @@ 
mdtruncate(SMgrRelation reln, ForkNumber forknum, */ BlockNumber lastsegblocks = nblocks - priorblocks; - if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) + if (FileTruncate(v->mdfd_vfd, (pgoff_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\" to %u blocks: %m", @@ -1474,9 +1484,9 @@ mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off) v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL); - *off = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(*off < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE); return FileGetRawDesc(v->mdfd_vfd); } @@ -1589,7 +1599,7 @@ DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) SMgrRelation *srels; int i; - srels = palloc(sizeof(SMgrRelation) * ndelrels); + srels = palloc_array(SMgrRelation, ndelrels); for (i = 0; i < ndelrels; i++) { SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER); @@ -1858,7 +1868,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { - off_t len; + pgoff_t len; len = FileSize(seg->mdfd_vfd); if (len < 0) diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index bce37a36d51ba..f9066ab8c4965 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -561,7 +561,7 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) * create an array which contains all relations to be dropped, and close * each relation's forks at the smgr level while at it */ - rlocators = palloc(sizeof(RelFileLocatorBackend) * nrels); + rlocators = palloc_array(RelFileLocatorBackend, nrels); for (i = 0; i < nrels; i++) { RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator; diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index fc16db90133bb..9d4c5eae5f610 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -531,7 +531,7 @@ RememberSyncRequest(const FileTag *ftag, SyncRequestType type) MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); PendingUnlinkEntry *entry; - entry = palloc(sizeof(PendingUnlinkEntry)); + entry = palloc_object(PendingUnlinkEntry); entry->tag = *ftag; entry->cycle_ctr = checkpoint_cycle_ctr; entry->canceled = false; diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c index a7d1fec981f88..14d5fc0b1965a 100644 --- a/src/backend/tcop/backend_startup.c +++ b/src/backend/tcop/backend_startup.c @@ -492,7 +492,7 @@ static int ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) { int32 len; - char *buf; + char *buf = NULL; ProtocolVersion proto; MemoryContext oldcontext; @@ -516,7 +516,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) * scanners, which may be less benign, but it's not really our job to * notice those.) 
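The backend_startup.c hunks that follow all make the same transformation: each early return STATUS_ERROR in ProcessStartupPacket becomes goto fail, so the startup-packet buffer is released at a single cleanup site. A minimal sketch of that single-exit idiom, with illustrative names and plain malloc/free standing in for palloc/pfree:

#include <stdlib.h>

#define STATUS_OK		0
#define STATUS_ERROR	(-1)

/* read_fn stands in for the pq_getbytes() calls in the real function */
static int
process_packet(int (*read_fn) (char *, size_t), size_t len)
{
	char	   *buf = NULL;		/* initialized so fail: can test it */

	buf = malloc(len);
	if (buf == NULL)
		goto fail;
	if (read_fn(buf, len) != 0)
		goto fail;

	free(buf);
	return STATUS_OK;

fail:
	/* one tidy cleanup site instead of a free before every return */
	if (buf)
		free(buf);
	return STATUS_ERROR;
}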
*/ - return STATUS_ERROR; + goto fail; } if (pq_getbytes(((char *) &len) + 1, 3) == EOF) @@ -526,7 +526,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete startup packet"))); - return STATUS_ERROR; + goto fail; } len = pg_ntoh32(len); @@ -538,7 +538,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid length of startup packet"))); - return STATUS_ERROR; + goto fail; } /* @@ -554,7 +554,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete startup packet"))); - return STATUS_ERROR; + goto fail; } pq_endmsgread(); @@ -568,7 +568,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) { ProcessCancelRequestPacket(port, buf, len); /* Not really an error, but we don't want to proceed further */ - return STATUS_ERROR; + goto fail; } if (proto == NEGOTIATE_SSL_CODE && !ssl_done) @@ -607,14 +607,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode_for_socket_access(), errmsg("failed to send SSL negotiation response: %m"))); - return STATUS_ERROR; /* close the connection */ + goto fail; /* close the connection */ } #ifdef USE_SSL if (SSLok == 'S' && secure_open_server(port) == -1) - return STATUS_ERROR; + goto fail; #endif + pfree(buf); + /* * At this point we should have no data already buffered. If we do, * it was received before we performed the SSL handshake, so it wasn't @@ -661,14 +663,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode_for_socket_access(), errmsg("failed to send GSSAPI negotiation response: %m"))); - return STATUS_ERROR; /* close the connection */ + goto fail; /* close the connection */ } #ifdef ENABLE_GSS if (GSSok == 'G' && secure_open_gssapi(port) == -1) - return STATUS_ERROR; + goto fail; #endif + pfree(buf); + /* * At this point we should have no data already buffered. 
If we do, * it was received before we performed the GSS handshake, so it wasn't @@ -863,7 +867,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) */ MemoryContextSwitchTo(oldcontext); + pfree(buf); + return STATUS_OK; + +fail: + /* be tidy, just to avoid Valgrind complaints */ + if (buf) + pfree(buf); + + return STATUS_ERROR; } /* @@ -881,7 +894,7 @@ ProcessCancelRequestPacket(Port *port, void *pkt, int pktlen) { ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("invalid length of query cancel packet"))); + errmsg("invalid length of cancel request packet"))); return; } len = pktlen - offsetof(CancelRequestPacket, cancelAuthCode); @@ -889,7 +902,7 @@ ProcessCancelRequestPacket(Port *port, void *pkt, int pktlen) { ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("invalid length of query cancel key"))); + errmsg("invalid length of cancel key in cancel request packet"))); return; } @@ -1077,7 +1090,7 @@ check_log_connections(char **newval, void **extra, GucSource source) if (!SplitIdentifierString(rawstring, ',', &elemlist)) { - GUC_check_errdetail("Invalid list syntax in parameter \"log_connections\"."); + GUC_check_errdetail("Invalid list syntax in parameter \"%s\".", "log_connections"); pfree(rawstring); list_free(elemlist); return false; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c242c8170b562..7dd75a490aab5 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -37,6 +37,7 @@ #include "catalog/pg_type.h" #include "commands/async.h" #include "commands/event_trigger.h" +#include "commands/explain_state.h" #include "commands/prepare.h" #include "common/pg_prng.h" #include "jit/jit.h" @@ -649,6 +650,10 @@ pg_parse_query(const char *query_string) TRACE_POSTGRESQL_QUERY_PARSE_DONE(query_string); + if (Debug_print_raw_parse) + elog_node_display(LOG, "raw parse tree", raw_parsetree_list, + Debug_pretty_print); + return raw_parsetree_list; } @@ -880,7 +885,7 @@ pg_rewrite_query(Query *query) */ PlannedStmt * pg_plan_query(Query *querytree, const char *query_string, int cursorOptions, - ParamListInfo boundParams) + ParamListInfo boundParams, ExplainState *es) { PlannedStmt *plan; @@ -897,7 +902,7 @@ pg_plan_query(Query *querytree, const char *query_string, int cursorOptions, ResetUsage(); /* call the optimizer */ - plan = planner(querytree, query_string, cursorOptions, boundParams); + plan = planner(querytree, query_string, cursorOptions, boundParams, es); if (log_planner_stats) ShowUsage("PLANNER STATISTICS"); @@ -988,11 +993,12 @@ pg_plan_queries(List *querytrees, const char *query_string, int cursorOptions, stmt->stmt_location = query->stmt_location; stmt->stmt_len = query->stmt_len; stmt->queryId = query->queryId; + stmt->planOrigin = PLAN_STMT_INTERNAL; } else { stmt = pg_plan_query(query, query_string, cursorOptions, - boundParams); + boundParams, NULL); } stmt_list = lappend(stmt_list, stmt); @@ -1682,7 +1688,7 @@ exec_bind_message(StringInfo input_message) { Query *query = lfirst_node(Query, lc); - if (query->queryId != UINT64CONST(0)) + if (query->queryId != INT64CONST(0)) { pgstat_report_query_id(query->queryId, false); break; @@ -2034,7 +2040,7 @@ exec_bind_message(StringInfo input_message) { PlannedStmt *plan = lfirst_node(PlannedStmt, lc); - if (plan->planId != UINT64CONST(0)) + if (plan->planId != INT64CONST(0)) { pgstat_report_plan_id(plan->planId, false); break; @@ -2174,7 +2180,7 @@ exec_execute_message(const char *portal_name, long max_rows) { PlannedStmt *stmt = 
lfirst_node(PlannedStmt, lc); - if (stmt->queryId != UINT64CONST(0)) + if (stmt->queryId != INT64CONST(0)) { pgstat_report_query_id(stmt->queryId, false); break; @@ -2185,7 +2191,7 @@ exec_execute_message(const char *portal_name, long max_rows) { PlannedStmt *stmt = lfirst_node(PlannedStmt, lc); - if (stmt->planId != UINT64CONST(0)) + if (stmt->planId != INT64CONST(0)) { pgstat_report_plan_id(stmt->planId, false); break; @@ -3696,7 +3702,10 @@ set_debug_options(int debug_flag, GucContext context, GucSource source) if (debug_flag >= 2) SetConfigOption("log_statement", "all", context, source); if (debug_flag >= 3) + { + SetConfigOption("debug_print_raw_parse", "true", context, source); SetConfigOption("debug_print_parse", "true", context, source); + } if (debug_flag >= 4) SetConfigOption("debug_print_plan", "true", context, source); if (debug_flag >= 5) diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index d1593f38b35fd..6dd64726f060a 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -74,7 +74,7 @@ CreateQueryDesc(PlannedStmt *plannedstmt, QueryEnvironment *queryEnv, int instrument_options) { - QueryDesc *qd = (QueryDesc *) palloc(sizeof(QueryDesc)); + QueryDesc *qd = palloc_object(QueryDesc); qd->operation = plannedstmt->commandType; /* operation */ qd->plannedstmt = plannedstmt; /* plan */ @@ -165,27 +165,22 @@ ProcessQuery(PlannedStmt *plan, */ if (qc) { - switch (queryDesc->operation) - { - case CMD_SELECT: - SetQueryCompletion(qc, CMDTAG_SELECT, queryDesc->estate->es_processed); - break; - case CMD_INSERT: - SetQueryCompletion(qc, CMDTAG_INSERT, queryDesc->estate->es_processed); - break; - case CMD_UPDATE: - SetQueryCompletion(qc, CMDTAG_UPDATE, queryDesc->estate->es_processed); - break; - case CMD_DELETE: - SetQueryCompletion(qc, CMDTAG_DELETE, queryDesc->estate->es_processed); - break; - case CMD_MERGE: - SetQueryCompletion(qc, CMDTAG_MERGE, queryDesc->estate->es_processed); - break; - default: - SetQueryCompletion(qc, CMDTAG_UNKNOWN, queryDesc->estate->es_processed); - break; - } + CommandTag tag; + + if (queryDesc->operation == CMD_SELECT) + tag = CMDTAG_SELECT; + else if (queryDesc->operation == CMD_INSERT) + tag = CMDTAG_INSERT; + else if (queryDesc->operation == CMD_UPDATE) + tag = CMDTAG_UPDATE; + else if (queryDesc->operation == CMD_DELETE) + tag = CMDTAG_DELETE; + else if (queryDesc->operation == CMD_MERGE) + tag = CMDTAG_MERGE; + else + tag = CMDTAG_UNKNOWN; + + SetQueryCompletion(qc, tag, queryDesc->estate->es_processed); } /* @@ -1163,10 +1158,11 @@ PortalRunUtility(Portal portal, PlannedStmt *pstmt, MemoryContextSwitchTo(portal->portalContext); /* - * Some utility commands (e.g., VACUUM) pop the ActiveSnapshot stack from - * under us, so don't complain if it's now empty. Otherwise, our snapshot - * should be the top one; pop it. Note that this could be a different - * snapshot from the one we made above; see EnsurePortalSnapshotExists. + * Some utility commands (e.g., VACUUM, WAIT FOR) pop the ActiveSnapshot + * stack from under us, so don't complain if it's now empty. Otherwise, + * our snapshot should be the top one; pop it. Note that this could be a + * different snapshot from the one we made above; see + * EnsurePortalSnapshotExists. */ if (portal->portalSnapshot != NULL && ActiveSnapshotSet()) { @@ -1350,24 +1346,15 @@ PortalRunMulti(Portal portal, PopActiveSnapshot(); /* - * If a query completion data was supplied, use it. Otherwise use the - * portal's query completion data. 
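A note on the UINT64CONST to INT64CONST switches above: query and plan identifiers are exposed at the SQL level as bigint (for instance pg_stat_activity.query_id), which is signed, so keeping them signed in C plausibly avoids conversion surprises at that boundary; that rationale is inferred, not stated in the patch. A sketch of the round trip with an illustrative hash value:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* a hash-derived ID with the sign bit set */
	uint64_t	raw = UINT64_C(0xF000000000000001);
	int64_t		query_id = (int64_t) raw;

	/* SQL bigint shows the same (negative) value a signed C type holds */
	printf("query_id = %" PRId64 "\n", query_id);
	return 0;
}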
- * - * Exception: Clients expect INSERT/UPDATE/DELETE tags to have counts, so - * fake them with zeros. This can happen with DO INSTEAD rules if there - * is no replacement query of the same type as the original. We print "0 - * 0" here because technically there is no query of the matching tag type, - * and printing a non-zero count for a different query type seems wrong, - * e.g. an INSERT that does an UPDATE instead should not print "0 1" if - * one row was updated. See QueryRewrite(), step 3, for details. + * If a command tag was requested and we did not fill in a run-time- + * determined tag above, copy the parse-time tag from the Portal. (There + * might not be any tag there either, in edge cases such as empty prepared + * statements. That's OK.) */ - if (qc && qc->commandTag == CMDTAG_UNKNOWN) - { - if (portal->qc.commandTag != CMDTAG_UNKNOWN) - CopyQueryCompletion(qc, &portal->qc); - /* If the caller supplied a qc, we should have set it by now. */ - Assert(qc->commandTag != CMDTAG_UNKNOWN); - } + if (qc && + qc->commandTag == CMDTAG_UNKNOWN && + portal->qc.commandTag != CMDTAG_UNKNOWN) + CopyQueryCompletion(qc, &portal->qc); } /* @@ -1752,7 +1739,8 @@ PlannedStmtRequiresSnapshot(PlannedStmt *pstmt) IsA(utilityStmt, ListenStmt) || IsA(utilityStmt, NotifyStmt) || IsA(utilityStmt, UnlistenStmt) || - IsA(utilityStmt, CheckPointStmt)) + IsA(utilityStmt, CheckPointStmt) || + IsA(utilityStmt, WaitStmt)) return false; return true; diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 25fe3d5801665..d18a3a60a4678 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -56,6 +56,7 @@ #include "commands/user.h" #include "commands/vacuum.h" #include "commands/view.h" +#include "commands/wait.h" #include "miscadmin.h" #include "parser/parse_utilcmd.h" #include "postmaster/bgwriter.h" @@ -266,6 +267,7 @@ ClassifyUtilityCommandAsReadOnly(Node *parsetree) case T_PrepareStmt: case T_UnlistenStmt: case T_VariableSetStmt: + case T_WaitStmt: { /* * These modify only backend-local state, so they're OK to run @@ -943,17 +945,7 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; case T_CheckPointStmt: - if (!has_privs_of_role(GetUserId(), ROLE_PG_CHECKPOINT)) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - /* translator: %s is name of a SQL command, eg CHECKPOINT */ - errmsg("permission denied to execute %s command", - "CHECKPOINT"), - errdetail("Only roles with privileges of the \"%s\" role may execute this command.", - "pg_checkpoint"))); - - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT | - (RecoveryInProgress() ? 
0 : CHECKPOINT_FORCE)); + ExecCheckpoint(pstate, (CheckPointStmt *) parsetree); break; /* @@ -1065,6 +1057,12 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; } + case T_WaitStmt: + { + ExecWaitStmt(pstate, (WaitStmt *) parsetree, dest); + } + break; + default: /* All other statement types have event trigger support */ ProcessUtilitySlow(pstate, pstmt, queryString, @@ -1244,6 +1242,7 @@ ProcessUtilitySlow(ParseState *pstate, wrapper->utilityStmt = stmt; wrapper->stmt_location = pstmt->stmt_location; wrapper->stmt_len = pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, queryString, @@ -1343,7 +1342,7 @@ ProcessUtilitySlow(ParseState *pstate, */ switch (stmt->subtype) { - case 'T': /* ALTER DOMAIN DEFAULT */ + case AD_AlterDefault: /* * Recursively alter column default for table and, @@ -1353,30 +1352,30 @@ ProcessUtilitySlow(ParseState *pstate, AlterDomainDefault(stmt->typeName, stmt->def); break; - case 'N': /* ALTER DOMAIN DROP NOT NULL */ + case AD_DropNotNull: address = AlterDomainNotNull(stmt->typeName, false); break; - case 'O': /* ALTER DOMAIN SET NOT NULL */ + case AD_SetNotNull: address = AlterDomainNotNull(stmt->typeName, true); break; - case 'C': /* ADD CONSTRAINT */ + case AD_AddConstraint: address = AlterDomainAddConstraint(stmt->typeName, stmt->def, &secondaryObject); break; - case 'X': /* DROP CONSTRAINT */ + case AD_DropConstraint: address = AlterDomainDropConstraint(stmt->typeName, stmt->name, stmt->behavior, stmt->missing_ok); break; - case 'V': /* VALIDATE CONSTRAINT */ + case AD_ValidateConstraint: address = AlterDomainValidateConstraint(stmt->typeName, stmt->name); @@ -1883,7 +1882,7 @@ ProcessUtilitySlow(ParseState *pstate, if (!IsA(rel, RangeVar)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only a single relation is allowed in CREATE STATISTICS"))); + errmsg("CREATE STATISTICS only supports relation names in the FROM clause"))); /* * CREATE STATISTICS will influence future execution plans @@ -1901,7 +1900,7 @@ ProcessUtilitySlow(ParseState *pstate, /* Run parse analysis ... 
*/ stmt = transformStatsStmt(relid, stmt, queryString); - address = CreateStatistics(stmt); + address = CreateStatistics(stmt, true); } break; @@ -1974,6 +1973,7 @@ ProcessUtilityForAlterTable(Node *stmt, AlterTableUtilityContext *context) wrapper->utilityStmt = stmt; wrapper->stmt_location = context->pstmt->stmt_location; wrapper->stmt_len = context->pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, context->queryString, @@ -2067,6 +2067,9 @@ UtilityReturnsTuples(Node *parsetree) case T_VariableShowStmt: return true; + case T_WaitStmt: + return true; + default: return false; } @@ -2122,6 +2125,9 @@ UtilityTupleDescriptor(Node *parsetree) return GetPGVariableResultDesc(n->name); } + case T_WaitStmt: + return WaitStmtResultDesc((WaitStmt *) parsetree); + default: return NULL; } @@ -3099,6 +3105,10 @@ CreateCommandTag(Node *parsetree) } break; + case T_WaitStmt: + tag = CMDTAG_WAIT; + break; + /* already-planned queries */ case T_PlannedStmt: { @@ -3697,6 +3707,10 @@ GetCommandLogLevel(Node *parsetree) lev = LOGSTMT_DDL; break; + case T_WaitStmt: + lev = LOGSTMT_ALL; + break; + /* already-planned queries */ case T_PlannedStmt: { diff --git a/src/backend/tsearch/dict.c b/src/backend/tsearch/dict.c index eb968858683de..6b81050aa3ad2 100644 --- a/src/backend/tsearch/dict.c +++ b/src/backend/tsearch/dict.c @@ -61,7 +61,7 @@ ts_lexize(PG_FUNCTION_ARGS) ptr = res; while (ptr->lexeme) ptr++; - da = (Datum *) palloc(sizeof(Datum) * (ptr - res)); + da = (Datum *) palloc_array(Datum, ptr - res); ptr = res; while (ptr->lexeme) { diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c index 63bd193a78a89..14e5da486e0ad 100644 --- a/src/backend/tsearch/dict_ispell.c +++ b/src/backend/tsearch/dict_ispell.c @@ -37,7 +37,7 @@ dispell_init(PG_FUNCTION_ARGS) stoploaded = false; ListCell *l; - d = (DictISpell *) palloc0(sizeof(DictISpell)); + d = palloc0_object(DictISpell); NIStartBuild(&(d->obj)); @@ -47,24 +47,30 @@ dispell_init(PG_FUNCTION_ARGS) if (strcmp(defel->defname, "dictfile") == 0) { + char *filename; + if (dictloaded) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple DictFile parameters"))); - NIImportDictionary(&(d->obj), - get_tsearch_config_filename(defGetString(defel), - "dict")); + filename = get_tsearch_config_filename(defGetString(defel), + "dict"); + NIImportDictionary(&(d->obj), filename); + pfree(filename); dictloaded = true; } else if (strcmp(defel->defname, "afffile") == 0) { + char *filename; + if (affloaded) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple AffFile parameters"))); - NIImportAffixes(&(d->obj), - get_tsearch_config_filename(defGetString(defel), - "affix")); + filename = get_tsearch_config_filename(defGetString(defel), + "affix"); + NIImportAffixes(&(d->obj), filename); + pfree(filename); affloaded = true; } else if (strcmp(defel->defname, "stopwords") == 0) diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c index 2c972fc053870..f6639ac7c9781 100644 --- a/src/backend/tsearch/dict_simple.c +++ b/src/backend/tsearch/dict_simple.c @@ -31,7 +31,7 @@ Datum dsimple_init(PG_FUNCTION_ARGS) { List *dictoptions = (List *) PG_GETARG_POINTER(0); - DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple)); + DictSimple *d = palloc0_object(DictSimple); bool stoploaded = false, acceptloaded = false; ListCell *l; @@ -87,13 +87,13 @@ dsimple_lexize(PG_FUNCTION_ARGS) { /* reject as stopword */ pfree(txt); - res = palloc0(sizeof(TSLexeme) * 2); + 
res = palloc0_array(TSLexeme, 2); PG_RETURN_POINTER(res); } else if (d->accept) { /* accept */ - res = palloc0(sizeof(TSLexeme) * 2); + res = palloc0_array(TSLexeme, 2); res[0].lexeme = txt; PG_RETURN_POINTER(res); } diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index 0da5a9d686802..e076660b5ec96 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -134,7 +134,7 @@ dsynonym_init(PG_FUNCTION_ARGS) errmsg("could not open synonym file \"%s\": %m", filename))); - d = (DictSyn *) palloc0(sizeof(DictSyn)); + d = palloc0_object(DictSyn); while ((line = tsearch_readline(&trst)) != NULL) { @@ -169,12 +169,12 @@ dsynonym_init(PG_FUNCTION_ARGS) if (d->len == 0) { d->len = 64; - d->syn = (Syn *) palloc(sizeof(Syn) * d->len); + d->syn = palloc_array(Syn, d->len); } else { d->len *= 2; - d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); + d->syn = repalloc_array(d->syn, Syn, d->len); } } @@ -199,6 +199,7 @@ dsynonym_init(PG_FUNCTION_ARGS) } tsearch_readline_end(&trst); + pfree(filename); d->len = cur; qsort(d->syn, d->len, sizeof(Syn), compareSyn); @@ -235,7 +236,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS) if (!found) PG_RETURN_POINTER(NULL); - res = palloc0(sizeof(TSLexeme) * 2); + res = palloc0_array(TSLexeme, 2); res[0].lexeme = pnstrdup(found->out, found->outlen); res[0].flags = found->flags; diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index 1bebe36a6910e..6a3d336985803 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -78,12 +78,12 @@ newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst) if (d->ntwrds == 0) { d->ntwrds = 16; - d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds); + d->wrds = palloc_array(TheLexeme, d->ntwrds); } else { d->ntwrds *= 2; - d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds); + d->wrds = repalloc_array(d->wrds, TheLexeme, d->ntwrds); } } @@ -95,7 +95,7 @@ newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst) memcpy(ptr->lexeme, b, e - b); ptr->lexeme[e - b] = '\0'; - ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo)); + ptr->entries = palloc_object(LexemeInfo); ptr->entries->nextentry = NULL; ptr->entries->idsubst = idsubst; @@ -118,12 +118,12 @@ addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 p if (d->nsubst == 0) { d->nsubst = 16; - d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst); + d->subst = palloc_array(TheSubstitute, d->nsubst); } else { d->nsubst *= 2; - d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst); + d->subst = repalloc_array(d->subst, TheSubstitute, d->nsubst); } } } @@ -137,12 +137,12 @@ addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 p if (ntres == 0) { ntres = 2; - ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres); + ptr->res = palloc_array(TSLexeme, ntres); } else { ntres *= 2; - ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres); + ptr->res = repalloc_array(ptr->res, TSLexeme, ntres); } } @@ -167,17 +167,17 @@ addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 p static void thesaurusRead(const char *filename, DictThesaurus *d) { + char *real_filename = get_tsearch_config_filename(filename, "ths"); tsearch_readline_state trst; uint32 idsubst = 0; bool useasis = false; char *line; - filename = 
get_tsearch_config_filename(filename, "ths"); - if (!tsearch_readline_begin(&trst, filename)) + if (!tsearch_readline_begin(&trst, real_filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open thesaurus file \"%s\": %m", - filename))); + real_filename))); while ((line = tsearch_readline(&trst)) != NULL) { @@ -297,6 +297,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) d->nsubst = idsubst; tsearch_readline_end(&trst); + pfree(real_filename); } static TheLexeme * @@ -308,7 +309,7 @@ addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, Lexe newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm); } - newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo)); + newwrds[*nnw].entries = palloc_object(LexemeInfo); if (lexeme && lexeme->lexeme) { @@ -393,7 +394,7 @@ compileTheLexeme(DictThesaurus *d) int i, nnw = 0, tnm = 16; - TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm), + TheLexeme *newwrds = palloc_array(TheLexeme, tnm), *ptrwrds; for (i = 0; i < d->nwrds; i++) @@ -510,7 +511,7 @@ compileTheSubstitute(DictThesaurus *d) *inptr; int n = 2; - outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n); + outptr = d->subst[i].res = palloc_array(TSLexeme, n); outptr->lexeme = NULL; inptr = rem; @@ -602,7 +603,7 @@ thesaurus_init(PG_FUNCTION_ARGS) List *namelist; ListCell *l; - d = (DictThesaurus *) palloc0(sizeof(DictThesaurus)); + d = palloc0_object(DictThesaurus); foreach(l, dictoptions) { @@ -755,7 +756,7 @@ copyTSLexeme(TheSubstitute *ts) TSLexeme *res; uint16 i; - res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1)); + res = palloc_array(TSLexeme, ts->reslen + 1); for (i = 0; i < ts->reslen; i++) { res[i] = ts->res[i]; @@ -833,7 +834,7 @@ thesaurus_lexize(PG_FUNCTION_ARGS) ptr++; } - infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex); + infos = palloc_array(LexemeInfo *, nlex); for (i = 0; i < nlex; i++) if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL) break; diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c index 146801885d738..e5badb6b43fbe 100644 --- a/src/backend/tsearch/spell.c +++ b/src/backend/tsearch/spell.c @@ -691,7 +691,7 @@ NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask, else { Conf->maffixes = 16; - Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX)); + Conf->Affix = palloc_array(AFFIX, Conf->maffixes); } } @@ -737,7 +737,7 @@ NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask, * allocated in the dictionary's memory context, and will be freed * automatically when it is destroyed. */ - Affix->reg.pregex = palloc(sizeof(regex_t)); + Affix->reg.pregex = palloc_object(regex_t); err = pg_regcomp(Affix->reg.pregex, wmask, wmasklen, REG_ADVANCED | REG_NOSUB, DEFAULT_COLLATION_OID); @@ -1327,7 +1327,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) /* Also reserve place for empty flag set */ naffix++; - Conf->AffixData = (const char **) palloc0(naffix * sizeof(char *)); + Conf->AffixData = palloc0_array(const char *, naffix); Conf->lenAffixData = Conf->nAffixData = naffix; /* Add empty flag set into AffixData */ @@ -1794,7 +1794,7 @@ NISortDictionary(IspellDict *Conf) * dictionary. Replace textual flag-field of Conf->Spell entries with * indexes into Conf->AffixData array. 
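The dictionary loaders above (dsynonym_init, newLexeme, addWrd) all grow their arrays the same way: start with a small capacity and double it on overflow, with palloc_array/repalloc_array now hiding the sizeof arithmetic (palloc_array(T, n) is just a type-safe (T *) palloc(sizeof(T) * n)). A generic sketch of the idiom, with libc allocators standing in for the palloc family:

#include <stdlib.h>

typedef struct Syn
{
	char	   *in;
	char	   *out;
} Syn;

/* Append one entry, doubling capacity when full; returns the array. */
static Syn *
syn_append(Syn *arr, int *len, int *cap, Syn value)
{
	if (*len >= *cap)
	{
		*cap = (*cap == 0) ? 64 : *cap * 2;	/* same policy as dsynonym_init */
		arr = realloc(arr, sizeof(Syn) * *cap);
		if (arr == NULL)
			abort();			/* palloc would ereport(ERROR) instead */
	}
	arr[(*len)++] = value;
	return arr;
}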
*/ - Conf->AffixData = (const char **) palloc0(naffix * sizeof(const char *)); + Conf->AffixData = palloc0_array(const char *, naffix); curaffix = -1; for (i = 0; i < Conf->nspell; i++) @@ -1991,7 +1991,7 @@ NISortAffixes(IspellDict *Conf) /* Store compound affixes in the Conf->CompoundAffix array */ if (Conf->naffixes > 1) qsort(Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix); - Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes); + Conf->CompoundAffix = ptr = palloc_array(CMPDAffix, Conf->naffixes); ptr->affix = NULL; for (i = 0; i < Conf->naffixes; i++) @@ -2147,7 +2147,7 @@ CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *neww /* Convert data string to wide characters */ newword_len = strlen(newword); - data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar)); + data = palloc_array(pg_wchar, newword_len + 1); data_len = pg_mb2wchar_with_len(newword, data, newword_len); if (pg_regexec(Affix->reg.pregex, data, data_len, @@ -2197,7 +2197,7 @@ NormalizeSubWord(IspellDict *Conf, const char *word, int flag) if (wrdlen > MAXNORMLEN) return NULL; - cur = forms = (char **) palloc(MAX_NORM * sizeof(char *)); + cur = forms = palloc_array(char *, MAX_NORM); *cur = NULL; @@ -2320,7 +2320,7 @@ CheckCompoundAffixes(CMPDAffix **ptr, const char *word, int len, bool CheckInPla } else { - char *affbegin; + const char *affbegin; while ((*ptr)->affix) { @@ -2340,7 +2340,7 @@ CheckCompoundAffixes(CMPDAffix **ptr, const char *word, int len, bool CheckInPla static SplitVar * CopyVar(SplitVar *s, int makedup) { - SplitVar *v = (SplitVar *) palloc(sizeof(SplitVar)); + SplitVar *v = palloc_object(SplitVar); v->next = NULL; if (s) @@ -2348,7 +2348,7 @@ CopyVar(SplitVar *s, int makedup) int i; v->lenstem = s->lenstem; - v->stem = (char **) palloc(sizeof(char *) * v->lenstem); + v->stem = palloc_array(char *, v->lenstem); v->nstem = s->nstem; for (i = 0; i < s->nstem; i++) v->stem[i] = (makedup) ? 
pstrdup(s->stem[i]) : s->stem[i]; @@ -2356,7 +2356,7 @@ CopyVar(SplitVar *s, int makedup) else { v->lenstem = 16; - v->stem = (char **) palloc(sizeof(char *) * v->lenstem); + v->stem = palloc_array(char *, v->lenstem); v->nstem = 0; } return v; @@ -2529,7 +2529,7 @@ static void addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant) { if (*lres == NULL) - *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme)); + *lcur = *lres = palloc_array(TSLexeme, MAX_NORM); if (*lcur - *lres < MAX_NORM - 1) { diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c index 4dfcc2cd3bd7e..b6efe108424f6 100644 --- a/src/backend/tsearch/to_tsany.c +++ b/src/backend/tsearch/to_tsany.c @@ -84,7 +84,7 @@ uniqueWORD(ParsedWord *a, int32 l) { tmppos = LIMITPOS(a->pos.pos); a->alen = 2; - a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); + a->pos.apos = palloc_array(uint16, a->alen); a->pos.apos[0] = 1; a->pos.apos[1] = tmppos; return l; @@ -103,7 +103,7 @@ uniqueWORD(ParsedWord *a, int32 l) */ tmppos = LIMITPOS(a->pos.pos); a->alen = 2; - a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); + a->pos.apos = palloc_array(uint16, a->alen); a->pos.apos[0] = 1; a->pos.apos[1] = tmppos; @@ -123,7 +123,7 @@ uniqueWORD(ParsedWord *a, int32 l) res->word = ptr->word; tmppos = LIMITPOS(ptr->pos.pos); res->alen = 2; - res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen); + res->pos.apos = palloc_array(uint16, res->alen); res->pos.apos[0] = 1; res->pos.apos[1] = tmppos; } @@ -141,7 +141,7 @@ uniqueWORD(ParsedWord *a, int32 l) if (res->pos.apos[0] + 1 >= res->alen) { res->alen *= 2; - res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen); + res->pos.apos = repalloc_array(res->pos.apos, uint16, res->alen); } if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) { @@ -255,7 +255,7 @@ to_tsvector_byid(PG_FUNCTION_ARGS) prs.lenwords = MaxAllocSize / sizeof(ParsedWord); prs.curwords = 0; prs.pos = 0; - prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); + prs.words = palloc_array(ParsedWord, prs.lenwords); parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in)); @@ -453,7 +453,7 @@ add_to_tsvector(void *_state, char *elem_value, int elem_len) * (parsetext() will realloc it bigger as needed.) */ prs->lenwords = 16; - prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords); + prs->words = palloc_array(ParsedWord, prs->lenwords); prs->curwords = 0; prs->pos = 0; } @@ -503,7 +503,7 @@ pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, prs.lenwords = 4; prs.curwords = 0; prs.pos = 0; - prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); + prs.words = palloc_array(ParsedWord, prs.lenwords); parsetext(data->cfg_id, &prs, strval, lenval); diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index b77d8c23d3694..4422f042d12de 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -20,45 +20,33 @@ static void tsearch_readline_callback(void *arg); -/* - * The reason these functions use a 3-wchar_t output buffer, not 2 as you - * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be - * getting from char2wchar() is UTF16 not UTF32. A single input character - * may therefore produce a surrogate pair rather than just one wchar_t; - * we also need room for a trailing null. 
When we do get a surrogate pair, - * we pass just the first code to iswdigit() etc, so that these functions will - * always return false for characters outside the Basic Multilingual Plane. - */ -#define WC_BUF_LEN 3 +/* space for a single character plus a trailing NUL */ +#define WC_BUF_LEN 2 int t_isalpha(const char *ptr) { - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ + pg_wchar wstr[WC_BUF_LEN]; + int wlen pg_attribute_unused(); - if (clen == 1 || database_ctype_is_c) - return isalpha(TOUCHAR(ptr)); + wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr)); + Assert(wlen <= 1); - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswalpha((wint_t) character[0]); + /* pass single character, or NUL if empty */ + return pg_iswalpha(wstr[0], pg_database_locale()); } int t_isalnum(const char *ptr) { - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || database_ctype_is_c) - return isalnum(TOUCHAR(ptr)); + pg_wchar wstr[WC_BUF_LEN]; + int wlen pg_attribute_unused(); - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); + wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr)); + Assert(wlen <= 1); - return iswalnum((wint_t) character[0]); + /* pass single character, or NUL if empty */ + return pg_iswalnum(wstr[0], pg_database_locale()); } diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c index e5da6cf17ec19..6eebff6c4674c 100644 --- a/src/backend/tsearch/ts_parse.c +++ b/src/backend/tsearch/ts_parse.c @@ -99,7 +99,7 @@ LPLRemoveHead(ListParsedLex *list) static void LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) { - ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex)); + ParsedLex *newpl = palloc_object(ParsedLex); newpl->type = type; newpl->lemm = lemm; @@ -218,7 +218,7 @@ LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) * position and go to multiword mode */ - ld->curDictId = DatumGetObjectId(map->dictIds[i]); + ld->curDictId = map->dictIds[i]; ld->posDict = i + 1; ld->curSub = curVal->next; if (res) @@ -275,7 +275,7 @@ LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) * dictionaries ? */ for (i = 0; i < map->len && !dictExists; i++) - if (ld->curDictId == DatumGetObjectId(map->dictIds[i])) + if (ld->curDictId == map->dictIds[i]) dictExists = true; if (!dictExists) diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 0c1d2bc1109da..63a6ecd3e2a2d 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -47,8 +47,8 @@ typedef struct static Selectivity tsquerysel(VariableStatData *vardata, Datum constval); static Selectivity mcelem_tsquery_selec(TSQuery query, - Datum *mcelem, int nmcelem, - float4 *numbers, int nnumbers); + const Datum *mcelem, int nmcelem, + const float4 *numbers, int nnumbers); static Selectivity tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup, int length, float4 minfreq); static int compare_lexeme_textfreq(const void *e1, const void *e2); @@ -204,8 +204,8 @@ tsquerysel(VariableStatData *vardata, Datum constval) * Extract data from the pg_statistic arrays into useful format. 
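The rewritten t_isalpha()/t_isalnum() in ts_locale.c above reduce to: decode the first multibyte character into a pg_wchar, then ask the database locale about it. A condensed sketch using the same calls; note that pg_iswalpha() and pg_database_locale() are APIs this patch series introduces, while pg_mblen() and pg_mb2wchar_with_len() are existing server routines, so this compiles only inside the backend:

#include "postgres.h"
#include "mb/pg_wchar.h"
#include "utils/pg_locale.h"

static bool
first_char_is_alpha(const char *mbstr)
{
	pg_wchar	wstr[2];		/* one character plus a trailing NUL */

	(void) pg_mb2wchar_with_len(mbstr, wstr, pg_mblen(mbstr));
	/* wstr[0] is NUL for an empty input, which classifies as false */
	return pg_iswalpha(wstr[0], pg_database_locale()) != 0;
}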
*/ static Selectivity -mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, - float4 *numbers, int nnumbers) +mcelem_tsquery_selec(TSQuery query, const Datum *mcelem, int nmcelem, + const float4 *numbers, int nnumbers) { float4 minfreq; TextFreq *lookup; @@ -226,21 +226,21 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, /* * Transpose the data into a single array so we can use bsearch(). */ - lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem); + lookup = palloc_array(TextFreq, nmcelem); for (i = 0; i < nmcelem; i++) { /* * The text Datums came from an array, so it cannot be compressed or * stored out-of-line -- it's safe to use VARSIZE_ANY*. */ - Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i])); + Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(mcelem[i])) && !VARATT_IS_EXTERNAL(DatumGetPointer(mcelem[i]))); lookup[i].element = (text *) DatumGetPointer(mcelem[i]); lookup[i].frequency = numbers[i]; } /* - * Grab the lowest frequency. compute_tsvector_stats() stored it for us in - * the one before the last cell of the Numbers array. See ts_typanalyze.c + * Grab the lowest MCE frequency. compute_tsvector_stats() stored it for + * us in the one before the last cell of the Numbers array. */ minfreq = numbers[nnumbers - 2]; @@ -374,8 +374,11 @@ tsquery_opr_selec(QueryItem *item, char *operand, else { /* - * The element is not in MCELEM. Punt, but assume that the - * selectivity cannot be more than minfreq / 2. + * The element is not in MCELEM. Estimate its frequency as + * half that of the least-frequent MCE. (We know it cannot be + * more than minfreq, and it could be a great deal less. Half + * seems like a good compromise.) For probably-historical + * reasons, clamp to not more than DEFAULT_TS_MATCH_SEL. */ selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); } diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index c5a71331ce8a0..9b8b0995ab78b 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -73,7 +73,7 @@ ts_typanalyze(PG_FUNCTION_ARGS) /* * compute_tsvector_stats() -- compute statistics for a tsvector column * - * This functions computes statistics that are useful for determining @@ + * This function computes statistics that are useful for determining @@ * operations' selectivity, along with the fraction of non-null rows and * average width. * @@ -312,7 +312,7 @@ compute_tsvector_stats(VacAttrStats *stats, /* * Construct an array of the interesting hashtable items, that is, * those meeting the cutoff frequency (s - epsilon)*N. Also identify - * the minimum and maximum frequencies among these items. + * the maximum frequency among these items. * * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff * frequency is 9*N / bucket_width. 
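To put numbers on the Lossy Counting cutoff just described: with a hypothetical support threshold s of 0.002, epsilon is s/10 = 0.0002 and bucket_width is 1/epsilon = 5000, so after seeing 1,000,000 lexemes the cutoff count is 9 * 1000000 / 5000 = 1800; a lexeme kept in MCELEM must have been counted more than 1800 times. The arithmetic, with these illustrative inputs:

#include <stdio.h>

int
main(void)
{
	double		s = 0.002;			/* hypothetical support threshold */
	double		epsilon = s / 10;	/* as in compute_tsvector_stats() */
	int			bucket_width = (int) (1 / epsilon);
	int			lexeme_no = 1000000;	/* hypothetical lexemes seen */
	int			cutoff_freq = 9 * lexeme_no / bucket_width;

	printf("bucket_width=%d cutoff_freq=%d\n", bucket_width, cutoff_freq);
	return 0;
}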
@@ -320,18 +320,16 @@ compute_tsvector_stats(VacAttrStats *stats, cutoff_freq = 9 * lexeme_no / bucket_width; i = hash_get_num_entries(lexemes_tab); /* surely enough space */ - sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i); + sort_table = palloc_array(TrackItem *, i); hash_seq_init(&scan_status, lexemes_tab); track_len = 0; - minfreq = lexeme_no; maxfreq = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { if (item->frequency > cutoff_freq) { sort_table[track_len++] = item; - minfreq = Min(minfreq, item->frequency); maxfreq = Max(maxfreq, item->frequency); } } @@ -346,19 +344,38 @@ compute_tsvector_stats(VacAttrStats *stats, * If we obtained more lexemes than we really want, get rid of those * with least frequencies. The easiest way is to qsort the array into * descending frequency order and truncate the array. + * + * If we did not find more elements than we want, then it is safe to + * assume that the stored MCE array will contain every element with + * frequency above the cutoff. In that case, rather than storing the + * smallest frequency we are keeping, we want to store the minimum + * frequency that would have been accepted as a valid MCE. The + * selectivity functions can assume that that is an upper bound on the + * frequency of elements not present in the array. + * + * If we found no candidate MCEs at all, we still want to record the + * cutoff frequency, since it's still valid to assume that no element + * has frequency more than that. */ if (num_mcelem < track_len) { qsort_interruptible(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc, NULL); - /* reset minfreq to the smallest frequency we're keeping */ + /* set minfreq to the smallest frequency we're keeping */ minfreq = sort_table[num_mcelem - 1]->frequency; } else + { num_mcelem = track_len; + /* set minfreq to the minimum frequency above the cutoff */ + minfreq = cutoff_freq + 1; + /* ensure maxfreq is nonzero, too */ + if (track_len == 0) + maxfreq = minfreq; + } /* Generate MCELEM slot entry */ - if (num_mcelem > 0) + if (num_mcelem >= 0) { MemoryContext old_context; Datum *mcelem_values; @@ -395,8 +412,8 @@ compute_tsvector_stats(VacAttrStats *stats, * create that for a tsvector column, since null elements aren't * possible.) 
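The new minfreq bookkeeping above distinguishes two cases: the candidate list was truncated to num_mcelem entries (store the smallest frequency actually kept), or everything above the cutoff fit (store cutoff_freq + 1, the minimum count that would have been accepted and hence an upper bound for any absent element). A scalar sketch of that decision, with stand-ins for the function's local variables:

/* Sketch only: mirrors the minfreq logic in compute_tsvector_stats(). */
static int
choose_minfreq(int num_mcelem, int track_len, int cutoff_freq,
			   const int *freqs_sorted_desc)
{
	if (num_mcelem < track_len)
	{
		/* truncated: smallest frequency we are keeping */
		return freqs_sorted_desc[num_mcelem - 1];
	}

	/*
	 * Everything above the cutoff was kept, so cutoff_freq + 1 bounds the
	 * frequency of any element not present in the stored array.
	 */
	return cutoff_freq + 1;
}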
*/ - mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); - mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4)); + mcelem_values = palloc_array(Datum, num_mcelem); + mcelem_freqs = palloc_array(float4, num_mcelem + 2); /* * See comments above about use of nonnull_cnt as the divisor for diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c index 0b4a57866448d..647e4649644fd 100644 --- a/src/backend/tsearch/ts_utils.c +++ b/src/backend/tsearch/ts_utils.c @@ -105,12 +105,12 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *, size if (reallen == 0) { reallen = 64; - stop = (char **) palloc(sizeof(char *) * reallen); + stop = palloc_array(char *, reallen); } else { reallen *= 2; - stop = (char **) repalloc(stop, sizeof(char *) * reallen); + stop = repalloc_array(stop, char *, reallen); } } diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c index a8ddb6109910e..9e53f57324f82 100644 --- a/src/backend/tsearch/wparser.c +++ b/src/backend/tsearch/wparser.c @@ -58,7 +58,7 @@ tt_setup_firstcall(FuncCallContext *funcctx, FunctionCallInfo fcinfo, oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - st = (TSTokenTypeStorage *) palloc(sizeof(TSTokenTypeStorage)); + st = palloc_object(TSTokenTypeStorage); st->cur = 0; /* lextype takes one dummy argument */ st->list = (LexDescr *) DatumGetPointer(OidFunctionCall1(prs->lextypeOid, @@ -173,10 +173,10 @@ prs_setup_firstcall(FuncCallContext *funcctx, FunctionCallInfo fcinfo, oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - st = (PrsStorage *) palloc(sizeof(PrsStorage)); + st = palloc_object(PrsStorage); st->cur = 0; st->len = 16; - st->list = (LexemeEntry *) palloc(sizeof(LexemeEntry) * st->len); + st->list = palloc_array(LexemeEntry, st->len); prsdata = DatumGetPointer(FunctionCall2(&prs->prsstart, PointerGetDatum(VARDATA_ANY(txt)), @@ -204,7 +204,7 @@ prs_setup_firstcall(FuncCallContext *funcctx, FunctionCallInfo fcinfo, st->len = st->cur; st->cur = 0; - funcctx->user_fctx = (void *) st; + funcctx->user_fctx = st; if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); funcctx->tuple_desc = tupdesc; @@ -307,7 +307,7 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS) memset(&prs, 0, sizeof(HeadlineParsedText)); prs.lenwords = 32; - prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords); + prs.words = palloc_array(HeadlineWordEntry, prs.lenwords); hlparsetext(cfg->cfgId, &prs, query, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in)); @@ -373,11 +373,11 @@ ts_headline_jsonb_byid_opt(PG_FUNCTION_ARGS) Jsonb *out; JsonTransformStringValuesAction action = (JsonTransformStringValuesAction) headline_json_value; HeadlineParsedText prs; - HeadlineJsonState *state = palloc0(sizeof(HeadlineJsonState)); + HeadlineJsonState *state = palloc0_object(HeadlineJsonState); memset(&prs, 0, sizeof(HeadlineParsedText)); prs.lenwords = 32; - prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords); + prs.words = palloc_array(HeadlineWordEntry, prs.lenwords); state->prs = &prs; state->cfg = lookup_ts_config_cache(tsconfig); @@ -450,11 +450,11 @@ ts_headline_json_byid_opt(PG_FUNCTION_ARGS) JsonTransformStringValuesAction action = (JsonTransformStringValuesAction) headline_json_value; HeadlineParsedText prs; - HeadlineJsonState *state = palloc0(sizeof(HeadlineJsonState)); + HeadlineJsonState *state = palloc0_object(HeadlineJsonState); memset(&prs, 0, 
sizeof(HeadlineParsedText)); prs.lenwords = 32; - prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords); + prs.words = palloc_array(HeadlineWordEntry, prs.lenwords); state->prs = &prs; state->cfg = lookup_ts_config_cache(tsconfig); diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 79bcd32a0639e..9fbeab475450f 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -243,9 +243,7 @@ typedef struct TParser /* string and position information */ char *str; /* multibyte string */ int lenstr; /* length of mbstring */ - wchar_t *wstr; /* wide character string */ pg_wchar *pgwstr; /* wide character string for C-locale */ - bool usewide; /* State of parse */ int charmaxlen; @@ -271,7 +269,7 @@ static bool TParserGet(TParser *prs); static TParserPosition * newTParserPosition(TParserPosition *prev) { - TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition)); + TParserPosition *res = palloc_object(TParserPosition); if (prev) memcpy(res, prev, sizeof(TParserPosition)); @@ -288,38 +286,13 @@ newTParserPosition(TParserPosition *prev) static TParser * TParserInit(char *str, int len) { - TParser *prs = (TParser *) palloc0(sizeof(TParser)); + TParser *prs = palloc0_object(TParser); prs->charmaxlen = pg_database_encoding_max_length(); prs->str = str; prs->lenstr = len; - - /* - * Use wide char code only when max encoding length > 1. - */ - if (prs->charmaxlen > 1) - { - pg_locale_t mylocale = 0; /* TODO */ - - prs->usewide = true; - if (database_ctype_is_c) - { - /* - * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could - * be different from sizeof(wchar_t) - */ - prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1)); - pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr); - } - else - { - prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1)); - char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, - mylocale); - } - } - else - prs->usewide = false; + prs->pgwstr = palloc_array(pg_wchar, prs->lenstr + 1); + pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr); prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; @@ -345,17 +318,14 @@ TParserInit(char *str, int len) static TParser * TParserCopyInit(const TParser *orig) { - TParser *prs = (TParser *) palloc0(sizeof(TParser)); + TParser *prs = palloc0_object(TParser); prs->charmaxlen = orig->charmaxlen; prs->str = orig->str + orig->state->posbyte; prs->lenstr = orig->lenstr - orig->state->posbyte; - prs->usewide = orig->usewide; if (orig->pgwstr) prs->pgwstr = orig->pgwstr + orig->state->poschar; - if (orig->wstr) - prs->wstr = orig->wstr + orig->state->poschar; prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; @@ -379,8 +349,6 @@ TParserClose(TParser *prs) prs->state = ptr; } - if (prs->wstr) - pfree(prs->wstr); if (prs->pgwstr) pfree(prs->pgwstr); @@ -412,13 +380,9 @@ TParserCopyClose(TParser *prs) /* - * Character-type support functions, equivalent to is* macros, but - * working with any possible encodings and locales. Notes: - * - with multibyte encoding and C-locale isw* function may fail - * or give wrong result. - * - multibyte encoding and C-locale often are used for - * Asian languages. - * - if locale is C then we use pgwstr instead of wstr. + * Character-type support functions using the database default locale. 
If the + * locale is C, and the input character is non-ascii, the value to be returned + * is determined by the 'nonascii' macro argument. */ #define p_iswhat(type, nonascii) \ @@ -426,19 +390,13 @@ TParserCopyClose(TParser *prs) static int \ p_is##type(TParser *prs) \ { \ + pg_locale_t locale = pg_database_locale(); \ + pg_wchar wc; \ Assert(prs->state); \ - if (prs->usewide) \ - { \ - if (prs->pgwstr) \ - { \ - unsigned int c = *(prs->pgwstr + prs->state->poschar); \ - if (c > 0x7f) \ - return nonascii; \ - return is##type(c); \ - } \ - return isw##type(*(prs->wstr + prs->state->poschar)); \ - } \ - return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \ + wc = prs->pgwstr[prs->state->poschar]; \ + if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f) \ + return nonascii; \ + return pg_isw##type(wc, pg_database_locale()); \ } \ \ static int \ @@ -703,7 +661,7 @@ p_isspecial(TParser *prs) * Check that only in utf encoding, because other encodings aren't * supported by postgres or even exists. */ - if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide) + if (GetDatabaseEncoding() == PG_UTF8) { static const pg_wchar strange_letter[] = { /* @@ -944,10 +902,7 @@ p_isspecial(TParser *prs) *StopMiddle; pg_wchar c; - if (prs->pgwstr) - c = *(prs->pgwstr + prs->state->poschar); - else - c = (pg_wchar) *(prs->wstr + prs->state->poschar); + c = *(prs->pgwstr + prs->state->poschar); while (StopLow < StopHigh) { @@ -1877,7 +1832,7 @@ TParserGet(TParser *prs) Datum prsd_lextype(PG_FUNCTION_ARGS) { - LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1)); + LexDescr *descr = palloc_array(LexDescr, LASTNUM + 1); int i; for (i = 1; i <= LASTNUM; i++) @@ -1994,7 +1949,7 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data) if (!data->pos) { - data->pos = palloc(sizeof(WordEntryPos) * checkval->len); + data->pos = palloc_array(WordEntryPos, checkval->len); data->allocated = true; data->npos = 1; data->pos[0] = checkval->words[i].pos; diff --git a/src/backend/utils/.gitignore b/src/backend/utils/.gitignore index 068555695946f..303c01d051512 100644 --- a/src/backend/utils/.gitignore +++ b/src/backend/utils/.gitignore @@ -2,5 +2,6 @@ /fmgroids.h /fmgrprotos.h /fmgr-stamp +/guc_tables.inc.c /probes.h /errcodes.h diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile index 140fbba5c222a..985ef52e7e318 100644 --- a/src/backend/utils/Makefile +++ b/src/backend/utils/Makefile @@ -43,7 +43,7 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp submak submake-adt-headers: $(MAKE) -C adt jsonpath_gram.h -$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h +$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c # fmgr-stamp records the last time we ran Gen_fmgrtab.pl. We don't rely on # the timestamps of the individual output files, because the Perl script @@ -55,6 +55,9 @@ fmgr-stamp: Gen_fmgrtab.pl $(catalogdir)/Catalog.pm $(top_srcdir)/src/include/ca errcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-errcodes.pl $(PERL) $(srcdir)/generate-errcodes.pl --outfile $@ $< +guc_tables.inc.c: $(top_srcdir)/src/backend/utils/misc/guc_parameters.dat $(top_srcdir)/src/backend/utils/misc/gen_guc_tables.pl + $(PERL) $(top_srcdir)/src/backend/utils/misc/gen_guc_tables.pl $< $@ + ifeq ($(enable_dtrace), yes) probes.h: postprocess_dtrace.sed probes.h.tmp sed -f $^ >$@ @@ -70,8 +73,8 @@ endif # These generated headers must be symlinked into src/include/. 
# We use header-stamp to record that we've done this because the symlinks # themselves may appear older than fmgr-stamp. -$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h - cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h; do \ +$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c + cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c; do \ rm -f $$file && $(LN_S) "../../../$(subdir)/$$file" . ; \ done touch $@ @@ -89,4 +92,4 @@ uninstall-data: clean: rm -f probes.h probes.h.tmp - rm -f fmgroids.h fmgrprotos.h fmgrtab.c fmgr-stamp errcodes.h + rm -f fmgroids.h fmgrprotos.h fmgrtab.c fmgr-stamp errcodes.h guc_tables.inc.c diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index e1576e64b6d4c..a290cc4c97501 100644 --- a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -320,8 +320,8 @@ pgstat_bestart_initial(void) lbeentry.st_state = STATE_STARTING; lbeentry.st_progress_command = PROGRESS_COMMAND_INVALID; lbeentry.st_progress_command_target = InvalidOid; - lbeentry.st_query_id = UINT64CONST(0); - lbeentry.st_plan_id = UINT64CONST(0); + lbeentry.st_query_id = INT64CONST(0); + lbeentry.st_plan_id = INT64CONST(0); /* * we don't zero st_progress_param here to save cycles; nobody should @@ -599,8 +599,8 @@ pgstat_report_activity(BackendState state, const char *cmd_str) beentry->st_activity_start_timestamp = 0; /* st_xact_start_timestamp and wait_event_info are also disabled */ beentry->st_xact_start_timestamp = 0; - beentry->st_query_id = UINT64CONST(0); - beentry->st_plan_id = UINT64CONST(0); + beentry->st_query_id = INT64CONST(0); + beentry->st_plan_id = INT64CONST(0); proc->wait_event_info = 0; PGSTAT_END_WRITE_ACTIVITY(beentry); } @@ -662,8 +662,8 @@ pgstat_report_activity(BackendState state, const char *cmd_str) */ if (state == STATE_RUNNING) { - beentry->st_query_id = UINT64CONST(0); - beentry->st_plan_id = UINT64CONST(0); + beentry->st_query_id = INT64CONST(0); + beentry->st_plan_id = INT64CONST(0); } if (cmd_str != NULL) @@ -683,7 +683,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str) * -------- */ void -pgstat_report_query_id(uint64 query_id, bool force) +pgstat_report_query_id(int64 query_id, bool force) { volatile PgBackendStatus *beentry = MyBEEntry; @@ -702,7 +702,7 @@ pgstat_report_query_id(uint64 query_id, bool force) * command, so ignore the one provided unless it's an explicit call to * reset the identifier. */ - if (beentry->st_query_id != 0 && !force) + if (beentry->st_query_id != INT64CONST(0) && !force) return; /* @@ -722,7 +722,7 @@ pgstat_report_query_id(uint64 query_id, bool force) * -------- */ void -pgstat_report_plan_id(uint64 plan_id, bool force) +pgstat_report_plan_id(int64 plan_id, bool force) { volatile PgBackendStatus *beentry = MyBEEntry; @@ -1134,7 +1134,7 @@ pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen) * * Return current backend's query identifier. */ -uint64 +int64 pgstat_get_my_query_id(void) { if (!MyBEEntry) @@ -1154,7 +1154,7 @@ pgstat_get_my_query_id(void) * * Return current backend's plan identifier. 
 */
-uint64
+int64
 pgstat_get_my_plan_id(void)
 {
 	if (!MyBEEntry)
diff --git a/src/backend/utils/activity/generate-wait_event_types.pl b/src/backend/utils/activity/generate-wait_event_types.pl
index 424ad9f115d34..5db13419f251f 100644
--- a/src/backend/utils/activity/generate-wait_event_types.pl
+++ b/src/backend/utils/activity/generate-wait_event_types.pl
@@ -85,7 +85,7 @@
 # Sort the lines based on the second column.
 # uc() is being used to force the comparison to be case-insensitive.
 my @lines_sorted =
-  sort { uc((split(/\t/, $a))[1]) cmp uc((split(/\t/, $b))[1]) } @lines;
+  sort { uc((split(/\t+/, $a))[1]) cmp uc((split(/\t+/, $b))[1]) } @lines;
 
 # If we are generating code, concat @lines_sorted and then
 # @abi_compatibility_lines.
@@ -101,7 +101,7 @@
 	  unless $line =~ /^(\w+)\t+(\w+)\t+("\w.*\.")$/;
 
 	(my $waitclassname, my $waiteventname, my $waitevendocsentence) =
-	  split(/\t/, $line);
+	  ($1, $2, $3);
 
 	# Generate the element name for the enums based on the
 	# description.  The C symbols are prefixed with "WAIT_EVENT_".
@@ -334,12 +334,12 @@ sub usage
 {
 	die <<EOM;
-Usage: perl  [--output ] [--code ] [ --sgml ] input_file
+Usage: perl  [--output ] [--code ] [ --docs ] input_file
 
 Options:
     --outdir      Output directory (default '.')
     --code        Generate C and header files.
-    --sgml        Generate wait_event_types.sgml.
+    --docs        Generate wait_event_types.sgml.
 
 generate-wait_event_types.pl generates the SGML documentation and code
 related to wait events.  This should use wait_event_names.txt in input, or
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index 8b57845e8709f..f317c6e8e908c 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -212,6 +212,11 @@
 int			pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE;
 
 PgStat_LocalState pgStatLocal;
 
+/*
+ * Track pending reports for fixed-numbered stats, used by
+ * pgstat_report_stat().
+ */ +bool pgstat_report_fixed = false; /* ---------- * Local data @@ -308,6 +313,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .flush_pending_cb = pgstat_relation_flush_cb, .delete_pending_cb = pgstat_relation_delete_pending_cb, + .reset_timestamp_cb = pgstat_relation_reset_timestamp_cb, }, [PGSTAT_KIND_FUNCTION] = { @@ -322,6 +328,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .pending_size = sizeof(PgStat_FunctionCounts), .flush_pending_cb = pgstat_function_flush_cb, + .reset_timestamp_cb = pgstat_function_reset_timestamp_cb, }, [PGSTAT_KIND_REPLSLOT] = { @@ -370,7 +377,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_off = offsetof(PgStatShared_Backend, stats), .shared_data_len = sizeof(((PgStatShared_Backend *) 0)->stats), - .have_static_pending_cb = pgstat_backend_have_pending_cb, .flush_static_cb = pgstat_backend_flush_cb, .reset_timestamp_cb = pgstat_backend_reset_timestamp_cb, }, @@ -437,7 +443,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_IO *) 0)->stats), .flush_static_cb = pgstat_io_flush_cb, - .have_static_pending_cb = pgstat_io_have_pending_cb, .init_shmem_cb = pgstat_io_init_shmem_cb, .reset_all_cb = pgstat_io_reset_all_cb, .snapshot_cb = pgstat_io_snapshot_cb, @@ -455,7 +460,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_SLRU *) 0)->stats), .flush_static_cb = pgstat_slru_flush_cb, - .have_static_pending_cb = pgstat_slru_have_pending_cb, .init_shmem_cb = pgstat_slru_init_shmem_cb, .reset_all_cb = pgstat_slru_reset_all_cb, .snapshot_cb = pgstat_slru_snapshot_cb, @@ -474,7 +478,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .init_backend_cb = pgstat_wal_init_backend_cb, .flush_static_cb = pgstat_wal_flush_cb, - .have_static_pending_cb = pgstat_wal_have_pending_cb, .init_shmem_cb = pgstat_wal_init_shmem_cb, .reset_all_cb = pgstat_wal_reset_all_cb, .snapshot_cb = pgstat_wal_snapshot_cb, @@ -520,6 +523,7 @@ pgstat_discard_stats(void) /* NB: this needs to be done even in single user mode */ + /* First, cleanup the main pgstats file */ ret = unlink(PGSTAT_STAT_PERMANENT_FILENAME); if (ret != 0) { @@ -541,6 +545,15 @@ pgstat_discard_stats(void) PGSTAT_STAT_PERMANENT_FILENAME))); } + /* Finish callbacks, if required */ + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (kind_info && kind_info->finish) + kind_info->finish(STATS_DISCARD); + } + /* * Reset stats contents. This will set reset timestamps of fixed-numbered * stats to the current time (no variable stats exist). 
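[Editor's note, not part of the patch: the pgstat_report_fixed flag declared above replaces the per-kind have_static_pending_cb() polling that later hunks delete. Code that accumulates fixed-numbered stats now raises a single backend-local flag, and pgstat_report_stat() checks only that flag plus the variable-numbered pending list. A condensed sketch of the resulting protocol, using names from this patch; the counter routine itself is illustrative.]

	/* producer side: any routine accumulating fixed-numbered stats */
	static void
	count_fixed_stat_example(void)
	{
		/* ... update backend-local pending counters here ... */
		pgstat_report_fixed = true;	/* mark fixed-numbered stats as dirty */
	}

	/* consumer side, within pgstat_report_stat() */
	if (dlist_is_empty(&pgStatPending) && !pgstat_report_fixed)
		return 0;	/* nothing pending at all; skip even the clock check */
	/* ... run each kind's flush_static_cb only when pgstat_report_fixed ... */
	pgstat_report_fixed = false;
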
@@ -708,29 +721,10 @@ pgstat_report_stat(bool force) } /* Don't expend a clock check if nothing to do */ - if (dlist_is_empty(&pgStatPending)) + if (dlist_is_empty(&pgStatPending) && + !pgstat_report_fixed) { - bool do_flush = false; - - /* Check for pending stats */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) - { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - - if (!kind_info) - continue; - if (!kind_info->have_static_pending_cb) - continue; - - if (kind_info->have_static_pending_cb()) - { - do_flush = true; - break; - } - } - - if (!do_flush) - return 0; + return 0; } /* @@ -784,16 +778,19 @@ pgstat_report_stat(bool force) partial_flush |= pgstat_flush_pending_entries(nowait); /* flush of other stats kinds */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + if (pgstat_report_fixed) { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - if (!kind_info) - continue; - if (!kind_info->flush_static_cb) - continue; + if (!kind_info) + continue; + if (!kind_info->flush_static_cb) + continue; - partial_flush |= kind_info->flush_static_cb(nowait); + partial_flush |= kind_info->flush_static_cb(nowait); + } } last_flush = now; @@ -815,6 +812,7 @@ pgstat_report_stat(bool force) } pending_since = 0; + pgstat_report_fixed = false; return 0; } @@ -835,7 +833,7 @@ pgstat_force_next_flush(void) static bool match_db_entries(PgStatShared_HashEntry *entry, Datum match_data) { - return entry->key.dboid == DatumGetObjectId(MyDatabaseId); + return entry->key.dboid == MyDatabaseId; } /* @@ -946,7 +944,7 @@ pgstat_clear_snapshot(void) void * pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, uint64 objid) { - PgStat_HashKey key; + PgStat_HashKey key = {0}; PgStat_EntryRef *entry_ref; void *stats_data; const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); @@ -957,9 +955,6 @@ pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, uint64 objid) pgstat_prep_snapshot(); - /* clear padding */ - memset(&key, 0, sizeof(struct PgStat_HashKey)); - key.kind = kind; key.dboid = dboid; key.objid = objid; @@ -1504,6 +1499,10 @@ pgstat_register_kind(PgStat_Kind kind, const PgStat_KindInfo *kind_info) ereport(ERROR, (errmsg("custom cumulative statistics property is invalid"), errhint("Custom cumulative statistics require a shared memory size for fixed-numbered objects."))); + if (kind_info->track_entry_count) + ereport(ERROR, + (errmsg("custom cumulative statistics property is invalid"), + errhint("Custom cumulative statistics cannot use entry count tracking for fixed-numbered objects."))); } /* @@ -1562,20 +1561,18 @@ pgstat_assert_is_up(void) * ------------------------------------------------------------ */ -/* helpers for pgstat_write_statsfile() */ -static void -write_chunk(FILE *fpout, void *ptr, size_t len) +/* helper for pgstat_write_statsfile() */ +void +pgstat_write_chunk(FILE *fpout, void *ptr, size_t len) { int rc; rc = fwrite(ptr, len, 1, fpout); - /* we'll check for errors with ferror once at the end */ + /* We check for errors with ferror() when done writing the stats. */ (void) rc; } -#define write_chunk_s(fpout, ptr) write_chunk(fpout, ptr, sizeof(*ptr)) - /* * This function is called in the last process that is accessing the shared * stats so locking is not required. @@ -1617,7 +1614,7 @@ pgstat_write_statsfile(void) * Write the file header --- currently just a format ID. 
*/ format_id = PGSTAT_FILE_FORMAT_ID; - write_chunk_s(fpout, &format_id); + pgstat_write_chunk_s(fpout, &format_id); /* Write various stats structs for fixed number of objects */ for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) @@ -1642,8 +1639,8 @@ pgstat_write_statsfile(void) ptr = pgStatLocal.snapshot.custom_data[kind - PGSTAT_KIND_CUSTOM_MIN]; fputc(PGSTAT_FILE_ENTRY_FIXED, fpout); - write_chunk_s(fpout, &kind); - write_chunk(fpout, ptr, info->shared_data_len); + pgstat_write_chunk_s(fpout, &kind); + pgstat_write_chunk(fpout, ptr, info->shared_data_len); } /* @@ -1697,7 +1694,7 @@ pgstat_write_statsfile(void) { /* normal stats entry, identified by PgStat_HashKey */ fputc(PGSTAT_FILE_ENTRY_HASH, fpout); - write_chunk_s(fpout, &ps->key); + pgstat_write_chunk_s(fpout, &ps->key); } else { @@ -1707,21 +1704,25 @@ pgstat_write_statsfile(void) kind_info->to_serialized_name(&ps->key, shstats, &name); fputc(PGSTAT_FILE_ENTRY_NAME, fpout); - write_chunk_s(fpout, &ps->key.kind); - write_chunk_s(fpout, &name); + pgstat_write_chunk_s(fpout, &ps->key.kind); + pgstat_write_chunk_s(fpout, &name); } /* Write except the header part of the entry */ - write_chunk(fpout, - pgstat_get_entry_data(ps->key.kind, shstats), - pgstat_get_entry_len(ps->key.kind)); + pgstat_write_chunk(fpout, + pgstat_get_entry_data(ps->key.kind, shstats), + pgstat_get_entry_len(ps->key.kind)); + + /* Write more data for the entry, if required */ + if (kind_info->to_serialized_data) + kind_info->to_serialized_data(&ps->key, shstats, fpout); } dshash_seq_term(&hstat); /* * No more output to be done. Close the temp file and replace the old * pgstat.stat with it. The ferror() check replaces testing for error - * after each individual fputc or fwrite (in write_chunk()) above. + * after each individual fputc or fwrite (in pgstat_write_chunk()) above. */ fputc(PGSTAT_FILE_ENTRY_END, fpout); @@ -1747,17 +1748,24 @@ pgstat_write_statsfile(void) /* durable_rename already emitted log message */ unlink(tmpfile); } + + /* Finish callbacks, if required */ + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (kind_info && kind_info->finish) + kind_info->finish(STATS_WRITE); + } } -/* helpers for pgstat_read_statsfile() */ -static bool -read_chunk(FILE *fpin, void *ptr, size_t len) +/* helper for pgstat_read_statsfile() */ +bool +pgstat_read_chunk(FILE *fpin, void *ptr, size_t len) { return fread(ptr, 1, len, fpin) == len; } -#define read_chunk_s(fpin, ptr) read_chunk(fpin, ptr, sizeof(*ptr)) - /* * Reads in existing statistics file into memory. * @@ -1801,7 +1809,7 @@ pgstat_read_statsfile(void) /* * Verify it's of the expected format. 
*/ - if (!read_chunk_s(fpin, &format_id)) + if (!pgstat_read_chunk_s(fpin, &format_id)) { elog(WARNING, "could not read format ID"); goto error; @@ -1831,7 +1839,7 @@ pgstat_read_statsfile(void) char *ptr; /* entry for fixed-numbered stats */ - if (!read_chunk_s(fpin, &kind)) + if (!pgstat_read_chunk_s(fpin, &kind)) { elog(WARNING, "could not read stats kind for entry of type %c", t); goto error; @@ -1871,7 +1879,7 @@ pgstat_read_statsfile(void) info->shared_data_off; } - if (!read_chunk(fpin, ptr, info->shared_data_len)) + if (!pgstat_read_chunk(fpin, ptr, info->shared_data_len)) { elog(WARNING, "could not read data of stats kind %u for entry of type %c with size %u", kind, t, info->shared_data_len); @@ -1886,13 +1894,14 @@ pgstat_read_statsfile(void) PgStat_HashKey key; PgStatShared_HashEntry *p; PgStatShared_Common *header; + const PgStat_KindInfo *kind_info = NULL; CHECK_FOR_INTERRUPTS(); if (t == PGSTAT_FILE_ENTRY_HASH) { /* normal stats entry, identified by PgStat_HashKey */ - if (!read_chunk_s(fpin, &key)) + if (!pgstat_read_chunk_s(fpin, &key)) { elog(WARNING, "could not read key for entry of type %c", t); goto error; @@ -1906,7 +1915,8 @@ pgstat_read_statsfile(void) goto error; } - if (!pgstat_get_kind_info(key.kind)) + kind_info = pgstat_get_kind_info(key.kind); + if (!kind_info) { elog(WARNING, "could not find information of kind for entry %u/%u/%" PRIu64 " of type %c", key.kind, key.dboid, @@ -1917,16 +1927,15 @@ pgstat_read_statsfile(void) else { /* stats entry identified by name on disk (e.g. slots) */ - const PgStat_KindInfo *kind_info = NULL; PgStat_Kind kind; NameData name; - if (!read_chunk_s(fpin, &kind)) + if (!pgstat_read_chunk_s(fpin, &kind)) { elog(WARNING, "could not read stats kind for entry of type %c", t); goto error; } - if (!read_chunk_s(fpin, &name)) + if (!pgstat_read_chunk_s(fpin, &name)) { elog(WARNING, "could not read name of stats kind %u for entry of type %c", kind, t); @@ -1989,10 +1998,21 @@ pgstat_read_statsfile(void) header = pgstat_init_entry(key.kind, p); dshash_release_lock(pgStatLocal.shared_hash, p); + if (header == NULL) + { + /* + * It would be tempting to switch this ERROR to a + * WARNING, but it would mean that all the statistics + * are discarded when the environment fails on OOM. 
+ */ + elog(ERROR, "could not allocate entry %u/%u/%" PRIu64 " of type %c", + key.kind, key.dboid, + key.objid, t); + } - if (!read_chunk(fpin, - pgstat_get_entry_data(key.kind, header), - pgstat_get_entry_len(key.kind))) + if (!pgstat_read_chunk(fpin, + pgstat_get_entry_data(key.kind, header), + pgstat_get_entry_len(key.kind))) { elog(WARNING, "could not read data for entry %u/%u/%" PRIu64 " of type %c", key.kind, key.dboid, @@ -2000,6 +2020,18 @@ pgstat_read_statsfile(void) goto error; } + /* read more data for the entry, if required */ + if (kind_info->from_serialized_data) + { + if (!kind_info->from_serialized_data(&key, header, fpin)) + { + elog(WARNING, "could not read auxiliary data for entry %u/%u/%" PRIu64 " of type %c", + key.kind, key.dboid, + key.objid, t); + goto error; + } + } + break; } case PGSTAT_FILE_ENTRY_END: @@ -2023,11 +2055,21 @@ pgstat_read_statsfile(void) } done: + /* First, cleanup the main stats file */ FreeFile(fpin); elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); unlink(statfile); + /* Finish callbacks, if required */ + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (kind_info && kind_info->finish) + kind_info->finish(STATS_READ); + } + return; error: diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 51256277e8d37..199ba2cc17a78 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -25,6 +25,7 @@ #include "postgres.h" #include "access/xlog.h" +#include "executor/instrument.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -41,9 +42,9 @@ static bool backend_has_iostats = false; /* * WAL usage counters saved from pgWalUsage at the previous call to - * pgstat_report_wal(). This is used to calculate how much WAL usage - * happens between pgstat_report_wal() calls, by subtracting the previous - * counters from the current ones. + * pgstat_flush_backend(). This is used to calculate how much WAL usage + * happens between pgstat_flush_backend() calls, by subtracting the + * previous counters from the current ones. */ static WalUsage prevBackendWalUsage; @@ -66,6 +67,7 @@ pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context, io_time); backend_has_iostats = true; + pgstat_report_fixed = true; } void @@ -81,6 +83,7 @@ pgstat_count_backend_io_op(IOObject io_object, IOContext io_context, PendingBackendStats.pending_io.bytes[io_object][io_context][io_op] += bytes; backend_has_iostats = true; + pgstat_report_fixed = true; } /* @@ -249,6 +252,7 @@ pgstat_flush_backend_entry_wal(PgStat_EntryRef *entry_ref) WALSTAT_ACC(wal_records, wal_usage_diff); WALSTAT_ACC(wal_fpi, wal_usage_diff); WALSTAT_ACC(wal_bytes, wal_usage_diff); + WALSTAT_ACC(wal_fpi_bytes, wal_usage_diff); #undef WALSTAT_ACC /* @@ -301,18 +305,6 @@ pgstat_flush_backend(bool nowait, bits32 flags) return false; } -/* - * Check if there are any backend stats waiting for flush. - */ -bool -pgstat_backend_have_pending_cb(void) -{ - if (!pgstat_tracks_backend_bktype(MyBackendType)) - return false; - - return (backend_has_iostats || pgstat_backend_wal_have_pending()); -} - /* * Callback to flush out locally pending backend statistics. 
* diff --git a/src/backend/utils/activity/pgstat_function.c b/src/backend/utils/activity/pgstat_function.c index 6214f93d36e0c..b5db9d15e0710 100644 --- a/src/backend/utils/activity/pgstat_function.c +++ b/src/backend/utils/activity/pgstat_function.c @@ -214,6 +214,12 @@ pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) return true; } +void +pgstat_function_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_Function *) header)->stats.stat_reset_timestamp = ts; +} + /* * find any existing PgStat_FunctionCounts entry for specified function * diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index d8d26379a571e..13ae57ed6498d 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -80,6 +80,7 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes); have_iostats = true; + pgstat_report_fixed = true; } /* @@ -167,15 +168,6 @@ pgstat_fetch_stat_io(void) return &pgStatLocal.snapshot.io; } -/* - * Check if there any IO stats waiting for flush. - */ -bool -pgstat_io_have_pending_cb(void) -{ - return have_iostats; -} - /* * Simpler wrapper of pgstat_io_flush_cb() */ diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index 28587e2916b1d..55a10c299db17 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -3,7 +3,7 @@ * pgstat_relation.c * Implementation of relation statistics. * - * This file contains the implementation of function relation. It is kept + * This file contains the implementation of relation statistics. It is kept * separate from pgstat.c to enforce the line between the statistics access / * storage implementation and the details about individual types of * statistics. @@ -207,14 +207,13 @@ pgstat_drop_relation(Relation rel) * Report that the table was just vacuumed and flush IO statistics. */ void -pgstat_report_vacuum(Oid tableoid, bool shared, - PgStat_Counter livetuples, PgStat_Counter deadtuples, - TimestampTz starttime) +pgstat_report_vacuum(Relation rel, PgStat_Counter livetuples, + PgStat_Counter deadtuples, TimestampTz starttime) { PgStat_EntryRef *entry_ref; PgStatShared_Relation *shtabentry; PgStat_StatTabEntry *tabentry; - Oid dboid = (shared ? InvalidOid : MyDatabaseId); + Oid dboid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId); TimestampTz ts; PgStat_Counter elapsedtime; @@ -226,8 +225,8 @@ pgstat_report_vacuum(Oid tableoid, bool shared, elapsedtime = TimestampDifferenceMilliseconds(starttime, ts); /* block acquiring lock for the same reason as pgstat_report_autovac() */ - entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, - dboid, tableoid, false); + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, dboid, + RelationGetRelid(rel), false); shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats; tabentry = &shtabentry->stats; @@ -514,7 +513,7 @@ find_tabstat_entry(Oid rel_id) } tabentry = (PgStat_TableStatus *) entry_ref->pending; - tablestatus = palloc(sizeof(PgStat_TableStatus)); + tablestatus = palloc_object(PgStat_TableStatus); *tablestatus = *tabentry; /* @@ -744,7 +743,7 @@ PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) * Load the saved counts into our local pgstats state. 
*/ void -pgstat_twophase_postcommit(TransactionId xid, uint16 info, +pgstat_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; @@ -780,7 +779,7 @@ pgstat_twophase_postcommit(TransactionId xid, uint16 info, * as aborted. */ void -pgstat_twophase_postabort(TransactionId xid, uint16 info, +pgstat_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; @@ -910,6 +909,12 @@ pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref) pgstat_unlink_relation(pending->relation); } +void +pgstat_relation_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_Relation *) header)->stats.stat_reset_time = ts; +} + /* * Find or create a PgStat_TableStatus entry for rel. New entry is created and * initialized if not exists. diff --git a/src/backend/utils/activity/pgstat_replslot.c b/src/backend/utils/activity/pgstat_replslot.c index ccfb11c49bf82..d757e00eb54dd 100644 --- a/src/backend/utils/activity/pgstat_replslot.c +++ b/src/backend/utils/activity/pgstat_replslot.c @@ -94,6 +94,7 @@ pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *re REPLSLOT_ACC(stream_txns); REPLSLOT_ACC(stream_count); REPLSLOT_ACC(stream_bytes); + REPLSLOT_ACC(mem_exceeded_count); REPLSLOT_ACC(total_txns); REPLSLOT_ACC(total_bytes); #undef REPLSLOT_ACC @@ -101,6 +102,36 @@ pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *re pgstat_unlock_entry(entry_ref); } +/* + * Report replication slot sync skip statistics. + * + * Similar to pgstat_report_replslot(), we can rely on the stats for the + * slot to exist and to belong to this slot. + */ +void +pgstat_report_replslotsync(ReplicationSlot *slot) +{ + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; + PgStat_StatReplSlotEntry *statent; + + /* Slot sync stats are valid only for synced logical slots on standby. */ + Assert(slot->data.synced); + Assert(RecoveryInProgress()); + + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + Assert(entry_ref != NULL); + + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; + statent = &shstatent->stats; + + statent->slotsync_skip_count += 1; + statent->slotsync_last_skip = GetCurrentTimestamp(); + + pgstat_unlock_entry(entry_ref); +} + /* * Report replication slot creation. * @@ -132,7 +163,7 @@ pgstat_create_replslot(ReplicationSlot *slot) * Report replication slot has been acquired. * * This guarantees that a stats entry exists during later - * pgstat_report_replslot() calls. + * pgstat_report_replslot() or pgstat_report_replslotsync() calls. * * If we previously crashed, no stats data exists. But if we did not crash, * the stats do belong to this slot: diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c index 2e33293b00097..746e9541bf355 100644 --- a/src/backend/utils/activity/pgstat_shmem.c +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -180,10 +180,9 @@ StatsShmemInit(void) * provides a small efficiency win. 
*/ ctl->raw_dsa_area = p; - p += MAXALIGN(pgstat_dsa_init_size()); dsa = dsa_create_in_place(ctl->raw_dsa_area, pgstat_dsa_init_size(), - LWTRANCHE_PGSTATS_DSA, 0); + LWTRANCHE_PGSTATS_DSA, NULL); dsa_pin(dsa); /* @@ -211,27 +210,35 @@ pg_atomic_init_u64(&ctl->gc_request_count, 1); - /* initialize fixed-numbered stats */ + /* Do the per-kind initialization */ for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) { const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); char *ptr; - if (!kind_info || !kind_info->fixed_amount) + if (!kind_info) continue; - if (pgstat_is_kind_builtin(kind)) - ptr = ((char *) ctl) + kind_info->shared_ctl_off; - else - { - int idx = kind - PGSTAT_KIND_CUSTOM_MIN; + /* initialize entry count tracking */ + if (kind_info->track_entry_count) + pg_atomic_init_u64(&ctl->entry_counts[kind - 1], 0); - Assert(kind_info->shared_size != 0); - ctl->custom_data[idx] = ShmemAlloc(kind_info->shared_size); - ptr = ctl->custom_data[idx]; + /* initialize fixed-numbered stats */ + if (kind_info->fixed_amount) + { + if (pgstat_is_kind_builtin(kind)) + ptr = ((char *) ctl) + kind_info->shared_ctl_off; + else + { + int idx = kind - PGSTAT_KIND_CUSTOM_MIN; + + Assert(kind_info->shared_size != 0); + ctl->custom_data[idx] = ShmemAlloc(kind_info->shared_size); + ptr = ctl->custom_data[idx]; + } + + kind_info->init_shmem_cb(ptr); } - - kind_info->init_shmem_cb(ptr); } } else @@ -255,7 +262,8 @@ pgstat_attach_shmem(void) dsa_pin_mapping(pgStatLocal.dsa); pgStatLocal.shared_hash = dshash_attach(pgStatLocal.dsa, &dsh_params, - pgStatLocal.shmem->hash_handle, 0); + pgStatLocal.shmem->hash_handle, + NULL); MemoryContextSwitchTo(oldcontext); } @@ -289,6 +297,13 @@ pgstat_detach_shmem(void) * ------------------------------------------------------------ */ +/* + * Initialize a newly-created entry. + * + * Returns NULL on allocation failure, so that callers can take cleanup + * actions, since the entry being initialized has already been inserted into + * the shared hashtable. + */ PgStatShared_Common * pgstat_init_entry(PgStat_Kind kind, PgStatShared_HashEntry *shhashent) { /* Create new stats entry. */ dsa_pointer chunk; PgStatShared_Common *shheader; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); /* * Initialize refcount to 1, marking it as valid / not dropped. The entry @@ -311,13 +327,22 @@ pgstat_init_entry(PgStat_Kind kind, pg_atomic_init_u32(&shhashent->generation, 0); shhashent->dropped = false; - chunk = dsa_allocate0(pgStatLocal.dsa, pgstat_get_kind_info(kind)->shared_size); + chunk = dsa_allocate_extended(pgStatLocal.dsa, + kind_info->shared_size, + DSA_ALLOC_ZERO | DSA_ALLOC_NO_OOM); + if (chunk == InvalidDsaPointer) + return NULL; + shheader = dsa_get_address(pgStatLocal.dsa, chunk); shheader->magic = 0xdeadbeef; /* Link the new entry from the hash entry. */ shhashent->body = chunk; + /* Increment entry count, if required.
*/ + if (kind_info->track_entry_count) + pg_atomic_fetch_add_u64(&pgStatLocal.shmem->entry_counts[kind - 1], 1); + LWLockInitialize(&shheader->lock, LWTRANCHE_PGSTATS_DATA); return shheader; @@ -444,14 +469,11 @@ PgStat_EntryRef * pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, uint64 objid, bool create, bool *created_entry) { - PgStat_HashKey key; + PgStat_HashKey key = {0}; PgStatShared_HashEntry *shhashent; PgStatShared_Common *shheader = NULL; PgStat_EntryRef *entry_ref; - /* clear padding */ - memset(&key, 0, sizeof(struct PgStat_HashKey)); - key.kind = kind; key.dboid = dboid; key.objid = objid; @@ -509,6 +531,20 @@ pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, uint64 objid, bool create, if (!shfound) { shheader = pgstat_init_entry(kind, shhashent); + if (shheader == NULL) + { + /* + * Failed the allocation of a new entry, so clean up the + * shared hashtable before giving up. + */ + dshash_delete_entry(pgStatLocal.shared_hash, shhashent); + + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating entry %u/%u/%" PRIu64 ".", + key.kind, key.dboid, key.objid))); + } pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); if (created_entry != NULL) @@ -836,6 +872,7 @@ static void pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat) { dsa_pointer pdsa; + PgStat_Kind kind = shent->key.kind; /* * Fetch dsa pointer before deleting entry - that way we can free the @@ -849,6 +886,10 @@ pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat) dshash_delete_current(hstat); dsa_free(pgStatLocal.dsa, pdsa); + + /* Decrement entry count, if required. */ + if (pgstat_get_kind_info(kind)->track_entry_count) + pg_atomic_sub_fetch_u64(&pgStatLocal.shmem->entry_counts[kind - 1], 1); } /* @@ -873,11 +914,12 @@ pgstat_drop_entry_internal(PgStatShared_HashEntry *shent, */ if (shent->dropped) elog(ERROR, - "trying to drop stats entry already dropped: kind=%s dboid=%u objid=%" PRIu64 " refcount=%u", + "trying to drop stats entry already dropped: kind=%s dboid=%u objid=%" PRIu64 " refcount=%u generation=%u", pgstat_get_kind_info(shent->key.kind)->name, shent->key.dboid, shent->key.objid, - pg_atomic_read_u32(&shent->refcount)); + pg_atomic_read_u32(&shent->refcount), + pg_atomic_read_u32(&shent->generation)); shent->dropped = true; /* release refcount marking entry as not dropped */ @@ -961,13 +1003,10 @@ pgstat_drop_database_and_contents(Oid dboid) bool pgstat_drop_entry(PgStat_Kind kind, Oid dboid, uint64 objid) { - PgStat_HashKey key; + PgStat_HashKey key = {0}; PgStatShared_HashEntry *shent; bool freed = true; - /* clear padding */ - memset(&key, 0, sizeof(struct PgStat_HashKey)); - key.kind = kind; key.dboid = dboid; key.objid = objid; diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c index b9e940dde45b6..da50f8a04578c 100644 --- a/src/backend/utils/activity/pgstat_slru.c +++ b/src/backend/utils/activity/pgstat_slru.c @@ -55,47 +55,33 @@ pgstat_reset_slru(const char *name) * SLRU statistics count accumulation functions --- called from slru.c */ -void -pgstat_count_slru_page_zeroed(int slru_idx) -{ - get_slru_entry(slru_idx)->blocks_zeroed += 1; +#define PGSTAT_COUNT_SLRU(stat) \ +void \ +CppConcat(pgstat_count_slru_,stat)(int slru_idx) \ +{ \ + get_slru_entry(slru_idx)->stat += 1; \ } -void -pgstat_count_slru_page_hit(int slru_idx) -{ - get_slru_entry(slru_idx)->blocks_hit += 1; -} +/* pgstat_count_slru_blocks_zeroed */ 
+PGSTAT_COUNT_SLRU(blocks_zeroed) -void -pgstat_count_slru_page_exists(int slru_idx) -{ - get_slru_entry(slru_idx)->blocks_exists += 1; -} +/* pgstat_count_slru_blocks_hit */ +PGSTAT_COUNT_SLRU(blocks_hit) -void -pgstat_count_slru_page_read(int slru_idx) -{ - get_slru_entry(slru_idx)->blocks_read += 1; -} +/* pgstat_count_slru_blocks_exists */ +PGSTAT_COUNT_SLRU(blocks_exists) -void -pgstat_count_slru_page_written(int slru_idx) -{ - get_slru_entry(slru_idx)->blocks_written += 1; -} +/* pgstat_count_slru_blocks_read */ +PGSTAT_COUNT_SLRU(blocks_read) -void -pgstat_count_slru_flush(int slru_idx) -{ - get_slru_entry(slru_idx)->flush += 1; -} +/* pgstat_count_slru_blocks_written */ +PGSTAT_COUNT_SLRU(blocks_written) -void -pgstat_count_slru_truncate(int slru_idx) -{ - get_slru_entry(slru_idx)->truncate += 1; -} +/* pgstat_count_slru_flush */ +PGSTAT_COUNT_SLRU(flush) + +/* pgstat_count_slru_truncate */ +PGSTAT_COUNT_SLRU(truncate) /* * Support function for the SQL-callable pgstat* functions. Returns @@ -143,15 +129,6 @@ pgstat_get_slru_index(const char *name) return (SLRU_NUM_ELEMENTS - 1); } -/* - * Check if there are any SLRU stats entries waiting for flush. - */ -bool -pgstat_slru_have_pending_cb(void) -{ - return have_slrustats; -} - /* * Flush out locally pending SLRU stats entries * @@ -247,6 +224,7 @@ get_slru_entry(int slru_idx) Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); have_slrustats = true; + pgstat_report_fixed = true; return &pending_SLRUStats[slru_idx]; } diff --git a/src/backend/utils/activity/pgstat_subscription.c b/src/backend/utils/activity/pgstat_subscription.c index f9a1c831a07e6..ad6814ec5ea44 100644 --- a/src/backend/utils/activity/pgstat_subscription.c +++ b/src/backend/utils/activity/pgstat_subscription.c @@ -17,6 +17,7 @@ #include "postgres.h" +#include "replication/worker_internal.h" #include "utils/pgstat_internal.h" @@ -24,7 +25,7 @@ * Report a subscription error. */ void -pgstat_report_subscription_error(Oid subid, bool is_apply_error) +pgstat_report_subscription_error(Oid subid, LogicalRepWorkerType wtype) { PgStat_EntryRef *entry_ref; PgStat_BackendSubEntry *pending; @@ -33,10 +34,25 @@ pgstat_report_subscription_error(Oid subid, bool is_apply_error) InvalidOid, subid, NULL); pending = entry_ref->pending; - if (is_apply_error) - pending->apply_error_count++; - else - pending->sync_error_count++; + switch (wtype) + { + case WORKERTYPE_APPLY: + pending->apply_error_count++; + break; + + case WORKERTYPE_SEQUENCESYNC: + pending->sync_seq_error_count++; + break; + + case WORKERTYPE_TABLESYNC: + pending->sync_table_error_count++; + break; + + default: + /* Should never happen. */ + Assert(0); + break; + } } /* @@ -115,7 +131,8 @@ pgstat_subscription_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) #define SUB_ACC(fld) shsubent->stats.fld += localent->fld SUB_ACC(apply_error_count); - SUB_ACC(sync_error_count); + SUB_ACC(sync_seq_error_count); + SUB_ACC(sync_table_error_count); for (int i = 0; i < CONFLICT_NUM_TYPES; i++) SUB_ACC(conflict_count[i]); #undef SUB_ACC diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 16a1ecb4d90d2..d4edb8b57335d 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -71,6 +71,15 @@ pgstat_fetch_stat_wal(void) return &pgStatLocal.snapshot.wal; } +/* + * To determine whether WAL usage happened. 
+ */ +static inline bool +pgstat_wal_have_pending(void) +{ + return pgWalUsage.wal_records != prevWalUsage.wal_records; +} + /* * Calculate how much WAL usage counters have increased by subtracting the * previous counters from the current ones. @@ -92,7 +101,7 @@ pgstat_wal_flush_cb(bool nowait) * This function can be called even if nothing at all has happened. Avoid * taking lock for nothing in that case. */ - if (!pgstat_wal_have_pending_cb()) + if (!pgstat_wal_have_pending()) return false; /* @@ -112,6 +121,7 @@ pgstat_wal_flush_cb(bool nowait) WALSTAT_ACC(wal_records, wal_usage_diff); WALSTAT_ACC(wal_fpi, wal_usage_diff); WALSTAT_ACC(wal_bytes, wal_usage_diff); + WALSTAT_ACC(wal_fpi_bytes, wal_usage_diff); WALSTAT_ACC(wal_buffers_full, wal_usage_diff); #undef WALSTAT_ACC @@ -136,15 +146,6 @@ pgstat_wal_init_backend_cb(void) prevWalUsage = pgWalUsage; } -/* - * To determine whether WAL usage happened. - */ -bool -pgstat_wal_have_pending_cb(void) -{ - return pgWalUsage.wal_records != prevWalUsage.wal_records; -} - void pgstat_wal_init_shmem_cb(void *stats) { diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index d9b8f34a3559d..a8c287c289eca 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -29,7 +29,7 @@ static const char *pgstat_get_wait_activity(WaitEventActivity w); -static const char *pgstat_get_wait_bufferpin(WaitEventBufferPin w); +static const char *pgstat_get_wait_buffer(WaitEventBuffer w); static const char *pgstat_get_wait_client(WaitEventClient w); static const char *pgstat_get_wait_ipc(WaitEventIPC w); static const char *pgstat_get_wait_timeout(WaitEventTimeout w); @@ -317,7 +317,7 @@ GetWaitEventCustomNames(uint32 classId, int *nwaitevents) els = hash_get_num_entries(WaitEventCustomHashByName); /* Allocate enough space for all entries */ - waiteventnames = palloc(els * sizeof(char *)); + waiteventnames = palloc_array(char *, els); /* Now scan the hash table to copy the data */ hash_seq_init(&hash_seq, WaitEventCustomHashByName); @@ -389,8 +389,8 @@ pgstat_get_wait_event_type(uint32 wait_event_info) case PG_WAIT_LOCK: event_type = "Lock"; break; - case PG_WAIT_BUFFERPIN: - event_type = "BufferPin"; + case PG_WAIT_BUFFER: + event_type = "Buffer"; break; case PG_WAIT_ACTIVITY: event_type = "Activity"; @@ -453,11 +453,11 @@ pgstat_get_wait_event(uint32 wait_event_info) case PG_WAIT_INJECTIONPOINT: event_name = GetWaitEventCustomIdentifier(wait_event_info); break; - case PG_WAIT_BUFFERPIN: + case PG_WAIT_BUFFER: { - WaitEventBufferPin w = (WaitEventBufferPin) wait_event_info; + WaitEventBuffer w = (WaitEventBuffer) wait_event_info; - event_name = pgstat_get_wait_bufferpin(w); + event_name = pgstat_get_wait_buffer(w); break; } case PG_WAIT_ACTIVITY: diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 5d9e04d682377..dcfadbd5aaec0 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -62,7 +62,7 @@ LOGICAL_APPLY_MAIN "Waiting in main loop of logical replication apply process." LOGICAL_LAUNCHER_MAIN "Waiting in main loop of logical replication launcher process." LOGICAL_PARALLEL_APPLY_MAIN "Waiting in main loop of logical replication parallel apply process." RECOVERY_WAL_STREAM "Waiting in main loop of startup process for WAL to arrive, during streaming recovery." -REPLICATION_SLOTSYNC_MAIN "Waiting in main loop of slot sync worker." 
+REPLICATION_SLOTSYNC_MAIN "Waiting in main loop of slot synchronization." REPLICATION_SLOTSYNC_SHUTDOWN "Waiting for slot sync worker to shut down." SYSLOGGER_MAIN "Waiting in main loop of syslogger process." WAL_RECEIVER_MAIN "Waiting in main loop of WAL receiver process." @@ -89,6 +89,8 @@ LIBPQWALRECEIVER_CONNECT "Waiting in WAL receiver to establish connection to rem LIBPQWALRECEIVER_RECEIVE "Waiting in WAL receiver to receive data from remote server." SSL_OPEN_SERVER "Waiting for SSL while attempting connection." WAIT_FOR_STANDBY_CONFIRMATION "Waiting for WAL to be received and flushed by the physical standby." +WAIT_FOR_WAL_FLUSH "Waiting for WAL flush to reach a target LSN on a primary." +WAIT_FOR_WAL_REPLAY "Waiting for WAL replay to reach a target LSN on a standby." WAL_SENDER_WAIT_FOR_WAL "Waiting for WAL to be flushed in WAL sender process." WAL_SENDER_WRITE_DATA "Waiting for any activity when processing replies from WAL receiver in WAL sender process." @@ -156,7 +158,6 @@ REPLICATION_SLOT_DROP "Waiting for a replication slot to become inactive so it c RESTORE_COMMAND "Waiting for to complete." SAFE_SNAPSHOT "Waiting to obtain a valid snapshot for a READ ONLY DEFERRABLE transaction." SYNC_REP "Waiting for confirmation from a remote server during synchronous replication." -WAL_BUFFER_INIT "Waiting on WAL buffer to be initialized." WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit." WAL_RECEIVER_WAIT_START "Waiting for startup process to send initial data for streaming replication." WAL_SUMMARY_READY "Waiting for a new WAL summary to be generated." @@ -174,6 +175,7 @@ Section: ClassName - WaitEventTimeout BASE_BACKUP_THROTTLE "Waiting during base backup when throttling activity." CHECKPOINT_WRITE_DELAY "Waiting between writes while performing a checkpoint." +COMMIT_DELAY "Waiting for commit delay before WAL flush." PG_SLEEP "Waiting due to a call to pg_sleep or a sibling function." RECOVERY_APPLY_DELAY "Waiting to apply WAL during recovery because of a delay setting." RECOVERY_RETRIEVE_RETRY_INTERVAL "Waiting during recovery when WAL data is not available from any source (pg_wal, archive or stream)." @@ -278,12 +280,12 @@ WAL_WRITE "Waiting for a write to a WAL file." ABI_compatibility: # -# Wait Events - Buffer Pin +# Wait Events - Buffer # -Section: ClassName - WaitEventBufferPin +Section: ClassName - WaitEventBuffer -BUFFER_PIN "Waiting to acquire an exclusive pin on a buffer." +BUFFER_CLEANUP "Waiting to acquire an exclusive pin on a buffer. Buffer pin waits can be protracted if another process holds an open cursor that last read data from the buffer in question." ABI_compatibility: @@ -303,9 +305,12 @@ ABI_compatibility: # This class of wait events has its own set of C structure, so these are # only used for the documentation. # -# NB: Predefined LWLocks (i.e., those declared in lwlocklist.h) must be -# listed in the top section of locks and must be listed in the same order as in -# lwlocklist.h. +# NB: Predefined LWLocks (i.e., those declared with PG_LWLOCK in lwlocklist.h) +# must be listed before the "END OF PREDEFINED LWLOCKS" comment and must be +# listed in the same order as in lwlocklist.h. Likewise, the built-in LWLock +# tranches (i.e., those declared with PG_LWLOCKTRANCHE in lwlocklist.h) must be +# listed after the "END OF PREDEFINED LWLOCKS" comment and must be listed in +# the same order as lwlocklist.h. # Section: ClassName - WaitEventLWLock @@ -316,6 +321,7 @@ XidGen "Waiting to allocate a new transaction ID." 
ProcArray "Waiting to access the shared per-process data structures (typically, to get a snapshot or report a session's transaction ID)." SInvalRead "Waiting to retrieve messages from the shared catalog invalidation queue." SInvalWrite "Waiting to add a message to the shared catalog invalidation queue." +WALBufMapping "Waiting to replace a page in WAL buffers." WALWrite "Waiting for WAL buffers to be written to disk." ControlFile "Waiting to read or update the pg_control file or create a new WAL file." MultiXactGen "Waiting to read or update shared multixact state." @@ -352,14 +358,12 @@ DSMRegistry "Waiting to read or update the dynamic shared memory registry." InjectionPoint "Waiting to read or update information related to injection points." SerialControl "Waiting to read or update shared pg_serial state." AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." +WaitLSN "Waiting to read or update shared Wait-for-LSN state." +LogicalDecodingControl "Waiting to read or update logical decoding status information." # # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) # -# Predefined LWLocks (i.e., those declared in lwlocknames.h) must be listed -# in the section above and must be listed in the same order as in -# lwlocknames.h. Other LWLocks must be listed in the section below. -# XactBuffer "Waiting for I/O on a transaction status SLRU buffer." CommitTsBuffer "Waiting for I/O on a commit timestamp SLRU buffer." @@ -401,6 +405,7 @@ SerialSLRU "Waiting to access the serializable transaction conflict SLRU cache." SubtransSLRU "Waiting to access the sub-transaction SLRU cache." XactSLRU "Waiting to access the transaction status SLRU cache." ParallelVacuumDSA "Waiting for parallel vacuum dynamic shared memory allocation." +AioUringCompletion "Waiting for another process to complete IO via io_uring." # No "ABI_compatibility" region here as WaitEventLWLock has its own C code. diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 4a233b63c3280..ba40ada11cafa 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -23,6 +23,7 @@ OBJS = \ arrayutils.o \ ascii.o \ bool.o \ + bytea.o \ cash.o \ char.o \ cryptohashfuncs.o \ @@ -67,6 +68,7 @@ OBJS = \ misc.o \ multirangetypes.o \ multirangetypes_selfuncs.o \ + multixactfuncs.o \ name.o \ network.o \ network_gist.o \ @@ -78,11 +80,13 @@ OBJS = \ oracle_compat.o \ orderedsetaggs.o \ partitionfuncs.o \ + pg_dependencies.o \ pg_locale.o \ pg_locale_builtin.o \ pg_locale_icu.o \ pg_locale_libc.o \ pg_lsn.o \ + pg_ndistinct.o \ pg_upgrade_support.o \ pgstatfuncs.o \ pseudorandomfuncs.o \ diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c index ca3c5ee3df3ae..05d48412f827d 100644 --- a/src/backend/utils/adt/acl.c +++ b/src/backend/utils/adt/acl.c @@ -31,7 +31,6 @@ #include "catalog/pg_proc.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" -#include "commands/dbcommands.h" #include "commands/proclang.h" #include "commands/tablespace.h" #include "common/hashfn.h" @@ -134,6 +133,22 @@ static AclResult pg_role_aclcheck(Oid role_oid, Oid roleid, AclMode mode); static void RoleMembershipCacheCallback(Datum arg, int cacheid, uint32 hashvalue); +/* + * Test whether an identifier char can be left unquoted in ACLs. + * + * Formerly, we used isalnum() even on non-ASCII characters, resulting in + * unportable behavior. 
To ensure dump compatibility with old versions, + * we now treat high-bit-set characters as always requiring quoting during + * putid(), but getid() will always accept them without quotes. + */ +static inline bool +is_safe_acl_char(unsigned char c, bool is_getid) +{ + if (IS_HIGHBIT_SET(c)) + return is_getid; + return isalnum(c) || c == '_'; +} + /* * getid * Consumes the first alphanumeric string (identifier) found in string @@ -159,21 +174,22 @@ getid(const char *s, char *n, Node *escontext) while (isspace((unsigned char) *s)) s++; - /* This code had better match what putid() does, below */ for (; *s != '\0' && - (isalnum((unsigned char) *s) || - *s == '_' || - *s == '"' || - in_quotes); + (in_quotes || *s == '"' || is_safe_acl_char(*s, true)); s++) { if (*s == '"') { + if (!in_quotes) + { + in_quotes = true; + continue; + } /* safe to look at next char (could be '\0' though) */ if (*(s + 1) != '"') { - in_quotes = !in_quotes; + in_quotes = false; continue; } /* it's an escaped double quote; skip the escaping char */ @@ -207,10 +223,10 @@ putid(char *p, const char *s) const char *src; bool safe = true; + /* Detect whether we need to use double quotes */ for (src = s; *src; src++) { - /* This test had better match what getid() does, above */ - if (!isalnum((unsigned char) *src) && *src != '_') + if (!is_safe_acl_char(*src, false)) { safe = false; break; @@ -602,7 +618,7 @@ aclitemin(PG_FUNCTION_ARGS) Node *escontext = fcinfo->context; AclItem *aip; - aip = (AclItem *) palloc(sizeof(AclItem)); + aip = palloc_object(AclItem); s = aclparse(s, aip, escontext); if (s == NULL) @@ -1645,7 +1661,7 @@ makeaclitem(PG_FUNCTION_ARGS) priv = convert_any_priv_string(privtext, any_priv_map); - result = (AclItem *) palloc(sizeof(AclItem)); + result = palloc_object(AclItem); result->ai_grantee = grantee; result->ai_grantor = grantor; @@ -1805,7 +1821,7 @@ aclexplode(PG_FUNCTION_ARGS) funcctx->tuple_desc = BlessTupleDesc(tupdesc); /* allocate memory for user context */ - idx = (int *) palloc(sizeof(int[2])); + idx = palloc_array(int, 2); idx[0] = 0; /* ACL array item index */ idx[1] = -1; /* privilege type counter */ funcctx->user_fctx = idx; @@ -5143,7 +5159,7 @@ roles_is_member_of(Oid roleid, enum RoleRecurseType type, MemoryContext oldctx; bloom_filter *bf = NULL; - Assert(OidIsValid(admin_of) == PointerIsValid(admin_role)); + Assert(OidIsValid(admin_of) == (admin_role != NULL)); if (admin_role != NULL) *admin_role = InvalidOid; diff --git a/src/backend/utils/adt/array_expanded.c b/src/backend/utils/adt/array_expanded.c index fc036d1eb3007..23627114f57cf 100644 --- a/src/backend/utils/adt/array_expanded.c +++ b/src/backend/utils/adt/array_expanded.c @@ -271,8 +271,8 @@ EA_get_flat_size(ExpandedObjectHeader *eohptr) if (!AllocSizeIsValid(nbytes)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxAllocSize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxAllocSize))); } if (dnulls) diff --git a/src/backend/utils/adt/array_selfuncs.c b/src/backend/utils/adt/array_selfuncs.c index a69a84c2aee33..cd201461c172f 100644 --- a/src/backend/utils/adt/array_selfuncs.c +++ b/src/backend/utils/adt/array_selfuncs.c @@ -39,25 +39,25 @@ static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval, Oid elemtype, Oid operator); -static Selectivity mcelem_array_selec(ArrayType *array, +static Selectivity mcelem_array_selec(const ArrayType *array, TypeCacheEntry *typentry, - Datum *mcelem, int nmcelem, - float4 
*numbers, int nnumbers, - float4 *hist, int nhist, + const Datum *mcelem, int nmcelem, + const float4 *numbers, int nnumbers, + const float4 *hist, int nhist, Oid operator); -static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, - float4 *numbers, int nnumbers, - Datum *array_data, int nitems, +static Selectivity mcelem_array_contain_overlap_selec(const Datum *mcelem, int nmcelem, + const float4 *numbers, int nnumbers, + const Datum *array_data, int nitems, Oid operator, TypeCacheEntry *typentry); -static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem, - float4 *numbers, int nnumbers, - Datum *array_data, int nitems, - float4 *hist, int nhist, +static Selectivity mcelem_array_contained_selec(const Datum *mcelem, int nmcelem, + const float4 *numbers, int nnumbers, + const Datum *array_data, int nitems, + const float4 *hist, int nhist, Oid operator, TypeCacheEntry *typentry); static float *calc_hist(const float4 *hist, int nhist, int n); static float *calc_distr(const float *p, int n, int m, float rest); static int floor_log2(uint32 n); -static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, +static bool find_next_mcelem(const Datum *mcelem, int nmcelem, Datum value, int *index, TypeCacheEntry *typentry); static int element_compare(const void *key1, const void *key2, void *arg); static int float_compare_desc(const void *key1, const void *key2); @@ -425,10 +425,10 @@ calc_arraycontsel(VariableStatData *vardata, Datum constval, * mcelem_array_contained_selec depending on the operator. */ static Selectivity -mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry, - Datum *mcelem, int nmcelem, - float4 *numbers, int nnumbers, - float4 *hist, int nhist, +mcelem_array_selec(const ArrayType *array, TypeCacheEntry *typentry, + const Datum *mcelem, int nmcelem, + const float4 *numbers, int nnumbers, + const float4 *hist, int nhist, Oid operator) { Selectivity selec; @@ -518,9 +518,9 @@ mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry, * fraction of nonempty arrays in the column. */ static Selectivity -mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, - float4 *numbers, int nnumbers, - Datum *array_data, int nitems, +mcelem_array_contain_overlap_selec(const Datum *mcelem, int nmcelem, + const float4 *numbers, int nnumbers, + const Datum *array_data, int nitems, Oid operator, TypeCacheEntry *typentry) { Selectivity selec, @@ -544,12 +544,15 @@ mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, if (numbers) { - /* Grab the lowest observed frequency */ + /* Grab the minimal MCE frequency */ minfreq = numbers[nmcelem]; } else { - /* Without statistics make some default assumptions */ + /* + * Without statistics, use DEFAULT_CONTAIN_SEL (the factor of 2 will + * be removed again below). + */ minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL; } @@ -621,8 +624,11 @@ mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, else { /* - * The element is not in MCELEM. Punt, but assume that the - * selectivity cannot be more than minfreq / 2. + * The element is not in MCELEM. Estimate its frequency as half + * that of the least-frequent MCE. (We know it cannot be more + * than minfreq, and it could be a great deal less. Half seems + * like a good compromise.) For probably-historical reasons, + * clamp to not more than DEFAULT_CONTAIN_SEL. */ elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2); } @@ -693,10 +699,10 @@ mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, * ... 
* fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m */ static Selectivity -mcelem_array_contained_selec(Datum *mcelem, int nmcelem, - float4 *numbers, int nnumbers, - Datum *array_data, int nitems, - float4 *hist, int nhist, +mcelem_array_contained_selec(const Datum *mcelem, int nmcelem, + const float4 *numbers, int nnumbers, + const Datum *array_data, int nitems, + const float4 *hist, int nhist, Oid operator, TypeCacheEntry *typentry) { int mcelem_index, @@ -728,7 +734,7 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem, /* * Grab some of the summary statistics that compute_array_stats() stores: - * lowest frequency, frequency of null elements, and average distinct + * lowest MCE frequency, frequency of null elements, and average distinct * element count. */ minfreq = numbers[nmcelem]; @@ -753,7 +759,7 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem, * elem_selec is array of estimated frequencies for elements in the * constant. */ - elem_selec = (float *) palloc(sizeof(float) * nitems); + elem_selec = palloc_array(float, nitems); /* Scan mcelem and array in parallel. */ mcelem_index = 0; @@ -802,8 +808,11 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem, else { /* - * The element is not in MCELEM. Punt, but assume that the - * selectivity cannot be more than minfreq / 2. + * The element is not in MCELEM. Estimate its frequency as half + * that of the least-frequent MCE. (We know it cannot be more + * than minfreq, and it could be a great deal less. Half seems + * like a good compromise.) For probably-historical reasons, + * clamp to not more than DEFAULT_CONTAIN_SEL. */ elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL, minfreq / 2); @@ -927,7 +936,7 @@ calc_hist(const float4 *hist, int nhist, int n) next_interval; float frac; - hist_part = (float *) palloc((n + 1) * sizeof(float)); + hist_part = palloc_array(float, n + 1); /* * frac is a probability contribution for each interval between histogram @@ -1019,8 +1028,8 @@ calc_distr(const float *p, int n, int m, float rest) * Since we return only the last row of the matrix and need only the * current and previous row for calculations, allocate two rows. */ - row = (float *) palloc((m + 1) * sizeof(float)); - prev_row = (float *) palloc((m + 1) * sizeof(float)); + row = palloc_array(float, m + 1); + prev_row = palloc_array(float, m + 1); /* M[0,0] = 1 */ row[0] = 1.0f; @@ -1127,7 +1136,7 @@ floor_log2(uint32 n) * exact match.) */ static bool -find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index, +find_next_mcelem(const Datum *mcelem, int nmcelem, Datum value, int *index, TypeCacheEntry *typentry) { int l = *index, diff --git a/src/backend/utils/adt/array_typanalyze.c b/src/backend/utils/adt/array_typanalyze.c index 6f61629b9778d..61aedd31ff15b 100644 --- a/src/backend/utils/adt/array_typanalyze.c +++ b/src/backend/utils/adt/array_typanalyze.c @@ -132,7 +132,7 @@ array_typanalyze(PG_FUNCTION_ARGS) PG_RETURN_BOOL(true); /* Store our findings for use by compute_array_stats() */ - extra_data = (ArrayAnalyzeExtraData *) palloc(sizeof(ArrayAnalyzeExtraData)); + extra_data = palloc_object(ArrayAnalyzeExtraData); extra_data->type_id = typentry->type_id; extra_data->eq_opr = typentry->eq_opr; extra_data->coll_id = stats->attrcollid; /* collation we should use */ @@ -461,7 +461,7 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, /* * Construct an array of the interesting hashtable items, that is, * those meeting the cutoff frequency (s - epsilon)*N. 
Also identify - * the minimum and maximum frequencies among these items. + * the maximum frequency among these items. * * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff * frequency is 9*N / bucket_width. @@ -469,18 +469,16 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, cutoff_freq = 9 * element_no / bucket_width; i = hash_get_num_entries(elements_tab); /* surely enough space */ - sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i); + sort_table = palloc_array(TrackItem *, i); hash_seq_init(&scan_status, elements_tab); track_len = 0; - minfreq = element_no; maxfreq = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { if (item->frequency > cutoff_freq) { sort_table[track_len++] = item; - minfreq = Min(minfreq, item->frequency); maxfreq = Max(maxfreq, item->frequency); } } @@ -497,19 +495,38 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, * If we obtained more elements than we really want, get rid of those * with least frequencies. The easiest way is to qsort the array into * descending frequency order and truncate the array. + * + * If we did not find more elements than we want, then it is safe to + * assume that the stored MCE array will contain every element with + * frequency above the cutoff. In that case, rather than storing the + * smallest frequency we are keeping, we want to store the minimum + * frequency that would have been accepted as a valid MCE. The + * selectivity functions can assume that that is an upper bound on the + * frequency of elements not present in the array. + * + * If we found no candidate MCEs at all, we still want to record the + * cutoff frequency, since it's still valid to assume that no element + * has frequency more than that. */ if (num_mcelem < track_len) { qsort_interruptible(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc, NULL); - /* reset minfreq to the smallest frequency we're keeping */ + /* set minfreq to the smallest frequency we're keeping */ minfreq = sort_table[num_mcelem - 1]->frequency; } else + { num_mcelem = track_len; + /* set minfreq to the minimum frequency above the cutoff */ + minfreq = cutoff_freq + 1; + /* ensure maxfreq is nonzero, too */ + if (track_len == 0) + maxfreq = minfreq; + } /* Generate MCELEM slot entry */ - if (num_mcelem > 0) + if (num_mcelem >= 0) { MemoryContext old_context; Datum *mcelem_values; @@ -589,8 +606,7 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, * Create an array of DECountItem pointers, and sort them into * increasing count order. 
*/ - sorted_count_items = (DECountItem **) - palloc(sizeof(DECountItem *) * count_items_count); + sorted_count_items = palloc_array(DECountItem *, count_items_count); hash_seq_init(&scan_status, count_tab); j = 0; while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL) diff --git a/src/backend/utils/adt/array_userfuncs.c b/src/backend/utils/adt/array_userfuncs.c index 8eb342e33823a..c01bf46612138 100644 --- a/src/backend/utils/adt/array_userfuncs.c +++ b/src/backend/utils/adt/array_userfuncs.c @@ -433,8 +433,8 @@ array_cat(PG_FUNCTION_ARGS) * themselves) of the input argument arrays */ ndims = ndims1; - dims = (int *) palloc(ndims * sizeof(int)); - lbs = (int *) palloc(ndims * sizeof(int)); + dims = palloc_array(int, ndims); + lbs = palloc_array(int, ndims); dims[0] = dims1[0] + dims2[0]; lbs[0] = lbs1[0]; @@ -459,8 +459,8 @@ array_cat(PG_FUNCTION_ARGS) * the first argument inserted at the front of the outer dimension */ ndims = ndims2; - dims = (int *) palloc(ndims * sizeof(int)); - lbs = (int *) palloc(ndims * sizeof(int)); + dims = palloc_array(int, ndims); + lbs = palloc_array(int, ndims); memcpy(dims, dims2, ndims * sizeof(int)); memcpy(lbs, lbs2, ndims * sizeof(int)); @@ -487,8 +487,8 @@ array_cat(PG_FUNCTION_ARGS) * second argument appended to the end of the outer dimension */ ndims = ndims1; - dims = (int *) palloc(ndims * sizeof(int)); - lbs = (int *) palloc(ndims * sizeof(int)); + dims = palloc_array(int, ndims); + lbs = palloc_array(int, ndims); memcpy(dims, dims1, ndims * sizeof(int)); memcpy(lbs, lbs1, ndims * sizeof(int)); diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index c8f53c6fbe788..bf54655bb964b 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -17,6 +17,7 @@ #include <ctype.h> #include <math.h> +#include "access/transam.h" #include "catalog/pg_type.h" #include "common/int.h" #include "funcapi.h" @@ -86,7 +87,7 @@ typedef struct ArrayIteratorData /* current position information, updated on each iteration */ char *data_ptr; /* our current position in the array */ int current_item; /* the item # we're at in the array */ -} ArrayIteratorData; +} ArrayIteratorData; static bool ReadArrayDimensions(char **srcptr, int *ndim_p, int *dim, int *lBound, @@ -332,8 +333,8 @@ array_in(PG_FUNCTION_ARGS) if (!AllocSizeIsValid(nbytes)) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxAllocSize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxAllocSize))); } } if (hasnulls) @@ -491,8 +492,8 @@ ReadArrayDimensions(char **srcptr, int *ndim_p, int *dim, int *lBound, pg_add_s32_overflow(ub, 1, &ub)) ereturn(escontext, false, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxArraySize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxArraySize))); dim[ndim] = ub; ndim++; @@ -724,8 +725,8 @@ ReadArrayStr(char **srcptr, if (maxitems >= MaxArraySize) ereturn(escontext, false, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxArraySize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxArraySize))); maxitems = Min(maxitems * 2, MaxArraySize); values = repalloc_array(values, Datum, maxitems); nulls = repalloc_array(nulls, bool, maxitems); @@ -959,8 +960,8 @@ ReadArrayToken(char **srcptr, StringInfo elembuf, char typdelim, */ void CopyArrayEls(ArrayType *array, -
Datum *values, - bool *nulls, + const Datum *values, + const bool *nulls, int nitems, int typlen, bool typbyval, @@ -1530,8 +1531,8 @@ ReadArrayBinary(StringInfo buf, if (!AllocSizeIsValid(totbytes)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxAllocSize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxAllocSize))); } } *hasnulls = hasnull; @@ -2256,7 +2257,7 @@ array_set_element(Datum arraydatum, resultarray = (char *) palloc(arraytyplen); memcpy(resultarray, DatumGetPointer(arraydatum), arraytyplen); - elt_ptr = (char *) resultarray + indx[0] * elmlen; + elt_ptr = resultarray + indx[0] * elmlen; ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalign, elt_ptr); return PointerGetDatum(resultarray); } @@ -2338,8 +2339,8 @@ array_set_element(Datum arraydatum, pg_add_s32_overflow(dim[0], addedbefore, &dim[0])) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxArraySize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxArraySize))); lb[0] = indx[0]; if (addedbefore > 1) newhasnulls = true; /* will insert nulls */ @@ -2353,8 +2354,8 @@ array_set_element(Datum arraydatum, pg_add_s32_overflow(dim[0], addedafter, &dim[0])) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxArraySize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxArraySize))); if (addedafter > 1) newhasnulls = true; /* will insert nulls */ } @@ -2417,7 +2418,7 @@ array_set_element(Datum arraydatum, olditemlen = att_addlength_pointer(0, elmlen, elt_ptr); olditemlen = att_align_nominal(olditemlen, elmalign); } - lenafter = (int) (olddatasize - lenbefore - olditemlen); + lenafter = olddatasize - lenbefore - olditemlen; } if (isNull) @@ -2615,8 +2616,8 @@ array_set_element_expanded(Datum arraydatum, pg_add_s32_overflow(dim[0], addedbefore, &dim[0])) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxArraySize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxArraySize))); lb[0] = indx[0]; dimschanged = true; if (addedbefore > 1) @@ -2631,8 +2632,8 @@ array_set_element_expanded(Datum arraydatum, pg_add_s32_overflow(dim[0], addedafter, &dim[0])) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxArraySize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxArraySize))); dimschanged = true; if (addedafter > 1) newhasnulls = true; /* will insert nulls */ @@ -2892,8 +2893,8 @@ array_set_slice(Datum arraydatum, pg_add_s32_overflow(dim[i], 1, &dim[i])) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxArraySize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxArraySize))); lb[i] = lowerIndx[i]; } @@ -2946,8 +2947,8 @@ array_set_slice(Datum arraydatum, pg_add_s32_overflow(dim[0], addedbefore, &dim[0])) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxArraySize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxArraySize))); lb[0] = lowerIndx[0]; if (addedbefore > 1) newhasnulls = true; /* will insert nulls */ @@ -2961,8 +2962,8 @@ array_set_slice(Datum arraydatum, pg_add_s32_overflow(dim[0], addedafter, &dim[0])) 
ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxArraySize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxArraySize))); if (addedafter > 1) newhasnulls = true; /* will insert nulls */ } @@ -3302,8 +3303,8 @@ array_map(Datum arrayd, if (!AllocSizeIsValid(nbytes)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxAllocSize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxAllocSize))); } } @@ -3406,7 +3407,7 @@ construct_array_builtin(Datum *elems, int nelems, Oid elmtype) case FLOAT8OID: elmlen = sizeof(float8); - elmbyval = FLOAT8PASSBYVAL; + elmbyval = true; elmalign = TYPALIGN_DOUBLE; break; @@ -3424,7 +3425,7 @@ construct_array_builtin(Datum *elems, int nelems, Oid elmtype) case INT8OID: elmlen = sizeof(int64); - elmbyval = FLOAT8PASSBYVAL; + elmbyval = true; elmalign = TYPALIGN_DOUBLE; break; @@ -3542,8 +3543,8 @@ construct_md_array(Datum *elems, if (!AllocSizeIsValid(nbytes)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxAllocSize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxAllocSize))); } /* Allocate and initialize result array */ @@ -3581,7 +3582,7 @@ construct_empty_array(Oid elmtype) { ArrayType *result; - result = (ArrayType *) palloc0(sizeof(ArrayType)); + result = palloc0_object(ArrayType); SET_VARSIZE(result, sizeof(ArrayType)); result->ndim = 0; result->dataoffset = 0; @@ -3628,7 +3629,7 @@ construct_empty_expanded_array(Oid element_type, * to hard-wire values if the element type is hard-wired. */ void -deconstruct_array(ArrayType *array, +deconstruct_array(const ArrayType *array, Oid elmtype, int elmlen, bool elmbyval, char elmalign, Datum **elemsp, bool **nullsp, int *nelemsp) @@ -3644,9 +3645,9 @@ deconstruct_array(ArrayType *array, Assert(ARR_ELEMTYPE(array) == elmtype); nelems = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - *elemsp = elems = (Datum *) palloc(nelems * sizeof(Datum)); + *elemsp = elems = palloc_array(Datum, nelems); if (nullsp) - *nullsp = nulls = (bool *) palloc0(nelems * sizeof(bool)); + *nullsp = nulls = palloc0_array(bool, nelems); else nulls = NULL; *nelemsp = nelems; @@ -3694,7 +3695,7 @@ deconstruct_array(ArrayType *array, * useful when manipulating arrays from/for system catalogs. */ void -deconstruct_array_builtin(ArrayType *array, +deconstruct_array_builtin(const ArrayType *array, Oid elmtype, Datum **elemsp, bool **nullsp, int *nelemsp) { @@ -3718,7 +3719,7 @@ deconstruct_array_builtin(ArrayType *array, case FLOAT8OID: elmlen = sizeof(float8); - elmbyval = FLOAT8PASSBYVAL; + elmbyval = true; elmalign = TYPALIGN_DOUBLE; break; @@ -3764,7 +3765,7 @@ deconstruct_array_builtin(ArrayType *array, * if the array *might* contain a null. */ bool -array_contains_nulls(ArrayType *array) +array_contains_nulls(const ArrayType *array) { int nelems; bits8 *bitmap; @@ -4208,7 +4209,7 @@ hash_array(PG_FUNCTION_ARGS) * modify typentry, since that points directly into the type * cache. 
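
The mechanical palloc-to-palloc_array conversions throughout these hunks rely on the type-safe wrapper macros in src/include/utils/palloc.h. Sketched here from memory (check the header for the authoritative definitions) — they expand to the same palloc calls as before, with the element type written only once:

#define palloc_object(type)        ((type *) palloc(sizeof(type)))
#define palloc0_object(type)       ((type *) palloc0(sizeof(type)))
#define palloc_array(type, count)  ((type *) palloc(sizeof(type) * (count)))
#define palloc0_array(type, count) ((type *) palloc0(sizeof(type) * (count)))

/* so each such hunk is a no-op at runtime: */
dims = (int *) palloc(ndims * sizeof(int));   /* before */
dims = palloc_array(int, ndims);              /* after  */
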
*/ - record_typentry = palloc0(sizeof(*record_typentry)); + record_typentry = palloc0_object(TypeCacheEntry); record_typentry->type_id = element_type; /* fill in what we need below */ @@ -4596,12 +4597,12 @@ arraycontained(PG_FUNCTION_ARGS) ArrayIterator array_create_iterator(ArrayType *arr, int slice_ndim, ArrayMetaState *mstate) { - ArrayIterator iterator = palloc0(sizeof(ArrayIteratorData)); + ArrayIterator iterator = palloc0_object(ArrayIteratorData); /* * Sanity-check inputs --- caller should have got this right already */ - Assert(PointerIsValid(arr)); + Assert(arr); if (slice_ndim < 0 || slice_ndim > ARR_NDIM(arr)) elog(ERROR, "invalid arguments to array_create_iterator"); @@ -5374,8 +5375,8 @@ accumArrayResult(ArrayBuildState *astate, if (!AllocSizeIsValid(astate->alen * sizeof(Datum))) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxAllocSize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxAllocSize))); astate->dvalues = (Datum *) repalloc(astate->dvalues, astate->alen * sizeof(Datum)); astate->dnulls = (bool *) @@ -5686,7 +5687,7 @@ accumArrayResultArr(ArrayBuildStateArr *astate, MemoryContextSwitchTo(oldcontext); /* Release detoasted copy if any */ - if ((Pointer) arg != DatumGetPointer(dvalue)) + if (arg != DatumGetPointer(dvalue)) pfree(arg); return astate; @@ -5943,7 +5944,7 @@ generate_subscripts(PG_FUNCTION_ARGS) * switch to memory context appropriate for multiple function calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - fctx = (generate_subscripts_fctx *) palloc(sizeof(generate_subscripts_fctx)); + fctx = palloc_object(generate_subscripts_fctx); lb = AARR_LBOUND(v); dimv = AARR_DIMS(v); @@ -6213,8 +6214,8 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs, !AllocSizeIsValid(totbytes)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxAllocSize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxAllocSize))); /* * This addition can't overflow, but it might cause us to go past @@ -6290,7 +6291,7 @@ array_unnest(PG_FUNCTION_ARGS) arr = PG_GETARG_ANY_ARRAY_P(0); /* allocate memory for user context */ - fctx = (array_unnest_fctx *) palloc(sizeof(array_unnest_fctx)); + fctx = palloc_object(array_unnest_fctx); /* initialize state */ array_iter_setup(&fctx->iter, arr); @@ -6557,8 +6558,8 @@ array_replace_internal(ArrayType *array, if (!AllocSizeIsValid(nbytes)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("array size exceeds the maximum allowed (%d)", - (int) MaxAllocSize))); + errmsg("array size exceeds the maximum allowed (%zu)", + MaxAllocSize))); } nresult++; } diff --git a/src/backend/utils/adt/arraysubs.c b/src/backend/utils/adt/arraysubs.c index 2940fb8e8d737..f280212fd8f85 100644 --- a/src/backend/utils/adt/arraysubs.c +++ b/src/backend/utils/adt/arraysubs.c @@ -140,7 +140,7 @@ array_subscript_transform(SubscriptingRef *sbsref, upperIndexpr = lappend(upperIndexpr, subexpr); } - /* ... and store the transformed lists into the SubscriptRef node */ + /* ... and store the transformed lists into the SubscriptingRef node */ sbsref->refupperindexpr = upperIndexpr; sbsref->reflowerindexpr = lowerIndexpr; @@ -497,7 +497,7 @@ array_exec_setup(const SubscriptingRef *sbsref, /* * Allocate type-specific workspace. 
*/ - workspace = (ArraySubWorkspace *) palloc(sizeof(ArraySubWorkspace)); + workspace = palloc_object(ArraySubWorkspace); sbsrefstate->workspace = workspace; /* diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c new file mode 100644 index 0000000000000..f8524548e4671 --- /dev/null +++ b/src/backend/utils/adt/bytea.c @@ -0,0 +1,1342 @@ +/*------------------------------------------------------------------------- + * + * bytea.c + * Functions for the bytea type. + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/utils/adt/bytea.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "common/hashfn.h" +#include "common/int.h" +#include "fmgr.h" +#include "lib/hyperloglog.h" +#include "libpq/pqformat.h" +#include "port/pg_bitutils.h" +#include "port/pg_bswap.h" +#include "utils/builtins.h" +#include "utils/bytea.h" +#include "utils/fmgrprotos.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/sortsupport.h" +#include "varatt.h" + +/* GUC variable */ +int bytea_output = BYTEA_OUTPUT_HEX; + +static bytea *bytea_catenate(bytea *t1, bytea *t2); +static bytea *bytea_substring(Datum str, int S, int L, + bool length_not_specified); +static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); + +typedef struct +{ + bool abbreviate; /* Should we abbreviate keys? */ + hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ + hyperLogLogState full_card; /* Full key cardinality state */ + double prop_card; /* Required cardinality proportion */ +} ByteaSortSupport; + +/* Static function declarations for sort support */ +static int byteafastcmp(Datum x, Datum y, SortSupport ssup); +static Datum bytea_abbrev_convert(Datum original, SortSupport ssup); +static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup); + +/* + * bytea_catenate + * Guts of byteacat(), broken out so it can be used by other functions + * + * Arguments can be in short-header form, but not compressed or out-of-line + */ +static bytea * +bytea_catenate(bytea *t1, bytea *t2) +{ + bytea *result; + int len1, + len2, + len; + char *ptr; + + len1 = VARSIZE_ANY_EXHDR(t1); + len2 = VARSIZE_ANY_EXHDR(t2); + + /* paranoia ... probably should throw error instead? */ + if (len1 < 0) + len1 = 0; + if (len2 < 0) + len2 = 0; + + len = len1 + len2 + VARHDRSZ; + result = (bytea *) palloc(len); + + /* Set size of result string... */ + SET_VARSIZE(result, len); + + /* Fill data field of result string... */ + ptr = VARDATA(result); + if (len1 > 0) + memcpy(ptr, VARDATA_ANY(t1), len1); + if (len2 > 0) + memcpy(ptr + len1, VARDATA_ANY(t2), len2); + + return result; +} + +#define PG_STR_GET_BYTEA(str_) \ + DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_))) + +static bytea * +bytea_substring(Datum str, + int S, + int L, + bool length_not_specified) +{ + int32 S1; /* adjusted start position */ + int32 L1; /* adjusted substring length */ + int32 E; /* end position */ + + /* + * The logic here should generally match text_substring(). + */ + S1 = Max(S, 1); + + if (length_not_specified) + { + /* + * Not passed a length - DatumGetByteaPSlice() grabs everything to the + * end of the string if we pass it a negative value for length. 
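
The clamping logic in the branches that follow is easiest to see with concrete inputs. A standalone sketch of the same arithmetic (values made up; the real function additionally guards S + L against overflow):

#include <stdio.h>

/* illustrative reimplementation of bytea_substring's position clamping */
static void clamp(int S, int L)
{
    int S1 = (S > 1) ? S : 1;   /* Max(S, 1): adjusted start */
    int E = S + L;              /* end position */

    if (E < 1)
        printf("S=%d L=%d -> zero-length result per SQL99\n", S, L);
    else
        printf("S=%d L=%d -> start=%d len=%d\n", S, L, S1, E - S1);
}

int main(void)
{
    clamp(0, 2);    /* start=1 len=1: a non-positive start eats into L */
    clamp(-2, 2);   /* E=0, so the result is empty */
    return 0;
}
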
+ */ + L1 = -1; + } + else if (L < 0) + { + /* SQL99 says to throw an error for E < S, i.e., negative length */ + ereport(ERROR, + (errcode(ERRCODE_SUBSTRING_ERROR), + errmsg("negative substring length not allowed"))); + L1 = -1; /* silence stupider compilers */ + } + else if (pg_add_s32_overflow(S, L, &E)) + { + /* + * L could be large enough for S + L to overflow, in which case the + * substring must run to end of string. + */ + L1 = -1; + } + else + { + /* + * A zero or negative value for the end position can happen if the + * start was negative or one. SQL99 says to return a zero-length + * string. + */ + if (E < 1) + return PG_STR_GET_BYTEA(""); + + L1 = E - S1; + } + + /* + * If the start position is past the end of the string, SQL99 says to + * return a zero-length string -- DatumGetByteaPSlice() will do that for + * us. We need only convert S1 to zero-based starting position. + */ + return DatumGetByteaPSlice(str, S1 - 1, L1); +} + +static bytea * +bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) +{ + bytea *result; + bytea *s1; + bytea *s2; + int sp_pl_sl; + + /* + * Check for possible integer-overflow cases. For negative sp, throw a + * "substring length" error because that's what should be expected + * according to the spec's definition of OVERLAY(). + */ + if (sp <= 0) + ereport(ERROR, + (errcode(ERRCODE_SUBSTRING_ERROR), + errmsg("negative substring length not allowed"))); + if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range"))); + + s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false); + s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); + result = bytea_catenate(s1, t2); + result = bytea_catenate(result, s2); + + return result; +} + +/***************************************************************************** + * USER I/O ROUTINES * + *****************************************************************************/ + +#define VAL(CH) ((CH) - '0') +#define DIG(VAL) ((VAL) + '0') + +/* + * byteain - converts from printable representation of byte array + * + * Non-printable characters must be passed as '\nnn' (octal) and are + * converted to internal form. '\' must be passed as '\\'. 
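
Because the first octal digit is restricted to 0..3, a decoded escape always fits in one byte (at most \377 = 255). A self-contained sketch of the same shift-and-add arithmetic used in the loop below, for the input "\101":

#include <stdio.h>

int main(void)
{
    const char *tp = "\\101";   /* the four characters: backslash, 1, 0, 1 */
    int v;

    v = tp[1] - '0';            /* VAL(tp[1]) = 1 */
    v <<= 3;
    v += tp[2] - '0';           /* now 8 */
    v <<= 3;
    v += tp[3] - '0';           /* 1*64 + 0*8 + 1 = 65 */
    printf("%c\n", v);          /* prints 'A' */
    return 0;
}
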
+ */ +Datum +byteain(PG_FUNCTION_ARGS) +{ + char *inputText = PG_GETARG_CSTRING(0); + Node *escontext = fcinfo->context; + size_t len = strlen(inputText); + size_t bc; + char *tp; + char *rp; + bytea *result; + + /* Recognize hex input */ + if (inputText[0] == '\\' && inputText[1] == 'x') + { + bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ + result = palloc(bc); + bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), + escontext); + SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */ + + PG_RETURN_BYTEA_P(result); + } + + /* Else, it's the traditional escaped style */ + result = (bytea *) palloc(len + VARHDRSZ); /* maximum possible length */ + + tp = inputText; + rp = VARDATA(result); + while (*tp != '\0') + { + if (tp[0] != '\\') + *rp++ = *tp++; + else if ((tp[1] >= '0' && tp[1] <= '3') && + (tp[2] >= '0' && tp[2] <= '7') && + (tp[3] >= '0' && tp[3] <= '7')) + { + int v; + + v = VAL(tp[1]); + v <<= 3; + v += VAL(tp[2]); + v <<= 3; + *rp++ = v + VAL(tp[3]); + + tp += 4; + } + else if (tp[1] == '\\') + { + *rp++ = '\\'; + tp += 2; + } + else + { + /* + * one backslash, not followed by another or ### valid octal + */ + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "bytea"))); + } + } + + bc = rp - VARDATA(result); /* actual length */ + SET_VARSIZE(result, bc + VARHDRSZ); + + PG_RETURN_BYTEA_P(result); +} + +/* + * byteaout - converts to printable representation of byte array + * + * In the traditional escaped format, non-printable characters are + * printed as '\nnn' (octal) and '\' as '\\'. + */ +Datum +byteaout(PG_FUNCTION_ARGS) +{ + bytea *vlena = PG_GETARG_BYTEA_PP(0); + char *result; + char *rp; + + if (bytea_output == BYTEA_OUTPUT_HEX) + { + /* Print hex format */ + rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); + *rp++ = '\\'; + *rp++ = 'x'; + rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); + } + else if (bytea_output == BYTEA_OUTPUT_ESCAPE) + { + /* Print traditional escaped format */ + char *vp; + uint64 len; + int i; + + len = 1; /* empty string has 1 char */ + vp = VARDATA_ANY(vlena); + for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) + { + if (*vp == '\\') + len += 2; + else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) + len += 4; + else + len++; + } + + /* + * In principle len can't overflow uint32 if the input fit in 1GB, but + * for safety let's check rather than relying on palloc's internal + * check. 
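
The escape path sizes its output in a first pass before allocating, and the initial len = 1 pays for the terminating '\0' written at the very end. A worked example for the three bytes {0x00, '\\', 'a'}: the NUL costs four characters ("\000"), the backslash two ("\\"), and 'a' one, for a total allocation of 8.

#include <stdio.h>

int main(void)
{
    const unsigned char data[] = {0x00, '\\', 'a'};
    unsigned long len = 1;      /* room for the trailing '\0' */

    for (int i = 0; i < 3; i++)
    {
        if (data[i] == '\\')
            len += 2;           /* printed as \\ */
        else if (data[i] < 0x20 || data[i] > 0x7e)
            len += 4;           /* printed as \nnn */
        else
            len++;              /* printed as-is */
    }
    printf("%lu\n", len);       /* 1 + 4 + 2 + 1 = 8 */
    return 0;
}
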
+ */ + if (len > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("result of bytea output conversion is too large"))); + rp = result = (char *) palloc(len); + + vp = VARDATA_ANY(vlena); + for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) + { + if (*vp == '\\') + { + *rp++ = '\\'; + *rp++ = '\\'; + } + else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) + { + int val; /* holds unprintable chars */ + + val = *vp; + rp[0] = '\\'; + rp[3] = DIG(val & 07); + val >>= 3; + rp[2] = DIG(val & 07); + val >>= 3; + rp[1] = DIG(val & 03); + rp += 4; + } + else + *rp++ = *vp; + } + } + else + { + elog(ERROR, "unrecognized \"bytea_output\" setting: %d", + bytea_output); + rp = result = NULL; /* keep compiler quiet */ + } + *rp = '\0'; + PG_RETURN_CSTRING(result); +} + +/* + * bytearecv - converts external binary format to bytea + */ +Datum +bytearecv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + bytea *result; + int nbytes; + + nbytes = buf->len - buf->cursor; + result = (bytea *) palloc(nbytes + VARHDRSZ); + SET_VARSIZE(result, nbytes + VARHDRSZ); + pq_copymsgbytes(buf, VARDATA(result), nbytes); + PG_RETURN_BYTEA_P(result); +} + +/* + * byteasend - converts bytea to binary format + * + * This is a special case: just copy the input... + */ +Datum +byteasend(PG_FUNCTION_ARGS) +{ + bytea *vlena = PG_GETARG_BYTEA_P_COPY(0); + + PG_RETURN_BYTEA_P(vlena); +} + +Datum +bytea_string_agg_transfn(PG_FUNCTION_ARGS) +{ + StringInfo state; + + state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); + + /* Append the value unless null, preceding it with the delimiter. */ + if (!PG_ARGISNULL(1)) + { + bytea *value = PG_GETARG_BYTEA_PP(1); + bool isfirst = false; + + /* + * You might think we can just throw away the first delimiter, however + * we must keep it as we may be a parallel worker doing partial + * aggregation building a state to send to the main process. We need + * to keep the delimiter of every aggregation so that the combine + * function can properly join up the strings of two separately + * partially aggregated results. The first delimiter is only stripped + * off in the final function. To know how much to strip off the front + * of the string, we store the length of the first delimiter in the + * StringInfo's cursor field, which we don't otherwise need here. + */ + if (state == NULL) + { + MemoryContext aggcontext; + MemoryContext oldcontext; + + if (!AggCheckCallContext(fcinfo, &aggcontext)) + { + /* cannot be called directly because of internal-type argument */ + elog(ERROR, "bytea_string_agg_transfn called in non-aggregate context"); + } + + /* + * Create state in aggregate context. It'll stay there across + * subsequent calls. + */ + oldcontext = MemoryContextSwitchTo(aggcontext); + state = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + + isfirst = true; + } + + if (!PG_ARGISNULL(2)) + { + bytea *delim = PG_GETARG_BYTEA_PP(2); + + appendBinaryStringInfo(state, VARDATA_ANY(delim), + VARSIZE_ANY_EXHDR(delim)); + if (isfirst) + state->cursor = VARSIZE_ANY_EXHDR(delim); + } + + appendBinaryStringInfo(state, VARDATA_ANY(value), + VARSIZE_ANY_EXHDR(value)); + } + + /* + * The transition type for string_agg() is declared to be "internal", + * which is a pass-by-value type the same size as a pointer. 
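
A toy model of the cursor trick described in the transition function: every value is preceded by its delimiter, the first delimiter's length is remembered in the cursor, and only the final function strips it. (The real code uses StringInfo and bytea; this just shows why partial states remain safely combinable.)

#include <stdio.h>
#include <string.h>

int main(void)
{
    char state[32] = "";
    int cursor = 0;
    const char *delim = ",";
    const char *vals[] = {"ab", "cd", "ef"};

    for (int i = 0; i < 3; i++)
    {
        if (i == 0)
            cursor = (int) strlen(delim);   /* how much the finalfn strips */
        strcat(state, delim);               /* delimiter kept for every value */
        strcat(state, vals[i]);
    }
    /* state is ",ab,cd,ef"; finalfn output skips the first delimiter */
    printf("%s\n", state + cursor);         /* "ab,cd,ef" */
    return 0;
}
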
+ */ + if (state) + PG_RETURN_POINTER(state); + PG_RETURN_NULL(); +} + +Datum +bytea_string_agg_finalfn(PG_FUNCTION_ARGS) +{ + StringInfo state; + + /* cannot be called directly because of internal-type argument */ + Assert(AggCheckCallContext(fcinfo, NULL)); + + state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); + + if (state != NULL) + { + /* As per comment in transfn, strip data before the cursor position */ + bytea *result; + int strippedlen = state->len - state->cursor; + + result = (bytea *) palloc(strippedlen + VARHDRSZ); + SET_VARSIZE(result, strippedlen + VARHDRSZ); + memcpy(VARDATA(result), &state->data[state->cursor], strippedlen); + PG_RETURN_BYTEA_P(result); + } + else + PG_RETURN_NULL(); +} + +/*------------------------------------------------------------- + * byteaoctetlen + * + * get the number of bytes contained in an instance of type 'bytea' + *------------------------------------------------------------- + */ +Datum +byteaoctetlen(PG_FUNCTION_ARGS) +{ + Datum str = PG_GETARG_DATUM(0); + + /* We need not detoast the input at all */ + PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); +} + +/* + * byteacat - + * takes two bytea* and returns a bytea* that is the concatenation of + * the two. + * + * Cloned from textcat and modified as required. + */ +Datum +byteacat(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + + PG_RETURN_BYTEA_P(bytea_catenate(t1, t2)); +} + +/* + * byteaoverlay + * Replace specified substring of first string with second + * + * The SQL standard defines OVERLAY() in terms of substring and concatenation. + * This code is a direct implementation of what the standard says. + */ +Datum +byteaoverlay(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + int sp = PG_GETARG_INT32(2); /* substring start position */ + int sl = PG_GETARG_INT32(3); /* substring length */ + + PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); +} + +Datum +byteaoverlay_no_len(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + int sp = PG_GETARG_INT32(2); /* substring start position */ + int sl; + + sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */ + PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); +} + +/* + * bytea_substr() + * Return a substring starting at the specified position. + * Cloned from text_substr and modified as required. + * + * Input: + * - string + * - starting position (is one-based) + * - string length (optional) + * + * If the starting position is zero or less, then return from the start of the string + * adjusting the length to be consistent with the "negative start" per SQL. + * If the length is less than zero, an ERROR is thrown. If no third argument + * (length) is provided, the length to the end of the string is assumed. + */ +Datum +bytea_substr(PG_FUNCTION_ARGS) +{ + PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), + PG_GETARG_INT32(1), + PG_GETARG_INT32(2), + false)); +} + +/* + * bytea_substr_no_len - + * Wrapper to avoid opr_sanity failure due to + * one function accepting a different number of args. 
+ */ +Datum +bytea_substr_no_len(PG_FUNCTION_ARGS) +{ + PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), + PG_GETARG_INT32(1), + -1, + true)); +} + +/* + * bit_count + */ +Datum +bytea_bit_count(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + + PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1))); +} + +/* + * byteapos - + * Return the position of the specified substring. + * Implements the SQL POSITION() function. + * Cloned from textpos and modified as required. + */ +Datum +byteapos(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + int pos; + int px, + p; + int len1, + len2; + char *p1, + *p2; + + len1 = VARSIZE_ANY_EXHDR(t1); + len2 = VARSIZE_ANY_EXHDR(t2); + + if (len2 <= 0) + PG_RETURN_INT32(1); /* result for empty pattern */ + + p1 = VARDATA_ANY(t1); + p2 = VARDATA_ANY(t2); + + pos = 0; + px = (len1 - len2); + for (p = 0; p <= px; p++) + { + if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0)) + { + pos = p + 1; + break; + }; + p1++; + }; + + PG_RETURN_INT32(pos); +} + +/*------------------------------------------------------------- + * byteaGetByte + * + * this routine treats "bytea" as an array of bytes. + * It returns the Nth byte (a number between 0 and 255). + *------------------------------------------------------------- + */ +Datum +byteaGetByte(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int32 n = PG_GETARG_INT32(1); + int len; + int byte; + + len = VARSIZE_ANY_EXHDR(v); + + if (n < 0 || n >= len) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %d out of valid range, 0..%d", + n, len - 1))); + + byte = ((unsigned char *) VARDATA_ANY(v))[n]; + + PG_RETURN_INT32(byte); +} + +/*------------------------------------------------------------- + * byteaGetBit + * + * This routine treats a "bytea" type like an array of bits. + * It returns the value of the Nth bit (0 or 1). + * + *------------------------------------------------------------- + */ +Datum +byteaGetBit(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int64 n = PG_GETARG_INT64(1); + int byteNo, + bitNo; + int len; + int byte; + + len = VARSIZE_ANY_EXHDR(v); + + if (n < 0 || n >= (int64) len * 8) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, + n, (int64) len * 8 - 1))); + + /* n/8 is now known < len, so safe to cast to int */ + byteNo = (int) (n / 8); + bitNo = (int) (n % 8); + + byte = ((unsigned char *) VARDATA_ANY(v))[byteNo]; + + if (byte & (1 << bitNo)) + PG_RETURN_INT32(1); + else + PG_RETURN_INT32(0); +} + +/*------------------------------------------------------------- + * byteaSetByte + * + * Given an instance of type 'bytea' creates a new one with + * the Nth byte set to the given value. + * + *------------------------------------------------------------- + */ +Datum +byteaSetByte(PG_FUNCTION_ARGS) +{ + bytea *res = PG_GETARG_BYTEA_P_COPY(0); + int32 n = PG_GETARG_INT32(1); + int32 newByte = PG_GETARG_INT32(2); + int len; + + len = VARSIZE(res) - VARHDRSZ; + + if (n < 0 || n >= len) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %d out of valid range, 0..%d", + n, len - 1))); + + /* + * Now set the byte. + */ + ((unsigned char *) VARDATA(res))[n] = newByte; + + PG_RETURN_BYTEA_P(res); +} + +/*------------------------------------------------------------- + * byteaSetBit + * + * Given an instance of type 'bytea' creates a new one with + * the Nth bit set to the given value. 
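
The bit addressing shared by byteaGetBit and byteaSetBit splits the 64-bit index into a byte offset and a bit position within that byte, counted from the least significant bit. A quick check with n = 13:

#include <stdio.h>

int main(void)
{
    unsigned char buf[2] = {0x00, 0x00};
    long n = 13;                    /* bit index across the whole bytea */
    int byteNo = (int) (n / 8);     /* -> 1 */
    int bitNo = (int) (n % 8);      /* -> 5, from the LSB */

    buf[byteNo] |= (1 << bitNo);                 /* like set_bit(v, 13, 1) */
    printf("%02x %02x\n", buf[0], buf[1]);       /* 00 20 */
    printf("%d\n", (buf[byteNo] >> bitNo) & 1);  /* like get_bit -> 1 */
    return 0;
}
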
+ * + *------------------------------------------------------------- + */ +Datum +byteaSetBit(PG_FUNCTION_ARGS) +{ + bytea *res = PG_GETARG_BYTEA_P_COPY(0); + int64 n = PG_GETARG_INT64(1); + int32 newBit = PG_GETARG_INT32(2); + int len; + int oldByte, + newByte; + int byteNo, + bitNo; + + len = VARSIZE(res) - VARHDRSZ; + + if (n < 0 || n >= (int64) len * 8) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, + n, (int64) len * 8 - 1))); + + /* n/8 is now known < len, so safe to cast to int */ + byteNo = (int) (n / 8); + bitNo = (int) (n % 8); + + /* + * sanity check! + */ + if (newBit != 0 && newBit != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("new bit must be 0 or 1"))); + + /* + * Update the byte. + */ + oldByte = ((unsigned char *) VARDATA(res))[byteNo]; + + if (newBit == 0) + newByte = oldByte & (~(1 << bitNo)); + else + newByte = oldByte | (1 << bitNo); + + ((unsigned char *) VARDATA(res))[byteNo] = newByte; + + PG_RETURN_BYTEA_P(res); +} + +/* + * Return reversed bytea + */ +Datum +bytea_reverse(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + const char *p = VARDATA_ANY(v); + int len = VARSIZE_ANY_EXHDR(v); + const char *endp = p + len; + bytea *result = palloc(len + VARHDRSZ); + char *dst = (char *) VARDATA(result) + len; + + SET_VARSIZE(result, len + VARHDRSZ); + + while (p < endp) + *(--dst) = *p++; + + PG_RETURN_BYTEA_P(result); +} + + +/***************************************************************************** + * Comparison Functions used for bytea + * + * Note: btree indexes need these routines not to leak memory; therefore, + * be careful to free working copies of toasted datums. Most places don't + * need to be so careful. + *****************************************************************************/ + +Datum +byteaeq(PG_FUNCTION_ARGS) +{ + Datum arg1 = PG_GETARG_DATUM(0); + Datum arg2 = PG_GETARG_DATUM(1); + bool result; + Size len1, + len2; + + /* + * We can use a fast path for unequal lengths, which might save us from + * having to detoast one or both values. + */ + len1 = toast_raw_datum_size(arg1); + len2 = toast_raw_datum_size(arg2); + if (len1 != len2) + result = false; + else + { + bytea *barg1 = DatumGetByteaPP(arg1); + bytea *barg2 = DatumGetByteaPP(arg2); + + result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), + len1 - VARHDRSZ) == 0); + + PG_FREE_IF_COPY(barg1, 0); + PG_FREE_IF_COPY(barg2, 1); + } + + PG_RETURN_BOOL(result); +} + +Datum +byteane(PG_FUNCTION_ARGS) +{ + Datum arg1 = PG_GETARG_DATUM(0); + Datum arg2 = PG_GETARG_DATUM(1); + bool result; + Size len1, + len2; + + /* + * We can use a fast path for unequal lengths, which might save us from + * having to detoast one or both values. 
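
Both equality functions share the same shape of fast path: compare the cheaply obtainable raw sizes first, and only detoast and touch the payload when the sizes match. Stripped of the toast machinery, the pattern is just:

#include <stdbool.h>
#include <string.h>

/* generic form of the byteaeq/byteane fast path */
static bool
bytes_equal(const char *a, size_t alen, const char *b, size_t blen)
{
    if (alen != blen)
        return false;           /* no payload access needed */
    return memcmp(a, b, alen) == 0;
}
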
+ */ + len1 = toast_raw_datum_size(arg1); + len2 = toast_raw_datum_size(arg2); + if (len1 != len2) + result = true; + else + { + bytea *barg1 = DatumGetByteaPP(arg1); + bytea *barg2 = DatumGetByteaPP(arg2); + + result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), + len1 - VARHDRSZ) != 0); + + PG_FREE_IF_COPY(barg1, 0); + PG_FREE_IF_COPY(barg2, 1); + } + + PG_RETURN_BOOL(result); +} + +Datum +bytealt(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2))); +} + +Datum +byteale(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2))); +} + +Datum +byteagt(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2))); +} + +Datum +byteage(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2))); +} + +Datum +byteacmp(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + if ((cmp == 0) && (len1 != len2)) + cmp = (len1 < len2) ? -1 : 1; + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_INT32(cmp); +} + +Datum +bytea_larger(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + bytea *result; + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2); + + PG_RETURN_BYTEA_P(result); +} + +Datum +bytea_smaller(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + bytea *result; + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? 
arg1 : arg2); + + PG_RETURN_BYTEA_P(result); +} + +/* + * sortsupport comparison func + */ +static int +byteafastcmp(Datum x, Datum y, SortSupport ssup) +{ + bytea *arg1 = DatumGetByteaPP(x); + bytea *arg2 = DatumGetByteaPP(y); + char *a1p, + *a2p; + int len1, + len2, + result; + + a1p = VARDATA_ANY(arg1); + a2p = VARDATA_ANY(arg2); + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + result = memcmp(a1p, a2p, Min(len1, len2)); + if ((result == 0) && (len1 != len2)) + result = (len1 < len2) ? -1 : 1; + + /* We can't afford to leak memory here. */ + if (PointerGetDatum(arg1) != x) + pfree(arg1); + if (PointerGetDatum(arg2) != y) + pfree(arg2); + + return result; +} + +/* + * Conversion routine for sortsupport. Converts original to abbreviated key + * representation. Our encoding strategy is simple -- pack the first 8 bytes + * of the bytea data into a Datum (on little-endian machines, the bytes are + * stored in reverse order), and treat it as an unsigned integer. + */ +static Datum +bytea_abbrev_convert(Datum original, SortSupport ssup) +{ + const size_t max_prefix_bytes = sizeof(Datum); + ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra; + bytea *authoritative = DatumGetByteaPP(original); + char *authoritative_data = VARDATA_ANY(authoritative); + Datum res; + char *pres; + int len; + uint32 hash; + + pres = (char *) &res; + + /* memset(), so any non-overwritten bytes are NUL */ + memset(pres, 0, max_prefix_bytes); + len = VARSIZE_ANY_EXHDR(authoritative); + + /* + * Short byteas will have terminating NUL bytes in the abbreviated datum. + * Abbreviated comparison need not make a distinction between these NUL + * bytes, and NUL bytes representing actual NULs in the authoritative + * representation. + * + * Hopefully a comparison at or past one abbreviated key's terminating NUL + * byte will resolve the comparison without consulting the authoritative + * representation; specifically, some later non-NUL byte in the longer + * bytea can resolve the comparison against a subsequent terminating NUL + * in the shorter bytea. There will usually be what is effectively a + * "length-wise" resolution there and then. + * + * If that doesn't work out -- if all bytes in the longer bytea positioned + * at or past the offset of the smaller bytea (first) terminating NUL are + * actually representative of NUL bytes in the authoritative binary bytea + * (perhaps with some *terminating* NUL bytes towards the end of the + * longer bytea iff it happens to still be small) -- then an authoritative + * tie-breaker will happen, and do the right thing: explicitly consider + * bytea length. + */ + memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); + + /* + * Maintain approximate cardinality of both abbreviated keys and original, + * authoritative keys using HyperLogLog. Used as cheap insurance against + * the worst case, where we do many string abbreviations for no saving in + * full memcmp()-based comparisons. These statistics are used by + * bytea_abbrev_abort(). + * + * First, Hash key proper, or a significant fraction of it. Mix in length + * in order to compensate for cases where differences are past + * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. 
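
An endian-independent restatement of what the converter computes: take the first sizeof(Datum) bytes, NUL-pad short inputs, and arrange the result so that plain unsigned integer comparison agrees with memcmp() order (the real code packs natively and then applies DatumBigEndianToNative). A minimal sketch:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint64_t
abbrev(const char *data, size_t len)
{
    unsigned char buf[8] = {0};      /* short inputs get trailing NULs */
    uint64_t key = 0;

    memcpy(buf, data, len < 8 ? len : 8);
    for (int i = 0; i < 8; i++)
        key = (key << 8) | buf[i];   /* big-endian packing */
    return key;
}

int main(void)
{
    /* "abc" sorts before "abcd", and so do the abbreviated keys */
    printf("%d\n", abbrev("abc", 3) < abbrev("abcd", 4));   /* 1 */
    return 0;
}
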
+ */ + hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, + Min(len, PG_CACHE_LINE_SIZE))); + + if (len > PG_CACHE_LINE_SIZE) + hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + + addHyperLogLog(&bss->full_card, hash); + + /* Hash abbreviated key */ + { + uint32 tmp; + + tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); + hash = DatumGetUInt32(hash_uint32(tmp)); + } + + addHyperLogLog(&bss->abbr_card, hash); + + /* + * Byteswap on little-endian machines. + * + * This is needed so that ssup_datum_unsigned_cmp() works correctly on all + * platforms. + */ + res = DatumBigEndianToNative(res); + + /* Don't leak memory here */ + if (PointerGetDatum(authoritative) != original) + pfree(authoritative); + + return res; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization, using + * heuristic rules. Returns value indicating if the abbreviation optimization + * should be aborted, based on its projected effectiveness. + * + * This is based on varstr_abbrev_abort(), but some comments have been elided + * for brevity. See there for more details. + */ +static bool +bytea_abbrev_abort(int memtupcount, SortSupport ssup) +{ + ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra; + double abbrev_distinct, + key_distinct; + + Assert(ssup->abbreviate); + + /* Have a little patience */ + if (memtupcount < 100) + return false; + + abbrev_distinct = estimateHyperLogLog(&bss->abbr_card); + key_distinct = estimateHyperLogLog(&bss->full_card); + + /* + * Clamp cardinality estimates to at least one distinct value. While + * NULLs are generally disregarded, if only NULL values were seen so far, + * that might misrepresent costs if we failed to clamp. + */ + if (abbrev_distinct < 1.0) + abbrev_distinct = 1.0; + + if (key_distinct < 1.0) + key_distinct = 1.0; + + if (trace_sort) + { + double norm_abbrev_card = abbrev_distinct / (double) memtupcount; + + elog(LOG, "bytea_abbrev: abbrev_distinct after %d: %f " + "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)", + memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card, + bss->prop_card); + } + + /* + * If the number of distinct abbreviated keys approximately matches the + * number of distinct original keys, continue with abbreviation. + */ + if (abbrev_distinct > key_distinct * bss->prop_card) + { + /* + * Decay required cardinality aggressively after 10,000 tuples. + */ + if (memtupcount > 10000) + bss->prop_card *= 0.65; + + return false; + } + + /* + * Abort abbreviation strategy. + */ + if (trace_sort) + elog(LOG, "bytea_abbrev: aborted abbreviation at %d " + "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)", + memtupcount, abbrev_distinct, key_distinct, bss->prop_card); + + return true; +} + +Datum +bytea_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + ssup->comparator = byteafastcmp; + + /* + * Set up abbreviation support if requested. 
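
Worked numbers (entirely hypothetical) for the continue/abort decision above: with prop_card at its initial 0.20, 2000 distinct abbreviated keys against 9000 distinct full keys passes the test (2000 > 1800), and once more than 10,000 tuples have been seen the required proportion decays for the next check.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    double prop_card = 0.20;          /* initial required proportion */
    double abbrev_distinct = 2000.0;  /* HyperLogLog estimates (made up) */
    double key_distinct = 9000.0;
    int memtupcount = 20000;

    bool keep_going = abbrev_distinct > key_distinct * prop_card;
    if (keep_going && memtupcount > 10000)
        prop_card *= 0.65;            /* demand less next time: 0.13 */
    printf("%d %.2f\n", keep_going, prop_card);   /* 1 0.13 */
    return 0;
}
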
+ */ + if (ssup->abbreviate) + { + ByteaSortSupport *bss; + + bss = palloc_object(ByteaSortSupport); + bss->abbreviate = true; + bss->prop_card = 0.20; + initHyperLogLog(&bss->abbr_card, 10); + initHyperLogLog(&bss->full_card, 10); + + ssup->ssup_extra = bss; + ssup->abbrev_full_comparator = ssup->comparator; + ssup->comparator = ssup_datum_unsigned_cmp; + ssup->abbrev_converter = bytea_abbrev_convert; + ssup->abbrev_abort = bytea_abbrev_abort; + } + + MemoryContextSwitchTo(oldcontext); + + PG_RETURN_VOID(); +} + +/* Cast bytea -> int2 */ +Datum +bytea_int2(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int len = VARSIZE_ANY_EXHDR(v); + uint16 result; + + /* Check that the byte array is not too long */ + if (len > sizeof(result)) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("smallint out of range")); + + /* Convert it to an integer; most significant bytes come first */ + result = 0; + for (int i = 0; i < len; i++) + { + result <<= BITS_PER_BYTE; + result |= ((unsigned char *) VARDATA_ANY(v))[i]; + } + + PG_RETURN_INT16(result); +} + +/* Cast bytea -> int4 */ +Datum +bytea_int4(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int len = VARSIZE_ANY_EXHDR(v); + uint32 result; + + /* Check that the byte array is not too long */ + if (len > sizeof(result)) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range")); + + /* Convert it to an integer; most significant bytes come first */ + result = 0; + for (int i = 0; i < len; i++) + { + result <<= BITS_PER_BYTE; + result |= ((unsigned char *) VARDATA_ANY(v))[i]; + } + + PG_RETURN_INT32(result); +} + +/* Cast bytea -> int8 */ +Datum +bytea_int8(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int len = VARSIZE_ANY_EXHDR(v); + uint64 result; + + /* Check that the byte array is not too long */ + if (len > sizeof(result)) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range")); + + /* Convert it to an integer; most significant bytes come first */ + result = 0; + for (int i = 0; i < len; i++) + { + result <<= BITS_PER_BYTE; + result |= ((unsigned char *) VARDATA_ANY(v))[i]; + } + + PG_RETURN_INT64(result); +} + +/* Cast int2 -> bytea; can just use int2send() */ +Datum +int2_bytea(PG_FUNCTION_ARGS) +{ + return int2send(fcinfo); +} + +/* Cast int4 -> bytea; can just use int4send() */ +Datum +int4_bytea(PG_FUNCTION_ARGS) +{ + return int4send(fcinfo); +} + +/* Cast int8 -> bytea; can just use int8send() */ +Datum +int8_bytea(PG_FUNCTION_ARGS) +{ + return int8send(fcinfo); +} diff --git a/src/backend/utils/adt/cash.c b/src/backend/utils/adt/cash.c index 611d23f3cb0d8..623f6eec0565a 100644 --- a/src/backend/utils/adt/cash.c +++ b/src/backend/utils/adt/cash.c @@ -1035,7 +1035,7 @@ cash_words(PG_FUNCTION_ARGS) appendStringInfoString(&buf, m0 == 1 ? 
" cent" : " cents"); /* capitalize output */ - buf.data[0] = pg_toupper((unsigned char) buf.data[0]); + buf.data[0] = pg_ascii_toupper((unsigned char) buf.data[0]); /* return as text datum */ res = cstring_to_text_with_len(buf.data, buf.len); diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index 4227ab1a72bfb..421ccc306f67b 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -27,6 +27,7 @@ #include "common/int.h" #include "libpq/pqformat.h" #include "miscadmin.h" +#include "nodes/miscnodes.h" #include "nodes/supportnodes.h" #include "parser/scansup.h" #include "utils/array.h" @@ -357,7 +358,7 @@ GetSQLCurrentTime(int32 typmod) GetCurrentTimeUsec(tm, &fsec, &tz); - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); tm2timetz(tm, fsec, tz, result); AdjustTimeForTypmod(&(result->time), typmod); return result; @@ -615,24 +616,21 @@ date_mii(PG_FUNCTION_ARGS) /* * Promote date to timestamp. * - * On successful conversion, *overflow is set to zero if it's not NULL. + * If the date falls out of the valid range for the timestamp type, error + * handling proceeds based on escontext. * - * If the date is finite but out of the valid range for timestamp, then: - * if overflow is NULL, we throw an out-of-range error. - * if overflow is not NULL, we store +1 or -1 there to indicate the sign - * of the overflow, and return the appropriate timestamp infinity. + * If escontext is NULL, we throw an out-of-range error (hard error). + * If escontext is not NULL, we return NOBEGIN or NOEND for lower bound or + * upper bound overflow, respectively, and record a soft error. * - * Note: *overflow = -1 is actually not possible currently, since both - * datatypes have the same lower bound, Julian day zero. + * Note: Lower bound overflow is currently not possible, as both date and + * timestamp datatypes share the same lower boundary: Julian day zero. */ Timestamp -date2timestamp_opt_overflow(DateADT dateVal, int *overflow) +date2timestamp_safe(DateADT dateVal, Node *escontext) { Timestamp result; - if (overflow) - *overflow = 0; - if (DATE_IS_NOBEGIN(dateVal)) TIMESTAMP_NOBEGIN(result); else if (DATE_IS_NOEND(dateVal)) @@ -645,18 +643,10 @@ date2timestamp_opt_overflow(DateADT dateVal, int *overflow) */ if (dateVal >= (TIMESTAMP_END_JULIAN - POSTGRES_EPOCH_JDATE)) { - if (overflow) - { - *overflow = 1; - TIMESTAMP_NOEND(result); - return result; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("date out of range for timestamp"))); - } + TIMESTAMP_NOEND(result); + ereturn(escontext, result, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("date out of range for timestamp"))); } /* date is days since 2000, timestamp is microseconds since same... */ @@ -672,30 +662,27 @@ date2timestamp_opt_overflow(DateADT dateVal, int *overflow) static TimestampTz date2timestamp(DateADT dateVal) { - return date2timestamp_opt_overflow(dateVal, NULL); + return date2timestamp_safe(dateVal, NULL); } /* * Promote date to timestamp with time zone. * - * On successful conversion, *overflow is set to zero if it's not NULL. + * If the date falls out of the valid range for the timestamp type, error + * handling proceeds based on escontext. * - * If the date is finite but out of the valid range for timestamptz, then: - * if overflow is NULL, we throw an out-of-range error. 
- * if overflow is not NULL, we store +1 or -1 there to indicate the sign - * of the overflow, and return the appropriate timestamptz infinity. + * If escontext is NULL, we throw an out-of-range error (hard error). + * If escontext is not NULL, we return NOBEGIN or NOEND for lower bound or + * upper bound overflow, respectively, and record a soft error. */ TimestampTz -date2timestamptz_opt_overflow(DateADT dateVal, int *overflow) +date2timestamptz_safe(DateADT dateVal, Node *escontext) { TimestampTz result; struct pg_tm tt, *tm = &tt; int tz; - if (overflow) - *overflow = 0; - if (DATE_IS_NOBEGIN(dateVal)) TIMESTAMP_NOBEGIN(result); else if (DATE_IS_NOEND(dateVal)) @@ -708,18 +695,10 @@ date2timestamptz_opt_overflow(DateADT dateVal, int *overflow) */ if (dateVal >= (TIMESTAMP_END_JULIAN - POSTGRES_EPOCH_JDATE)) { - if (overflow) - { - *overflow = 1; - TIMESTAMP_NOEND(result); - return result; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("date out of range for timestamp"))); - } + TIMESTAMP_NOEND(result); + ereturn(escontext, result, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("date out of range for timestamp"))); } j2date(dateVal + POSTGRES_EPOCH_JDATE, @@ -737,25 +716,14 @@ date2timestamptz_opt_overflow(DateADT dateVal, int *overflow) */ if (!IS_VALID_TIMESTAMP(result)) { - if (overflow) - { - if (result < MIN_TIMESTAMP) - { - *overflow = -1; - TIMESTAMP_NOBEGIN(result); - } - else - { - *overflow = 1; - TIMESTAMP_NOEND(result); - } - } + if (result < MIN_TIMESTAMP) + TIMESTAMP_NOBEGIN(result); else - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("date out of range for timestamp"))); - } + TIMESTAMP_NOEND(result); + + ereturn(escontext, result, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("date out of range for timestamp"))); } } @@ -768,7 +736,7 @@ date2timestamptz_opt_overflow(DateADT dateVal, int *overflow) static TimestampTz date2timestamptz(DateADT dateVal) { - return date2timestamptz_opt_overflow(dateVal, NULL); + return date2timestamptz_safe(dateVal, NULL); } /* @@ -808,15 +776,16 @@ int32 date_cmp_timestamp_internal(DateADT dateVal, Timestamp dt2) { Timestamp dt1; - int overflow; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - dt1 = date2timestamp_opt_overflow(dateVal, &overflow); - if (overflow > 0) + dt1 = date2timestamp_safe(dateVal, (Node *) &escontext); + if (escontext.error_occurred) { + Assert(TIMESTAMP_IS_NOEND(dt1)); /* NOBEGIN case cannot occur */ + /* dt1 is larger than any finite timestamp, but less than infinity */ return TIMESTAMP_IS_NOEND(dt2) ? -1 : +1; } - Assert(overflow == 0); /* -1 case cannot occur */ return timestamp_cmp_internal(dt1, dt2); } @@ -888,18 +857,22 @@ int32 date_cmp_timestamptz_internal(DateADT dateVal, TimestampTz dt2) { TimestampTz dt1; - int overflow; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - dt1 = date2timestamptz_opt_overflow(dateVal, &overflow); - if (overflow > 0) - { - /* dt1 is larger than any finite timestamp, but less than infinity */ - return TIMESTAMP_IS_NOEND(dt2) ? -1 : +1; - } - if (overflow < 0) + dt1 = date2timestamptz_safe(dateVal, (Node *) &escontext); + + if (escontext.error_occurred) { - /* dt1 is less than any finite timestamp, but more than -infinity */ - return TIMESTAMP_IS_NOBEGIN(dt2) ? +1 : -1; + if (TIMESTAMP_IS_NOEND(dt1)) + { + /* dt1 is larger than any finite timestamp, but less than infinity */ + return TIMESTAMP_IS_NOEND(dt2) ? 
-1 : +1; + } + if (TIMESTAMP_IS_NOBEGIN(dt1)) + { + /* dt1 is less than any finite timestamp, but more than -infinity */ + return TIMESTAMP_IS_NOBEGIN(dt2) ? +1 : -1; + } } return timestamptz_cmp_internal(dt1, dt2); @@ -1363,6 +1336,28 @@ timestamp_date(PG_FUNCTION_ARGS) { Timestamp timestamp = PG_GETARG_TIMESTAMP(0); DateADT result; + + result = timestamp2date_safe(timestamp, NULL); + PG_RETURN_DATEADT(result); +} + +/* + * Convert timestamp to date. + * + * If the timestamp falls out of the valid range for the date type, error + * handling proceeds based on escontext. + * + * If escontext is NULL, we throw an out-of-range error (hard error). + * If escontext is not NULL, we return NOBEGIN or NOEND for lower bound or + * upper bound overflow, respectively, and record a soft error. + * + * Note: given the ranges of the types, overflow is only possible at + * the lower bound of the range, but we don't assume that in this code. + */ +DateADT +timestamp2date_safe(Timestamp timestamp, Node *escontext) +{ + DateADT result; struct pg_tm tt, *tm = &tt; fsec_t fsec; @@ -1374,14 +1369,21 @@ timestamp_date(PG_FUNCTION_ARGS) else { if (timestamp2tm(timestamp, NULL, tm, &fsec, NULL, NULL) != 0) - ereport(ERROR, + { + if (timestamp < 0) + DATE_NOBEGIN(result); + else + DATE_NOEND(result); /* not actually reachable */ + + ereturn(escontext, result, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } result = date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) - POSTGRES_EPOCH_JDATE; } - PG_RETURN_DATEADT(result); + return result; } @@ -1408,6 +1410,28 @@ timestamptz_date(PG_FUNCTION_ARGS) { TimestampTz timestamp = PG_GETARG_TIMESTAMP(0); DateADT result; + + result = timestamptz2date_safe(timestamp, NULL); + PG_RETURN_DATEADT(result); +} + +/* + * Convert timestamptz to date. + * + * If the timestamp falls out of the valid range for the date type, error + * handling proceeds based on escontext. + * + * If escontext is NULL, we throw an out-of-range error (hard error). + * If escontext is not NULL, we return NOBEGIN or NOEND for lower bound or + * upper bound overflow, respectively, and record a soft error. + * + * Note: given the ranges of the types, overflow is only possible at + * the lower bound of the range, but we don't assume that in this code. 
+ */ +DateADT +timestamptz2date_safe(TimestampTz timestamp, Node *escontext) +{ + DateADT result; struct pg_tm tt, *tm = &tt; fsec_t fsec; @@ -1420,14 +1444,21 @@ timestamptz_date(PG_FUNCTION_ARGS) else { if (timestamp2tm(timestamp, &tz, tm, &fsec, NULL, NULL) != 0) - ereport(ERROR, + { + if (timestamp < 0) + DATE_NOBEGIN(result); + else + DATE_NOEND(result); /* not actually reachable */ + + ereturn(escontext, result, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } result = date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) - POSTGRES_EPOCH_JDATE; } - PG_RETURN_DATEADT(result); + return result; } @@ -2056,7 +2087,7 @@ time_interval(PG_FUNCTION_ARGS) TimeADT time = PG_GETARG_TIMEADT(0); Interval *result; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); result->time = time; result->day = 0; @@ -2101,7 +2132,7 @@ time_mi_time(PG_FUNCTION_ARGS) TimeADT time2 = PG_GETARG_TIMEADT(1); Interval *result; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); result->month = 0; result->day = 0; @@ -2368,7 +2399,7 @@ timetz_in(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); tm2timetz(tm, fsec, tz, result); AdjustTimeForTypmod(&(result->time), typmod); @@ -2407,7 +2438,7 @@ timetz_recv(PG_FUNCTION_ARGS) int32 typmod = PG_GETARG_INT32(2); TimeTzADT *result; - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); result->time = pq_getmsgint64(buf); @@ -2493,7 +2524,7 @@ timetz_scale(PG_FUNCTION_ARGS) int32 typmod = PG_GETARG_INT32(1); TimeTzADT *result; - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); result->time = time->time; result->zone = time->zone; @@ -2669,7 +2700,7 @@ timetz_pl_interval(PG_FUNCTION_ARGS) (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("cannot add infinite interval to time"))); - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); result->time = time->time + span->time; result->time -= result->time / USECS_PER_DAY * USECS_PER_DAY; @@ -2696,7 +2727,7 @@ timetz_mi_interval(PG_FUNCTION_ARGS) (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("cannot subtract infinite interval from time"))); - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); result->time = time->time - span->time; result->time -= result->time / USECS_PER_DAY * USECS_PER_DAY; @@ -2903,7 +2934,7 @@ time_timetz(PG_FUNCTION_ARGS) time2tm(time, tm, &fsec); tz = DetermineTimeZoneOffset(tm, session_timezone); - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); result->time = time; result->zone = tz; @@ -2933,7 +2964,7 @@ timestamptz_timetz(PG_FUNCTION_ARGS) (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); tm2timetz(tm, fsec, tz, result); @@ -3166,7 +3197,7 @@ timetz_zone(PG_FUNCTION_ARGS) errmsg("timestamp out of range"))); } - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); result->time = t->time + (t->zone - tz) * USECS_PER_SEC; /* C99 modulo has the wrong sign convention for negative input */ @@ -3207,7 +3238,7 @@ timetz_izone(PG_FUNCTION_ARGS) tz = -(zone->time / USECS_PER_SEC); - result = (TimeTzADT *) palloc(sizeof(TimeTzADT)); + result = palloc_object(TimeTzADT); result->time = time->time + 
(time->zone - tz) * USECS_PER_SEC; /* C99 modulo has the wrong sign convention for negative input */ diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c index 793d8a9adccdc..e3a099eaa67f1 100644 --- a/src/backend/utils/adt/datetime.c +++ b/src/backend/utils/adt/datetime.c @@ -702,9 +702,18 @@ ParseFraction(char *cp, double *frac) } else { + /* + * On the other hand, let's reject anything that's not digits after + * the ".". strtod is happy with input like ".123e9", but that'd + * break callers' expectation that the result is in 0..1. (It's quite + * difficult to get here with such input, but not impossible.) + */ + if (strspn(cp + 1, "0123456789") != strlen(cp + 1)) + return DTERR_BAD_FORMAT; + errno = 0; *frac = strtod(cp, &cp); - /* check for parse failure */ + /* check for parse failure (probably redundant given prior check) */ if (*cp != '\0' || errno != 0) return DTERR_BAD_FORMAT; } @@ -2958,31 +2967,28 @@ DecodeNumberField(int len, char *str, int fmask, { char *cp; + /* + * This function was originally meant to cope only with DTK_NUMBER fields, + * but we now sometimes abuse it to parse (parts of) DTK_DATE fields, + * which can contain letters and other punctuation. Reject if it's not a + * valid DTK_NUMBER, that is digits and decimal point(s). (ParseFraction + * will reject if there's more than one decimal point.) + */ + if (strspn(str, "0123456789.") != len) + return DTERR_BAD_FORMAT; + /* * Have a decimal point? Then this is a date or something with a seconds * field... */ if ((cp = strchr(str, '.')) != NULL) { - /* - * Can we use ParseFractionalSecond here? Not clear whether trailing - * junk should be rejected ... - */ - if (cp[1] == '\0') - { - /* avoid assuming that strtod will accept "." */ - *fsec = 0; - } - else - { - double frac; + int dterr; - errno = 0; - frac = strtod(cp, NULL); - if (errno != 0) - return DTERR_BAD_FORMAT; - *fsec = rint(frac * 1000000); - } + /* Convert the fraction and store at *fsec */ + dterr = ParseFractionalSecond(cp, fsec); + if (dterr) + return dterr; /* Now truncate off the fraction for further processing */ *cp = '\0'; len = strlen(str); @@ -5146,7 +5152,7 @@ pg_timezone_abbrevs_zone(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* allocate memory for user context */ - pindex = (int *) palloc(sizeof(int)); + pindex = palloc_object(int); *pindex = 0; funcctx->user_fctx = pindex; @@ -5181,7 +5187,7 @@ pg_timezone_abbrevs_zone(PG_FUNCTION_ARGS) /* Convert offset (in seconds) to an interval; can't overflow */ MemSet(&itm_in, 0, sizeof(struct pg_itm_in)); itm_in.tm_usec = (int64) gmtoff * USECS_PER_SEC; - resInterval = (Interval *) palloc(sizeof(Interval)); + resInterval = palloc_object(Interval); (void) itmin2interval(&itm_in, resInterval); values[1] = IntervalPGetDatum(resInterval); @@ -5233,7 +5239,7 @@ pg_timezone_abbrevs_abbrevs(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* allocate memory for user context */ - pindex = (int *) palloc(sizeof(int)); + pindex = palloc_object(int); *pindex = 0; funcctx->user_fctx = pindex; @@ -5304,7 +5310,7 @@ pg_timezone_abbrevs_abbrevs(PG_FUNCTION_ARGS) /* Convert offset (in seconds) to an interval; can't overflow */ MemSet(&itm_in, 0, sizeof(struct pg_itm_in)); itm_in.tm_usec = (int64) gmtoffset * USECS_PER_SEC; - resInterval = (Interval *) palloc(sizeof(Interval)); + resInterval = palloc_object(Interval); (void) itmin2interval(&itm_in, resInterval); values[1] = IntervalPGetDatum(resInterval); @@ 
-5372,7 +5378,7 @@ pg_timezone_names(PG_FUNCTION_ARGS) /* Convert tzoff to an interval; can't overflow */ MemSet(&itm_in, 0, sizeof(struct pg_itm_in)); itm_in.tm_usec = (int64) -tzoff * USECS_PER_SEC; - resInterval = (Interval *) palloc(sizeof(Interval)); + resInterval = palloc_object(Interval); (void) itmin2interval(&itm_in, resInterval); values[2] = IntervalPGetDatum(resInterval); diff --git a/src/backend/utils/adt/datum.c b/src/backend/utils/adt/datum.c index fcd5b1653dd3e..dabcca6f4c5f7 100644 --- a/src/backend/utils/adt/datum.c +++ b/src/backend/utils/adt/datum.c @@ -84,7 +84,7 @@ datumGetSize(Datum value, bool typByVal, int typLen) /* It is a varlena datatype */ struct varlena *s = (struct varlena *) DatumGetPointer(value); - if (!PointerIsValid(s)) + if (!s) ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("invalid Datum pointer"))); @@ -96,7 +96,7 @@ datumGetSize(Datum value, bool typByVal, int typLen) /* It is a cstring datatype */ char *s = (char *) DatumGetPointer(value); - if (!PointerIsValid(s)) + if (!s) ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("invalid Datum pointer"))); @@ -299,9 +299,9 @@ datum_image_eq(Datum value1, Datum value2, bool typByVal, int typLen) len1 - VARHDRSZ) == 0); /* Only free memory if it's a copy made here. */ - if ((Pointer) arg1val != (Pointer) value1) + if (arg1val != DatumGetPointer(value1)) pfree(arg1val); - if ((Pointer) arg2val != (Pointer) value2) + if (arg2val != DatumGetPointer(value2)) pfree(arg2val); } } @@ -355,7 +355,7 @@ datum_image_hash(Datum value, bool typByVal, int typLen) result = hash_bytes((unsigned char *) VARDATA_ANY(val), len - VARHDRSZ); /* Only free memory if it's a copy made here. */ - if ((Pointer) val != (Pointer) value) + if (val != DatumGetPointer(value)) pfree(val); } else if (typLen == -2) diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 25865b660ef83..894d226541f23 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -19,12 +19,12 @@ #include "catalog/pg_authid.h" #include "catalog/pg_database.h" #include "catalog/pg_tablespace.h" -#include "commands/dbcommands.h" #include "commands/tablespace.h" #include "miscadmin.h" #include "storage/fd.h" #include "utils/acl.h" #include "utils/builtins.h" +#include "utils/lsyscache.h" #include "utils/numeric.h" #include "utils/rel.h" #include "utils/relfilenumbermap.h" @@ -938,6 +938,9 @@ pg_relation_filenode(PG_FUNCTION_ARGS) * * We don't fail but return NULL if we cannot find a mapping. * + * Temporary relations are not detected, returning NULL (see + * RelidByRelfilenumber() for the reasons). + * * InvalidOid can be passed instead of the current database's default * tablespace. 
*/ diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index 4ccaed815d17e..c813ee1258b7f 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -16,6 +16,7 @@ #include #include "mb/pg_wchar.h" +#include "port/simd.h" #include "utils/builtins.h" #include "utils/memutils.h" #include "varatt.h" @@ -63,7 +64,9 @@ binary_encode(PG_FUNCTION_ARGS) if (enc == NULL) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized encoding: \"%s\"", namebuf))); + errmsg("unrecognized encoding: \"%s\"", namebuf), + errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".", + "base64", "base64url", "escape", "hex"))); dataptr = VARDATA_ANY(data); datalen = VARSIZE_ANY_EXHDR(data); @@ -111,7 +114,9 @@ binary_decode(PG_FUNCTION_ARGS) if (enc == NULL) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized encoding: \"%s\"", namebuf))); + errmsg("unrecognized encoding: \"%s\"", namebuf), + errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".", + "base64", "base64url", "escape", "hex"))); dataptr = VARDATA_ANY(data); datalen = VARSIZE_ANY_EXHDR(data); @@ -177,8 +182,8 @@ static const int8 hexlookup[128] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, }; -uint64 -hex_encode(const char *src, size_t len, char *dst) +static inline uint64 +hex_encode_scalar(const char *src, size_t len, char *dst) { const char *end = src + len; @@ -193,6 +198,55 @@ hex_encode(const char *src, size_t len, char *dst) return (uint64) len * 2; } +uint64 +hex_encode(const char *src, size_t len, char *dst) +{ +#ifdef USE_NO_SIMD + return hex_encode_scalar(src, len, dst); +#else + const uint64 tail_idx = len & ~(sizeof(Vector8) - 1); + uint64 i; + + /* + * This splits the high and low nibbles of each byte into separate + * vectors, adds the vectors to a mask that converts the nibbles to their + * equivalent ASCII bytes, and interleaves those bytes back together to + * form the final hex-encoded string. 
+ */ + for (i = 0; i < tail_idx; i += sizeof(Vector8)) + { + Vector8 srcv; + Vector8 lo; + Vector8 hi; + Vector8 mask; + + vector8_load(&srcv, (const uint8 *) &src[i]); + + lo = vector8_and(srcv, vector8_broadcast(0x0f)); + mask = vector8_gt(lo, vector8_broadcast(0x9)); + mask = vector8_and(mask, vector8_broadcast('a' - '0' - 10)); + mask = vector8_add(mask, vector8_broadcast('0')); + lo = vector8_add(lo, mask); + + hi = vector8_and(srcv, vector8_broadcast(0xf0)); + hi = vector8_shift_right(hi, 4); + mask = vector8_gt(hi, vector8_broadcast(0x9)); + mask = vector8_and(mask, vector8_broadcast('a' - '0' - 10)); + mask = vector8_add(mask, vector8_broadcast('0')); + hi = vector8_add(hi, mask); + + vector8_store((uint8 *) &dst[i * 2], + vector8_interleave_low(hi, lo)); + vector8_store((uint8 *) &dst[i * 2 + sizeof(Vector8)], + vector8_interleave_high(hi, lo)); + } + + (void) hex_encode_scalar(src + i, len - i, dst + i * 2); + + return (uint64) len * 2; +#endif +} + static inline bool get_hex(const char *cp, char *out) { @@ -213,8 +267,8 @@ hex_decode(const char *src, size_t len, char *dst) return hex_decode_safe(src, len, dst, NULL); } -uint64 -hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext) +static inline uint64 +hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext) { const char *s, *srcend; @@ -254,6 +308,85 @@ hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext) return p - dst; } +/* + * This helper converts each byte to its binary-equivalent nibble by + * subtraction and combines them to form the return bytes (separated by zero + * bytes). Returns false if any input bytes are outside the expected ranges of + * ASCII values. Otherwise, returns true. + */ +#ifndef USE_NO_SIMD +static inline bool +hex_decode_simd_helper(const Vector8 src, Vector8 *dst) +{ + Vector8 sub; + Vector8 mask_hi = vector8_interleave_low(vector8_broadcast(0), vector8_broadcast(0x0f)); + Vector8 mask_lo = vector8_interleave_low(vector8_broadcast(0x0f), vector8_broadcast(0)); + Vector8 tmp; + bool ret; + + tmp = vector8_gt(vector8_broadcast('9' + 1), src); + sub = vector8_and(tmp, vector8_broadcast('0')); + + tmp = vector8_gt(src, vector8_broadcast('A' - 1)); + tmp = vector8_and(tmp, vector8_broadcast('A' - 10)); + sub = vector8_add(sub, tmp); + + tmp = vector8_gt(src, vector8_broadcast('a' - 1)); + tmp = vector8_and(tmp, vector8_broadcast('a' - 'A')); + sub = vector8_add(sub, tmp); + + *dst = vector8_issub(src, sub); + ret = !vector8_has_ge(*dst, 0x10); + + tmp = vector8_and(*dst, mask_hi); + tmp = vector8_shift_right(tmp, 8); + *dst = vector8_and(*dst, mask_lo); + *dst = vector8_shift_left(*dst, 4); + *dst = vector8_or(*dst, tmp); + return ret; +} +#endif /* ! USE_NO_SIMD */ + +uint64 +hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext) +{ +#ifdef USE_NO_SIMD + return hex_decode_safe_scalar(src, len, dst, escontext); +#else + const uint64 tail_idx = len & ~(sizeof(Vector8) * 2 - 1); + uint64 i; + bool success = true; + + /* + * We must process 2 vectors at a time since the output will be half the + * length of the input. 
+ */ + for (i = 0; i < tail_idx; i += sizeof(Vector8) * 2) + { + Vector8 srcv; + Vector8 dstv1; + Vector8 dstv2; + + vector8_load(&srcv, (const uint8 *) &src[i]); + success &= hex_decode_simd_helper(srcv, &dstv1); + + vector8_load(&srcv, (const uint8 *) &src[i + sizeof(Vector8)]); + success &= hex_decode_simd_helper(srcv, &dstv2); + + vector8_store((uint8 *) &dst[i / 2], vector8_pack_16(dstv1, dstv2)); + } + + /* + * If something didn't look right in the vector path, try again in the + * scalar path so that we can handle it correctly. + */ + if (!success) + i = 0; + + return i / 2 + hex_decode_safe_scalar(src + i, len - i, dst + i / 2, escontext); +#endif +} + static uint64 hex_enc_len(const char *src, size_t srclen) { @@ -267,12 +400,15 @@ hex_dec_len(const char *src, size_t srclen) } /* - * BASE64 + * BASE64 and BASE64URL */ static const char _base64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char _base64url[] = +"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; + static const int8 b64lookup[128] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -284,8 +420,15 @@ static const int8 b64lookup[128] = { 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, }; +/* + * pg_base64_encode_internal + * + * Helper for encoding base64 or base64url. When url is passed as true the + * input will be encoded using base64url. len bytes in src are encoded into + * dst. + */ static uint64 -pg_base64_encode(const char *src, size_t len, char *dst) +pg_base64_encode_internal(const char *src, size_t len, char *dst, bool url) { char *p, *lend = dst + 76; @@ -293,6 +436,7 @@ pg_base64_encode(const char *src, size_t len, char *dst) *end = src + len; int pos = 2; uint32 buf = 0; + const char *alphabet = url ? _base64url : _base64; s = src; p = dst; @@ -306,33 +450,64 @@ pg_base64_encode(const char *src, size_t len, char *dst) /* write it out */ if (pos < 0) { - *p++ = _base64[(buf >> 18) & 0x3f]; - *p++ = _base64[(buf >> 12) & 0x3f]; - *p++ = _base64[(buf >> 6) & 0x3f]; - *p++ = _base64[buf & 0x3f]; + *p++ = alphabet[(buf >> 18) & 0x3f]; + *p++ = alphabet[(buf >> 12) & 0x3f]; + *p++ = alphabet[(buf >> 6) & 0x3f]; + *p++ = alphabet[buf & 0x3f]; pos = 2; buf = 0; - } - if (p >= lend) - { - *p++ = '\n'; - lend = p + 76; + + if (!url && p >= lend) + { + *p++ = '\n'; + lend = p + 76; + } } } + + /* Handle remaining bytes in buf */ if (pos != 2) { - *p++ = _base64[(buf >> 18) & 0x3f]; - *p++ = _base64[(buf >> 12) & 0x3f]; - *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '='; - *p++ = '='; + *p++ = alphabet[(buf >> 18) & 0x3f]; + *p++ = alphabet[(buf >> 12) & 0x3f]; + + if (pos == 0) + { + *p++ = alphabet[(buf >> 6) & 0x3f]; + if (!url) + *p++ = '='; + } + else if (!url) + { + *p++ = '='; + *p++ = '='; + } } return p - dst; } static uint64 -pg_base64_decode(const char *src, size_t len, char *dst) +pg_base64_encode(const char *src, size_t len, char *dst) +{ + return pg_base64_encode_internal(src, len, dst, false); +} + +static uint64 +pg_base64url_encode(const char *src, size_t len, char *dst) +{ + return pg_base64_encode_internal(src, len, dst, true); +} + +/* + * pg_base64_decode_internal + * + * Helper for decoding base64 or base64url. When url is passed as true the + * input will be assumed to be encoded using base64url.
+ */ +static uint64 +pg_base64_decode_internal(const char *src, size_t len, char *dst, bool url) { const char *srcend = src + len, *s = src; @@ -350,6 +525,15 @@ pg_base64_decode(const char *src, size_t len, char *dst) if (c == ' ' || c == '\t' || c == '\n' || c == '\r') continue; + /* convert base64url to base64 */ + if (url) + { + if (c == '-') + c = '+'; + else if (c == '_') + c = '/'; + } + if (c == '=') { /* end sequence */ if (pos == 2) end = 1; else if (pos == 3) end = 2; else + { + /* translator: %s is the name of an encoding scheme */ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unexpected \"=\" while decoding base64 sequence"))); + errmsg("unexpected \"=\" while decoding %s sequence", url ? "base64url" : "base64"))); + } } b = 0; } @@ -372,10 +559,14 @@ pg_base64_decode(const char *src, size_t len, char *dst) if (c > 0 && c < 127) b = b64lookup[(unsigned char) c]; if (b < 0) + { + /* translator: %s is the name of an encoding scheme */ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence", - pg_mblen(s - 1), s - 1))); + errmsg("invalid symbol \"%.*s\" found while decoding %s sequence", + pg_mblen(s - 1), s - 1, + url ? "base64url" : "base64"))); + } } /* add it to buffer */ buf = (buf << 6) + b; @@ -392,15 +583,40 @@ pg_base64_decode(const char *src, size_t len, char *dst) } } - if (pos != 0) + if (pos == 2) + { + buf <<= 12; + *p++ = (buf >> 16) & 0xFF; + } + else if (pos == 3) + { + buf <<= 6; + *p++ = (buf >> 16) & 0xFF; + *p++ = (buf >> 8) & 0xFF; + } + else if (pos != 0) + { + /* translator: %s is the name of an encoding scheme */ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid base64 end sequence"), + errmsg("invalid %s end sequence", url ? "base64url" : "base64"), errhint("Input data is missing padding, is truncated, or is otherwise corrupted."))); + } return p - dst; } +static uint64 +pg_base64_decode(const char *src, size_t len, char *dst) +{ + return pg_base64_decode_internal(src, len, dst, false); +} + +static uint64 +pg_base64url_decode(const char *src, size_t len, char *dst) +{ + return pg_base64_decode_internal(src, len, dst, true); +} static uint64 pg_base64_enc_len(const char *src, size_t srclen) @@ -415,6 +631,32 @@ pg_base64_dec_len(const char *src, size_t srclen) return ((uint64) srclen * 3) >> 2; } +static uint64 +pg_base64url_enc_len(const char *src, size_t srclen) +{ + /* + * Unlike standard base64, base64url doesn't use padding characters when + * the input length is not divisible by 3. + */ + return (srclen + 2) / 3 * 4; +} + +static uint64 +pg_base64url_dec_len(const char *src, size_t srclen) +{ + /* + * For base64, each 4 characters of input produce at most 3 bytes of + * output. For base64url without padding, we need to round the input + * length up to the nearest multiple of 4. + */ + size_t adjusted_len = srclen; + + if (srclen % 4 != 0) + adjusted_len += 4 - (srclen % 4); + + return (adjusted_len * 3) / 4; +} + /* * Escape * Minimally escape bytea to text.
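To make the two length bounds just above concrete: 4 * ceil(n / 3) is the classic padded-output size, so for unpadded base64url it over-allocates by at most two bytes, while the decode bound first pads the input length back up to a multiple of 4 before applying the usual 3/4 ratio. The following standalone sketch (not part of the patch; the names are local to this example) exercises that arithmetic:

#include <stdio.h>

/* same formula as pg_base64url_enc_len(): a worst-case output bound */
static unsigned long
b64url_enc_bound(unsigned long srclen)
{
	return (srclen + 2) / 3 * 4;
}

/* same formula as pg_base64url_dec_len(): pad up to a multiple of 4 first */
static unsigned long
b64url_dec_bound(unsigned long srclen)
{
	unsigned long adjusted = srclen;

	if (srclen % 4 != 0)
		adjusted += 4 - (srclen % 4);
	return adjusted * 3 / 4;
}

int
main(void)
{
	/* 5 raw bytes encode to 7 unpadded characters; both bounds stay safe */
	printf("%lu\n", b64url_enc_bound(5));	/* 8, covering the 7 chars emitted */
	printf("%lu\n", b64url_dec_bound(7));	/* 6, covering the 5 bytes decoded */
	return 0;
}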
@@ -606,6 +848,12 @@ static const struct pg_base64_enc_len, pg_base64_dec_len, pg_base64_encode, pg_base64_decode } }, + { + "base64url", + { + pg_base64url_enc_len, pg_base64url_dec_len, pg_base64url_encode, pg_base64url_decode + } + }, { "escape", { diff --git a/src/backend/utils/adt/expandedrecord.c b/src/backend/utils/adt/expandedrecord.c index 13752db44e839..495d48fb581b5 100644 --- a/src/backend/utils/adt/expandedrecord.c +++ b/src/backend/utils/adt/expandedrecord.c @@ -547,7 +547,7 @@ expanded_record_set_tuple(ExpandedRecordHeader *erh, for (i = 0; i < erh->nfields; i++) { if (!erh->dnulls[i] && - !(TupleDescAttr(tupdesc, i)->attbyval)) + !(TupleDescCompactAttr(tupdesc, i)->attbyval)) { char *oldValue = (char *) DatumGetPointer(erh->dvalues[i]); diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c index 6d20ae07ae7b0..849639fda9f87 100644 --- a/src/backend/utils/adt/float.c +++ b/src/backend/utils/adt/float.c @@ -3319,9 +3319,21 @@ float8_stddev_samp(PG_FUNCTION_ARGS) * As with the preceding aggregates, we use the Youngs-Cramer algorithm to * reduce rounding errors in the aggregate final functions. * - * The transition datatype for all these aggregates is a 6-element array of + * The transition datatype for all these aggregates is an 8-element array of * float8, holding the values N, Sx=sum(X), Sxx=sum((X-Sx/N)^2), Sy=sum(Y), - * Syy=sum((Y-Sy/N)^2), Sxy=sum((X-Sx/N)*(Y-Sy/N)) in that order. + * Syy=sum((Y-Sy/N)^2), Sxy=sum((X-Sx/N)*(Y-Sy/N)), commonX, and commonY + * in that order. + * + * commonX is defined as the common X value if all the X values were the same, + * else NaN; likewise for commonY. This is useful for deciding whether corr() + * and related functions should return NULL. This representation cannot + * distinguish the-values-were-all-NaN from the-values-were-not-all-the-same, + * but that's okay because for this purpose we use the IEEE float arithmetic + * principle that two NaNs are never equal. The SQL standard doesn't mention + * NaNs, but it says that NULL is to be returned when N*sum(X*X) equals + * sum(X)*sum(X) (etc), and that shouldn't be considered true for NaNs. + * Testing this as written in the spec would be highly subject to roundoff + * error, so instead we directly track whether all the inputs are equal. * * Note that Y is the first argument to all these aggregates! * @@ -3345,17 +3357,21 @@ float8_regr_accum(PG_FUNCTION_ARGS) Sy, Syy, Sxy, + commonX, + commonY, tmpX, tmpY, scale; - transvalues = check_float8_array(transarray, "float8_regr_accum", 6); + transvalues = check_float8_array(transarray, "float8_regr_accum", 8); N = transvalues[0]; Sx = transvalues[1]; Sxx = transvalues[2]; Sy = transvalues[3]; Syy = transvalues[4]; Sxy = transvalues[5]; + commonX = transvalues[6]; + commonY = transvalues[7]; /* * Use the Youngs-Cramer algorithm to incorporate the new values into the @@ -3366,12 +3382,33 @@ float8_regr_accum(PG_FUNCTION_ARGS) Sy += newvalY; if (transvalues[0] > 0.0) { + /* + * Check to see if we have seen distinct inputs. We can use a test + * that's a bit cheaper than float8_ne() because if commonX is already + * NaN, it does not matter whether the != test returns true or not. 
+ */ + if (newvalX != commonX || isnan(newvalX)) + commonX = get_float8_nan(); + if (newvalY != commonY || isnan(newvalY)) + commonY = get_float8_nan(); + tmpX = newvalX * N - Sx; tmpY = newvalY * N - Sy; scale = 1.0 / (N * transvalues[0]); - Sxx += tmpX * tmpX * scale; - Syy += tmpY * tmpY * scale; - Sxy += tmpX * tmpY * scale; + + /* + * If we have not seen distinct inputs, then Sxx, Syy, and/or Sxy + * should remain zero (since Sx's exact value would be N * commonX, + * etc). Updating them would just create the possibility of injecting + * roundoff error, and we need exact zero results so that the final + * functions will return NULL in the right cases. + */ + if (isnan(commonX)) + Sxx += tmpX * tmpX * scale; + if (isnan(commonY)) + Syy += tmpY * tmpY * scale; + if (isnan(commonX) && isnan(commonY)) + Sxy += tmpX * tmpY * scale; /* * Overflow check. We only report an overflow error when finite @@ -3410,6 +3447,9 @@ float8_regr_accum(PG_FUNCTION_ARGS) Sxx = Sxy = get_float8_nan(); if (isnan(newvalY) || isinf(newvalY)) Syy = Sxy = get_float8_nan(); + + commonX = newvalX; + commonY = newvalY; } /* @@ -3425,12 +3465,14 @@ float8_regr_accum(PG_FUNCTION_ARGS) transvalues[3] = Sy; transvalues[4] = Syy; transvalues[5] = Sxy; + transvalues[6] = commonX; + transvalues[7] = commonY; PG_RETURN_ARRAYTYPE_P(transarray); } else { - Datum transdatums[6]; + Datum transdatums[8]; ArrayType *result; transdatums[0] = Float8GetDatumFast(N); @@ -3439,8 +3481,10 @@ float8_regr_accum(PG_FUNCTION_ARGS) transdatums[3] = Float8GetDatumFast(Sy); transdatums[4] = Float8GetDatumFast(Syy); transdatums[5] = Float8GetDatumFast(Sxy); + transdatums[6] = Float8GetDatumFast(commonX); + transdatums[7] = Float8GetDatumFast(commonY); - result = construct_array_builtin(transdatums, 6, FLOAT8OID); + result = construct_array_builtin(transdatums, 8, FLOAT8OID); PG_RETURN_ARRAYTYPE_P(result); } @@ -3449,7 +3493,7 @@ float8_regr_accum(PG_FUNCTION_ARGS) /* * float8_regr_combine * - * An aggregate combine function used to combine two 6 fields + * An aggregate combine function used to combine two 8-field * aggregate transition data into a single transition data. * This function is used only in two stage aggregation and * shouldn't be called outside aggregate context.
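The common-value bookkeeping added above reduces to a small update rule, shown here as a standalone model (not the patch itself; track_common is an invented name). It relies on plain C comparison semantics: a non-NaN tracked value survives only while every new input compares equal to it, and because NaN never compares equal to anything, the NaN marker can never revert to "constant":

#include <math.h>
#include <stdio.h>

/*
 * Return the tracked common value, or NaN once any input differs
 * (or is itself NaN, which the patch treats as "not constant").
 */
static double
track_common(double common, double newval)
{
	if (newval != common || isnan(newval))
		return NAN;
	return common;
}

int
main(void)
{
	double same[] = {2.5, 2.5, 2.5};
	double mixed[] = {2.5, 7.0, 2.5};
	double c;

	c = same[0];				/* the first row seeds the common value */
	for (int i = 1; i < 3; i++)
		c = track_common(c, same[i]);
	printf("%g\n", c);			/* 2.5: the inputs never varied */

	c = mixed[0];
	for (int i = 1; i < 3; i++)
		c = track_common(c, mixed[i]);
	printf("%g\n", c);			/* nan: the marker sticks after one mismatch */
	return 0;
}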
@@ -3467,12 +3511,16 @@ float8_regr_combine(PG_FUNCTION_ARGS) Sy1, Syy1, Sxy1, + Cx1, + Cy1, N2, Sx2, Sxx2, Sy2, Syy2, Sxy2, + Cx2, + Cy2, tmp1, tmp2, N, @@ -3480,10 +3528,12 @@ float8_regr_combine(PG_FUNCTION_ARGS) Sxx, Sy, Syy, - Sxy; + Sxy, + Cx, + Cy; - transvalues1 = check_float8_array(transarray1, "float8_regr_combine", 6); - transvalues2 = check_float8_array(transarray2, "float8_regr_combine", 6); + transvalues1 = check_float8_array(transarray1, "float8_regr_combine", 8); + transvalues2 = check_float8_array(transarray2, "float8_regr_combine", 8); N1 = transvalues1[0]; Sx1 = transvalues1[1]; @@ -3491,6 +3541,8 @@ float8_regr_combine(PG_FUNCTION_ARGS) Sy1 = transvalues1[3]; Syy1 = transvalues1[4]; Sxy1 = transvalues1[5]; + Cx1 = transvalues1[6]; + Cy1 = transvalues1[7]; N2 = transvalues2[0]; Sx2 = transvalues2[1]; @@ -3498,6 +3550,8 @@ float8_regr_combine(PG_FUNCTION_ARGS) Sy2 = transvalues2[3]; Syy2 = transvalues2[4]; Sxy2 = transvalues2[5]; + Cx2 = transvalues2[6]; + Cy2 = transvalues2[7]; /*-------------------- * The transition values combine using a generalization of the @@ -3523,6 +3577,8 @@ float8_regr_combine(PG_FUNCTION_ARGS) Sy = Sy2; Syy = Syy2; Sxy = Sxy2; + Cx = Cx2; + Cy = Cy2; } else if (N2 == 0.0) { @@ -3532,6 +3588,8 @@ float8_regr_combine(PG_FUNCTION_ARGS) Sy = Sy1; Syy = Syy1; Sxy = Sxy1; + Cx = Cx1; + Cy = Cy1; } else { @@ -3549,6 +3607,14 @@ float8_regr_combine(PG_FUNCTION_ARGS) Sxy = Sxy1 + Sxy2 + N1 * N2 * tmp1 * tmp2 / N; if (unlikely(isinf(Sxy)) && !isinf(Sxy1) && !isinf(Sxy2)) float_overflow_error(); + if (float8_eq(Cx1, Cx2)) + Cx = Cx1; + else + Cx = get_float8_nan(); + if (float8_eq(Cy1, Cy2)) + Cy = Cy1; + else + Cy = get_float8_nan(); } /* @@ -3564,12 +3630,14 @@ float8_regr_combine(PG_FUNCTION_ARGS) transvalues1[3] = Sy; transvalues1[4] = Syy; transvalues1[5] = Sxy; + transvalues1[6] = Cx; + transvalues1[7] = Cy; PG_RETURN_ARRAYTYPE_P(transarray1); } else { - Datum transdatums[6]; + Datum transdatums[8]; ArrayType *result; transdatums[0] = Float8GetDatumFast(N); @@ -3578,8 +3646,10 @@ float8_regr_combine(PG_FUNCTION_ARGS) transdatums[3] = Float8GetDatumFast(Sy); transdatums[4] = Float8GetDatumFast(Syy); transdatums[5] = Float8GetDatumFast(Sxy); + transdatums[6] = Float8GetDatumFast(Cx); + transdatums[7] = Float8GetDatumFast(Cy); - result = construct_array_builtin(transdatums, 6, FLOAT8OID); + result = construct_array_builtin(transdatums, 8, FLOAT8OID); PG_RETURN_ARRAYTYPE_P(result); } @@ -3594,7 +3664,7 @@ float8_regr_sxx(PG_FUNCTION_ARGS) float8 N, Sxx; - transvalues = check_float8_array(transarray, "float8_regr_sxx", 6); + transvalues = check_float8_array(transarray, "float8_regr_sxx", 8); N = transvalues[0]; Sxx = transvalues[2]; @@ -3615,7 +3685,7 @@ float8_regr_syy(PG_FUNCTION_ARGS) float8 N, Syy; - transvalues = check_float8_array(transarray, "float8_regr_syy", 6); + transvalues = check_float8_array(transarray, "float8_regr_syy", 8); N = transvalues[0]; Syy = transvalues[4]; @@ -3636,7 +3706,7 @@ float8_regr_sxy(PG_FUNCTION_ARGS) float8 N, Sxy; - transvalues = check_float8_array(transarray, "float8_regr_sxy", 6); + transvalues = check_float8_array(transarray, "float8_regr_sxy", 8); N = transvalues[0]; Sxy = transvalues[5]; @@ -3655,16 +3725,22 @@ float8_regr_avgx(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - Sx; + Sx, + commonX; - transvalues = check_float8_array(transarray, "float8_regr_avgx", 6); + transvalues = check_float8_array(transarray, "float8_regr_avgx", 8); N = transvalues[0]; Sx = 
transvalues[1]; + commonX = transvalues[6]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); + /* if all inputs were the same just return that, avoiding roundoff error */ + if (!isnan(commonX)) + PG_RETURN_FLOAT8(commonX); + PG_RETURN_FLOAT8(Sx / N); } @@ -3674,16 +3750,22 @@ float8_regr_avgy(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - Sy; + Sy, + commonY; - transvalues = check_float8_array(transarray, "float8_regr_avgy", 6); + transvalues = check_float8_array(transarray, "float8_regr_avgy", 8); N = transvalues[0]; Sy = transvalues[3]; + commonY = transvalues[7]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); + /* if all inputs were the same just return that, avoiding roundoff error */ + if (!isnan(commonY)) + PG_RETURN_FLOAT8(commonY); + PG_RETURN_FLOAT8(Sy / N); } @@ -3695,7 +3777,7 @@ float8_covar_pop(PG_FUNCTION_ARGS) float8 N, Sxy; - transvalues = check_float8_array(transarray, "float8_covar_pop", 6); + transvalues = check_float8_array(transarray, "float8_covar_pop", 8); N = transvalues[0]; Sxy = transvalues[5]; @@ -3714,7 +3796,7 @@ float8_covar_samp(PG_FUNCTION_ARGS) float8 N, Sxy; - transvalues = check_float8_array(transarray, "float8_covar_samp", 6); + transvalues = check_float8_array(transarray, "float8_covar_samp", 8); N = transvalues[0]; Sxy = transvalues[5]; @@ -3733,9 +3815,12 @@ float8_corr(PG_FUNCTION_ARGS) float8 N, Sxx, Syy, - Sxy; + Sxy, + product, + sqrtproduct, + result; - transvalues = check_float8_array(transarray, "float8_corr", 6); + transvalues = check_float8_array(transarray, "float8_corr", 8); N = transvalues[0]; Sxx = transvalues[2]; Syy = transvalues[4]; @@ -3751,7 +3836,29 @@ float8_corr(PG_FUNCTION_ARGS) if (Sxx == 0 || Syy == 0) PG_RETURN_NULL(); - PG_RETURN_FLOAT8(Sxy / sqrt(Sxx * Syy)); + /* + * The product Sxx * Syy might underflow or overflow. If so, we can + * recover by computing sqrt(Sxx) * sqrt(Syy) instead of sqrt(Sxx * Syy). + * However, the double sqrt() calculation is a bit slower and less + * accurate, so don't do it if we don't have to. + */ + product = Sxx * Syy; + if (product == 0 || isinf(product)) + sqrtproduct = sqrt(Sxx) * sqrt(Syy); + else + sqrtproduct = sqrt(product); + result = Sxy / sqrtproduct; + + /* + * Despite all these precautions, this formula can yield results outside + * [-1, 1] due to roundoff error. Clamp it to the expected range. + */ + if (result < -1) + result = -1; + else if (result > 1) + result = 1; + + PG_RETURN_FLOAT8(result); } Datum @@ -3764,7 +3871,7 @@ float8_regr_r2(PG_FUNCTION_ARGS) Syy, Sxy; - transvalues = check_float8_array(transarray, "float8_regr_r2", 6); + transvalues = check_float8_array(transarray, "float8_regr_r2", 8); N = transvalues[0]; Sxx = transvalues[2]; Syy = transvalues[4]; @@ -3796,7 +3903,7 @@ float8_regr_slope(PG_FUNCTION_ARGS) Sxx, Sxy; - transvalues = check_float8_array(transarray, "float8_regr_slope", 6); + transvalues = check_float8_array(transarray, "float8_regr_slope", 8); N = transvalues[0]; Sxx = transvalues[2]; Sxy = transvalues[5]; @@ -3825,7 +3932,7 @@ float8_regr_intercept(PG_FUNCTION_ARGS) Sy, Sxy; - transvalues = check_float8_array(transarray, "float8_regr_intercept", 6); + transvalues = check_float8_array(transarray, "float8_regr_intercept", 8); N = transvalues[0]; Sx = transvalues[1]; Sxx = transvalues[2]; @@ -4065,10 +4172,11 @@ float84ge(PG_FUNCTION_ARGS) * in the histogram. 
width_bucket() returns an integer indicating the * bucket number that 'operand' belongs to in an equiwidth histogram * with the specified characteristics. An operand smaller than the - * lower bound is assigned to bucket 0. An operand greater than the - * upper bound is assigned to an additional bucket (with number - * count+1). We don't allow "NaN" for any of the float8 inputs, and we - * don't allow either of the histogram bounds to be +/- infinity. + * lower bound is assigned to bucket 0. An operand greater than or equal + * to the upper bound is assigned to an additional bucket (with number + * count+1). We don't allow the histogram bounds to be NaN or +/- infinity, + * but we do allow those values for the operand (taking NaN to be larger + * than any other value, as we do in comparisons). */ Datum width_bucket_float8(PG_FUNCTION_ARGS) @@ -4084,12 +4192,11 @@ width_bucket_float8(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), errmsg("count must be greater than zero"))); - if (isnan(operand) || isnan(bound1) || isnan(bound2)) + if (isnan(bound1) || isnan(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), - errmsg("operand, lower bound, and upper bound cannot be NaN"))); + errmsg("lower and upper bounds cannot be NaN"))); - /* Note that we allow "operand" to be infinite */ if (isinf(bound1) || isinf(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), @@ -4097,15 +4204,15 @@ width_bucket_float8(PG_FUNCTION_ARGS) if (bound1 < bound2) { - if (operand < bound1) - result = 0; - else if (operand >= bound2) + if (isnan(operand) || operand >= bound2) { if (pg_add_s32_overflow(count, 1, &result)) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("integer out of range"))); } + else if (operand < bound1) + result = 0; else { if (!isinf(bound2 - bound1)) @@ -4135,7 +4242,7 @@ width_bucket_float8(PG_FUNCTION_ARGS) } else if (bound1 > bound2) { - if (operand > bound1) + if (isnan(operand) || operand > bound1) result = 0; else if (operand <= bound2) { diff --git a/src/backend/utils/adt/format_type.c b/src/backend/utils/adt/format_type.c index 9948c26e76cd5..0afd1cb3563fe 100644 --- a/src/backend/utils/adt/format_type.c +++ b/src/backend/utils/adt/format_type.c @@ -378,7 +378,7 @@ printTypmod(const char *typname, int32 typmod, Oid typmodout) if (typmodout == InvalidOid) { /* Default behavior: just print the integer typmod with parens */ - res = psprintf("%s(%d)", typname, (int) typmod); + res = psprintf("%s(%d)", typname, typmod); } else { diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 5bd1e01f7e463..a4570471bbaee 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1,4 +1,4 @@ -/* ----------------------------------------------------------------------- +/*------------------------------------------------------------------------- * formatting.c * * src/backend/utils/adt/formatting.c @@ -54,7 +54,7 @@ * than Oracle :-), * to_char('Hello', 'X X X X X') -> 'H e l l o' * - * ----------------------------------------------------------------------- + *------------------------------------------------------------------------- */ #ifdef DEBUG_TO_FROM_CHAR @@ -92,44 +92,46 @@ #include "varatt.h" -/* ---------- +/* * Routines flags - * ---------- */ #define DCH_FLAG 0x1 /* DATE-TIME flag */ #define NUM_FLAG 0x2 /* NUMBER flag */ #define STD_FLAG 0x4 /* STANDARD flag */ -/* ---------- +/* * KeyWord Index (ascii from 
position 32 (' ') to 126 (~)) - * ---------- */ #define KeyWord_INDEX_SIZE ('~' - ' ') #define KeyWord_INDEX_FILTER(_c) ((_c) <= ' ' || (_c) >= '~' ? 0 : 1) -/* ---------- +/* * Maximal length of one node - * ---------- */ #define DCH_MAX_ITEM_SIZ 12 /* max localized day name */ #define NUM_MAX_ITEM_SIZ 8 /* roman number (RN has 15 chars) */ -/* ---------- +/* * Format parser structs - * ---------- */ + +enum KeySuffixType +{ + SUFFTYPE_PREFIX = 1, + SUFFTYPE_POSTFIX = 2, +}; + typedef struct { const char *name; /* suffix string */ - int len, /* suffix length */ - id, /* used in node->suffix */ - type; /* prefix / postfix */ + size_t len; /* suffix length */ + int id; /* used in node->suffix */ + enum KeySuffixType type; /* prefix / postfix */ } KeySuffix; -/* ---------- +/* * FromCharDateMode - * ---------- * * This value is used to nominate one of several distinct (and mutually * exclusive) date conventions that a keyword can belong to. @@ -144,36 +146,33 @@ typedef enum typedef struct { const char *name; - int len; + size_t len; int id; bool is_digit; FromCharDateMode date_mode; } KeyWord; +enum FormatNodeType +{ + NODE_TYPE_END = 1, + NODE_TYPE_ACTION = 2, + NODE_TYPE_CHAR = 3, + NODE_TYPE_SEPARATOR = 4, + NODE_TYPE_SPACE = 5, +}; + typedef struct { - uint8 type; /* NODE_TYPE_XXX, see below */ + enum FormatNodeType type; char character[MAX_MULTIBYTE_CHAR_LEN + 1]; /* if type is CHAR */ - uint8 suffix; /* keyword prefix/suffix code, if any */ + uint8 suffix; /* keyword prefix/suffix code, if any + * (DCH_SUFFIX_*) */ const KeyWord *key; /* if type is ACTION */ } FormatNode; -#define NODE_TYPE_END 1 -#define NODE_TYPE_ACTION 2 -#define NODE_TYPE_CHAR 3 -#define NODE_TYPE_SEPARATOR 4 -#define NODE_TYPE_SPACE 5 - -#define SUFFTYPE_PREFIX 1 -#define SUFFTYPE_POSTFIX 2 - -#define CLOCK_24_HOUR 0 -#define CLOCK_12_HOUR 1 - -/* ---------- +/* * Full months - * ---------- */ static const char *const months_full[] = { "January", "February", "March", "April", "May", "June", "July", @@ -184,9 +183,9 @@ static const char *const days_short[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", NULL }; -/* ---------- +/* * AD / BC - * ---------- + * * There is no 0 AD. Years go from 1 BC to 1 AD, so we make it * positive and map year == -1 to year zero, and shift all negative * years up one. For interval years, we just return the year. @@ -216,9 +215,8 @@ static const char *const days_short[] = { static const char *const adbc_strings[] = {ad_STR, bc_STR, AD_STR, BC_STR, NULL}; static const char *const adbc_strings_long[] = {a_d_STR, b_c_STR, A_D_STR, B_C_STR, NULL}; -/* ---------- +/* * AM / PM - * ---------- */ #define A_M_STR "A.M." #define a_m_STR "a.m." 
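The AD/BC comment above is easy to misread, so here is a tiny standalone illustration (not part of the patch; display_year is an invented helper) of the convention it describes, where internal year 0 stands for 1 BC and negative years shift up by one before the sign flip:

#include <stdio.h>

/* Map an internal year number to the (always positive) displayed year. */
static int
display_year(int year, int *bc)
{
	*bc = (year <= 0);
	return (year <= 0) ? -(year - 1) : year;
}

int
main(void)
{
	int internal[] = {2, 1, 0, -1};

	for (int i = 0; i < 4; i++)
	{
		int bc;
		int y = display_year(internal[i], &bc);

		/* prints 2 AD, 1 AD, 1 BC, 2 BC -- note there is no year 0 */
		printf("internal %2d -> %d %s\n", internal[i], y, bc ? "BC" : "AD");
	}
	return 0;
}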
@@ -243,11 +241,10 @@ static const char *const adbc_strings_long[] = {a_d_STR, b_c_STR, A_D_STR, B_C_S static const char *const ampm_strings[] = {am_STR, pm_STR, AM_STR, PM_STR, NULL}; static const char *const ampm_strings_long[] = {a_m_STR, p_m_STR, A_M_STR, P_M_STR, NULL}; -/* ---------- +/* * Months in roman-numeral * (Must be in reverse order for seq_search (in FROM_CHAR), because * 'VIII' must have higher precedence than 'V') - * ---------- */ static const char *const rm_months_upper[] = {"XII", "XI", "X", "IX", "VIII", "VII", "VI", "V", "IV", "III", "II", "I", NULL}; @@ -255,9 +252,8 @@ static const char *const rm_months_upper[] = static const char *const rm_months_lower[] = {"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i", NULL}; -/* ---------- +/* * Roman numerals - * ---------- */ static const char *const rm1[] = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", NULL}; static const char *const rm10[] = {"X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC", NULL}; @@ -289,40 +285,46 @@ static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "D */ #define MAX_ROMAN_LEN 15 -/* ---------- +/* * Ordinal postfixes - * ---------- */ static const char *const numTH[] = {"ST", "ND", "RD", "TH", NULL}; static const char *const numth[] = {"st", "nd", "rd", "th", NULL}; -/* ---------- +/* * Flags & Options: - * ---------- */ -#define TH_UPPER 1 -#define TH_LOWER 2 +enum TH_Case +{ + TH_UPPER = 1, + TH_LOWER = 2, +}; -/* ---------- +enum NUMDesc_lsign +{ + NUM_LSIGN_PRE = -1, + NUM_LSIGN_POST = 1, + NUM_LSIGN_NONE = 0, +}; + +/* * Number description struct - * ---------- */ typedef struct { - int pre, /* (count) numbers before decimal */ - post, /* (count) numbers after decimal */ - lsign, /* want locales sign */ - flag, /* number parameters */ - pre_lsign_num, /* tmp value for lsign */ - multi, /* multiplier for 'V' */ - zero_start, /* position of first zero */ - zero_end, /* position of last zero */ - need_locale; /* needs it locale */ + int pre; /* (count) numbers before decimal */ + int post; /* (count) numbers after decimal */ + enum NUMDesc_lsign lsign; /* want locales sign */ + int flag; /* number parameters (NUM_F_*) */ + int pre_lsign_num; /* tmp value for lsign */ + int multi; /* multiplier for 'V' */ + int zero_start; /* position of first zero */ + int zero_end; /* position of last zero */ + bool need_locale; /* needs it locale */ } NUMDesc; -/* ---------- +/* * Flags for NUMBER version - * ---------- */ #define NUM_F_DECIMAL (1 << 1) #define NUM_F_LDECIMAL (1 << 2) @@ -339,13 +341,8 @@ typedef struct #define NUM_F_MINUS_POST (1 << 13) #define NUM_F_EEEE (1 << 14) -#define NUM_LSIGN_PRE (-1) -#define NUM_LSIGN_POST 1 -#define NUM_LSIGN_NONE 0 - -/* ---------- +/* * Tests - * ---------- */ #define IS_DECIMAL(_f) ((_f)->flag & NUM_F_DECIMAL) #define IS_LDECIMAL(_f) ((_f)->flag & NUM_F_LDECIMAL) @@ -360,7 +357,7 @@ typedef struct #define IS_MULTI(_f) ((_f)->flag & NUM_F_MULTI) #define IS_EEEE(_f) ((_f)->flag & NUM_F_EEEE) -/* ---------- +/* * Format picture cache * * We will cache datetime format pictures up to DCH_CACHE_SIZE bytes long; @@ -376,7 +373,6 @@ typedef struct * * The max number of entries in each cache is DCH_CACHE_ENTRIES * resp. NUM_CACHE_ENTRIES. 
- * ---------- */ #define DCH_CACHE_OVERHEAD \ MAXALIGN(sizeof(bool) + sizeof(int)) @@ -419,53 +415,49 @@ static NUMCacheEntry *NUMCache[NUM_CACHE_ENTRIES]; static int n_NUMCache = 0; /* current number of entries */ static int NUMCounter = 0; /* aging-event counter */ -/* ---------- +/* * For char->date/time conversion - * ---------- */ typedef struct { FromCharDateMode mode; - int hh, - pm, - mi, - ss, - ssss, - d, /* stored as 1-7, Sunday = 1, 0 means missing */ - dd, - ddd, - mm, - ms, - year, - bc, - ww, - w, - cc, - j, - us, - yysz, /* is it YY or YYYY ? */ - clock, /* 12 or 24 hour clock? */ - tzsign, /* +1, -1, or 0 if no TZH/TZM fields */ - tzh, - tzm, - ff; /* fractional precision */ + int hh; + int pm; + int mi; + int ss; + int ssss; + int d; /* stored as 1-7, Sunday = 1, 0 means missing */ + int dd; + int ddd; + int mm; + int ms; + int year; + int bc; + int ww; + int w; + int cc; + int j; + int us; + int yysz; /* is it YY or YYYY ? */ + bool clock_12_hour; /* 12 or 24 hour clock? */ + int tzsign; /* +1, -1, or 0 if no TZH/TZM fields */ + int tzh; + int tzm; + int ff; /* fractional precision */ bool has_tz; /* was there a TZ field? */ int gmtoffset; /* GMT offset of fixed-offset zone abbrev */ pg_tz *tzp; /* pg_tz for dynamic abbrev */ - char *abbrev; /* dynamic abbrev */ + const char *abbrev; /* dynamic abbrev */ } TmFromChar; -#define ZERO_tmfc(_X) memset(_X, 0, sizeof(TmFromChar)) - struct fmt_tz /* do_to_timestamp's timezone info output */ { bool has_tz; /* was there any TZ/TZH/TZM field? */ int gmtoffset; /* GMT offset in seconds */ }; -/* ---------- +/* * Debug - * ---------- */ #ifdef DEBUG_TO_FROM_CHAR #define DEBUG_TMFC(_X) \ @@ -473,7 +465,7 @@ struct fmt_tz /* do_to_timestamp's timezone info output */ (_X)->mode, (_X)->hh, (_X)->pm, (_X)->mi, (_X)->ss, (_X)->ssss, \ (_X)->d, (_X)->dd, (_X)->ddd, (_X)->mm, (_X)->ms, (_X)->year, \ (_X)->bc, (_X)->ww, (_X)->w, (_X)->cc, (_X)->j, (_X)->us, \ - (_X)->yysz, (_X)->clock) + (_X)->yysz, (_X)->clock_12_hour) #define DEBUG_TM(_X) \ elog(DEBUG_elog_output, "TM:\nsec %d\nyear %d\nmin %d\nwday %d\nhour %d\nyday %d\nmday %d\nnisdst %d\nmon %d\n",\ (_X)->tm_sec, (_X)->tm_year,\ @@ -484,13 +476,12 @@ struct fmt_tz /* do_to_timestamp's timezone info output */ #define DEBUG_TM(_X) #endif -/* ---------- +/* * Datetime to char conversion * * To support intervals as well as timestamps, we use a custom "tm" struct * that is almost like struct pg_tm, but has a 64-bit tm_hour field. * We omit the tm_isdst and tm_zone fields, which are not used here. - * ---------- */ struct fmt_tm { @@ -561,50 +552,74 @@ do { \ * KeyWord definitions *****************************************************************************/ -/* ---------- +/* * Suffixes (FormatNode.suffix is an OR of these codes) - * ---------- */ -#define DCH_S_FM 0x01 -#define DCH_S_TH 0x02 -#define DCH_S_th 0x04 -#define DCH_S_SP 0x08 -#define DCH_S_TM 0x10 +#define DCH_SUFFIX_FM 0x01 +#define DCH_SUFFIX_TH 0x02 +#define DCH_SUFFIX_th 0x04 +#define DCH_SUFFIX_SP 0x08 +#define DCH_SUFFIX_TM 0x10 -/* ---------- +/* * Suffix tests - * ---------- */ -#define S_THth(_s) ((((_s) & DCH_S_TH) || ((_s) & DCH_S_th)) ? 1 : 0) -#define S_TH(_s) (((_s) & DCH_S_TH) ? 1 : 0) -#define S_th(_s) (((_s) & DCH_S_th) ? 1 : 0) -#define S_TH_TYPE(_s) (((_s) & DCH_S_TH) ? 
TH_UPPER : TH_LOWER) +static inline bool +IS_SUFFIX_TH(uint8 _s) +{ + return (_s & DCH_SUFFIX_TH); +} + +static inline bool +IS_SUFFIX_th(uint8 _s) +{ + return (_s & DCH_SUFFIX_th); +} + +static inline bool +IS_SUFFIX_THth(uint8 _s) +{ + return IS_SUFFIX_TH(_s) || IS_SUFFIX_th(_s); +} + +static inline enum TH_Case +SUFFIX_TH_TYPE(uint8 _s) +{ + return _s & DCH_SUFFIX_TH ? TH_UPPER : TH_LOWER; +} /* Oracle toggles FM behavior, we don't; see docs. */ -#define S_FM(_s) (((_s) & DCH_S_FM) ? 1 : 0) -#define S_SP(_s) (((_s) & DCH_S_SP) ? 1 : 0) -#define S_TM(_s) (((_s) & DCH_S_TM) ? 1 : 0) +static inline bool +IS_SUFFIX_FM(uint8 _s) +{ + return (_s & DCH_SUFFIX_FM); +} -/* ---------- +static inline bool +IS_SUFFIX_TM(uint8 _s) +{ + return (_s & DCH_SUFFIX_TM); +} + +/* * Suffixes definition for DATE-TIME TO/FROM CHAR - * ---------- */ #define TM_SUFFIX_LEN 2 static const KeySuffix DCH_suff[] = { - {"FM", 2, DCH_S_FM, SUFFTYPE_PREFIX}, - {"fm", 2, DCH_S_FM, SUFFTYPE_PREFIX}, - {"TM", TM_SUFFIX_LEN, DCH_S_TM, SUFFTYPE_PREFIX}, - {"tm", 2, DCH_S_TM, SUFFTYPE_PREFIX}, - {"TH", 2, DCH_S_TH, SUFFTYPE_POSTFIX}, - {"th", 2, DCH_S_th, SUFFTYPE_POSTFIX}, - {"SP", 2, DCH_S_SP, SUFFTYPE_POSTFIX}, + {"FM", 2, DCH_SUFFIX_FM, SUFFTYPE_PREFIX}, + {"fm", 2, DCH_SUFFIX_FM, SUFFTYPE_PREFIX}, + {"TM", TM_SUFFIX_LEN, DCH_SUFFIX_TM, SUFFTYPE_PREFIX}, + {"tm", 2, DCH_SUFFIX_TM, SUFFTYPE_PREFIX}, + {"TH", 2, DCH_SUFFIX_TH, SUFFTYPE_POSTFIX}, + {"th", 2, DCH_SUFFIX_th, SUFFTYPE_POSTFIX}, + {"SP", 2, DCH_SUFFIX_SP, SUFFTYPE_POSTFIX}, /* last */ {NULL, 0, 0, 0} }; -/* ---------- +/* * Format-pictures (KeyWord). * * The KeyWord field; alphabetic sorted, *BUT* strings alike is sorted @@ -628,8 +643,6 @@ static const KeySuffix DCH_suff[] = { * 1) see in index to index['M' - 32], * 2) take keywords position (enum DCH_MI) from index * 3) run sequential search in keywords[] from this position - * - * ---------- */ typedef enum @@ -794,9 +807,8 @@ typedef enum _NUM_last_ } NUM_poz; -/* ---------- +/* * KeyWords for DATE-TIME version - * ---------- */ static const KeyWord DCH_keywords[] = { /* name, len, id, is_digit, date_mode */ @@ -917,11 +929,10 @@ static const KeyWord DCH_keywords[] = { {NULL, 0, 0, 0, 0} }; -/* ---------- +/* * KeyWords for NUMBER version * * The is_digit and date_mode fields are not relevant here. 
- * ---------- */ static const KeyWord NUM_keywords[] = { /* name, len, id is in Index */ @@ -967,9 +978,8 @@ static const KeyWord NUM_keywords[] = { }; -/* ---------- +/* * KeyWords index for DATE-TIME version - * ---------- */ static const int DCH_index[KeyWord_INDEX_SIZE] = { /* @@ -991,9 +1001,8 @@ static const int DCH_index[KeyWord_INDEX_SIZE] = { /*---- chars over 126 are skipped ----*/ }; -/* ---------- +/* * KeyWords index for NUMBER version - * ---------- */ static const int NUM_index[KeyWord_INDEX_SIZE] = { /* @@ -1015,9 +1024,8 @@ static const int NUM_index[KeyWord_INDEX_SIZE] = { /*---- chars over 126 are skipped ----*/ }; -/* ---------- +/* * Number processor struct - * ---------- */ typedef struct NUMProc { @@ -1038,8 +1046,9 @@ typedef struct NUMProc char *number, /* string with number */ *number_p, /* pointer to current number position */ *inout, /* in / out buffer */ - *inout_p, /* pointer to current inout position */ - *last_relevant, /* last relevant number after decimal point */ + *inout_p; /* pointer to current inout position */ + + const char *last_relevant, /* last relevant number after decimal point */ *L_negative_sign, /* Locale */ *L_positive_sign, @@ -1062,13 +1071,12 @@ typedef struct NUMProc #define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s))) -/* ---------- +/* * Functions - * ---------- */ static const KeyWord *index_seq_search(const char *str, const KeyWord *kw, const int *index); -static const KeySuffix *suff_search(const char *str, const KeySuffix *suf, int type); +static const KeySuffix *suff_search(const char *str, const KeySuffix *suf, enum KeySuffixType type); static bool is_separator_char(const char *str); static void NUMDesc_prepare(NUMDesc *num, FormatNode *n); static void parse_format(FormatNode *node, const char *str, const KeyWord *kw, @@ -1084,38 +1092,38 @@ static void dump_index(const KeyWord *k, const int *index); static void dump_node(FormatNode *node, int max); #endif -static const char *get_th(char *num, int type); -static char *str_numth(char *dest, char *num, int type); +static const char *get_th(const char *num, enum TH_Case type); +static char *str_numth(char *dest, const char *num, enum TH_Case type); static int adjust_partial_year_to_2020(int year); -static int strspace_len(const char *str); +static size_t strspace_len(const char *str); static bool from_char_set_mode(TmFromChar *tmfc, const FromCharDateMode mode, Node *escontext); static bool from_char_set_int(int *dest, const int value, const FormatNode *node, Node *escontext); -static int from_char_parse_int_len(int *dest, const char **src, const int len, +static int from_char_parse_int_len(int *dest, const char **src, const size_t len, FormatNode *node, Node *escontext); static int from_char_parse_int(int *dest, const char **src, FormatNode *node, Node *escontext); -static int seq_search_ascii(const char *name, const char *const *array, int *len); -static int seq_search_localized(const char *name, char **array, int *len, +static int seq_search_ascii(const char *name, const char *const *array, size_t *len); +static int seq_search_localized(const char *name, char **array, size_t *len, Oid collid); static bool from_char_seq_search(int *dest, const char **src, const char *const *array, char **localized_array, Oid collid, FormatNode *node, Node *escontext); -static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std, +static bool do_to_timestamp(const text *date_txt, const text *fmt, Oid collid, bool std, struct pg_tm *tm, fsec_t *fsec, struct fmt_tz 
*tz, int *fprec, uint32 *flags, Node *escontext); -static char *fill_str(char *str, int c, int max); -static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree); +static void fill_str(char *str, int c, int max); +static FormatNode *NUM_cache(int len, NUMDesc *Num, const text *pars_str, bool *shouldFree); static char *int_to_roman(int number); -static int roman_to_int(NUMProc *Np, int input_len); +static int roman_to_int(NUMProc *Np, size_t input_len); static void NUM_prepare_locale(NUMProc *Np); -static char *get_last_relevant_decnum(char *num); -static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len); +static const char *get_last_relevant_decnum(const char *num); +static void NUM_numpart_from_char(NUMProc *Np, int id, size_t input_len); static void NUM_numpart_to_char(NUMProc *Np, int id); static char *NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, - char *number, int input_len, int to_char_out_pre_spaces, + char *number, size_t input_len, int to_char_out_pre_spaces, int sign, bool is_to_char, Oid collid); static DCHCacheEntry *DCH_cache_getnew(const char *str, bool std); static DCHCacheEntry *DCH_cache_search(const char *str, bool std); @@ -1125,11 +1133,10 @@ static NUMCacheEntry *NUM_cache_search(const char *str); static NUMCacheEntry *NUM_cache_fetch(const char *str); -/* ---------- +/* * Fast sequential search, use index for data selection which * go to seq. cycle (it is very fast for unwanted strings) * (can't be used binary search in format parsing) - * ---------- */ static const KeyWord * index_seq_search(const char *str, const KeyWord *kw, const int *index) @@ -1139,7 +1146,7 @@ index_seq_search(const char *str, const KeyWord *kw, const int *index) if (!KeyWord_INDEX_FILTER(*str)) return NULL; - if ((poz = *(index + (*str - ' '))) > -1) + if ((poz = index[*str - ' ']) > -1) { const KeyWord *k = kw + poz; @@ -1156,11 +1163,9 @@ index_seq_search(const char *str, const KeyWord *kw, const int *index) } static const KeySuffix * -suff_search(const char *str, const KeySuffix *suf, int type) +suff_search(const char *str, const KeySuffix *suf, enum KeySuffixType type) { - const KeySuffix *s; - - for (s = suf; s->name != NULL; s++) + for (const KeySuffix *s = suf; s->name != NULL; s++) { if (s->type != type) continue; @@ -1181,9 +1186,8 @@ is_separator_char(const char *str) !(*str >= '0' && *str <= '9')); } -/* ---------- +/* * Prepare NUMDesc (number description struct) via FormatNode struct - * ---------- */ static void NUMDesc_prepare(NUMDesc *num, FormatNode *n) @@ -1233,7 +1237,7 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n) break; case NUM_B: - if (num->pre == 0 && num->post == 0 && (!IS_ZERO(num))) + if (num->pre == 0 && num->post == 0 && !IS_ZERO(num)) num->flag |= NUM_F_BLANK; break; @@ -1364,12 +1368,11 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n) errdetail("\"RN\" may only be used together with \"FM\"."))); } -/* ---------- +/* * Format parser, search small keywords and keyword's suffixes, and make * format-node tree. * * for DATE-TIME & NUMBER version - * ---------- */ static void parse_format(FormatNode *node, const char *str, const KeyWord *kw, @@ -1514,14 +1517,13 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, n->suffix = 0; } -/* ---------- +/* * DEBUG: Dump the FormatNode Tree (debug) - * ---------- */ #ifdef DEBUG_TO_FROM_CHAR -#define DUMP_THth(_suf) (S_TH(_suf) ? "TH" : (S_th(_suf) ? "th" : " ")) -#define DUMP_FM(_suf) (S_FM(_suf) ? "FM" : " ") +#define DUMP_THth(_suf) (IS_SUFFIX_TH(_suf) ? 
"TH" : (IS_SUFFIX_th(_suf) ? "th" : " ")) +#define DUMP_FM(_suf) (IS_SUFFIX_FM(_suf) ? "FM" : " ") static void dump_node(FormatNode *node, int max) @@ -1554,18 +1556,18 @@ dump_node(FormatNode *node, int max) * Private utils *****************************************************************************/ -/* ---------- +/* * Return ST/ND/RD/TH for simple (1..9) numbers - * type --> 0 upper, 1 lower - * ---------- */ static const char * -get_th(char *num, int type) +get_th(const char *num, enum TH_Case type) { - int len = strlen(num), - last; + size_t len = strlen(num); + char last; + + Assert(len > 0); - last = *(num + (len - 1)); + last = num[len - 1]; if (!isdigit((unsigned char) last)) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), @@ -1575,7 +1577,7 @@ get_th(char *num, int type) * All "teens" (1[0-9]) get 'TH/th', while [02-9][123] still get * 'ST/st', 'ND/nd', 'RD/rd', respectively */ - if ((len > 1) && (num[len - 2] == '1')) + if (len > 1 && num[len - 2] == '1') last = 0; switch (last) @@ -1599,13 +1601,11 @@ get_th(char *num, int type) } } -/* ---------- +/* * Convert string-number to ordinal string-number - * type --> 0 upper, 1 lower - * ---------- */ static char * -str_numth(char *dest, char *num, int type) +str_numth(char *dest, const char *num, enum TH_Case type) { if (dest != num) strcpy(dest, num); @@ -1617,16 +1617,6 @@ str_numth(char *dest, char *num, int type) * upper/lower/initcap functions *****************************************************************************/ -/* - * If the system provides the needed functions for wide-character manipulation - * (which are all standardized by C99), then we implement upper/lower/initcap - * using wide-character functions, if necessary. Otherwise we use the - * traditional functions, which of course will not work as desired - * in multibyte character sets. Note that in either case we are effectively - * assuming that the database character encoding matches the encoding implied - * by LC_CTYPE. 
- */ - /* * collation-aware, wide-character-aware lower function * @@ -1898,14 +1888,13 @@ char * asc_tolower(const char *buff, size_t nbytes) { char *result; - char *p; if (!buff) return NULL; result = pnstrdup(buff, nbytes); - for (p = result; *p; p++) + for (char *p = result; *p; p++) *p = pg_ascii_tolower((unsigned char) *p); return result; @@ -1921,14 +1910,13 @@ char * asc_toupper(const char *buff, size_t nbytes) { char *result; - char *p; if (!buff) return NULL; result = pnstrdup(buff, nbytes); - for (p = result; *p; p++) + for (char *p = result; *p; p++) *p = pg_ascii_toupper((unsigned char) *p); return result; @@ -1944,7 +1932,6 @@ char * asc_initcap(const char *buff, size_t nbytes) { char *result; - char *p; int wasalnum = false; if (!buff) @@ -1952,7 +1939,7 @@ asc_initcap(const char *buff, size_t nbytes) result = pnstrdup(buff, nbytes); - for (p = result; *p; p++) + for (char *p = result; *p; p++) { char c; @@ -2004,15 +1991,14 @@ asc_toupper_z(const char *buff) /* asc_initcap_z is not currently needed */ -/* ---------- +/* * Skip TM / th in FROM_CHAR * - * If S_THth is on, skip two chars, assuming there are two available - * ---------- + * If IS_SUFFIX_THth is on, skip two chars, assuming there are two available */ #define SKIP_THth(ptr, _suf) \ do { \ - if (S_THth(_suf)) \ + if (IS_SUFFIX_THth(_suf)) \ { \ if (*(ptr)) (ptr) += pg_mblen(ptr); \ if (*(ptr)) (ptr) += pg_mblen(ptr); \ @@ -2021,21 +2007,19 @@ asc_toupper_z(const char *buff) #ifdef DEBUG_TO_FROM_CHAR -/* ----------- +/* * DEBUG: Call for debug and for index checking; (Show ASCII char * and defined keyword for each used position - * ---------- */ static void dump_index(const KeyWord *k, const int *index) { - int i, - count = 0, + int count = 0, free_i = 0; elog(DEBUG_elog_output, "TO-FROM_CHAR: Dump KeyWord Index:"); - for (i = 0; i < KeyWord_INDEX_SIZE; i++) + for (int i = 0; i < KeyWord_INDEX_SIZE; i++) { if (index[i] != -1) { @@ -2053,9 +2037,8 @@ dump_index(const KeyWord *k, const int *index) } #endif /* DEBUG */ -/* ---------- +/* * Return true if next format picture is not digit value - * ---------- */ static bool is_next_separator(FormatNode *n) @@ -2063,7 +2046,7 @@ is_next_separator(FormatNode *n) if (n->type == NODE_TYPE_END) return false; - if (n->type == NODE_TYPE_ACTION && S_THth(n->suffix)) + if (n->type == NODE_TYPE_ACTION && IS_SUFFIX_THth(n->suffix)) return true; /* @@ -2114,10 +2097,10 @@ adjust_partial_year_to_2020(int year) } -static int +static size_t strspace_len(const char *str) { - int len = 0; + size_t len = 0; while (*str && isspace((unsigned char) *str)) { @@ -2148,8 +2131,7 @@ from_char_set_mode(TmFromChar *tmfc, const FromCharDateMode mode, ereturn(escontext, false, (errcode(ERRCODE_INVALID_DATETIME_FORMAT), errmsg("invalid combination of date conventions"), - errhint("Do not mix Gregorian and ISO week date " - "conventions in a formatting template."))); + errhint("Do not mix Gregorian and ISO week date conventions in a formatting template."))); } return true; } @@ -2172,8 +2154,7 @@ from_char_set_int(int *dest, const int value, const FormatNode *node, (errcode(ERRCODE_INVALID_DATETIME_FORMAT), errmsg("conflicting values for \"%s\" field in formatting string", node->key->name), - errdetail("This value contradicts a previous setting " - "for the same field type."))); + errdetail("This value contradicts a previous setting for the same field type."))); *dest = value; return true; } @@ -2200,13 +2181,13 @@ from_char_set_int(int *dest, const int value, const FormatNode *node, * with DD and 
MI). */ static int -from_char_parse_int_len(int *dest, const char **src, const int len, FormatNode *node, +from_char_parse_int_len(int *dest, const char **src, const size_t len, FormatNode *node, Node *escontext) { long result; char copy[DCH_MAX_ITEM_SIZ + 1]; const char *init = *src; - int used; + size_t used; /* * Skip any whitespace before parsing the integer. @@ -2214,9 +2195,9 @@ from_char_parse_int_len(int *dest, const char **src, const int len, FormatNode * *src += strspace_len(*src); Assert(len <= DCH_MAX_ITEM_SIZ); - used = (int) strlcpy(copy, *src, len + 1); + used = strlcpy(copy, *src, len + 1); - if (S_FM(node->suffix) || is_next_separator(node)) + if (IS_SUFFIX_FM(node->suffix) || is_next_separator(node)) { /* * This node is in Fill Mode, or the next node is known to be a @@ -2241,10 +2222,9 @@ from_char_parse_int_len(int *dest, const char **src, const int len, FormatNode * (errcode(ERRCODE_INVALID_DATETIME_FORMAT), errmsg("source string too short for \"%s\" formatting field", node->key->name), - errdetail("Field requires %d characters, but only %d remain.", + errdetail("Field requires %zu characters, but only %zu remain.", len, used), - errhint("If your source string is not fixed-width, " - "try using the \"FM\" modifier."))); + errhint("If your source string is not fixed-width, try using the \"FM\" modifier."))); errno = 0; result = strtol(copy, &last, 10); @@ -2255,10 +2235,9 @@ from_char_parse_int_len(int *dest, const char **src, const int len, FormatNode * (errcode(ERRCODE_INVALID_DATETIME_FORMAT), errmsg("invalid value \"%s\" for \"%s\"", copy, node->key->name), - errdetail("Field requires %d characters, but only %d could be parsed.", + errdetail("Field requires %zu characters, but only %zu could be parsed.", len, used), - errhint("If your source string is not fixed-width, " - "try using the \"FM\" modifier."))); + errhint("If your source string is not fixed-width, try using the \"FM\" modifier."))); *src += used; } @@ -2315,10 +2294,9 @@ from_char_parse_int(int *dest, const char **src, FormatNode *node, * suitable for comparisons to ASCII strings. */ static int -seq_search_ascii(const char *name, const char *const *array, int *len) +seq_search_ascii(const char *name, const char *const *array, size_t *len) { unsigned char firstc; - const char *const *a; *len = 0; @@ -2329,17 +2307,14 @@ seq_search_ascii(const char *name, const char *const *array, int *len) /* we handle first char specially to gain some speed */ firstc = pg_ascii_tolower((unsigned char) *name); - for (a = array; *a != NULL; a++) + for (const char *const *a = array; *a != NULL; a++) { - const char *p; - const char *n; - /* compare first chars */ if (pg_ascii_tolower((unsigned char) **a) != firstc) continue; /* compare rest of string */ - for (p = *a + 1, n = name + 1;; p++, n++) + for (const char *p = *a + 1, *n = name + 1;; p++, n++) { /* return success if we matched whole array entry */ if (*p == '\0') @@ -2372,9 +2347,8 @@ seq_search_ascii(const char *name, const char *const *array, int *len) * the arrays exported by pg_locale.c aren't const. */ static int -seq_search_localized(const char *name, char **array, int *len, Oid collid) +seq_search_localized(const char *name, char **array, size_t *len, Oid collid) { - char **a; char *upper_name; char *lower_name; @@ -2388,9 +2362,9 @@ seq_search_localized(const char *name, char **array, int *len, Oid collid) * The case-folding processing done below is fairly expensive, so before * doing that, make a quick pass to see if there is an exact match. 
      */
-    for (a = array; *a != NULL; a++)
+    for (char **a = array; *a != NULL; a++)
     {
-        int         element_len = strlen(*a);
+        size_t      element_len = strlen(*a);
 
         if (strncmp(name, *a, element_len) == 0)
         {
@@ -2407,11 +2381,11 @@ seq_search_localized(const char *name, char **array, size_t *len, Oid collid)
     lower_name = str_tolower(upper_name, strlen(upper_name), collid);
     pfree(upper_name);
 
-    for (a = array; *a != NULL; a++)
+    for (char **a = array; *a != NULL; a++)
     {
         char       *upper_element;
         char       *lower_element;
-        int         element_len;
+        size_t      element_len;
 
         /* Likewise upper/lower-case array element */
         upper_element = str_toupper(*a, strlen(*a), collid);
@@ -2460,7 +2434,7 @@ from_char_seq_search(int *dest, const char **src, const char *const *array,
                      char **localized_array, Oid collid,
                      FormatNode *node, Node *escontext)
 {
-    int         len;
+    size_t      len;
 
     if (localized_array == NULL)
         *dest = seq_search_ascii(*src, array, &len);
@@ -2474,9 +2448,8 @@ from_char_seq_search(int *dest, const char **src, const char *const *array,
          * any) to avoid including irrelevant data.
          */
         char       *copy = pstrdup(*src);
-        char       *c;
 
-        for (c = copy; *c; c++)
+        for (char *c = copy; *c; c++)
         {
             if (scanner_isspace(*c))
             {
@@ -2489,22 +2462,19 @@ from_char_seq_search(int *dest, const char **src, const char *const *array,
                 (errcode(ERRCODE_INVALID_DATETIME_FORMAT),
                  errmsg("invalid value \"%s\" for \"%s\"",
                         copy, node->key->name),
-                 errdetail("The given value did not match any of "
-                           "the allowed values for this field.")));
+                 errdetail("The given value did not match any of the allowed values for this field.")));
     }
     *src += len;
     return true;
 }
 
-/* ----------
+/*
  * Process a TmToChar struct as denoted by a list of FormatNodes.
  * The formatted data is written to the string pointed to by 'out'.
- * ----------
  */
 static void
 DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid collid)
 {
-    FormatNode *n;
     char       *s;
     struct fmt_tm *tm = &in->tm;
     int         i;
@@ -2513,7 +2483,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
         cache_locale_time();
 
     s = out;
-    for (n = node; n->type != NODE_TYPE_END; n++)
+    for (FormatNode *n = node; n->type != NODE_TYPE_END; n++)
     {
         if (n->type != NODE_TYPE_ACTION)
         {
@@ -2555,40 +2525,40 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                  * display time as shown on a 12-hour clock, even for
                  * intervals
                  */
-                sprintf(s, "%0*lld", S_FM(n->suffix) ? 0 : (tm->tm_hour >= 0) ? 2 : 3,
+                sprintf(s, "%0*lld", IS_SUFFIX_FM(n->suffix) ? 0 : (tm->tm_hour >= 0) ? 2 : 3,
                         tm->tm_hour % (HOURS_PER_DAY / 2) == 0 ?
                         (long long) (HOURS_PER_DAY / 2) :
                         (long long) (tm->tm_hour % (HOURS_PER_DAY / 2)));
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_HH24:
-                sprintf(s, "%0*lld", S_FM(n->suffix) ? 0 : (tm->tm_hour >= 0) ? 2 : 3,
+                sprintf(s, "%0*lld", IS_SUFFIX_FM(n->suffix) ? 0 : (tm->tm_hour >= 0) ? 2 : 3,
                         (long long) tm->tm_hour);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_MI:
-                sprintf(s, "%0*d", S_FM(n->suffix) ? 0 : (tm->tm_min >= 0) ? 2 : 3,
+                sprintf(s, "%0*d", IS_SUFFIX_FM(n->suffix) ? 0 : (tm->tm_min >= 0) ? 2 : 3,
                         tm->tm_min);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_SS:
-                sprintf(s, "%0*d", S_FM(n->suffix) ? 0 : (tm->tm_sec >= 0) ? 2 : 3,
+                sprintf(s, "%0*d", IS_SUFFIX_FM(n->suffix) ? 0 : (tm->tm_sec >= 0) ? 2 : 3,
                         tm->tm_sec);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
 
 #define DCH_to_char_fsec(frac_fmt, frac_val) \
             sprintf(s, frac_fmt, (int) (frac_val)); \
-            if (S_THth(n->suffix)) \
-                str_numth(s, s, S_TH_TYPE(n->suffix)); \
+            if (IS_SUFFIX_THth(n->suffix)) \
+                str_numth(s, s, SUFFIX_TH_TYPE(n->suffix)); \
             s += strlen(s)
 
             case DCH_FF1:       /* tenth of second */
@@ -2617,8 +2587,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                         (long long) (tm->tm_hour * SECS_PER_HOUR +
                                      tm->tm_min * SECS_PER_MINUTE +
                                      tm->tm_sec));
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_tz:
@@ -2658,7 +2628,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 INVALID_FOR_INTERVAL;
                 sprintf(s, "%c%0*d",
                         (tm->tm_gmtoff >= 0) ? '+' : '-',
-                        S_FM(n->suffix) ? 0 : 2,
+                        IS_SUFFIX_FM(n->suffix) ? 0 : 2,
                         abs((int) tm->tm_gmtoff) / SECS_PER_HOUR);
                 s += strlen(s);
                 if (abs((int) tm->tm_gmtoff) % SECS_PER_HOUR != 0)
@@ -2696,7 +2666,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 INVALID_FOR_INTERVAL;
                 if (!tm->tm_mon)
                     break;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_toupper_z(localized_full_months[tm->tm_mon - 1], collid);
 
@@ -2708,7 +2678,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                              errmsg("localized string format value too long")));
                 }
                 else
-                    sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9,
+                    sprintf(s, "%*s", IS_SUFFIX_FM(n->suffix) ? 0 : -9,
                             asc_toupper_z(months_full[tm->tm_mon - 1]));
                 s += strlen(s);
                 break;
@@ -2716,7 +2686,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 INVALID_FOR_INTERVAL;
                 if (!tm->tm_mon)
                     break;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_initcap_z(localized_full_months[tm->tm_mon - 1], collid);
 
@@ -2728,7 +2698,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                              errmsg("localized string format value too long")));
                 }
                 else
-                    sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9,
+                    sprintf(s, "%*s", IS_SUFFIX_FM(n->suffix) ? 0 : -9,
                             months_full[tm->tm_mon - 1]);
                 s += strlen(s);
                 break;
@@ -2736,7 +2706,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 INVALID_FOR_INTERVAL;
                 if (!tm->tm_mon)
                     break;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_tolower_z(localized_full_months[tm->tm_mon - 1], collid);
 
@@ -2748,7 +2718,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                              errmsg("localized string format value too long")));
                 }
                 else
-                    sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9,
+                    sprintf(s, "%*s", IS_SUFFIX_FM(n->suffix) ? 0 : -9,
                             asc_tolower_z(months_full[tm->tm_mon - 1]));
                 s += strlen(s);
                 break;
@@ -2756,7 +2726,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 INVALID_FOR_INTERVAL;
                 if (!tm->tm_mon)
                     break;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_toupper_z(localized_abbrev_months[tm->tm_mon - 1], collid);
 
@@ -2775,7 +2745,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 INVALID_FOR_INTERVAL;
                 if (!tm->tm_mon)
                     break;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_initcap_z(localized_abbrev_months[tm->tm_mon - 1], collid);
 
@@ -2794,7 +2764,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 INVALID_FOR_INTERVAL;
                 if (!tm->tm_mon)
                     break;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_tolower_z(localized_abbrev_months[tm->tm_mon - 1], collid);
 
@@ -2810,15 +2780,15 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 s += strlen(s);
                 break;
             case DCH_MM:
-                sprintf(s, "%0*d", S_FM(n->suffix) ? 0 : (tm->tm_mon >= 0) ? 2 : 3,
+                sprintf(s, "%0*d", IS_SUFFIX_FM(n->suffix) ? 0 : (tm->tm_mon >= 0) ? 2 : 3,
                         tm->tm_mon);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_DAY:
                 INVALID_FOR_INTERVAL;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_toupper_z(localized_full_days[tm->tm_wday], collid);
 
@@ -2830,13 +2800,13 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                              errmsg("localized string format value too long")));
                 }
                 else
-                    sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9,
+                    sprintf(s, "%*s", IS_SUFFIX_FM(n->suffix) ? 0 : -9,
                             asc_toupper_z(days[tm->tm_wday]));
                 s += strlen(s);
                 break;
             case DCH_Day:
                 INVALID_FOR_INTERVAL;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_initcap_z(localized_full_days[tm->tm_wday], collid);
 
@@ -2848,13 +2818,13 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                              errmsg("localized string format value too long")));
                 }
                 else
-                    sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9,
+                    sprintf(s, "%*s", IS_SUFFIX_FM(n->suffix) ? 0 : -9,
                             days[tm->tm_wday]);
                 s += strlen(s);
                 break;
             case DCH_day:
                 INVALID_FOR_INTERVAL;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_tolower_z(localized_full_days[tm->tm_wday], collid);
 
@@ -2866,13 +2836,13 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                              errmsg("localized string format value too long")));
                 }
                 else
-                    sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9,
+                    sprintf(s, "%*s", IS_SUFFIX_FM(n->suffix) ? 0 : -9,
                             asc_tolower_z(days[tm->tm_wday]));
                 s += strlen(s);
                 break;
             case DCH_DY:
                 INVALID_FOR_INTERVAL;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_toupper_z(localized_abbrev_days[tm->tm_wday], collid);
 
@@ -2889,7 +2859,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 break;
             case DCH_Dy:
                 INVALID_FOR_INTERVAL;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_initcap_z(localized_abbrev_days[tm->tm_wday], collid);
 
@@ -2906,7 +2876,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 break;
             case DCH_dy:
                 INVALID_FOR_INTERVAL;
-                if (S_TM(n->suffix))
+                if (IS_SUFFIX_TM(n->suffix))
                 {
                     char       *str = str_tolower_z(localized_abbrev_days[tm->tm_wday], collid);
 
@@ -2923,54 +2893,54 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                 break;
             case DCH_DDD:
             case DCH_IDDD:
-                sprintf(s, "%0*d", S_FM(n->suffix) ? 0 : 3,
+                sprintf(s, "%0*d", IS_SUFFIX_FM(n->suffix) ? 0 : 3,
                         (n->key->id == DCH_DDD) ?
                         tm->tm_yday :
                         date2isoyearday(tm->tm_year, tm->tm_mon, tm->tm_mday));
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_DD:
-                sprintf(s, "%0*d", S_FM(n->suffix) ? 0 : 2, tm->tm_mday);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                sprintf(s, "%0*d", IS_SUFFIX_FM(n->suffix) ? 0 : 2, tm->tm_mday);
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_D:
                 INVALID_FOR_INTERVAL;
                 sprintf(s, "%d", tm->tm_wday + 1);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_ID:
                 INVALID_FOR_INTERVAL;
                 sprintf(s, "%d", (tm->tm_wday == 0) ? 7 : tm->tm_wday);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_WW:
-                sprintf(s, "%0*d", S_FM(n->suffix) ? 0 : 2,
+                sprintf(s, "%0*d", IS_SUFFIX_FM(n->suffix) ? 0 : 2,
                         (tm->tm_yday - 1) / 7 + 1);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_IW:
-                sprintf(s, "%0*d", S_FM(n->suffix) ? 0 : 2,
+                sprintf(s, "%0*d", IS_SUFFIX_FM(n->suffix) ? 0 : 2,
                         date2isoweek(tm->tm_year, tm->tm_mon, tm->tm_mday));
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_Q:
                 if (!tm->tm_mon)
                     break;
                 sprintf(s, "%d", (tm->tm_mon - 1) / 3 + 1);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_CC:
@@ -2986,25 +2956,25 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                     i = tm->tm_year / 100 - 1;
                 }
                 if (i <= 99 && i >= -99)
-                    sprintf(s, "%0*d", S_FM(n->suffix) ? 0 : (i >= 0) ? 2 : 3, i);
+                    sprintf(s, "%0*d", IS_SUFFIX_FM(n->suffix) ? 0 : (i >= 0) ? 2 : 3, i);
                 else
                     sprintf(s, "%d", i);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_Y_YYY:
                 i = ADJUST_YEAR(tm->tm_year, is_interval) / 1000;
                 sprintf(s, "%d,%03d", i,
                         ADJUST_YEAR(tm->tm_year, is_interval) - (i * 1000));
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_YYYY:
             case DCH_IYYY:
                 sprintf(s, "%0*d",
-                        S_FM(n->suffix) ? 0 :
+                        IS_SUFFIX_FM(n->suffix) ? 0 :
                         (ADJUST_YEAR(tm->tm_year, is_interval) >= 0) ? 4 : 5,
                         (n->key->id == DCH_YYYY ?
                          ADJUST_YEAR(tm->tm_year, is_interval) :
@@ -3012,14 +2982,14 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                                                  tm->tm_mon,
                                                  tm->tm_mday),
                                      is_interval)));
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_YYY:
             case DCH_IYY:
                 sprintf(s, "%0*d",
-                        S_FM(n->suffix) ? 0 :
+                        IS_SUFFIX_FM(n->suffix) ? 0 :
                         (ADJUST_YEAR(tm->tm_year, is_interval) >= 0) ? 3 : 4,
                         (n->key->id == DCH_YYY ?
                          ADJUST_YEAR(tm->tm_year, is_interval) :
@@ -3027,14 +2997,14 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                                                  tm->tm_mon,
                                                  tm->tm_mday),
                                      is_interval)) % 1000);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_YY:
             case DCH_IY:
                 sprintf(s, "%0*d",
-                        S_FM(n->suffix) ? 0 :
+                        IS_SUFFIX_FM(n->suffix) ? 0 :
                         (ADJUST_YEAR(tm->tm_year, is_interval) >= 0) ? 2 : 3,
                         (n->key->id == DCH_YY ?
                          ADJUST_YEAR(tm->tm_year, is_interval) :
@@ -3042,8 +3012,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                                                  tm->tm_mon,
                                                  tm->tm_mday),
                                      is_interval)) % 100);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_Y:
@@ -3055,8 +3025,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                                                  tm->tm_mon,
                                                  tm->tm_mday),
                                      is_interval)) % 10);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_RM:
@@ -3111,21 +3081,21 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
                         mon = MONTHS_PER_YEAR - tm->tm_mon;
                     }
 
-                    sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -4,
+                    sprintf(s, "%*s", IS_SUFFIX_FM(n->suffix) ? 0 : -4,
                             months[mon]);
                     s += strlen(s);
                 }
                 break;
             case DCH_W:
                 sprintf(s, "%d", (tm->tm_mday - 1) / 7 + 1);
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
             case DCH_J:
                 sprintf(s, "%d", date2j(tm->tm_year, tm->tm_mon, tm->tm_mday));
-                if (S_THth(n->suffix))
-                    str_numth(s, s, S_TH_TYPE(n->suffix));
+                if (IS_SUFFIX_THth(n->suffix))
+                    str_numth(s, s, SUFFIX_TH_TYPE(n->suffix));
                 s += strlen(s);
                 break;
         }
@@ -3282,7 +3252,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
                     return;
                 if (!from_char_set_int(&out->pm, value % 2, n, escontext))
                     return;
-                out->clock = CLOCK_12_HOUR;
+                out->clock_12_hour = true;
                 break;
             case DCH_AM:
             case DCH_PM:
@@ -3294,13 +3264,13 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
                     return;
                 if (!from_char_set_int(&out->pm, value % 2, n, escontext))
                     return;
-                out->clock = CLOCK_12_HOUR;
+                out->clock_12_hour = true;
                 break;
             case DCH_HH:
             case DCH_HH12:
                 if (from_char_parse_int_len(&out->hh, &s, 2, n, escontext) < 0)
                     return;
-                out->clock = CLOCK_12_HOUR;
+                out->clock_12_hour = true;
                 SKIP_THth(s, n->suffix);
                 break;
             case DCH_HH24:
@@ -3387,8 +3357,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
                          */
                         ereturn(escontext,,
                                 (errcode(ERRCODE_INVALID_DATETIME_FORMAT),
-                                 errmsg("invalid value \"%s\" for \"%s\"",
-                                        s, n->key->name),
+                                 errmsg("invalid value \"%s\" for \"%s\"", s, n->key->name),
                                  errdetail("Time zone abbreviation is not recognized.")));
                     }
                     /* otherwise parse it like OF */
@@ -3477,7 +3446,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
             case DCH_Month:
             case DCH_month:
                 if (!from_char_seq_search(&value, &s, months_full,
-                                          S_TM(n->suffix) ? localized_full_months : NULL,
+                                          IS_SUFFIX_TM(n->suffix) ? localized_full_months : NULL,
                                           collid, n, escontext))
                     return;
 
@@ -3488,7 +3457,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
             case DCH_Mon:
             case DCH_mon:
                 if (!from_char_seq_search(&value, &s, months,
-                                          S_TM(n->suffix) ? localized_abbrev_months : NULL,
+                                          IS_SUFFIX_TM(n->suffix) ? localized_abbrev_months : NULL,
                                           collid, n, escontext))
                     return;
 
@@ -3504,7 +3473,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
             case DCH_Day:
             case DCH_day:
                 if (!from_char_seq_search(&value, &s, days,
-                                          S_TM(n->suffix) ? localized_full_days : NULL,
+                                          IS_SUFFIX_TM(n->suffix) ? localized_full_days : NULL,
                                           collid, n, escontext))
                     return;
 
@@ -3516,7 +3485,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
             case DCH_Dy:
             case DCH_dy:
                 if (!from_char_seq_search(&value, &s, days_short,
-                                          S_TM(n->suffix) ? localized_abbrev_days : NULL,
+                                          IS_SUFFIX_TM(n->suffix) ? localized_abbrev_days : NULL,
                                           collid, n, escontext))
                     return;
 
@@ -3590,14 +3559,14 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
                     if (matched < 2)
                         ereturn(escontext,,
                                 (errcode(ERRCODE_INVALID_DATETIME_FORMAT),
-                                 errmsg("invalid input string for \"Y,YYY\"")));
+                                 errmsg("invalid value \"%s\" for \"%s\"", s, "Y,YYY")));
 
                     /* years += (millennia * 1000); */
                     if (pg_mul_s32_overflow(millennia, 1000, &millennia) ||
                         pg_add_s32_overflow(years, millennia, &years))
                         ereturn(escontext,,
                                 (errcode(ERRCODE_DATETIME_FIELD_OVERFLOW),
-                                 errmsg("value for \"Y,YYY\" in source string is out of range")));
+                                 errmsg("value for \"%s\" in source string is out of range", "Y,YYY")));
 
                     if (!from_char_set_int(&out->year, years, n, escontext))
                         return;
@@ -3722,10 +3691,9 @@ DCH_prevent_counter_overflow(void)
 static int
 DCH_datetime_type(FormatNode *node)
 {
-    FormatNode *n;
     int         flags = 0;
 
-    for (n = node; n->type != NODE_TYPE_END; n++)
+    for (FormatNode *n = node; n->type != NODE_TYPE_END; n++)
     {
         if (n->type != NODE_TYPE_ACTION)
             continue;
@@ -3925,13 +3893,13 @@ DCH_cache_fetch(const char *str, bool std)
  * for formatting.
  */
 static text *
-datetime_to_char_body(TmToChar *tmtc, text *fmt, bool is_interval, Oid collid)
+datetime_to_char_body(TmToChar *tmtc, const text *fmt, bool is_interval, Oid collid)
 {
     FormatNode *format;
     char       *fmt_str,
                *result;
     bool        incache;
-    int         fmt_len;
+    size_t      fmt_len;
     text       *res;
 
     /*
@@ -3989,9 +3957,8 @@ datetime_to_char_body(TmToChar *tmtc, const text *fmt, bool is_interval, Oid col
  *             Public routines
  ***************************************************************************/
 
-/* -------------------
+/*
  * TIMESTAMP to_char()
- * -------------------
  */
 Datum
 timestamp_to_char(PG_FUNCTION_ARGS)
@@ -4065,9 +4032,8 @@ timestamptz_to_char(PG_FUNCTION_ARGS)
 }
 
 
-/* -------------------
+/*
  * INTERVAL to_char()
- * -------------------
 */
 Datum
 interval_to_char(PG_FUNCTION_ARGS)
@@ -4104,12 +4070,11 @@ interval_to_char(PG_FUNCTION_ARGS)
     PG_RETURN_TEXT_P(res);
 }
 
-/* ---------------------
+/*
  * TO_TIMESTAMP()
  *
  * Make Timestamp from date_str which is formatted at argument 'fmt'
  * ( to_timestamp is reverse to_char() )
- * ---------------------
 */
 Datum
 to_timestamp(PG_FUNCTION_ARGS)
@@ -4145,10 +4110,9 @@ to_timestamp(PG_FUNCTION_ARGS)
     PG_RETURN_TIMESTAMP(result);
 }
 
-/* ----------
+/*
  * TO_DATE
  *    Make Date from date_str which is formatted at argument 'fmt'
- * ----------
 */
 Datum
 to_date(PG_FUNCTION_ARGS)
@@ -4168,8 +4132,7 @@ to_date(PG_FUNCTION_ARGS)
     if (!IS_VALID_JULIAN(tm.tm_year, tm.tm_mon, tm.tm_mday))
         ereport(ERROR,
                 (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
-                 errmsg("date out of range: \"%s\"",
-                        text_to_cstring(date_txt))));
+                 errmsg("date out of range: \"%s\"", text_to_cstring(date_txt))));
 
     result = date2j(tm.tm_year, tm.tm_mon, tm.tm_mday) - POSTGRES_EPOCH_JDATE;
 
@@ -4177,8 +4140,7 @@ to_date(PG_FUNCTION_ARGS)
     if (!IS_VALID_DATE(result))
         ereport(ERROR,
                 (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
-                 errmsg("date out of range: \"%s\"",
-                        text_to_cstring(date_txt))));
+                 errmsg("date out of range: \"%s\"", text_to_cstring(date_txt))));
 
     PG_RETURN_DATEADT(result);
 }
@@ -4282,8 +4244,7 @@ parse_datetime(text *date_txt, text *fmt, Oid collid, bool strict,
         if (!IS_VALID_JULIAN(tm.tm_year, tm.tm_mon, tm.tm_mday))
             ereturn(escontext, (Datum) 0,
                     (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
-                     errmsg("date out of range: \"%s\"",
-                            text_to_cstring(date_txt))));
+                     errmsg("date out of range: \"%s\"", text_to_cstring(date_txt))));
 
         result = date2j(tm.tm_year, tm.tm_mon, tm.tm_mday) -
             POSTGRES_EPOCH_JDATE;
 
@@ -4292,8 +4253,7 @@ parse_datetime(text *date_txt, text *fmt, Oid collid, bool strict,
         if (!IS_VALID_DATE(result))
             ereturn(escontext, (Datum) 0,
                     (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
-                     errmsg("date out of range: \"%s\"",
-                            text_to_cstring(date_txt))));
+                     errmsg("date out of range: \"%s\"", text_to_cstring(date_txt))));
 
         *typid = DATEOID;
         return DateADTGetDatum(result);
@@ -4304,7 +4264,7 @@ parse_datetime(text *date_txt, text *fmt, Oid collid, bool strict,
     {
         if (flags & DCH_ZONED)
         {
-            TimeTzADT  *result = palloc(sizeof(TimeTzADT));
+            TimeTzADT  *result = palloc_object(TimeTzADT);
 
             if (ftz.has_tz)
             {
@@ -4365,7 +4325,7 @@ bool
 datetime_format_has_tz(const char *fmt_str)
 {
     bool        incache;
-    int         fmt_len = strlen(fmt_str);
+    size_t      fmt_len = strlen(fmt_str);
     int         result;
     FormatNode *format;
 
@@ -4425,12 +4385,12 @@ datetime_format_has_tz(const char *fmt_str)
  * struct 'tm', 'fsec', struct 'tz', and 'fprec'.
 */
 static bool
-do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
+do_to_timestamp(const text *date_txt, const text *fmt, Oid collid, bool std,
                 struct pg_tm *tm, fsec_t *fsec, struct fmt_tz *tz,
                 int *fprec, uint32 *flags, Node *escontext)
 {
     FormatNode *format = NULL;
-    TmFromChar  tmfc;
+    TmFromChar  tmfc = {0};
     int         fmt_len;
     char       *date_str;
     int         fmask;
@@ -4441,7 +4401,6 @@ do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 
     date_str = text_to_cstring(date_txt);
 
-    ZERO_tmfc(&tmfc);
     ZERO_tm(tm);
     *fsec = 0;
     tz->has_tz = false;
@@ -4524,14 +4483,13 @@ do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
     if (tmfc.hh)
         tm->tm_hour = tmfc.hh;
 
-    if (tmfc.clock == CLOCK_12_HOUR)
+    if (tmfc.clock_12_hour)
     {
         if (tm->tm_hour < 1 || tm->tm_hour > HOURS_PER_DAY / 2)
         {
             errsave(escontext,
                     (errcode(ERRCODE_INVALID_DATETIME_FORMAT),
-                     errmsg("hour \"%d\" is invalid for the 12-hour clock",
-                            tm->tm_hour),
+                     errmsg("hour \"%d\" is invalid for the 12-hour clock", tm->tm_hour),
                      errhint("Use the 24-hour clock, or give an hour between 1 and 12.")));
             goto fail;
         }
@@ -4857,27 +4815,17 @@ do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
  *********************************************************************/
 
 
-static char *
+/*
+ * Fill str with character c max times, and add terminating \0. (So max+1
+ * bytes are written altogether!)
+ */
+static void
 fill_str(char *str, int c, int max)
 {
     memset(str, c, max);
-    *(str + max) = '\0';
-    return str;
+    str[max] = '\0';
 }
 
-#define zeroize_NUM(_n) \
-do { \
-    (_n)->flag = 0; \
-    (_n)->lsign = 0; \
-    (_n)->pre = 0; \
-    (_n)->post = 0; \
-    (_n)->pre_lsign_num = 0; \
-    (_n)->need_locale = 0; \
-    (_n)->multi = 0; \
-    (_n)->zero_start = 0; \
-    (_n)->zero_end = 0; \
-} while(0)
-
 /* This works the same as DCH_prevent_counter_overflow */
 static inline void
 NUM_prevent_counter_overflow(void)
@@ -4985,7 +4933,7 @@ NUM_cache_fetch(const char *str)
          */
         ent = NUM_cache_getnew(str);
 
-        zeroize_NUM(&ent->Num);
+        memset(&ent->Num, 0, sizeof ent->Num);
 
         parse_format(ent->format, str, NUM_keywords,
                      NULL, NUM_index, NUM_FLAG, &ent->Num);
@@ -4995,12 +4943,11 @@ NUM_cache_fetch(const char *str)
     return ent;
 }
 
-/* ----------
+/*
  * Cache routine for NUM to_char version
- * ----------
 */
 static FormatNode *
-NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree)
+NUM_cache(int len, NUMDesc *Num, const text *pars_str, bool *shouldFree)
 {
     FormatNode *format = NULL;
     char       *str;
@@ -5017,7 +4964,7 @@ NUM_cache(int len, NUMDesc *Num, const text *pars_str, bool *shouldFree)
 
         *shouldFree = true;
 
-        zeroize_NUM(Num);
+        memset(Num, 0, sizeof *Num);
 
         parse_format(format, str, NUM_keywords,
                      NULL, NUM_index, NUM_FLAG, Num);
@@ -5067,8 +5014,7 @@ int_to_roman(int number)
 {
     int         len,
                 num;
-    char       *p,
-               *result,
+    char       *result,
                 numstr[12];
 
     result = (char *) palloc(MAX_ROMAN_LEN + 1);
@@ -5089,7 +5035,7 @@ int_to_roman(int number)
     len = snprintf(numstr, sizeof(numstr), "%d", number);
     Assert(len > 0 && len <= 4);
 
-    for (p = numstr; *p != '\0'; p++, --len)
+    for (char *p = numstr; *p != '\0'; p++, --len)
     {
         num = *p - ('0' + 1);
         if (num < 0)
@@ -5123,10 +5069,10 @@ int_to_roman(int number)
  * If input is invalid, return -1.
 */
 static int
-roman_to_int(NUMProc *Np, int input_len)
+roman_to_int(NUMProc *Np, size_t input_len)
 {
     int         result = 0;
-    int         len;
+    size_t      len;
     char        romanChars[MAX_ROMAN_LEN];
     int         romanValues[MAX_ROMAN_LEN];
     int         repeatCount = 1;
@@ -5165,7 +5111,7 @@ roman_to_int(NUMProc *Np, size_t input_len)
         return -1;              /* No valid roman numerals. */
 
     /* Check for valid combinations and compute the represented value. */
-    for (int i = 0; i < len; i++)
+    for (size_t i = 0; i < len; i++)
     {
         char        currChar = romanChars[i];
         int         currValue = romanValues[i];
@@ -5268,9 +5214,8 @@ roman_to_int(NUMProc *Np, size_t input_len)
 }
 
 
-/* ----------
+/*
  * Locale
- * ----------
 */
 static void
 NUM_prepare_locale(NUMProc *Np)
@@ -5346,18 +5291,17 @@ NUM_prepare_locale(NUMProc *Np)
     }
 }
 
-/* ----------
+/*
  * Return pointer of last relevant number after decimal point
  *    12.0500 --> last relevant is '5'
  *    12.0000 --> last relevant is '.'
  * If there is no decimal point, return NULL (which will result in same
  * behavior as if FM hadn't been specified.
- * ----------
 */
-static char *
-get_last_relevant_decnum(char *num)
+static const char *
+get_last_relevant_decnum(const char *num)
 {
-    char       *result,
+    const char *result,
                *p = strchr(num, '.');
 
 #ifdef DEBUG_TO_FROM_CHAR
@@ -5378,12 +5322,11 @@ get_last_relevant_decnum(const char *num)
     return result;
 }
 
-/* ----------
+/*
  * Number extraction for TO_NUMBER()
- * ----------
 */
 static void
-NUM_numpart_from_char(NUMProc *Np, int id, int input_len)
+NUM_numpart_from_char(NUMProc *Np, int id, size_t input_len)
 {
     bool        isread = false;
 
@@ -5417,7 +5360,7 @@ NUM_numpart_from_char(NUMProc *Np, int id, size_t input_len)
      */
     if (IS_LSIGN(Np->Num) && Np->Num->lsign == NUM_LSIGN_PRE)
     {
-        int         x = 0;
+        size_t      x = 0;
 
 #ifdef DEBUG_TO_FROM_CHAR
         elog(DEBUG_elog_output, "Try read locale pre-sign (%c)", *Np->inout_p);
@@ -5496,7 +5439,7 @@ NUM_numpart_from_char(NUMProc *Np, int id, size_t input_len)
          * Np->decimal is always just "." if we don't have a D format token.
          * So we just unconditionally match to Np->decimal.
          */
-        int         x = strlen(Np->decimal);
+        size_t      x = strlen(Np->decimal);
 
 #ifdef DEBUG_TO_FROM_CHAR
         elog(DEBUG_elog_output, "Try read decimal point (%c)",
@@ -5535,7 +5478,7 @@ NUM_numpart_from_char(NUMProc *Np, int id, size_t input_len)
             (Np->inout_p + 1) < Np->inout + input_len &&
             !isdigit((unsigned char) *(Np->inout_p + 1)))
         {
-            int         x;
+            size_t      x;
             char       *tmp = Np->inout_p++;
 
 #ifdef DEBUG_TO_FROM_CHAR
@@ -5593,9 +5536,8 @@ NUM_numpart_from_char(NUMProc *Np, int id, size_t input_len)
      *(_n)->number == '0' && \
          (_n)->Num->post != 0)
 
-/* ----------
+/*
  * Add digit or sign to number-string
- * ----------
 */
 static void
 NUM_numpart_to_char(NUMProc *Np, int id)
@@ -5788,7 +5730,7 @@ NUM_numpart_to_char(NUMProc *Np, int id)
  * Skip over "n" input characters, but only if they aren't numeric data
 */
 static void
-NUM_eat_non_data_chars(NUMProc *Np, int n, int input_len)
+NUM_eat_non_data_chars(NUMProc *Np, int n, size_t input_len)
 {
     while (n-- > 0)
     {
@@ -5802,14 +5744,14 @@ NUM_eat_non_data_chars(NUMProc *Np, int n, size_t input_len)
 
 static char *
 NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
-              char *number, int input_len, int to_char_out_pre_spaces,
+              char *number, size_t input_len, int to_char_out_pre_spaces,
               int sign, bool is_to_char, Oid collid)
 {
     FormatNode *n;
     NUMProc     _Np,
                *Np = &_Np;
     const char *pattern;
-    int         pattern_len;
+    size_t      pattern_len;
 
     MemSet(Np, 0, sizeof(NUMProc));
 
@@ -5890,7 +5832,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
          */
        if (Np->last_relevant && Np->Num->zero_end > Np->out_pre_spaces)
        {
-            int         last_zero_pos;
+            size_t      last_zero_pos;
             char       *last_zero;
 
             /* note that Np->number cannot be zero-length here */
@@ -6264,10 +6206,9 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
     }
 }
 
-/* ----------
+/*
  * MACRO: Start part of NUM - for all NUM's to_char variants
  *    (sorry, but I hate copy same code - macro is better..)
- * ----------
 */
 #define NUM_TOCHAR_prepare \
 do { \
@@ -6278,13 +6219,12 @@ do { \
     format = NUM_cache(len, &Num, fmt, &shouldFree); \
 } while (0)
 
-/* ----------
+/*
  * MACRO: Finish part of NUM
- * ----------
 */
 #define NUM_TOCHAR_finish \
 do { \
-    int len; \
+    size_t len; \
 \
     NUM_processor(format, &Num, VARDATA(result), numstr, 0, out_pre_spaces, sign, true, PG_GET_COLLATION()); \
 \
@@ -6300,9 +6240,8 @@ do { \
     SET_VARSIZE(result, len + VARHDRSZ); \
 } while (0)
 
-/* -------------------
+/*
  * NUMERIC to_number() (convert string to numeric)
- * -------------------
 */
 Datum
 numeric_to_number(PG_FUNCTION_ARGS)
@@ -6359,9 +6298,8 @@ numeric_to_number(PG_FUNCTION_ARGS)
     return result;
 }
 
-/* ------------------
+/*
  * NUMERIC to_char()
- * ------------------
 */
 Datum
 numeric_to_char(PG_FUNCTION_ARGS)
@@ -6386,12 +6324,12 @@ numeric_to_char(PG_FUNCTION_ARGS)
     if (IS_ROMAN(&Num))
     {
         int32       intvalue;
-        bool        err;
+        ErrorSaveContext escontext = {T_ErrorSaveContext};
 
         /* Round and convert to int */
-        intvalue = numeric_int4_opt_error(value, &err);
+        intvalue = numeric_int4_safe(value, (Node *) &escontext);
         /* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
-        if (err)
+        if (escontext.error_occurred)
             intvalue = PG_INT32_MAX;
         numstr = int_to_roman(intvalue);
     }
@@ -6431,7 +6369,7 @@ numeric_to_char(PG_FUNCTION_ARGS)
     }
     else
     {
-        int         numstr_pre_len;
+        size_t      numstr_pre_len;
         Numeric     val = value;
         Numeric     x;
 
@@ -6487,9 +6425,8 @@ numeric_to_char(PG_FUNCTION_ARGS)
     PG_RETURN_TEXT_P(result);
 }
 
-/* ---------------
+/*
  * INT4 to_char()
- * ---------------
 */
 Datum
 int4_to_char(PG_FUNCTION_ARGS)
@@ -6529,7 +6466,7 @@ int4_to_char(PG_FUNCTION_ARGS)
     }
     else
     {
-        int         numstr_pre_len;
+        size_t      numstr_pre_len;
 
         if (IS_MULTI(&Num))
         {
@@ -6581,9 +6518,8 @@ int4_to_char(PG_FUNCTION_ARGS)
     PG_RETURN_TEXT_P(result);
 }
 
-/* ---------------
+/*
  * INT8 to_char()
- * ---------------
 */
 Datum
 int8_to_char(PG_FUNCTION_ARGS)
@@ -6639,7 +6575,7 @@ int8_to_char(PG_FUNCTION_ARGS)
     }
     else
    {
-        int         numstr_pre_len;
+        size_t      numstr_pre_len;
 
         if (IS_MULTI(&Num))
         {
@@ -6693,9 +6629,8 @@ int8_to_char(PG_FUNCTION_ARGS)
     PG_RETURN_TEXT_P(result);
 }
 
-/* -----------------
+/*
  * FLOAT4 to_char()
- * -----------------
 */
 Datum
 float4_to_char(PG_FUNCTION_ARGS)
@@ -6754,7 +6689,7 @@ float4_to_char(PG_FUNCTION_ARGS)
     {
         float4      val = value;
         char       *orgnum;
-        int         numstr_pre_len;
+        size_t      numstr_pre_len;
 
         if (IS_MULTI(&Num))
         {
@@ -6806,9 +6741,8 @@ float4_to_char(PG_FUNCTION_ARGS)
     PG_RETURN_TEXT_P(result);
 }
 
-/* -----------------
+/*
  * FLOAT8 to_char()
- * -----------------
 */
 Datum
 float8_to_char(PG_FUNCTION_ARGS)
@@ -6867,7 +6801,7 @@ float8_to_char(PG_FUNCTION_ARGS)
     {
         float8      val = value;
         char       *orgnum;
-        int         numstr_pre_len;
+        size_t      numstr_pre_len;
 
         if (IS_MULTI(&Num))
         {
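NOTE (illustrative, not part of the patch): the formatting.c hunks above retire the field-by-field zeroing macros (ZERO_tmfc, zeroize_NUM) in favor of a {0} initializer and plain memset. A minimal standalone sketch of the equivalence, using a hypothetical struct:

    #include <string.h>

    typedef struct { int flag; int multi; char *name; } Demo;   /* hypothetical */

    Demo a = {0};               /* every field zero/NULL at declaration */

    Demo b;
    memset(&b, 0, sizeof b);    /* same effect, usable at any later point */

Unlike the old macros, both spellings keep working unchanged if fields are later added to the struct, which is the maintenance hazard being removed.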
diff --git a/src/backend/utils/adt/geo_ops.c b/src/backend/utils/adt/geo_ops.c
index 377a1b3f3ade1..43b7eb43a79c6 100644
--- a/src/backend/utils/adt/geo_ops.c
+++ b/src/backend/utils/adt/geo_ops.c
@@ -423,7 +423,7 @@ box_in(PG_FUNCTION_ARGS)
 {
     char       *str = PG_GETARG_CSTRING(0);
     Node       *escontext = fcinfo->context;
-    BOX        *box = (BOX *) palloc(sizeof(BOX));
+    BOX        *box = palloc_object(BOX);
     bool        isopen;
     float8      x,
                 y;
@@ -470,7 +470,7 @@ box_recv(PG_FUNCTION_ARGS)
     float8      x,
                 y;
 
-    box = (BOX *) palloc(sizeof(BOX));
+    box = palloc_object(BOX);
 
     box->high.x = pq_getmsgfloat8(buf);
     box->high.y = pq_getmsgfloat8(buf);
@@ -849,7 +849,7 @@
 Datum
 box_center(PG_FUNCTION_ARGS)
 {
     BOX        *box = PG_GETARG_BOX_P(0);
-    Point      *result = (Point *) palloc(sizeof(Point));
+    Point      *result = palloc_object(Point);
 
     box_cn(result, box);
@@ -914,7 +914,7 @@ box_intersect(PG_FUNCTION_ARGS)
     if (!box_ov(box1, box2))
         PG_RETURN_NULL();
 
-    result = (BOX *) palloc(sizeof(BOX));
+    result = palloc_object(BOX);
 
     result->high.x = float8_min(box1->high.x, box2->high.x);
     result->low.x = float8_max(box1->low.x, box2->low.x);
@@ -933,7 +933,7 @@
 Datum
 box_diagonal(PG_FUNCTION_ARGS)
 {
     BOX        *box = PG_GETARG_BOX_P(0);
-    LSEG       *result = (LSEG *) palloc(sizeof(LSEG));
+    LSEG       *result = palloc_object(LSEG);
 
     statlseg_construct(result, &box->high, &box->low);
 
@@ -980,7 +980,7 @@ line_in(PG_FUNCTION_ARGS)
 {
     char       *str = PG_GETARG_CSTRING(0);
     Node       *escontext = fcinfo->context;
-    LINE       *line = (LINE *) palloc(sizeof(LINE));
+    LINE       *line = palloc_object(LINE);
     LSEG        lseg;
     bool        isopen;
     char       *s;
@@ -1040,7 +1040,7 @@ line_recv(PG_FUNCTION_ARGS)
     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     LINE       *line;
 
-    line = (LINE *) palloc(sizeof(LINE));
+    line = palloc_object(LINE);
 
     line->A = pq_getmsgfloat8(buf);
     line->B = pq_getmsgfloat8(buf);
@@ -1116,7 +1116,7 @@ line_construct_pp(PG_FUNCTION_ARGS)
 {
     Point      *pt1 = PG_GETARG_POINT_P(0);
     Point      *pt2 = PG_GETARG_POINT_P(1);
-    LINE       *result = (LINE *) palloc(sizeof(LINE));
+    LINE       *result = palloc_object(LINE);
 
     if (point_eq_point(pt1, pt2))
         ereport(ERROR,
@@ -1276,7 +1276,7 @@ line_distance(PG_FUNCTION_ARGS)
 
     PG_RETURN_FLOAT8(float8_div(fabs(float8_mi(l1->C,
                                                float8_mul(ratio, l2->C))),
-                                HYPOT(l1->A, l1->B)));
+                                hypot(l1->A, l1->B)));
 }
 
 /* line_interpt()
@@ -1289,7 +1289,7 @@ line_interpt(PG_FUNCTION_ARGS)
     LINE       *l2 = PG_GETARG_LINE_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     if (!line_interpt_line(result, l1, l2))
         PG_RETURN_NULL();
@@ -1831,7 +1831,7 @@
 Datum
 point_in(PG_FUNCTION_ARGS)
 {
     char       *str = PG_GETARG_CSTRING(0);
-    Point      *point = (Point *) palloc(sizeof(Point));
+    Point      *point = palloc_object(Point);
 
     /* Ignore failure from pair_decode, since our return value won't matter */
     pair_decode(str, &point->x, &point->y, NULL, "point", str, fcinfo->context);
@@ -1855,7 +1855,7 @@ point_recv(PG_FUNCTION_ARGS)
     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     Point      *point;
 
-    point = (Point *) palloc(sizeof(Point));
+    point = palloc_object(Point);
     point->x = pq_getmsgfloat8(buf);
     point->y = pq_getmsgfloat8(buf);
     PG_RETURN_POINT_P(point);
@@ -2001,7 +2001,7 @@ point_distance(PG_FUNCTION_ARGS)
 static inline float8
 point_dt(Point *pt1, Point *pt2)
 {
-    return HYPOT(float8_mi(pt1->x, pt2->x), float8_mi(pt1->y, pt2->y));
+    return hypot(float8_mi(pt1->x, pt2->x), float8_mi(pt1->y, pt2->y));
 }
 
 Datum
@@ -2066,7 +2066,7 @@ lseg_in(PG_FUNCTION_ARGS)
 {
     char       *str = PG_GETARG_CSTRING(0);
     Node       *escontext = fcinfo->context;
-    LSEG       *lseg = (LSEG *) palloc(sizeof(LSEG));
+    LSEG       *lseg = palloc_object(LSEG);
     bool        isopen;
 
     if (!path_decode(str, true, 2, &lseg->p[0], &isopen, NULL, "lseg", str,
@@ -2094,7 +2094,7 @@ lseg_recv(PG_FUNCTION_ARGS)
     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     LSEG       *lseg;
 
-    lseg = (LSEG *) palloc(sizeof(LSEG));
+    lseg = palloc_object(LSEG);
 
     lseg->p[0].x = pq_getmsgfloat8(buf);
     lseg->p[0].y = pq_getmsgfloat8(buf);
@@ -2130,7 +2130,7 @@ lseg_construct(PG_FUNCTION_ARGS)
 {
     Point      *pt1 = PG_GETARG_POINT_P(0);
     Point      *pt2 = PG_GETARG_POINT_P(1);
-    LSEG       *result = (LSEG *) palloc(sizeof(LSEG));
+    LSEG       *result = palloc_object(LSEG);
 
     statlseg_construct(result, pt1, pt2);
 
@@ -2318,7 +2318,7 @@ lseg_center(PG_FUNCTION_ARGS)
     LSEG       *lseg = PG_GETARG_LSEG_P(0);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     result->x = float8_div(float8_pl(lseg->p[0].x, lseg->p[1].x), 2.0);
     result->y = float8_div(float8_pl(lseg->p[0].y, lseg->p[1].y), 2.0);
@@ -2364,7 +2364,7 @@ lseg_interpt(PG_FUNCTION_ARGS)
     LSEG       *l2 = PG_GETARG_LSEG_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     if (!lseg_interpt_lseg(result, l1, l2))
         PG_RETURN_NULL();
@@ -2753,7 +2753,7 @@ close_pl(PG_FUNCTION_ARGS)
     LINE       *line = PG_GETARG_LINE_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     if (isnan(line_closept_point(result, line, pt)))
         PG_RETURN_NULL();
@@ -2794,7 +2794,7 @@ close_ps(PG_FUNCTION_ARGS)
     LSEG       *lseg = PG_GETARG_LSEG_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     if (isnan(lseg_closept_point(result, lseg, pt)))
         PG_RETURN_NULL();
@@ -2859,7 +2859,7 @@ close_lseg(PG_FUNCTION_ARGS)
     if (lseg_sl(l1) == lseg_sl(l2))
         PG_RETURN_NULL();
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     if (isnan(lseg_closept_lseg(result, l2, l1)))
         PG_RETURN_NULL();
@@ -2936,7 +2936,7 @@ close_pb(PG_FUNCTION_ARGS)
     BOX        *box = PG_GETARG_BOX_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     if (isnan(box_closept_point(result, box, pt)))
         PG_RETURN_NULL();
@@ -2994,7 +2994,7 @@ close_ls(PG_FUNCTION_ARGS)
     if (lseg_sl(lseg) == line_sl(line))
         PG_RETURN_NULL();
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     if (isnan(lseg_closept_line(result, lseg, line)))
         PG_RETURN_NULL();
@@ -3066,7 +3066,7 @@ close_sb(PG_FUNCTION_ARGS)
     BOX        *box = PG_GETARG_BOX_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     if (isnan(box_closept_lseg(result, box, lseg)))
         PG_RETURN_NULL();
@@ -4099,7 +4099,7 @@ construct_point(PG_FUNCTION_ARGS)
     float8      y = PG_GETARG_FLOAT8(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     point_construct(result, x, y);
 
@@ -4122,7 +4122,7 @@ point_add(PG_FUNCTION_ARGS)
     Point      *p2 = PG_GETARG_POINT_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     point_add_point(result, p1, p2);
 
@@ -4145,7 +4145,7 @@ point_sub(PG_FUNCTION_ARGS)
     Point      *p2 = PG_GETARG_POINT_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     point_sub_point(result, p1, p2);
 
@@ -4170,7 +4170,7 @@ point_mul(PG_FUNCTION_ARGS)
     Point      *p2 = PG_GETARG_POINT_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     point_mul_point(result, p1, p2);
 
@@ -4199,7 +4199,7 @@ point_div(PG_FUNCTION_ARGS)
     Point      *p2 = PG_GETARG_POINT_P(1);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     point_div_point(result, p1, p2);
 
@@ -4220,7 +4220,7 @@ points_box(PG_FUNCTION_ARGS)
     Point      *p2 = PG_GETARG_POINT_P(1);
     BOX        *result;
 
-    result = (BOX *) palloc(sizeof(BOX));
+    result = palloc_object(BOX);
 
     box_construct(result, p1, p2);
 
@@ -4234,7 +4234,7 @@ box_add(PG_FUNCTION_ARGS)
     Point      *p = PG_GETARG_POINT_P(1);
     BOX        *result;
 
-    result = (BOX *) palloc(sizeof(BOX));
+    result = palloc_object(BOX);
 
     point_add_point(&result->high, &box->high, p);
     point_add_point(&result->low, &box->low, p);
@@ -4249,7 +4249,7 @@ box_sub(PG_FUNCTION_ARGS)
     Point      *p = PG_GETARG_POINT_P(1);
     BOX        *result;
 
-    result = (BOX *) palloc(sizeof(BOX));
+    result = palloc_object(BOX);
 
     point_sub_point(&result->high, &box->high, p);
     point_sub_point(&result->low, &box->low, p);
@@ -4266,7 +4266,7 @@ box_mul(PG_FUNCTION_ARGS)
     Point       high,
                 low;
 
-    result = (BOX *) palloc(sizeof(BOX));
+    result = palloc_object(BOX);
 
     point_mul_point(&high, &box->high, p);
     point_mul_point(&low, &box->low, p);
@@ -4285,7 +4285,7 @@ box_div(PG_FUNCTION_ARGS)
     Point       high,
                 low;
 
-    result = (BOX *) palloc(sizeof(BOX));
+    result = palloc_object(BOX);
 
     point_div_point(&high, &box->high, p);
     point_div_point(&low, &box->low, p);
@@ -4304,7 +4304,7 @@ point_box(PG_FUNCTION_ARGS)
     Point      *pt = PG_GETARG_POINT_P(0);
     BOX        *box;
 
-    box = (BOX *) palloc(sizeof(BOX));
+    box = palloc_object(BOX);
 
     box->high.x = pt->x;
     box->low.x = pt->x;
@@ -4324,7 +4324,7 @@ boxes_bound_box(PG_FUNCTION_ARGS)
                *box2 = PG_GETARG_BOX_P(1),
                *container;
 
-    container = (BOX *) palloc(sizeof(BOX));
+    container = palloc_object(BOX);
 
     container->high.x = float8_max(box1->high.x, box2->high.x);
     container->low.x = float8_min(box1->low.x, box2->low.x);
@@ -4506,7 +4506,7 @@ poly_center(PG_FUNCTION_ARGS)
     Point      *result;
     CIRCLE      circle;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     poly_to_circle(&circle, poly);
     *result = circle.center;
@@ -4521,7 +4521,7 @@ poly_box(PG_FUNCTION_ARGS)
     POLYGON    *poly = PG_GETARG_POLYGON_P(0);
     BOX        *box;
 
-    box = (BOX *) palloc(sizeof(BOX));
+    box = palloc_object(BOX);
     *box = poly->boundbox;
 
     PG_RETURN_BOX_P(box);
@@ -4612,7 +4612,7 @@ circle_in(PG_FUNCTION_ARGS)
 {
     char       *str = PG_GETARG_CSTRING(0);
     Node       *escontext = fcinfo->context;
-    CIRCLE     *circle = (CIRCLE *) palloc(sizeof(CIRCLE));
+    CIRCLE     *circle = palloc_object(CIRCLE);
     char       *s,
                *cp;
     int         depth = 0;
@@ -4705,7 +4705,7 @@ circle_recv(PG_FUNCTION_ARGS)
     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     CIRCLE     *circle;
 
-    circle = (CIRCLE *) palloc(sizeof(CIRCLE));
+    circle = palloc_object(CIRCLE);
 
     circle->center.x = pq_getmsgfloat8(buf);
     circle->center.y = pq_getmsgfloat8(buf);
@@ -4968,7 +4968,7 @@ circle_add_pt(PG_FUNCTION_ARGS)
     Point      *point = PG_GETARG_POINT_P(1);
     CIRCLE     *result;
 
-    result = (CIRCLE *) palloc(sizeof(CIRCLE));
+    result = palloc_object(CIRCLE);
 
     point_add_point(&result->center, &circle->center, point);
     result->radius = circle->radius;
@@ -4983,7 +4983,7 @@ circle_sub_pt(PG_FUNCTION_ARGS)
     Point      *point = PG_GETARG_POINT_P(1);
     CIRCLE     *result;
 
-    result = (CIRCLE *) palloc(sizeof(CIRCLE));
+    result = palloc_object(CIRCLE);
 
     point_sub_point(&result->center, &circle->center, point);
     result->radius = circle->radius;
@@ -5002,10 +5002,10 @@ circle_mul_pt(PG_FUNCTION_ARGS)
     Point      *point = PG_GETARG_POINT_P(1);
     CIRCLE     *result;
 
-    result = (CIRCLE *) palloc(sizeof(CIRCLE));
+    result = palloc_object(CIRCLE);
 
     point_mul_point(&result->center, &circle->center, point);
-    result->radius = float8_mul(circle->radius, HYPOT(point->x, point->y));
+    result->radius = float8_mul(circle->radius, hypot(point->x, point->y));
 
     PG_RETURN_CIRCLE_P(result);
 }
@@ -5017,10 +5017,10 @@ circle_div_pt(PG_FUNCTION_ARGS)
     Point      *point = PG_GETARG_POINT_P(1);
     CIRCLE     *result;
 
-    result = (CIRCLE *) palloc(sizeof(CIRCLE));
+    result = palloc_object(CIRCLE);
 
     point_div_point(&result->center, &circle->center, point);
-    result->radius = float8_div(circle->radius, HYPOT(point->x, point->y));
+    result->radius = float8_div(circle->radius, hypot(point->x, point->y));
 
     PG_RETURN_CIRCLE_P(result);
 }
@@ -5145,7 +5145,7 @@ circle_center(PG_FUNCTION_ARGS)
     CIRCLE     *circle = PG_GETARG_CIRCLE_P(0);
     Point      *result;
 
-    result = (Point *) palloc(sizeof(Point));
+    result = palloc_object(Point);
 
     result->x = circle->center.x;
     result->y = circle->center.y;
@@ -5173,7 +5173,7 @@ cr_circle(PG_FUNCTION_ARGS)
     float8      radius = PG_GETARG_FLOAT8(1);
     CIRCLE     *result;
 
-    result = (CIRCLE *) palloc(sizeof(CIRCLE));
+    result = palloc_object(CIRCLE);
 
     result->center.x = center->x;
     result->center.y = center->y;
@@ -5189,7 +5189,7 @@ circle_box(PG_FUNCTION_ARGS)
     BOX        *box;
     float8      delta;
 
-    box = (BOX *) palloc(sizeof(BOX));
+    box = palloc_object(BOX);
 
     delta = float8_div(circle->radius, sqrt(2.0));
 
@@ -5210,7 +5210,7 @@ box_circle(PG_FUNCTION_ARGS)
     BOX        *box = PG_GETARG_BOX_P(0);
     CIRCLE     *circle;
 
-    circle = (CIRCLE *) palloc(sizeof(CIRCLE));
+    circle = palloc_object(CIRCLE);
 
     circle->center.x = float8_div(float8_pl(box->high.x, box->low.x), 2.0);
     circle->center.y = float8_div(float8_pl(box->high.y, box->low.y), 2.0);
@@ -5309,7 +5309,7 @@ poly_circle(PG_FUNCTION_ARGS)
     POLYGON    *poly = PG_GETARG_POLYGON_P(0);
     CIRCLE     *result;
 
-    result = (CIRCLE *) palloc(sizeof(CIRCLE));
+    result = palloc_object(CIRCLE);
 
     poly_to_circle(result, poly);
 
@@ -5492,71 +5492,3 @@ plist_same(int npts, Point *p1, Point *p2)
 
     return false;
 }
-
-
-/*-------------------------------------------------------------------------
- * Determine the hypotenuse.
- *
- * If required, x and y are swapped to make x the larger number. The
- * traditional formula of x^2+y^2 is rearranged to factor x outside the
- * sqrt. This allows computation of the hypotenuse for significantly
- * larger values, and with a higher precision than when using the naive
- * formula. In particular, this cannot overflow unless the final result
- * would be out-of-range.
- *
- * sqrt( x^2 + y^2 ) = sqrt( x^2( 1 + y^2/x^2) )
- *                   = x * sqrt( 1 + y^2/x^2 )
- *                   = x * sqrt( 1 + y/x * y/x )
- *
- * It is expected that this routine will eventually be replaced with the
- * C99 hypot() function.
- *
- * This implementation conforms to IEEE Std 1003.1 and GLIBC, in that the
- * case of hypot(inf,nan) results in INF, and not NAN.
- *-----------------------------------------------------------------------
- */
-float8
-pg_hypot(float8 x, float8 y)
-{
-    float8      yx,
-                result;
-
-    /* Handle INF and NaN properly */
-    if (isinf(x) || isinf(y))
-        return get_float8_infinity();
-
-    if (isnan(x) || isnan(y))
-        return get_float8_nan();
-
-    /* Else, drop any minus signs */
-    x = fabs(x);
-    y = fabs(y);
-
-    /* Swap x and y if needed to make x the larger one */
-    if (x < y)
-    {
-        float8      temp = x;
-
-        x = y;
-        y = temp;
-    }
-
-    /*
-     * If y is zero, the hypotenuse is x. This test saves a few cycles in
-     * such cases, but more importantly it also protects against
-     * divide-by-zero errors, since now x >= y.
-     */
-    if (y == 0.0)
-        return x;
-
-    /* Determine the hypotenuse */
-    yx = y / x;
-    result = x * sqrt(1.0 + (yx * yx));
-
-    if (unlikely(isinf(result)))
-        float_overflow_error();
-    if (unlikely(result == 0.0))
-        float_underflow_error();
-
-    return result;
-}
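NOTE (illustrative, not part of the patch): geo_ops.c now relies on the C99 hypot() instead of the private pg_hypot(), whose removed comment above documents the rescaling trick; hypot() is specified to avoid intermediate overflow and underflow, which the naive formula cannot. A minimal standalone sketch:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double x = 1e200, y = 1e200;

        /* x * x already overflows to infinity, so the naive form fails */
        printf("naive: %g\n", sqrt(x * x + y * y));   /* prints inf */

        /* hypot() rescales internally, as pg_hypot() used to */
        printf("hypot: %g\n", hypot(x, y));           /* ~1.41421e+200 */
        return 0;
    }

One behavioral difference worth noting: the removed wrapper turned overflow and underflow into errors via float_overflow_error()/float_underflow_error(), while hypot() simply returns the correctly rounded value (infinity only when the true result is out of range).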
diff --git a/src/backend/utils/adt/geo_spgist.c b/src/backend/utils/adt/geo_spgist.c
index fec33e9537243..94d351d478608 100644
--- a/src/backend/utils/adt/geo_spgist.c
+++ b/src/backend/utils/adt/geo_spgist.c
@@ -156,7 +156,7 @@ getQuadrant(BOX *centroid, BOX *inBox)
 static RangeBox *
 getRangeBox(BOX *box)
 {
-    RangeBox   *range_box = (RangeBox *) palloc(sizeof(RangeBox));
+    RangeBox   *range_box = palloc_object(RangeBox);
 
     range_box->left.low = box->low.x;
     range_box->left.high = box->high.x;
@@ -176,7 +176,7 @@ getRangeBox(BOX *box)
 static RectBox *
 initRectBox(void)
 {
-    RectBox    *rect_box = (RectBox *) palloc(sizeof(RectBox));
+    RectBox    *rect_box = palloc_object(RectBox);
     float8      infinity = get_float8_infinity();
 
     rect_box->range_box_x.left.low = -infinity;
@@ -204,7 +204,7 @@ initRectBox(void)
 static RectBox *
 nextRectBox(RectBox *rect_box, RangeBox *centroid, uint8 quadrant)
 {
-    RectBox    *next_rect_box = (RectBox *) palloc(sizeof(RectBox));
+    RectBox    *next_rect_box = palloc_object(RectBox);
 
     memcpy(next_rect_box, rect_box, sizeof(RectBox));
 
@@ -390,7 +390,7 @@ pointToRectBoxDistance(Point *point, RectBox *rect_box)
     else
         dy = 0;
 
-    return HYPOT(dx, dy);
+    return hypot(dx, dy);
 }
 
 
@@ -445,10 +445,10 @@ spg_box_quad_picksplit(PG_FUNCTION_ARGS)
     BOX        *centroid;
     int         median,
                 i;
-    float8     *lowXs = palloc(sizeof(float8) * in->nTuples);
-    float8     *highXs = palloc(sizeof(float8) * in->nTuples);
-    float8     *lowYs = palloc(sizeof(float8) * in->nTuples);
-    float8     *highYs = palloc(sizeof(float8) * in->nTuples);
+    float8     *lowXs = palloc_array(float8, in->nTuples);
+    float8     *highXs = palloc_array(float8, in->nTuples);
+    float8     *lowYs = palloc_array(float8, in->nTuples);
+    float8     *highYs = palloc_array(float8, in->nTuples);
 
     /* Calculate median of all 4D coordinates */
     for (i = 0; i < in->nTuples; i++)
@@ -468,7 +468,7 @@ spg_box_quad_picksplit(PG_FUNCTION_ARGS)
 
     median = in->nTuples / 2;
 
-    centroid = palloc(sizeof(BOX));
+    centroid = palloc_object(BOX);
 
     centroid->low.x = lowXs[median];
     centroid->high.x = highXs[median];
@@ -482,8 +482,8 @@ spg_box_quad_picksplit(PG_FUNCTION_ARGS)
 
     out->nNodes = 16;
     out->nodeLabels = NULL;     /* We don't need node labels. */
-    out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples);
-    out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples);
+    out->mapTuplesToNodes = palloc_array(int, in->nTuples);
+    out->leafTupleDatums = palloc_array(Datum, in->nTuples);
 
     /*
      * Assign ranges to corresponding nodes according to quadrants relative to
@@ -574,13 +574,13 @@ spg_box_quad_inner_consistent(PG_FUNCTION_ARGS)
     {
         /* Report that all nodes should be visited */
         out->nNodes = in->nNodes;
-        out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes);
+        out->nodeNumbers = palloc_array(int, in->nNodes);
         for (i = 0; i < in->nNodes; i++)
             out->nodeNumbers[i] = i;
 
         if (in->norderbys > 0 && in->nNodes > 0)
         {
-            double     *distances = palloc(sizeof(double) * in->norderbys);
+            double     *distances = palloc_array(double, in->norderbys);
             int         j;
 
             for (j = 0; j < in->norderbys; j++)
@@ -590,12 +590,12 @@ spg_box_quad_inner_consistent(PG_FUNCTION_ARGS)
                 distances[j] = pointToRectBoxDistance(pt, rect_box);
             }
 
-            out->distances = (double **) palloc(sizeof(double *) * in->nNodes);
+            out->distances = palloc_array(double *, in->nNodes);
             out->distances[0] = distances;
 
             for (i = 1; i < in->nNodes; i++)
             {
-                out->distances[i] = palloc(sizeof(double) * in->norderbys);
+                out->distances[i] = palloc_array(double, in->norderbys);
                 memcpy(out->distances[i], distances,
                        sizeof(double) * in->norderbys);
             }
@@ -609,7 +609,7 @@ spg_box_quad_inner_consistent(PG_FUNCTION_ARGS)
      * following operations.
      */
     centroid = getRangeBox(DatumGetBoxP(in->prefixDatum));
-    queries = (RangeBox **) palloc(in->nkeys * sizeof(RangeBox *));
+    queries = palloc_array(RangeBox *, in->nkeys);
     for (i = 0; i < in->nkeys; i++)
     {
         BOX        *box = spg_box_quad_get_scankey_bbox(&in->scankeys[i], NULL);
@@ -619,10 +619,10 @@ spg_box_quad_inner_consistent(PG_FUNCTION_ARGS)
 
     /* Allocate enough memory for nodes */
     out->nNodes = 0;
-    out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes);
-    out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes);
+    out->nodeNumbers = palloc_array(int, in->nNodes);
+    out->traversalValues = palloc_array(void *, in->nNodes);
     if (in->norderbys > 0)
-        out->distances = (double **) palloc(sizeof(double *) * in->nNodes);
+        out->distances = palloc_array(double *, in->nNodes);
 
     /*
      * We switch memory context, because we want to allocate memory for new
@@ -703,7 +703,7 @@ spg_box_quad_inner_consistent(PG_FUNCTION_ARGS)
 
             if (in->norderbys > 0)
             {
-                double     *distances = palloc(sizeof(double) * in->norderbys);
+                double     *distances = palloc_array(double, in->norderbys);
                 int         j;
 
                 out->distances[out->nNodes] = distances;
@@ -878,7 +878,7 @@ spg_poly_quad_compress(PG_FUNCTION_ARGS)
     POLYGON    *polygon = PG_GETARG_POLYGON_P(0);
     BOX        *box;
 
-    box = (BOX *) palloc(sizeof(BOX));
+    box = palloc_object(BOX);
     *box = polygon->boundbox;
 
     PG_RETURN_BOX_P(box);
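NOTE (illustrative, not part of the patch): the geometric files convert bare `(type *) palloc(sizeof(type))` calls to palloc_object()/palloc_array(), the type-safe wrappers from src/include/utils/palloc.h. Their shape is roughly:

    #define palloc_object(type)        ((type *) palloc(sizeof(type)))
    #define palloc_array(type, count)  ((type *) palloc(sizeof(type) * (count)))

    /* usage, as in the hunks above */
    BOX    *box  = palloc_object(BOX);
    float8 *lows = palloc_array(float8, in->nTuples);

The gain is that the allocation size and the cast can no longer disagree, a classic copy-paste bug with the spelled-out form.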
diff --git a/src/backend/utils/adt/hbafuncs.c b/src/backend/utils/adt/hbafuncs.c
index b62c3d944cf1a..1614d6d230298 100644
--- a/src/backend/utils/adt/hbafuncs.c
+++ b/src/backend/utils/adt/hbafuncs.c
@@ -14,6 +14,7 @@
 */
 #include "postgres.h"
 
+#include "access/htup_details.h"
 #include "catalog/objectaddress.h"
 #include "common/ip.h"
 #include "funcapi.h"
diff --git a/src/backend/utils/adt/inet_net_pton.c b/src/backend/utils/adt/inet_net_pton.c
index ef2236d9f0430..3b0db2a379937 100644
--- a/src/backend/utils/adt/inet_net_pton.c
+++ b/src/backend/utils/adt/inet_net_pton.c
@@ -115,8 +115,7 @@ inet_cidr_pton_ipv4(const char *src, u_char *dst, size_t size)
         src++;                  /* skip x or X. */
         while ((ch = *src++) != '\0' && isxdigit((unsigned char) ch))
         {
-            if (isupper((unsigned char) ch))
-                ch = tolower((unsigned char) ch);
+            ch = pg_ascii_tolower((unsigned char) ch);
             n = strchr(xdigits, ch) - xdigits;
             assert(n >= 0 && n <= 15);
             if (dirty == 0)
diff --git a/src/backend/utils/adt/int.c b/src/backend/utils/adt/int.c
index b5781989a64d5..60411ee024ded 100644
--- a/src/backend/utils/adt/int.c
+++ b/src/backend/utils/adt/int.c
@@ -1537,7 +1537,7 @@ generate_series_step_int4(PG_FUNCTION_ARGS)
         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
         /* allocate memory for user context */
-        fctx = (generate_series_fctx *) palloc(sizeof(generate_series_fctx));
+        fctx = palloc_object(generate_series_fctx);
 
         /*
          * Use fctx to keep state from call to call.  Seed current with the
diff --git a/src/backend/utils/adt/int8.c b/src/backend/utils/adt/int8.c
index 9dd5889f34c62..678f971508be2 100644
--- a/src/backend/utils/adt/int8.c
+++ b/src/backend/utils/adt/int8.c
@@ -24,7 +24,7 @@
 #include "nodes/supportnodes.h"
 #include "optimizer/optimizer.h"
 #include "utils/builtins.h"
-
+#include "utils/fmgroids.h"
 
 typedef struct
 {
@@ -718,76 +718,29 @@ int8lcm(PG_FUNCTION_ARGS)
 Datum
 int8inc(PG_FUNCTION_ARGS)
 {
-    /*
-     * When int8 is pass-by-reference, we provide this special case to avoid
-     * palloc overhead for COUNT(): when called as an aggregate, we know that
-     * the argument is modifiable local storage, so just update it in-place.
-     * (If int8 is pass-by-value, then of course this is useless as well as
-     * incorrect, so just ifdef it out.)
-     */
-#ifndef USE_FLOAT8_BYVAL        /* controls int8 too */
-    if (AggCheckCallContext(fcinfo, NULL))
-    {
-        int64      *arg = (int64 *) PG_GETARG_POINTER(0);
-
-        if (unlikely(pg_add_s64_overflow(*arg, 1, arg)))
-            ereport(ERROR,
-                    (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-                     errmsg("bigint out of range")));
-
-        PG_RETURN_POINTER(arg);
-    }
-    else
-#endif
-    {
-        /* Not called as an aggregate, so just do it the dumb way */
-        int64       arg = PG_GETARG_INT64(0);
-        int64       result;
+    int64       arg = PG_GETARG_INT64(0);
+    int64       result;
 
-        if (unlikely(pg_add_s64_overflow(arg, 1, &result)))
-            ereport(ERROR,
-                    (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-                     errmsg("bigint out of range")));
+    if (unlikely(pg_add_s64_overflow(arg, 1, &result)))
+        ereport(ERROR,
+                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+                 errmsg("bigint out of range")));
 
-        PG_RETURN_INT64(result);
-    }
+    PG_RETURN_INT64(result);
 }
 
 Datum
 int8dec(PG_FUNCTION_ARGS)
 {
-    /*
-     * When int8 is pass-by-reference, we provide this special case to avoid
-     * palloc overhead for COUNT(): when called as an aggregate, we know that
-     * the argument is modifiable local storage, so just update it in-place.
-     * (If int8 is pass-by-value, then of course this is useless as well as
-     * incorrect, so just ifdef it out.)
-     */
-#ifndef USE_FLOAT8_BYVAL        /* controls int8 too */
-    if (AggCheckCallContext(fcinfo, NULL))
-    {
-        int64      *arg = (int64 *) PG_GETARG_POINTER(0);
-
-        if (unlikely(pg_sub_s64_overflow(*arg, 1, arg)))
-            ereport(ERROR,
-                    (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-                     errmsg("bigint out of range")));
-        PG_RETURN_POINTER(arg);
-    }
-    else
-#endif
-    {
-        /* Not called as an aggregate, so just do it the dumb way */
-        int64       arg = PG_GETARG_INT64(0);
-        int64       result;
+    int64       arg = PG_GETARG_INT64(0);
+    int64       result;
 
-        if (unlikely(pg_sub_s64_overflow(arg, 1, &result)))
-            ereport(ERROR,
-                    (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-                     errmsg("bigint out of range")));
+    if (unlikely(pg_sub_s64_overflow(arg, 1, &result)))
+        ereport(ERROR,
+                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+                 errmsg("bigint out of range")));
 
-        PG_RETURN_INT64(result);
-    }
+    PG_RETURN_INT64(result);
 }
 
 
@@ -858,6 +811,53 @@ int8inc_support(PG_FUNCTION_ARGS)
         PG_RETURN_POINTER(req);
     }
 
+    if (IsA(rawreq, SupportRequestSimplifyAggref))
+    {
+        SupportRequestSimplifyAggref *req = (SupportRequestSimplifyAggref *) rawreq;
+        Aggref     *agg = req->aggref;
+
+        /*
+         * Check for COUNT(ANY) and try to convert it to COUNT(*).  The input
+         * argument must be provably non-NULL, the aggregate cannot use
+         * ORDER BY or DISTINCT, and agglevelsup must be 0.
+         *
+         * Technically COUNT(ANY) must have 1 arg, but be paranoid and check.
+         */
+        if (agg->aggfnoid == F_COUNT_ANY && list_length(agg->args) == 1)
+        {
+            TargetEntry *tle = (TargetEntry *) linitial(agg->args);
+            Expr       *arg = tle->expr;
+
+            /* Check for unsupported cases */
+            if (agg->aggdistinct != NIL || agg->aggorder != NIL ||
+                agg->agglevelsup != 0)
+                PG_RETURN_POINTER(NULL);
+
+            /* If the arg isn't NULLable, do the conversion */
+            if (expr_is_nonnullable(req->root, arg, false))
+            {
+                Aggref     *newagg;
+
+                /* We don't expect these to have been set yet */
+                Assert(agg->aggtransno == -1);
+                Assert(agg->aggtranstype == InvalidOid);
+
+                /* Convert COUNT(ANY) to COUNT(*) by making a new Aggref */
+                newagg = makeNode(Aggref);
+                memcpy(newagg, agg, sizeof(Aggref));
+                newagg->aggfnoid = F_COUNT_;
+
+                /* count(*) has no args */
+                newagg->aggargtypes = NULL;
+                newagg->args = NULL;
+                newagg->aggstar = true;
+                newagg->location = -1;
+
+                PG_RETURN_POINTER(newagg);
+            }
+        }
+    }
+
     PG_RETURN_POINTER(NULL);
 }
 
@@ -1411,7 +1411,7 @@ generate_series_step_int8(PG_FUNCTION_ARGS)
         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
         /* allocate memory for user context */
-        fctx = (generate_series_fctx *) palloc(sizeof(generate_series_fctx));
+        fctx = palloc_object(generate_series_fctx);
 
         /*
         * Use fctx to keep state from call to call.  Seed current with the
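NOTE (illustrative, not part of the patch): the new int8inc_support() branch uses the planner's support-function protocol for SupportRequestSimplifyAggref, so that a query like count(col) over a provably non-null column can be planned as the cheaper count(*). A hypothetical support function sketching the general protocol (my_count_support and its behavior are assumptions, not part of this patch):

    #include "postgres.h"
    #include "fmgr.h"
    #include "nodes/supportnodes.h"

    Datum
    my_count_support(PG_FUNCTION_ARGS)
    {
        /* the single argument is a "support request" node */
        Node   *rawreq = (Node *) PG_GETARG_POINTER(0);

        if (IsA(rawreq, SupportRequestSimplifyAggref))
        {
            SupportRequestSimplifyAggref *req =
                (SupportRequestSimplifyAggref *) rawreq;

            /*
             * Inspect req->aggref and, when the transformation is legal,
             * build and return a cheaper replacement Aggref, as
             * int8inc_support() does above.
             */
        }

        PG_RETURN_POINTER(NULL);    /* NULL means: decline, keep the call */
    }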
Seed current with the diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index 51452755f5868..73cacf33e614b 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/htup_details.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" #include "common/hashfn.h" @@ -88,7 +89,7 @@ typedef struct JsonAggState static void composite_to_json(Datum composite, StringInfo result, bool use_line_feeds); static void array_dim_to_json(StringInfo result, int dim, int ndims, int *dims, - Datum *vals, bool *nulls, int *valcount, + const Datum *vals, const bool *nulls, int *valcount, JsonTypeCategory tcategory, Oid outfuncoid, bool use_line_feeds); static void array_to_json_internal(Datum array, StringInfo result, @@ -428,8 +429,8 @@ JsonEncodeDateTime(char *buf, Datum value, Oid typid, const int *tzp) * ourselves recursively to process the next dimension. */ static void -array_dim_to_json(StringInfo result, int dim, int ndims, int *dims, Datum *vals, - bool *nulls, int *valcount, JsonTypeCategory tcategory, +array_dim_to_json(StringInfo result, int dim, int ndims, int *dims, const Datum *vals, + const bool *nulls, int *valcount, JsonTypeCategory tcategory, Oid outfuncoid, bool use_line_feeds) { int i; @@ -630,13 +631,13 @@ Datum array_to_json(PG_FUNCTION_ARGS) { Datum array = PG_GETARG_DATUM(0); - StringInfo result; + StringInfoData result; - result = makeStringInfo(); + initStringInfo(&result); - array_to_json_internal(array, result, false); + array_to_json_internal(array, &result, false); - PG_RETURN_TEXT_P(cstring_to_text_with_len(result->data, result->len)); + PG_RETURN_TEXT_P(cstring_to_text_with_len(result.data, result.len)); } /* @@ -647,13 +648,13 @@ array_to_json_pretty(PG_FUNCTION_ARGS) { Datum array = PG_GETARG_DATUM(0); bool use_line_feeds = PG_GETARG_BOOL(1); - StringInfo result; + StringInfoData result; - result = makeStringInfo(); + initStringInfo(&result); - array_to_json_internal(array, result, use_line_feeds); + array_to_json_internal(array, &result, use_line_feeds); - PG_RETURN_TEXT_P(cstring_to_text_with_len(result->data, result->len)); + PG_RETURN_TEXT_P(cstring_to_text_with_len(result.data, result.len)); } /* @@ -663,13 +664,13 @@ Datum row_to_json(PG_FUNCTION_ARGS) { Datum array = PG_GETARG_DATUM(0); - StringInfo result; + StringInfoData result; - result = makeStringInfo(); + initStringInfo(&result); - composite_to_json(array, result, false); + composite_to_json(array, &result, false); - PG_RETURN_TEXT_P(cstring_to_text_with_len(result->data, result->len)); + PG_RETURN_TEXT_P(cstring_to_text_with_len(result.data, result.len)); } /* @@ -680,13 +681,13 @@ row_to_json_pretty(PG_FUNCTION_ARGS) { Datum array = PG_GETARG_DATUM(0); bool use_line_feeds = PG_GETARG_BOOL(1); - StringInfo result; + StringInfoData result; - result = makeStringInfo(); + initStringInfo(&result); - composite_to_json(array, result, use_line_feeds); + composite_to_json(array, &result, use_line_feeds); - PG_RETURN_TEXT_P(cstring_to_text_with_len(result->data, result->len)); + PG_RETURN_TEXT_P(cstring_to_text_with_len(result.data, result.len)); } /* @@ -762,12 +763,13 @@ to_json(PG_FUNCTION_ARGS) Datum datum_to_json(Datum val, JsonTypeCategory tcategory, Oid outfuncoid) { - StringInfo result = makeStringInfo(); + StringInfoData result; - datum_to_json_internal(val, false, result, tcategory, outfuncoid, + initStringInfo(&result); + datum_to_json_internal(val, false, &result, tcategory, outfuncoid, false); - return 
PointerGetDatum(cstring_to_text_with_len(result->data, result->len)); + return PointerGetDatum(cstring_to_text_with_len(result.data, result.len)); } /* @@ -805,7 +807,7 @@ json_agg_transfn_worker(FunctionCallInfo fcinfo, bool absent_on_null) * use the right context to enlarge the object if necessary. */ oldcontext = MemoryContextSwitchTo(aggcontext); - state = (JsonAggState *) palloc(sizeof(JsonAggState)); + state = palloc_object(JsonAggState); state->str = makeStringInfo(); MemoryContextSwitchTo(oldcontext); @@ -904,7 +906,7 @@ json_unique_hash(const void *key, Size keysize) hash ^= hash_bytes((const unsigned char *) entry->key, entry->key_len); - return DatumGetUInt32(hash); + return hash; } static int @@ -1027,7 +1029,7 @@ json_object_agg_transfn_worker(FunctionCallInfo fcinfo, * sure they use the right context to enlarge the object if necessary. */ oldcontext = MemoryContextSwitchTo(aggcontext); - state = (JsonAggState *) palloc(sizeof(JsonAggState)); + state = palloc_object(JsonAggState); state->str = makeStringInfo(); if (unique_keys) json_unique_builder_init(&state->unique_check); @@ -1346,25 +1348,25 @@ json_build_array_worker(int nargs, const Datum *args, const bool *nulls, const O { int i; const char *sep = ""; - StringInfo result; + StringInfoData result; - result = makeStringInfo(); + initStringInfo(&result); - appendStringInfoChar(result, '['); + appendStringInfoChar(&result, '['); for (i = 0; i < nargs; i++) { if (absent_on_null && nulls[i]) continue; - appendStringInfoString(result, sep); + appendStringInfoString(&result, sep); sep = ", "; - add_json(args[i], nulls[i], result, types[i], false); + add_json(args[i], nulls[i], &result, types[i], false); } - appendStringInfoChar(result, ']'); + appendStringInfoChar(&result, ']'); - return PointerGetDatum(cstring_to_text_with_len(result->data, result->len)); + return PointerGetDatum(cstring_to_text_with_len(result.data, result.len)); } /* @@ -1760,7 +1762,7 @@ json_unique_object_start(void *_state) return JSON_SUCCESS; /* push object entry to stack */ - entry = palloc(sizeof(*entry)); + entry = palloc_object(JsonUniqueStackEntry); entry->object_id = state->id_counter++; entry->parent = state->stack; state->stack = entry; diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c index da94d424d617f..dcf84c3fddc23 100644 --- a/src/backend/utils/adt/jsonb.c +++ b/src/backend/utils/adt/jsonb.c @@ -19,23 +19,16 @@ #include "libpq/pqformat.h" #include "miscadmin.h" #include "utils/builtins.h" +#include "utils/fmgroids.h" #include "utils/json.h" #include "utils/jsonb.h" #include "utils/jsonfuncs.h" #include "utils/lsyscache.h" #include "utils/typcache.h" -typedef struct JsonbInState -{ - JsonbParseState *parseState; - JsonbValue *res; - bool unique_keys; - Node *escontext; -} JsonbInState; - typedef struct JsonbAggState { - JsonbInState *res; + JsonbInState pstate; JsonTypeCategory key_category; Oid key_output_func; JsonTypeCategory val_category; @@ -62,7 +55,6 @@ static void datum_to_jsonb_internal(Datum val, bool is_null, JsonbInState *resul bool key_scalar); static void add_jsonb(Datum val, bool is_null, JsonbInState *result, Oid val_type, bool key_scalar); -static JsonbParseState *clone_parse_state(JsonbParseState *state); static char *JsonbToCStringWorker(StringInfo out, JsonbContainer *in, int estimated_len, bool indent); static void add_indent(StringInfo out, bool indent, int level); @@ -125,15 +117,16 @@ jsonb_send(PG_FUNCTION_ARGS) { Jsonb *jb = PG_GETARG_JSONB_P(0); StringInfoData buf; - StringInfo jtext = 
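
/*
 * Aside: the makeStringInfo() -> initStringInfo() conversions above all
 * follow one pattern.  When the string is consumed within a single
 * function, the StringInfoData header can live on the stack, so only the
 * data buffer itself is palloc'd -- one allocation instead of two.
 * A minimal sketch:
 */
{
	StringInfoData buf;

	initStringInfo(&buf);		/* pallocs only buf.data */
	appendStringInfoString(&buf, "some text");
	/* ... consume buf.data / buf.len ... */
	pfree(buf.data);			/* no separately palloc'd header to free */
}
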
makeStringInfo(); + StringInfoData jtext; int version = 1; - (void) JsonbToCString(jtext, &jb->root, VARSIZE(jb)); + initStringInfo(&jtext); + (void) JsonbToCString(&jtext, &jb->root, VARSIZE(jb)); pq_begintypsend(&buf); pq_sendint8(&buf, version); - pq_sendtext(&buf, jtext->data, jtext->len); - destroyStringInfo(jtext); + pq_sendtext(&buf, jtext.data, jtext.len); + pfree(jtext.data); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } @@ -269,8 +262,8 @@ jsonb_from_cstring(char *json, int len, bool unique_keys, Node *escontext) if (!pg_parse_json_or_errsave(&lex, &sem, escontext)) return (Datum) 0; - /* after parsing, the item member has the composed jsonb structure */ - PG_RETURN_POINTER(JsonbValueToJsonb(state.res)); + /* after parsing, the result field has the composed jsonb structure */ + PG_RETURN_POINTER(JsonbValueToJsonb(state.result)); } static bool @@ -291,7 +284,7 @@ jsonb_in_object_start(void *pstate) { JsonbInState *_state = (JsonbInState *) pstate; - _state->res = pushJsonbValue(&_state->parseState, WJB_BEGIN_OBJECT, NULL); + pushJsonbValue(_state, WJB_BEGIN_OBJECT, NULL); _state->parseState->unique_keys = _state->unique_keys; return JSON_SUCCESS; @@ -302,7 +295,7 @@ jsonb_in_object_end(void *pstate) { JsonbInState *_state = (JsonbInState *) pstate; - _state->res = pushJsonbValue(&_state->parseState, WJB_END_OBJECT, NULL); + pushJsonbValue(_state, WJB_END_OBJECT, NULL); return JSON_SUCCESS; } @@ -312,7 +305,7 @@ jsonb_in_array_start(void *pstate) { JsonbInState *_state = (JsonbInState *) pstate; - _state->res = pushJsonbValue(&_state->parseState, WJB_BEGIN_ARRAY, NULL); + pushJsonbValue(_state, WJB_BEGIN_ARRAY, NULL); return JSON_SUCCESS; } @@ -322,7 +315,7 @@ jsonb_in_array_end(void *pstate) { JsonbInState *_state = (JsonbInState *) pstate; - _state->res = pushJsonbValue(&_state->parseState, WJB_END_ARRAY, NULL); + pushJsonbValue(_state, WJB_END_ARRAY, NULL); return JSON_SUCCESS; } @@ -340,7 +333,7 @@ jsonb_in_object_field_start(void *pstate, char *fname, bool isnull) return JSON_SEM_ACTION_FAILED; v.val.string.val = fname; - _state->res = pushJsonbValue(&_state->parseState, WJB_KEY, &v); + pushJsonbValue(_state, WJB_KEY, &v); return JSON_SUCCESS; } @@ -434,9 +427,9 @@ jsonb_in_scalar(void *pstate, char *token, JsonTokenType tokentype) va.val.array.rawScalar = true; va.val.array.nElems = 1; - _state->res = pushJsonbValue(&_state->parseState, WJB_BEGIN_ARRAY, &va); - _state->res = pushJsonbValue(&_state->parseState, WJB_ELEM, &v); - _state->res = pushJsonbValue(&_state->parseState, WJB_END_ARRAY, NULL); + pushJsonbValue(_state, WJB_BEGIN_ARRAY, &va); + pushJsonbValue(_state, WJB_ELEM, &v); + pushJsonbValue(_state, WJB_END_ARRAY, NULL); } else { @@ -445,10 +438,10 @@ jsonb_in_scalar(void *pstate, char *token, JsonTokenType tokentype) switch (o->type) { case jbvArray: - _state->res = pushJsonbValue(&_state->parseState, WJB_ELEM, &v); + pushJsonbValue(_state, WJB_ELEM, &v); break; case jbvObject: - _state->res = pushJsonbValue(&_state->parseState, WJB_VALUE, &v); + pushJsonbValue(_state, WJB_VALUE, &v); break; default: elog(ERROR, "unexpected parent of nested structure"); @@ -640,7 +633,8 @@ datum_to_jsonb_internal(Datum val, bool is_null, JsonbInState *result, bool key_scalar) { char *outputstr; - bool numeric_error; + Numeric numeric_val; + bool numeric_to_string; JsonbValue jb; bool scalar_jsonb = false; @@ -665,9 +659,6 @@ datum_to_jsonb_internal(Datum val, bool is_null, JsonbInState *result, } else { - if (tcategory == JSONTYPE_CAST) - val = OidFunctionCall1(outfuncoid, val); - 
switch (tcategory) { case JSONTYPE_ARRAY: @@ -691,41 +682,73 @@ datum_to_jsonb_internal(Datum val, bool is_null, JsonbInState *result, } break; case JSONTYPE_NUMERIC: - outputstr = OidOutputFunctionCall(outfuncoid, val); if (key_scalar) { - /* always quote keys */ + /* always stringify keys */ + numeric_to_string = true; + numeric_val = NULL; /* pacify stupider compilers */ + } + else + { + Datum numd; + + switch (outfuncoid) + { + case F_NUMERIC_OUT: + numeric_val = DatumGetNumeric(val); + break; + case F_INT2OUT: + numeric_val = int64_to_numeric(DatumGetInt16(val)); + break; + case F_INT4OUT: + numeric_val = int64_to_numeric(DatumGetInt32(val)); + break; + case F_INT8OUT: + numeric_val = int64_to_numeric(DatumGetInt64(val)); + break; +#ifdef NOT_USED + + /* + * Ideally we'd short-circuit these two cases + * using float[48]_numeric. However, those + * functions are currently slower than the generic + * coerce-via-I/O approach. And they may round + * off differently. Until/unless that gets fixed, + * continue to use coerce-via-I/O for floats. + */ + case F_FLOAT4OUT: + numd = DirectFunctionCall1(float4_numeric, val); + numeric_val = DatumGetNumeric(numd); + break; + case F_FLOAT8OUT: + numd = DirectFunctionCall1(float8_numeric, val); + numeric_val = DatumGetNumeric(numd); + break; +#endif + default: + outputstr = OidOutputFunctionCall(outfuncoid, val); + numd = DirectFunctionCall3(numeric_in, + CStringGetDatum(outputstr), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1)); + numeric_val = DatumGetNumeric(numd); + break; + } + /* Must convert to string if it's Inf or NaN */ + numeric_to_string = (numeric_is_inf(numeric_val) || + numeric_is_nan(numeric_val)); + } + if (numeric_to_string) + { + outputstr = OidOutputFunctionCall(outfuncoid, val); jb.type = jbvString; jb.val.string.len = strlen(outputstr); jb.val.string.val = outputstr; } else { - /* - * Make it numeric if it's a valid JSON number, otherwise - * a string. Invalid numeric output will always have an - * 'N' or 'n' in it (I think). 
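
/*
 * Aside: what the integer fast path above buys, in schematic form.  The
 * old code always rendered the datum to text and reparsed it; for integer
 * types the new code builds the Numeric directly.  The "42" literal here
 * is purely illustrative.
 */
static Numeric
int4_to_numeric_both_ways(Datum val)
{
	/* old path: int4 -> "42" -> full numeric_in() parse */
	Numeric		slow = DatumGetNumeric(DirectFunctionCall3(numeric_in,
														   CStringGetDatum("42"),
														   ObjectIdGetDatum(InvalidOid),
														   Int32GetDatum(-1)));

	/* new path: construct the Numeric straight from the binary value */
	Numeric		fast = int64_to_numeric(DatumGetInt32(val));

	(void) slow;
	return fast;
}
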
- */ - numeric_error = (strchr(outputstr, 'N') != NULL || - strchr(outputstr, 'n') != NULL); - if (!numeric_error) - { - Datum numd; - - jb.type = jbvNumeric; - numd = DirectFunctionCall3(numeric_in, - CStringGetDatum(outputstr), - ObjectIdGetDatum(InvalidOid), - Int32GetDatum(-1)); - jb.val.numeric = DatumGetNumeric(numd); - pfree(outputstr); - } - else - { - jb.type = jbvString; - jb.val.string.len = strlen(outputstr); - jb.val.string.val = outputstr; - } + jb.type = jbvNumeric; + jb.val.numeric = numeric_val; } break; case JSONTYPE_DATE: @@ -747,6 +770,9 @@ datum_to_jsonb_internal(Datum val, bool is_null, JsonbInState *result, jb.val.string.len = strlen(jb.val.string.val); break; case JSONTYPE_CAST: + /* cast to JSON, and then process as JSON */ + val = OidFunctionCall1(outfuncoid, val); + /* FALL THROUGH */ case JSONTYPE_JSON: { /* parse the json right into the existing result object */ @@ -794,21 +820,32 @@ datum_to_jsonb_internal(Datum val, bool is_null, JsonbInState *result, { if (type == WJB_END_ARRAY || type == WJB_END_OBJECT || type == WJB_BEGIN_ARRAY || type == WJB_BEGIN_OBJECT) - result->res = pushJsonbValue(&result->parseState, - type, NULL); + pushJsonbValue(result, type, NULL); else - result->res = pushJsonbValue(&result->parseState, - type, &jb); + pushJsonbValue(result, type, &jb); } } } break; default: - outputstr = OidOutputFunctionCall(outfuncoid, val); + /* special-case text types to save useless palloc/memcpy ops */ + if (outfuncoid == F_TEXTOUT || + outfuncoid == F_VARCHAROUT || + outfuncoid == F_BPCHAROUT) + { + text *txt = DatumGetTextPP(val); + + jb.val.string.len = VARSIZE_ANY_EXHDR(txt); + jb.val.string.val = VARDATA_ANY(txt); + } + else + { + outputstr = OidOutputFunctionCall(outfuncoid, val); + jb.val.string.len = strlen(outputstr); + jb.val.string.val = outputstr; + } jb.type = jbvString; - jb.val.string.len = strlen(outputstr); (void) checkStringLen(jb.val.string.len, NULL); - jb.val.string.val = outputstr; break; } } @@ -829,9 +866,9 @@ datum_to_jsonb_internal(Datum val, bool is_null, JsonbInState *result, va.val.array.rawScalar = true; va.val.array.nElems = 1; - result->res = pushJsonbValue(&result->parseState, WJB_BEGIN_ARRAY, &va); - result->res = pushJsonbValue(&result->parseState, WJB_ELEM, &jb); - result->res = pushJsonbValue(&result->parseState, WJB_END_ARRAY, NULL); + pushJsonbValue(result, WJB_BEGIN_ARRAY, &va); + pushJsonbValue(result, WJB_ELEM, &jb); + pushJsonbValue(result, WJB_END_ARRAY, NULL); } else { @@ -840,12 +877,12 @@ datum_to_jsonb_internal(Datum val, bool is_null, JsonbInState *result, switch (o->type) { case jbvArray: - result->res = pushJsonbValue(&result->parseState, WJB_ELEM, &jb); + pushJsonbValue(result, WJB_ELEM, &jb); break; case jbvObject: - result->res = pushJsonbValue(&result->parseState, - key_scalar ? WJB_KEY : WJB_VALUE, - &jb); + pushJsonbValue(result, + key_scalar ? 
WJB_KEY : WJB_VALUE, + &jb); break; default: elog(ERROR, "unexpected parent of nested structure"); @@ -867,7 +904,7 @@ array_dim_to_jsonb(JsonbInState *result, int dim, int ndims, int *dims, const Da Assert(dim < ndims); - result->res = pushJsonbValue(&result->parseState, WJB_BEGIN_ARRAY, NULL); + pushJsonbValue(result, WJB_BEGIN_ARRAY, NULL); for (i = 1; i <= dims[dim]; i++) { @@ -884,7 +921,7 @@ array_dim_to_jsonb(JsonbInState *result, int dim, int ndims, int *dims, const Da } } - result->res = pushJsonbValue(&result->parseState, WJB_END_ARRAY, NULL); + pushJsonbValue(result, WJB_END_ARRAY, NULL); } /* @@ -913,8 +950,8 @@ array_to_jsonb_internal(Datum array, JsonbInState *result) if (nitems <= 0) { - result->res = pushJsonbValue(&result->parseState, WJB_BEGIN_ARRAY, NULL); - result->res = pushJsonbValue(&result->parseState, WJB_END_ARRAY, NULL); + pushJsonbValue(result, WJB_BEGIN_ARRAY, NULL); + pushJsonbValue(result, WJB_END_ARRAY, NULL); return; } @@ -961,7 +998,7 @@ composite_to_jsonb(Datum composite, JsonbInState *result) tmptup.t_data = td; tuple = &tmptup; - result->res = pushJsonbValue(&result->parseState, WJB_BEGIN_OBJECT, NULL); + pushJsonbValue(result, WJB_BEGIN_OBJECT, NULL); for (i = 0; i < tupdesc->natts; i++) { @@ -983,7 +1020,7 @@ composite_to_jsonb(Datum composite, JsonbInState *result) v.val.string.len = strlen(attname); v.val.string.val = attname; - result->res = pushJsonbValue(&result->parseState, WJB_KEY, &v); + pushJsonbValue(result, WJB_KEY, &v); val = heap_getattr(tuple, i + 1, tupdesc, &isnull); @@ -1000,7 +1037,7 @@ composite_to_jsonb(Datum composite, JsonbInState *result) false); } - result->res = pushJsonbValue(&result->parseState, WJB_END_OBJECT, NULL); + pushJsonbValue(result, WJB_END_OBJECT, NULL); ReleaseTupleDesc(tupdesc); } @@ -1118,7 +1155,7 @@ datum_to_jsonb(Datum val, JsonTypeCategory tcategory, Oid outfuncoid) datum_to_jsonb_internal(val, false, &result, tcategory, outfuncoid, false); - return JsonbPGetDatum(JsonbValueToJsonb(result.res)); + return JsonbPGetDatum(JsonbValueToJsonb(result.result)); } Datum @@ -1138,7 +1175,7 @@ jsonb_build_object_worker(int nargs, const Datum *args, const bool *nulls, const memset(&result, 0, sizeof(JsonbInState)); - result.res = pushJsonbValue(&result.parseState, WJB_BEGIN_OBJECT, NULL); + pushJsonbValue(&result, WJB_BEGIN_OBJECT, NULL); result.parseState->unique_keys = unique_keys; result.parseState->skip_nulls = absent_on_null; @@ -1165,9 +1202,9 @@ jsonb_build_object_worker(int nargs, const Datum *args, const bool *nulls, const add_jsonb(args[i + 1], nulls[i + 1], &result, types[i + 1], false); } - result.res = pushJsonbValue(&result.parseState, WJB_END_OBJECT, NULL); + pushJsonbValue(&result, WJB_END_OBJECT, NULL); - return JsonbPGetDatum(JsonbValueToJsonb(result.res)); + return JsonbPGetDatum(JsonbValueToJsonb(result.result)); } /* @@ -1200,10 +1237,10 @@ jsonb_build_object_noargs(PG_FUNCTION_ARGS) memset(&result, 0, sizeof(JsonbInState)); - (void) pushJsonbValue(&result.parseState, WJB_BEGIN_OBJECT, NULL); - result.res = pushJsonbValue(&result.parseState, WJB_END_OBJECT, NULL); + pushJsonbValue(&result, WJB_BEGIN_OBJECT, NULL); + pushJsonbValue(&result, WJB_END_OBJECT, NULL); - PG_RETURN_POINTER(JsonbValueToJsonb(result.res)); + PG_RETURN_POINTER(JsonbValueToJsonb(result.result)); } Datum @@ -1215,7 +1252,7 @@ jsonb_build_array_worker(int nargs, const Datum *args, const bool *nulls, const memset(&result, 0, sizeof(JsonbInState)); - result.res = pushJsonbValue(&result.parseState, WJB_BEGIN_ARRAY, NULL); + 
pushJsonbValue(&result, WJB_BEGIN_ARRAY, NULL); for (i = 0; i < nargs; i++) { @@ -1225,9 +1262,9 @@ jsonb_build_array_worker(int nargs, const Datum *args, const bool *nulls, const add_jsonb(args[i], nulls[i], &result, types[i], false); } - result.res = pushJsonbValue(&result.parseState, WJB_END_ARRAY, NULL); + pushJsonbValue(&result, WJB_END_ARRAY, NULL); - return JsonbPGetDatum(JsonbValueToJsonb(result.res)); + return JsonbPGetDatum(JsonbValueToJsonb(result.result)); } /* @@ -1261,10 +1298,10 @@ jsonb_build_array_noargs(PG_FUNCTION_ARGS) memset(&result, 0, sizeof(JsonbInState)); - (void) pushJsonbValue(&result.parseState, WJB_BEGIN_ARRAY, NULL); - result.res = pushJsonbValue(&result.parseState, WJB_END_ARRAY, NULL); + pushJsonbValue(&result, WJB_BEGIN_ARRAY, NULL); + pushJsonbValue(&result, WJB_END_ARRAY, NULL); - PG_RETURN_POINTER(JsonbValueToJsonb(result.res)); + PG_RETURN_POINTER(JsonbValueToJsonb(result.result)); } @@ -1289,7 +1326,7 @@ jsonb_object(PG_FUNCTION_ARGS) memset(&result, 0, sizeof(JsonbInState)); - (void) pushJsonbValue(&result.parseState, WJB_BEGIN_OBJECT, NULL); + pushJsonbValue(&result, WJB_BEGIN_OBJECT, NULL); switch (ndims) { @@ -1340,7 +1377,7 @@ jsonb_object(PG_FUNCTION_ARGS) v.val.string.len = len; v.val.string.val = str; - (void) pushJsonbValue(&result.parseState, WJB_KEY, &v); + pushJsonbValue(&result, WJB_KEY, &v); if (in_nulls[i * 2 + 1]) { @@ -1357,16 +1394,16 @@ jsonb_object(PG_FUNCTION_ARGS) v.val.string.val = str; } - (void) pushJsonbValue(&result.parseState, WJB_VALUE, &v); + pushJsonbValue(&result, WJB_VALUE, &v); } pfree(in_datums); pfree(in_nulls); close_object: - result.res = pushJsonbValue(&result.parseState, WJB_END_OBJECT, NULL); + pushJsonbValue(&result, WJB_END_OBJECT, NULL); - PG_RETURN_POINTER(JsonbValueToJsonb(result.res)); + PG_RETURN_POINTER(JsonbValueToJsonb(result.result)); } /* @@ -1393,7 +1430,7 @@ jsonb_object_two_arg(PG_FUNCTION_ARGS) memset(&result, 0, sizeof(JsonbInState)); - (void) pushJsonbValue(&result.parseState, WJB_BEGIN_OBJECT, NULL); + pushJsonbValue(&result, WJB_BEGIN_OBJECT, NULL); if (nkdims > 1 || nkdims != nvdims) ereport(ERROR, @@ -1430,7 +1467,7 @@ jsonb_object_two_arg(PG_FUNCTION_ARGS) v.val.string.len = len; v.val.string.val = str; - (void) pushJsonbValue(&result.parseState, WJB_KEY, &v); + pushJsonbValue(&result, WJB_KEY, &v); if (val_nulls[i]) { @@ -1447,7 +1484,7 @@ jsonb_object_two_arg(PG_FUNCTION_ARGS) v.val.string.val = str; } - (void) pushJsonbValue(&result.parseState, WJB_VALUE, &v); + pushJsonbValue(&result, WJB_VALUE, &v); } pfree(key_datums); @@ -1456,61 +1493,23 @@ jsonb_object_two_arg(PG_FUNCTION_ARGS) pfree(val_nulls); close_object: - result.res = pushJsonbValue(&result.parseState, WJB_END_OBJECT, NULL); + pushJsonbValue(&result, WJB_END_OBJECT, NULL); - PG_RETURN_POINTER(JsonbValueToJsonb(result.res)); + PG_RETURN_POINTER(JsonbValueToJsonb(result.result)); } /* - * shallow clone of a parse state, suitable for use in aggregate - * final functions that will only append to the values rather than - * change them. 
+ * Functions for jsonb_agg, jsonb_object_agg, and variants */ -static JsonbParseState * -clone_parse_state(JsonbParseState *state) -{ - JsonbParseState *result, - *icursor, - *ocursor; - - if (state == NULL) - return NULL; - - result = palloc(sizeof(JsonbParseState)); - icursor = state; - ocursor = result; - for (;;) - { - ocursor->contVal = icursor->contVal; - ocursor->size = icursor->size; - ocursor->unique_keys = icursor->unique_keys; - ocursor->skip_nulls = icursor->skip_nulls; - icursor = icursor->next; - if (icursor == NULL) - break; - ocursor->next = palloc(sizeof(JsonbParseState)); - ocursor = ocursor->next; - } - ocursor->next = NULL; - - return result; -} static Datum jsonb_agg_transfn_worker(FunctionCallInfo fcinfo, bool absent_on_null) { - MemoryContext oldcontext, - aggcontext; + MemoryContext aggcontext; JsonbAggState *state; - JsonbInState elem; Datum val; JsonbInState *result; - bool single_scalar = false; - JsonbIterator *it; - Jsonb *jbelem; - JsonbValue v; - JsonbIteratorToken type; if (!AggCheckCallContext(fcinfo, &aggcontext)) { @@ -1529,13 +1528,10 @@ jsonb_agg_transfn_worker(FunctionCallInfo fcinfo, bool absent_on_null) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("could not determine input data type"))); - oldcontext = MemoryContextSwitchTo(aggcontext); - state = palloc(sizeof(JsonbAggState)); - result = palloc0(sizeof(JsonbInState)); - state->res = result; - result->res = pushJsonbValue(&result->parseState, - WJB_BEGIN_ARRAY, NULL); - MemoryContextSwitchTo(oldcontext); + state = MemoryContextAllocZero(aggcontext, sizeof(JsonbAggState)); + result = &state->pstate; + result->outcontext = aggcontext; + pushJsonbValue(result, WJB_BEGIN_ARRAY, NULL); json_categorize_type(arg_type, true, &state->val_category, &state->val_output_func); @@ -1543,78 +1539,23 @@ jsonb_agg_transfn_worker(FunctionCallInfo fcinfo, bool absent_on_null) else { state = (JsonbAggState *) PG_GETARG_POINTER(0); - result = state->res; + result = &state->pstate; } if (absent_on_null && PG_ARGISNULL(1)) PG_RETURN_POINTER(state); - /* turn the argument into jsonb in the normal function context */ - + /* + * We run this code in the normal function context, so that we don't leak + * any cruft from datatype output functions and such into the aggcontext. + * But the "result" JsonbValue will be constructed in aggcontext, so that + * it remains available across calls. + */ val = PG_ARGISNULL(1) ? 
(Datum) 0 : PG_GETARG_DATUM(1); - memset(&elem, 0, sizeof(JsonbInState)); - - datum_to_jsonb_internal(val, PG_ARGISNULL(1), &elem, state->val_category, + datum_to_jsonb_internal(val, PG_ARGISNULL(1), result, state->val_category, state->val_output_func, false); - jbelem = JsonbValueToJsonb(elem.res); - - /* switch to the aggregate context for accumulation operations */ - - oldcontext = MemoryContextSwitchTo(aggcontext); - - it = JsonbIteratorInit(&jbelem->root); - - while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) - { - switch (type) - { - case WJB_BEGIN_ARRAY: - if (v.val.array.rawScalar) - single_scalar = true; - else - result->res = pushJsonbValue(&result->parseState, - type, NULL); - break; - case WJB_END_ARRAY: - if (!single_scalar) - result->res = pushJsonbValue(&result->parseState, - type, NULL); - break; - case WJB_BEGIN_OBJECT: - case WJB_END_OBJECT: - result->res = pushJsonbValue(&result->parseState, - type, NULL); - break; - case WJB_ELEM: - case WJB_KEY: - case WJB_VALUE: - if (v.type == jbvString) - { - /* copy string values in the aggregate context */ - char *buf = palloc(v.val.string.len + 1); - - snprintf(buf, v.val.string.len + 1, "%s", v.val.string.val); - v.val.string.val = buf; - } - else if (v.type == jbvNumeric) - { - /* same for numeric */ - v.val.numeric = - DatumGetNumeric(DirectFunctionCall1(numeric_uplus, - NumericGetDatum(v.val.numeric))); - } - result->res = pushJsonbValue(&result->parseState, - type, &v); - break; - default: - elog(ERROR, "unknown jsonb iterator token type"); - } - } - - MemoryContextSwitchTo(oldcontext); - PG_RETURN_POINTER(state); } @@ -1652,19 +1593,19 @@ jsonb_agg_finalfn(PG_FUNCTION_ARGS) arg = (JsonbAggState *) PG_GETARG_POINTER(0); /* - * We need to do a shallow clone of the argument in case the final - * function is called more than once, so we avoid changing the argument. A - * shallow clone is sufficient as we aren't going to change any of the - * values, just add the final array end marker. + * The final function can be called more than once, so we must not change + * the stored JsonbValue data structure. Fortunately, the WJB_END_ARRAY + * action will only change fields in the JsonbInState struct itself, so we + * can simply invoke pushJsonbValue on a local copy of that. 
*/ - memset(&result, 0, sizeof(JsonbInState)); + result = arg->pstate; - result.parseState = clone_parse_state(arg->res->parseState); + pushJsonbValue(&result, WJB_END_ARRAY, NULL); - result.res = pushJsonbValue(&result.parseState, - WJB_END_ARRAY, NULL); + /* We expect result.parseState == NULL after closing the array */ + Assert(result.parseState == NULL); - out = JsonbValueToJsonb(result.res); + out = JsonbValueToJsonb(result.result); PG_RETURN_POINTER(out); } @@ -1673,18 +1614,10 @@ static Datum jsonb_object_agg_transfn_worker(FunctionCallInfo fcinfo, bool absent_on_null, bool unique_keys) { - MemoryContext oldcontext, - aggcontext; - JsonbInState elem; + MemoryContext aggcontext; JsonbAggState *state; Datum val; JsonbInState *result; - bool single_scalar; - JsonbIterator *it; - Jsonb *jbkey, - *jbval; - JsonbValue v; - JsonbIteratorToken type; bool skip; if (!AggCheckCallContext(fcinfo, &aggcontext)) @@ -1699,17 +1632,13 @@ jsonb_object_agg_transfn_worker(FunctionCallInfo fcinfo, { Oid arg_type; - oldcontext = MemoryContextSwitchTo(aggcontext); - state = palloc(sizeof(JsonbAggState)); - result = palloc0(sizeof(JsonbInState)); - state->res = result; - result->res = pushJsonbValue(&result->parseState, - WJB_BEGIN_OBJECT, NULL); + state = MemoryContextAllocZero(aggcontext, sizeof(JsonbAggState)); + result = &state->pstate; + result->outcontext = aggcontext; + pushJsonbValue(result, WJB_BEGIN_OBJECT, NULL); result->parseState->unique_keys = unique_keys; result->parseState->skip_nulls = absent_on_null; - MemoryContextSwitchTo(oldcontext); - arg_type = get_fn_expr_argtype(fcinfo->flinfo, 1); if (arg_type == InvalidOid) @@ -1733,11 +1662,9 @@ jsonb_object_agg_transfn_worker(FunctionCallInfo fcinfo, else { state = (JsonbAggState *) PG_GETARG_POINTER(0); - result = state->res; + result = &state->pstate; } - /* turn the argument into jsonb in the normal function context */ - if (PG_ARGISNULL(1)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -1752,140 +1679,22 @@ jsonb_object_agg_transfn_worker(FunctionCallInfo fcinfo, if (skip && !unique_keys) PG_RETURN_POINTER(state); + /* + * We run this code in the normal function context, so that we don't leak + * any cruft from datatype output functions and such into the aggcontext. + * But the "result" JsonbValue will be constructed in aggcontext, so that + * it remains available across calls. + */ val = PG_GETARG_DATUM(1); - memset(&elem, 0, sizeof(JsonbInState)); - - datum_to_jsonb_internal(val, false, &elem, state->key_category, + datum_to_jsonb_internal(val, false, result, state->key_category, state->key_output_func, true); - jbkey = JsonbValueToJsonb(elem.res); - val = PG_ARGISNULL(2) ? (Datum) 0 : PG_GETARG_DATUM(2); - memset(&elem, 0, sizeof(JsonbInState)); - - datum_to_jsonb_internal(val, PG_ARGISNULL(2), &elem, state->val_category, + datum_to_jsonb_internal(val, PG_ARGISNULL(2), result, state->val_category, state->val_output_func, false); - jbval = JsonbValueToJsonb(elem.res); - - it = JsonbIteratorInit(&jbkey->root); - - /* switch to the aggregate context for accumulation operations */ - - oldcontext = MemoryContextSwitchTo(aggcontext); - - /* - * keys should be scalar, and we should have already checked for that - * above when calling datum_to_jsonb, so we only need to look for these - * things. 
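
/*
 * Aside: the memory-context discipline the rewritten transition functions
 * rely on, reduced to a schematic.  "MyAggState" is hypothetical.  The
 * state is allocated once in the long-lived aggregate context; per-row
 * conversion work runs in the ordinary per-call context, so its garbage
 * is reclaimed automatically instead of accumulating in aggcontext.
 */
typedef struct MyAggState
{
	int64		count;
} MyAggState;

Datum
my_agg_transfn(PG_FUNCTION_ARGS)
{
	MemoryContext aggcontext;
	MyAggState *state;

	if (!AggCheckCallContext(fcinfo, &aggcontext))
		elog(ERROR, "my_agg_transfn called in non-aggregate context");

	if (PG_ARGISNULL(0))
		state = MemoryContextAllocZero(aggcontext, sizeof(MyAggState));
	else
		state = (MyAggState *) PG_GETARG_POINTER(0);

	state->count++;				/* per-row work stays in the per-call context */

	PG_RETURN_POINTER(state);
}
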
- */ - - while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) - { - switch (type) - { - case WJB_BEGIN_ARRAY: - if (!v.val.array.rawScalar) - elog(ERROR, "unexpected structure for key"); - break; - case WJB_ELEM: - if (v.type == jbvString) - { - /* copy string values in the aggregate context */ - char *buf = palloc(v.val.string.len + 1); - - snprintf(buf, v.val.string.len + 1, "%s", v.val.string.val); - v.val.string.val = buf; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("object keys must be strings"))); - } - result->res = pushJsonbValue(&result->parseState, - WJB_KEY, &v); - - if (skip) - { - v.type = jbvNull; - result->res = pushJsonbValue(&result->parseState, - WJB_VALUE, &v); - MemoryContextSwitchTo(oldcontext); - PG_RETURN_POINTER(state); - } - - break; - case WJB_END_ARRAY: - break; - default: - elog(ERROR, "unexpected structure for key"); - break; - } - } - - it = JsonbIteratorInit(&jbval->root); - - single_scalar = false; - - /* - * values can be anything, including structured and null, so we treat them - * as in json_agg_transfn, except that single scalars are always pushed as - * WJB_VALUE items. - */ - - while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) - { - switch (type) - { - case WJB_BEGIN_ARRAY: - if (v.val.array.rawScalar) - single_scalar = true; - else - result->res = pushJsonbValue(&result->parseState, - type, NULL); - break; - case WJB_END_ARRAY: - if (!single_scalar) - result->res = pushJsonbValue(&result->parseState, - type, NULL); - break; - case WJB_BEGIN_OBJECT: - case WJB_END_OBJECT: - result->res = pushJsonbValue(&result->parseState, - type, NULL); - break; - case WJB_ELEM: - case WJB_KEY: - case WJB_VALUE: - if (v.type == jbvString) - { - /* copy string values in the aggregate context */ - char *buf = palloc(v.val.string.len + 1); - - snprintf(buf, v.val.string.len + 1, "%s", v.val.string.val); - v.val.string.val = buf; - } - else if (v.type == jbvNumeric) - { - /* same for numeric */ - v.val.numeric = - DatumGetNumeric(DirectFunctionCall1(numeric_uplus, - NumericGetDatum(v.val.numeric))); - } - result->res = pushJsonbValue(&result->parseState, - single_scalar ? WJB_VALUE : type, - &v); - break; - default: - elog(ERROR, "unknown jsonb iterator token type"); - } - } - - MemoryContextSwitchTo(oldcontext); - PG_RETURN_POINTER(state); } @@ -1942,20 +1751,24 @@ jsonb_object_agg_finalfn(PG_FUNCTION_ARGS) arg = (JsonbAggState *) PG_GETARG_POINTER(0); /* - * We need to do a shallow clone of the argument's res field in case the - * final function is called more than once, so we avoid changing the - * aggregate state value. A shallow clone is sufficient as we aren't - * going to change any of the values, just add the final object end - * marker. + * The final function can be called more than once, so we must not change + * the stored JsonbValue data structure. Fortunately, the WJB_END_OBJECT + * action will only destructively change fields in the JsonbInState struct + * itself, so we can simply invoke pushJsonbValue on a local copy of that. + * Note that this will run uniqueifyJsonbObject each time; that's hard to + * avoid, since duplicate pairs may have been added since the previous + * finalization. We assume uniqueifyJsonbObject can be applied repeatedly + * (with the same unique_keys/skip_nulls options) without damaging the + * data structure. 
*/ - memset(&result, 0, sizeof(JsonbInState)); + result = arg->pstate; - result.parseState = clone_parse_state(arg->res->parseState); + pushJsonbValue(&result, WJB_END_OBJECT, NULL); - result.res = pushJsonbValue(&result.parseState, - WJB_END_OBJECT, NULL); + /* We expect result.parseState == NULL after closing the object */ + Assert(result.parseState == NULL); - out = JsonbValueToJsonb(result.res); + out = JsonbValueToJsonb(result.result); PG_RETURN_POINTER(out); } diff --git a/src/backend/utils/adt/jsonb_gin.c b/src/backend/utils/adt/jsonb_gin.c index c1950792b5aea..a1daa3f5034ae 100644 --- a/src/backend/utils/adt/jsonb_gin.c +++ b/src/backend/utils/adt/jsonb_gin.c @@ -163,7 +163,7 @@ static void init_gin_entries(GinEntries *entries, int preallocated) { entries->allocated = preallocated; - entries->buf = preallocated ? palloc(sizeof(Datum) * preallocated) : NULL; + entries->buf = preallocated ? palloc_array(Datum, preallocated) : NULL; entries->count = 0; } @@ -178,13 +178,14 @@ add_gin_entry(GinEntries *entries, Datum entry) if (entries->allocated) { entries->allocated *= 2; - entries->buf = repalloc(entries->buf, - sizeof(Datum) * entries->allocated); + entries->buf = repalloc_array(entries->buf, + Datum, + entries->allocated); } else { entries->allocated = 8; - entries->buf = palloc(sizeof(Datum) * entries->allocated); + entries->buf = palloc_array(Datum, entries->allocated); } } @@ -307,7 +308,7 @@ jsonb_ops__add_path_item(JsonPathGinPath *path, JsonPathItem *jsp) return false; } - pentry = palloc(sizeof(*pentry)); + pentry = palloc_object(JsonPathGinPathItem); pentry->type = jsp->type; pentry->keyName = keyName; @@ -785,7 +786,7 @@ extract_jsp_query(JsonPath *jp, StrategyNumber strat, bool pathOps, if (!*nentries) return NULL; - *extra_data = palloc0(sizeof(**extra_data) * entries.count); + *extra_data = palloc0_array(Pointer, entries.count); **extra_data = (Pointer) node; return entries.buf; @@ -869,7 +870,7 @@ gin_extract_jsonb_query(PG_FUNCTION_ARGS) text *query = PG_GETARG_TEXT_PP(0); *nentries = 1; - entries = (Datum *) palloc(sizeof(Datum)); + entries = palloc_object(Datum); entries[0] = make_text_key(JGINFLAG_KEY, VARDATA_ANY(query), VARSIZE_ANY_EXHDR(query)); @@ -887,7 +888,7 @@ gin_extract_jsonb_query(PG_FUNCTION_ARGS) deconstruct_array_builtin(query, TEXTOID, &key_datums, &key_nulls, &key_count); - entries = (Datum *) palloc(sizeof(Datum) * key_count); + entries = palloc_array(Datum, key_count); for (i = 0, j = 0; i < key_count; i++) { @@ -896,8 +897,8 @@ gin_extract_jsonb_query(PG_FUNCTION_ARGS) continue; /* We rely on the array elements not being toasted */ entries[j++] = make_text_key(JGINFLAG_KEY, - VARDATA_ANY(key_datums[i]), - VARSIZE_ANY_EXHDR(key_datums[i])); + VARDATA_ANY(DatumGetPointer(key_datums[i])), + VARSIZE_ANY_EXHDR(DatumGetPointer(key_datums[i]))); } *nentries = j; @@ -999,8 +1000,7 @@ gin_consistent_jsonb(PG_FUNCTION_ARGS) if (nkeys > 0) { Assert(extra_data && extra_data[0]); - res = execute_jsp_gin_node((JsonPathGinNode *) extra_data[0], check, - false) != GIN_FALSE; + res = execute_jsp_gin_node(extra_data[0], check, false) != GIN_FALSE; } } else @@ -1060,8 +1060,7 @@ gin_triconsistent_jsonb(PG_FUNCTION_ARGS) if (nkeys > 0) { Assert(extra_data && extra_data[0]); - res = execute_jsp_gin_node((JsonPathGinNode *) extra_data[0], check, - true); + res = execute_jsp_gin_node(extra_data[0], check, true); /* Should always recheck the result */ if (res == GIN_TRUE) @@ -1126,7 +1125,7 @@ gin_extract_jsonb_path(PG_FUNCTION_ARGS) case WJB_BEGIN_OBJECT: /* Push a 
stack level for this object */ parent = stack; - stack = (PathHashStack *) palloc(sizeof(PathHashStack)); + stack = palloc_object(PathHashStack); /* * We pass forward hashes from outer nesting levels so that @@ -1258,8 +1257,7 @@ gin_consistent_jsonb_path(PG_FUNCTION_ARGS) if (nkeys > 0) { Assert(extra_data && extra_data[0]); - res = execute_jsp_gin_node((JsonPathGinNode *) extra_data[0], check, - false) != GIN_FALSE; + res = execute_jsp_gin_node(extra_data[0], check, false) != GIN_FALSE; } } else @@ -1302,8 +1300,7 @@ gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS) if (nkeys > 0) { Assert(extra_data && extra_data[0]); - res = execute_jsp_gin_node((JsonPathGinNode *) extra_data[0], check, - true); + res = execute_jsp_gin_node(extra_data[0], check, true); /* Should always recheck the result */ if (res == GIN_TRUE) diff --git a/src/backend/utils/adt/jsonb_op.c b/src/backend/utils/adt/jsonb_op.c index fa5603f26e1d6..51d38e321fb2f 100644 --- a/src/backend/utils/adt/jsonb_op.c +++ b/src/backend/utils/adt/jsonb_op.c @@ -63,8 +63,8 @@ jsonb_exists_any(PG_FUNCTION_ARGS) strVal.type = jbvString; /* We rely on the array elements not being toasted */ - strVal.val.string.val = VARDATA_ANY(key_datums[i]); - strVal.val.string.len = VARSIZE_ANY_EXHDR(key_datums[i]); + strVal.val.string.val = VARDATA_ANY(DatumGetPointer(key_datums[i])); + strVal.val.string.len = VARSIZE_ANY_EXHDR(DatumGetPointer(key_datums[i])); if (findJsonbValueFromContainer(&jb->root, JB_FOBJECT | JB_FARRAY, @@ -96,8 +96,8 @@ jsonb_exists_all(PG_FUNCTION_ARGS) strVal.type = jbvString; /* We rely on the array elements not being toasted */ - strVal.val.string.val = VARDATA_ANY(key_datums[i]); - strVal.val.string.len = VARSIZE_ANY_EXHDR(key_datums[i]); + strVal.val.string.val = VARDATA_ANY(DatumGetPointer(key_datums[i])); + strVal.val.string.len = VARSIZE_ANY_EXHDR(DatumGetPointer(key_datums[i])); if (findJsonbValueFromContainer(&jb->root, JB_FOBJECT | JB_FARRAY, diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index c8b6c15e05975..5911b7c4d5705 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -14,10 +14,13 @@ #include "postgres.h" #include "catalog/pg_collation.h" +#include "catalog/pg_type.h" #include "common/hashfn.h" #include "miscadmin.h" #include "port/pg_bitutils.h" +#include "utils/date.h" #include "utils/datetime.h" +#include "utils/datum.h" #include "utils/fmgrprotos.h" #include "utils/json.h" #include "utils/jsonb.h" @@ -54,19 +57,20 @@ static short padBufferToInt(StringInfo buffer); static JsonbIterator *iteratorFromContainer(JsonbContainer *container, JsonbIterator *parent); static JsonbIterator *freeAndGetParent(JsonbIterator *it); -static JsonbParseState *pushState(JsonbParseState **pstate); -static void appendKey(JsonbParseState *pstate, JsonbValue *string); -static void appendValue(JsonbParseState *pstate, JsonbValue *scalarVal); -static void appendElement(JsonbParseState *pstate, JsonbValue *scalarVal); +static JsonbParseState *pushState(JsonbInState *pstate); +static void appendKey(JsonbInState *pstate, JsonbValue *string, bool needCopy); +static void appendValue(JsonbInState *pstate, JsonbValue *scalarVal, bool needCopy); +static void appendElement(JsonbInState *pstate, JsonbValue *scalarVal, bool needCopy); +static void copyScalarSubstructure(JsonbValue *v, MemoryContext outcontext); static int lengthCompareJsonbStringValue(const void *a, const void *b); static int lengthCompareJsonbString(const char *val1, int len1, const char *val2, int 
len2); static int lengthCompareJsonbPair(const void *a, const void *b, void *binequal); static void uniqueifyJsonbObject(JsonbValue *object, bool unique_keys, bool skip_nulls); -static JsonbValue *pushJsonbValueScalar(JsonbParseState **pstate, - JsonbIteratorToken seq, - JsonbValue *scalarVal); +static void pushJsonbValueScalar(JsonbInState *pstate, + JsonbIteratorToken seq, + JsonbValue *scalarVal); void JsonbToJsonbValue(Jsonb *jsonb, JsonbValue *val) @@ -95,9 +99,8 @@ JsonbValueToJsonb(JsonbValue *val) if (IsAJsonbScalar(val)) { - /* Scalar value */ - JsonbParseState *pstate = NULL; - JsonbValue *res; + /* Scalar value, so wrap it in an array */ + JsonbInState pstate = {0}; JsonbValue scalarArray; scalarArray.type = jbvArray; @@ -106,9 +109,9 @@ JsonbValueToJsonb(JsonbValue *val) pushJsonbValue(&pstate, WJB_BEGIN_ARRAY, &scalarArray); pushJsonbValue(&pstate, WJB_ELEM, val); - res = pushJsonbValue(&pstate, WJB_END_ARRAY, NULL); + pushJsonbValue(&pstate, WJB_END_ARRAY, NULL); - out = convertToJsonb(res); + out = convertToJsonb(pstate.result); } else if (val->type == jbvObject || val->type == jbvArray) { @@ -277,22 +280,16 @@ compareJsonbContainers(JsonbContainer *a, JsonbContainer *b) else { /* - * It's safe to assume that the types differed, and that the va - * and vb values passed were set. - * - * If the two values were of the same container type, then there'd - * have been a chance to observe the variation in the number of - * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're - * either two heterogeneously-typed containers, or a container and - * some scalar type. - * - * We don't have to consider the WJB_END_ARRAY and WJB_END_OBJECT - * cases here, because we would have seen the corresponding - * WJB_BEGIN_ARRAY and WJB_BEGIN_OBJECT tokens first, and - * concluded that they don't match. + * It's not possible for one iterator to report end of array or + * object while the other one reports something else, because we + * would have detected a length mismatch when we processed the + * container-start tokens above. Likewise we can't see WJB_DONE + * from one but not the other. So we have two different-type + * containers, or a container and some scalar type, or two + * different scalar types. Sort on the basis of the type code. 
*/ - Assert(ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); - Assert(rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); + Assert(ra != WJB_DONE && ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); + Assert(rb != WJB_DONE && rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); Assert(va.type != vb.type); Assert(va.type != jbvBinary); @@ -362,7 +359,7 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags, if ((flags & JB_FARRAY) && JsonContainerIsArray(container)) { - JsonbValue *result = palloc(sizeof(JsonbValue)); + JsonbValue *result = palloc_object(JsonbValue); char *base_addr = (char *) (children + count); uint32 offset = 0; int i; @@ -445,7 +442,7 @@ getKeyJsonValueFromContainer(JsonbContainer *container, int index = stopMiddle + count; if (!res) - res = palloc(sizeof(JsonbValue)); + res = palloc_object(JsonbValue); fillJsonbValue(container, index, baseAddr, getJsonbOffset(container, index), @@ -487,7 +484,7 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i) if (i >= nelements) return NULL; - result = palloc(sizeof(JsonbValue)); + result = palloc_object(JsonbValue); fillJsonbValue(container, i, base_addr, getJsonbOffset(container, i), @@ -553,13 +550,23 @@ fillJsonbValue(JsonbContainer *container, int index, } /* - * Push JsonbValue into JsonbParseState. + * Push JsonbValue into JsonbInState. * - * Used when parsing JSON tokens to form Jsonb, or when converting an in-memory - * JsonbValue to a Jsonb. + * Used, for example, when parsing JSON input. * - * Initial state of *JsonbParseState is NULL, since it'll be allocated here - * originally (caller will get JsonbParseState back by reference). + * *pstate is typically initialized to all-zeroes, except that the caller + * may provide outcontext and/or escontext. (escontext is ignored by this + * function and its subroutines, however.) + * + * "seq" tells what is being pushed (start/end of array or object, key, + * value, etc). WJB_DONE is not used here, but the other values of + * JsonbIteratorToken are. We assume the caller passes a valid sequence + * of values. + * + * The passed "jbval" is typically transient storage, such as a local variable. + * We will copy it into the outcontext (CurrentMemoryContext by default). + * If outcontext isn't NULL, we will also make copies of any pass-by-reference + * scalar values. * * Only sequential tokens pertaining to non-container types should pass a * JsonbValue. There is one exception -- WJB_BEGIN_ARRAY callers may pass a @@ -568,18 +575,32 @@ fillJsonbValue(JsonbContainer *container, int index, * * Values of type jbvBinary, which are rolled up arrays and objects, * are unpacked before being added to the result. + * + * At the end of construction of a JsonbValue, pstate->result will reference + * the top-level JsonbValue object. */ -JsonbValue * -pushJsonbValue(JsonbParseState **pstate, JsonbIteratorToken seq, +void +pushJsonbValue(JsonbInState *pstate, JsonbIteratorToken seq, JsonbValue *jbval) { JsonbIterator *it; - JsonbValue *res = NULL; JsonbValue v; JsonbIteratorToken tok; int i; - if (jbval && (seq == WJB_ELEM || seq == WJB_VALUE) && jbval->type == jbvObject) + /* + * pushJsonbValueScalar handles all cases not involving pushing a + * container object as an ELEM or VALUE. 
+ */ + if (!jbval || IsAJsonbScalar(jbval) || + (seq != WJB_ELEM && seq != WJB_VALUE)) + { + pushJsonbValueScalar(pstate, seq, jbval); + return; + } + + /* If an object or array is pushed, recursively push its contents */ + if (jbval->type == jbvObject) { pushJsonbValue(pstate, WJB_BEGIN_OBJECT, NULL); for (i = 0; i < jbval->val.object.nPairs; i++) @@ -587,32 +608,29 @@ pushJsonbValue(JsonbParseState **pstate, JsonbIteratorToken seq, pushJsonbValue(pstate, WJB_KEY, &jbval->val.object.pairs[i].key); pushJsonbValue(pstate, WJB_VALUE, &jbval->val.object.pairs[i].value); } - - return pushJsonbValue(pstate, WJB_END_OBJECT, NULL); + pushJsonbValue(pstate, WJB_END_OBJECT, NULL); + return; } - if (jbval && (seq == WJB_ELEM || seq == WJB_VALUE) && jbval->type == jbvArray) + if (jbval->type == jbvArray) { pushJsonbValue(pstate, WJB_BEGIN_ARRAY, NULL); for (i = 0; i < jbval->val.array.nElems; i++) { pushJsonbValue(pstate, WJB_ELEM, &jbval->val.array.elems[i]); } - - return pushJsonbValue(pstate, WJB_END_ARRAY, NULL); + pushJsonbValue(pstate, WJB_END_ARRAY, NULL); + return; } - if (!jbval || (seq != WJB_ELEM && seq != WJB_VALUE) || - jbval->type != jbvBinary) - { - /* drop through */ - return pushJsonbValueScalar(pstate, seq, jbval); - } + /* Else it must be a jbvBinary value; push its contents */ + Assert(jbval->type == jbvBinary); - /* unpack the binary and add each piece to the pstate */ it = JsonbIteratorInit(jbval->val.binary.data); - if ((jbval->val.binary.data->header & JB_FSCALAR) && *pstate) + /* ... with a special case for pushing a raw scalar */ + if ((jbval->val.binary.data->header & JB_FSCALAR) && + pstate->parseState != NULL) { tok = JsonbIteratorNext(&it, &v, true); Assert(tok == WJB_BEGIN_ARRAY); @@ -621,197 +639,290 @@ pushJsonbValue(JsonbParseState **pstate, JsonbIteratorToken seq, tok = JsonbIteratorNext(&it, &v, true); Assert(tok == WJB_ELEM); - res = pushJsonbValueScalar(pstate, seq, &v); + pushJsonbValueScalar(pstate, seq, &v); tok = JsonbIteratorNext(&it, &v, true); Assert(tok == WJB_END_ARRAY); Assert(it == NULL); - return res; + return; } while ((tok = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) - res = pushJsonbValueScalar(pstate, tok, - tok < WJB_BEGIN_ARRAY || - (tok == WJB_BEGIN_ARRAY && - v.val.array.rawScalar) ? &v : NULL); - - return res; + pushJsonbValueScalar(pstate, tok, + tok < WJB_BEGIN_ARRAY || + (tok == WJB_BEGIN_ARRAY && + v.val.array.rawScalar) ? &v : NULL); } /* * Do the actual pushing, with only scalar or pseudo-scalar-array values * accepted. 
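
/*
 * Aside: a usage sketch for the reworked push API, assuming the
 * signatures introduced by this patch.  A zero-initialized JsonbInState
 * is a valid starting point, and the finished tree is left in
 * state.result; this builds {"a": 1}.
 */
static Jsonb *
build_sample_jsonb(void)
{
	JsonbInState state = {0};
	JsonbValue	k,
				v;

	pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);

	k.type = jbvString;
	k.val.string.val = "a";
	k.val.string.len = 1;
	pushJsonbValue(&state, WJB_KEY, &k);

	v.type = jbvNumeric;
	v.val.numeric = int64_to_numeric(1);
	pushJsonbValue(&state, WJB_VALUE, &v);

	pushJsonbValue(&state, WJB_END_OBJECT, NULL);

	return JsonbValueToJsonb(state.result);
}
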
*/ -static JsonbValue * -pushJsonbValueScalar(JsonbParseState **pstate, JsonbIteratorToken seq, +static void +pushJsonbValueScalar(JsonbInState *pstate, JsonbIteratorToken seq, JsonbValue *scalarVal) { - JsonbValue *result = NULL; + JsonbParseState *ppstate; + JsonbValue *val; + MemoryContext outcontext; switch (seq) { case WJB_BEGIN_ARRAY: Assert(!scalarVal || scalarVal->val.array.rawScalar); - *pstate = pushState(pstate); - result = &(*pstate)->contVal; - (*pstate)->contVal.type = jbvArray; - (*pstate)->contVal.val.array.nElems = 0; - (*pstate)->contVal.val.array.rawScalar = (scalarVal && - scalarVal->val.array.rawScalar); + ppstate = pushState(pstate); + val = &ppstate->contVal; + val->type = jbvArray; + val->val.array.nElems = 0; + val->val.array.rawScalar = (scalarVal && + scalarVal->val.array.rawScalar); if (scalarVal && scalarVal->val.array.nElems > 0) { /* Assume that this array is still really a scalar */ Assert(scalarVal->type == jbvArray); - (*pstate)->size = scalarVal->val.array.nElems; + ppstate->size = scalarVal->val.array.nElems; } else { - (*pstate)->size = 4; + ppstate->size = 4; /* initial guess at array size */ } - (*pstate)->contVal.val.array.elems = palloc(sizeof(JsonbValue) * - (*pstate)->size); + outcontext = pstate->outcontext ? pstate->outcontext : CurrentMemoryContext; + val->val.array.elems = MemoryContextAlloc(outcontext, + sizeof(JsonbValue) * + ppstate->size); break; case WJB_BEGIN_OBJECT: Assert(!scalarVal); - *pstate = pushState(pstate); - result = &(*pstate)->contVal; - (*pstate)->contVal.type = jbvObject; - (*pstate)->contVal.val.object.nPairs = 0; - (*pstate)->size = 4; - (*pstate)->contVal.val.object.pairs = palloc(sizeof(JsonbPair) * - (*pstate)->size); + ppstate = pushState(pstate); + val = &ppstate->contVal; + val->type = jbvObject; + val->val.object.nPairs = 0; + ppstate->size = 4; /* initial guess at object size */ + outcontext = pstate->outcontext ? pstate->outcontext : CurrentMemoryContext; + val->val.object.pairs = MemoryContextAlloc(outcontext, + sizeof(JsonbPair) * + ppstate->size); break; case WJB_KEY: Assert(scalarVal->type == jbvString); - appendKey(*pstate, scalarVal); + appendKey(pstate, scalarVal, true); break; case WJB_VALUE: Assert(IsAJsonbScalar(scalarVal)); - appendValue(*pstate, scalarVal); + appendValue(pstate, scalarVal, true); break; case WJB_ELEM: Assert(IsAJsonbScalar(scalarVal)); - appendElement(*pstate, scalarVal); + appendElement(pstate, scalarVal, true); break; case WJB_END_OBJECT: - uniqueifyJsonbObject(&(*pstate)->contVal, - (*pstate)->unique_keys, - (*pstate)->skip_nulls); + ppstate = pstate->parseState; + uniqueifyJsonbObject(&ppstate->contVal, + ppstate->unique_keys, + ppstate->skip_nulls); /* fall through! */ case WJB_END_ARRAY: /* Steps here common to WJB_END_OBJECT case */ Assert(!scalarVal); - result = &(*pstate)->contVal; + ppstate = pstate->parseState; + val = &ppstate->contVal; /* * Pop stack and push current array/object as value in parent - * array/object + * array/object, or return it as the final result. We don't need + * to re-copy any scalars that are in the data structure. 
*/ - *pstate = (*pstate)->next; - if (*pstate) + pstate->parseState = ppstate = ppstate->next; + if (ppstate) { - switch ((*pstate)->contVal.type) + switch (ppstate->contVal.type) { case jbvArray: - appendElement(*pstate, result); + appendElement(pstate, val, false); break; case jbvObject: - appendValue(*pstate, result); + appendValue(pstate, val, false); break; default: elog(ERROR, "invalid jsonb container type"); } } + else + pstate->result = val; break; default: elog(ERROR, "unrecognized jsonb sequential processing token"); } - - return result; } /* - * pushJsonbValue() worker: Iteration-like forming of Jsonb + * Push a new JsonbParseState onto the JsonbInState's stack + * + * As a notational convenience, the new state's address is returned. + * The caller must initialize the new state's contVal and size fields. */ static JsonbParseState * -pushState(JsonbParseState **pstate) +pushState(JsonbInState *pstate) { - JsonbParseState *ns = palloc(sizeof(JsonbParseState)); + MemoryContext outcontext = pstate->outcontext ? pstate->outcontext : CurrentMemoryContext; + JsonbParseState *ns = MemoryContextAlloc(outcontext, + sizeof(JsonbParseState)); - ns->next = *pstate; + ns->next = pstate->parseState; + /* This module never changes these fields, but callers can: */ ns->unique_keys = false; ns->skip_nulls = false; + pstate->parseState = ns; return ns; } /* - * pushJsonbValue() worker: Append a pair key to state when generating a Jsonb + * pushJsonbValue() worker: Append a pair key to pstate */ static void -appendKey(JsonbParseState *pstate, JsonbValue *string) +appendKey(JsonbInState *pstate, JsonbValue *string, bool needCopy) { - JsonbValue *object = &pstate->contVal; + JsonbParseState *ppstate = pstate->parseState; + JsonbValue *object = &ppstate->contVal; + JsonbPair *pair; Assert(object->type == jbvObject); Assert(string->type == jbvString); - if (object->val.object.nPairs >= JSONB_MAX_PAIRS) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("number of jsonb object pairs exceeds the maximum allowed (%zu)", - JSONB_MAX_PAIRS))); - - if (object->val.object.nPairs >= pstate->size) + if (object->val.object.nPairs >= ppstate->size) { - pstate->size *= 2; + if (unlikely(object->val.object.nPairs >= JSONB_MAX_PAIRS)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("number of jsonb object pairs exceeds the maximum allowed (%zu)", + JSONB_MAX_PAIRS))); + ppstate->size = Min(ppstate->size * 2, JSONB_MAX_PAIRS); object->val.object.pairs = repalloc(object->val.object.pairs, - sizeof(JsonbPair) * pstate->size); + sizeof(JsonbPair) * ppstate->size); } - object->val.object.pairs[object->val.object.nPairs].key = *string; - object->val.object.pairs[object->val.object.nPairs].order = object->val.object.nPairs; + pair = &object->val.object.pairs[object->val.object.nPairs]; + pair->key = *string; + pair->order = object->val.object.nPairs; + + if (needCopy) + copyScalarSubstructure(&pair->key, pstate->outcontext); } /* - * pushJsonbValue() worker: Append a pair value to state when generating a - * Jsonb + * pushJsonbValue() worker: Append a pair value to pstate */ static void -appendValue(JsonbParseState *pstate, JsonbValue *scalarVal) +appendValue(JsonbInState *pstate, JsonbValue *scalarVal, bool needCopy) { - JsonbValue *object = &pstate->contVal; + JsonbValue *object = &pstate->parseState->contVal; + JsonbPair *pair; Assert(object->type == jbvObject); - object->val.object.pairs[object->val.object.nPairs++].value = *scalarVal; + pair = 
&object->val.object.pairs[object->val.object.nPairs]; + pair->value = *scalarVal; + object->val.object.nPairs++; + + if (needCopy) + copyScalarSubstructure(&pair->value, pstate->outcontext); } /* - * pushJsonbValue() worker: Append an element to state when generating a Jsonb + * pushJsonbValue() worker: Append an array element to pstate */ static void -appendElement(JsonbParseState *pstate, JsonbValue *scalarVal) +appendElement(JsonbInState *pstate, JsonbValue *scalarVal, bool needCopy) { - JsonbValue *array = &pstate->contVal; + JsonbParseState *ppstate = pstate->parseState; + JsonbValue *array = &ppstate->contVal; + JsonbValue *elem; Assert(array->type == jbvArray); - if (array->val.array.nElems >= JSONB_MAX_ELEMS) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("number of jsonb array elements exceeds the maximum allowed (%zu)", - JSONB_MAX_ELEMS))); - - if (array->val.array.nElems >= pstate->size) + if (array->val.array.nElems >= ppstate->size) { - pstate->size *= 2; + if (unlikely(array->val.array.nElems >= JSONB_MAX_ELEMS)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("number of jsonb array elements exceeds the maximum allowed (%zu)", + JSONB_MAX_ELEMS))); + ppstate->size = Min(ppstate->size * 2, JSONB_MAX_ELEMS); array->val.array.elems = repalloc(array->val.array.elems, - sizeof(JsonbValue) * pstate->size); + sizeof(JsonbValue) * ppstate->size); } - array->val.array.elems[array->val.array.nElems++] = *scalarVal; + elem = &array->val.array.elems[array->val.array.nElems]; + *elem = *scalarVal; + array->val.array.nElems++; + + if (needCopy) + copyScalarSubstructure(elem, pstate->outcontext); +} + +/* + * Copy any infrastructure of a scalar JsonbValue into the outcontext, + * adjusting the pointer(s) in *v. + * + * We need not deal with containers here, as the routines above ensure + * that they are built fresh. + */ +static void +copyScalarSubstructure(JsonbValue *v, MemoryContext outcontext) +{ + MemoryContext oldcontext; + + /* Nothing to do if caller did not specify an outcontext */ + if (outcontext == NULL) + return; + switch (v->type) + { + case jbvNull: + case jbvBool: + /* pass-by-value, nothing to do */ + break; + case jbvString: + { + char *buf = MemoryContextAlloc(outcontext, + v->val.string.len); + + memcpy(buf, v->val.string.val, v->val.string.len); + v->val.string.val = buf; + } + break; + case jbvNumeric: + oldcontext = MemoryContextSwitchTo(outcontext); + v->val.numeric = + DatumGetNumeric(datumCopy(NumericGetDatum(v->val.numeric), + false, -1)); + MemoryContextSwitchTo(oldcontext); + break; + case jbvDatetime: + switch (v->val.datetime.typid) + { + case DATEOID: + case TIMEOID: + case TIMESTAMPOID: + case TIMESTAMPTZOID: + /* pass-by-value, nothing to do */ + break; + case TIMETZOID: + /* pass-by-reference */ + oldcontext = MemoryContextSwitchTo(outcontext); + v->val.datetime.value = datumCopy(v->val.datetime.value, + false, TIMETZ_TYPLEN); + MemoryContextSwitchTo(oldcontext); + break; + default: + elog(ERROR, "unexpected jsonb datetime type oid %u", + v->val.datetime.typid); + } + break; + default: + elog(ERROR, "invalid jsonb scalar type"); + } } /* @@ -852,15 +963,20 @@ JsonbIteratorInit(JsonbContainer *container) * It is our job to expand the jbvBinary representation without bothering them * with it. However, clients should not take it upon themselves to touch array * or Object element/pair buffers, since their element/pair pointers are - * garbage. 
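
/*
 * Aside: the two datumCopy() call shapes copyScalarSubstructure uses,
 * with the inputs made explicit.  typByVal=false / typLen=-1 copies a
 * varlena such as a Numeric; a fixed-length pass-by-reference type like
 * timetz passes its known size instead.  "copy_examples" is hypothetical.
 */
static void
copy_examples(Numeric num, Datum timetz_datum)
{
	Datum		num_copy = datumCopy(NumericGetDatum(num), false, -1);
	Datum		tz_copy = datumCopy(timetz_datum, false, TIMETZ_TYPLEN);

	(void) num_copy;
	(void) tz_copy;
}
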
Also, *val will not be set when returning WJB_END_ARRAY or - * WJB_END_OBJECT, on the assumption that it's only useful to access values - * when recursing in. + * garbage. + * + * *val is not meaningful when the result is WJB_DONE, WJB_END_ARRAY or + * WJB_END_OBJECT. However, we set val->type = jbvNull in those cases, + * so that callers may assume that val->type is always well-defined. */ JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) { if (*it == NULL) + { + val->type = jbvNull; return WJB_DONE; + } /* * When stepping into a nested container, we jump back here to start @@ -898,6 +1014,7 @@ JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) * nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_ARRAY; } @@ -951,6 +1068,7 @@ JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) * of nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_OBJECT; } else @@ -995,8 +1113,10 @@ JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) return WJB_VALUE; } - elog(ERROR, "invalid iterator state"); - return -1; + elog(ERROR, "invalid jsonb iterator state"); + /* satisfy compilers that don't know that elog(ERROR) doesn't return */ + val->type = jbvNull; + return WJB_DONE; } /* @@ -1007,7 +1127,7 @@ iteratorFromContainer(JsonbContainer *container, JsonbIterator *parent) { JsonbIterator *it; - it = palloc0(sizeof(JsonbIterator)); + it = palloc0_object(JsonbIterator); it->container = container; it->parent = parent; it->nElems = JsonContainerSize(container); @@ -1253,7 +1373,7 @@ JsonbDeepContains(JsonbIterator **val, JsonbIterator **mContained) uint32 j = 0; /* Make room for all possible values */ - lhsConts = palloc(sizeof(JsonbValue) * nLhsElems); + lhsConts = palloc_array(JsonbValue, nLhsElems); for (i = 0; i < nLhsElems; i++) { @@ -1949,12 +2069,14 @@ lengthCompareJsonbPair(const void *a, const void *b, void *binequal) static void uniqueifyJsonbObject(JsonbValue *object, bool unique_keys, bool skip_nulls) { + JsonbPair *pairs = object->val.object.pairs; + int nPairs = object->val.object.nPairs; bool hasNonUniq = false; Assert(object->type == jbvObject); - if (object->val.object.nPairs > 1) - qsort_arg(object->val.object.pairs, object->val.object.nPairs, sizeof(JsonbPair), + if (nPairs > 1) + qsort_arg(pairs, nPairs, sizeof(JsonbPair), lengthCompareJsonbPair, &hasNonUniq); if (hasNonUniq && unique_keys) @@ -1964,36 +2086,25 @@ uniqueifyJsonbObject(JsonbValue *object, bool unique_keys, bool skip_nulls) if (hasNonUniq || skip_nulls) { - JsonbPair *ptr, - *res; + int nNewPairs = 0; - while (skip_nulls && object->val.object.nPairs > 0 && - object->val.object.pairs->value.type == jbvNull) + for (int i = 0; i < nPairs; i++) { - /* If skip_nulls is true, remove leading items with null */ - object->val.object.pairs++; - object->val.object.nPairs--; - } - - if (object->val.object.nPairs > 0) - { - ptr = object->val.object.pairs + 1; - res = object->val.object.pairs; - - while (ptr - object->val.object.pairs < object->val.object.nPairs) - { - /* Avoid copying over duplicate or null */ - if (lengthCompareJsonbStringValue(ptr, res) != 0 && - (!skip_nulls || ptr->value.type != jbvNull)) - { - res++; - if (ptr != res) - memcpy(res, ptr, sizeof(JsonbPair)); - } - ptr++; - } + JsonbPair *ptr = pairs + i; - object->val.object.nPairs = res + 1 - object->val.object.pairs; + /* Skip duplicate keys */ + if (nNewPairs > 0 && + lengthCompareJsonbStringValue(&pairs[nNewPairs 
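/*
 * Note on the rewritten compaction loop: lengthCompareJsonbPair() sorts
 * equal keys by descending insertion order, so keeping only the first
 * pair of each equal-key run preserves jsonb's "last value wins" rule.
 * For instance, pairs pushed as "a":1, "a":2, "b":null uniqueify (with
 * skip_nulls set) down to the single pair "a":2.
 */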
- 1].key, + &ptr->key) == 0) + continue; + /* Skip null values, if told to */ + if (skip_nulls && ptr->value.type == jbvNull) + continue; + /* Emit this pair, but avoid no-op copy */ + if (i > nNewPairs) + pairs[nNewPairs] = *ptr; + nNewPairs++; } + object->val.object.nPairs = nNewPairs; } } diff --git a/src/backend/utils/adt/jsonbsubs.c b/src/backend/utils/adt/jsonbsubs.c index de64d49851251..b9f5eed44a6a7 100644 --- a/src/backend/utils/adt/jsonbsubs.c +++ b/src/backend/utils/adt/jsonbsubs.c @@ -51,7 +51,7 @@ jsonb_subscript_transform(SubscriptingRef *sbsref, /* * Transform and convert the subscript expressions. Jsonb subscripting - * does not support slices, look only and the upper index. + * does not support slices, look only at the upper index. */ foreach(idx, indirection) { @@ -152,7 +152,7 @@ jsonb_subscript_transform(SubscriptingRef *sbsref, upperIndexpr = lappend(upperIndexpr, subExpr); } - /* store the transformed lists into the SubscriptRef node */ + /* store the transformed lists into the SubscriptingRef node */ sbsref->refupperindexpr = upperIndexpr; sbsref->reflowerindexpr = NIL; diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index bcb1720b6cde2..980e2882957cd 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -475,18 +475,18 @@ static Datum populate_domain(DomainIOData *io, Oid typid, const char *colname, Node *escontext, bool omit_quotes); /* functions supporting jsonb_delete, jsonb_set and jsonb_concat */ -static JsonbValue *IteratorConcat(JsonbIterator **it1, JsonbIterator **it2, - JsonbParseState **state); -static JsonbValue *setPath(JsonbIterator **it, Datum *path_elems, - bool *path_nulls, int path_len, - JsonbParseState **st, int level, JsonbValue *newval, - int op_type); -static void setPathObject(JsonbIterator **it, Datum *path_elems, - bool *path_nulls, int path_len, JsonbParseState **st, +static void IteratorConcat(JsonbIterator **it1, JsonbIterator **it2, + JsonbInState *state); +static void setPath(JsonbIterator **it, const Datum *path_elems, + const bool *path_nulls, int path_len, + JsonbInState *st, int level, JsonbValue *newval, + int op_type); +static void setPathObject(JsonbIterator **it, const Datum *path_elems, + const bool *path_nulls, int path_len, JsonbInState *st, int level, JsonbValue *newval, uint32 npairs, int op_type); -static void setPathArray(JsonbIterator **it, Datum *path_elems, - bool *path_nulls, int path_len, JsonbParseState **st, +static void setPathArray(JsonbIterator **it, const Datum *path_elems, + const bool *path_nulls, int path_len, JsonbInState *st, int level, JsonbValue *newval, uint32 nelems, int op_type); @@ -593,12 +593,12 @@ jsonb_object_keys(PG_FUNCTION_ARGS) funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - state = palloc(sizeof(OkeysState)); + state = palloc_object(OkeysState); state->result_size = JB_ROOT_COUNT(jb); state->result_count = 0; state->sent_count = 0; - state->result = palloc(state->result_size * sizeof(char *)); + state->result = palloc_array(char *, state->result_size); it = JsonbIteratorInit(&jb->root); @@ -744,14 +744,14 @@ json_object_keys(PG_FUNCTION_ARGS) funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - state = palloc(sizeof(OkeysState)); - sem = palloc0(sizeof(JsonSemAction)); + state = palloc_object(OkeysState); + sem = palloc0_object(JsonSemAction); state->lex = makeJsonLexContext(&lex, json, true); state->result_size = 256; 
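/*
 * The palloc_object()/palloc_array() substitutions in this hunk and the
 * ones below are mechanical: these are the type-safe wrappers from
 * utils/palloc.h, which spell out the cast and sizeof for us, e.g.
 *
 *     state = palloc_object(OkeysState);
 *     state->result = palloc_array(char *, state->result_size);
 *
 * is equivalent to
 *
 *     state = (OkeysState *) palloc(sizeof(OkeysState));
 *     state->result = (char **) palloc(sizeof(char *) * state->result_size);
 */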
state->result_count = 0; state->sent_count = 0; - state->result = palloc(256 * sizeof(char *)); + state->result = palloc_array(char *, 256); sem->semstate = state; sem->array_start = okeys_array_start; @@ -1045,8 +1045,8 @@ get_path_all(FunctionCallInfo fcinfo, bool as_text) deconstruct_array_builtin(path, TEXTOID, &pathtext, &pathnulls, &npath); - tpath = palloc(npath * sizeof(char *)); - ipath = palloc(npath * sizeof(int)); + tpath = palloc_array(char *, npath); + ipath = palloc_array(int, npath); for (i = 0; i < npath; i++) { @@ -1106,8 +1106,8 @@ get_worker(text *json, int npath, bool normalize_results) { - JsonSemAction *sem = palloc0(sizeof(JsonSemAction)); - GetState *state = palloc0(sizeof(GetState)); + JsonSemAction *sem = palloc0_object(JsonSemAction); + GetState *state = palloc0_object(GetState); Assert(npath >= 0); @@ -1118,8 +1118,8 @@ get_worker(text *json, state->npath = npath; state->path_names = tpath; state->path_indexes = ipath; - state->pathok = palloc0(sizeof(bool) * npath); - state->array_cur_index = palloc(sizeof(int) * npath); + state->pathok = palloc0_array(bool, npath); + state->array_cur_index = palloc_array(int, npath); if (npath > 0) state->pathok[0] = true; @@ -1528,7 +1528,7 @@ get_jsonb_path_all(FunctionCallInfo fcinfo, bool as_text) } Datum -jsonb_get_element(Jsonb *jb, Datum *path, int npath, bool *isnull, bool as_text) +jsonb_get_element(Jsonb *jb, const Datum *path, int npath, bool *isnull, bool as_text) { JsonbContainer *container = &jb->root; JsonbValue *jbvp = NULL; @@ -1676,30 +1676,29 @@ jsonb_get_element(Jsonb *jb, Datum *path, int npath, bool *isnull, bool as_text) } Datum -jsonb_set_element(Jsonb *jb, Datum *path, int path_len, +jsonb_set_element(Jsonb *jb, const Datum *path, int path_len, JsonbValue *newval) { - JsonbValue *res; - JsonbParseState *state = NULL; + JsonbInState state = {0}; JsonbIterator *it; - bool *path_nulls = palloc0(path_len * sizeof(bool)); + bool *path_nulls = palloc0_array(bool, path_len); if (newval->type == jbvArray && newval->val.array.rawScalar) *newval = newval->val.array.elems[0]; it = JsonbIteratorInit(&jb->root); - res = setPath(&it, path, path_nulls, path_len, &state, 0, newval, - JB_PATH_CREATE | JB_PATH_FILL_GAPS | - JB_PATH_CONSISTENT_POSITION); + setPath(&it, path, path_nulls, path_len, &state, 0, newval, + JB_PATH_CREATE | JB_PATH_FILL_GAPS | + JB_PATH_CONSISTENT_POSITION); pfree(path_nulls); - PG_RETURN_JSONB_P(JsonbValueToJsonb(res)); + PG_RETURN_JSONB_P(JsonbValueToJsonb(state.result)); } static void -push_null_elements(JsonbParseState **ps, int num) +push_null_elements(JsonbInState *ps, int num) { JsonbValue null; @@ -1718,8 +1717,8 @@ push_null_elements(JsonbParseState **ps, int num) * Caller is responsible to make sure such path does not exist yet. */ static void -push_path(JsonbParseState **st, int level, Datum *path_elems, - bool *path_nulls, int path_len, JsonbValue *newval) +push_path(JsonbInState *st, int level, const Datum *path_elems, + const bool *path_nulls, int path_len, JsonbValue *newval) { /* * tpath contains expected type of an empty jsonb created at each level @@ -1727,7 +1726,7 @@ push_path(JsonbParseState **st, int level, Datum *path_elems, * it contains only information about path slice from level to the end, * the access index must be normalized by level. 
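 *
 * As a concrete illustration of the net effect (illustrative only; the
 * exact behavior is driven by the callers' op_type bits): under
 * JB_PATH_FILL_GAPS, a subscripted assignment like j['a'][2] = '1'
 * applied to '{}' produces {"a": [null, null, 1]}, the missing key "a"
 * being created with a fresh array padded with nulls up to the
 * requested index.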
*/ - enum jbvType *tpath = palloc0((path_len - level) * sizeof(enum jbvType)); + enum jbvType *tpath = palloc0_array(enum jbvType, path_len - level); JsonbValue newkey; /* @@ -1758,15 +1757,15 @@ push_path(JsonbParseState **st, int level, Datum *path_elems, newkey.val.string.val = c; newkey.val.string.len = strlen(c); - (void) pushJsonbValue(st, WJB_BEGIN_OBJECT, NULL); - (void) pushJsonbValue(st, WJB_KEY, &newkey); + pushJsonbValue(st, WJB_BEGIN_OBJECT, NULL); + pushJsonbValue(st, WJB_KEY, &newkey); tpath[i - level] = jbvObject; } else { /* integer, an array is expected */ - (void) pushJsonbValue(st, WJB_BEGIN_ARRAY, NULL); + pushJsonbValue(st, WJB_BEGIN_ARRAY, NULL); push_null_elements(st, lindex); @@ -1776,11 +1775,9 @@ push_path(JsonbParseState **st, int level, Datum *path_elems, /* Insert an actual value for either an object or array */ if (tpath[(path_len - level) - 1] == jbvArray) - { - (void) pushJsonbValue(st, WJB_ELEM, newval); - } + pushJsonbValue(st, WJB_ELEM, newval); else - (void) pushJsonbValue(st, WJB_VALUE, newval); + pushJsonbValue(st, WJB_VALUE, newval); /* * Close everything up to the last but one level. The last one will be @@ -1792,9 +1789,9 @@ push_path(JsonbParseState **st, int level, Datum *path_elems, break; if (tpath[i - level] == jbvObject) - (void) pushJsonbValue(st, WJB_END_OBJECT, NULL); + pushJsonbValue(st, WJB_END_OBJECT, NULL); else - (void) pushJsonbValue(st, WJB_END_ARRAY, NULL); + pushJsonbValue(st, WJB_END_ARRAY, NULL); } } @@ -1856,14 +1853,14 @@ json_array_length(PG_FUNCTION_ARGS) JsonLexContext lex; JsonSemAction *sem; - state = palloc0(sizeof(AlenState)); + state = palloc0_object(AlenState); state->lex = makeJsonLexContext(&lex, json, false); /* palloc0 does this for us */ #if 0 state->count = 0; #endif - sem = palloc0(sizeof(JsonSemAction)); + sem = palloc0_object(JsonSemAction); sem->semstate = state; sem->object_start = alen_object_start; sem->scalar = alen_scalar; @@ -2027,7 +2024,7 @@ each_worker_jsonb(FunctionCallInfo fcinfo, const char *funcname, bool as_text) { /* a json null is an sql null in text mode */ nulls[1] = true; - values[1] = (Datum) NULL; + values[1] = (Datum) 0; } else values[1] = PointerGetDatum(JsonbValueAsText(&v)); @@ -2063,8 +2060,8 @@ each_worker(FunctionCallInfo fcinfo, bool as_text) ReturnSetInfo *rsi; EachState *state; - state = palloc0(sizeof(EachState)); - sem = palloc0(sizeof(JsonSemAction)); + state = palloc0_object(EachState); + sem = palloc0_object(JsonSemAction); rsi = (ReturnSetInfo *) fcinfo->resultinfo; @@ -2266,7 +2263,7 @@ elements_worker_jsonb(FunctionCallInfo fcinfo, const char *funcname, { /* a json null is an sql null in text mode */ nulls[0] = true; - values[0] = (Datum) NULL; + values[0] = (Datum) 0; } else values[0] = PointerGetDatum(JsonbValueAsText(&v)); @@ -2316,8 +2313,8 @@ elements_worker(FunctionCallInfo fcinfo, const char *funcname, bool as_text) /* elements only needs escaped strings when as_text */ makeJsonLexContext(&lex, json, as_text); - state = palloc0(sizeof(ElementsState)); - sem = palloc0(sizeof(JsonSemAction)); + state = palloc0_object(ElementsState); + sem = palloc0_object(JsonSemAction); InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC | MAT_SRF_BLESS); rsi = (ReturnSetInfo *) fcinfo->resultinfo; @@ -2389,7 +2386,7 @@ elements_array_element_end(void *state, bool isnull) if (isnull && _state->normalize_results) { nulls[0] = true; - values[0] = (Datum) NULL; + values[0] = (Datum) 0; } else if (_state->next_scalar) { @@ -2572,8 +2569,8 @@ 
populate_array_assign_ndims(PopulateArrayContext *ctx, int ndims) } ctx->ndims = ndims; - ctx->dims = palloc(sizeof(int) * ndims); - ctx->sizes = palloc0(sizeof(int) * ndims); + ctx->dims = palloc_array(int, ndims); + ctx->sizes = palloc0_array(int, ndims); for (i = 0; i < ndims; i++) ctx->dims[i] = -1; /* dimensions are unknown yet */ @@ -2958,7 +2955,7 @@ populate_array(ArrayIOData *aio, Assert(ctx.ndims > 0); - lbs = palloc(sizeof(int) * ctx.ndims); + lbs = palloc_array(int, ctx.ndims); for (i = 0; i < ctx.ndims; i++) lbs[i] = 1; @@ -3824,8 +3821,8 @@ get_json_object_as_hash(const char *json, int len, const char *funcname, &ctl, HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); - state = palloc0(sizeof(JHashState)); - sem = palloc0(sizeof(JsonSemAction)); + state = palloc0_object(JHashState); + sem = palloc0_object(JsonSemAction); state->function_name = funcname; state->hash = tab; @@ -4122,7 +4119,7 @@ populate_recordset_worker(FunctionCallInfo fcinfo, const char *funcname, */ update_cached_tupdesc(&cache->c.io.composite, cache->fn_mcxt); - state = palloc0(sizeof(PopulateRecordsetState)); + state = palloc0_object(PopulateRecordsetState); /* make tuplestore in a sufficiently long-lived memory context */ old_cxt = MemoryContextSwitchTo(rsi->econtext->ecxt_per_query_memory); @@ -4141,7 +4138,7 @@ populate_recordset_worker(FunctionCallInfo fcinfo, const char *funcname, JsonLexContext lex; JsonSemAction *sem; - sem = palloc0(sizeof(JsonSemAction)); + sem = palloc0_object(JsonSemAction); makeJsonLexContext(&lex, json, true); @@ -4507,14 +4504,16 @@ json_strip_nulls(PG_FUNCTION_ARGS) text *json = PG_GETARG_TEXT_PP(0); bool strip_in_arrays = PG_NARGS() == 2 ? PG_GETARG_BOOL(1) : false; StripnullState *state; + StringInfoData strbuf; JsonLexContext lex; JsonSemAction *sem; - state = palloc0(sizeof(StripnullState)); - sem = palloc0(sizeof(JsonSemAction)); + state = palloc0_object(StripnullState); + sem = palloc0_object(JsonSemAction); + initStringInfo(&strbuf); state->lex = makeJsonLexContext(&lex, json, true); - state->strval = makeStringInfo(); + state->strval = &strbuf; state->skip_next_null = false; state->strip_in_arrays = strip_in_arrays; @@ -4542,8 +4541,7 @@ jsonb_strip_nulls(PG_FUNCTION_ARGS) Jsonb *jb = PG_GETARG_JSONB_P(0); bool strip_in_arrays = false; JsonbIterator *it; - JsonbParseState *parseState = NULL; - JsonbValue *res = NULL; + JsonbInState parseState = {0}; JsonbValue v, k; JsonbIteratorToken type; @@ -4579,7 +4577,7 @@ jsonb_strip_nulls(PG_FUNCTION_ARGS) continue; /* otherwise, do a delayed push of the key */ - (void) pushJsonbValue(&parseState, WJB_KEY, &k); + pushJsonbValue(&parseState, WJB_KEY, &k); } /* if strip_in_arrays is set, also skip null array elements */ @@ -4588,14 +4586,12 @@ jsonb_strip_nulls(PG_FUNCTION_ARGS) continue; if (type == WJB_VALUE || type == WJB_ELEM) - res = pushJsonbValue(&parseState, type, &v); + pushJsonbValue(&parseState, type, &v); else - res = pushJsonbValue(&parseState, type, NULL); + pushJsonbValue(&parseState, type, NULL); } - Assert(res != NULL); - - PG_RETURN_POINTER(JsonbValueToJsonb(res)); + PG_RETURN_POINTER(JsonbValueToJsonb(parseState.result)); } /* @@ -4607,11 +4603,12 @@ Datum jsonb_pretty(PG_FUNCTION_ARGS) { Jsonb *jb = PG_GETARG_JSONB_P(0); - StringInfo str = makeStringInfo(); + StringInfoData str; - JsonbToCStringIndent(str, &jb->root, VARSIZE(jb)); + initStringInfo(&str); + JsonbToCStringIndent(&str, &jb->root, VARSIZE(jb)); - PG_RETURN_TEXT_P(cstring_to_text_with_len(str->data, str->len)); + 
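/*
 * Here, as in json_strip_nulls() above, a stack-allocated StringInfoData
 * initialized with initStringInfo() replaces makeStringInfo(), saving a
 * palloc of the StringInfo header; only the string buffer itself remains
 * heap-allocated.  The general pattern from lib/stringinfo.h:
 *
 *     StringInfoData buf;
 *
 *     initStringInfo(&buf);
 *     appendStringInfoString(&buf, "text");
 *     ... consume buf.data and buf.len ...
 */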
PG_RETURN_TEXT_P(cstring_to_text_with_len(str.data, str.len)); } /* @@ -4624,8 +4621,7 @@ jsonb_concat(PG_FUNCTION_ARGS) { Jsonb *jb1 = PG_GETARG_JSONB_P(0); Jsonb *jb2 = PG_GETARG_JSONB_P(1); - JsonbParseState *state = NULL; - JsonbValue *res; + JsonbInState state = {0}; JsonbIterator *it1, *it2; @@ -4646,11 +4642,9 @@ jsonb_concat(PG_FUNCTION_ARGS) it1 = JsonbIteratorInit(&jb1->root); it2 = JsonbIteratorInit(&jb2->root); - res = IteratorConcat(&it1, &it2, &state); - - Assert(res != NULL); + IteratorConcat(&it1, &it2, &state); - PG_RETURN_JSONB_P(JsonbValueToJsonb(res)); + PG_RETURN_JSONB_P(JsonbValueToJsonb(state.result)); } @@ -4667,10 +4661,9 @@ jsonb_delete(PG_FUNCTION_ARGS) text *key = PG_GETARG_TEXT_PP(1); char *keyptr = VARDATA_ANY(key); int keylen = VARSIZE_ANY_EXHDR(key); - JsonbParseState *state = NULL; + JsonbInState pstate = {0}; JsonbIterator *it; - JsonbValue v, - *res = NULL; + JsonbValue v; bool skipNested = false; JsonbIteratorToken r; @@ -4699,12 +4692,10 @@ jsonb_delete(PG_FUNCTION_ARGS) continue; } - res = pushJsonbValue(&state, r, r < WJB_BEGIN_ARRAY ? &v : NULL); + pushJsonbValue(&pstate, r, r < WJB_BEGIN_ARRAY ? &v : NULL); } - Assert(res != NULL); - - PG_RETURN_JSONB_P(JsonbValueToJsonb(res)); + PG_RETURN_JSONB_P(JsonbValueToJsonb(pstate.result)); } /* @@ -4721,10 +4712,9 @@ jsonb_delete_array(PG_FUNCTION_ARGS) Datum *keys_elems; bool *keys_nulls; int keys_len; - JsonbParseState *state = NULL; + JsonbInState pstate = {0}; JsonbIterator *it; - JsonbValue v, - *res = NULL; + JsonbValue v; bool skipNested = false; JsonbIteratorToken r; @@ -4766,8 +4756,8 @@ jsonb_delete_array(PG_FUNCTION_ARGS) continue; /* We rely on the array elements not being toasted */ - keyptr = VARDATA_ANY(keys_elems[i]); - keylen = VARSIZE_ANY_EXHDR(keys_elems[i]); + keyptr = VARDATA_ANY(DatumGetPointer(keys_elems[i])); + keylen = VARSIZE_ANY_EXHDR(DatumGetPointer(keys_elems[i])); if (keylen == v.val.string.len && memcmp(keyptr, v.val.string.val, keylen) == 0) { @@ -4785,12 +4775,10 @@ jsonb_delete_array(PG_FUNCTION_ARGS) } } - res = pushJsonbValue(&state, r, r < WJB_BEGIN_ARRAY ? &v : NULL); + pushJsonbValue(&pstate, r, r < WJB_BEGIN_ARRAY ? &v : NULL); } - Assert(res != NULL); - - PG_RETURN_JSONB_P(JsonbValueToJsonb(res)); + PG_RETURN_JSONB_P(JsonbValueToJsonb(pstate.result)); } /* @@ -4805,12 +4793,11 @@ jsonb_delete_idx(PG_FUNCTION_ARGS) { Jsonb *in = PG_GETARG_JSONB_P(0); int idx = PG_GETARG_INT32(1); - JsonbParseState *state = NULL; + JsonbInState pstate = {0}; JsonbIterator *it; uint32 i = 0, n; - JsonbValue v, - *res = NULL; + JsonbValue v; JsonbIteratorToken r; if (JB_ROOT_IS_SCALAR(in)) @@ -4843,7 +4830,7 @@ jsonb_delete_idx(PG_FUNCTION_ARGS) if (idx >= n) PG_RETURN_JSONB_P(in); - pushJsonbValue(&state, r, NULL); + pushJsonbValue(&pstate, r, NULL); while ((r = JsonbIteratorNext(&it, &v, true)) != WJB_DONE) { @@ -4853,12 +4840,10 @@ jsonb_delete_idx(PG_FUNCTION_ARGS) continue; } - res = pushJsonbValue(&state, r, r < WJB_BEGIN_ARRAY ? &v : NULL); + pushJsonbValue(&pstate, r, r < WJB_BEGIN_ARRAY ? 
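/*
 * The recurring "r < WJB_BEGIN_ARRAY" test is a longstanding idiom: in
 * the JsonbIteratorToken enum (WJB_DONE, WJB_KEY, WJB_VALUE, WJB_ELEM,
 * WJB_BEGIN_ARRAY, WJB_END_ARRAY, WJB_BEGIN_OBJECT, WJB_END_OBJECT),
 * the value-bearing tokens all sort below WJB_BEGIN_ARRAY, so &v is
 * passed along exactly when the iterator produced a scalar.
 */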
&v : NULL); } - Assert(res != NULL); - - PG_RETURN_JSONB_P(JsonbValueToJsonb(res)); + PG_RETURN_JSONB_P(JsonbValueToJsonb(pstate.result)); } /* @@ -4872,12 +4857,11 @@ jsonb_set(PG_FUNCTION_ARGS) Jsonb *newjsonb = PG_GETARG_JSONB_P(2); JsonbValue newval; bool create = PG_GETARG_BOOL(3); - JsonbValue *res = NULL; Datum *path_elems; bool *path_nulls; int path_len; JsonbIterator *it; - JsonbParseState *st = NULL; + JsonbInState st = {0}; JsonbToJsonbValue(newjsonb, &newval); @@ -4901,12 +4885,10 @@ jsonb_set(PG_FUNCTION_ARGS) it = JsonbIteratorInit(&in->root); - res = setPath(&it, path_elems, path_nulls, path_len, &st, - 0, &newval, create ? JB_PATH_CREATE : JB_PATH_REPLACE); - - Assert(res != NULL); + setPath(&it, path_elems, path_nulls, path_len, &st, + 0, &newval, create ? JB_PATH_CREATE : JB_PATH_REPLACE); - PG_RETURN_JSONB_P(JsonbValueToJsonb(res)); + PG_RETURN_JSONB_P(JsonbValueToJsonb(st.result)); } @@ -4985,12 +4967,11 @@ jsonb_delete_path(PG_FUNCTION_ARGS) { Jsonb *in = PG_GETARG_JSONB_P(0); ArrayType *path = PG_GETARG_ARRAYTYPE_P(1); - JsonbValue *res = NULL; Datum *path_elems; bool *path_nulls; int path_len; JsonbIterator *it; - JsonbParseState *st = NULL; + JsonbInState st = {0}; if (ARR_NDIM(path) > 1) ereport(ERROR, @@ -5012,12 +4993,10 @@ jsonb_delete_path(PG_FUNCTION_ARGS) it = JsonbIteratorInit(&in->root); - res = setPath(&it, path_elems, path_nulls, path_len, &st, - 0, NULL, JB_PATH_DELETE); + setPath(&it, path_elems, path_nulls, path_len, &st, + 0, NULL, JB_PATH_DELETE); - Assert(res != NULL); - - PG_RETURN_JSONB_P(JsonbValueToJsonb(res)); + PG_RETURN_JSONB_P(JsonbValueToJsonb(st.result)); } /* @@ -5031,12 +5010,11 @@ jsonb_insert(PG_FUNCTION_ARGS) Jsonb *newjsonb = PG_GETARG_JSONB_P(2); JsonbValue newval; bool after = PG_GETARG_BOOL(3); - JsonbValue *res = NULL; Datum *path_elems; bool *path_nulls; int path_len; JsonbIterator *it; - JsonbParseState *st = NULL; + JsonbInState st = {0}; JsonbToJsonbValue(newjsonb, &newval); @@ -5057,12 +5035,10 @@ jsonb_insert(PG_FUNCTION_ARGS) it = JsonbIteratorInit(&in->root); - res = setPath(&it, path_elems, path_nulls, path_len, &st, 0, &newval, - after ? JB_PATH_INSERT_AFTER : JB_PATH_INSERT_BEFORE); - - Assert(res != NULL); + setPath(&it, path_elems, path_nulls, path_len, &st, 0, &newval, + after ? JB_PATH_INSERT_AFTER : JB_PATH_INSERT_BEFORE); - PG_RETURN_JSONB_P(JsonbValueToJsonb(res)); + PG_RETURN_JSONB_P(JsonbValueToJsonb(st.result)); } /* @@ -5072,13 +5048,12 @@ jsonb_insert(PG_FUNCTION_ARGS) * In that case we just append the content of it2 to it1 without any * verifications. */ -static JsonbValue * +static void IteratorConcat(JsonbIterator **it1, JsonbIterator **it2, - JsonbParseState **state) + JsonbInState *state) { JsonbValue v1, - v2, - *res = NULL; + v2; JsonbIteratorToken r1, r2, rk1, @@ -5109,7 +5084,7 @@ IteratorConcat(JsonbIterator **it1, JsonbIterator **it2, * automatically override the value from the first object. */ while ((r2 = JsonbIteratorNext(it2, &v2, true)) != WJB_DONE) - res = pushJsonbValue(state, r2, r2 != WJB_END_OBJECT ? &v2 : NULL); + pushJsonbValue(state, r2, r2 != WJB_END_OBJECT ? 
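/*
 * For context, IteratorConcat() implements the jsonb || operator: two
 * objects merge with right-hand keys overriding left-hand duplicates,
 * two arrays append, and an object meeting an array is wrapped as a
 * single array element.  For example:
 *
 *     '{"a":1,"b":1}'::jsonb || '{"b":2}'   =>  {"a": 1, "b": 2}
 *     '{"a":1}'::jsonb || '[2]'             =>  [{"a": 1}, 2]
 */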
&v2 : NULL); } else if (rk1 == WJB_BEGIN_ARRAY && rk2 == WJB_BEGIN_ARRAY) { @@ -5130,7 +5105,7 @@ IteratorConcat(JsonbIterator **it1, JsonbIterator **it2, pushJsonbValue(state, WJB_ELEM, &v2); } - res = pushJsonbValue(state, WJB_END_ARRAY, NULL /* signal to sort */ ); + pushJsonbValue(state, WJB_END_ARRAY, NULL /* signal to sort */ ); } else if (rk1 == WJB_BEGIN_OBJECT) { @@ -5146,7 +5121,7 @@ IteratorConcat(JsonbIterator **it1, JsonbIterator **it2, pushJsonbValue(state, r1, r1 != WJB_END_OBJECT ? &v1 : NULL); while ((r2 = JsonbIteratorNext(it2, &v2, true)) != WJB_DONE) - res = pushJsonbValue(state, r2, r2 != WJB_END_ARRAY ? &v2 : NULL); + pushJsonbValue(state, r2, r2 != WJB_END_ARRAY ? &v2 : NULL); } else { @@ -5165,10 +5140,8 @@ IteratorConcat(JsonbIterator **it1, JsonbIterator **it2, while ((r2 = JsonbIteratorNext(it2, &v2, true)) != WJB_DONE) pushJsonbValue(state, r2, r2 != WJB_END_OBJECT ? &v2 : NULL); - res = pushJsonbValue(state, WJB_END_ARRAY, NULL); + pushJsonbValue(state, WJB_END_ARRAY, NULL); } - - return res; } /* @@ -5200,14 +5173,13 @@ IteratorConcat(JsonbIterator **it1, JsonbIterator **it2, * All path elements before the last must already exist * whatever bits in op_type are set, or nothing is done. */ -static JsonbValue * -setPath(JsonbIterator **it, Datum *path_elems, - bool *path_nulls, int path_len, - JsonbParseState **st, int level, JsonbValue *newval, int op_type) +static void +setPath(JsonbIterator **it, const Datum *path_elems, + const bool *path_nulls, int path_len, + JsonbInState *st, int level, JsonbValue *newval, int op_type) { JsonbValue v; JsonbIteratorToken r; - JsonbValue *res; check_stack_depth(); @@ -5237,20 +5209,20 @@ setPath(JsonbIterator **it, Datum *path_elems, errdetail("The path assumes key is a composite object, " "but it is a scalar value."))); - (void) pushJsonbValue(st, r, NULL); + pushJsonbValue(st, r, NULL); setPathArray(it, path_elems, path_nulls, path_len, st, level, newval, v.val.array.nElems, op_type); r = JsonbIteratorNext(it, &v, false); Assert(r == WJB_END_ARRAY); - res = pushJsonbValue(st, r, NULL); + pushJsonbValue(st, r, NULL); break; case WJB_BEGIN_OBJECT: - (void) pushJsonbValue(st, r, NULL); + pushJsonbValue(st, r, NULL); setPathObject(it, path_elems, path_nulls, path_len, st, level, newval, v.val.object.nPairs, op_type); r = JsonbIteratorNext(it, &v, true); Assert(r == WJB_END_OBJECT); - res = pushJsonbValue(st, r, NULL); + pushJsonbValue(st, r, NULL); break; case WJB_ELEM: case WJB_VALUE: @@ -5268,23 +5240,20 @@ setPath(JsonbIterator **it, Datum *path_elems, errdetail("The path assumes key is a composite object, " "but it is a scalar value."))); - res = pushJsonbValue(st, r, &v); + pushJsonbValue(st, r, &v); break; default: elog(ERROR, "unrecognized iterator result: %d", (int) r); - res = NULL; /* keep compiler quiet */ break; } - - return res; } /* * Object walker for setPath */ static void -setPathObject(JsonbIterator **it, Datum *path_elems, bool *path_nulls, - int path_len, JsonbParseState **st, int level, +setPathObject(JsonbIterator **it, const Datum *path_elems, const bool *path_nulls, + int path_len, JsonbInState *st, int level, JsonbValue *newval, uint32 npairs, int op_type) { text *pathelem = NULL; @@ -5311,8 +5280,8 @@ setPathObject(JsonbIterator **it, Datum *path_elems, bool *path_nulls, newkey.val.string.val = VARDATA_ANY(pathelem); newkey.val.string.len = VARSIZE_ANY_EXHDR(pathelem); - (void) pushJsonbValue(st, WJB_KEY, &newkey); - (void) pushJsonbValue(st, WJB_VALUE, newval); + pushJsonbValue(st, WJB_KEY, &newkey); 
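/*
 * Reminder of how the op_type bits used by these walkers map to the
 * SQL-visible entry points (all visible in the callers above):
 *
 *     JB_PATH_CREATE                jsonb_set(..., create => true)
 *     JB_PATH_REPLACE               jsonb_set(..., create => false)
 *     JB_PATH_DELETE                jsonb_delete_path() and #-
 *     JB_PATH_INSERT_BEFORE/_AFTER  jsonb_insert()
 *     JB_PATH_FILL_GAPS,
 *     JB_PATH_CONSISTENT_POSITION   subscripted assignment, via
 *                                   jsonb_set_element()
 */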
+ pushJsonbValue(st, WJB_VALUE, newval); } for (i = 0; i < npairs; i++) @@ -5344,13 +5313,13 @@ setPathObject(JsonbIterator **it, Datum *path_elems, bool *path_nulls, r = JsonbIteratorNext(it, &v, true); /* skip value */ if (!(op_type & JB_PATH_DELETE)) { - (void) pushJsonbValue(st, WJB_KEY, &k); - (void) pushJsonbValue(st, WJB_VALUE, newval); + pushJsonbValue(st, WJB_KEY, &k); + pushJsonbValue(st, WJB_VALUE, newval); } } else { - (void) pushJsonbValue(st, r, &k); + pushJsonbValue(st, r, &k); setPath(it, path_elems, path_nulls, path_len, st, level + 1, newval, op_type); } @@ -5366,13 +5335,13 @@ setPathObject(JsonbIterator **it, Datum *path_elems, bool *path_nulls, newkey.val.string.val = VARDATA_ANY(pathelem); newkey.val.string.len = VARSIZE_ANY_EXHDR(pathelem); - (void) pushJsonbValue(st, WJB_KEY, &newkey); - (void) pushJsonbValue(st, WJB_VALUE, newval); + pushJsonbValue(st, WJB_KEY, &newkey); + pushJsonbValue(st, WJB_VALUE, newval); } - (void) pushJsonbValue(st, r, &k); + pushJsonbValue(st, r, &k); r = JsonbIteratorNext(it, &v, false); - (void) pushJsonbValue(st, r, r < WJB_BEGIN_ARRAY ? &v : NULL); + pushJsonbValue(st, r, r < WJB_BEGIN_ARRAY ? &v : NULL); if (r == WJB_BEGIN_ARRAY || r == WJB_BEGIN_OBJECT) { int walking_level = 1; @@ -5386,7 +5355,7 @@ setPathObject(JsonbIterator **it, Datum *path_elems, bool *path_nulls, if (r == WJB_END_ARRAY || r == WJB_END_OBJECT) --walking_level; - (void) pushJsonbValue(st, r, r < WJB_BEGIN_ARRAY ? &v : NULL); + pushJsonbValue(st, r, r < WJB_BEGIN_ARRAY ? &v : NULL); } } } @@ -5410,9 +5379,8 @@ setPathObject(JsonbIterator **it, Datum *path_elems, bool *path_nulls, newkey.val.string.val = VARDATA_ANY(pathelem); newkey.val.string.len = VARSIZE_ANY_EXHDR(pathelem); - (void) pushJsonbValue(st, WJB_KEY, &newkey); - (void) push_path(st, level, path_elems, path_nulls, - path_len, newval); + pushJsonbValue(st, WJB_KEY, &newkey); + push_path(st, level, path_elems, path_nulls, path_len, newval); /* Result is closed with WJB_END_OBJECT outside of this function */ } @@ -5422,8 +5390,8 @@ setPathObject(JsonbIterator **it, Datum *path_elems, bool *path_nulls, * Array walker for setPath */ static void -setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls, - int path_len, JsonbParseState **st, int level, +setPathArray(JsonbIterator **it, const Datum *path_elems, const bool *path_nulls, + int path_len, JsonbInState *st, int level, JsonbValue *newval, uint32 nelems, int op_type) { JsonbValue v; @@ -5491,7 +5459,7 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls, if (op_type & JB_PATH_FILL_GAPS && nelems == 0 && idx > 0) push_null_elements(st, idx); - (void) pushJsonbValue(st, WJB_ELEM, newval); + pushJsonbValue(st, WJB_ELEM, newval); done = true; } @@ -5510,7 +5478,7 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls, r = JsonbIteratorNext(it, &v, true); /* skip */ if (op_type & (JB_PATH_INSERT_BEFORE | JB_PATH_CREATE)) - (void) pushJsonbValue(st, WJB_ELEM, newval); + pushJsonbValue(st, WJB_ELEM, newval); /* * We should keep current value only in case of @@ -5518,20 +5486,20 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls, * otherwise it should be deleted or replaced */ if (op_type & (JB_PATH_INSERT_AFTER | JB_PATH_INSERT_BEFORE)) - (void) pushJsonbValue(st, r, &v); + pushJsonbValue(st, r, &v); if (op_type & (JB_PATH_INSERT_AFTER | JB_PATH_REPLACE)) - (void) pushJsonbValue(st, WJB_ELEM, newval); + pushJsonbValue(st, WJB_ELEM, newval); } else - (void) setPath(it, path_elems, 
path_nulls, path_len, - st, level + 1, newval, op_type); + setPath(it, path_elems, path_nulls, path_len, + st, level + 1, newval, op_type); } else { r = JsonbIteratorNext(it, &v, false); - (void) pushJsonbValue(st, r, r < WJB_BEGIN_ARRAY ? &v : NULL); + pushJsonbValue(st, r, r < WJB_BEGIN_ARRAY ? &v : NULL); if (r == WJB_BEGIN_ARRAY || r == WJB_BEGIN_OBJECT) { @@ -5546,7 +5514,7 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls, if (r == WJB_END_ARRAY || r == WJB_END_OBJECT) --walking_level; - (void) pushJsonbValue(st, r, r < WJB_BEGIN_ARRAY ? &v : NULL); + pushJsonbValue(st, r, r < WJB_BEGIN_ARRAY ? &v : NULL); } } } @@ -5561,7 +5529,7 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls, if (op_type & JB_PATH_FILL_GAPS && idx > nelems) push_null_elements(st, idx - nelems); - (void) pushJsonbValue(st, WJB_ELEM, newval); + pushJsonbValue(st, WJB_ELEM, newval); done = true; } @@ -5580,8 +5548,7 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls, if (idx > 0) push_null_elements(st, idx - nelems); - (void) push_path(st, level, path_elems, path_nulls, - path_len, newval); + push_path(st, level, path_elems, path_nulls, path_len, newval); /* Result is closed with WJB_END_OBJECT outside of this function */ } @@ -5733,8 +5700,8 @@ iterate_json_values(text *json, uint32 flags, void *action_state, JsonIterateStringValuesAction action) { JsonLexContext lex; - JsonSemAction *sem = palloc0(sizeof(JsonSemAction)); - IterateJsonStringValuesState *state = palloc0(sizeof(IterateJsonStringValuesState)); + JsonSemAction *sem = palloc0_object(JsonSemAction); + IterateJsonStringValuesState *state = palloc0_object(IterateJsonStringValuesState); state->lex = makeJsonLexContext(&lex, json, true); state->action = action; @@ -5807,10 +5774,9 @@ transform_jsonb_string_values(Jsonb *jsonb, void *action_state, JsonTransformStringValuesAction transform_action) { JsonbIterator *it; - JsonbValue v, - *res = NULL; + JsonbValue v; JsonbIteratorToken type; - JsonbParseState *st = NULL; + JsonbInState st = {0}; text *out; bool is_scalar = false; @@ -5826,27 +5792,27 @@ transform_jsonb_string_values(Jsonb *jsonb, void *action_state, out = pg_detoast_datum_packed(out); v.val.string.val = VARDATA_ANY(out); v.val.string.len = VARSIZE_ANY_EXHDR(out); - res = pushJsonbValue(&st, type, type < WJB_BEGIN_ARRAY ? &v : NULL); + pushJsonbValue(&st, type, type < WJB_BEGIN_ARRAY ? &v : NULL); } else { - res = pushJsonbValue(&st, type, (type == WJB_KEY || - type == WJB_VALUE || - type == WJB_ELEM) ? &v : NULL); + pushJsonbValue(&st, type, (type == WJB_KEY || + type == WJB_VALUE || + type == WJB_ELEM) ? &v : NULL); } } - if (res->type == jbvArray) - res->val.array.rawScalar = is_scalar; + if (st.result->type == jbvArray) + st.result->val.array.rawScalar = is_scalar; - return JsonbValueToJsonb(res); + return JsonbValueToJsonb(st.result); } /* * Iterate over a json, and apply a specified JsonTransformStringValuesAction * to every string value or element. Any necessary context for a * JsonTransformStringValuesAction can be passed in the action_state variable. - * Function returns a StringInfo, which is a copy of an original json with + * Function returns a Text Datum, which is a copy of an original json with * transformed values. 
*/ text * @@ -5854,11 +5820,14 @@ transform_json_string_values(text *json, void *action_state, JsonTransformStringValuesAction transform_action) { JsonLexContext lex; - JsonSemAction *sem = palloc0(sizeof(JsonSemAction)); - TransformJsonStringValuesState *state = palloc0(sizeof(TransformJsonStringValuesState)); + JsonSemAction *sem = palloc0_object(JsonSemAction); + TransformJsonStringValuesState *state = palloc0_object(TransformJsonStringValuesState); + StringInfoData strbuf; + + initStringInfo(&strbuf); state->lex = makeJsonLexContext(&lex, json, true); - state->strval = makeStringInfo(); + state->strval = &strbuf; state->action = transform_action; state->action_state = action_state; diff --git a/src/backend/utils/adt/jsonpath_exec.c b/src/backend/utils/adt/jsonpath_exec.c index dbab24737ef1f..fc0e05e878916 100644 --- a/src/backend/utils/adt/jsonpath_exec.c +++ b/src/backend/utils/adt/jsonpath_exec.c @@ -252,7 +252,8 @@ typedef JsonPathBool (*JsonPathPredicateCallback) (JsonPathItem *jsp, JsonbValue *larg, JsonbValue *rarg, void *param); -typedef Numeric (*BinaryArithmFunc) (Numeric num1, Numeric num2, bool *error); +typedef Numeric (*BinaryArithmFunc) (Numeric num1, Numeric num2, + Node *escontext); static JsonPathExecResult executeJsonPath(JsonPath *path, void *vars, JsonPathGetVarCallback getVar, @@ -774,7 +775,7 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, break; } - v = hasNext ? &vbuf : palloc(sizeof(*v)); + v = hasNext ? &vbuf : palloc_object(JsonbValue); baseObject = cxt->baseObject; getJsonPathItem(cxt, jsp, v); @@ -808,23 +809,23 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, case jpiAdd: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_add_opt_error, found); + numeric_add_safe, found); case jpiSub: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_sub_opt_error, found); + numeric_sub_safe, found); case jpiMul: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_mul_opt_error, found); + numeric_mul_safe, found); case jpiDiv: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_div_opt_error, found); + numeric_div_safe, found); case jpiMod: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_mod_opt_error, found); + numeric_mod_safe, found); case jpiPlus: return executeUnaryArithmExpr(cxt, jsp, jb, NULL, found); @@ -1087,7 +1088,7 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, case jpiType: { - JsonbValue *jbv = palloc(sizeof(*jbv)); + JsonbValue *jbv = palloc_object(JsonbValue); jbv->type = jbvString; jbv->val.string.val = pstrdup(JsonbTypeName(jb)); @@ -1117,7 +1118,7 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, size = 1; } - jb = palloc(sizeof(*jb)); + jb = palloc_object(JsonbValue); jb->type = jbvNumeric; jb->val.numeric = int64_to_numeric(size); @@ -1248,7 +1249,7 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, last = cxt->innermostArraySize - 1; - lastjbv = hasNext ? &tmpjbv : palloc(sizeof(*lastjbv)); + lastjbv = hasNext ? 
&tmpjbv : palloc_object(JsonbValue); lastjbv->type = jbvNumeric; lastjbv->val.numeric = int64_to_numeric(last); @@ -1269,11 +1270,12 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jb->type == jbvNumeric) { - bool have_error; + ErrorSaveContext escontext = {T_ErrorSaveContext}; int64 val; - val = numeric_int8_opt_error(jb->val.numeric, &have_error); - if (have_error) + val = numeric_int8_safe(jb->val.numeric, + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("argument \"%s\" of jsonpath item method .%s() is invalid for type %s", @@ -1466,7 +1468,6 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, Datum dtypmod; int32 precision; int32 scale = 0; - bool have_error; bool noerr; ArrayType *arrtypmod; Datum datums[2]; @@ -1478,9 +1479,9 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (elem.type != jpiNumeric) elog(ERROR, "invalid jsonpath item type for .decimal() precision"); - precision = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + precision = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("precision of jsonpath item method .%s() is out of range for type integer", @@ -1492,9 +1493,9 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (elem.type != jpiNumeric) elog(ERROR, "invalid jsonpath item type for .decimal() scale"); - scale = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + scale = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("scale of jsonpath item method .%s() is out of range for type integer", @@ -1517,7 +1518,7 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, /* Convert numstr to Numeric with typmod */ Assert(numstr != NULL); noerr = DirectInputFunctionCallSafe(numeric_in, numstr, - InvalidOid, dtypmod, + InvalidOid, DatumGetInt32(dtypmod), (Node *) &escontext, &numdatum); @@ -1550,11 +1551,12 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jb->type == jbvNumeric) { - bool have_error; int32 val; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - val = numeric_int4_opt_error(jb->val.numeric, &have_error); - if (have_error) + val = numeric_int4_safe(jb->val.numeric, + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("argument \"%s\" of jsonpath item method .%s() is invalid for type %s", @@ -2149,18 +2151,18 @@ executeBinaryArithmExpr(JsonPathExecContext *cxt, JsonPathItem *jsp, } else { - bool error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - res = func(lval->val.numeric, rval->val.numeric, &error); + res = func(lval->val.numeric, rval->val.numeric, (Node *) &escontext); - if (error) + if (escontext.error_occurred) return jperError; } if (!jspGetNext(jsp, &elem) && !found) return jperOk; - lval = palloc(sizeof(*lval)); + lval = palloc_object(JsonbValue); lval->type = jbvNumeric; lval->val.numeric = res; @@ -2315,7 +2317,7 @@ executeNumericItemMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, if (!jspGetNext(jsp, &next) && !found) return jperOk; - jb = palloc(sizeof(*jb)); + jb = 
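/*
 * The numeric_*_opt_error() to numeric_*_safe() conversions throughout
 * this file follow the soft-error pattern of nodes/miscnodes.h: the
 * callee records any failure in an ErrorSaveContext instead of throwing,
 * and the caller tests error_occurred.  Sketched:
 *
 *     ErrorSaveContext escontext = {T_ErrorSaveContext};
 *     int32 ival = numeric_int4_safe(num, (Node *) &escontext);
 *
 *     if (escontext.error_occurred)
 *         ... report or recover without ereport(ERROR) ...
 */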
palloc_object(JsonbValue); jb->type = jbvNumeric; jb->val.numeric = DatumGetNumeric(datum); @@ -2433,7 +2435,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jsp->type != jpiDatetime && jsp->type != jpiDate && jsp->content.arg) { - bool have_error; + ErrorSaveContext escontext = {T_ErrorSaveContext}; jspGetArg(jsp, &elem); @@ -2441,9 +2443,9 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, elog(ERROR, "invalid jsonpath item type for %s argument", jspOperationName(jsp->type)); - time_precision = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + time_precision = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION), errmsg("time precision of jsonpath item method .%s() is out of range for type integer", @@ -2781,7 +2783,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, if (!hasNext && !found) return res; - jb = hasNext ? &jbvbuf : palloc(sizeof(*jb)); + jb = hasNext ? &jbvbuf : palloc_object(JsonbValue); jb->type = jbvDatetime; jb->val.datetime.value = value; @@ -2872,8 +2874,7 @@ executeKeyValueMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, { JsonBaseObjectInfo baseObject; JsonbValue obj; - JsonbParseState *ps; - JsonbValue *keyval; + JsonbInState ps; Jsonb *jsonb; if (tok != WJB_KEY) @@ -2887,7 +2888,8 @@ executeKeyValueMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, tok = JsonbIteratorNext(&it, &val, true); Assert(tok == WJB_VALUE); - ps = NULL; + memset(&ps, 0, sizeof(ps)); + pushJsonbValue(&ps, WJB_BEGIN_OBJECT, NULL); pushJsonbValue(&ps, WJB_KEY, &keystr); @@ -2899,9 +2901,9 @@ executeKeyValueMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, pushJsonbValue(&ps, WJB_KEY, &idstr); pushJsonbValue(&ps, WJB_VALUE, &idval); - keyval = pushJsonbValue(&ps, WJB_END_OBJECT, NULL); + pushJsonbValue(&ps, WJB_END_OBJECT, NULL); - jsonb = JsonbValueToJsonb(keyval); + jsonb = JsonbValueToJsonb(ps.result); JsonbInitBinary(&obj, jsonb); @@ -3016,7 +3018,7 @@ GetJsonPathVar(void *cxt, char *varName, int varNameLen, return NULL; } - result = palloc(sizeof(JsonbValue)); + result = palloc_object(JsonbValue); if (var->isnull) { *baseObjectId = 0; @@ -3074,8 +3076,8 @@ JsonItemFromDatum(Datum val, Oid typid, int32 typmod, JsonbValue *res) case TEXTOID: case VARCHAROID: res->type = jbvString; - res->val.string.val = VARDATA_ANY(val); - res->val.string.len = VARSIZE_ANY_EXHDR(val); + res->val.string.val = VARDATA_ANY(DatumGetPointer(val)); + res->val.string.len = VARSIZE_ANY_EXHDR(DatumGetPointer(val)); break; case DATEOID: case TIMEOID: @@ -3443,7 +3445,7 @@ compareNumeric(Numeric a, Numeric b) static JsonbValue * copyJsonbValue(JsonbValue *src) { - JsonbValue *dst = palloc(sizeof(*dst)); + JsonbValue *dst = palloc_object(JsonbValue); *dst = *src; @@ -3462,7 +3464,7 @@ getArrayIndex(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbValue *jb, JsonValueList found = {0}; JsonPathExecResult res = executeItem(cxt, jsp, jb, &found); Datum numeric_index; - bool have_error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; if (jperIsError(res)) return res; @@ -3477,10 +3479,10 @@ getArrayIndex(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbValue *jb, NumericGetDatum(jbv->val.numeric), Int32GetDatum(0)); - *index = numeric_int4_opt_error(DatumGetNumeric(numeric_index), - &have_error); + *index = numeric_int4_safe(DatumGetNumeric(numeric_index), + (Node *) 
&escontext); - if (have_error) + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_INVALID_SQL_JSON_SUBSCRIPT), errmsg("jsonpath array subscript is out of integer range")))); @@ -3647,7 +3649,7 @@ getScalar(JsonbValue *scalar, enum jbvType type) static JsonbValue * wrapItemsInArray(const JsonValueList *items) { - JsonbParseState *ps = NULL; + JsonbInState ps = {0}; JsonValueListIterator it; JsonbValue *jbv; @@ -3657,7 +3659,9 @@ wrapItemsInArray(const JsonValueList *items) while ((jbv = JsonValueListNext(items, &it))) pushJsonbValue(&ps, WJB_ELEM, jbv); - return pushJsonbValue(&ps, WJB_END_ARRAY, NULL); + pushJsonbValue(&ps, WJB_END_ARRAY, NULL); + + return ps.result; } /* Check if the timezone required for casting from type1 to type2 is used */ @@ -4117,7 +4121,7 @@ JsonTableInitOpaque(TableFuncScanState *state, int natts) JsonExpr *je = castNode(JsonExpr, tf->docexpr); List *args = NIL; - cxt = palloc0(sizeof(JsonTableExecContext)); + cxt = palloc0_object(JsonTableExecContext); cxt->magic = JSON_TABLE_EXEC_CONTEXT_MAGIC; /* @@ -4136,7 +4140,7 @@ JsonTableInitOpaque(TableFuncScanState *state, int natts) { ExprState *state = lfirst_node(ExprState, exprlc); String *name = lfirst_node(String, namelc); - JsonPathVariable *var = palloc(sizeof(*var)); + JsonPathVariable *var = palloc_object(JsonPathVariable); var->name = pstrdup(name->sval); var->namelen = strlen(var->name); @@ -4154,8 +4158,7 @@ JsonTableInitOpaque(TableFuncScanState *state, int natts) } } - cxt->colplanstates = palloc(sizeof(JsonTablePlanState *) * - list_length(tf->colvalexprs)); + cxt->colplanstates = palloc_array(JsonTablePlanState *, list_length(tf->colvalexprs)); /* * Initialize plan for the root path and, recursively, also any child @@ -4193,7 +4196,7 @@ JsonTableInitPlan(JsonTableExecContext *cxt, JsonTablePlan *plan, JsonTablePlanState *parentstate, List *args, MemoryContext mcxt) { - JsonTablePlanState *planstate = palloc0(sizeof(*planstate)); + JsonTablePlanState *planstate = palloc0_object(JsonTablePlanState); planstate->plan = plan; planstate->parent = parentstate; diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y index 499745a8fef65..411a8baf380fa 100644 --- a/src/backend/utils/adt/jsonpath_gram.y +++ b/src/backend/utils/adt/jsonpath_gram.y @@ -120,7 +120,7 @@ static bool makeItemLikeRegex(JsonPathParseItem *expr, result: mode expr_or_predicate { - *result = palloc(sizeof(JsonPathParseResult)); + *result = palloc_object(JsonPathParseResult); (*result)->expr = $2; (*result)->lax = $1; (void) yynerrs; @@ -384,7 +384,7 @@ method: static JsonPathParseItem * makeItemType(JsonPathItemType type) { - JsonPathParseItem *v = palloc(sizeof(*v)); + JsonPathParseItem *v = palloc_object(JsonPathParseItem); CHECK_FOR_INTERRUPTS(); diff --git a/src/backend/utils/adt/jsonpath_internal.h b/src/backend/utils/adt/jsonpath_internal.h index f78069857d02b..19567aca6f775 100644 --- a/src/backend/utils/adt/jsonpath_internal.h +++ b/src/backend/utils/adt/jsonpath_internal.h @@ -22,10 +22,7 @@ typedef struct JsonPathString int total; } JsonPathString; -#ifndef YY_TYPEDEF_YY_SCANNER_T -#define YY_TYPEDEF_YY_SCANNER_T typedef void *yyscan_t; -#endif #include "utils/jsonpath.h" #include "jsonpath_gram.h" diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l index c7aab83eeb4f6..8c3a0a9c64241 100644 --- a/src/backend/utils/adt/jsonpath_scan.l +++ b/src/backend/utils/adt/jsonpath_scan.l @@ -574,7 +574,7 @@ hexval(char c, int *result, struct 
Node *escontext, yyscan_t yyscanner) /* Add given unicode character to scanstring */ static bool -addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner) +addUnicodeChar(char32_t ch, struct Node *escontext, yyscan_t yyscanner) { if (ch == 0) { @@ -607,7 +607,7 @@ addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner) /* Add unicode character, processing any surrogate pairs */ static bool -addUnicode(int ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner) +addUnicode(char32_t ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner) { if (is_utf16_surrogate_first(ch)) { @@ -655,7 +655,7 @@ parseUnicode(char *s, int l, struct Node *escontext, yyscan_t yyscanner) for (i = 2; i < l; i += 2) /* skip '\u' */ { - int ch = 0; + char32_t ch = 0; int j, si; diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 7f4cf6145854a..2898026430780 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -43,8 +43,8 @@ static text *MB_do_like_escape(text *pat, text *esc); static int UTF8_MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale); -static int SB_IMatchText(const char *t, int tlen, const char *p, int plen, - pg_locale_t locale); +static int C_IMatchText(const char *t, int tlen, const char *p, int plen, + pg_locale_t locale); static int GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation); static int Generic_Text_IC_like(text *str, text *pat, Oid collation); @@ -84,22 +84,10 @@ wchareq(const char *p1, const char *p2) * of getting a single character transformed to the system's wchar_t format. * So now, we just downcase the strings using lower() and apply regular LIKE * comparison. This should be revisited when we install better locale support. - */ - -/* - * We do handle case-insensitive matching for single-byte encodings using + * + * We do handle case-insensitive matching for the C locale using * fold-on-the-fly processing, however. */ -static char -SB_lower_char(unsigned char c, pg_locale_t locale) -{ - if (locale->ctype_is_c) - return pg_ascii_tolower(c); - else if (locale->is_default) - return pg_tolower(c); - else - return tolower_l(c, locale->info.lt); -} #define NextByte(p, plen) ((p)++, (plen)--) @@ -130,10 +118,10 @@ SB_lower_char(unsigned char c, pg_locale_t locale) #include "like_match.c" -/* setup to compile like_match.c for single byte case insensitive matches */ -#define MATCH_LOWER(t, locale) SB_lower_char((unsigned char) (t), locale) +/* setup to compile like_match.c for case-insensitive matches in C locale */ +#define MATCH_LOWER #define NextChar(p, plen) NextByte((p), (plen)) -#define MatchText SB_IMatchText +#define MatchText C_IMatchText #include "like_match.c" @@ -202,35 +190,37 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) errmsg("nondeterministic collations are not supported for ILIKE"))); /* - * For efficiency reasons, in the single byte case we don't call lower() - * on the pattern and text, but instead call SB_lower_char on each - * character. In the multi-byte case we don't have much choice :-(. Also, - * ICU does not support single-character case folding, so we go the long - * way. + * For efficiency reasons, in the C locale we don't call lower() on the + * pattern and text, but instead lowercase each character lazily. + * + * XXX: use casefolding instead? 
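 *
 * (The fallback branch below instead downcases both pattern and text up
 * front with lower() and runs the ordinary LIKE machinery; only the C
 * locale can safely fold one byte at a time, since no multibyte or
 * locale-dependent case mapping applies there.)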
*/ - if (pg_database_encoding_max_length() > 1 || (locale->provider == COLLPROVIDER_ICU)) + if (locale->ctype_is_c) { - pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, - PointerGetDatum(pat))); p = VARDATA_ANY(pat); plen = VARSIZE_ANY_EXHDR(pat); - str = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, - PointerGetDatum(str))); s = VARDATA_ANY(str); slen = VARSIZE_ANY_EXHDR(str); - if (GetDatabaseEncoding() == PG_UTF8) - return UTF8_MatchText(s, slen, p, plen, 0); - else - return MB_MatchText(s, slen, p, plen, 0); + return C_IMatchText(s, slen, p, plen, locale); } else { + pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, + PointerGetDatum(pat))); p = VARDATA_ANY(pat); plen = VARSIZE_ANY_EXHDR(pat); + str = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, + PointerGetDatum(str))); s = VARDATA_ANY(str); slen = VARSIZE_ANY_EXHDR(str); - return SB_IMatchText(s, slen, p, plen, locale); + + if (GetDatabaseEncoding() == PG_UTF8) + return UTF8_MatchText(s, slen, p, plen, 0); + else if (pg_database_encoding_max_length() > 1) + return MB_MatchText(s, slen, p, plen, 0); + else + return SB_MatchText(s, slen, p, plen, 0); } } diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c index 892f8a745ea43..54846c9541d89 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -70,10 +70,14 @@ *-------------------- */ +/* + * MATCH_LOWER is defined for ILIKE in the C locale as an optimization. Other + * locales must casefold the inputs before matching. + */ #ifdef MATCH_LOWER -#define GETCHAR(t, locale) MATCH_LOWER(t, locale) +#define GETCHAR(t) pg_ascii_tolower(t) #else -#define GETCHAR(t, locale) (t) +#define GETCHAR(t) (t) #endif static int @@ -105,7 +109,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), errmsg("LIKE pattern must not end with escape character"))); - if (GETCHAR(*p, locale) != GETCHAR(*t, locale)) + if (GETCHAR(*p) != GETCHAR(*t)) return LIKE_FALSE; } else if (*p == '%') @@ -167,14 +171,14 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), errmsg("LIKE pattern must not end with escape character"))); - firstpat = GETCHAR(p[1], locale); + firstpat = GETCHAR(p[1]); } else - firstpat = GETCHAR(*p, locale); + firstpat = GETCHAR(*p); while (tlen > 0) { - if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic)) + if (GETCHAR(*t) == firstpat || (locale && !locale->deterministic)) { int matched = MatchText(t, tlen, p, plen, locale); @@ -342,7 +346,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) NextChar(t1, t1len); } } - else if (GETCHAR(*p, locale) != GETCHAR(*t, locale)) + else if (GETCHAR(*p) != GETCHAR(*t)) { /* non-wildcard pattern char fails to match text char */ return LIKE_FALSE; diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 8fdc677371f4d..b1b0192aa4673 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -99,8 +99,6 @@ static Selectivity like_selectivity(const char *patt, int pattlen, static Selectivity regex_selectivity(const char *patt, int pattlen, bool case_insensitive, int fixed_prefix_len); -static int pattern_char_isalpha(char c, bool is_multibyte, - pg_locale_t locale); static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid 
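/*
 * Background for the changes below: like_fixed_prefix() extracts a
 * pattern's leading fixed prefix so the planner can derive index range
 * bounds; e.g. col LIKE 'abc%' yields the prefix "abc" and hence
 * col >= 'abc' AND col < 'abd' (via make_greater_string()).  The
 * case-insensitive logic moves to like_fixed_prefix_ci(), which works
 * in pg_wchar space so that multibyte cased characters also terminate
 * the prefix.
 */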
collation); static Datum string_to_datum(const char *str, Oid datatype); @@ -986,8 +984,8 @@ icnlikejoinsel(PG_FUNCTION_ARGS) */ static Pattern_Prefix_Status -like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, - Const **prefix_const, Selectivity *rest_selec) +like_fixed_prefix(Const *patt_const, Const **prefix_const, + Selectivity *rest_selec) { char *match; char *patt; @@ -995,34 +993,10 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Oid typeid = patt_const->consttype; int pos, match_pos; - bool is_multibyte = (pg_database_encoding_max_length() > 1); - pg_locale_t locale = 0; /* the right-hand const is type text or bytea */ Assert(typeid == BYTEAOID || typeid == TEXTOID); - if (case_insensitive) - { - if (typeid == BYTEAOID) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("case insensitive matching not supported on type bytea"))); - - if (!OidIsValid(collation)) - { - /* - * This typically means that the parser could not resolve a - * conflict of implicit collations, so report it that way. - */ - ereport(ERROR, - (errcode(ERRCODE_INDETERMINATE_COLLATION), - errmsg("could not determine which collation to use for ILIKE"), - errhint("Use the COLLATE clause to set the collation explicitly."))); - } - - locale = pg_newlocale_from_collation(collation); - } - if (typeid != BYTEAOID) { patt = TextDatumGetCString(patt_const->constvalue); @@ -1035,7 +1009,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, pattlen = VARSIZE_ANY_EXHDR(bstr); patt = (char *) palloc(pattlen); memcpy(patt, VARDATA_ANY(bstr), pattlen); - Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue)); + Assert(bstr == DatumGetPointer(patt_const->constvalue)); } match = palloc(pattlen + 1); @@ -1055,11 +1029,6 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, break; } - /* Stop if case-varying character (it's sort of a wildcard) */ - if (case_insensitive && - pattern_char_isalpha(patt[pos], is_multibyte, locale)) - break; - match[match_pos++] = patt[pos]; } @@ -1071,8 +1040,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, *prefix_const = string_to_bytea_const(match, match_pos); if (rest_selec != NULL) - *rest_selec = like_selectivity(&patt[pos], pattlen - pos, - case_insensitive); + *rest_selec = like_selectivity(&patt[pos], pattlen - pos, false); pfree(patt); pfree(match); @@ -1087,6 +1055,112 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, return Pattern_Prefix_None; } +/* + * Case-insensitive variant of like_fixed_prefix(). Multibyte and + * locale-aware for detecting cased characters. + */ +static Pattern_Prefix_Status +like_fixed_prefix_ci(Const *patt_const, Oid collation, Const **prefix_const, + Selectivity *rest_selec) +{ + text *val = DatumGetTextPP(patt_const->constvalue); + Oid typeid = patt_const->consttype; + int nbytes = VARSIZE_ANY_EXHDR(val); + int wpos; + pg_wchar *wpatt; + int wpattlen; + pg_wchar *wmatch; + int wmatch_pos = 0; + char *match; + int match_mblen; + pg_locale_t locale = 0; + + /* the right-hand const is type text or bytea */ + Assert(typeid == BYTEAOID || typeid == TEXTOID); + + if (typeid == BYTEAOID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("case insensitive matching not supported on type bytea"))); + + if (!OidIsValid(collation)) + { + /* + * This typically means that the parser could not resolve a conflict + * of implicit collations, so report it that way. 
+ */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for ILIKE"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + + locale = pg_newlocale_from_collation(collation); + + wpatt = palloc((nbytes + 1) * sizeof(pg_wchar)); + wpattlen = pg_mb2wchar_with_len(VARDATA_ANY(val), wpatt, nbytes); + + wmatch = palloc((nbytes + 1) * sizeof(pg_wchar)); + for (wpos = 0; wpos < wpattlen; wpos++) + { + /* % and _ are wildcard characters in LIKE */ + if (wpatt[wpos] == '%' || + wpatt[wpos] == '_') + break; + + /* Backslash escapes the next character */ + if (wpatt[wpos] == '\\') + { + wpos++; + if (wpos >= wpattlen) + break; + } + + /* + * For ILIKE, stop if it's a case-varying character (it's sort of a + * wildcard). + */ + if (pg_iswcased(wpatt[wpos], locale)) + break; + + wmatch[wmatch_pos++] = wpatt[wpos]; + } + + wmatch[wmatch_pos] = '\0'; + + match = palloc(pg_database_encoding_max_length() * wmatch_pos + 1); + match_mblen = pg_wchar2mb_with_len(wmatch, match, wmatch_pos); + match[match_mblen] = '\0'; + pfree(wmatch); + + *prefix_const = string_to_const(match, TEXTOID); + pfree(match); + + if (rest_selec != NULL) + { + int wrestlen = wpattlen - wmatch_pos; + char *rest; + int rest_mblen; + + rest = palloc(pg_database_encoding_max_length() * wrestlen + 1); + rest_mblen = pg_wchar2mb_with_len(&wpatt[wmatch_pos], rest, wrestlen); + + *rest_selec = like_selectivity(rest, rest_mblen, true); + pfree(rest); + } + + pfree(wpatt); + + /* in LIKE, an empty pattern is an exact match! */ + if (wpos == wpattlen) + return Pattern_Prefix_Exact; /* reached end of pattern, so exact */ + + if (wmatch_pos > 0) + return Pattern_Prefix_Partial; + + return Pattern_Prefix_None; +} + static Pattern_Prefix_Status regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Selectivity *rest_selec) @@ -1164,12 +1238,11 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation, switch (ptype) { case Pattern_Type_Like: - result = like_fixed_prefix(patt, false, collation, - prefix, rest_selec); + result = like_fixed_prefix(patt, prefix, rest_selec); break; case Pattern_Type_Like_IC: - result = like_fixed_prefix(patt, true, collation, - prefix, rest_selec); + result = like_fixed_prefix_ci(patt, collation, prefix, + rest_selec); break; case Pattern_Type_Regex: result = regex_fixed_prefix(patt, false, collation, @@ -1481,29 +1554,6 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive, return sel; } -/* - * Check whether char is a letter (and, hence, subject to case-folding) - * - * In multibyte character sets or with ICU, we can't use isalpha, and it does - * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha. - * Instead, just assume any non-ASCII char is potentially case-varying, and - * hard-wire knowledge of which ASCII chars are letters. 
- */ -static int -pattern_char_isalpha(char c, bool is_multibyte, - pg_locale_t locale) -{ - if (locale->ctype_is_c) - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - else if (is_multibyte && IS_HIGHBIT_SET(c)) - return true; - else if (locale->provider != COLLPROVIDER_LIBC) - return IS_HIGHBIT_SET(c) || - (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - else - return isalpha_l((unsigned char) c, locale->info.lt); -} - /* * For bytea, the increment function need only increment the current byte @@ -1582,7 +1632,7 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) len = VARSIZE_ANY_EXHDR(bstr); workstr = (char *) palloc(len); memcpy(workstr, VARDATA_ANY(bstr), len); - Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue)); + Assert(bstr == DatumGetPointer(str_const->constvalue)); cmpstr = str_const->constvalue; } else diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index 00e67fb46d074..bf38d68aa0307 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -152,7 +152,7 @@ pg_lock_status(PG_FUNCTION_ARGS) * Collect all the locking information that we will format and send * out as a result set. */ - mystatus = (PG_Lock_Status *) palloc(sizeof(PG_Lock_Status)); + mystatus = palloc_object(PG_Lock_Status); funcctx->user_fctx = mystatus; mystatus->lockData = GetLockStatusData(); @@ -398,15 +398,15 @@ pg_lock_status(PG_FUNCTION_ARGS) values[0] = CStringGetTextDatum(PredicateLockTagTypeNames[lockType]); /* lock target */ - values[1] = GET_PREDICATELOCKTARGETTAG_DB(*predTag); - values[2] = GET_PREDICATELOCKTARGETTAG_RELATION(*predTag); + values[1] = ObjectIdGetDatum(GET_PREDICATELOCKTARGETTAG_DB(*predTag)); + values[2] = ObjectIdGetDatum(GET_PREDICATELOCKTARGETTAG_RELATION(*predTag)); if (lockType == PREDLOCKTAG_TUPLE) - values[4] = GET_PREDICATELOCKTARGETTAG_OFFSET(*predTag); + values[4] = UInt16GetDatum(GET_PREDICATELOCKTARGETTAG_OFFSET(*predTag)); else nulls[4] = true; if ((lockType == PREDLOCKTAG_TUPLE) || (lockType == PREDLOCKTAG_PAGE)) - values[3] = GET_PREDICATELOCKTARGETTAG_PAGE(*predTag); + values[3] = UInt32GetDatum(GET_PREDICATELOCKTARGETTAG_PAGE(*predTag)); else nulls[3] = true; diff --git a/src/backend/utils/adt/mac.c b/src/backend/utils/adt/mac.c index 3644e9735f5d0..bb12ed758cb11 100644 --- a/src/backend/utils/adt/mac.c +++ b/src/backend/utils/adt/mac.c @@ -101,7 +101,7 @@ macaddr_in(PG_FUNCTION_ARGS) (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("invalid octet value in \"macaddr\" value: \"%s\"", str))); - result = (macaddr *) palloc(sizeof(macaddr)); + result = palloc_object(macaddr); result->a = a; result->b = b; @@ -142,7 +142,7 @@ macaddr_recv(PG_FUNCTION_ARGS) StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); macaddr *addr; - addr = (macaddr *) palloc(sizeof(macaddr)); + addr = palloc_object(macaddr); addr->a = pq_getmsgbyte(buf); addr->b = pq_getmsgbyte(buf); @@ -289,7 +289,7 @@ macaddr_not(PG_FUNCTION_ARGS) macaddr *addr = PG_GETARG_MACADDR_P(0); macaddr *result; - result = (macaddr *) palloc(sizeof(macaddr)); + result = palloc_object(macaddr); result->a = ~addr->a; result->b = ~addr->b; result->c = ~addr->c; @@ -306,7 +306,7 @@ macaddr_and(PG_FUNCTION_ARGS) macaddr *addr2 = PG_GETARG_MACADDR_P(1); macaddr *result; - result = (macaddr *) palloc(sizeof(macaddr)); + result = palloc_object(macaddr); result->a = addr1->a & addr2->a; result->b = addr1->b & addr2->b; result->c = addr1->c & addr2->c; @@ -323,7 +323,7 @@ macaddr_or(PG_FUNCTION_ARGS) macaddr 
*addr2 = PG_GETARG_MACADDR_P(1); macaddr *result; - result = (macaddr *) palloc(sizeof(macaddr)); + result = palloc_object(macaddr); result->a = addr1->a | addr2->a; result->b = addr1->b | addr2->b; result->c = addr1->c | addr2->c; @@ -343,7 +343,7 @@ macaddr_trunc(PG_FUNCTION_ARGS) macaddr *addr = PG_GETARG_MACADDR_P(0); macaddr *result; - result = (macaddr *) palloc(sizeof(macaddr)); + result = palloc_object(macaddr); result->a = addr->a; result->b = addr->b; @@ -374,7 +374,7 @@ macaddr_sortsupport(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - uss = palloc(sizeof(macaddr_sortsupport_state)); + uss = palloc_object(macaddr_sortsupport_state); uss->input_count = 0; uss->estimating = true; initHyperLogLog(&uss->abbr_card, 10); @@ -481,33 +481,26 @@ macaddr_abbrev_convert(Datum original, SortSupport ssup) Datum res; /* - * On a 64-bit machine, zero out the 8-byte datum and copy the 6 bytes of - * the MAC address in. There will be two bytes of zero padding on the end - * of the least significant bits. + * Zero out the 8-byte Datum and copy in the 6 bytes of the MAC address. + * There will be two bytes of zero padding on the end of the least + * significant bits. */ -#if SIZEOF_DATUM == 8 - memset(&res, 0, SIZEOF_DATUM); + StaticAssertDecl(sizeof(res) >= sizeof(macaddr), + "Datum is too small for macaddr"); + memset(&res, 0, sizeof(res)); memcpy(&res, authoritative, sizeof(macaddr)); -#else /* SIZEOF_DATUM != 8 */ - memcpy(&res, authoritative, SIZEOF_DATUM); -#endif uss->input_count += 1; /* - * Cardinality estimation. The estimate uses uint32, so on a 64-bit - * architecture, XOR the two 32-bit halves together to produce slightly - * more entropy. The two zeroed bytes won't have any practical impact on - * this operation. + * Cardinality estimation. The estimate uses uint32, so XOR the two 32-bit + * halves together to produce slightly more entropy. The two zeroed bytes + * won't have any practical impact on this operation. 
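+ * (Note that the fold only feeds the HyperLogLog estimate; the sort
+ * itself still compares the full 64-bit abbreviated datums.)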
*/ if (uss->estimating) { uint32 tmp; -#if SIZEOF_DATUM == 8 - tmp = (uint32) res ^ (uint32) ((uint64) res >> 32); -#else /* SIZEOF_DATUM != 8 */ - tmp = (uint32) res; -#endif + tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); } diff --git a/src/backend/utils/adt/mac8.c b/src/backend/utils/adt/mac8.c index 08e41ba4eeabc..ea715a7a0d436 100644 --- a/src/backend/utils/adt/mac8.c +++ b/src/backend/utils/adt/mac8.c @@ -207,7 +207,7 @@ macaddr8_in(PG_FUNCTION_ARGS) else if (count != 8) goto fail; - result = (macaddr8 *) palloc0(sizeof(macaddr8)); + result = palloc0_object(macaddr8); result->a = a; result->b = b; @@ -256,7 +256,7 @@ macaddr8_recv(PG_FUNCTION_ARGS) StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); macaddr8 *addr; - addr = (macaddr8 *) palloc0(sizeof(macaddr8)); + addr = palloc0_object(macaddr8); addr->a = pq_getmsgbyte(buf); addr->b = pq_getmsgbyte(buf); @@ -417,7 +417,7 @@ macaddr8_not(PG_FUNCTION_ARGS) macaddr8 *addr = PG_GETARG_MACADDR8_P(0); macaddr8 *result; - result = (macaddr8 *) palloc0(sizeof(macaddr8)); + result = palloc0_object(macaddr8); result->a = ~addr->a; result->b = ~addr->b; result->c = ~addr->c; @@ -437,7 +437,7 @@ macaddr8_and(PG_FUNCTION_ARGS) macaddr8 *addr2 = PG_GETARG_MACADDR8_P(1); macaddr8 *result; - result = (macaddr8 *) palloc0(sizeof(macaddr8)); + result = palloc0_object(macaddr8); result->a = addr1->a & addr2->a; result->b = addr1->b & addr2->b; result->c = addr1->c & addr2->c; @@ -457,7 +457,7 @@ macaddr8_or(PG_FUNCTION_ARGS) macaddr8 *addr2 = PG_GETARG_MACADDR8_P(1); macaddr8 *result; - result = (macaddr8 *) palloc0(sizeof(macaddr8)); + result = palloc0_object(macaddr8); result->a = addr1->a | addr2->a; result->b = addr1->b | addr2->b; result->c = addr1->c | addr2->c; @@ -479,7 +479,7 @@ macaddr8_trunc(PG_FUNCTION_ARGS) macaddr8 *addr = PG_GETARG_MACADDR8_P(0); macaddr8 *result; - result = (macaddr8 *) palloc0(sizeof(macaddr8)); + result = palloc0_object(macaddr8); result->a = addr->a; result->b = addr->b; @@ -502,7 +502,7 @@ macaddr8_set7bit(PG_FUNCTION_ARGS) macaddr8 *addr = PG_GETARG_MACADDR8_P(0); macaddr8 *result; - result = (macaddr8 *) palloc0(sizeof(macaddr8)); + result = palloc0_object(macaddr8); result->a = addr->a | 0x02; result->b = addr->b; @@ -526,7 +526,7 @@ macaddrtomacaddr8(PG_FUNCTION_ARGS) macaddr *addr6 = PG_GETARG_MACADDR_P(0); macaddr8 *result; - result = (macaddr8 *) palloc0(sizeof(macaddr8)); + result = palloc0_object(macaddr8); result->a = addr6->a; result->b = addr6->b; @@ -547,7 +547,7 @@ macaddr8tomacaddr(PG_FUNCTION_ARGS) macaddr8 *addr = PG_GETARG_MACADDR8_P(0); macaddr *result; - result = (macaddr *) palloc0(sizeof(macaddr)); + result = palloc0_object(macaddr); if ((addr->d != 0xFF) || (addr->e != 0xFE)) ereport(ERROR, diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c index 396c2f223b4e1..46dfb3dd133f5 100644 --- a/src/backend/utils/adt/mcxtfuncs.c +++ b/src/backend/utils/adt/mcxtfuncs.c @@ -38,7 +38,7 @@ typedef struct MemoryContextId { MemoryContext context; int context_id; -} MemoryContextId; +} MemoryContextId; /* * int_list_to_array @@ -52,7 +52,7 @@ int_list_to_array(const List *list) ArrayType *result_array; length = list_length(list); - datum_array = (Datum *) palloc(length * sizeof(Datum)); + datum_array = palloc_array(Datum, length); foreach_int(i, list) datum_array[foreach_current_index(i)] = Int32GetDatum(i); diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build 
index 244f48f4fd711..9c4c62d41da14 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -1,5 +1,15 @@ # Copyright (c) 2022-2025, PostgreSQL Global Development Group +# Some code in numeric.c benefits from auto-vectorization +numeric_backend_lib = static_library('numeric_backend_lib', + 'numeric.c', + dependencies: backend_build_deps, + kwargs: internal_lib_args, + c_args: vectorize_cflags, +) + +backend_link_with += numeric_backend_lib + backend_sources += files( 'acl.c', 'amutils.c', @@ -12,6 +22,7 @@ backend_sources += files( 'arrayutils.c', 'ascii.c', 'bool.c', + 'bytea.c', 'cash.c', 'char.c', 'cryptohashfuncs.c', @@ -54,22 +65,24 @@ backend_sources += files( 'misc.c', 'multirangetypes.c', 'multirangetypes_selfuncs.c', + 'multixactfuncs.c', 'name.c', 'network.c', 'network_gist.c', 'network_selfuncs.c', 'network_spgist.c', - 'numeric.c', 'numutils.c', 'oid.c', 'oracle_compat.c', 'orderedsetaggs.c', 'partitionfuncs.c', + 'pg_dependencies.c', 'pg_locale.c', 'pg_locale_builtin.c', 'pg_locale_icu.c', 'pg_locale_libc.c', 'pg_lsn.c', + 'pg_ndistinct.c', 'pg_upgrade_support.c', 'pgstatfuncs.c', 'pseudorandomfuncs.c', diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c index 6fcfd031428ed..c32f24fbf9776 100644 --- a/src/backend/utils/adt/misc.c +++ b/src/backend/utils/adt/misc.c @@ -21,12 +21,12 @@ #include #include +#include "access/htup_details.h" #include "access/sysattr.h" #include "access/table.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" #include "catalog/system_fk_info.h" -#include "commands/dbcommands.h" #include "commands/tablespace.h" #include "common/keywords.h" #include "funcapi.h" @@ -186,6 +186,20 @@ pg_num_nonnulls(PG_FUNCTION_ARGS) PG_RETURN_INT32(nargs - nulls); } +/* + * error_on_null() + * Check if the input is the NULL value + */ +Datum +pg_error_on_null(PG_FUNCTION_ARGS) +{ + if (PG_ARGISNULL(0)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("null value not allowed"))); + + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); +} /* * current_database() @@ -301,66 +315,12 @@ Datum pg_tablespace_location(PG_FUNCTION_ARGS) { Oid tablespaceOid = PG_GETARG_OID(0); - char sourcepath[MAXPGPATH]; - char targetpath[MAXPGPATH]; - int rllen; - struct stat st; - - /* - * It's useful to apply this function to pg_class.reltablespace, wherein - * zero means "the database's default tablespace". So, rather than - * throwing an error for zero, we choose to assume that's what is meant. - */ - if (tablespaceOid == InvalidOid) - tablespaceOid = MyDatabaseTableSpace; - - /* - * Return empty string for the cluster's default tablespaces - */ - if (tablespaceOid == DEFAULTTABLESPACE_OID || - tablespaceOid == GLOBALTABLESPACE_OID) - PG_RETURN_TEXT_P(cstring_to_text("")); - - /* - * Find the location of the tablespace by reading the symbolic link that - * is in pg_tblspc/. - */ - snprintf(sourcepath, sizeof(sourcepath), "%s/%u", PG_TBLSPC_DIR, tablespaceOid); - - /* - * Before reading the link, check if the source path is a link or a - * junction point. Note that a directory is possible for a tablespace - * created with allow_in_place_tablespaces enabled. If a directory is - * found, a relative path to the data directory is returned. 
- */ - if (lstat(sourcepath, &st) < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", - sourcepath))); - } - - if (!S_ISLNK(st.st_mode)) - PG_RETURN_TEXT_P(cstring_to_text(sourcepath)); + char *tablespaceLoc; - /* - * In presence of a link or a junction point, return the path pointing to. - */ - rllen = readlink(sourcepath, targetpath, sizeof(targetpath)); - if (rllen < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read symbolic link \"%s\": %m", - sourcepath))); - if (rllen >= sizeof(targetpath)) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("symbolic link \"%s\" target is too long", - sourcepath))); - targetpath[rllen] = '\0'; + /* Get LOCATION string from its OID */ + tablespaceLoc = get_tablespace_location(tablespaceOid); - PG_RETURN_TEXT_P(cstring_to_text(targetpath)); + PG_RETURN_TEXT_P(cstring_to_text(tablespaceLoc)); } /* @@ -370,7 +330,20 @@ Datum pg_sleep(PG_FUNCTION_ARGS) { float8 secs = PG_GETARG_FLOAT8(0); - float8 endtime; + int64 usecs; + TimestampTz endtime; + + /* + * Convert the delay to int64 microseconds, rounding up any fraction, and + * silently limiting it to PG_INT64_MAX/2 microseconds (about 150K years) + * to ensure the computation of endtime won't overflow. Historically + * we've treated NaN as "no wait", not an error, so keep that behavior. + */ + if (isnan(secs) || secs <= 0.0) + PG_RETURN_VOID(); + secs *= USECS_PER_SEC; /* we assume overflow will produce +Inf */ + secs = ceil(secs); /* round up any fractional microsecond */ + usecs = (int64) Min(secs, (float8) (PG_INT64_MAX / 2)); /* * We sleep using WaitLatch, to ensure that we'll wake up promptly if an @@ -384,22 +357,20 @@ pg_sleep(PG_FUNCTION_ARGS) * less than the specified time when WaitLatch is terminated early by a * non-query-canceling signal such as SIGHUP. */ -#define GetNowFloat() ((float8) GetCurrentTimestamp() / 1000000.0) - - endtime = GetNowFloat() + secs; + endtime = GetCurrentTimestamp() + usecs; for (;;) { - float8 delay; + TimestampTz delay; long delay_ms; CHECK_FOR_INTERRUPTS(); - delay = endtime - GetNowFloat(); - if (delay >= 600.0) + delay = endtime - GetCurrentTimestamp(); + if (delay >= 600 * USECS_PER_SEC) delay_ms = 600000; - else if (delay > 0.0) - delay_ms = (long) ceil(delay * 1000.0); + else if (delay > 0) + delay_ms = (long) ((delay + 999) / 1000); else break; @@ -516,7 +487,7 @@ pg_get_catalog_foreign_keys(PG_FUNCTION_ARGS) * array_in, and it wouldn't be very efficient if we could. Fill an * FmgrInfo to use for the call. */ - arrayinp = (FmgrInfo *) palloc(sizeof(FmgrInfo)); + arrayinp = palloc_object(FmgrInfo); fmgr_info(F_ARRAY_IN, arrayinp); funcctx->user_fctx = arrayinp; diff --git a/src/backend/utils/adt/multirangetypes.c b/src/backend/utils/adt/multirangetypes.c index cd84ced5b487c..169acf0ef633f 100644 --- a/src/backend/utils/adt/multirangetypes.c +++ b/src/backend/utils/adt/multirangetypes.c @@ -68,11 +68,11 @@ typedef enum * Macros for accessing past MultirangeType parts of multirange: items, flags * and boundaries. 
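 * The layout these macros decode is: the MultirangeType header, then
 * (rangeCount - 1) uint32 items holding bound offsets plus flag bits,
 * then rangeCount uint8 flags, and finally the bound values themselves,
 * aligned as the range element type requires.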
*/ -#define MultirangeGetItemsPtr(mr) ((uint32 *) ((Pointer) (mr) + \ +#define MultirangeGetItemsPtr(mr) ((uint32 *) ((char *) (mr) + \ sizeof(MultirangeType))) -#define MultirangeGetFlagsPtr(mr) ((uint8 *) ((Pointer) (mr) + \ +#define MultirangeGetFlagsPtr(mr) ((uint8 *) ((char *) (mr) + \ sizeof(MultirangeType) + ((mr)->rangeCount - 1) * sizeof(uint32))) -#define MultirangeGetBoundariesPtr(mr, align) ((Pointer) (mr) + \ +#define MultirangeGetBoundariesPtr(mr, align) ((char *) (mr) + \ att_align_nominal(sizeof(MultirangeType) + \ ((mr)->rangeCount - 1) * sizeof(uint32) + \ (mr)->rangeCount * sizeof(uint8), (align))) @@ -125,7 +125,7 @@ multirange_in(PG_FUNCTION_ARGS) int32 range_count = 0; int32 range_capacity = 8; RangeType *range; - RangeType **ranges = palloc(range_capacity * sizeof(RangeType *)); + RangeType **ranges = palloc_array(RangeType *, range_capacity); MultirangeIOData *cache; MultirangeType *ret; MultirangeParseState parse_state; @@ -348,7 +348,7 @@ multirange_recv(PG_FUNCTION_ARGS) cache = get_multirange_io_data(fcinfo, mltrngtypoid, IOFunc_receive); range_count = pq_getmsgint(buf, 4); - ranges = palloc(range_count * sizeof(RangeType *)); + ranges = palloc_array(RangeType *, range_count); initStringInfo(&tmpbuf); for (int i = 0; i < range_count; i++) @@ -378,31 +378,33 @@ multirange_send(PG_FUNCTION_ARGS) { MultirangeType *multirange = PG_GETARG_MULTIRANGE_P(0); Oid mltrngtypoid = MultirangeTypeGetOid(multirange); - StringInfo buf = makeStringInfo(); + StringInfoData buf; RangeType **ranges; int32 range_count; MultirangeIOData *cache; + initStringInfo(&buf); cache = get_multirange_io_data(fcinfo, mltrngtypoid, IOFunc_send); /* construct output */ - pq_begintypsend(buf); + pq_begintypsend(&buf); - pq_sendint32(buf, multirange->rangeCount); + pq_sendint32(&buf, multirange->rangeCount); multirange_deserialize(cache->typcache->rngtype, multirange, &range_count, &ranges); for (int i = 0; i < range_count; i++) { Datum range; + bytea *outputbytes; range = RangeTypePGetDatum(ranges[i]); - range = PointerGetDatum(SendFunctionCall(&cache->typioproc, range)); + outputbytes = SendFunctionCall(&cache->typioproc, range); - pq_sendint32(buf, VARSIZE(range) - VARHDRSZ); - pq_sendbytes(buf, VARDATA(range), VARSIZE(range) - VARHDRSZ); + pq_sendint32(&buf, VARSIZE(outputbytes) - VARHDRSZ); + pq_sendbytes(&buf, VARDATA(outputbytes), VARSIZE(outputbytes) - VARHDRSZ); } - PG_RETURN_BYTEA_P(pq_endtypsend(buf)); + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } /* @@ -600,13 +602,13 @@ write_multirange_data(MultirangeType *multirange, TypeCacheEntry *rangetyp, uint32 prev_offset = 0; uint8 *flags; int32 i; - Pointer begin, - ptr; + const char *begin; + char *ptr; char elemalign = rangetyp->rngelemtype->typalign; items = MultirangeGetItemsPtr(multirange); flags = MultirangeGetFlagsPtr(multirange); - ptr = begin = MultirangeGetBoundariesPtr(multirange, elemalign); + begin = ptr = MultirangeGetBoundariesPtr(multirange, elemalign); for (i = 0; i < range_count; i++) { uint32 len; @@ -625,9 +627,9 @@ write_multirange_data(MultirangeType *multirange, TypeCacheEntry *rangetyp, items[i - 1] |= MULTIRANGE_ITEM_OFF_BIT; prev_offset = ptr - begin; } - flags[i] = *((Pointer) ranges[i] + VARSIZE(ranges[i]) - sizeof(char)); + flags[i] = *((char *) ranges[i] + VARSIZE(ranges[i]) - sizeof(char)); len = VARSIZE(ranges[i]) - sizeof(RangeType) - sizeof(char); - memcpy(ptr, (Pointer) (ranges[i] + 1), len); + memcpy(ptr, ranges[i] + 1, len); ptr += att_align_nominal(len, elemalign); } } @@ -697,8 +699,8 @@ 
multirange_get_range(TypeCacheEntry *rangetyp, { uint32 offset; uint8 flags; - Pointer begin, - ptr; + const char *begin; + char *ptr; int16 typlen = rangetyp->rngelemtype->typlen; char typalign = rangetyp->rngelemtype->typalign; uint32 len; @@ -708,7 +710,7 @@ multirange_get_range(TypeCacheEntry *rangetyp, offset = multirange_get_bounds_offset(multirange, i); flags = MultirangeGetFlagsPtr(multirange)[i]; - ptr = begin = MultirangeGetBoundariesPtr(multirange, typalign) + offset; + begin = ptr = MultirangeGetBoundariesPtr(multirange, typalign) + offset; /* * Calculate the size of bound values. In principle, we could get offset @@ -717,11 +719,11 @@ multirange_get_range(TypeCacheEntry *rangetyp, * exact size. */ if (RANGE_HAS_LBOUND(flags)) - ptr = (Pointer) att_addlength_pointer(ptr, typlen, ptr); + ptr = (char *) att_addlength_pointer(ptr, typlen, ptr); if (RANGE_HAS_UBOUND(flags)) { - ptr = (Pointer) att_align_pointer(ptr, typalign, typlen, ptr); - ptr = (Pointer) att_addlength_pointer(ptr, typlen, ptr); + ptr = (char *) att_align_pointer(ptr, typalign, typlen, ptr); + ptr = (char *) att_addlength_pointer(ptr, typlen, ptr); } len = (ptr - begin) + sizeof(RangeType) + sizeof(uint8); @@ -747,7 +749,7 @@ multirange_get_bounds(TypeCacheEntry *rangetyp, { uint32 offset; uint8 flags; - Pointer ptr; + const char *ptr; int16 typlen = rangetyp->rngelemtype->typlen; char typalign = rangetyp->rngelemtype->typalign; bool typbyval = rangetyp->rngelemtype->typbyval; @@ -768,7 +770,7 @@ multirange_get_bounds(TypeCacheEntry *rangetyp, { /* att_align_pointer cannot be necessary here */ lbound = fetch_att(ptr, typbyval, typlen); - ptr = (Pointer) att_addlength_pointer(ptr, typlen, ptr); + ptr = (char *) att_addlength_pointer(ptr, typlen, ptr); } else lbound = (Datum) 0; @@ -776,7 +778,7 @@ multirange_get_bounds(TypeCacheEntry *rangetyp, /* fetch upper bound, if any */ if (RANGE_HAS_UBOUND(flags)) { - ptr = (Pointer) att_align_pointer(ptr, typalign, typlen, ptr); + ptr = (char *) att_align_pointer(ptr, typalign, typlen, ptr); ubound = fetch_att(ptr, typbyval, typlen); /* no need for att_addlength_pointer */ } @@ -834,7 +836,7 @@ multirange_deserialize(TypeCacheEntry *rangetyp, { int i; - *ranges = palloc(*range_count * sizeof(RangeType *)); + *ranges = palloc_array(RangeType *, *range_count); for (i = 0; i < *range_count; i++) (*ranges)[i] = multirange_get_range(rangetyp, multirange, i); } @@ -1225,6 +1227,77 @@ multirange_minus_internal(Oid mltrngtypoid, TypeCacheEntry *rangetyp, return make_multirange(mltrngtypoid, rangetyp, range_count3, ranges3); } +/* + * multirange_minus_multi - like multirange_minus but returning the result as a + * SRF, with no rows if the result would be empty. 
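+ *
+ * Illustrative usage, assuming the function is exposed at SQL level:
+ *	SELECT * FROM multirange_minus_multi('{[1,5)}'::int4multirange,
+ *	                                     '{[2,3)}'::int4multirange);
+ * yields one row, '{[1,2),[3,5)}'; subtracting a superset yields no
+ * rows at all.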
+ */ +Datum +multirange_minus_multi(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + MemoryContext oldcontext; + + if (!SRF_IS_FIRSTCALL()) + { + /* We never have more than one result */ + funcctx = SRF_PERCALL_SETUP(); + SRF_RETURN_DONE(funcctx); + } + else + { + MultirangeType *mr1; + MultirangeType *mr2; + Oid mltrngtypoid; + TypeCacheEntry *typcache; + TypeCacheEntry *rangetyp; + int32 range_count1; + int32 range_count2; + RangeType **ranges1; + RangeType **ranges2; + MultirangeType *mr; + + funcctx = SRF_FIRSTCALL_INIT(); + + /* + * switch to memory context appropriate for multiple function calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* get args, detoasting into multi-call memory context */ + mr1 = PG_GETARG_MULTIRANGE_P(0); + mr2 = PG_GETARG_MULTIRANGE_P(1); + + mltrngtypoid = MultirangeTypeGetOid(mr1); + typcache = lookup_type_cache(mltrngtypoid, TYPECACHE_MULTIRANGE_INFO); + if (typcache->rngtype == NULL) + elog(ERROR, "type %u is not a multirange type", mltrngtypoid); + rangetyp = typcache->rngtype; + + if (MultirangeIsEmpty(mr1) || MultirangeIsEmpty(mr2)) + mr = mr1; + else + { + multirange_deserialize(rangetyp, mr1, &range_count1, &ranges1); + multirange_deserialize(rangetyp, mr2, &range_count2, &ranges2); + + mr = multirange_minus_internal(mltrngtypoid, + rangetyp, + range_count1, + ranges1, + range_count2, + ranges2); + } + + MemoryContextSwitchTo(oldcontext); + + funcctx = SRF_PERCALL_SETUP(); + if (MultirangeIsEmpty(mr)) + SRF_RETURN_DONE(funcctx); + else + SRF_RETURN_NEXT(funcctx, MultirangeTypePGetDatum(mr)); + } +} + /* multirange intersection */ Datum multirange_intersect(PG_FUNCTION_ARGS) @@ -2081,15 +2154,14 @@ range_overleft_multirange_internal(TypeCacheEntry *rangetyp, bool empty; if (RangeIsEmpty(r) || MultirangeIsEmpty(mr)) - PG_RETURN_BOOL(false); - + return false; range_deserialize(rangetyp, r, &lower1, &upper1, &empty); Assert(!empty); multirange_get_bounds(rangetyp, mr, mr->rangeCount - 1, &lower2, &upper2); - PG_RETURN_BOOL(range_cmp_bounds(rangetyp, &upper1, &upper2) <= 0); + return (range_cmp_bounds(rangetyp, &upper1, &upper2) <= 0); } Datum @@ -2166,7 +2238,7 @@ range_overright_multirange_internal(TypeCacheEntry *rangetyp, bool empty; if (RangeIsEmpty(r) || MultirangeIsEmpty(mr)) - PG_RETURN_BOOL(false); + return false; range_deserialize(rangetyp, r, &lower1, &upper1, &empty); Assert(!empty); @@ -2523,7 +2595,7 @@ multirange_adjacent_range(PG_FUNCTION_ARGS) TypeCacheEntry *typcache; if (RangeIsEmpty(r) || MultirangeIsEmpty(mr)) - return false; + PG_RETURN_BOOL(false); typcache = multirange_get_typcache(fcinfo, MultirangeTypeGetOid(mr)); @@ -2544,7 +2616,7 @@ multirange_adjacent_multirange(PG_FUNCTION_ARGS) upper2; if (MultirangeIsEmpty(mr1) || MultirangeIsEmpty(mr2)) - return false; + PG_RETURN_BOOL(false); typcache = multirange_get_typcache(fcinfo, MultirangeTypeGetOid(mr1)); @@ -2639,7 +2711,7 @@ multirange_cmp(PG_FUNCTION_ARGS) Datum multirange_lt(PG_FUNCTION_ARGS) { - int cmp = multirange_cmp(fcinfo); + int cmp = DatumGetInt32(multirange_cmp(fcinfo)); PG_RETURN_BOOL(cmp < 0); } @@ -2647,7 +2719,7 @@ multirange_lt(PG_FUNCTION_ARGS) Datum multirange_le(PG_FUNCTION_ARGS) { - int cmp = multirange_cmp(fcinfo); + int cmp = DatumGetInt32(multirange_cmp(fcinfo)); PG_RETURN_BOOL(cmp <= 0); } @@ -2655,7 +2727,7 @@ multirange_le(PG_FUNCTION_ARGS) Datum multirange_ge(PG_FUNCTION_ARGS) { - int cmp = multirange_cmp(fcinfo); + int cmp = DatumGetInt32(multirange_cmp(fcinfo)); PG_RETURN_BOOL(cmp >= 0); } @@ -2663,7 +2735,7 @@ 
multirange_ge(PG_FUNCTION_ARGS) Datum multirange_gt(PG_FUNCTION_ARGS) { - int cmp = multirange_cmp(fcinfo); + int cmp = DatumGetInt32(multirange_cmp(fcinfo)); PG_RETURN_BOOL(cmp > 0); } @@ -2746,7 +2818,7 @@ multirange_unnest(PG_FUNCTION_ARGS) mr = PG_GETARG_MULTIRANGE_P(0); /* allocate memory for user context */ - fctx = (multirange_unnest_fctx *) palloc(sizeof(multirange_unnest_fctx)); + fctx = palloc_object(multirange_unnest_fctx); /* initialize state */ fctx->mr = mr; @@ -2833,7 +2905,7 @@ hash_multirange(PG_FUNCTION_ARGS) upper_hash = 0; /* Merge hashes of flags and bounds */ - range_hash = hash_uint32((uint32) flags); + range_hash = hash_bytes_uint32((uint32) flags); range_hash ^= lower_hash; range_hash = pg_rotate_left32(range_hash, 1); range_hash ^= upper_hash; diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c index b87bcf3ea306c..fc5a4354fce0e 100644 --- a/src/backend/utils/adt/multirangetypes_selfuncs.c +++ b/src/backend/utils/adt/multirangetypes_selfuncs.c @@ -49,10 +49,10 @@ static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value, static float8 get_len_position(double value, double hist1, double hist2); static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2); -static int length_hist_bsearch(Datum *length_hist_values, +static int length_hist_bsearch(const Datum *length_hist_values, int length_hist_nvalues, double value, bool equal); -static double calc_length_hist_frac(Datum *length_hist_values, +static double calc_length_hist_frac(const Datum *length_hist_values, int length_hist_nvalues, double length1, double length2, bool equal); static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, @@ -60,14 +60,14 @@ static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, - Datum *length_hist_values, + const Datum *length_hist_values, int length_hist_nvalues); static double calc_hist_selectivity_contains(TypeCacheEntry *typcache, const RangeBound *lower, const RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, - Datum *length_hist_values, + const Datum *length_hist_values, int length_hist_nvalues); /* @@ -496,8 +496,8 @@ calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata, * bounds. */ nhist = hslot.nvalues; - hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist); - hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist); + hist_lower = palloc_array(RangeBound, nhist); + hist_upper = palloc_array(RangeBound, nhist); for (i = 0; i < nhist; i++) { bool empty; @@ -765,7 +765,7 @@ rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBou * given length, returns -1. */ static int -length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues, +length_hist_bsearch(const Datum *length_hist_values, int length_hist_nvalues, double value, bool equal) { int lower = -1, @@ -963,7 +963,7 @@ get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBoun * 'equal' is true). 
*/ static double -calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues, +calc_length_hist_frac(const Datum *length_hist_values, int length_hist_nvalues, double length1, double length2, bool equal) { double frac; @@ -1131,7 +1131,7 @@ static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, const RangeBound *lower, RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, - Datum *length_hist_values, int length_hist_nvalues) + const Datum *length_hist_values, int length_hist_nvalues) { int i, upper_index; @@ -1252,7 +1252,7 @@ static double calc_hist_selectivity_contains(TypeCacheEntry *typcache, const RangeBound *lower, const RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, - Datum *length_hist_values, int length_hist_nvalues) + const Datum *length_hist_values, int length_hist_nvalues) { int i, lower_index; diff --git a/src/backend/utils/adt/multixactfuncs.c b/src/backend/utils/adt/multixactfuncs.c new file mode 100644 index 0000000000000..a428e140bc4bf --- /dev/null +++ b/src/backend/utils/adt/multixactfuncs.c @@ -0,0 +1,87 @@ +/*------------------------------------------------------------------------- + * + * multixactfuncs.c + * Functions for accessing multixact-related data. + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/adt/multixactfuncs.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/multixact.h" +#include "funcapi.h" +#include "utils/builtins.h" + +/* + * pg_get_multixact_members + * + * Returns information about the MultiXactMembers of the specified + * MultiXactId. 
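+ *
+ * Illustrative usage: SELECT * FROM pg_get_multixact_members('13'::xid)
+ * returns one (xid, mode) row per member of multixact 13.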
+ */ +Datum +pg_get_multixact_members(PG_FUNCTION_ARGS) +{ + typedef struct + { + MultiXactMember *members; + int nmembers; + int iter; + } mxact; + MultiXactId mxid = PG_GETARG_TRANSACTIONID(0); + mxact *multi; + FuncCallContext *funccxt; + + if (mxid < FirstMultiXactId) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid MultiXactId: %u", mxid))); + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcxt; + TupleDesc tupdesc; + + funccxt = SRF_FIRSTCALL_INIT(); + oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); + + multi = palloc_object(mxact); + /* no need to allow for old values here */ + multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false, + false); + multi->iter = 0; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + funccxt->tuple_desc = tupdesc; + funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc); + funccxt->user_fctx = multi; + + MemoryContextSwitchTo(oldcxt); + } + + funccxt = SRF_PERCALL_SETUP(); + multi = (mxact *) funccxt->user_fctx; + + while (multi->iter < multi->nmembers) + { + HeapTuple tuple; + char *values[2]; + + values[0] = psprintf("%u", multi->members[multi->iter].xid); + values[1] = mxstatus_to_string(multi->members[multi->iter].status); + + tuple = BuildTupleFromCStrings(funccxt->attinmeta, values); + + multi->iter++; + pfree(values[0]); + SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple)); + } + + SRF_RETURN_DONE(funccxt); +} diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index f03fcc1147bb0..3a2002097ddce 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -12,8 +12,6 @@ #include #include -#include "access/stratnum.h" -#include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" #include "common/hashfn.h" #include "common/ip.h" @@ -77,7 +75,7 @@ network_in(char *src, bool is_cidr, Node *escontext) int bits; inet *dst; - dst = (inet *) palloc0(sizeof(inet)); + dst = (inet *) palloc0_object(inet); /* * First, check to see if this is an IPv6 or IPv4 address. IPv6 addresses @@ -198,7 +196,7 @@ network_recv(StringInfo buf, bool is_cidr) i; /* make sure any unused bits in a CIDR value are zeroed */ - addr = (inet *) palloc0(sizeof(inet)); + addr = palloc0_object(inet); ip_family(addr) = pq_getmsgbyte(buf); if (ip_family(addr) != PGSQL_AF_INET && @@ -365,7 +363,7 @@ cidr_set_masklen(PG_FUNCTION_ARGS) inet * cidr_set_masklen_internal(const inet *src, int bits) { - inet *dst = (inet *) palloc0(sizeof(inet)); + inet *dst = palloc0_object(inet); ip_family(dst) = ip_family(src); ip_bits(dst) = bits; @@ -446,7 +444,7 @@ network_sortsupport(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - uss = palloc(sizeof(network_sortsupport_state)); + uss = palloc_object(network_sortsupport_state); uss->input_count = 0; uss->estimating = true; initHyperLogLog(&uss->abbr_card, 10); @@ -569,24 +567,11 @@ network_abbrev_abort(int memtupcount, SortSupport ssup) * * When generating abbreviated keys for SortSupport, we pack as much as we can * into a datum while ensuring that when comparing those keys as integers, - * these rules will be respected. Exact contents depend on IP family and datum - * size. + * these rules will be respected. Exact contents depend on IP family: * * IPv4 * ---- * - * 4 byte datums: - * - * Start with 1 bit for the IP family (IPv4 or IPv6; this bit is present in - * every case below) followed by all but 1 of the netmasked bits. 
- * - * +----------+---------------------+ - * | 1 bit IP | 31 bits network | (1 bit network - * | family | (truncated) | omitted) - * +----------+---------------------+ - * - * 8 byte datums: - * * We have space to store all netmasked bits, followed by the netmask size, * followed by 25 bits of the subnet (25 bits is usually more than enough in * practice). cidr datums always have all-zero subnet bits. @@ -599,15 +584,6 @@ network_abbrev_abort(int memtupcount, SortSupport ssup) * IPv6 * ---- * - * 4 byte datums: - * - * +----------+---------------------+ - * | 1 bit IP | 31 bits network | (up to 97 bits - * | family | (truncated) | network omitted) - * +----------+---------------------+ - * - * 8 byte datums: - * * +----------+---------------------------------+ * | 1 bit IP | 63 bits network | (up to 65 bits * | family | (truncated) | network omitted) @@ -630,8 +606,7 @@ network_abbrev_convert(Datum original, SortSupport ssup) /* * Get an unsigned integer representation of the IP address by taking its * first 4 or 8 bytes. Always take all 4 bytes of an IPv4 address. Take - * the first 8 bytes of an IPv6 address with an 8 byte datum and 4 bytes - * otherwise. + * the first 8 bytes of an IPv6 address. * * We're consuming an array of unsigned char, so byteswap on little endian * systems (an inet's ipaddr field stores the most significant byte @@ -661,7 +636,7 @@ network_abbrev_convert(Datum original, SortSupport ssup) ipaddr_datum = DatumBigEndianToNative(ipaddr_datum); /* Initialize result with ipfamily (most significant) bit set */ - res = ((Datum) 1) << (SIZEOF_DATUM * BITS_PER_BYTE - 1); + res = ((Datum) 1) << (sizeof(Datum) * BITS_PER_BYTE - 1); } /* @@ -670,8 +645,7 @@ network_abbrev_convert(Datum original, SortSupport ssup) * while low order bits go in "subnet" component when there is space for * one. This is often accomplished by generating a temp datum subnet * bitmask, which we may reuse later when generating the subnet bits - * themselves. (Note that subnet bits are only used with IPv4 datums on - * platforms where datum is 8 bytes.) + * themselves. * * The number of bits in subnet is used to generate a datum subnet * bitmask. 
For example, with a /24 IPv4 datum there are 8 subnet bits @@ -683,14 +657,14 @@ network_abbrev_convert(Datum original, SortSupport ssup) subnet_size = ip_maxbits(authoritative) - ip_bits(authoritative); Assert(subnet_size >= 0); /* subnet size must work with prefix ipaddr cases */ - subnet_size %= SIZEOF_DATUM * BITS_PER_BYTE; + subnet_size %= sizeof(Datum) * BITS_PER_BYTE; if (ip_bits(authoritative) == 0) { /* Fit as many ipaddr bits as possible into subnet */ subnet_bitmask = ((Datum) 0) - 1; network = 0; } - else if (ip_bits(authoritative) < SIZEOF_DATUM * BITS_PER_BYTE) + else if (ip_bits(authoritative) < sizeof(Datum) * BITS_PER_BYTE) { /* Split ipaddr bits between network and subnet */ subnet_bitmask = (((Datum) 1) << subnet_size) - 1; @@ -703,12 +677,11 @@ network_abbrev_convert(Datum original, SortSupport ssup) network = ipaddr_datum; } -#if SIZEOF_DATUM == 8 if (ip_family(authoritative) == PGSQL_AF_INET) { /* - * IPv4 with 8 byte datums: keep all 32 netmasked bits, netmask size, - * and most significant 25 subnet bits + * IPv4: keep all 32 netmasked bits, netmask size, and most + * significant 25 subnet bits */ Datum netmask_size = (Datum) ip_bits(authoritative); Datum subnet; @@ -752,12 +725,11 @@ network_abbrev_convert(Datum original, SortSupport ssup) res |= network | netmask_size | subnet; } else -#endif { /* - * 4 byte datums, or IPv6 with 8 byte datums: Use as many of the - * netmasked bits as will fit in final abbreviated key. Avoid - * clobbering the ipfamily bit that was set earlier. + * IPv6: Use as many of the netmasked bits as will fit in final + * abbreviated key. Avoid clobbering the ipfamily bit that was set + * earlier. */ res |= network >> 1; } @@ -769,11 +741,7 @@ network_abbrev_convert(Datum original, SortSupport ssup) { uint32 tmp; -#if SIZEOF_DATUM == 8 - tmp = (uint32) res ^ (uint32) ((uint64) res >> 32); -#else /* SIZEOF_DATUM != 8 */ - tmp = (uint32) res; -#endif + tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); } @@ -1259,7 +1227,7 @@ network_broadcast(PG_FUNCTION_ARGS) *b; /* make sure any unused bits are zeroed */ - dst = (inet *) palloc0(sizeof(inet)); + dst = palloc0_object(inet); maxbytes = ip_addrsize(ip); bits = ip_bits(ip); @@ -1303,7 +1271,7 @@ network_network(PG_FUNCTION_ARGS) *b; /* make sure any unused bits are zeroed */ - dst = (inet *) palloc0(sizeof(inet)); + dst = palloc0_object(inet); bits = ip_bits(ip); a = ip_addr(ip); @@ -1346,7 +1314,7 @@ network_netmask(PG_FUNCTION_ARGS) unsigned char *b; /* make sure any unused bits are zeroed */ - dst = (inet *) palloc0(sizeof(inet)); + dst = palloc0_object(inet); bits = ip_bits(ip); b = ip_addr(dst); @@ -1389,7 +1357,7 @@ network_hostmask(PG_FUNCTION_ARGS) unsigned char *b; /* make sure any unused bits are zeroed */ - dst = (inet *) palloc0(sizeof(inet)); + dst = palloc0_object(inet); maxbytes = ip_addrsize(ip); bits = ip_maxbits(ip) - ip_bits(ip); @@ -1824,7 +1792,7 @@ inetnot(PG_FUNCTION_ARGS) inet *ip = PG_GETARG_INET_PP(0); inet *dst; - dst = (inet *) palloc0(sizeof(inet)); + dst = palloc0_object(inet); { int nb = ip_addrsize(ip); @@ -1850,7 +1818,7 @@ inetand(PG_FUNCTION_ARGS) inet *ip2 = PG_GETARG_INET_PP(1); inet *dst; - dst = (inet *) palloc0(sizeof(inet)); + dst = palloc0_object(inet); if (ip_family(ip) != ip_family(ip2)) ereport(ERROR, @@ -1882,7 +1850,7 @@ inetor(PG_FUNCTION_ARGS) inet *ip2 = PG_GETARG_INET_PP(1); inet *dst; - dst = (inet *) palloc0(sizeof(inet)); + dst = palloc0_object(inet); if 
(ip_family(ip) != ip_family(ip2)) ereport(ERROR, @@ -1912,7 +1880,7 @@ internal_inetpl(inet *ip, int64 addend) { inet *dst; - dst = (inet *) palloc0(sizeof(inet)); + dst = palloc0_object(inet); { int nb = ip_addrsize(ip); diff --git a/src/backend/utils/adt/network_gist.c b/src/backend/utils/adt/network_gist.c index a08c495378919..30145f5985a58 100644 --- a/src/backend/utils/adt/network_gist.c +++ b/src/backend/utils/adt/network_gist.c @@ -475,7 +475,7 @@ build_inet_union_key(int family, int minbits, int commonbits, GistInetKey *result; /* Make sure any unused bits are zeroed. */ - result = (GistInetKey *) palloc0(sizeof(GistInetKey)); + result = palloc0_object(GistInetKey); gk_ip_family(result) = family; gk_ip_minbits(result) = minbits; @@ -546,13 +546,13 @@ inet_gist_compress(PG_FUNCTION_ARGS) if (entry->leafkey) { - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); if (DatumGetPointer(entry->key) != NULL) { inet *in = DatumGetInetPP(entry->key); GistInetKey *r; - r = (GistInetKey *) palloc0(sizeof(GistInetKey)); + r = palloc0_object(GistInetKey); gk_ip_family(r) = ip_family(in); gk_ip_minbits(r) = ip_bits(in); @@ -594,14 +594,14 @@ inet_gist_fetch(PG_FUNCTION_ARGS) GISTENTRY *retval; inet *dst; - dst = (inet *) palloc0(sizeof(inet)); + dst = palloc0_object(inet); ip_family(dst) = gk_ip_family(key); ip_bits(dst) = gk_ip_minbits(key); memcpy(ip_addr(dst), gk_ip_addr(key), ip_addrsize(dst)); SET_INET_VARSIZE(dst); - retval = palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, InetPGetDatum(dst), entry->rel, entry->page, entry->offset, false); diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c index 940cdafa54619..d08f40e033288 100644 --- a/src/backend/utils/adt/network_selfuncs.c +++ b/src/backend/utils/adt/network_selfuncs.c @@ -48,17 +48,17 @@ static Selectivity networkjoinsel_inner(Oid operator, static Selectivity networkjoinsel_semi(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2); static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues); -static Selectivity inet_hist_value_sel(Datum *values, int nvalues, +static Selectivity inet_hist_value_sel(const Datum *values, int nvalues, Datum constvalue, int opr_codenum); static Selectivity inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues, Oid operator); -static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, - int mcv_nvalues, Datum *hist_values, int hist_nvalues, +static Selectivity inet_mcv_hist_sel(const Datum *mcv_values, float4 *mcv_numbers, + int mcv_nvalues, const Datum *hist_values, int hist_nvalues, int opr_codenum); -static Selectivity inet_hist_inclusion_join_sel(Datum *hist1_values, +static Selectivity inet_hist_inclusion_join_sel(const Datum *hist1_values, int hist1_nvalues, - Datum *hist2_values, int hist2_nvalues, + const Datum *hist2_values, int hist2_nvalues, int opr_codenum); static Selectivity inet_semi_join_sel(Datum lhs_value, bool mcv_exists, Datum *mcv_values, int mcv_nvalues, @@ -601,7 +601,7 @@ mcv_population(float4 *mcv_numbers, int mcv_nvalues) * better option than not considering these buckets at all. 
*/ static Selectivity -inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue, +inet_hist_value_sel(const Datum *values, int nvalues, Datum constvalue, int opr_codenum) { Selectivity match = 0.0; @@ -702,8 +702,8 @@ inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues, * the histogram. */ static Selectivity -inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues, - Datum *hist_values, int hist_nvalues, +inet_mcv_hist_sel(const Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues, + const Datum *hist_values, int hist_nvalues, int opr_codenum) { Selectivity selec = 0.0; @@ -739,8 +739,8 @@ inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues, * average? That would at least avoid non-commutative estimation results. */ static Selectivity -inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues, - Datum *hist2_values, int hist2_nvalues, +inet_hist_inclusion_join_sel(const Datum *hist1_values, int hist1_nvalues, + const Datum *hist2_values, int hist2_nvalues, int opr_codenum) { double match = 0.0; diff --git a/src/backend/utils/adt/network_spgist.c b/src/backend/utils/adt/network_spgist.c index a84747d927586..9bdacca5abddd 100644 --- a/src/backend/utils/adt/network_spgist.c +++ b/src/backend/utils/adt/network_spgist.c @@ -196,8 +196,8 @@ inet_spg_picksplit(PG_FUNCTION_ARGS) /* Don't need labels; allocate output arrays */ out->nodeLabels = NULL; - out->mapTuplesToNodes = (int *) palloc(sizeof(int) * in->nTuples); - out->leafTupleDatums = (Datum *) palloc(sizeof(Datum) * in->nTuples); + out->mapTuplesToNodes = palloc_array(int, in->nTuples); + out->leafTupleDatums = palloc_array(Datum, in->nTuples); if (differentFamilies) { @@ -301,7 +301,7 @@ inet_spg_inner_consistent(PG_FUNCTION_ARGS) if (which) { - out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); + out->nodeNumbers = palloc_array(int, in->nNodes); for (i = 0; i < in->nNodes; i++) { diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 40dcbc7b6710b..2460698df0197 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -28,6 +28,7 @@ #include "common/hashfn.h" #include "common/int.h" +#include "common/int128.h" #include "funcapi.h" #include "lib/hyperloglog.h" #include "libpq/pqformat.h" @@ -391,30 +392,21 @@ typedef struct NumericSumAccum /* * We define our own macros for packing and unpacking abbreviated-key - * representations for numeric values in order to avoid depending on - * USE_FLOAT8_BYVAL. The type of abbreviation we use is based only on - * the size of a datum, not the argument-passing convention for float8. + * representations, just to have a notational indication that that's + * what we're doing. Now that sizeof(Datum) is always 8, we can rely + * on fitting an int64 into Datum. * - * The range of abbreviations for finite values is from +PG_INT64/32_MAX - * to -PG_INT64/32_MAX. NaN has the abbreviation PG_INT64/32_MIN, and we + * The range of abbreviations for finite values is from +PG_INT64_MAX + * to -PG_INT64_MAX. NaN has the abbreviation PG_INT64_MIN, and we * define the sort ordering to make that work out properly (see further * comments below). PINF and NINF share the abbreviations of the largest * and smallest finite abbreviation classes. 
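 * (Because the stored value is negated, numeric_cmp_abbrev() compares
 * the int64 abbreviations in reverse to recover the true ordering.)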
*/ -#define NUMERIC_ABBREV_BITS (SIZEOF_DATUM * BITS_PER_BYTE) -#if SIZEOF_DATUM == 8 -#define NumericAbbrevGetDatum(X) ((Datum) (X)) -#define DatumGetNumericAbbrev(X) ((int64) (X)) +#define NumericAbbrevGetDatum(X) Int64GetDatum(X) +#define DatumGetNumericAbbrev(X) DatumGetInt64(X) #define NUMERIC_ABBREV_NAN NumericAbbrevGetDatum(PG_INT64_MIN) #define NUMERIC_ABBREV_PINF NumericAbbrevGetDatum(-PG_INT64_MAX) #define NUMERIC_ABBREV_NINF NumericAbbrevGetDatum(PG_INT64_MAX) -#else -#define NumericAbbrevGetDatum(X) ((Datum) (X)) -#define DatumGetNumericAbbrev(X) ((int32) (X)) -#define NUMERIC_ABBREV_NAN NumericAbbrevGetDatum(PG_INT32_MIN) -#define NUMERIC_ABBREV_PINF NumericAbbrevGetDatum(-PG_INT32_MAX) -#define NUMERIC_ABBREV_NINF NumericAbbrevGetDatum(PG_INT32_MAX) -#endif /* ---------- @@ -525,7 +517,7 @@ static void numericvar_deserialize(StringInfo buf, NumericVar *var); static Numeric duplicate_numeric(Numeric num); static Numeric make_result(const NumericVar *var); -static Numeric make_result_opt_error(const NumericVar *var, bool *have_error); +static Numeric make_result_safe(const NumericVar *var, Node *escontext); static bool apply_typmod(NumericVar *var, int32 typmod, Node *escontext); static bool apply_typmod_special(Numeric num, int32 typmod, Node *escontext); @@ -534,10 +526,7 @@ static bool numericvar_to_int32(const NumericVar *var, int32 *result); static bool numericvar_to_int64(const NumericVar *var, int64 *result); static void int64_to_numericvar(int64 val, NumericVar *var); static bool numericvar_to_uint64(const NumericVar *var, uint64 *result); -#ifdef HAVE_INT128 -static bool numericvar_to_int128(const NumericVar *var, int128 *result); -static void int128_to_numericvar(int128 val, NumericVar *var); -#endif +static void int128_to_numericvar(INT128 val, NumericVar *var); static double numericvar_to_double_no_overflow(const NumericVar *var); static Datum numeric_abbrev_convert(Datum original_datum, SortSupport ssup); @@ -728,7 +717,6 @@ numeric_in(PG_FUNCTION_ARGS) */ NumericVar value; int base; - bool have_error; init_var(&value); @@ -787,12 +775,7 @@ numeric_in(PG_FUNCTION_ARGS) if (!apply_typmod(&value, typmod, escontext)) PG_RETURN_NULL(); - res = make_result_opt_error(&value, &have_error); - - if (have_error) - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value overflows numeric format"))); + res = make_result_safe(&value, escontext); free_var(&value); } @@ -1776,8 +1759,7 @@ generate_series_step_numeric(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* allocate memory for user context */ - fctx = (generate_series_numeric_fctx *) - palloc(sizeof(generate_series_numeric_fctx)); + fctx = palloc_object(generate_series_numeric_fctx); /* * Use fctx to keep state from call to call. Seed current with the @@ -1958,9 +1940,11 @@ generate_series_numeric_support(PG_FUNCTION_ARGS) * in the histogram. width_bucket() returns an integer indicating the * bucket number that 'operand' belongs to in an equiwidth histogram * with the specified characteristics. An operand smaller than the - * lower bound is assigned to bucket 0. An operand greater than the - * upper bound is assigned to an additional bucket (with number - * count+1). We don't allow "NaN" for any of the numeric arguments. + * lower bound is assigned to bucket 0. An operand greater than or equal + * to the upper bound is assigned to an additional bucket (with number + * count+1). 
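+ * For example, width_bucket(5.0, 0.0, 10.0, 5) returns 3, while both
+ * width_bucket(10.0, 0.0, 10.0, 5) and width_bucket('NaN'::numeric,
+ * 0.0, 10.0, 5) land in the overflow bucket 6.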
We don't allow the histogram bounds to be NaN or +/- infinity, + * but we do allow those values for the operand (taking NaN to be larger + * than any other value, as we do in comparisons). */ Datum width_bucket_numeric(PG_FUNCTION_ARGS) @@ -1978,17 +1962,13 @@ width_bucket_numeric(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), errmsg("count must be greater than zero"))); - if (NUMERIC_IS_SPECIAL(operand) || - NUMERIC_IS_SPECIAL(bound1) || - NUMERIC_IS_SPECIAL(bound2)) + if (NUMERIC_IS_SPECIAL(bound1) || NUMERIC_IS_SPECIAL(bound2)) { - if (NUMERIC_IS_NAN(operand) || - NUMERIC_IS_NAN(bound1) || - NUMERIC_IS_NAN(bound2)) + if (NUMERIC_IS_NAN(bound1) || NUMERIC_IS_NAN(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), - errmsg("operand, lower bound, and upper bound cannot be NaN"))); - /* We allow "operand" to be infinite; cmp_numerics will cope */ + errmsg("lower and upper bounds cannot be NaN"))); + if (NUMERIC_IS_INF(bound1) || NUMERIC_IS_INF(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), @@ -2100,12 +2080,11 @@ compute_bucket(Numeric operand, Numeric bound1, Numeric bound2, * while this could be worked on itself, the abbreviation strategy gives more * speedup in many common cases. * - * Two different representations are used for the abbreviated form, one in - * int32 and one in int64, whichever fits into a by-value Datum. In both cases - * the representation is negated relative to the original value, because we use - * the largest negative value for NaN, which sorts higher than other values. We - * convert the absolute value of the numeric to a 31-bit or 63-bit positive - * value, and then negate it if the original number was positive. + * The abbreviated format is an int64. The representation is negated relative + * to the original value, because we use the largest negative value for NaN, + * which sorts higher than other values. We convert the absolute value of the + * numeric to a 63-bit positive value, and then negate it if the original + * number was positive. * * We abort the abbreviation process if the abbreviation cardinality is below * 0.01% of the row count (1 per 10k non-null rows). The actual break-even @@ -2137,7 +2116,7 @@ numeric_sortsupport(PG_FUNCTION_ARGS) NumericSortSupport *nss; MemoryContext oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - nss = palloc(sizeof(NumericSortSupport)); + nss = palloc_object(NumericSortSupport); /* * palloc a buffer for handling unaligned packed values in addition to @@ -2214,7 +2193,7 @@ numeric_abbrev_convert(Datum original_datum, SortSupport ssup) } /* should happen only for external/compressed toasts */ - if ((Pointer) original_varatt != DatumGetPointer(original_datum)) + if (original_varatt != DatumGetPointer(original_datum)) pfree(original_varatt); return result; @@ -2304,9 +2283,9 @@ numeric_fast_cmp(Datum x, Datum y, SortSupport ssup) result = cmp_numerics(nx, ny); - if ((Pointer) nx != DatumGetPointer(x)) + if (nx != DatumGetPointer(x)) pfree(nx); - if ((Pointer) ny != DatumGetPointer(y)) + if (ny != DatumGetPointer(y)) pfree(ny); return result; @@ -2332,7 +2311,7 @@ numeric_cmp_abbrev(Datum x, Datum y, SortSupport ssup) } /* - * Abbreviate a NumericVar according to the available bit size. + * Abbreviate a NumericVar into the 64-bit sortsupport size. * * The 31-bit value is constructed as: * @@ -2376,9 +2355,6 @@ numeric_cmp_abbrev(Datum x, Datum y, SortSupport ssup) * with all bits zero. 
This allows simple comparisons to work on the composite * value. */ - -#if NUMERIC_ABBREV_BITS == 64 - static Datum numeric_abbrev_convert_var(const NumericVar *var, NumericSortSupport *nss) { @@ -2430,84 +2406,6 @@ numeric_abbrev_convert_var(const NumericVar *var, NumericSortSupport *nss) return NumericAbbrevGetDatum(result); } -#endif /* NUMERIC_ABBREV_BITS == 64 */ - -#if NUMERIC_ABBREV_BITS == 32 - -static Datum -numeric_abbrev_convert_var(const NumericVar *var, NumericSortSupport *nss) -{ - int ndigits = var->ndigits; - int weight = var->weight; - int32 result; - - if (ndigits == 0 || weight < -11) - { - result = 0; - } - else if (weight > 20) - { - result = PG_INT32_MAX; - } - else - { - NumericDigit nxt1 = (ndigits > 1) ? var->digits[1] : 0; - - weight = (weight + 11) * 4; - - result = var->digits[0]; - - /* - * "result" now has 1 to 4 nonzero decimal digits. We pack in more - * digits to make 7 in total (largest we can fit in 24 bits) - */ - - if (result > 999) - { - /* already have 4 digits, add 3 more */ - result = (result * 1000) + (nxt1 / 10); - weight += 3; - } - else if (result > 99) - { - /* already have 3 digits, add 4 more */ - result = (result * 10000) + nxt1; - weight += 2; - } - else if (result > 9) - { - NumericDigit nxt2 = (ndigits > 2) ? var->digits[2] : 0; - - /* already have 2 digits, add 5 more */ - result = (result * 100000) + (nxt1 * 10) + (nxt2 / 1000); - weight += 1; - } - else - { - NumericDigit nxt2 = (ndigits > 2) ? var->digits[2] : 0; - - /* already have 1 digit, add 6 more */ - result = (result * 1000000) + (nxt1 * 100) + (nxt2 / 100); - } - - result = result | (weight << 24); - } - - /* the abbrev is negated relative to the original */ - if (var->sign == NUMERIC_POS) - result = -result; - - if (nss->estimating) - { - uint32 tmp = (uint32) result; - - addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); - } - - return NumericAbbrevGetDatum(result); -} - -#endif /* NUMERIC_ABBREV_BITS == 32 */ /* * Ordinary (non-sortsupport) comparisons follow. @@ -2969,20 +2867,18 @@ numeric_add(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_add_opt_error(num1, num2, NULL); + res = numeric_add_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_add_opt_error() - + * numeric_add_safe() - * - * Internal version of numeric_add(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_add() with support for soft error reporting. */ Numeric -numeric_add_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_add_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3026,7 +2922,7 @@ numeric_add_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); add_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -3046,21 +2942,19 @@ numeric_sub(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_sub_opt_error(num1, num2, NULL); + res = numeric_sub_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_sub_opt_error() - + * numeric_sub_safe() - * - * Internal version of numeric_sub(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_sub() with support for soft error reporting. 
*/ Numeric -numeric_sub_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_sub_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3104,7 +2998,7 @@ numeric_sub_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); sub_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -3124,21 +3018,19 @@ numeric_mul(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_mul_opt_error(num1, num2, NULL); + res = numeric_mul_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_mul_opt_error() - + * numeric_mul_safe() - * - * Internal version of numeric_mul(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_mul() with support for soft error reporting. */ Numeric -numeric_mul_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_mul_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3225,7 +3117,7 @@ numeric_mul_opt_error(Numeric num1, Numeric num2, bool *have_error) if (result.dscale > NUMERIC_DSCALE_MAX) round_var(&result, NUMERIC_DSCALE_MAX); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -3245,21 +3137,19 @@ numeric_div(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_div_opt_error(num1, num2, NULL); + res = numeric_div_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_div_opt_error() - + * numeric_div_safe() - * - * Internal version of numeric_div(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_div() with support for soft error reporting. 
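+ *
+ * Division by zero is also reported softly here: with a non-NULL
+ * "escontext" the division_by_zero exit below records the error and
+ * returns NULL instead of throwing.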
*/ Numeric -numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_div_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3267,9 +3157,6 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) Numeric res; int rscale; - if (have_error) - *have_error = false; - /* * Handle NaN and infinities */ @@ -3284,15 +3171,7 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) switch (numeric_sign_internal(num2)) { case 0: - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - break; + goto division_by_zero; case 1: return make_result(&const_pinf); case -1: @@ -3307,15 +3186,7 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) switch (numeric_sign_internal(num2)) { case 0: - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - break; + goto division_by_zero; case 1: return make_result(&const_ninf); case -1: @@ -3346,25 +3217,25 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) */ rscale = select_div_scale(&arg1, &arg2); - /* - * If "have_error" is provided, check for division by zero here - */ - if (have_error && (arg2.ndigits == 0 || arg2.digits[0] == 0)) - { - *have_error = true; - return NULL; - } + /* Check for division by zero */ + if (arg2.ndigits == 0 || arg2.digits[0] == 0) + goto division_by_zero; /* * Do the divide and return the result */ div_var(&arg1, &arg2, &result, rscale, true, true); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); return res; + +division_by_zero: + ereturn(escontext, NULL, + errcode(ERRCODE_DIVISION_BY_ZERO), + errmsg("division by zero")); } @@ -3469,30 +3340,25 @@ numeric_mod(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_mod_opt_error(num1, num2, NULL); + res = numeric_mod_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_mod_opt_error() - + * numeric_mod_safe() - * - * Internal version of numeric_mod(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_mod() with support for soft error reporting. */ Numeric -numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_mod_safe(Numeric num1, Numeric num2, Node *escontext) { Numeric res; NumericVar arg1; NumericVar arg2; NumericVar result; - if (have_error) - *have_error = false; - /* * Handle NaN and infinities. 
We follow POSIX fmod() on this, except that * POSIX treats x-is-infinite and y-is-zero identically, raising EDOM and @@ -3505,16 +3371,8 @@ numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) if (NUMERIC_IS_INF(num1)) { if (numeric_sign_internal(num2) == 0) - { - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - } + goto division_by_zero; + /* Inf % any nonzero = NaN */ return make_result(&const_nan); } @@ -3527,22 +3385,22 @@ numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); - /* - * If "have_error" is provided, check for division by zero here - */ - if (have_error && (arg2.ndigits == 0 || arg2.digits[0] == 0)) - { - *have_error = true; - return NULL; - } + /* Check for division by zero */ + if (arg2.ndigits == 0 || arg2.digits[0] == 0) + goto division_by_zero; mod_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, NULL); + res = make_result_safe(&result, escontext); free_var(&result); return res; + +division_by_zero: + ereturn(escontext, NULL, + errcode(ERRCODE_DIVISION_BY_ZERO), + errmsg("division by zero")); } @@ -4465,25 +4323,13 @@ int64_div_fast_to_numeric(int64 val1, int log10val2) if (unlikely(pg_mul_s64_overflow(val1, factor, &new_val1))) { -#ifdef HAVE_INT128 /* do the multiplication using 128-bit integers */ - int128 tmp; + INT128 tmp; - tmp = (int128) val1 * (int128) factor; + tmp = int64_to_int128(0); + int128_add_int64_mul_int64(&tmp, val1, factor); int128_to_numericvar(tmp, &result); -#else - /* do the multiplication using numerics */ - NumericVar tmp; - - init_var(&tmp); - - int64_to_numericvar(val1, &result); - int64_to_numericvar(factor, &tmp); - mul_var(&result, &tmp, &result, 0); - - free_var(&tmp); -#endif } else int64_to_numericvar(new_val1, &result); @@ -4511,52 +4357,34 @@ int4_numeric(PG_FUNCTION_ARGS) PG_RETURN_NUMERIC(int64_to_numeric(val)); } +/* + * Internal version of numeric_int4() with support for soft error reporting. 
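+ *
+ * The ereturn() calls below throw as before when "escontext" is NULL;
+ * when it is an ErrorSaveContext, the error is recorded and 0 is returned.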
+ */ int32 -numeric_int4_opt_error(Numeric num, bool *have_error) +numeric_int4_safe(Numeric num, Node *escontext) { NumericVar x; int32 result; - if (have_error) - *have_error = false; - if (NUMERIC_IS_SPECIAL(num)) { - if (have_error) - { - *have_error = true; - return 0; - } + if (NUMERIC_IS_NAN(num)) + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert NaN to %s", "integer"))); else - { - if (NUMERIC_IS_NAN(num)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert NaN to %s", "integer"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert infinity to %s", "integer"))); - } + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert infinity to %s", "integer"))); } /* Convert to variable format, then convert to int4 */ init_var_from_num(num, &x); if (!numericvar_to_int32(&x, &result)) - { - if (have_error) - { - *have_error = true; - return 0; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); - } - } + ereturn(escontext, 0, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range"))); return result; } @@ -4566,7 +4394,7 @@ numeric_int4(PG_FUNCTION_ARGS) { Numeric num = PG_GETARG_NUMERIC(0); - PG_RETURN_INT32(numeric_int4_opt_error(num, NULL)); + PG_RETURN_INT32(numeric_int4_safe(num, NULL)); } /* @@ -4599,52 +4427,34 @@ int8_numeric(PG_FUNCTION_ARGS) PG_RETURN_NUMERIC(int64_to_numeric(val)); } +/* + * Internal version of numeric_int8() with support for soft error reporting. + */ int64 -numeric_int8_opt_error(Numeric num, bool *have_error) +numeric_int8_safe(Numeric num, Node *escontext) { NumericVar x; int64 result; - if (have_error) - *have_error = false; - if (NUMERIC_IS_SPECIAL(num)) { - if (have_error) - { - *have_error = true; - return 0; - } + if (NUMERIC_IS_NAN(num)) + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert NaN to %s", "bigint"))); else - { - if (NUMERIC_IS_NAN(num)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert NaN to %s", "bigint"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert infinity to %s", "bigint"))); - } + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert infinity to %s", "bigint"))); } /* Convert to variable format, then convert to int8 */ init_var_from_num(num, &x); if (!numericvar_to_int64(&x, &result)) - { - if (have_error) - { - *have_error = true; - return 0; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range"))); - } - } + ereturn(escontext, 0, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range"))); return result; } @@ -4654,7 +4464,7 @@ numeric_int8(PG_FUNCTION_ARGS) { Numeric num = PG_GETARG_NUMERIC(0); - PG_RETURN_INT64(numeric_int8_opt_error(num, NULL)); + PG_RETURN_INT64(numeric_int8_safe(num, NULL)); } @@ -4903,8 +4713,8 @@ numeric_pg_lsn(PG_FUNCTION_ARGS) * Actually, it's a pointer to a NumericAggState allocated in the aggregate * context. The digit buffers for the NumericVars will be there too. * - * On platforms which support 128-bit integers some aggregates instead use a - * 128-bit integer based transition datatype to speed up calculations. 
+ * For integer inputs, some aggregates use special-purpose 64-bit or 128-bit + * integer based transition datatypes to speed up calculations. * * ---------------------------------------------------------------------- */ @@ -4943,7 +4753,7 @@ makeNumericAggState(FunctionCallInfo fcinfo, bool calcSumX2) old_context = MemoryContextSwitchTo(agg_context); - state = (NumericAggState *) palloc0(sizeof(NumericAggState)); + state = palloc0_object(NumericAggState); state->calcSumX2 = calcSumX2; state->agg_context = agg_context; @@ -4961,7 +4771,7 @@ makeNumericAggStateCurrentContext(bool calcSumX2) { NumericAggState *state; - state = (NumericAggState *) palloc0(sizeof(NumericAggState)); + state = palloc0_object(NumericAggState); state->calcSumX2 = calcSumX2; state->agg_context = CurrentMemoryContext; @@ -5568,26 +5378,27 @@ numeric_accum_inv(PG_FUNCTION_ARGS) /* - * Integer data types in general use Numeric accumulators to share code - * and avoid risk of overflow. + * Integer data types in general use Numeric accumulators to share code and + * avoid risk of overflow. However for performance reasons optimized + * special-purpose accumulator routines are used when possible: * - * However for performance reasons optimized special-purpose accumulator - * routines are used when possible. + * For 16-bit and 32-bit inputs, N and sum(X) fit into 64-bit, so 64-bit + * accumulators are used for SUM and AVG of these data types. * - * On platforms with 128-bit integer support, the 128-bit routines will be - * used when sum(X) or sum(X*X) fit into 128-bit. + * For 16-bit and 32-bit inputs, sum(X^2) fits into 128-bit, so 128-bit + * accumulators are used for STDDEV_POP, STDDEV_SAMP, VAR_POP, and VAR_SAMP of + * these data types. * - * For 16 and 32 bit inputs, the N and sum(X) fit into 64-bit so the 64-bit - * accumulators will be used for SUM and AVG of these data types. + * For 64-bit inputs, sum(X) fits into 128-bit, so a 128-bit accumulator is + * used for SUM(int8) and AVG(int8). */ -#ifdef HAVE_INT128 typedef struct Int128AggState { bool calcSumX2; /* if true, calculate sumX2 */ int64 N; /* count of processed numbers */ - int128 sumX; /* sum of processed numbers */ - int128 sumX2; /* sum of squares of processed numbers */ + INT128 sumX; /* sum of processed numbers */ + INT128 sumX2; /* sum of squares of processed numbers */ } Int128AggState; /* @@ -5606,7 +5417,7 @@ makeInt128AggState(FunctionCallInfo fcinfo, bool calcSumX2) old_context = MemoryContextSwitchTo(agg_context); - state = (Int128AggState *) palloc0(sizeof(Int128AggState)); + state = palloc0_object(Int128AggState); state->calcSumX2 = calcSumX2; MemoryContextSwitchTo(old_context); @@ -5623,7 +5434,7 @@ makeInt128AggStateCurrentContext(bool calcSumX2) { Int128AggState *state; - state = (Int128AggState *) palloc0(sizeof(Int128AggState)); + state = palloc0_object(Int128AggState); state->calcSumX2 = calcSumX2; return state; @@ -5633,12 +5444,12 @@ makeInt128AggStateCurrentContext(bool calcSumX2) * Accumulate a new input value for 128-bit aggregate functions. */ static void -do_int128_accum(Int128AggState *state, int128 newval) +do_int128_accum(Int128AggState *state, int64 newval) { if (state->calcSumX2) - state->sumX2 += newval * newval; + int128_add_int64_mul_int64(&state->sumX2, newval, newval); - state->sumX += newval; + int128_add_int64(&state->sumX, newval); state->N++; } @@ -5646,43 +5457,28 @@ do_int128_accum(Int128AggState *state, int128 newval) * Remove an input value from the aggregated state. 
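 *
 * This is the inverse transition of do_int128_accum(), used by the
 * inverse (moving-window) aggregate functions such as int2_accum_inv().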
*/ static void -do_int128_discard(Int128AggState *state, int128 newval) +do_int128_discard(Int128AggState *state, int64 newval) { if (state->calcSumX2) - state->sumX2 -= newval * newval; + int128_sub_int64_mul_int64(&state->sumX2, newval, newval); - state->sumX -= newval; + int128_sub_int64(&state->sumX, newval); state->N--; } -typedef Int128AggState PolyNumAggState; -#define makePolyNumAggState makeInt128AggState -#define makePolyNumAggStateCurrentContext makeInt128AggStateCurrentContext -#else -typedef NumericAggState PolyNumAggState; -#define makePolyNumAggState makeNumericAggState -#define makePolyNumAggStateCurrentContext makeNumericAggStateCurrentContext -#endif - Datum int2_accum(PG_FUNCTION_ARGS) { - PolyNumAggState *state; + Int128AggState *state; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); /* Create the state data on the first call */ if (state == NULL) - state = makePolyNumAggState(fcinfo, true); + state = makeInt128AggState(fcinfo, true); if (!PG_ARGISNULL(1)) - { -#ifdef HAVE_INT128 - do_int128_accum(state, (int128) PG_GETARG_INT16(1)); -#else - do_numeric_accum(state, int64_to_numeric(PG_GETARG_INT16(1))); -#endif - } + do_int128_accum(state, PG_GETARG_INT16(1)); PG_RETURN_POINTER(state); } @@ -5690,22 +5486,16 @@ int2_accum(PG_FUNCTION_ARGS) Datum int4_accum(PG_FUNCTION_ARGS) { - PolyNumAggState *state; + Int128AggState *state; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); /* Create the state data on the first call */ if (state == NULL) - state = makePolyNumAggState(fcinfo, true); + state = makeInt128AggState(fcinfo, true); if (!PG_ARGISNULL(1)) - { -#ifdef HAVE_INT128 - do_int128_accum(state, (int128) PG_GETARG_INT32(1)); -#else - do_numeric_accum(state, int64_to_numeric(PG_GETARG_INT32(1))); -#endif - } + do_int128_accum(state, PG_GETARG_INT32(1)); PG_RETURN_POINTER(state); } @@ -5728,21 +5518,21 @@ int8_accum(PG_FUNCTION_ARGS) } /* - * Combine function for numeric aggregates which require sumX2 + * Combine function for Int128AggState for aggregates which require sumX2 */ Datum numeric_poly_combine(PG_FUNCTION_ARGS) { - PolyNumAggState *state1; - PolyNumAggState *state2; + Int128AggState *state1; + Int128AggState *state2; MemoryContext agg_context; MemoryContext old_context; if (!AggCheckCallContext(fcinfo, &agg_context)) elog(ERROR, "aggregate function called in non-aggregate context"); - state1 = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); - state2 = PG_ARGISNULL(1) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(1); + state1 = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); + state2 = PG_ARGISNULL(1) ? 
NULL : (Int128AggState *) PG_GETARG_POINTER(1); if (state2 == NULL) PG_RETURN_POINTER(state1); @@ -5752,16 +5542,10 @@ numeric_poly_combine(PG_FUNCTION_ARGS) { old_context = MemoryContextSwitchTo(agg_context); - state1 = makePolyNumAggState(fcinfo, true); + state1 = makeInt128AggState(fcinfo, true); state1->N = state2->N; - -#ifdef HAVE_INT128 state1->sumX = state2->sumX; state1->sumX2 = state2->sumX2; -#else - accum_sum_copy(&state1->sumX, &state2->sumX); - accum_sum_copy(&state1->sumX2, &state2->sumX2); -#endif MemoryContextSwitchTo(old_context); @@ -5771,54 +5555,51 @@ numeric_poly_combine(PG_FUNCTION_ARGS) if (state2->N > 0) { state1->N += state2->N; + int128_add_int128(&state1->sumX, state2->sumX); + int128_add_int128(&state1->sumX2, state2->sumX2); + } + PG_RETURN_POINTER(state1); +} -#ifdef HAVE_INT128 - state1->sumX += state2->sumX; - state1->sumX2 += state2->sumX2; -#else - /* The rest of this needs to work in the aggregate context */ - old_context = MemoryContextSwitchTo(agg_context); - - /* Accumulate sums */ - accum_sum_combine(&state1->sumX, &state2->sumX); - accum_sum_combine(&state1->sumX2, &state2->sumX2); +/* + * int128_serialize - serialize a 128-bit integer to binary format + */ +static inline void +int128_serialize(StringInfo buf, INT128 val) +{ + pq_sendint64(buf, PG_INT128_HI_INT64(val)); + pq_sendint64(buf, PG_INT128_LO_UINT64(val)); +} - MemoryContextSwitchTo(old_context); -#endif +/* + * int128_deserialize - deserialize binary format to a 128-bit integer. + */ +static inline INT128 +int128_deserialize(StringInfo buf) +{ + int64 hi = pq_getmsgint64(buf); + uint64 lo = pq_getmsgint64(buf); - } - PG_RETURN_POINTER(state1); + return make_int128(hi, lo); } /* * numeric_poly_serialize - * Serialize PolyNumAggState into bytea for aggregate functions which + * Serialize Int128AggState into bytea for aggregate functions which * require sumX2. */ Datum numeric_poly_serialize(PG_FUNCTION_ARGS) { - PolyNumAggState *state; + Int128AggState *state; StringInfoData buf; bytea *result; - NumericVar tmp_var; /* Ensure we disallow calling when not in aggregate context */ if (!AggCheckCallContext(fcinfo, NULL)) elog(ERROR, "aggregate function called in non-aggregate context"); - state = (PolyNumAggState *) PG_GETARG_POINTER(0); - - /* - * If the platform supports int128 then sumX and sumX2 will be a 128 bit - * integer type. Here we'll convert that into a numeric type so that the - * combine state is in the same format for both int128 enabled machines - * and machines which don't support that type. The logic here is that one - * day we might like to send these over to another server for further - * processing and we want a standard format to work with. 
- */ - - init_var(&tmp_var); + state = (Int128AggState *) PG_GETARG_POINTER(0); pq_begintypsend(&buf); @@ -5826,48 +5607,33 @@ numeric_poly_serialize(PG_FUNCTION_ARGS) pq_sendint64(&buf, state->N); /* sumX */ -#ifdef HAVE_INT128 - int128_to_numericvar(state->sumX, &tmp_var); -#else - accum_sum_final(&state->sumX, &tmp_var); -#endif - numericvar_serialize(&buf, &tmp_var); + int128_serialize(&buf, state->sumX); /* sumX2 */ -#ifdef HAVE_INT128 - int128_to_numericvar(state->sumX2, &tmp_var); -#else - accum_sum_final(&state->sumX2, &tmp_var); -#endif - numericvar_serialize(&buf, &tmp_var); + int128_serialize(&buf, state->sumX2); result = pq_endtypsend(&buf); - free_var(&tmp_var); - PG_RETURN_BYTEA_P(result); } /* * numeric_poly_deserialize - * Deserialize PolyNumAggState from bytea for aggregate functions which + * Deserialize Int128AggState from bytea for aggregate functions which * require sumX2. */ Datum numeric_poly_deserialize(PG_FUNCTION_ARGS) { bytea *sstate; - PolyNumAggState *result; + Int128AggState *result; StringInfoData buf; - NumericVar tmp_var; if (!AggCheckCallContext(fcinfo, NULL)) elog(ERROR, "aggregate function called in non-aggregate context"); sstate = PG_GETARG_BYTEA_PP(0); - init_var(&tmp_var); - /* * Initialize a StringInfo so that we can "receive" it using the standard * recv-function infrastructure. @@ -5875,31 +5641,19 @@ numeric_poly_deserialize(PG_FUNCTION_ARGS) initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate), VARSIZE_ANY_EXHDR(sstate)); - result = makePolyNumAggStateCurrentContext(false); + result = makeInt128AggStateCurrentContext(false); /* N */ result->N = pq_getmsgint64(&buf); /* sumX */ - numericvar_deserialize(&buf, &tmp_var); -#ifdef HAVE_INT128 - numericvar_to_int128(&tmp_var, &result->sumX); -#else - accum_sum_add(&result->sumX, &tmp_var); -#endif + result->sumX = int128_deserialize(&buf); /* sumX2 */ - numericvar_deserialize(&buf, &tmp_var); -#ifdef HAVE_INT128 - numericvar_to_int128(&tmp_var, &result->sumX2); -#else - accum_sum_add(&result->sumX2, &tmp_var); -#endif + result->sumX2 = int128_deserialize(&buf); pq_getmsgend(&buf); - free_var(&tmp_var); - PG_RETURN_POINTER(result); } @@ -5909,43 +5663,37 @@ numeric_poly_deserialize(PG_FUNCTION_ARGS) Datum int8_avg_accum(PG_FUNCTION_ARGS) { - PolyNumAggState *state; + Int128AggState *state; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); /* Create the state data on the first call */ if (state == NULL) - state = makePolyNumAggState(fcinfo, false); + state = makeInt128AggState(fcinfo, false); if (!PG_ARGISNULL(1)) - { -#ifdef HAVE_INT128 - do_int128_accum(state, (int128) PG_GETARG_INT64(1)); -#else - do_numeric_accum(state, int64_to_numeric(PG_GETARG_INT64(1))); -#endif - } + do_int128_accum(state, PG_GETARG_INT64(1)); PG_RETURN_POINTER(state); } /* - * Combine function for PolyNumAggState for aggregates which don't require + * Combine function for Int128AggState for aggregates which don't require * sumX2 */ Datum int8_avg_combine(PG_FUNCTION_ARGS) { - PolyNumAggState *state1; - PolyNumAggState *state2; + Int128AggState *state1; + Int128AggState *state2; MemoryContext agg_context; MemoryContext old_context; if (!AggCheckCallContext(fcinfo, &agg_context)) elog(ERROR, "aggregate function called in non-aggregate context"); - state1 = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); - state2 = PG_ARGISNULL(1) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(1); + state1 = PG_ARGISNULL(0) ? 
NULL : (Int128AggState *) PG_GETARG_POINTER(0); + state2 = PG_ARGISNULL(1) ? NULL : (Int128AggState *) PG_GETARG_POINTER(1); if (state2 == NULL) PG_RETURN_POINTER(state1); @@ -5955,14 +5703,10 @@ int8_avg_combine(PG_FUNCTION_ARGS) { old_context = MemoryContextSwitchTo(agg_context); - state1 = makePolyNumAggState(fcinfo, false); + state1 = makeInt128AggState(fcinfo, false); state1->N = state2->N; - -#ifdef HAVE_INT128 state1->sumX = state2->sumX; -#else - accum_sum_copy(&state1->sumX, &state2->sumX); -#endif + MemoryContextSwitchTo(old_context); PG_RETURN_POINTER(state1); @@ -5971,52 +5715,28 @@ int8_avg_combine(PG_FUNCTION_ARGS) if (state2->N > 0) { state1->N += state2->N; - -#ifdef HAVE_INT128 - state1->sumX += state2->sumX; -#else - /* The rest of this needs to work in the aggregate context */ - old_context = MemoryContextSwitchTo(agg_context); - - /* Accumulate sums */ - accum_sum_combine(&state1->sumX, &state2->sumX); - - MemoryContextSwitchTo(old_context); -#endif - + int128_add_int128(&state1->sumX, state2->sumX); } PG_RETURN_POINTER(state1); } /* * int8_avg_serialize - * Serialize PolyNumAggState into bytea using the standard - * recv-function infrastructure. + * Serialize Int128AggState into bytea for aggregate functions which + * don't require sumX2. */ Datum int8_avg_serialize(PG_FUNCTION_ARGS) { - PolyNumAggState *state; + Int128AggState *state; StringInfoData buf; bytea *result; - NumericVar tmp_var; /* Ensure we disallow calling when not in aggregate context */ if (!AggCheckCallContext(fcinfo, NULL)) elog(ERROR, "aggregate function called in non-aggregate context"); - state = (PolyNumAggState *) PG_GETARG_POINTER(0); - - /* - * If the platform supports int128 then sumX will be a 128 integer type. - * Here we'll convert that into a numeric type so that the combine state - * is in the same format for both int128 enabled machines and machines - * which don't support that type. The logic here is that one day we might - * like to send these over to another server for further processing and we - * want a standard format to work with. - */ - - init_var(&tmp_var); + state = (Int128AggState *) PG_GETARG_POINTER(0); pq_begintypsend(&buf); @@ -6024,39 +5744,30 @@ int8_avg_serialize(PG_FUNCTION_ARGS) pq_sendint64(&buf, state->N); /* sumX */ -#ifdef HAVE_INT128 - int128_to_numericvar(state->sumX, &tmp_var); -#else - accum_sum_final(&state->sumX, &tmp_var); -#endif - numericvar_serialize(&buf, &tmp_var); + int128_serialize(&buf, state->sumX); result = pq_endtypsend(&buf); - free_var(&tmp_var); - PG_RETURN_BYTEA_P(result); } /* * int8_avg_deserialize - * Deserialize bytea back into PolyNumAggState. + * Deserialize Int128AggState from bytea for aggregate functions which + * don't require sumX2. */ Datum int8_avg_deserialize(PG_FUNCTION_ARGS) { bytea *sstate; - PolyNumAggState *result; + Int128AggState *result; StringInfoData buf; - NumericVar tmp_var; if (!AggCheckCallContext(fcinfo, NULL)) elog(ERROR, "aggregate function called in non-aggregate context"); sstate = PG_GETARG_BYTEA_PP(0); - init_var(&tmp_var); - /* * Initialize a StringInfo so that we can "receive" it using the standard * recv-function infrastructure. 
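As an aside for reviewers: the wire format produced by int128_serialize() above is simply the signed high half followed by the unsigned low half, each sent as a 64-bit integer, and int128_deserialize() reverses that with make_int128(). A minimal sketch of the round trip, assuming the patch's common/int128.h helpers (PG_INT128_HI_INT64, PG_INT128_LO_UINT64, make_int128, int128_compare):

static void
int128_split_roundtrip(INT128 val)
{
	int64		hi = PG_INT128_HI_INT64(val);	/* sent first */
	uint64		lo = PG_INT128_LO_UINT64(val);	/* sent second */
	INT128		back = make_int128(hi, lo); /* rebuilt on receipt */

	/* the high/low split is lossless over the full signed 128-bit range */
	Assert(int128_compare(val, back) == 0);
}
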
@@ -6064,23 +5775,16 @@ int8_avg_deserialize(PG_FUNCTION_ARGS) initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate), VARSIZE_ANY_EXHDR(sstate)); - result = makePolyNumAggStateCurrentContext(false); + result = makeInt128AggStateCurrentContext(false); /* N */ result->N = pq_getmsgint64(&buf); /* sumX */ - numericvar_deserialize(&buf, &tmp_var); -#ifdef HAVE_INT128 - numericvar_to_int128(&tmp_var, &result->sumX); -#else - accum_sum_add(&result->sumX, &tmp_var); -#endif + result->sumX = int128_deserialize(&buf); pq_getmsgend(&buf); - free_var(&tmp_var); - PG_RETURN_POINTER(result); } @@ -6091,24 +5795,16 @@ int8_avg_deserialize(PG_FUNCTION_ARGS) Datum int2_accum_inv(PG_FUNCTION_ARGS) { - PolyNumAggState *state; + Int128AggState *state; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); /* Should not get here with no state */ if (state == NULL) elog(ERROR, "int2_accum_inv called with NULL state"); if (!PG_ARGISNULL(1)) - { -#ifdef HAVE_INT128 - do_int128_discard(state, (int128) PG_GETARG_INT16(1)); -#else - /* Should never fail, all inputs have dscale 0 */ - if (!do_numeric_discard(state, int64_to_numeric(PG_GETARG_INT16(1)))) - elog(ERROR, "do_numeric_discard failed unexpectedly"); -#endif - } + do_int128_discard(state, PG_GETARG_INT16(1)); PG_RETURN_POINTER(state); } @@ -6116,24 +5812,16 @@ int2_accum_inv(PG_FUNCTION_ARGS) Datum int4_accum_inv(PG_FUNCTION_ARGS) { - PolyNumAggState *state; + Int128AggState *state; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); /* Should not get here with no state */ if (state == NULL) elog(ERROR, "int4_accum_inv called with NULL state"); if (!PG_ARGISNULL(1)) - { -#ifdef HAVE_INT128 - do_int128_discard(state, (int128) PG_GETARG_INT32(1)); -#else - /* Should never fail, all inputs have dscale 0 */ - if (!do_numeric_discard(state, int64_to_numeric(PG_GETARG_INT32(1)))) - elog(ERROR, "do_numeric_discard failed unexpectedly"); -#endif - } + do_int128_discard(state, PG_GETARG_INT32(1)); PG_RETURN_POINTER(state); } @@ -6162,24 +5850,16 @@ int8_accum_inv(PG_FUNCTION_ARGS) Datum int8_avg_accum_inv(PG_FUNCTION_ARGS) { - PolyNumAggState *state; + Int128AggState *state; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); /* Should not get here with no state */ if (state == NULL) elog(ERROR, "int8_avg_accum_inv called with NULL state"); if (!PG_ARGISNULL(1)) - { -#ifdef HAVE_INT128 - do_int128_discard(state, (int128) PG_GETARG_INT64(1)); -#else - /* Should never fail, all inputs have dscale 0 */ - if (!do_numeric_discard(state, int64_to_numeric(PG_GETARG_INT64(1)))) - elog(ERROR, "do_numeric_discard failed unexpectedly"); -#endif - } + do_int128_discard(state, PG_GETARG_INT64(1)); PG_RETURN_POINTER(state); } @@ -6187,12 +5867,11 @@ int8_avg_accum_inv(PG_FUNCTION_ARGS) Datum numeric_poly_sum(PG_FUNCTION_ARGS) { -#ifdef HAVE_INT128 - PolyNumAggState *state; + Int128AggState *state; Numeric res; NumericVar result; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? 
NULL : (Int128AggState *) PG_GETARG_POINTER(0); /* If there were no non-null inputs, return NULL */ if (state == NULL || state->N == 0) @@ -6207,21 +5886,17 @@ numeric_poly_sum(PG_FUNCTION_ARGS) free_var(&result); PG_RETURN_NUMERIC(res); -#else - return numeric_sum(fcinfo); -#endif } Datum numeric_poly_avg(PG_FUNCTION_ARGS) { -#ifdef HAVE_INT128 - PolyNumAggState *state; + Int128AggState *state; NumericVar result; Datum countd, sumd; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); /* If there were no non-null inputs, return NULL */ if (state == NULL || state->N == 0) @@ -6237,9 +5912,6 @@ numeric_poly_avg(PG_FUNCTION_ARGS) free_var(&result); PG_RETURN_DATUM(DirectFunctionCall2(numeric_div, sumd, countd)); -#else - return numeric_avg(fcinfo); -#endif } Datum @@ -6472,7 +6144,6 @@ numeric_stddev_pop(PG_FUNCTION_ARGS) PG_RETURN_NUMERIC(res); } -#ifdef HAVE_INT128 static Numeric numeric_poly_stddev_internal(Int128AggState *state, bool variance, bool sample, @@ -6516,17 +6187,15 @@ numeric_poly_stddev_internal(Int128AggState *state, return res; } -#endif Datum numeric_poly_var_samp(PG_FUNCTION_ARGS) { -#ifdef HAVE_INT128 - PolyNumAggState *state; + Int128AggState *state; Numeric res; bool is_null; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); res = numeric_poly_stddev_internal(state, true, true, &is_null); @@ -6534,20 +6203,16 @@ numeric_poly_var_samp(PG_FUNCTION_ARGS) PG_RETURN_NULL(); else PG_RETURN_NUMERIC(res); -#else - return numeric_var_samp(fcinfo); -#endif } Datum numeric_poly_stddev_samp(PG_FUNCTION_ARGS) { -#ifdef HAVE_INT128 - PolyNumAggState *state; + Int128AggState *state; Numeric res; bool is_null; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); res = numeric_poly_stddev_internal(state, false, true, &is_null); @@ -6555,20 +6220,16 @@ numeric_poly_stddev_samp(PG_FUNCTION_ARGS) PG_RETURN_NULL(); else PG_RETURN_NUMERIC(res); -#else - return numeric_stddev_samp(fcinfo); -#endif } Datum numeric_poly_var_pop(PG_FUNCTION_ARGS) { -#ifdef HAVE_INT128 - PolyNumAggState *state; + Int128AggState *state; Numeric res; bool is_null; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0); res = numeric_poly_stddev_internal(state, true, false, &is_null); @@ -6576,20 +6237,16 @@ numeric_poly_var_pop(PG_FUNCTION_ARGS) PG_RETURN_NULL(); else PG_RETURN_NUMERIC(res); -#else - return numeric_var_pop(fcinfo); -#endif } Datum numeric_poly_stddev_pop(PG_FUNCTION_ARGS) { -#ifdef HAVE_INT128 - PolyNumAggState *state; + Int128AggState *state; Numeric res; bool is_null; - state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0); + state = PG_ARGISNULL(0) ? 
NULL : (Int128AggState *) PG_GETARG_POINTER(0); res = numeric_poly_stddev_internal(state, false, false, &is_null); @@ -6597,9 +6254,6 @@ numeric_poly_stddev_pop(PG_FUNCTION_ARGS) PG_RETURN_NULL(); else PG_RETURN_NUMERIC(res); -#else - return numeric_stddev_pop(fcinfo); -#endif } /* @@ -6625,6 +6279,7 @@ numeric_poly_stddev_pop(PG_FUNCTION_ARGS) Datum int2_sum(PG_FUNCTION_ARGS) { + int64 oldsum; int64 newval; if (PG_ARGISNULL(0)) @@ -6637,43 +6292,22 @@ int2_sum(PG_FUNCTION_ARGS) PG_RETURN_INT64(newval); } - /* - * If we're invoked as an aggregate, we can cheat and modify our first - * parameter in-place to avoid palloc overhead. If not, we need to return - * the new value of the transition variable. (If int8 is pass-by-value, - * then of course this is useless as well as incorrect, so just ifdef it - * out.) - */ -#ifndef USE_FLOAT8_BYVAL /* controls int8 too */ - if (AggCheckCallContext(fcinfo, NULL)) - { - int64 *oldsum = (int64 *) PG_GETARG_POINTER(0); - - /* Leave the running sum unchanged in the new input is null */ - if (!PG_ARGISNULL(1)) - *oldsum = *oldsum + (int64) PG_GETARG_INT16(1); + oldsum = PG_GETARG_INT64(0); - PG_RETURN_POINTER(oldsum); - } - else -#endif - { - int64 oldsum = PG_GETARG_INT64(0); - - /* Leave sum unchanged if new input is null. */ - if (PG_ARGISNULL(1)) - PG_RETURN_INT64(oldsum); + /* Leave sum unchanged if new input is null. */ + if (PG_ARGISNULL(1)) + PG_RETURN_INT64(oldsum); - /* OK to do the addition. */ - newval = oldsum + (int64) PG_GETARG_INT16(1); + /* OK to do the addition. */ + newval = oldsum + (int64) PG_GETARG_INT16(1); - PG_RETURN_INT64(newval); - } + PG_RETURN_INT64(newval); } Datum int4_sum(PG_FUNCTION_ARGS) { + int64 oldsum; int64 newval; if (PG_ARGISNULL(0)) @@ -6686,38 +6320,16 @@ int4_sum(PG_FUNCTION_ARGS) PG_RETURN_INT64(newval); } - /* - * If we're invoked as an aggregate, we can cheat and modify our first - * parameter in-place to avoid palloc overhead. If not, we need to return - * the new value of the transition variable. (If int8 is pass-by-value, - * then of course this is useless as well as incorrect, so just ifdef it - * out.) - */ -#ifndef USE_FLOAT8_BYVAL /* controls int8 too */ - if (AggCheckCallContext(fcinfo, NULL)) - { - int64 *oldsum = (int64 *) PG_GETARG_POINTER(0); - - /* Leave the running sum unchanged in the new input is null */ - if (!PG_ARGISNULL(1)) - *oldsum = *oldsum + (int64) PG_GETARG_INT32(1); - - PG_RETURN_POINTER(oldsum); - } - else -#endif - { - int64 oldsum = PG_GETARG_INT64(0); + oldsum = PG_GETARG_INT64(0); - /* Leave sum unchanged if new input is null. */ - if (PG_ARGISNULL(1)) - PG_RETURN_INT64(oldsum); + /* Leave sum unchanged if new input is null. */ + if (PG_ARGISNULL(1)) + PG_RETURN_INT64(oldsum); - /* OK to do the addition. */ - newval = oldsum + (int64) PG_GETARG_INT32(1); + /* OK to do the addition. */ + newval = oldsum + (int64) PG_GETARG_INT32(1); - PG_RETURN_INT64(newval); - } + PG_RETURN_INT64(newval); } /* @@ -7888,16 +7500,13 @@ duplicate_numeric(Numeric num) } /* - * make_result_opt_error() - + * make_result_safe() - * * Create the packed db numeric format in palloc()'d memory from * a variable. This will handle NaN and Infinity cases. - * - * If "have_error" isn't NULL, on overflow *have_error is set to true and - * NULL is returned. This is helpful when caller needs to handle errors. 
*/ static Numeric -make_result_opt_error(const NumericVar *var, bool *have_error) +make_result_safe(const NumericVar *var, Node *escontext) { Numeric result; NumericDigit *digits = var->digits; @@ -7906,9 +7515,6 @@ make_result_opt_error(const NumericVar *var, bool *have_error) int n; Size len; - if (have_error) - *have_error = false; - if ((sign & NUMERIC_SIGN_MASK) == NUMERIC_SPECIAL) { /* @@ -7981,19 +7587,9 @@ make_result_opt_error(const NumericVar *var, bool *have_error) /* Check for overflow of int16 fields */ if (NUMERIC_WEIGHT(result) != weight || NUMERIC_DSCALE(result) != var->dscale) - { - if (have_error) - { - *have_error = true; - return NULL; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value overflows numeric format"))); - } - } + ereturn(escontext, NULL, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value overflows numeric format"))); dump_numeric("make_result()", result); return result; @@ -8003,12 +7599,12 @@ make_result_opt_error(const NumericVar *var, bool *have_error) /* * make_result() - * - * An interface to make_result_opt_error() without "have_error" argument. + * An interface to make_result_safe() without "escontext" argument. */ static Numeric make_result(const NumericVar *var) { - return make_result_opt_error(var, NULL); + return make_result_safe(var, NULL); } @@ -8332,105 +7928,23 @@ numericvar_to_uint64(const NumericVar *var, uint64 *result) return true; } -#ifdef HAVE_INT128 -/* - * Convert numeric to int128, rounding if needed. - * - * If overflow, return false (no error is raised). Return true if okay. - */ -static bool -numericvar_to_int128(const NumericVar *var, int128 *result) -{ - NumericDigit *digits; - int ndigits; - int weight; - int i; - int128 val, - oldval; - bool neg; - NumericVar rounded; - - /* Round to nearest integer */ - init_var(&rounded); - set_var_from_var(var, &rounded); - round_var(&rounded, 0); - - /* Check for zero input */ - strip_var(&rounded); - ndigits = rounded.ndigits; - if (ndigits == 0) - { - *result = 0; - free_var(&rounded); - return true; - } - - /* - * For input like 10000000000, we must treat stripped digits as real. So - * the loop assumes there are weight+1 digits before the decimal point. - */ - weight = rounded.weight; - Assert(weight >= 0 && ndigits <= weight + 1); - - /* Construct the result */ - digits = rounded.digits; - neg = (rounded.sign == NUMERIC_NEG); - val = digits[0]; - for (i = 1; i <= weight; i++) - { - oldval = val; - val *= NBASE; - if (i < ndigits) - val += digits[i]; - - /* - * The overflow check is a bit tricky because we want to accept - * INT128_MIN, which will overflow the positive accumulator. We can - * detect this case easily though because INT128_MIN is the only - * nonzero value for which -val == val (on a two's complement machine, - * anyway). - */ - if ((val / NBASE) != oldval) /* possible overflow? */ - { - if (!neg || (-val) != val || val == 0 || oldval < 0) - { - free_var(&rounded); - return false; - } - } - } - - free_var(&rounded); - - *result = neg ? -val : val; - return true; -} - /* * Convert 128 bit integer to numeric. 
*/ static void -int128_to_numericvar(int128 val, NumericVar *var) +int128_to_numericvar(INT128 val, NumericVar *var) { - uint128 uval, - newuval; + int sign; NumericDigit *ptr; int ndigits; + int32 dig; /* int128 can require at most 39 decimal digits; add one for safety */ alloc_var(var, 40 / DEC_DIGITS); - if (val < 0) - { - var->sign = NUMERIC_NEG; - uval = -val; - } - else - { - var->sign = NUMERIC_POS; - uval = val; - } + sign = int128_sign(val); + var->sign = sign < 0 ? NUMERIC_NEG : NUMERIC_POS; var->dscale = 0; - if (val == 0) + if (sign == 0) { var->ndigits = 0; var->weight = 0; @@ -8442,15 +7956,13 @@ int128_to_numericvar(int128 val, NumericVar *var) { ptr--; ndigits++; - newuval = uval / NBASE; - *ptr = uval - newuval * NBASE; - uval = newuval; - } while (uval); + int128_div_mod_int32(&val, NBASE, &dig); + *ptr = (NumericDigit) abs(dig); + } while (!int128_is_zero(val)); var->digits = ptr; var->ndigits = ndigits; var->weight = ndigits - 1; } -#endif /* * Convert a NumericVar to float8; if out of range, return +/- HUGE_VAL diff --git a/src/backend/utils/adt/numutils.c b/src/backend/utils/adt/numutils.c index 3bf30774a0c94..254c5cf82e4b1 100644 --- a/src/backend/utils/adt/numutils.c +++ b/src/backend/utils/adt/numutils.c @@ -113,7 +113,7 @@ static const int8 hexlookup[128] = { * pg_strtoint16() will throw ereport() upon bad input format or overflow; * while pg_strtoint16_safe() instead returns such complaints in *escontext, * if it's an ErrorSaveContext. -* + * * NB: Accumulate input as an unsigned number, to deal with two's complement * representation of the most negative number, which can't be represented as a * signed positive number. diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c index 9457d23971581..ac3963fc3e032 100644 --- a/src/backend/utils/adt/orderedsetaggs.c +++ b/src/backend/utils/adt/orderedsetaggs.c @@ -153,7 +153,7 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples) qcontext = fcinfo->flinfo->fn_mcxt; oldcontext = MemoryContextSwitchTo(qcontext); - qstate = (OSAPerQueryState *) palloc0(sizeof(OSAPerQueryState)); + qstate = palloc0_object(OSAPerQueryState); qstate->aggref = aggref; qstate->qcontext = qcontext; @@ -278,7 +278,7 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples) /* Now build the stuff we need in group-lifespan context */ oldcontext = MemoryContextSwitchTo(gcontext); - osastate = (OSAPerGroupState *) palloc(sizeof(OSAPerGroupState)); + osastate = palloc_object(OSAPerGroupState); osastate->qstate = qstate; osastate->gcontext = gcontext; @@ -660,8 +660,8 @@ pct_info_cmp(const void *pa, const void *pb) */ static struct pct_info * setup_pct_info(int num_percentiles, - Datum *percentiles_datum, - bool *percentiles_null, + const Datum *percentiles_datum, + const bool *percentiles_null, int64 rowcount, bool continuous) { @@ -1007,7 +1007,7 @@ percentile_cont_float8_multi_final(PG_FUNCTION_ARGS) FLOAT8OID, /* hard-wired info on type float8 */ sizeof(float8), - FLOAT8PASSBYVAL, + true, TYPALIGN_DOUBLE, float8_lerp); } diff --git a/src/backend/utils/adt/pg_dependencies.c b/src/backend/utils/adt/pg_dependencies.c new file mode 100644 index 0000000000000..b5a79bb82bba6 --- /dev/null +++ b/src/backend/utils/adt/pg_dependencies.c @@ -0,0 +1,873 @@ +/*------------------------------------------------------------------------- + * + * pg_dependencies.c + * pg_dependencies data type support. 
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/pg_dependencies.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "common/int.h"
+#include "common/jsonapi.h"
+#include "lib/stringinfo.h"
+#include "mb/pg_wchar.h"
+#include "nodes/miscnodes.h"
+#include "statistics/extended_stats_internal.h"
+#include "statistics/statistics_format.h"
+#include "utils/builtins.h"
+#include "utils/float.h"
+#include "utils/fmgrprotos.h"
+
+typedef enum
+{
+	DEPS_EXPECT_START = 0,
+	DEPS_EXPECT_ITEM,
+	DEPS_EXPECT_KEY,
+	DEPS_EXPECT_ATTNUM_LIST,
+	DEPS_EXPECT_ATTNUM,
+	DEPS_EXPECT_DEPENDENCY,
+	DEPS_EXPECT_DEGREE,
+	DEPS_PARSE_COMPLETE,
+} DependenciesSemanticState;
+
+typedef struct
+{
+	const char *str;
+	DependenciesSemanticState state;
+
+	List	   *dependency_list;
+	Node	   *escontext;
+
+	bool		found_attributes;	/* Item has an attributes key */
+	bool		found_dependency;	/* Item has a dependency key */
+	bool		found_degree;	/* Item has a degree key */
+	List	   *attnum_list;	/* Accumulated attribute numbers */
+	AttrNumber	dependency;
+	double		degree;
+} DependenciesParseState;
+
+/*
+ * Invoked at the start of each MVDependency object.
+ *
+ * The entire JSON document should be one array of MVDependency objects.
+ *
+ * If we are anywhere else in the document, it's an error.
+ */
+static JsonParseErrorType
+dependencies_object_start(void *state)
+{
+	DependenciesParseState *parse = state;
+
+	switch (parse->state)
+	{
+		case DEPS_EXPECT_ITEM:
+			/* Now we expect to see attributes/dependency/degree keys */
+			parse->state = DEPS_EXPECT_KEY;
+			return JSON_SUCCESS;
+
+		case DEPS_EXPECT_START:
+			/* pg_dependencies must begin with a '[' */
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Initial element must be an array."));
+			break;
+
+		case DEPS_EXPECT_KEY:
+			/* In an object, expecting key */
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("A key was expected."));
+			break;
+
+		case DEPS_EXPECT_ATTNUM_LIST:
+			/* Just followed an "attributes": key */
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Value of \"%s\" must be an array of attribute numbers.",
+							  PG_DEPENDENCIES_KEY_ATTRIBUTES));
+			break;
+
+		case DEPS_EXPECT_ATTNUM:
+			/* In an attribute number list, expect only scalar integers */
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Attribute lists can only contain attribute numbers."));
+			break;
+
+		case DEPS_EXPECT_DEPENDENCY:
+			/* Just followed a "dependency" key */
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Value of \"%s\" must be an integer.",
+							  PG_DEPENDENCIES_KEY_DEPENDENCY));
+			break;
+
+		case DEPS_EXPECT_DEGREE:
+			/* Just followed a "degree" key */
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Value of \"%s\" must be a number.",
+							  PG_DEPENDENCIES_KEY_DEGREE));
+			break;
+
+		default:
+			elog(ERROR,
+				 "object start of \"%s\" found in unexpected parse state: %d.",
+				 "pg_dependencies", (int) parse->state);
+			break;
+	}
+
+	return JSON_SEM_ACTION_FAILED;
+}
+
+/*
+ * Invoked at the end of an object.
+ *
+ * Handle the end of an MVDependency object's JSON representation.
+ */
+static JsonParseErrorType
+dependencies_object_end(void *state)
+{
+	DependenciesParseState *parse = state;
+
+	MVDependency *dep;
+
+	int			natts = 0;
+
+	if (parse->state != DEPS_EXPECT_KEY)
+		elog(ERROR,
+			 "object end of \"%s\" found in unexpected parse state: %d.",
+			 "pg_dependencies", (int) parse->state);
+
+	if (!parse->found_attributes)
+	{
+		errsave(parse->escontext,
+				errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+				errdetail("Item must contain \"%s\" key.",
+						  PG_DEPENDENCIES_KEY_ATTRIBUTES));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	if (!parse->found_dependency)
+	{
+		errsave(parse->escontext,
+				errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+				errdetail("Item must contain \"%s\" key.",
+						  PG_DEPENDENCIES_KEY_DEPENDENCY));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	if (!parse->found_degree)
+	{
+		errsave(parse->escontext,
+				errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+				errdetail("Item must contain \"%s\" key.",
+						  PG_DEPENDENCIES_KEY_DEGREE));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	/*
+	 * We need at least one attribute number in a dependencies item, anything
+	 * less is malformed.
+	 */
+	natts = list_length(parse->attnum_list);
+	if ((natts < 1) || (natts > (STATS_MAX_DIMENSIONS - 1)))
+	{
+		errsave(parse->escontext,
+				errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+				errdetail("The \"%s\" key must contain an array of at least %d and no more than %d elements.",
+						  PG_DEPENDENCIES_KEY_ATTRIBUTES, 1,
+						  STATS_MAX_DIMENSIONS - 1));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	/*
+	 * Allocate enough space for the dependency, the attribute numbers in the
+	 * list and the final attribute number for the dependency.
+	 */
+	dep = palloc0(offsetof(MVDependency, attributes) + ((natts + 1) * sizeof(AttrNumber)));
+	dep->nattributes = natts + 1;
+
+	dep->attributes[natts] = parse->dependency;
+	dep->degree = parse->degree;
+
+	/*
+	 * Assign attribute numbers to the attributes array, comparing each one
+	 * against the dependency attribute to ensure that there are no matches.
+	 */
+	for (int i = 0; i < natts; i++)
+	{
+		dep->attributes[i] = (AttrNumber) list_nth_int(parse->attnum_list, i);
+		if (dep->attributes[i] == parse->dependency)
+		{
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Item \"%s\" with value %d has been found in the \"%s\" list.",
+							  PG_DEPENDENCIES_KEY_DEPENDENCY, parse->dependency,
+							  PG_DEPENDENCIES_KEY_ATTRIBUTES));
+			return JSON_SEM_ACTION_FAILED;
+		}
+	}
+
+	parse->dependency_list = lappend(parse->dependency_list, (void *) dep);
+
+	/*
+	 * Reset dependency item state variables to look for the next
+	 * MVDependency.
+	 */
+	list_free(parse->attnum_list);
+	parse->attnum_list = NIL;
+	parse->dependency = 0;
+	parse->degree = 0.0;
+	parse->found_attributes = false;
+	parse->found_dependency = false;
+	parse->found_degree = false;
+	parse->state = DEPS_EXPECT_ITEM;
+
+	return JSON_SUCCESS;
+}
+
+/*
+ * Invoked at the start of an array.
+ *
+ * Arrays are expected in two places here: the outer list of dependency
+ * items, and the attribute number list that follows an "attributes" key.
+ * An array anywhere else is an error.
+ */
+static JsonParseErrorType
+dependencies_array_start(void *state)
+{
+	DependenciesParseState *parse = state;
+
+	switch (parse->state)
+	{
+		case DEPS_EXPECT_ATTNUM_LIST:
+			parse->state = DEPS_EXPECT_ATTNUM;
+			break;
+		case DEPS_EXPECT_START:
+			parse->state = DEPS_EXPECT_ITEM;
+			break;
+		default:
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Array has been found at an unexpected location."));
+			return JSON_SEM_ACTION_FAILED;
+	}
+
+	return JSON_SUCCESS;
+}
+
+/*
+ * Invoked at the end of an array.
+ *
+ * Either the end of an attribute number list or of the outer item array.
+ */
+static JsonParseErrorType
+dependencies_array_end(void *state)
+{
+	DependenciesParseState *parse = state;
+
+	switch (parse->state)
+	{
+		case DEPS_EXPECT_ATTNUM:
+			if (list_length(parse->attnum_list) > 0)
+			{
+				parse->state = DEPS_EXPECT_KEY;
+				return JSON_SUCCESS;
+			}
+
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("The \"%s\" key must be a non-empty array.",
+							  PG_DEPENDENCIES_KEY_ATTRIBUTES));
+			break;
+
+		case DEPS_EXPECT_ITEM:
+			if (list_length(parse->dependency_list) > 0)
+			{
+				parse->state = DEPS_PARSE_COMPLETE;
+				return JSON_SUCCESS;
+			}
+
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Item array cannot be empty."));
+			break;
+
+		default:
+
+			/*
+			 * This can only happen if a case was missed in
+			 * dependencies_array_start().
+			 */
+			elog(ERROR,
+				 "array end of \"%s\" found in unexpected parse state: %d.",
+				 "pg_dependencies", (int) parse->state);
+			break;
+	}
+	return JSON_SEM_ACTION_FAILED;
+}
+
+/*
+ * Invoked at the start of a key/value field.
+ *
+ * The valid keys for the MVDependency object are:
+ * - attributes
+ * - dependency
+ * - degree
+ */
+static JsonParseErrorType
+dependencies_object_field_start(void *state, char *fname, bool isnull)
+{
+	DependenciesParseState *parse = state;
+
+	if (strcmp(fname, PG_DEPENDENCIES_KEY_ATTRIBUTES) == 0)
+	{
+		if (parse->found_attributes)
+		{
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Multiple \"%s\" keys are not allowed.",
+							  PG_DEPENDENCIES_KEY_ATTRIBUTES));
+			return JSON_SEM_ACTION_FAILED;
+		}
+
+		parse->found_attributes = true;
+		parse->state = DEPS_EXPECT_ATTNUM_LIST;
+		return JSON_SUCCESS;
+	}
+
+	if (strcmp(fname, PG_DEPENDENCIES_KEY_DEPENDENCY) == 0)
+	{
+		if (parse->found_dependency)
+		{
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Multiple \"%s\" keys are not allowed.",
+							  PG_DEPENDENCIES_KEY_DEPENDENCY));
+			return JSON_SEM_ACTION_FAILED;
+		}
+
+		parse->found_dependency = true;
+		parse->state = DEPS_EXPECT_DEPENDENCY;
+		return JSON_SUCCESS;
+	}
+
+	if (strcmp(fname, PG_DEPENDENCIES_KEY_DEGREE) == 0)
+	{
+		if (parse->found_degree)
+		{
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Multiple \"%s\" keys are not allowed.",
+							  PG_DEPENDENCIES_KEY_DEGREE));
+			return JSON_SEM_ACTION_FAILED;
+		}
+
+		parse->found_degree = true;
+		parse->state = DEPS_EXPECT_DEGREE;
+		return JSON_SUCCESS;
+	}
+
+	errsave(parse->escontext,
+			errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+			errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+			errdetail("Only allowed keys are \"%s\", \"%s\" and \"%s\".",
+					  PG_DEPENDENCIES_KEY_ATTRIBUTES,
+					  PG_DEPENDENCIES_KEY_DEPENDENCY,
+					  PG_DEPENDENCIES_KEY_DEGREE));
+	return JSON_SEM_ACTION_FAILED;
+}
+
+/*
+ * Invoked at the start of an array element.
+ *
+ * Array elements appear as attribute numbers within an attribute list and
+ * as items within the outer dependency list; in both cases they must not
+ * be null.
+ */
+static JsonParseErrorType
+dependencies_array_element_start(void *state, bool isnull)
+{
+	DependenciesParseState *parse = state;
+
+	switch (parse->state)
+	{
+		case DEPS_EXPECT_ATTNUM:
+			if (!isnull)
+				return JSON_SUCCESS;
+
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Attribute number array cannot be null."));
+			break;
+
+		case DEPS_EXPECT_ITEM:
+			if (!isnull)
+				return JSON_SUCCESS;
+
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+					errdetail("Item list elements cannot be null."));
+			break;
+
+		default:
+			elog(ERROR,
+				 "array element start of \"%s\" found in unexpected parse state: %d.",
+				 "pg_dependencies", (int) parse->state);
+			break;
+	}
+
+	return JSON_SEM_ACTION_FAILED;
+}
+
+/*
+ * Test for valid subsequent attribute number.
+ *
+ * If the previous value is positive, then the current value must either be
+ * greater than the previous value, or negative.
+ *
+ * If the previous value is negative, then the value must be less than
+ * the previous value.
+ *
+ * Duplicate values are not allowed; that is already covered by the rules
+ * described above.
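+ *
+ * For example, the sequence 1, 3, -1, -2 is valid, while 3, 1 (positives
+ * must increase) and 1, -2, -1 (negatives must decrease) are not.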
+ */
+static bool
+valid_subsequent_attnum(const AttrNumber prev, const AttrNumber cur)
+{
+	Assert(prev != 0);
+
+	if (prev > 0)
+		return ((cur > prev) || (cur < 0));
+
+	return (cur < prev);
+}
+
+/*
+ * Handle scalar events from the dependencies input parser.
+ *
+ * Scalars are expected in three places: as attribute numbers in an
+ * attribute list, as the value of a "dependency" key, and as the value of
+ * a "degree" key.  A scalar anywhere else is an error.
+ */
+static JsonParseErrorType
+dependencies_scalar(void *state, char *token, JsonTokenType tokentype)
+{
+	DependenciesParseState *parse = state;
+	AttrNumber	attnum;
+	ErrorSaveContext escontext = {T_ErrorSaveContext};
+
+	switch (parse->state)
+	{
+		case DEPS_EXPECT_ATTNUM:
+			attnum = pg_strtoint16_safe(token, (Node *) &escontext);
+
+			if (escontext.error_occurred)
+			{
+				errsave(parse->escontext,
+						errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+						errdetail("Key \"%s\" has an incorrect value.", PG_DEPENDENCIES_KEY_ATTRIBUTES));
+				return JSON_SEM_ACTION_FAILED;
+			}
+
+			/*
+			 * An attribute number cannot be zero or a negative number beyond
+			 * the number of the possible expressions.
+			 */
+			if (attnum == 0 || attnum < (0 - STATS_MAX_DIMENSIONS))
+			{
+				errsave(parse->escontext,
+						errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+						errdetail("Invalid \"%s\" element has been found: %d.",
+								  PG_DEPENDENCIES_KEY_ATTRIBUTES, attnum));
+				return JSON_SEM_ACTION_FAILED;
+			}
+
+			if (parse->attnum_list != NIL)
+			{
+				const AttrNumber prev = llast_int(parse->attnum_list);
+
+				if (!valid_subsequent_attnum(prev, attnum))
+				{
+					errsave(parse->escontext,
+							errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+							errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+							errdetail("Invalid \"%s\" element has been found: %d cannot follow %d.",
+									  PG_DEPENDENCIES_KEY_ATTRIBUTES, attnum, prev));
+					return JSON_SEM_ACTION_FAILED;
+				}
+			}
+
+			parse->attnum_list = lappend_int(parse->attnum_list, (int) attnum);
+			return JSON_SUCCESS;
+
+		case DEPS_EXPECT_DEPENDENCY:
+			parse->dependency = (AttrNumber)
+				pg_strtoint16_safe(token, (Node *) &escontext);
+
+			if (escontext.error_occurred)
+			{
+				errsave(parse->escontext,
+						errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						errmsg("malformed pg_dependencies: \"%s\"", parse->str),
+						errdetail("Key \"%s\" has an incorrect value.", PG_DEPENDENCIES_KEY_DEPENDENCY));
+				return JSON_SEM_ACTION_FAILED;
+			}
+
+			/*
+			 * The dependency attribute number cannot be zero or a negative
+			 * number beyond the number of the possible expressions.
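+			 * (Negative attribute numbers refer to expressions, so the
+			 * accepted values are -STATS_MAX_DIMENSIONS up to -1, plus any
+			 * positive attribute number.)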
+ */ + if (parse->dependency == 0 || parse->dependency < (0 - STATS_MAX_DIMENSIONS)) + { + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_dependencies: \"%s\"", parse->str), + errdetail("Key \"%s\" has an incorrect value: %d.", + PG_DEPENDENCIES_KEY_DEPENDENCY, parse->dependency)); + return JSON_SEM_ACTION_FAILED; + } + + parse->state = DEPS_EXPECT_KEY; + return JSON_SUCCESS; + + case DEPS_EXPECT_DEGREE: + parse->degree = float8in_internal(token, NULL, "double", + token, (Node *) &escontext); + + if (escontext.error_occurred) + { + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_dependencies: \"%s\"", parse->str), + errdetail("Key \"%s\" has an incorrect value.", PG_DEPENDENCIES_KEY_DEGREE)); + return JSON_SEM_ACTION_FAILED; + } + + parse->state = DEPS_EXPECT_KEY; + return JSON_SUCCESS; + + default: + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_dependencies: \"%s\"", parse->str), + errdetail("Unexpected scalar has been found.")); + break; + } + + return JSON_SEM_ACTION_FAILED; +} + +/* + * Compare the attribute arrays of two MVDependency values, + * looking for duplicated sets. + */ +static bool +dep_attributes_eq(const MVDependency *a, const MVDependency *b) +{ + int i; + + if (a->nattributes != b->nattributes) + return false; + + for (i = 0; i < a->nattributes; i++) + { + if (a->attributes[i] != b->attributes[i]) + return false; + } + + return true; +} + +/* + * Generate a string representing an array of attribute numbers. + * Internally, the dependency attribute is the last element, so we + * leave that off. + * + * Freeing the allocated string is the responsibility of the caller. + */ +static char * +dep_attnum_list(const MVDependency *item) +{ + StringInfoData str; + + initStringInfo(&str); + + appendStringInfo(&str, "%d", item->attributes[0]); + + for (int i = 1; i < item->nattributes - 1; i++) + appendStringInfo(&str, ", %d", item->attributes[i]); + + return str.data; +} + +/* + * Return the dependency, which is the last attribute element. + */ +static AttrNumber +dep_attnum_dependency(const MVDependency *item) +{ + return item->attributes[item->nattributes - 1]; +} + +/* + * Attempt to build and serialize the MVDependencies object. + * + * This can only be executed after the completion of the JSON parsing. + * + * In the event of an error, set the error context and return NULL. + */ +static bytea * +build_mvdependencies(DependenciesParseState *parse, char *str) +{ + int ndeps = list_length(parse->dependency_list); + + MVDependencies *mvdeps; + bytea *bytes; + + switch (parse->state) + { + case DEPS_PARSE_COMPLETE: + + /* + * Parse ended in the expected place. We should have a list of + * items, but if we do not there is an issue with one of the + * earlier parse steps. + */ + if (ndeps == 0) + elog(ERROR, + "pg_dependencies parsing claims success with an empty item list."); + break; + + case DEPS_EXPECT_START: + /* blank */ + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_dependencies: \"%s\"", str), + errdetail("Value cannot be empty.")); + return NULL; + + default: + /* Unexpected end-state. 
*/ + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_dependencies: \"%s\"", str), + errdetail("Unexpected end state has been found: %d.", parse->state)); + return NULL; + } + + mvdeps = palloc0(offsetof(MVDependencies, deps) + + (ndeps * sizeof(MVDependency *))); + mvdeps->magic = STATS_DEPS_MAGIC; + mvdeps->type = STATS_DEPS_TYPE_BASIC; + mvdeps->ndeps = ndeps; + + for (int i = 0; i < ndeps; i++) + { + /* + * Use the MVDependency objects in the dependency_list. + * + * Because we free the dependency_list after parsing is done, we + * cannot free it here. + */ + mvdeps->deps[i] = list_nth(parse->dependency_list, i); + + /* + * Ensure that this item does not duplicate the attributes of any + * pre-existing item. + */ + for (int j = 0; j < i; j++) + { + if (dep_attributes_eq(mvdeps->deps[i], mvdeps->deps[j])) + { + MVDependency *dep = mvdeps->deps[i]; + char *attnum_list = dep_attnum_list(dep); + AttrNumber attnum_dep = dep_attnum_dependency(dep); + + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_dependencies: \"%s\"", str), + errdetail("Duplicated \"%s\" array has been found: [%s] for key \"%s\" and value %d.", + PG_DEPENDENCIES_KEY_ATTRIBUTES, attnum_list, + PG_DEPENDENCIES_KEY_DEPENDENCY, attnum_dep)); + pfree(mvdeps); + return NULL; + } + } + } + + bytes = statext_dependencies_serialize(mvdeps); + + /* + * No need to free the individual MVDependency objects, because they are + * still in the dependency_list, and will be freed with that. + */ + pfree(mvdeps); + + return bytes; +} + + +/* + * pg_dependencies_in - input routine for type pg_dependencies. + * + * This format is valid JSON, with the expected format: + * [{"attributes": [1,2], "dependency": -1, "degree": 1.0000}, + * {"attributes": [1,-1], "dependency": 2, "degree": 0.0000}, + * {"attributes": [2,-1], "dependency": 1, "degree": 1.0000}] + * + */ +Datum +pg_dependencies_in(PG_FUNCTION_ARGS) +{ + char *str = PG_GETARG_CSTRING(0); + bytea *bytes = NULL; + + DependenciesParseState parse_state; + JsonParseErrorType result; + JsonLexContext *lex; + JsonSemAction sem_action; + + /* initialize the semantic state */ + parse_state.str = str; + parse_state.state = DEPS_EXPECT_START; + parse_state.dependency_list = NIL; + parse_state.attnum_list = NIL; + parse_state.dependency = 0; + parse_state.degree = 0.0; + parse_state.found_attributes = false; + parse_state.found_dependency = false; + parse_state.found_degree = false; + parse_state.escontext = fcinfo->context; + + /* set callbacks */ + sem_action.semstate = (void *) &parse_state; + sem_action.object_start = dependencies_object_start; + sem_action.object_end = dependencies_object_end; + sem_action.array_start = dependencies_array_start; + sem_action.array_end = dependencies_array_end; + sem_action.array_element_start = dependencies_array_element_start; + sem_action.array_element_end = NULL; + sem_action.object_field_start = dependencies_object_field_start; + sem_action.object_field_end = NULL; + sem_action.scalar = dependencies_scalar; + + lex = makeJsonLexContextCstringLen(NULL, str, strlen(str), PG_UTF8, true); + + result = pg_parse_json(lex, &sem_action); + freeJsonLexContext(lex); + + if (result == JSON_SUCCESS) + bytes = build_mvdependencies(&parse_state, str); + + list_free_deep(parse_state.dependency_list); + list_free(parse_state.attnum_list); + + if (bytes) + PG_RETURN_BYTEA_P(bytes); + + /* + * If escontext already set, just use that. 
Anything else is a generic
+	 * JSON parse error.
+	 */
+	if (!SOFT_ERROR_OCCURRED(parse_state.escontext))
+		errsave(parse_state.escontext,
+				errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				errmsg("malformed pg_dependencies: \"%s\"", str),
+				errdetail("Input data must be valid JSON."));
+
+	PG_RETURN_NULL();
+}
+
+
+/*
+ * pg_dependencies_out - output routine for type pg_dependencies.
+ */
+Datum
+pg_dependencies_out(PG_FUNCTION_ARGS)
+{
+	bytea	   *data = PG_GETARG_BYTEA_PP(0);
+	MVDependencies *dependencies = statext_dependencies_deserialize(data);
+	StringInfoData str;
+
+	initStringInfo(&str);
+	appendStringInfoChar(&str, '[');
+
+	for (int i = 0; i < dependencies->ndeps; i++)
+	{
+		MVDependency *dependency = dependencies->deps[i];
+
+		if (i > 0)
+			appendStringInfoString(&str, ", ");
+
+		/* every dependency needs at least one determinant plus the dependent */
+		if (dependency->nattributes <= 1)
+			elog(ERROR, "invalid nattributes %d in MVDependencies",
+				 dependency->nattributes);
+
+		appendStringInfo(&str, "{\"" PG_DEPENDENCIES_KEY_ATTRIBUTES "\": [%d",
+						 dependency->attributes[0]);
+
+		for (int j = 1; j < dependency->nattributes - 1; j++)
+			appendStringInfo(&str, ", %d", dependency->attributes[j]);
+
+		appendStringInfo(&str, "], \"" PG_DEPENDENCIES_KEY_DEPENDENCY "\": %d, "
+						 "\"" PG_DEPENDENCIES_KEY_DEGREE "\": %f}",
+						 dependency->attributes[dependency->nattributes - 1],
+						 dependency->degree);
+	}
+
+	appendStringInfoChar(&str, ']');
+
+	PG_RETURN_CSTRING(str.data);
+}
+
+/*
+ * pg_dependencies_recv - binary input routine for type pg_dependencies.
+ */
+Datum
+pg_dependencies_recv(PG_FUNCTION_ARGS)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("cannot accept a value of type %s", "pg_dependencies")));
+
+	PG_RETURN_VOID();			/* keep compiler quiet */
+}
+
+/*
+ * pg_dependencies_send - binary output routine for type pg_dependencies.
+ *
+ * Functional dependencies are serialized in a bytea value (although the type
+ * is named differently), so let's just send that.
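+ *
+ * In other words, the binary wire format of pg_dependencies is exactly the
+ * bytea wire format: the raw serialized bytes, with the length carried by
+ * the protocol.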
+ */ +Datum +pg_dependencies_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index f5e31c433a0de..ee08ac045b7b2 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -41,11 +41,11 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" -#include "utils/formatting.h" #include "utils/guc_hooks.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_locale.h" +#include "utils/pg_locale_c.h" #include "utils/relcache.h" #include "utils/syscache.h" @@ -80,31 +80,6 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); extern char *get_collation_actual_version_libc(const char *collcollate); -extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - /* GUC settings */ char *locale_messages; char *locale_monetary; @@ -125,15 +100,18 @@ char *localized_full_days[7 + 1]; char *localized_abbrev_months[12 + 1]; char *localized_full_months[12 + 1]; -/* is the databases's LC_CTYPE the C locale? 
*/ -bool database_ctype_is_c = false; - static pg_locale_t default_locale = NULL; /* indicates whether locale information cache is valid */ static bool CurrentLocaleConvValid = false; static bool CurrentLCTimeValid = false; +static struct pg_locale_struct c_locale = { + .deterministic = true, + .collate_is_c = true, + .ctype_is_c = true, +}; + /* Cache for collation-related knowledge */ typedef struct @@ -975,7 +953,7 @@ get_iso_localename(const char *winlocname) wchar_t wc_locale_name[LOCALE_NAME_MAX_LENGTH]; wchar_t buffer[LOCALE_NAME_MAX_LENGTH]; static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH]; - char *period; + const char *period; int len; int ret_val; @@ -1093,6 +1071,9 @@ create_pg_locale(Oid collid, MemoryContext context) Assert((result->collate_is_c && result->collate == NULL) || (!result->collate_is_c && result->collate != NULL)); + Assert((result->ctype_is_c && result->ctype == NULL) || + (!result->ctype_is_c && result->ctype != NULL)); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, &isnull); if (!isnull) @@ -1172,11 +1153,27 @@ init_database_collation(void) PGLOCALE_SUPPORT_ERROR(dbform->datlocprovider); result->is_default = true; + + Assert((result->collate_is_c && result->collate == NULL) || + (!result->collate_is_c && result->collate != NULL)); + + Assert((result->ctype_is_c && result->ctype == NULL) || + (!result->ctype_is_c && result->ctype != NULL)); + ReleaseSysCache(tup); default_locale = result; } +/* + * Get database default locale. + */ +pg_locale_t +pg_database_locale(void) +{ + return pg_newlocale_from_collation(DEFAULT_COLLATION_OID); +} + /* * Create a pg_locale_t from a collation OID. Results are cached for the * lifetime of the backend. Thus, do not free the result with freelocale(). @@ -1194,6 +1191,13 @@ pg_newlocale_from_collation(Oid collid) if (collid == DEFAULT_COLLATION_OID) return default_locale; + /* + * Some callers expect C_COLLATION_OID to succeed even without catalog + * access. + */ + if (collid == C_COLLATION_OID) + return &c_locale; + if (!OidIsValid(collid)) elog(ERROR, "cache lookup failed for collation %u", collid); @@ -1218,10 +1222,10 @@ pg_newlocale_from_collation(Oid collid) * Make sure cache entry is marked invalid, in case we fail before * setting things. */ - cache_entry->locale = 0; + cache_entry->locale = NULL; } - if (cache_entry->locale == 0) + if (cache_entry->locale == NULL) { cache_entry->locale = create_pg_locale(collid, CollationCacheContext); } @@ -1253,81 +1257,119 @@ get_collation_actual_version(char collprovider, const char *collcollate) return collversion; } +/* lowercasing/casefolding in C locale */ +static size_t +strlower_c(char *dst, size_t dstsize, const char *src, ssize_t srclen) +{ + int i; + + srclen = (srclen >= 0) ? srclen : strlen(src); + for (i = 0; i < srclen && i < dstsize; i++) + dst[i] = pg_ascii_tolower(src[i]); + if (i < dstsize) + dst[i] = '\0'; + return srclen; +} + +/* titlecasing in C locale */ +static size_t +strtitle_c(char *dst, size_t dstsize, const char *src, ssize_t srclen) +{ + bool wasalnum = false; + int i; + + srclen = (srclen >= 0) ? 
srclen : strlen(src); + for (i = 0; i < srclen && i < dstsize; i++) + { + char c = src[i]; + + if (wasalnum) + dst[i] = pg_ascii_tolower(c); + else + dst[i] = pg_ascii_toupper(c); + + wasalnum = ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z')); + } + if (i < dstsize) + dst[i] = '\0'; + return srclen; +} + +/* uppercasing in C locale */ +static size_t +strupper_c(char *dst, size_t dstsize, const char *src, ssize_t srclen) +{ + int i; + + srclen = (srclen >= 0) ? srclen : strlen(src); + for (i = 0; i < srclen && i < dstsize; i++) + dst[i] = pg_ascii_toupper(src[i]); + if (i < dstsize) + dst[i] = '\0'; + return srclen; +} + size_t pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strlower_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strlower_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strlower_libc(dst, dstsize, src, srclen, locale); + if (locale->ctype == NULL) + return strlower_c(dst, dstsize, src, srclen); else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strlower(dst, dstsize, src, srclen, locale); } size_t pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strtitle_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strtitle_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strtitle_libc(dst, dstsize, src, srclen, locale); + if (locale->ctype == NULL) + return strtitle_c(dst, dstsize, src, srclen); else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strtitle(dst, dstsize, src, srclen, locale); } size_t pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strupper_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strupper_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strupper_libc(dst, dstsize, src, srclen, locale); + if (locale->ctype == NULL) + return strupper_c(dst, dstsize, src, srclen); else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strupper(dst, dstsize, src, srclen, locale); } size_t pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strfold_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strfold_icu(dst, dstsize, src, srclen, locale); -#endif - /* for libc, just use strlower */ - else if (locale->provider == COLLPROVIDER_LIBC) - return strlower_libc(dst, dstsize, src, srclen, locale); + /* in the C locale, casefolding is the same as lowercasing */ + if (locale->ctype == NULL) + return strlower_c(dst, dstsize, src, srclen); else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); + return locale->ctype->strfold(dst, dstsize, src, 
srclen, locale); +} - return 0; /* keep compiler quiet */ +/* + * Lowercase an identifier using the database default locale. + * + * For historical reasons, does not use ordinary locale behavior. Should only + * be used for identifiers. XXX: can we make this equivalent to + * pg_strfold(..., default_locale)? + */ +size_t +pg_downcase_ident(char *dst, size_t dstsize, const char *src, ssize_t srclen) +{ + pg_locale_t locale = default_locale; + + if (locale == NULL || locale->ctype == NULL || + locale->ctype->downcase_ident == NULL) + return strlower_c(dst, dstsize, src, srclen); + else + return locale->ctype->downcase_ident(dst, dstsize, src, srclen, + locale); } /* @@ -1464,6 +1506,145 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, return locale->collate->strnxfrm_prefix(dest, destsize, src, srclen, locale); } +bool +pg_iswdigit(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISDIGIT)); + else + return locale->ctype->wc_isdigit(wc, locale); +} + +bool +pg_iswalpha(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISALPHA)); + else + return locale->ctype->wc_isalpha(wc, locale); +} + +bool +pg_iswalnum(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISALNUM)); + else + return locale->ctype->wc_isalnum(wc, locale); +} + +bool +pg_iswupper(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISUPPER)); + else + return locale->ctype->wc_isupper(wc, locale); +} + +bool +pg_iswlower(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISLOWER)); + else + return locale->ctype->wc_islower(wc, locale); +} + +bool +pg_iswgraph(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISGRAPH)); + else + return locale->ctype->wc_isgraph(wc, locale); +} + +bool +pg_iswprint(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISPRINT)); + else + return locale->ctype->wc_isprint(wc, locale); +} + +bool +pg_iswpunct(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISPUNCT)); + else + return locale->ctype->wc_ispunct(wc, locale); +} + +bool +pg_iswspace(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISSPACE)); + else + return locale->ctype->wc_isspace(wc, locale); +} + +bool +pg_iswxdigit(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + ((pg_char_properties[wc] & PG_ISDIGIT) || + ((wc >= 'A' && wc <= 'F') || + (wc >= 'a' && wc <= 'f')))); + else + return locale->ctype->wc_isxdigit(wc, locale); +} + +bool +pg_iswcased(pg_wchar wc, pg_locale_t locale) +{ + /* for the C locale, Cased and Alpha are equivalent */ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISALPHA)); + else + return locale->ctype->wc_iscased(wc, locale); +} + +pg_wchar +pg_towupper(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + { + if (wc <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) wc); + 
return wc; + } + else + return locale->ctype->wc_toupper(wc, locale); +} + +pg_wchar +pg_towlower(pg_wchar wc, pg_locale_t locale) +{ + if (locale->ctype == NULL) + { + if (wc <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) wc); + return wc; + } + else + return locale->ctype->wc_tolower(wc, locale); +} + /* * Return required encoding ID for the given locale, or -1 if any encoding is * valid for the locale. diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index f51768830cd7b..145b4641b1b82 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -15,25 +15,14 @@ #include "catalog/pg_collation.h" #include "common/unicode_case.h" #include "common/unicode_category.h" -#include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" -#include "utils/memutils.h" #include "utils/pg_locale.h" #include "utils/syscache.h" extern pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context); extern char *get_collation_actual_version_builtin(const char *collcollate); -extern size_t strlower_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); - struct WordBoundaryState { @@ -45,6 +34,23 @@ struct WordBoundaryState bool prev_alnum; }; +/* + * In UTF-8, pg_wchar is guaranteed to be the code point value. + */ +static inline char32_t +to_char32(pg_wchar wc) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + return (char32_t) wc; +} + +static inline pg_wchar +to_pg_wchar(char32_t c32) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + return (pg_wchar) c32; +} + /* * Simple word boundary iterator that draws boundaries each time the result of * pg_u_isalnum() changes. 
@@ -57,7 +63,7 @@ initcap_wbnext(void *state) while (wbstate->offset < wbstate->len && wbstate->str[wbstate->offset] != '\0') { - pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + + char32_t u = utf8_to_unicode((unsigned char *) wbstate->str + wbstate->offset); bool curr_alnum = pg_u_isalnum(u, wbstate->posix); @@ -77,15 +83,15 @@ initcap_wbnext(void *state) return wbstate->len; } -size_t +static size_t strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { return unicode_strlower(dest, destsize, src, srclen, - locale->info.builtin.casemap_full); + locale->builtin.casemap_full); } -size_t +static size_t strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -93,32 +99,132 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, .str = src, .len = srclen, .offset = 0, - .posix = !locale->info.builtin.casemap_full, + .posix = !locale->builtin.casemap_full, .init = false, .prev_alnum = false, }; return unicode_strtitle(dest, destsize, src, srclen, - locale->info.builtin.casemap_full, + locale->builtin.casemap_full, initcap_wbnext, &wbstate); } -size_t +static size_t strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { return unicode_strupper(dest, destsize, src, srclen, - locale->info.builtin.casemap_full); + locale->builtin.casemap_full); } -size_t +static size_t strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { return unicode_strfold(dest, destsize, src, srclen, - locale->info.builtin.casemap_full); + locale->builtin.casemap_full); +} + +static bool +wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full); +} + +static bool +wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalpha(to_char32(wc)); +} + +static bool +wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full); +} + +static bool +wc_isupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isupper(to_char32(wc)); +} + +static bool +wc_islower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_islower(to_char32(wc)); +} + +static bool +wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isgraph(to_char32(wc)); +} + +static bool +wc_isprint_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isprint(to_char32(wc)); } +static bool +wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full); +} + +static bool +wc_isspace_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isspace(to_char32(wc)); +} + +static bool +wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full); +} + +static bool +wc_iscased_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_prop_cased(to_char32(wc)); +} + +static pg_wchar +wc_toupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return to_pg_wchar(unicode_uppercase_simple(to_char32(wc))); +} + +static pg_wchar +wc_tolower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return to_pg_wchar(unicode_lowercase_simple(to_char32(wc))); +} + +static const struct ctype_methods ctype_methods_builtin = { + .strlower = strlower_builtin, + .strtitle = strtitle_builtin, + .strupper = strupper_builtin, + .strfold = strfold_builtin, + /* uses plain ASCII semantics for 
historical reasons */ + .downcase_ident = NULL, + .wc_isdigit = wc_isdigit_builtin, + .wc_isalpha = wc_isalpha_builtin, + .wc_isalnum = wc_isalnum_builtin, + .wc_isupper = wc_isupper_builtin, + .wc_islower = wc_islower_builtin, + .wc_isgraph = wc_isgraph_builtin, + .wc_isprint = wc_isprint_builtin, + .wc_ispunct = wc_ispunct_builtin, + .wc_isspace = wc_isspace_builtin, + .wc_isxdigit = wc_isxdigit_builtin, + .wc_iscased = wc_iscased_builtin, + .wc_tolower = wc_tolower_builtin, + .wc_toupper = wc_toupper_builtin, +}; + pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { @@ -156,12 +262,13 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->info.builtin.locale = MemoryContextStrdup(context, locstr); - result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); - result->provider = COLLPROVIDER_BUILTIN; + result->builtin.locale = MemoryContextStrdup(context, locstr); + result->builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); result->deterministic = true; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); + if (!result->ctype_is_c) + result->ctype = &ctype_methods_builtin; return result; } diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index a32c32a0744bd..43d44fe43bdc4 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -48,19 +48,24 @@ #define TEXTBUFLEN 1024 extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); -extern size_t strlower_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); #ifdef USE_ICU extern UCollator *pg_ucol_open(const char *loc_str); +static size_t strlower_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strtitle_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strupper_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strfold_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static int strncoll_icu(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); static size_t strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -118,6 +123,23 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, const char *locale, UErrorCode *pErrorCode); +/* + * XXX: many of the functions below rely on casts directly from pg_wchar to + * UChar32, which is correct for UTF-8 and LATIN1, but not in general. 
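+ *
+ * For example, in LATIN1 the pg_wchar value 0xE9 is code point U+00E9
+ * (e with acute), so the cast is exact; in EUC_JP a pg_wchar holds an
+ * encoding-specific value that is not a code point, so these functions
+ * could misclassify non-ASCII characters there.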
+ */ + +static pg_wchar +toupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_toupper(wc); +} + +static pg_wchar +tolower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_tolower(wc); +} + static const struct collate_methods collate_methods_icu = { .strncoll = strncoll_icu, .strnxfrm = strnxfrm_icu, @@ -136,6 +158,115 @@ static const struct collate_methods collate_methods_icu_utf8 = { .strxfrm_is_safe = true, }; +static bool +wc_isdigit_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isdigit(wc); +} + +static bool +wc_isalpha_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalpha(wc); +} + +static bool +wc_isalnum_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalnum(wc); +} + +static bool +wc_isupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isupper(wc); +} + +static bool +wc_islower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_islower(wc); +} + +static bool +wc_isgraph_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isgraph(wc); +} + +static bool +wc_isprint_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isprint(wc); +} + +static bool +wc_ispunct_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_ispunct(wc); +} + +static bool +wc_isspace_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isspace(wc); +} + +static bool +wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isxdigit(wc); +} + +static bool +wc_iscased_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_hasBinaryProperty(wc, UCHAR_CASED); +} + +static const struct ctype_methods ctype_methods_icu = { + .strlower = strlower_icu, + .strtitle = strtitle_icu, + .strupper = strupper_icu, + .strfold = strfold_icu, + .downcase_ident = downcase_ident_icu, + .wc_isdigit = wc_isdigit_icu, + .wc_isalpha = wc_isalpha_icu, + .wc_isalnum = wc_isalnum_icu, + .wc_isupper = wc_isupper_icu, + .wc_islower = wc_islower_icu, + .wc_isgraph = wc_isgraph_icu, + .wc_isprint = wc_isprint_icu, + .wc_ispunct = wc_ispunct_icu, + .wc_isspace = wc_isspace_icu, + .wc_isxdigit = wc_isxdigit_icu, + .wc_iscased = wc_iscased_icu, + .wc_toupper = toupper_icu, + .wc_tolower = tolower_icu, +}; + +/* + * ICU still depends on libc for compatibility with certain historical + * behavior for single-byte encodings. See downcase_ident_icu(). + * + * XXX: consider fixing by decoding the single byte into a code point, and + * using u_tolower(). 
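+ *
+ * A rough sketch of that idea (hypothetical; decode_to_codepoint stands in
+ * for a conversion from the database encoding to Unicode):
+ *
+ *     char32_t cp = decode_to_codepoint(ch);
+ *     char32_t lower = u_tolower(cp);
+ *
+ * which would remove the libc dependency created below.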
+ */ +static locale_t +make_libc_ctype_locale(const char *ctype) +{ + locale_t loc; + +#ifndef WIN32 + loc = newlocale(LC_CTYPE_MASK, ctype, NULL); +#else + loc = _create_locale(LC_ALL, ctype); +#endif + if (!loc) + report_newlocale_failure(ctype); + + return loc; +} #endif pg_locale_t @@ -146,6 +277,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) const char *iculocstr; const char *icurules = NULL; UCollator *collator; + locale_t loc = (locale_t) 0; pg_locale_t result; if (collid == DEFAULT_COLLATION_OID) @@ -168,6 +300,18 @@ create_pg_locale_icu(Oid collid, MemoryContext context) if (!isnull) icurules = TextDatumGetCString(datum); + /* libc only needed for default locale and single-byte encoding */ + if (pg_database_encoding_max_length() == 1) + { + const char *ctype; + + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, + Anum_pg_database_datctype); + ctype = TextDatumGetCString(datum); + + loc = make_libc_ctype_locale(ctype); + } + ReleaseSysCache(tp); } else @@ -196,9 +340,9 @@ create_pg_locale_icu(Oid collid, MemoryContext context) collator = make_icu_collator(iculocstr, icurules); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->info.icu.locale = MemoryContextStrdup(context, iculocstr); - result->info.icu.ucol = collator; - result->provider = COLLPROVIDER_ICU; + result->icu.locale = MemoryContextStrdup(context, iculocstr); + result->icu.ucol = collator; + result->icu.lt = loc; result->deterministic = deterministic; result->collate_is_c = false; result->ctype_is_c = false; @@ -206,6 +350,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result->collate = &collate_methods_icu_utf8; else result->collate = &collate_methods_icu; + result->ctype = &ctype_methods_icu; return result; #else @@ -379,7 +524,7 @@ make_icu_collator(const char *iculocstr, const char *icurules) } } -size_t +static size_t strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -399,7 +544,7 @@ strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -419,7 +564,7 @@ strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -439,7 +584,7 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -459,6 +604,39 @@ strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } +/* + * For historical compatibility, behavior is not multibyte-aware. + * + * NB: uses libc tolower() for single-byte encodings (also for historical + * compatibility), and therefore relies on the global LC_CTYPE setting. 
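+ *
+ * For example, in a LATIN1 database 'A'..'Z' always become 'a'..'z', while
+ * a high-bit byte such as 0xC4 ('A' with diaeresis in LATIN1) is lowercased
+ * only if isupper_l() classifies it as uppercase.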
+ */ +static size_t +downcase_ident_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale) +{ + int i; + bool libc_lower; + locale_t lt = locale->icu.lt; + + libc_lower = lt && (pg_database_encoding_max_length() == 1); + + for (i = 0; i < srclen && i < dstsize; i++) + { + unsigned char ch = (unsigned char) src[i]; + + if (ch >= 'A' && ch <= 'Z') + ch = pg_ascii_tolower(ch); + else if (libc_lower && IS_HIGHBIT_SET(ch) && isupper_l(ch, lt)) + ch = tolower_l(ch, lt); + dst[i] = (char) ch; + } + + if (i < dstsize) + dst[i] = '\0'; + + return srclen; +} + /* * strncoll_icu_utf8 * @@ -474,12 +652,10 @@ strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2 int result; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); status = U_ZERO_ERROR; - result = ucol_strcollUTF8(locale->info.icu.ucol, + result = ucol_strcollUTF8(locale->icu.ucol, arg1, len1, arg2, len2, &status); @@ -503,8 +679,6 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - init_icu_converter(); ulen = uchar_length(icu_converter, src, srclen); @@ -518,7 +692,7 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); - result_bsize = ucol_getSortKey(locale->info.icu.ucol, + result_bsize = ucol_getSortKey(locale->icu.ucol, uchar, ulen, (uint8_t *) dest, destsize); @@ -549,14 +723,12 @@ strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, uint32_t state[2]; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); uiter_setUTF8(&iter, src, srclen); state[0] = state[1] = 0; /* won't need that again */ status = U_ZERO_ERROR; - result = ucol_nextSortKeyPart(locale->info.icu.ucol, + result = ucol_nextSortKeyPart(locale->icu.ucol, &iter, state, (uint8_t *) dest, @@ -667,7 +839,7 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, *buff_dest = palloc(len_dest * sizeof(**buff_dest)); status = U_ZERO_ERROR; len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); + mylocale->icu.locale, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { /* try again with adjusted length */ @@ -675,7 +847,7 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, *buff_dest = palloc(len_dest * sizeof(**buff_dest)); status = U_ZERO_ERROR; len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); + mylocale->icu.locale, &status); } if (U_FAILURE(status)) ereport(ERROR, @@ -749,8 +921,6 @@ strncoll_icu(const char *arg1, ssize_t len1, *uchar2; int result; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */ #ifdef HAVE_UCOL_STRCOLLUTF8 Assert(GetDatabaseEncoding() != PG_UTF8); @@ -773,7 +943,7 @@ strncoll_icu(const char *arg1, ssize_t len1, ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1); ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2); - result = ucol_strcoll(locale->info.icu.ucol, + result = ucol_strcoll(locale->icu.ucol, uchar1, ulen1, uchar2, ulen2); @@ -799,8 +969,6 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */ 
 	Assert(GetDatabaseEncoding() != PG_UTF8);
@@ -820,7 +988,7 @@ strnxfrm_prefix_icu(char *dest, size_t destsize,
 	uiter_setString(&iter, uchar, ulen);
 	state[0] = state[1] = 0;	/* won't need that again */
 	status = U_ZERO_ERROR;
-	result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
+	result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
 										&iter,
 										state,
 										(uint8_t *) dest,
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 199857e22dbec..ab6117aaace57 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -33,6 +33,45 @@
 #include
 #endif
+/*
+ * For the libc provider, to provide as much functionality as possible on a
+ * variety of platforms without going so far as to implement everything from
+ * scratch, we use several implementation strategies depending on the
+ * situation:
+ *
+ * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
+ * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
+ * collations don't give a fig about multibyte characters.
+ *
+ * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
+ * This assumes that every platform uses Unicode codepoints directly
+ * as the wchar_t representation of Unicode. On some platforms
+ * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
+ *
+ * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
+ * values up to 255, and punt for values above that. This is 100% correct
+ * only in single-byte encodings such as LATINn. However, non-Unicode
+ * multibyte encodings are mostly Far Eastern character sets for which the
+ * properties being tested here aren't very relevant for higher code values
+ * anyway. The difficulty with using the <wctype.h> functions with
+ * non-Unicode multibyte encodings is that we can have no certainty that
+ * the platform's wchar_t representation matches what we do in pg_wchar
+ * conversions.
+ *
+ * As a special case, in the "default" collation, (2) and (3) force ASCII
+ * letters to follow ASCII upcase/downcase rules, while in a non-default
+ * collation we just let the library functions do what they will. The case
+ * where this matters is treatment of I/i in Turkish, and the behavior is
+ * meant to match the upper()/lower() SQL functions.
+ *
+ * We store the active collation setting in static variables. In principle
+ * it could be passed down to here via the regex library's "struct vars" data
+ * structure; but that would require somewhat invasive changes in the regex
+ * library, and right now there's no real benefit to be gained from that.
+ *
+ * NB: the coding here assumes pg_wchar is an unsigned type.
+ */
+
 /*
  * Size of stack buffer to use for string transformations, used to avoid heap
  * allocations in typical cases.
This should be large enough that most strings @@ -43,13 +82,6 @@ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -66,6 +98,9 @@ static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, pg_locale_t locale); #endif +static size_t char2wchar(wchar_t *to, size_t tolen, const char *from, + size_t fromlen, locale_t loc); + static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -85,6 +120,304 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +static bool +wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isdigit_l((unsigned char) wc, locale->lt); +} + +static bool +wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalpha_l((unsigned char) wc, locale->lt); +} + +static bool +wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalnum_l((unsigned char) wc, locale->lt); +} + +static bool +wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isupper_l((unsigned char) wc, locale->lt); +} + +static bool +wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return islower_l((unsigned char) wc, locale->lt); +} + +static bool +wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isgraph_l((unsigned char) wc, locale->lt); +} + +static bool +wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isprint_l((unsigned char) wc, locale->lt); +} + +static bool +wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return ispunct_l((unsigned char) wc, locale->lt); +} + +static bool +wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isspace_l((unsigned char) wc, locale->lt); +} + +static bool +wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale) +{ +#ifndef WIN32 + return isxdigit_l((unsigned char) wc, locale->lt); +#else + return _isxdigit_l((unsigned char) wc, locale->lt); +#endif +} + +static bool +wc_iscased_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isupper_l((unsigned char) wc, locale->lt) || + islower_l((unsigned char) wc, locale->lt); +} + +static bool +wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswdigit_l((wint_t) wc, locale->lt); +} + +static bool +wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalpha_l((wint_t) wc, locale->lt); +} + +static bool +wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalnum_l((wint_t) wc, locale->lt); +} + +static bool +wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswupper_l((wint_t) wc, locale->lt); +} + +static bool +wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswlower_l((wint_t) wc, locale->lt); +} + +static bool +wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswgraph_l((wint_t) wc, locale->lt); +} + +static bool +wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswprint_l((wint_t) wc, locale->lt); +} + +static bool +wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswpunct_l((wint_t) wc, locale->lt); +} + +static bool 
+wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswspace_l((wint_t) wc, locale->lt); +} + +static bool +wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale) +{ +#ifndef WIN32 + return iswxdigit_l((wint_t) wc, locale->lt); +#else + return _iswxdigit_l((wint_t) wc, locale->lt); +#endif +} + +static bool +wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswupper_l((wint_t) wc, locale->lt) || + iswlower_l((wint_t) wc, locale->lt); +} + +static pg_wchar +toupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) wc); + if (wc <= (pg_wchar) UCHAR_MAX) + return toupper_l((unsigned char) wc, locale->lt); + else + return wc; +} + +static pg_wchar +toupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) wc); + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towupper_l((wint_t) wc, locale->lt); + else + return wc; +} + +static pg_wchar +tolower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) wc); + if (wc <= (pg_wchar) UCHAR_MAX) + return tolower_l((unsigned char) wc, locale->lt); + else + return wc; +} + +static pg_wchar +tolower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) wc); + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towlower_l((wint_t) wc, locale->lt); + else + return wc; +} + +/* + * Characters A..Z always downcase to a..z, even in the Turkish + * locale. Characters beyond 127 use tolower(). 
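+ *
+ * For example, under a Turkish locale 'I' still becomes ASCII 'i' rather
+ * than a dotless i, matching the historical treatment of SQL identifiers.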
+ */ +static size_t +downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale) +{ + locale_t loc = locale->lt; + int i; + + for (i = 0; i < srclen && i < dstsize; i++) + { + unsigned char ch = (unsigned char) src[i]; + + if (ch >= 'A' && ch <= 'Z') + ch = pg_ascii_tolower(ch); + else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc)) + ch = tolower_l(ch, loc); + dst[i] = (char) ch; + } + + if (i < dstsize) + dst[i] = '\0'; + + return srclen; +} + +static const struct ctype_methods ctype_methods_libc_sb = { + .strlower = strlower_libc_sb, + .strtitle = strtitle_libc_sb, + .strupper = strupper_libc_sb, + /* in libc, casefolding is the same as lowercasing */ + .strfold = strlower_libc_sb, + .downcase_ident = downcase_ident_libc_sb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .wc_isxdigit = wc_isxdigit_libc_sb, + .wc_iscased = wc_iscased_libc_sb, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, +}; + +/* + * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but + * single-byte semantics for pattern matching. + */ +static const struct ctype_methods ctype_methods_libc_other_mb = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + /* in libc, casefolding is the same as lowercasing */ + .strfold = strlower_libc_mb, + /* uses plain ASCII semantics for historical reasons */ + .downcase_ident = NULL, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .wc_isxdigit = wc_isxdigit_libc_sb, + .wc_iscased = wc_iscased_libc_sb, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, +}; + +static const struct ctype_methods ctype_methods_libc_utf8 = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + /* in libc, casefolding is the same as lowercasing */ + .strfold = strlower_libc_mb, + /* uses plain ASCII semantics for historical reasons */ + .downcase_ident = NULL, + .wc_isdigit = wc_isdigit_libc_mb, + .wc_isalpha = wc_isalpha_libc_mb, + .wc_isalnum = wc_isalnum_libc_mb, + .wc_isupper = wc_isupper_libc_mb, + .wc_islower = wc_islower_libc_mb, + .wc_isgraph = wc_isgraph_libc_mb, + .wc_isprint = wc_isprint_libc_mb, + .wc_ispunct = wc_ispunct_libc_mb, + .wc_isspace = wc_isspace_libc_mb, + .wc_isxdigit = wc_isxdigit_libc_mb, + .wc_iscased = wc_iscased_libc_mb, + .wc_toupper = toupper_libc_mb, + .wc_tolower = tolower_libc_mb, +}; + static const struct collate_methods collate_methods_libc = { .strncoll = strncoll_libc, .strnxfrm = strnxfrm_libc, @@ -119,36 +452,6 @@ static const struct collate_methods collate_methods_libc_win32_utf8 = { }; #endif -size_t -strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strlower_libc_mb(dst, dstsize, src, srclen, locale); - else - return strlower_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strtitle_libc(char *dst, size_t dstsize, const 
char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strtitle_libc_mb(dst, dstsize, src, srclen, locale); - else - return strtitle_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strupper_libc_mb(dst, dstsize, src, srclen, locale); - else - return strupper_libc_sb(dst, dstsize, src, srclen, locale); -} - static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) @@ -158,12 +461,9 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (srclen + 1 <= destsize) { - locale_t loc = locale->info.lt; + locale_t loc = locale->lt; char *p; - if (srclen + 1 > destsize) - return srclen; - memcpy(dest, src, srclen); dest[srclen] = '\0'; @@ -177,7 +477,12 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, for (p = dest; *p; p++) { if (locale->is_default) - *p = pg_tolower((unsigned char) *p); + { + if (*p >= 'A' && *p <= 'Z') + *p += 'a' - 'A'; + else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc)) + *p = tolower_l((unsigned char) *p, loc); + } else *p = tolower_l((unsigned char) *p, loc); } @@ -190,7 +495,7 @@ static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - locale_t loc = locale->info.lt; + locale_t loc = locale->lt; size_t result_size; wchar_t *workspace; char *result; @@ -207,9 +512,9 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, errmsg("out of memory"))); /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t)); + workspace = palloc_array(wchar_t, srclen + 1); - char2wchar(workspace, srclen + 1, src, srclen, locale); + char2wchar(workspace, srclen + 1, src, srclen, loc); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) workspace[curr_char] = towlower_l(workspace[curr_char], loc); @@ -220,7 +525,7 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, max_size = curr_char * pg_database_encoding_max_length(); result = palloc(max_size + 1); - result_size = wchar2char(result, workspace, max_size + 1, locale); + result_size = wchar2char(result, workspace, max_size + 1, loc); if (result_size + 1 > destsize) return result_size; @@ -243,7 +548,7 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (srclen + 1 <= destsize) { - locale_t loc = locale->info.lt; + locale_t loc = locale->lt; int wasalnum = false; char *p; @@ -262,9 +567,19 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (locale->is_default) { if (wasalnum) - *p = pg_tolower((unsigned char) *p); + { + if (*p >= 'A' && *p <= 'Z') + *p += 'a' - 'A'; + else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc)) + *p = tolower_l((unsigned char) *p, loc); + } else - *p = pg_toupper((unsigned char) *p); + { + if (*p >= 'a' && *p <= 'z') + *p -= 'a' - 'A'; + else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc)) + *p = toupper_l((unsigned char) *p, loc); + } } else { @@ -284,7 +599,7 @@ static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - locale_t loc = locale->info.lt; + locale_t loc = locale->lt; int wasalnum = false; size_t result_size; wchar_t *workspace; @@ -302,9 +617,9 @@ strtitle_libc_mb(char 
*dest, size_t destsize, const char *src, ssize_t srclen, errmsg("out of memory"))); /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t)); + workspace = palloc_array(wchar_t, srclen + 1); - char2wchar(workspace, srclen + 1, src, srclen, locale); + char2wchar(workspace, srclen + 1, src, srclen, loc); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) { @@ -321,7 +636,7 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, max_size = curr_char * pg_database_encoding_max_length(); result = palloc(max_size + 1); - result_size = wchar2char(result, workspace, max_size + 1, locale); + result_size = wchar2char(result, workspace, max_size + 1, loc); if (result_size + 1 > destsize) return result_size; @@ -344,7 +659,7 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (srclen + 1 <= destsize) { - locale_t loc = locale->info.lt; + locale_t loc = locale->lt; char *p; memcpy(dest, src, srclen); @@ -360,7 +675,12 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, for (p = dest; *p; p++) { if (locale->is_default) - *p = pg_toupper((unsigned char) *p); + { + if (*p >= 'a' && *p <= 'z') + *p -= 'a' - 'A'; + else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc)) + *p = toupper_l((unsigned char) *p, loc); + } else *p = toupper_l((unsigned char) *p, loc); } @@ -373,7 +693,7 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - locale_t loc = locale->info.lt; + locale_t loc = locale->lt; size_t result_size; wchar_t *workspace; char *result; @@ -390,9 +710,9 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, errmsg("out of memory"))); /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t)); + workspace = palloc_array(wchar_t, srclen + 1); - char2wchar(workspace, srclen + 1, src, srclen, locale); + char2wchar(workspace, srclen + 1, src, srclen, loc); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) workspace[curr_char] = towupper_l(workspace[curr_char], loc); @@ -403,7 +723,7 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, max_size = curr_char * pg_database_encoding_max_length(); result = palloc(max_size + 1); - result_size = wchar2char(result, workspace, max_size + 1, locale); + result_size = wchar2char(result, workspace, max_size + 1, loc); if (result_size + 1 > destsize) return result_size; @@ -465,13 +785,12 @@ create_pg_locale_libc(Oid collid, MemoryContext context) loc = make_libc_collator(collate, ctype); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->provider = COLLPROVIDER_LIBC; result->deterministic = true; result->collate_is_c = (strcmp(collate, "C") == 0) || (strcmp(collate, "POSIX") == 0); result->ctype_is_c = (strcmp(ctype, "C") == 0) || (strcmp(ctype, "POSIX") == 0); - result->info.lt = loc; + result->lt = loc; if (!result->collate_is_c) { #ifdef WIN32 @@ -481,6 +800,15 @@ create_pg_locale_libc(Oid collid, MemoryContext context) #endif result->collate = &collate_methods_libc; } + if (!result->ctype_is_c) + { + if (GetDatabaseEncoding() == PG_UTF8) + result->ctype = &ctype_methods_libc_utf8; + else if (pg_database_encoding_max_length() > 1) + result->ctype = &ctype_methods_libc_other_mb; + else + result->ctype = &ctype_methods_libc_sb; + } return result; } @@ -576,8 +904,6 @@ 
strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, const char *arg2n; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); @@ -608,7 +934,7 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, arg2n = buf2; } - result = strcoll_l(arg1n, arg2n, locale->info.lt); + result = strcoll_l(arg1n, arg2n, locale->lt); if (buf != sbuf) pfree(buf); @@ -632,10 +958,8 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t bufsize = srclen + 1; size_t result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (srclen == -1) - return strxfrm_l(dest, src, destsize, locale->info.lt); + return strxfrm_l(dest, src, destsize, locale->lt); if (bufsize > TEXTBUFLEN) buf = palloc(bufsize); @@ -644,7 +968,7 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, memcpy(buf, src, srclen); buf[srclen] = '\0'; - result = strxfrm_l(dest, buf, destsize, locale->info.lt); + result = strxfrm_l(dest, buf, destsize, locale->lt); if (buf != sbuf) pfree(buf); @@ -742,7 +1066,6 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, int r; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); Assert(GetDatabaseEncoding() == PG_UTF8); if (len1 == -1) @@ -787,7 +1110,7 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, ((LPWSTR) a2p)[r] = 0; errno = 0; - result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt); + result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->lt); if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */ ereport(ERROR, (errmsg("could not compare Unicode strings: %m"))); @@ -867,7 +1190,7 @@ wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc) #endif /* - * These functions convert from/to libc's wchar_t, *not* pg_wchar_t. + * These functions convert from/to libc's wchar_t, *not* pg_wchar. * Therefore we keep them here rather than with the mbutils code. */ @@ -879,7 +1202,7 @@ wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc) * zero-terminated. The output will be zero-terminated iff there is room. */ size_t -wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) +wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc) { size_t result; @@ -909,7 +1232,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) } else #endif /* WIN32 */ - if (locale == (pg_locale_t) 0) + if (loc == (locale_t) 0) { /* Use wcstombs directly for the default locale */ result = wcstombs(to, from, tolen); @@ -917,7 +1240,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) else { /* Use wcstombs_l for nondefault locales */ - result = wcstombs_l(to, from, tolen, locale->info.lt); + result = wcstombs_l(to, from, tolen, loc); } return result; @@ -932,9 +1255,9 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) * input encoding. tolen is the maximum number of wchar_t's to store at *to. * The output will be zero-terminated iff there is room. 
*/ -size_t +static size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, - pg_locale_t locale) + locale_t loc) { size_t result; @@ -969,7 +1292,7 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, /* mbstowcs requires ending '\0' */ char *str = pnstrdup(from, fromlen); - if (locale == (pg_locale_t) 0) + if (loc == (locale_t) 0) { /* Use mbstowcs directly for the default locale */ result = mbstowcs(to, str, tolen); @@ -977,7 +1300,7 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, else { /* Use mbstowcs_l for nondefault locales */ - result = mbstowcs_l(to, str, tolen, locale->info.lt); + result = mbstowcs_l(to, str, tolen, loc); } pfree(str); diff --git a/src/backend/utils/adt/pg_lsn.c b/src/backend/utils/adt/pg_lsn.c index 16311590a14a0..e1ec5f3bc69cf 100644 --- a/src/backend/utils/adt/pg_lsn.c +++ b/src/backend/utils/adt/pg_lsn.c @@ -25,8 +25,11 @@ * Formatting and conversion routines. *---------------------------------------------------------*/ +/* + * Internal version of pg_lsn_in() with support for soft error reporting. + */ XLogRecPtr -pg_lsn_in_internal(const char *str, bool *have_error) +pg_lsn_in_safe(const char *str, Node *escontext) { int len1, len2; @@ -34,22 +37,14 @@ pg_lsn_in_internal(const char *str, bool *have_error) off; XLogRecPtr result; - Assert(have_error != NULL); - *have_error = false; - /* Sanity check input format. */ len1 = strspn(str, "0123456789abcdefABCDEF"); if (len1 < 1 || len1 > MAXPG_LSNCOMPONENT || str[len1] != '/') - { - *have_error = true; - return InvalidXLogRecPtr; - } + goto syntax_error; + len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); if (len2 < 1 || len2 > MAXPG_LSNCOMPONENT || str[len1 + 1 + len2] != '\0') - { - *have_error = true; - return InvalidXLogRecPtr; - } + goto syntax_error; /* Decode result. */ id = (uint32) strtoul(str, NULL, 16); @@ -57,6 +52,12 @@ pg_lsn_in_internal(const char *str, bool *have_error) result = ((uint64) id << 32) | off; return result; + +syntax_error: + ereturn(escontext, InvalidXLogRecPtr, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s: \"%s\"", + "pg_lsn", str))); } Datum @@ -64,14 +65,8 @@ pg_lsn_in(PG_FUNCTION_ARGS) { char *str = PG_GETARG_CSTRING(0); XLogRecPtr result; - bool have_error = false; - result = pg_lsn_in_internal(str, &have_error); - if (have_error) - ereturn(fcinfo->context, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s: \"%s\"", - "pg_lsn", str))); + result = pg_lsn_in_safe(str, fcinfo->context); PG_RETURN_LSN(result); } @@ -83,7 +78,7 @@ pg_lsn_out(PG_FUNCTION_ARGS) char buf[MAXPG_LSNLEN + 1]; char *result; - snprintf(buf, sizeof buf, "%X/%X", LSN_FORMAT_ARGS(lsn)); + snprintf(buf, sizeof buf, "%X/%08X", LSN_FORMAT_ARGS(lsn)); result = pstrdup(buf); PG_RETURN_CSTRING(result); } diff --git a/src/backend/utils/adt/pg_ndistinct.c b/src/backend/utils/adt/pg_ndistinct.c new file mode 100644 index 0000000000000..8ff4353c4e7db --- /dev/null +++ b/src/backend/utils/adt/pg_ndistinct.c @@ -0,0 +1,851 @@ +/*------------------------------------------------------------------------- + * + * pg_ndistinct.c + * pg_ndistinct data type support. 
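+ *
+ * The text representation handled here is a JSON array of items, e.g.
+ * (illustrative): [{"attributes": [1, 2], "ndistinct": 11}]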
+ * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/adt/pg_ndistinct.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "common/int.h" +#include "common/jsonapi.h" +#include "lib/stringinfo.h" +#include "mb/pg_wchar.h" +#include "nodes/miscnodes.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics_format.h" +#include "utils/builtins.h" +#include "utils/fmgrprotos.h" + +/* Parsing state data */ +typedef enum +{ + NDIST_EXPECT_START = 0, + NDIST_EXPECT_ITEM, + NDIST_EXPECT_KEY, + NDIST_EXPECT_ATTNUM_LIST, + NDIST_EXPECT_ATTNUM, + NDIST_EXPECT_NDISTINCT, + NDIST_EXPECT_COMPLETE, +} NDistinctSemanticState; + +typedef struct +{ + const char *str; + NDistinctSemanticState state; + + List *distinct_items; /* Accumulated complete MVNDistinctItems */ + Node *escontext; + + bool found_attributes; /* Item has "attributes" key */ + bool found_ndistinct; /* Item has "ndistinct" key */ + List *attnum_list; /* Accumulated attribute numbers */ + int32 ndistinct; +} NDistinctParseState; + +/* + * Invoked at the start of each MVNDistinctItem. + * + * The entire JSON document should be one array of MVNDistinctItem objects. + * If we are anywhere else in the document, it is an error. + */ +static JsonParseErrorType +ndistinct_object_start(void *state) +{ + NDistinctParseState *parse = state; + + switch (parse->state) + { + case NDIST_EXPECT_ITEM: + /* Now we expect to see attributes/ndistinct keys */ + parse->state = NDIST_EXPECT_KEY; + return JSON_SUCCESS; + + case NDIST_EXPECT_START: + /* pg_ndistinct must begin with a '[' */ + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Initial element must be an array.")); + break; + + case NDIST_EXPECT_KEY: + /* In an object, expecting key */ + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("A key was expected.")); + break; + + case NDIST_EXPECT_ATTNUM_LIST: + /* Just followed an "attributes" key */ + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Value of \"%s\" must be an array of attribute numbers.", + PG_NDISTINCT_KEY_ATTRIBUTES)); + break; + + case NDIST_EXPECT_ATTNUM: + /* In an attribute number list, expect only scalar integers */ + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Attribute lists can only contain attribute numbers.")); + break; + + case NDIST_EXPECT_NDISTINCT: + /* Just followed an "ndistinct" key */ + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Value of \"%s\" must be an integer.", + PG_NDISTINCT_KEY_NDISTINCT)); + break; + + default: + elog(ERROR, + "object start of \"%s\" found in unexpected parse state: %d.", + "pg_ndistinct", (int) parse->state); + break; + } + + return JSON_SEM_ACTION_FAILED; +} + +/* + * Invoked at the end of an object. 
+ *
+ * Check to ensure that it was a complete MVNDistinctItem.
+ */
+static JsonParseErrorType
+ndistinct_object_end(void *state)
+{
+	NDistinctParseState *parse = state;
+
+	int			natts = 0;
+
+	MVNDistinctItem *item;
+
+	if (parse->state != NDIST_EXPECT_KEY)
+		elog(ERROR,
+			 "object end of \"%s\" found in unexpected parse state: %d.",
+			 "pg_ndistinct", (int) parse->state);
+
+	if (!parse->found_attributes)
+	{
+		errsave(parse->escontext,
+				errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+				errdetail("Item must contain \"%s\" key.",
+						  PG_NDISTINCT_KEY_ATTRIBUTES));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	if (!parse->found_ndistinct)
+	{
+		errsave(parse->escontext,
+				errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+				errdetail("Item must contain \"%s\" key.",
+						  PG_NDISTINCT_KEY_NDISTINCT));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	/*
+	 * We need at least two and at most STATS_MAX_DIMENSIONS attribute
+	 * numbers for an ndistinct item; anything else is malformed.
+	 */
+	natts = list_length(parse->attnum_list);
+	if ((natts < 2) || (natts > STATS_MAX_DIMENSIONS))
+	{
+		errsave(parse->escontext,
+				errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+				errdetail("The \"%s\" key must contain an array of at least %d and no more than %d attributes.",
+						  PG_NDISTINCT_KEY_ATTRIBUTES, 2, STATS_MAX_DIMENSIONS));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	/* Create the MVNDistinctItem */
+	item = palloc_object(MVNDistinctItem);
+	item->nattributes = natts;
+	item->attributes = palloc0(natts * sizeof(AttrNumber));
+	item->ndistinct = (double) parse->ndistinct;
+
+	for (int i = 0; i < natts; i++)
+		item->attributes[i] = (AttrNumber) list_nth_int(parse->attnum_list, i);
+
+	parse->distinct_items = lappend(parse->distinct_items, (void *) item);
+
+	/* reset item state vars */
+	list_free(parse->attnum_list);
+	parse->attnum_list = NIL;
+	parse->ndistinct = 0;
+	parse->found_attributes = false;
+	parse->found_ndistinct = false;
+
+	/* Now we are looking for the next MVNDistinctItem */
+	parse->state = NDIST_EXPECT_ITEM;
+	return JSON_SUCCESS;
+}
+
+
+/*
+ * Invoked at the start of an array.
+ *
+ * The ndistinct input format has two kinds of arrays: the outer
+ * MVNDistinctItem array and the attribute number array within each
+ * MVNDistinctItem.
+ */
+static JsonParseErrorType
+ndistinct_array_start(void *state)
+{
+	NDistinctParseState *parse = state;
+
+	switch (parse->state)
+	{
+		case NDIST_EXPECT_ATTNUM_LIST:
+			parse->state = NDIST_EXPECT_ATTNUM;
+			break;
+
+		case NDIST_EXPECT_START:
+			parse->state = NDIST_EXPECT_ITEM;
+			break;
+
+		default:
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+					errdetail("Array has been found at an unexpected location."));
+			return JSON_SEM_ACTION_FAILED;
+	}
+
+	return JSON_SUCCESS;
+}
+
+
+/*
+ * Invoked at the end of an array.
+ *
+ * Arrays can never be empty.
+ */
+static JsonParseErrorType
+ndistinct_array_end(void *state)
+{
+	NDistinctParseState *parse = state;
+
+	switch (parse->state)
+	{
+		case NDIST_EXPECT_ATTNUM:
+			if (list_length(parse->attnum_list) > 0)
+			{
+				/*
+				 * The attribute number list is complete; look for more
+				 * MVNDistinctItem keys.
+ */ + parse->state = NDIST_EXPECT_KEY; + return JSON_SUCCESS; + } + + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("The \"%s\" key must be a non-empty array.", + PG_NDISTINCT_KEY_ATTRIBUTES)); + break; + + case NDIST_EXPECT_ITEM: + if (list_length(parse->distinct_items) > 0) + { + /* Item list is complete, we are done. */ + parse->state = NDIST_EXPECT_COMPLETE; + return JSON_SUCCESS; + } + + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Item array cannot be empty.")); + break; + + default: + + /* + * This can only happen if a case was missed in + * ndistinct_array_start(). + */ + elog(ERROR, + "array end of \"%s\" found in unexpected parse state: %d.", + "pg_ndistinct", (int) parse->state); + break; + } + + return JSON_SEM_ACTION_FAILED; +} + +/* + * Invoked at the start of a key/value field. + * + * The valid keys for the MVNDistinctItem object are: + * - attributes + * - ndistinct + */ +static JsonParseErrorType +ndistinct_object_field_start(void *state, char *fname, bool isnull) +{ + NDistinctParseState *parse = state; + + if (strcmp(fname, PG_NDISTINCT_KEY_ATTRIBUTES) == 0) + { + if (parse->found_attributes) + { + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Multiple \"%s\" keys are not allowed.", + PG_NDISTINCT_KEY_ATTRIBUTES)); + return JSON_SEM_ACTION_FAILED; + } + parse->found_attributes = true; + parse->state = NDIST_EXPECT_ATTNUM_LIST; + return JSON_SUCCESS; + } + + if (strcmp(fname, PG_NDISTINCT_KEY_NDISTINCT) == 0) + { + if (parse->found_ndistinct) + { + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Multiple \"%s\" keys are not allowed.", + PG_NDISTINCT_KEY_NDISTINCT)); + return JSON_SEM_ACTION_FAILED; + } + parse->found_ndistinct = true; + parse->state = NDIST_EXPECT_NDISTINCT; + return JSON_SUCCESS; + } + + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Only allowed keys are \"%s\" and \"%s\".", + PG_NDISTINCT_KEY_ATTRIBUTES, + PG_NDISTINCT_KEY_NDISTINCT)); + return JSON_SEM_ACTION_FAILED; +} + +/* + * Invoked at the start of an array element. + * + * The overall structure of the datatype is an array, but there are also + * arrays as the value of every attributes key. + */ +static JsonParseErrorType +ndistinct_array_element_start(void *state, bool isnull) +{ + const NDistinctParseState *parse = state; + + switch (parse->state) + { + case NDIST_EXPECT_ATTNUM: + if (!isnull) + return JSON_SUCCESS; + + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Attribute number array cannot be null.")); + break; + + case NDIST_EXPECT_ITEM: + if (!isnull) + return JSON_SUCCESS; + + errsave(parse->escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Item list elements cannot be null.")); + + break; + + default: + elog(ERROR, + "array element start of \"%s\" found in unexpected parse state: %d.", + "pg_ndistinct", (int) parse->state); + break; + } + + return JSON_SEM_ACTION_FAILED; +} + +/* + * Test for valid subsequent attribute number. 
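+ *
+ * (Illustrative: the sequence 1, 3, 7, -1, -2 satisfies the rules below,
+ * while 3, 1 and 1, -2, -1 do not.)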
+ *
+ * If the previous value is positive, then the current value must either be
+ * greater than the previous value, or negative.
+ *
+ * If the previous value is negative, then the value must be less than
+ * the previous value.
+ *
+ * Duplicate values are obviously not allowed, but that is already covered
+ * by the rules listed above.
+ */
+static bool
+valid_subsequent_attnum(AttrNumber prev, AttrNumber cur)
+{
+	Assert(prev != 0);
+
+	if (prev > 0)
+		return ((cur > prev) || (cur < 0));
+
+	return (cur < prev);
+}
+
+/*
+ * Handle scalar events from the ndistinct input parser.
+ *
+ * Override integer parse error messages and replace them with errors
+ * specific to the context.
+ */
+static JsonParseErrorType
+ndistinct_scalar(void *state, char *token, JsonTokenType tokentype)
+{
+	NDistinctParseState *parse = state;
+	AttrNumber	attnum;
+	ErrorSaveContext escontext = {T_ErrorSaveContext};
+
+	switch (parse->state)
+	{
+		case NDIST_EXPECT_ATTNUM:
+			attnum = pg_strtoint16_safe(token, (Node *) &escontext);
+
+			if (escontext.error_occurred)
+			{
+				errsave(parse->escontext,
+						errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+						errdetail("Key \"%s\" has an incorrect value.", PG_NDISTINCT_KEY_ATTRIBUTES));
+				return JSON_SEM_ACTION_FAILED;
+			}
+
+			/*
+			 * The attribute number cannot be zero, nor a negative number
+			 * beyond the number of possible expressions.
+			 */
+			if (attnum == 0 || attnum < (0 - STATS_MAX_DIMENSIONS))
+			{
+				errsave(parse->escontext,
+						errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+						errdetail("Invalid \"%s\" element has been found: %d.",
+								  PG_NDISTINCT_KEY_ATTRIBUTES, attnum));
+				return JSON_SEM_ACTION_FAILED;
+			}
+
+			if (list_length(parse->attnum_list) > 0)
+			{
+				const AttrNumber prev = llast_int(parse->attnum_list);
+
+				if (!valid_subsequent_attnum(prev, attnum))
+				{
+					errsave(parse->escontext,
+							errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+							errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+							errdetail("Invalid \"%s\" element has been found: %d cannot follow %d.",
+									  PG_NDISTINCT_KEY_ATTRIBUTES, attnum, prev));
+					return JSON_SEM_ACTION_FAILED;
+				}
+			}
+
+			parse->attnum_list = lappend_int(parse->attnum_list, (int) attnum);
+			return JSON_SUCCESS;
+
+		case NDIST_EXPECT_NDISTINCT:
+
+			/*
+			 * While the structure dictates that ndistinct is a double
+			 * precision floating point, it has always been an integer in the
+			 * output generated.  Therefore, we parse it as an integer here.
+			 */
+			parse->ndistinct = pg_strtoint32_safe(token, (Node *) &escontext);
+
+			if (!escontext.error_occurred)
+			{
+				parse->state = NDIST_EXPECT_KEY;
+				return JSON_SUCCESS;
+			}
+
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+					errdetail("Key \"%s\" has an incorrect value.",
+							  PG_NDISTINCT_KEY_NDISTINCT));
+			break;
+
+		default:
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+					errdetail("Unexpected scalar has been found."));
+			break;
+	}
+
+	return JSON_SEM_ACTION_FAILED;
+}
+
+/*
+ * Compare the attribute arrays of two MVNDistinctItem values,
+ * looking for duplicate sets.  Return true if a duplicate set is found.
+ *
+ * The arrays are required to be in canonical order (all positive numbers
+ * in ascending order first, followed by all negative numbers in descending
+ * order) so it's safe to compare the attnums in order, stopping at the
+ * first difference.
+ */
+static bool
+item_attributes_eq(const MVNDistinctItem *a, const MVNDistinctItem *b)
+{
+	if (a->nattributes != b->nattributes)
+		return false;
+
+	for (int i = 0; i < a->nattributes; i++)
+	{
+		if (a->attributes[i] != b->attributes[i])
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Check whether an attribute number appears as one of the attribute numbers
+ * in an MVNDistinctItem.
+ */
+static bool
+item_has_attnum(const MVNDistinctItem *item, AttrNumber attnum)
+{
+	for (int i = 0; i < item->nattributes; i++)
+	{
+		if (attnum == item->attributes[i])
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Check whether the attributes in MVNDistinctItem A are a subset of the
+ * reference MVNDistinctItem B.
+ */
+static bool
+item_is_attnum_subset(const MVNDistinctItem *item,
+					  const MVNDistinctItem *refitem)
+{
+	for (int i = 0; i < item->nattributes; i++)
+	{
+		if (!item_has_attnum(refitem, item->attributes[i]))
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Generate a string representing an array of attribute numbers.
+ *
+ * Freeing the allocated string is the responsibility of the caller.
+ */
+static char *
+item_attnum_list(const MVNDistinctItem *item)
+{
+	StringInfoData str;
+
+	initStringInfo(&str);
+
+	appendStringInfo(&str, "%d", item->attributes[0]);
+
+	for (int i = 1; i < item->nattributes; i++)
+		appendStringInfo(&str, ", %d", item->attributes[i]);
+
+	return str.data;
+}
+
+/*
+ * Attempt to build and serialize the MVNDistinct object.
+ *
+ * This can only be executed after the completion of the JSON parsing.
+ *
+ * In the event of an error, set the error context and return NULL.
+ */
+static bytea *
+build_mvndistinct(NDistinctParseState *parse, char *str)
+{
+	MVNDistinct *ndistinct;
+	int			nitems = list_length(parse->distinct_items);
+	bytea	   *bytes;
+	int			item_most_attrs = 0;
+	int			item_most_attrs_idx = 0;
+
+	switch (parse->state)
+	{
+		case NDIST_EXPECT_COMPLETE:
+
+			/*
+			 * Parsing has ended correctly and we should have a list of
+			 * items.  If we don't, something went wrong in one of the
+			 * earlier parsing steps.
+			 */
+			if (nitems == 0)
+				elog(ERROR,
+					 "cannot have empty item list after parsing success.");
+			break;
+
+		case NDIST_EXPECT_START:
+			/* empty input */
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_ndistinct: \"%s\"", str),
+					errdetail("Value cannot be empty."));
+			return NULL;
+
+		default:
+			/* Unexpected end-state. */
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_ndistinct: \"%s\"", str),
+					errdetail("Unexpected end state has been found: %d.", parse->state));
+			return NULL;
+	}
+
+	ndistinct = palloc(offsetof(MVNDistinct, items) +
+					   nitems * sizeof(MVNDistinctItem));
+
+	ndistinct->magic = STATS_NDISTINCT_MAGIC;
+	ndistinct->type = STATS_NDISTINCT_TYPE_BASIC;
+	ndistinct->nitems = nitems;
+
+	for (int i = 0; i < nitems; i++)
+	{
+		MVNDistinctItem *item = list_nth(parse->distinct_items, i);
+
+		/*
+		 * Ensure that this item does not duplicate the attributes of any
+		 * pre-existing item.
+		 */
+		for (int j = 0; j < i; j++)
+		{
+			if (item_attributes_eq(item, &ndistinct->items[j]))
+			{
+				char	   *s = item_attnum_list(item);
+
+				errsave(parse->escontext,
+						errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						errmsg("malformed pg_ndistinct: \"%s\"", str),
+						errdetail("Duplicated \"%s\" array has been found: [%s].",
+								  PG_NDISTINCT_KEY_ATTRIBUTES, s));
+				pfree(s);
+				return NULL;
+			}
+		}
+
+		ndistinct->items[i].ndistinct = item->ndistinct;
+		ndistinct->items[i].nattributes = item->nattributes;
+
+		/*
+		 * This transfers freeing responsibility from the distinct_items list
+		 * to the ndistinct object.
+		 */
+		ndistinct->items[i].attributes = item->attributes;
+
+		/*
+		 * Keep track of the first longest attribute list.  All other
+		 * attribute lists must be subsets of this list.
+		 */
+		if (item->nattributes > item_most_attrs)
+		{
+			item_most_attrs = item->nattributes;
+			item_most_attrs_idx = i;
+		}
+	}
+
+	/*
+	 * Verify that all the sets of attribute numbers are proper subsets of
+	 * the longest set recorded.  This acts as an extra sanity check based on
+	 * the input given.  Note that this still needs to be cross-checked with
+	 * the extended statistics objects this would be assigned to, but it
+	 * provides one extra layer of protection.
+	 */
+	for (int i = 0; i < nitems; i++)
+	{
+		if (i == item_most_attrs_idx)
+			continue;
+
+		if (!item_is_attnum_subset(&ndistinct->items[i],
+								   &ndistinct->items[item_most_attrs_idx]))
+		{
+			const MVNDistinctItem *item = &ndistinct->items[i];
+			const MVNDistinctItem *refitem = &ndistinct->items[item_most_attrs_idx];
+			char	   *item_list = item_attnum_list(item);
+			char	   *refitem_list = item_attnum_list(refitem);
+
+			errsave(parse->escontext,
+					errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					errmsg("malformed pg_ndistinct: \"%s\"", str),
+					errdetail("\"%s\" array [%s] must be a subset of array [%s].",
+							  PG_NDISTINCT_KEY_ATTRIBUTES,
+							  item_list, refitem_list));
+			pfree(item_list);
+			pfree(refitem_list);
+			return NULL;
+		}
+	}
+
+	bytes = statext_ndistinct_serialize(ndistinct);
+
+	/*
+	 * Free the attribute lists before the ndistinct itself.
+	 */
+	for (int i = 0; i < nitems; i++)
+		pfree(ndistinct->items[i].attributes);
+	pfree(ndistinct);
+
+	return bytes;
+}
+
+/*
+ * pg_ndistinct_in
+ *		input routine for type pg_ndistinct.
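+ *
+ * Accepts a JSON array of items mirroring pg_ndistinct_out() output, e.g.
+ * (illustrative):
+ *   [{"attributes": [1, 2], "ndistinct": 11},
+ *    {"attributes": [1, 2, -1], "ndistinct": 31}]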
+ */ +Datum +pg_ndistinct_in(PG_FUNCTION_ARGS) +{ + char *str = PG_GETARG_CSTRING(0); + NDistinctParseState parse_state; + JsonParseErrorType result; + JsonLexContext *lex; + JsonSemAction sem_action; + bytea *bytes = NULL; + + /* initialize semantic state */ + parse_state.str = str; + parse_state.state = NDIST_EXPECT_START; + parse_state.distinct_items = NIL; + parse_state.escontext = fcinfo->context; + parse_state.found_attributes = false; + parse_state.found_ndistinct = false; + parse_state.attnum_list = NIL; + parse_state.ndistinct = 0; + + /* set callbacks */ + sem_action.semstate = (void *) &parse_state; + sem_action.object_start = ndistinct_object_start; + sem_action.object_end = ndistinct_object_end; + sem_action.array_start = ndistinct_array_start; + sem_action.array_end = ndistinct_array_end; + sem_action.object_field_start = ndistinct_object_field_start; + sem_action.object_field_end = NULL; + sem_action.array_element_start = ndistinct_array_element_start; + sem_action.array_element_end = NULL; + sem_action.scalar = ndistinct_scalar; + + lex = makeJsonLexContextCstringLen(NULL, str, strlen(str), + PG_UTF8, true); + result = pg_parse_json(lex, &sem_action); + freeJsonLexContext(lex); + + if (result == JSON_SUCCESS) + bytes = build_mvndistinct(&parse_state, str); + + list_free(parse_state.attnum_list); + list_free_deep(parse_state.distinct_items); + + if (bytes) + PG_RETURN_BYTEA_P(bytes); + + /* + * If escontext already set, just use that. Anything else is a generic + * JSON parse error. + */ + if (!SOFT_ERROR_OCCURRED(parse_state.escontext)) + errsave(parse_state.escontext, + errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", str), + errdetail("Input data must be valid JSON.")); + + PG_RETURN_NULL(); +} + +/* + * pg_ndistinct_out + * output routine for type pg_ndistinct + * + * Produces a human-readable representation of the value. + */ +Datum +pg_ndistinct_out(PG_FUNCTION_ARGS) +{ + bytea *data = PG_GETARG_BYTEA_PP(0); + MVNDistinct *ndist = statext_ndistinct_deserialize(data); + int i; + StringInfoData str; + + initStringInfo(&str); + appendStringInfoChar(&str, '['); + + for (i = 0; i < ndist->nitems; i++) + { + MVNDistinctItem item = ndist->items[i]; + + if (i > 0) + appendStringInfoString(&str, ", "); + + if (item.nattributes <= 0) + elog(ERROR, "invalid zero-length attribute array in MVNDistinct"); + + appendStringInfo(&str, "{\"" PG_NDISTINCT_KEY_ATTRIBUTES "\": [%d", + item.attributes[0]); + + for (int j = 1; j < item.nattributes; j++) + appendStringInfo(&str, ", %d", item.attributes[j]); + + appendStringInfo(&str, "], \"" PG_NDISTINCT_KEY_NDISTINCT "\": %d}", + (int) item.ndistinct); + } + + appendStringInfoChar(&str, ']'); + + PG_RETURN_CSTRING(str.data); +} + +/* + * pg_ndistinct_recv + * binary input routine for type pg_ndistinct + */ +Datum +pg_ndistinct_recv(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ndistinct"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_ndistinct_send + * binary output routine for type pg_ndistinct + * + * n-distinct is serialized into a bytea value, so let's send that. 
+ */ +Datum +pg_ndistinct_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index d44f8c262baa2..a4f8b4faa90dc 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -21,6 +21,7 @@ #include "commands/extension.h" #include "miscadmin.h" #include "replication/logical.h" +#include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/worker_internal.h" #include "storage/lmgr.h" @@ -410,3 +411,21 @@ binary_upgrade_replorigin_advance(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * binary_upgrade_create_conflict_detection_slot + * + * Create a replication slot to retain information necessary for conflict + * detection such as dead tuples, commit timestamps, and origins. + */ +Datum +binary_upgrade_create_conflict_detection_slot(PG_FUNCTION_ARGS) +{ + CHECK_IS_BINARY_UPGRADE; + + CreateConflictDetectionSlot(); + + ReplicationSlotRelease(); + + PG_RETURN_VOID(); +} diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 97af7c6554ff3..a97aa7c73dbe2 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -168,6 +168,9 @@ PG_STAT_GET_RELENTRY_TIMESTAMPTZ(last_vacuum_time) /* pg_stat_get_lastscan */ PG_STAT_GET_RELENTRY_TIMESTAMPTZ(lastscan) +/* pg_stat_get_stat_reset_time */ +PG_STAT_GET_RELENTRY_TIMESTAMPTZ(stat_reset_time) + Datum pg_stat_get_function_calls(PG_FUNCTION_ARGS) { @@ -200,6 +203,24 @@ PG_STAT_GET_FUNCENTRY_FLOAT8_MS(total_time) /* pg_stat_get_function_self_time */ PG_STAT_GET_FUNCENTRY_FLOAT8_MS(self_time) +Datum +pg_stat_get_function_stat_reset_time(PG_FUNCTION_ARGS) +{ + Oid funcid = PG_GETARG_OID(0); + TimestampTz result; + PgStat_StatFuncEntry *funcentry; + + if ((funcentry = pgstat_fetch_stat_funcentry(funcid)) == NULL) + result = 0; + else + result = funcentry->stat_reset_timestamp; + + if (result == 0) + PG_RETURN_NULL(); + else + PG_RETURN_TIMESTAMPTZ(result); +} + Datum pg_stat_get_backend_idset(PG_FUNCTION_ARGS) { @@ -640,10 +661,10 @@ pg_stat_get_activity(PG_FUNCTION_ARGS) values[28] = BoolGetDatum(false); /* GSS credentials not * delegated */ } - if (beentry->st_query_id == 0) + if (beentry->st_query_id == INT64CONST(0)) nulls[30] = true; else - values[30] = UInt64GetDatum(beentry->st_query_id); + values[30] = Int64GetDatum(beentry->st_query_id); } else { @@ -785,7 +806,7 @@ pg_stat_get_backend_activity(PG_FUNCTION_ARGS) activity = beentry->st_activity_raw; clipped_activity = pgstat_clip_activity(activity); - ret = cstring_to_text(activity); + ret = cstring_to_text(clipped_activity); pfree(clipped_activity); PG_RETURN_TEXT_P(ret); @@ -1510,7 +1531,7 @@ pg_stat_io_build_tuples(ReturnSetInfo *rsinfo, bktype_stats->bytes[io_obj][io_context][io_op]; /* Convert to numeric */ - snprintf(buf, sizeof buf, UINT64_FORMAT, byte); + snprintf(buf, sizeof buf, INT64_FORMAT, byte); values[byte_idx] = DirectFunctionCall3(numeric_in, CStringGetDatum(buf), ObjectIdGetDatum(0), @@ -1616,7 +1637,7 @@ static Datum pg_stat_wal_build_tuple(PgStat_WalCounters wal_counters, TimestampTz stat_reset_timestamp) { -#define PG_STAT_WAL_COLS 5 +#define PG_STAT_WAL_COLS 6 TupleDesc tupdesc; Datum values[PG_STAT_WAL_COLS] = {0}; bool nulls[PG_STAT_WAL_COLS] = {0}; @@ -1630,9 +1651,11 @@ pg_stat_wal_build_tuple(PgStat_WalCounters wal_counters, INT8OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "wal_bytes", NUMERICOID, -1, 0); - 
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "wal_buffers_full", + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "wal_fpi_bytes", + NUMERICOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "wal_buffers_full", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 5, "stats_reset", + TupleDescInitEntry(tupdesc, (AttrNumber) 6, "stats_reset", TIMESTAMPTZOID, -1, 0); BlessTupleDesc(tupdesc); @@ -1648,12 +1671,18 @@ pg_stat_wal_build_tuple(PgStat_WalCounters wal_counters, ObjectIdGetDatum(0), Int32GetDatum(-1)); - values[3] = Int64GetDatum(wal_counters.wal_buffers_full); + snprintf(buf, sizeof buf, UINT64_FORMAT, wal_counters.wal_fpi_bytes); + values[3] = DirectFunctionCall3(numeric_in, + CStringGetDatum(buf), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); + + values[4] = Int64GetDatum(wal_counters.wal_buffers_full); if (stat_reset_timestamp != 0) - values[4] = TimestampTzGetDatum(stat_reset_timestamp); + values[5] = TimestampTzGetDatum(stat_reset_timestamp); else - nulls[4] = true; + nulls[5] = true; /* Returns the record as Datum */ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); @@ -1913,7 +1942,7 @@ pg_stat_reset_shared(PG_FUNCTION_ARGS) } /* - * Reset a statistics for a single object, which may be of current + * Reset statistics for a single object, which may be of current * database or shared across all databases in the cluster. */ Datum @@ -2100,7 +2129,7 @@ pg_stat_get_archiver(PG_FUNCTION_ARGS) Datum pg_stat_get_replication_slot(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_REPLICATION_SLOT_COLS 10 +#define PG_STAT_GET_REPLICATION_SLOT_COLS 13 text *slotname_text = PG_GETARG_TEXT_P(0); NameData slotname; TupleDesc tupdesc; @@ -2125,11 +2154,17 @@ pg_stat_get_replication_slot(PG_FUNCTION_ARGS) INT8OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 7, "stream_bytes", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 8, "total_txns", + TupleDescInitEntry(tupdesc, (AttrNumber) 8, "mem_exceeded_count", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 9, "total_bytes", + TupleDescInitEntry(tupdesc, (AttrNumber) 9, "total_txns", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 10, "stats_reset", + TupleDescInitEntry(tupdesc, (AttrNumber) 10, "total_bytes", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "slotsync_skip_count", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 12, "slotsync_last_skip", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 13, "stats_reset", TIMESTAMPTZOID, -1, 0); BlessTupleDesc(tupdesc); @@ -2152,13 +2187,20 @@ pg_stat_get_replication_slot(PG_FUNCTION_ARGS) values[4] = Int64GetDatum(slotent->stream_txns); values[5] = Int64GetDatum(slotent->stream_count); values[6] = Int64GetDatum(slotent->stream_bytes); - values[7] = Int64GetDatum(slotent->total_txns); - values[8] = Int64GetDatum(slotent->total_bytes); + values[7] = Int64GetDatum(slotent->mem_exceeded_count); + values[8] = Int64GetDatum(slotent->total_txns); + values[9] = Int64GetDatum(slotent->total_bytes); + values[10] = Int64GetDatum(slotent->slotsync_skip_count); + + if (slotent->slotsync_last_skip == 0) + nulls[11] = true; + else + values[11] = TimestampTzGetDatum(slotent->slotsync_last_skip); if (slotent->stat_reset_timestamp == 0) - nulls[9] = true; + nulls[12] = true; else - values[9] = TimestampTzGetDatum(slotent->stat_reset_timestamp); + values[12] = TimestampTzGetDatum(slotent->stat_reset_timestamp); /* Returns the record as Datum */ 
PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); @@ -2171,7 +2213,7 @@ pg_stat_get_replication_slot(PG_FUNCTION_ARGS) Datum pg_stat_get_subscription_stats(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_SUBSCRIPTION_STATS_COLS 11 +#define PG_STAT_GET_SUBSCRIPTION_STATS_COLS 13 Oid subid = PG_GETARG_OID(0); TupleDesc tupdesc; Datum values[PG_STAT_GET_SUBSCRIPTION_STATS_COLS] = {0}; @@ -2189,23 +2231,27 @@ pg_stat_get_subscription_stats(PG_FUNCTION_ARGS) OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "apply_error_count", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "sync_error_count", + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "sync_seq_error_count", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "sync_table_error_count", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 4, "confl_insert_exists", + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "confl_insert_exists", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 5, "confl_update_origin_differs", + TupleDescInitEntry(tupdesc, (AttrNumber) 6, "confl_update_origin_differs", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 6, "confl_update_exists", + TupleDescInitEntry(tupdesc, (AttrNumber) 7, "confl_update_exists", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 7, "confl_update_missing", + TupleDescInitEntry(tupdesc, (AttrNumber) 8, "confl_update_deleted", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 8, "confl_delete_origin_differs", + TupleDescInitEntry(tupdesc, (AttrNumber) 9, "confl_update_missing", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 9, "confl_delete_missing", + TupleDescInitEntry(tupdesc, (AttrNumber) 10, "confl_delete_origin_differs", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 10, "confl_multiple_unique_conflicts", + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "confl_delete_missing", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 11, "stats_reset", + TupleDescInitEntry(tupdesc, (AttrNumber) 12, "confl_multiple_unique_conflicts", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 13, "stats_reset", TIMESTAMPTZOID, -1, 0); BlessTupleDesc(tupdesc); @@ -2222,8 +2268,11 @@ pg_stat_get_subscription_stats(PG_FUNCTION_ARGS) /* apply_error_count */ values[i++] = Int64GetDatum(subentry->apply_error_count); - /* sync_error_count */ - values[i++] = Int64GetDatum(subentry->sync_error_count); + /* sync_seq_error_count */ + values[i++] = Int64GetDatum(subentry->sync_seq_error_count); + + /* sync_table_error_count */ + values[i++] = Int64GetDatum(subentry->sync_table_error_count); /* conflict count */ for (int nconflict = 0; nconflict < CONFLICT_NUM_TYPES; nconflict++) diff --git a/src/backend/utils/adt/pseudorandomfuncs.c b/src/backend/utils/adt/pseudorandomfuncs.c index e7b8045f92508..1d2a981491bf5 100644 --- a/src/backend/utils/adt/pseudorandomfuncs.c +++ b/src/backend/utils/adt/pseudorandomfuncs.c @@ -17,6 +17,7 @@ #include "common/pg_prng.h" #include "miscadmin.h" +#include "utils/date.h" #include "utils/fmgrprotos.h" #include "utils/numeric.h" #include "utils/timestamp.h" @@ -25,6 +26,18 @@ static pg_prng_state prng_state; static bool prng_seed_set = false; +/* + * Macro for checking the range bounds of random(min, max) functions. Throws + * an error if they're the wrong way round. 
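+ *
+ * Illustrative: CHECK_RANGE_BOUNDS(1, 10) is a no-op, while
+ * CHECK_RANGE_BOUNDS(10, 1) raises an ERRCODE_INVALID_PARAMETER_VALUE error.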
+ */ +#define CHECK_RANGE_BOUNDS(rmin, rmax) \ + do { \ + if ((rmin) > (rmax)) \ + ereport(ERROR, \ + errcode(ERRCODE_INVALID_PARAMETER_VALUE), \ + errmsg("lower bound must be less than or equal to upper bound")); \ + } while (0) + /* * initialize_prng() - * @@ -129,10 +142,7 @@ int4random(PG_FUNCTION_ARGS) int32 rmax = PG_GETARG_INT32(1); int32 result; - if (rmin > rmax) - ereport(ERROR, - errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("lower bound must be less than or equal to upper bound")); + CHECK_RANGE_BOUNDS(rmin, rmax); initialize_prng(); @@ -153,10 +163,7 @@ int8random(PG_FUNCTION_ARGS) int64 rmax = PG_GETARG_INT64(1); int64 result; - if (rmin > rmax) - ereport(ERROR, - errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("lower bound must be less than or equal to upper bound")); + CHECK_RANGE_BOUNDS(rmin, rmax); initialize_prng(); @@ -177,9 +184,90 @@ numeric_random(PG_FUNCTION_ARGS) Numeric rmax = PG_GETARG_NUMERIC(1); Numeric result; + /* Leave range bound checking to random_numeric() */ + initialize_prng(); result = random_numeric(&prng_state, rmin, rmax); PG_RETURN_NUMERIC(result); } + + +/* + * date_random() - + * + * Returns a random date chosen uniformly in the specified range. + */ +Datum +date_random(PG_FUNCTION_ARGS) +{ + int32 rmin = (int32) PG_GETARG_DATEADT(0); + int32 rmax = (int32) PG_GETARG_DATEADT(1); + DateADT result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (DATE_IS_NOBEGIN(rmin) || DATE_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (DateADT) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_DATEADT(result); +} + +/* + * timestamp_random() - + * + * Returns a random timestamp chosen uniformly in the specified range. + */ +Datum +timestamp_random(PG_FUNCTION_ARGS) +{ + int64 rmin = (int64) PG_GETARG_TIMESTAMP(0); + int64 rmax = (int64) PG_GETARG_TIMESTAMP(1); + Timestamp result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (TIMESTAMP_IS_NOBEGIN(rmin) || TIMESTAMP_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (Timestamp) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_TIMESTAMP(result); +} + +/* + * timestamptz_random() - + * + * Returns a random timestamptz chosen uniformly in the specified range. 
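+ *
+ * SQL-level usage sketch, assuming this function backs a random(min, max)
+ * overload as the int4/int8 variants above do (illustrative):
+ *   SELECT random(timestamptz '2024-01-01', timestamptz '2025-01-01');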
+ */ +Datum +timestamptz_random(PG_FUNCTION_ARGS) +{ + int64 rmin = (int64) PG_GETARG_TIMESTAMPTZ(0); + int64 rmax = (int64) PG_GETARG_TIMESTAMPTZ(1); + TimestampTz result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (TIMESTAMP_IS_NOBEGIN(rmin) || TIMESTAMP_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (TimestampTz) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_TIMESTAMPTZ(result); +} diff --git a/src/backend/utils/adt/rangetypes.c b/src/backend/utils/adt/rangetypes.c index 66cc0acf4a712..0c454b136e22b 100644 --- a/src/backend/utils/adt/rangetypes.c +++ b/src/backend/utils/adt/rangetypes.c @@ -31,6 +31,7 @@ #include "postgres.h" #include "common/hashfn.h" +#include "funcapi.h" #include "libpq/pqformat.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -45,6 +46,7 @@ #include "utils/rangetypes.h" #include "utils/sortsupport.h" #include "utils/timestamp.h" +#include "varatt.h" /* fn_extra cache entry for one of the range I/O functions */ @@ -70,8 +72,8 @@ static char *range_deparse(char flags, const char *lbound_str, static char *range_bound_escape(const char *value); static Size datum_compute_size(Size data_length, Datum val, bool typbyval, char typalign, int16 typlen, char typstorage); -static Pointer datum_write(Pointer ptr, Datum datum, bool typbyval, - char typalign, int16 typlen, char typstorage); +static char *datum_write(char *ptr, Datum datum, bool typbyval, + char typalign, int16 typlen, char typstorage); static Node *find_simplified_clause(PlannerInfo *root, Expr *rangeExpr, Expr *elemExpr); static Expr *build_bound_expr(Expr *elemExpr, Datum val, @@ -263,7 +265,7 @@ Datum range_send(PG_FUNCTION_ARGS) { RangeType *range = PG_GETARG_RANGE_P(0); - StringInfo buf = makeStringInfo(); + StringInfoData buf; RangeIOData *cache; char flags; RangeBound lower; @@ -272,6 +274,8 @@ range_send(PG_FUNCTION_ARGS) check_stack_depth(); /* recurses when subtype is a range type */ + initStringInfo(&buf); + cache = get_range_io_data(fcinfo, RangeTypeGetOid(range), IOFunc_send); /* deserialize */ @@ -279,33 +283,31 @@ range_send(PG_FUNCTION_ARGS) flags = range_get_flags(range); /* construct output */ - pq_begintypsend(buf); + pq_begintypsend(&buf); - pq_sendbyte(buf, flags); + pq_sendbyte(&buf, flags); if (RANGE_HAS_LBOUND(flags)) { - Datum bound = PointerGetDatum(SendFunctionCall(&cache->typioproc, - lower.val)); + bytea *bound = SendFunctionCall(&cache->typioproc, lower.val); uint32 bound_len = VARSIZE(bound) - VARHDRSZ; char *bound_data = VARDATA(bound); - pq_sendint32(buf, bound_len); - pq_sendbytes(buf, bound_data, bound_len); + pq_sendint32(&buf, bound_len); + pq_sendbytes(&buf, bound_data, bound_len); } if (RANGE_HAS_UBOUND(flags)) { - Datum bound = PointerGetDatum(SendFunctionCall(&cache->typioproc, - upper.val)); + bytea *bound = SendFunctionCall(&cache->typioproc, upper.val); uint32 bound_len = VARSIZE(bound) - VARHDRSZ; char *bound_data = VARDATA(bound); - pq_sendint32(buf, bound_len); - pq_sendbytes(buf, bound_data, bound_len); + pq_sendint32(&buf, bound_len); + pq_sendbytes(&buf, bound_data, bound_len); } - PG_RETURN_BYTEA_P(pq_endtypsend(buf)); + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } /* @@ -1077,8 +1079,8 @@ range_union_internal(TypeCacheEntry *typcache, RangeType *r1, RangeType *r2, return r1; if (strict && - !DatumGetBool(range_overlaps_internal(typcache, r1, r2)) && - !DatumGetBool(range_adjacent_internal(typcache, r1, r2))) + 
!range_overlaps_internal(typcache, r1, r2) && + !range_adjacent_internal(typcache, r1, r2)) ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("result of range union would not be contiguous"))); @@ -1215,6 +1217,172 @@ range_split_internal(TypeCacheEntry *typcache, const RangeType *r1, const RangeT return false; } +/* + * range_minus_multi - like range_minus but as a SRF to accommodate splits, + * with no result rows if the result would be empty. + */ +Datum +range_minus_multi(PG_FUNCTION_ARGS) +{ + struct range_minus_multi_fctx + { + RangeType *rs[2]; + int n; + }; + + FuncCallContext *funcctx; + struct range_minus_multi_fctx *fctx; + MemoryContext oldcontext; + + /* stuff done only on the first call of the function */ + if (SRF_IS_FIRSTCALL()) + { + RangeType *r1; + RangeType *r2; + Oid rngtypid; + TypeCacheEntry *typcache; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* + * switch to memory context appropriate for multiple function calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + r1 = PG_GETARG_RANGE_P(0); + r2 = PG_GETARG_RANGE_P(1); + + /* Different types should be prevented by ANYRANGE matching rules */ + if (RangeTypeGetOid(r1) != RangeTypeGetOid(r2)) + elog(ERROR, "range types do not match"); + + /* allocate memory for user context */ + fctx = palloc_object(struct range_minus_multi_fctx); + + /* + * Initialize state. We can't store the range typcache in fn_extra + * because the caller uses that for the SRF state. + */ + rngtypid = RangeTypeGetOid(r1); + typcache = lookup_type_cache(rngtypid, TYPECACHE_RANGE_INFO); + if (typcache->rngelemtype == NULL) + elog(ERROR, "type %u is not a range type", rngtypid); + range_minus_multi_internal(typcache, r1, r2, fctx->rs, &fctx->n); + + funcctx->user_fctx = fctx; + MemoryContextSwitchTo(oldcontext); + } + + /* stuff done on every call of the function */ + funcctx = SRF_PERCALL_SETUP(); + fctx = funcctx->user_fctx; + + if (funcctx->call_cntr < fctx->n) + { + /* + * We must keep these on separate lines because SRF_RETURN_NEXT does + * call_cntr++: + */ + RangeType *ret = fctx->rs[funcctx->call_cntr]; + + SRF_RETURN_NEXT(funcctx, RangeTypePGetDatum(ret)); + } + else + /* do when there is no more left */ + SRF_RETURN_DONE(funcctx); +} + +/* + * range_minus_multi_internal - Subtracts r2 from r1 + * + * The subtraction can produce zero, one, or two resulting ranges. We return + * the results by setting outputs and outputn to the ranges remaining and their + * count (respectively). The results will never contain empty ranges and will + * be ordered. Caller should set outputs to a two-element array of RangeType + * pointers. 
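+ *
+ * Illustrative: [1,10) minus [4,6) yields the two ranges [1,4) and [6,10);
+ * [4,6) minus [1,10) yields no ranges at all.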
+ */ +void +range_minus_multi_internal(TypeCacheEntry *typcache, RangeType *r1, + RangeType *r2, RangeType **outputs, int *outputn) +{ + int cmp_l1l2, + cmp_l1u2, + cmp_u1l2, + cmp_u1u2; + RangeBound lower1, + lower2; + RangeBound upper1, + upper2; + bool empty1, + empty2; + + range_deserialize(typcache, r1, &lower1, &upper1, &empty1); + range_deserialize(typcache, r2, &lower2, &upper2, &empty2); + + if (empty1) + { + /* if r1 is empty then r1 - r2 is empty, so return zero results */ + *outputn = 0; + return; + } + else if (empty2) + { + /* r2 is empty so the result is just r1 (which we know is not empty) */ + outputs[0] = r1; + *outputn = 1; + return; + } + + /* + * Use the same logic as range_minus_internal, but support the split case + */ + cmp_l1l2 = range_cmp_bounds(typcache, &lower1, &lower2); + cmp_l1u2 = range_cmp_bounds(typcache, &lower1, &upper2); + cmp_u1l2 = range_cmp_bounds(typcache, &upper1, &lower2); + cmp_u1u2 = range_cmp_bounds(typcache, &upper1, &upper2); + + if (cmp_l1l2 < 0 && cmp_u1u2 > 0) + { + lower2.inclusive = !lower2.inclusive; + lower2.lower = false; /* it will become the upper bound */ + outputs[0] = make_range(typcache, &lower1, &lower2, false, NULL); + + upper2.inclusive = !upper2.inclusive; + upper2.lower = true; /* it will become the lower bound */ + outputs[1] = make_range(typcache, &upper2, &upper1, false, NULL); + + *outputn = 2; + } + else if (cmp_l1u2 > 0 || cmp_u1l2 < 0) + { + outputs[0] = r1; + *outputn = 1; + } + else if (cmp_l1l2 >= 0 && cmp_u1u2 <= 0) + { + *outputn = 0; + } + else if (cmp_l1l2 <= 0 && cmp_u1l2 >= 0 && cmp_u1u2 <= 0) + { + lower2.inclusive = !lower2.inclusive; + lower2.lower = false; /* it will become the upper bound */ + outputs[0] = make_range(typcache, &lower1, &lower2, false, NULL); + *outputn = 1; + } + else if (cmp_l1l2 >= 0 && cmp_u1u2 >= 0 && cmp_l1u2 <= 0) + { + upper2.inclusive = !upper2.inclusive; + upper2.lower = true; /* it will become the lower bound */ + outputs[0] = make_range(typcache, &upper2, &upper1, false, NULL); + *outputn = 1; + } + else + { + elog(ERROR, "unexpected case in range_minus_multi"); + } +} + /* range -> range aggregate functions */ Datum @@ -1345,9 +1513,9 @@ range_fast_cmp(Datum a, Datum b, SortSupport ssup) cmp = range_cmp_bounds(typcache, &upper1, &upper2); } - if ((Datum) range_a != a) + if (range_a != DatumGetPointer(a)) pfree(range_a); - if ((Datum) range_b != b) + if (range_b != DatumGetPointer(b)) pfree(range_b); return cmp; @@ -1358,7 +1526,7 @@ range_fast_cmp(Datum a, Datum b, SortSupport ssup) Datum range_lt(PG_FUNCTION_ARGS) { - int cmp = range_cmp(fcinfo); + int cmp = DatumGetInt32(range_cmp(fcinfo)); PG_RETURN_BOOL(cmp < 0); } @@ -1366,7 +1534,7 @@ range_lt(PG_FUNCTION_ARGS) Datum range_le(PG_FUNCTION_ARGS) { - int cmp = range_cmp(fcinfo); + int cmp = DatumGetInt32(range_cmp(fcinfo)); PG_RETURN_BOOL(cmp <= 0); } @@ -1374,7 +1542,7 @@ range_le(PG_FUNCTION_ARGS) Datum range_ge(PG_FUNCTION_ARGS) { - int cmp = range_cmp(fcinfo); + int cmp = DatumGetInt32(range_cmp(fcinfo)); PG_RETURN_BOOL(cmp >= 0); } @@ -1382,7 +1550,7 @@ range_ge(PG_FUNCTION_ARGS) Datum range_gt(PG_FUNCTION_ARGS) { - int cmp = range_cmp(fcinfo); + int cmp = DatumGetInt32(range_cmp(fcinfo)); PG_RETURN_BOOL(cmp > 0); } @@ -1444,7 +1612,7 @@ hash_range(PG_FUNCTION_ARGS) upper_hash = 0; /* Merge hashes of flags and bounds */ - result = hash_uint32((uint32) flags); + result = hash_bytes_uint32((uint32) flags); result ^= lower_hash; result = pg_rotate_left32(result, 1); result ^= upper_hash; @@ -1924,7 +2092,7 @@ 
range_deserialize(TypeCacheEntry *typcache, const RangeType *range, int16 typlen; bool typbyval; char typalign; - Pointer ptr; + const char *ptr; Datum lbound; Datum ubound; @@ -1940,14 +2108,14 @@ range_deserialize(TypeCacheEntry *typcache, const RangeType *range, typalign = typcache->rngelemtype->typalign; /* initialize data pointer just after the range OID */ - ptr = (Pointer) (range + 1); + ptr = (char *) (range + 1); /* fetch lower bound, if any */ if (RANGE_HAS_LBOUND(flags)) { /* att_align_pointer cannot be necessary here */ lbound = fetch_att(ptr, typbyval, typlen); - ptr = (Pointer) att_addlength_pointer(ptr, typlen, ptr); + ptr = (char *) att_addlength_pointer(ptr, typlen, ptr); } else lbound = (Datum) 0; @@ -1955,7 +2123,7 @@ range_deserialize(TypeCacheEntry *typcache, const RangeType *range, /* fetch upper bound, if any */ if (RANGE_HAS_UBOUND(flags)) { - ptr = (Pointer) att_align_pointer(ptr, typalign, typlen, ptr); + ptr = (char *) att_align_pointer(ptr, typalign, typlen, ptr); ubound = fetch_att(ptr, typbyval, typlen); /* no need for att_addlength_pointer */ } @@ -2769,8 +2937,8 @@ datum_compute_size(Size data_length, Datum val, bool typbyval, char typalign, * Write the given datum beginning at ptr (after advancing to correct * alignment, if needed). Return the pointer incremented by space used. */ -static Pointer -datum_write(Pointer ptr, Datum datum, bool typbyval, char typalign, +static char * +datum_write(char *ptr, Datum datum, bool typbyval, char typalign, int16 typlen, char typstorage) { Size data_length; diff --git a/src/backend/utils/adt/rangetypes_gist.c b/src/backend/utils/adt/rangetypes_gist.c index a60ee985e746b..33c705e6a8786 100644 --- a/src/backend/utils/adt/rangetypes_gist.c +++ b/src/backend/utils/adt/rangetypes_gist.c @@ -251,7 +251,7 @@ multirange_gist_compress(PG_FUNCTION_ARGS) MultirangeType *mr = DatumGetMultirangeTypeP(entry->key); RangeType *r; TypeCacheEntry *typcache; - GISTENTRY *retval = palloc(sizeof(GISTENTRY)); + GISTENTRY *retval = palloc_object(GISTENTRY); typcache = multirange_get_typcache(fcinfo, MultirangeTypeGetOid(mr)); r = multirange_get_union_range(typcache->rngtype, mr); @@ -1240,8 +1240,7 @@ range_gist_single_sorting_split(TypeCacheEntry *typcache, maxoff = entryvec->n - 1; - sortItems = (SingleBoundSortItem *) - palloc(maxoff * sizeof(SingleBoundSortItem)); + sortItems = palloc_array(SingleBoundSortItem, maxoff); /* * Prepare auxiliary array and sort the values. @@ -1343,8 +1342,8 @@ range_gist_double_sorting_split(TypeCacheEntry *typcache, context.first = true; /* Allocate arrays for sorted range bounds */ - by_lower = (NonEmptyRange *) palloc(nentries * sizeof(NonEmptyRange)); - by_upper = (NonEmptyRange *) palloc(nentries * sizeof(NonEmptyRange)); + by_lower = palloc_array(NonEmptyRange, nentries); + by_upper = palloc_array(NonEmptyRange, nentries); /* Fill arrays of bounds */ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) @@ -1499,8 +1498,8 @@ range_gist_double_sorting_split(TypeCacheEntry *typcache, */ /* Allocate vectors for results */ - v->spl_left = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); - v->spl_right = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); + v->spl_left = palloc_array(OffsetNumber, nentries); + v->spl_right = palloc_array(OffsetNumber, nentries); v->spl_nleft = 0; v->spl_nright = 0; @@ -1509,7 +1508,7 @@ range_gist_double_sorting_split(TypeCacheEntry *typcache, * either group without affecting overlap along selected axis. 
*/ common_entries_count = 0; - common_entries = (CommonEntry *) palloc(nentries * sizeof(CommonEntry)); + common_entries = palloc_array(CommonEntry, nentries); /* * Distribute entries which can be distributed unambiguously, and collect diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c index d126abc5a82ee..27d736a40e592 100644 --- a/src/backend/utils/adt/rangetypes_selfuncs.c +++ b/src/backend/utils/adt/rangetypes_selfuncs.c @@ -46,18 +46,18 @@ static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value, static float8 get_len_position(double value, double hist1, double hist2); static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2); -static int length_hist_bsearch(Datum *length_hist_values, +static int length_hist_bsearch(const Datum *length_hist_values, int length_hist_nvalues, double value, bool equal); -static double calc_length_hist_frac(Datum *length_hist_values, +static double calc_length_hist_frac(const Datum *length_hist_values, int length_hist_nvalues, double length1, double length2, bool equal); static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, const RangeBound *lower, RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, - Datum *length_hist_values, int length_hist_nvalues); + const Datum *length_hist_values, int length_hist_nvalues); static double calc_hist_selectivity_contains(TypeCacheEntry *typcache, const RangeBound *lower, const RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, - Datum *length_hist_values, int length_hist_nvalues); + const Datum *length_hist_values, int length_hist_nvalues); /* * Returns a default selectivity estimate for given operator, when we don't @@ -412,8 +412,8 @@ calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata, * bounds. */ nhist = hslot.nvalues; - hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist); - hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist); + hist_lower = palloc_array(RangeBound, nhist); + hist_upper = palloc_array(RangeBound, nhist); for (i = 0; i < nhist; i++) { range_deserialize(typcache, DatumGetRangeTypeP(hslot.values[i]), @@ -654,7 +654,7 @@ rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBou * given length, returns -1. */ static int -length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues, +length_hist_bsearch(const Datum *length_hist_values, int length_hist_nvalues, double value, bool equal) { int lower = -1, @@ -852,7 +852,7 @@ get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBoun * 'equal' is true). 
*/ static double -calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues, +calc_length_hist_frac(const Datum *length_hist_values, int length_hist_nvalues, double length1, double length2, bool equal) { double frac; @@ -1018,7 +1018,7 @@ static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, const RangeBound *lower, RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, - Datum *length_hist_values, int length_hist_nvalues) + const Datum *length_hist_values, int length_hist_nvalues) { int i, upper_index; @@ -1139,7 +1139,7 @@ static double calc_hist_selectivity_contains(TypeCacheEntry *typcache, const RangeBound *lower, const RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, - Datum *length_hist_values, int length_hist_nvalues) + const Datum *length_hist_values, int length_hist_nvalues) { int i, lower_index; diff --git a/src/backend/utils/adt/rangetypes_spgist.c b/src/backend/utils/adt/rangetypes_spgist.c index 9b6d7061a1812..14e5d6065f869 100644 --- a/src/backend/utils/adt/rangetypes_spgist.c +++ b/src/backend/utils/adt/rangetypes_spgist.c @@ -216,8 +216,8 @@ spg_range_quad_picksplit(PG_FUNCTION_ARGS) RangeTypeGetOid(DatumGetRangeTypeP(in->datums[0]))); /* Allocate memory for bounds */ - lowerBounds = palloc(sizeof(RangeBound) * in->nTuples); - upperBounds = palloc(sizeof(RangeBound) * in->nTuples); + lowerBounds = palloc_array(RangeBound, in->nTuples); + upperBounds = palloc_array(RangeBound, in->nTuples); j = 0; /* Deserialize bounds of ranges, count non-empty ranges */ @@ -243,8 +243,8 @@ spg_range_quad_picksplit(PG_FUNCTION_ARGS) out->prefixDatum = PointerGetDatum(NULL); out->nodeLabels = NULL; - out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples); - out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples); + out->mapTuplesToNodes = palloc_array(int, in->nTuples); + out->leafTupleDatums = palloc_array(Datum, in->nTuples); /* Place all ranges into node 0 */ for (i = 0; i < in->nTuples; i++) @@ -273,8 +273,8 @@ spg_range_quad_picksplit(PG_FUNCTION_ARGS) out->nNodes = (in->level == 0) ? 
5 : 4; out->nodeLabels = NULL; /* we don't need node labels */ - out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples); - out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples); + out->mapTuplesToNodes = palloc_array(int, in->nTuples); + out->leafTupleDatums = palloc_array(Datum, in->nTuples); /* * Assign ranges to corresponding nodes according to quadrants relative to @@ -316,7 +316,7 @@ spg_range_quad_inner_consistent(PG_FUNCTION_ARGS) { /* Report that all nodes should be visited */ out->nNodes = in->nNodes; - out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); + out->nodeNumbers = palloc_array(int, in->nNodes); for (i = 0; i < in->nNodes; i++) out->nodeNumbers[i] = i; PG_RETURN_VOID(); @@ -732,9 +732,9 @@ spg_range_quad_inner_consistent(PG_FUNCTION_ARGS) } /* We must descend into the quadrant(s) identified by 'which' */ - out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); + out->nodeNumbers = palloc_array(int, in->nNodes); if (needPrevious) - out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes); + out->traversalValues = palloc_array(void *, in->nNodes); out->nNodes = 0; /* @@ -757,7 +757,7 @@ spg_range_quad_inner_consistent(PG_FUNCTION_ARGS) * because it's range */ previousCentroid = datumCopy(in->prefixDatum, false, -1); - out->traversalValues[out->nNodes] = (void *) previousCentroid; + out->traversalValues[out->nNodes] = DatumGetPointer(previousCentroid); } out->nodeNumbers[out->nNodes] = i - 1; out->nNodes++; diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c index a18196d8a34a5..45ea6cbc78062 100644 --- a/src/backend/utils/adt/rangetypes_typanalyze.c +++ b/src/backend/utils/adt/rangetypes_typanalyze.c @@ -151,9 +151,9 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid); /* Allocate memory to hold range bounds and lengths of the sample ranges. */ - lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows); - uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows); - lengths = (float8 *) palloc(sizeof(float8) * samplerows); + lowers = palloc_array(RangeBound, samplerows); + uppers = palloc_array(RangeBound, samplerows); + lengths = palloc_array(float8, samplerows); /* Loop over the sample ranges. 
*/ for (range_no = 0; range_no < samplerows; range_no++) @@ -397,11 +397,11 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, stats->numvalues[slot_idx] = num_hist; stats->statypid[slot_idx] = FLOAT8OID; stats->statyplen[slot_idx] = sizeof(float8); - stats->statypbyval[slot_idx] = FLOAT8PASSBYVAL; + stats->statypbyval[slot_idx] = true; stats->statypalign[slot_idx] = 'd'; /* Store the fraction of empty ranges */ - emptyfrac = (float4 *) palloc(sizeof(float4)); + emptyfrac = palloc_object(float4); *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt); stats->stanumbers[slot_idx] = emptyfrac; stats->numnumbers[slot_idx] = 1; diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index edee1f7880bde..6542e8c1df0c9 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -189,7 +189,7 @@ RE_compile_and_cache(text *text_re, int cflags, Oid collation) */ /* Convert pattern string to wide characters */ - pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar)); + pattern = palloc_array(pg_wchar, text_re_len + 1); pattern_len = pg_mb2wchar_with_len(text_re_val, pattern, text_re_len); @@ -329,7 +329,7 @@ RE_execute(regex_t *re, char *dat, int dat_len, bool match; /* Convert data string to wide characters */ - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); + data = palloc_array(pg_wchar, dat_len + 1); data_len = pg_mb2wchar_with_len(dat, data, dat_len); /* Perform RE match and return result */ @@ -773,8 +773,9 @@ similar_escape_internal(text *pat_text, text *esc_text) int plen, elen; bool afterescape = false; - bool incharclass = false; int nquotes = 0; + int bracket_depth = 0; /* square bracket nesting level */ + int charclass_pos = 0; /* position inside a character class */ p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); @@ -833,6 +834,17 @@ similar_escape_internal(text *pat_text, text *esc_text) * the relevant part separators in the above expansion. If the result * of this function is used in a plain regexp match (SIMILAR TO), the * escape-double-quotes have no effect on the match behavior. + * + * While we don't fully validate character classes (bracket expressions), + * we do need to parse them well enough to know where they end. + * "charclass_pos" tracks where we are in a character class. + * Its value is uninteresting when bracket_depth is 0. + * But when bracket_depth > 0, it will be + * 1: right after the opening '[' (a following '^' will negate + * the class, while ']' is a literal character) + * 2: right after a '^' after the opening '[' (']' is still a literal + * character) + * 3 or more: further inside the character class (']' ends the class) *---------- */ @@ -904,7 +916,7 @@ similar_escape_internal(text *pat_text, text *esc_text) /* fast path */ if (afterescape) { - if (pchar == '"' && !incharclass) /* escape-double-quote? */ + if (pchar == '"' && bracket_depth < 1) /* escape-double-quote? */ { /* emit appropriate part separator, per notes above */ if (nquotes == 0) @@ -945,6 +957,12 @@ similar_escape_internal(text *pat_text, text *esc_text) */ *r++ = '\\'; *r++ = pchar; + + /* + * If we encounter an escaped character in a character class, + * we are no longer at the beginning. 
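To make the new bracket-parsing state machine concrete, here is a hand-worked trace (the input is chosen for illustration and does not appear in the patch) of how bracket_depth and charclass_pos evolve in similar_escape_internal():

    /*
     * Pattern fragment "[^]a]" -- a class matching any single character
     * other than ']' or 'a':
     *
     *   '['   bracket_depth 0 -> 1, charclass_pos = 1   (class opened)
     *   '^'   charclass_pos 1 -> 2                      (negation marker)
     *   ']'   charclass_pos -> 3                        (literal, pos <= 2)
     *   'a'   charclass_pos stays 3                     (ordinary element)
     *   ']'   bracket_depth 1 -> 0                      (pos > 2: class ends)
     *
     * A nested "[.x.]" collating element inside a class bumps bracket_depth
     * to 2, so its closing ']' (charclass_pos being 3 by then) only pops
     * back to 1 and does not end the class.
     */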
+ */ + charclass_pos = 3; } afterescape = false; } @@ -953,18 +971,69 @@ similar_escape_internal(text *pat_text, text *esc_text) /* SQL escape character; do not send to output */ afterescape = true; } - else if (incharclass) + else if (bracket_depth > 0) { + /* inside a character class */ if (pchar == '\\') + { + /* + * If we're here, backslash is not the SQL escape character, + * so treat it as a literal class element, which requires + * doubling it. (This matches our behavior for backslashes + * outside character classes.) + */ *r++ = '\\'; + } *r++ = pchar; - if (pchar == ']') - incharclass = false; + + /* parse the character class well enough to identify ending ']' */ + if (pchar == ']' && charclass_pos > 2) + { + /* found the real end of a bracket pair */ + bracket_depth--; + /* don't reset charclass_pos, this may be an inner bracket */ + } + else if (pchar == '[') + { + /* start of a nested bracket pair */ + bracket_depth++; + + /* + * We are no longer at the beginning of a character class. + * (The nested bracket pair is a collating element, not a + * character class in its own right.) + */ + charclass_pos = 3; + } + else if (pchar == '^') + { + /* + * A caret right after the opening bracket negates the + * character class. In that case, the following will + * increment charclass_pos from 1 to 2, so that a following + * ']' is still a literal character and does not end the + * character class. If we are further inside a character + * class, charclass_pos might get incremented past 3, which is + * fine. + */ + charclass_pos++; + } + else + { + /* + * Anything else (including a backslash or leading ']') is an + * element of the character class, so we are no longer at the + * beginning of the class. + */ + charclass_pos = 3; + } } else if (pchar == '[') { + /* start of a character class */ *r++ = pchar; - incharclass = true; + bracket_depth = 1; + charclass_pos = 1; } else if (pchar == '%') { @@ -1320,8 +1389,8 @@ regexp_match(PG_FUNCTION_ARGS) Assert(matchctx->nmatches == 1); /* Create workspace that build_regexp_match_result needs */ - matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns); - matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns); + matchctx->elems = palloc_array(Datum, matchctx->npatterns); + matchctx->nulls = palloc_array(bool, matchctx->npatterns); PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx))); } @@ -1363,8 +1432,8 @@ regexp_matches(PG_FUNCTION_ARGS) true, false, false); /* Pre-create workspace that build_regexp_match_result needs */ - matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns); - matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns); + matchctx->elems = palloc_array(Datum, matchctx->npatterns); + matchctx->nulls = palloc_array(bool, matchctx->npatterns); MemoryContextSwitchTo(oldcontext); funcctx->user_fctx = matchctx; @@ -1420,7 +1489,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, bool ignore_degenerate, bool fetching_unmatched) { - regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); + regexp_matches_ctx *matchctx = palloc0_object(regexp_matches_ctx); int eml = pg_database_encoding_max_length(); int orig_len; pg_wchar *wide_str; @@ -1440,7 +1509,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, /* convert string to pg_wchar form for matching */ orig_len = VARSIZE_ANY_EXHDR(orig_str); - wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); + wide_str = 
palloc_array(pg_wchar, orig_len + 1); wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); /* set up the compiled pattern */ @@ -1463,7 +1532,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, } /* temporary output space for RE package */ - pmatch = palloc(sizeof(regmatch_t) * pmatch_len); + pmatch = palloc_array(regmatch_t, pmatch_len); /* * the real output space (grown dynamically if needed) @@ -1472,7 +1541,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, * than at 2^27 */ array_len = re_flags->glob ? 255 : 31; - matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); + matchctx->match_locs = palloc_array(int, array_len); array_idx = 0; /* search for the pattern, perhaps repeatedly */ diff --git a/src/backend/utils/adt/regproc.c b/src/backend/utils/adt/regproc.c index 5ee608a2b3921..e5c2246f2c923 100644 --- a/src/backend/utils/adt/regproc.c +++ b/src/backend/utils/adt/regproc.c @@ -25,6 +25,7 @@ #include "catalog/namespace.h" #include "catalog/pg_class.h" #include "catalog/pg_collation.h" +#include "catalog/pg_database.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" #include "catalog/pg_ts_config.h" @@ -70,6 +71,7 @@ regprocin(PG_FUNCTION_ARGS) RegProcedure result; List *names; FuncCandidateList clist; + int fgc_flags; /* Handle "-" or numeric OID */ if (parseDashOrOid(pro_name_or_oid, &result, escontext)) @@ -92,7 +94,8 @@ regprocin(PG_FUNCTION_ARGS) if (names == NIL) PG_RETURN_NULL(); - clist = FuncnameGetCandidates(names, -1, NIL, false, false, false, true); + clist = FuncnameGetCandidates(names, -1, NIL, false, false, false, true, + &fgc_flags); if (clist == NULL) ereturn(escontext, (Datum) 0, @@ -163,13 +166,15 @@ regprocout(PG_FUNCTION_ARGS) { char *nspname; FuncCandidateList clist; + int fgc_flags; /* * Would this proc be found (uniquely!) by regprocin? If not, * qualify it. */ clist = FuncnameGetCandidates(list_make1(makeString(proname)), - -1, NIL, false, false, false, false); + -1, NIL, false, false, false, false, + &fgc_flags); if (clist != NULL && clist->next == NULL && clist->oid == proid) nspname = NULL; @@ -230,6 +235,7 @@ regprocedurein(PG_FUNCTION_ARGS) int nargs; Oid argtypes[FUNC_MAX_ARGS]; FuncCandidateList clist; + int fgc_flags; /* Handle "-" or numeric OID */ if (parseDashOrOid(pro_name_or_oid, &result, escontext)) @@ -250,8 +256,8 @@ regprocedurein(PG_FUNCTION_ARGS) escontext)) PG_RETURN_NULL(); - clist = FuncnameGetCandidates(names, nargs, NIL, false, false, - false, true); + clist = FuncnameGetCandidates(names, nargs, NIL, false, false, false, true, + &fgc_flags); for (; clist; clist = clist->next) { @@ -482,6 +488,7 @@ regoperin(PG_FUNCTION_ARGS) Oid result; List *names; FuncCandidateList clist; + int fgc_flags; /* Handle "0" or numeric OID */ if (parseNumericOid(opr_name_or_oid, &result, escontext)) @@ -501,7 +508,7 @@ regoperin(PG_FUNCTION_ARGS) if (names == NIL) PG_RETURN_NULL(); - clist = OpernameGetCandidates(names, '\0', true); + clist = OpernameGetCandidates(names, '\0', true, &fgc_flags); if (clist == NULL) ereturn(escontext, (Datum) 0, @@ -571,13 +578,14 @@ regoperout(PG_FUNCTION_ARGS) else { FuncCandidateList clist; + int fgc_flags; /* * Would this oper be found (uniquely!) by regoperin? If not, * qualify it. 
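Looking ahead briefly: the regdatabase input/output functions added a little further down in regproc.c combine to give roughly the following user-visible behavior (illustrative SQL in the comment; the database names are hypothetical):

    /*
     *   SELECT 'template1'::regdatabase;    -- regdatabasein: name -> OID
     *   SELECT to_regdatabase('no_such');   -- soft error path: returns NULL
     *   SELECT 0::regdatabase;              -- regdatabaseout: InvalidOid -> '-'
     *   SELECT 123456::regdatabase;         -- unmatched OID printed numerically
     */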
*/ clist = OpernameGetCandidates(list_make1(makeString(oprname)), - '\0', false); + '\0', false, &fgc_flags); if (clist != NULL && clist->next == NULL && clist->oid == oprid) result = pstrdup(oprname); @@ -1763,6 +1771,123 @@ regnamespacesend(PG_FUNCTION_ARGS) return oidsend(fcinfo); } +/* + * regdatabasein - converts database name to database OID + * + * We also accept a numeric OID, for symmetry with the output routine. + * + * '-' signifies unknown (OID 0). In all other cases, the input must + * match an existing pg_database entry. + */ +Datum +regdatabasein(PG_FUNCTION_ARGS) +{ + char *db_name_or_oid = PG_GETARG_CSTRING(0); + Node *escontext = fcinfo->context; + Oid result; + List *names; + + /* Handle "-" or numeric OID */ + if (parseDashOrOid(db_name_or_oid, &result, escontext)) + PG_RETURN_OID(result); + + /* The rest of this wouldn't work in bootstrap mode */ + if (IsBootstrapProcessingMode()) + elog(ERROR, "regdatabase values must be OIDs in bootstrap mode"); + + /* Normal case: see if the name matches any pg_database entry. */ + names = stringToQualifiedNameList(db_name_or_oid, escontext); + if (names == NIL) + PG_RETURN_NULL(); + + if (list_length(names) != 1) + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid name syntax"))); + + result = get_database_oid(strVal(linitial(names)), true); + + if (!OidIsValid(result)) + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("database \"%s\" does not exist", + strVal(linitial(names))))); + + PG_RETURN_OID(result); +} + +/* + * to_regdatabase - converts database name to database OID + * + * If the name is not found, we return NULL. + */ +Datum +to_regdatabase(PG_FUNCTION_ARGS) +{ + char *db_name = text_to_cstring(PG_GETARG_TEXT_PP(0)); + Datum result; + ErrorSaveContext escontext = {T_ErrorSaveContext}; + + if (!DirectInputFunctionCallSafe(regdatabasein, db_name, + InvalidOid, -1, + (Node *) &escontext, + &result)) + PG_RETURN_NULL(); + PG_RETURN_DATUM(result); +} + +/* + * regdatabaseout - converts database OID to database name + */ +Datum +regdatabaseout(PG_FUNCTION_ARGS) +{ + Oid dboid = PG_GETARG_OID(0); + char *result; + + if (dboid == InvalidOid) + { + result = pstrdup("-"); + PG_RETURN_CSTRING(result); + } + + result = get_database_name(dboid); + + if (result) + { + /* pstrdup is not really necessary, but it avoids a compiler warning */ + result = pstrdup(quote_identifier(result)); + } + else + { + /* If OID doesn't match any database, return it numerically */ + result = (char *) palloc(NAMEDATALEN); + snprintf(result, NAMEDATALEN, "%u", dboid); + } + + PG_RETURN_CSTRING(result); +} + +/* + * regdatabaserecv - converts external binary format to regdatabase + */ +Datum +regdatabaserecv(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidrecv, so share code */ + return oidrecv(fcinfo); +} + +/* + * regdatabasesend - converts regdatabase to binary format + */ +Datum +regdatabasesend(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidsend, so share code */ + return oidsend(fcinfo); +} + /* * text_regclass: convert text to regclass * diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 6239900fa2892..d54591fce58c2 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -30,7 +30,6 @@ #include "access/xact.h" #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" -#include "catalog/pg_proc.h" #include "commands/trigger.h" #include "executor/executor.h" #include "executor/spi.h" @@ -46,7 +45,6 
@@ #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" -#include "utils/rangetypes.h" #include "utils/rel.h" #include "utils/rls.h" #include "utils/ruleutils.h" @@ -128,9 +126,11 @@ typedef struct RI_ConstraintInfo Oid pf_eq_oprs[RI_MAX_NUMKEYS]; /* equality operators (PK = FK) */ Oid pp_eq_oprs[RI_MAX_NUMKEYS]; /* equality operators (PK = PK) */ Oid ff_eq_oprs[RI_MAX_NUMKEYS]; /* equality operators (FK = FK) */ - Oid period_contained_by_oper; /* anyrange <@ anyrange */ + Oid period_contained_by_oper; /* anyrange <@ anyrange (or + * multiranges) */ Oid agged_period_contained_by_oper; /* fkattr <@ range_agg(pkattr) */ - Oid period_intersect_oper; /* anyrange * anyrange */ + Oid period_intersect_oper; /* anyrange * anyrange (or + * multiranges) */ dlist_node valid_link; /* Link in list of valid entries */ } RI_ConstraintInfo; diff --git a/src/backend/utils/adt/rowtypes.c b/src/backend/utils/adt/rowtypes.c index fe5edc0027da3..7220995ce218e 100644 --- a/src/backend/utils/adt/rowtypes.c +++ b/src/backend/utils/adt/rowtypes.c @@ -140,8 +140,8 @@ record_in(PG_FUNCTION_ARGS) my_extra->ncolumns = ncolumns; } - values = (Datum *) palloc(ncolumns * sizeof(Datum)); - nulls = (bool *) palloc(ncolumns * sizeof(bool)); + values = palloc_array(Datum, ncolumns); + nulls = palloc_array(bool, ncolumns); /* * Scan the string. We use "buf" to accumulate the de-quoted data for @@ -383,8 +383,8 @@ record_out(PG_FUNCTION_ARGS) my_extra->ncolumns = ncolumns; } - values = (Datum *) palloc(ncolumns * sizeof(Datum)); - nulls = (bool *) palloc(ncolumns * sizeof(bool)); + values = palloc_array(Datum, ncolumns); + nulls = palloc_array(bool, ncolumns); /* Break down the tuple into fields */ heap_deform_tuple(&tuple, tupdesc, values, nulls); @@ -539,8 +539,8 @@ record_recv(PG_FUNCTION_ARGS) my_extra->ncolumns = ncolumns; } - values = (Datum *) palloc(ncolumns * sizeof(Datum)); - nulls = (bool *) palloc(ncolumns * sizeof(bool)); + values = palloc_array(Datum, ncolumns); + nulls = palloc_array(bool, ncolumns); /* Fetch number of columns user thinks it has */ usercols = pq_getmsgint(buf, 4); @@ -741,8 +741,8 @@ record_send(PG_FUNCTION_ARGS) my_extra->ncolumns = ncolumns; } - values = (Datum *) palloc(ncolumns * sizeof(Datum)); - nulls = (bool *) palloc(ncolumns * sizeof(bool)); + values = palloc_array(Datum, ncolumns); + nulls = palloc_array(bool, ncolumns); /* Break down the tuple into fields */ heap_deform_tuple(&tuple, tupdesc, values, nulls); @@ -1529,9 +1529,9 @@ record_image_cmp(FunctionCallInfo fcinfo) if ((cmpresult == 0) && (len1 != len2)) cmpresult = (len1 < len2) ? 
-1 : 1; - if ((Pointer) arg1val != (Pointer) values1[i1]) + if (arg1val != DatumGetPointer(values1[i1])) pfree(arg1val); - if ((Pointer) arg2val != (Pointer) values2[i2]) + if (arg2val != DatumGetPointer(values2[i2])) pfree(arg2val); } else @@ -1863,8 +1863,8 @@ hash_record(PG_FUNCTION_ARGS) } /* Break down the tuple into fields */ - values = (Datum *) palloc(ncolumns * sizeof(Datum)); - nulls = (bool *) palloc(ncolumns * sizeof(bool)); + values = palloc_array(Datum, ncolumns); + nulls = palloc_array(bool, ncolumns); heap_deform_tuple(&tuple, tupdesc, values, nulls); for (int i = 0; i < ncolumns; i++) @@ -1984,8 +1984,8 @@ hash_record_extended(PG_FUNCTION_ARGS) } /* Break down the tuple into fields */ - values = (Datum *) palloc(ncolumns * sizeof(Datum)); - nulls = (bool *) palloc(ncolumns * sizeof(bool)); + values = palloc_array(Datum, ncolumns); + nulls = palloc_array(bool, ncolumns); heap_deform_tuple(&tuple, tupdesc, values, nulls); for (int i = 0; i < ncolumns; i++) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 3d6e6bdbfd21b..9f85eb86da1cb 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1620,7 +1620,6 @@ pg_get_statisticsobjdef(PG_FUNCTION_ARGS) /* * Internal version for use by ALTER TABLE. - * Includes a tablespace clause in the result. * Returns a palloc'd C string; no pretty-printing. */ char * @@ -3088,7 +3087,8 @@ pg_get_functiondef(PG_FUNCTION_ARGS) * string literals. (The elements may be double-quoted as-is, * but we can't just feed them to the SQL parser; it would do * the wrong thing with elements that are zero-length or - * longer than NAMEDATALEN.) + * longer than NAMEDATALEN.) Also, we need a special case for + * empty lists. * * Variables that are not so marked should just be emitted as * simple string literals. If the variable is not known to @@ -3106,6 +3106,9 @@ pg_get_functiondef(PG_FUNCTION_ARGS) /* this shouldn't fail really */ elog(ERROR, "invalid list syntax in proconfig item"); } + /* Special case: represent an empty list as NULL */ + if (namelist == NIL) + appendStringInfoString(&buf, "NULL"); foreach(lc, namelist) { char *curname = (char *) lfirst(lc); @@ -3710,7 +3713,7 @@ deparse_context_for(const char *aliasname, Oid relid) deparse_namespace *dpns; RangeTblEntry *rte; - dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); + dpns = palloc0_object(deparse_namespace); /* Build a minimal RTE for the rel */ rte = makeNode(RangeTblEntry); @@ -3754,7 +3757,7 @@ deparse_context_for_plan_tree(PlannedStmt *pstmt, List *rtable_names) { deparse_namespace *dpns; - dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); + dpns = palloc0_object(deparse_namespace); /* Initialize fields that stay the same across the whole plan tree */ dpns->rtable = pstmt->rtable; @@ -6187,7 +6190,9 @@ get_basic_select_query(Query *query, deparse_context *context) save_ingroupby = context->inGroupBy; context->inGroupBy = true; - if (query->groupingSets == NIL) + if (query->groupByAll) + appendStringInfoString(buf, "ALL"); + else if (query->groupingSets == NIL) { sep = ""; foreach(l, query->groupClause) @@ -8750,8 +8755,16 @@ get_parameter(Param *param, deparse_context *context) subplan = find_param_generator(param, context, &column); if (subplan) { - appendStringInfo(context->buf, "(%s%s).col%d", + const char *nameprefix; + + if (subplan->isInitPlan) + nameprefix = "InitPlan "; + else + nameprefix = "SubPlan "; + + appendStringInfo(context->buf, "(%s%s%s).col%d", subplan->useHashTable ? 
"hashed " : "", + nameprefix, subplan->plan_name, column + 1); return; @@ -9588,11 +9601,19 @@ get_rule_expr(Node *node, deparse_context *context, } else { + const char *nameprefix; + /* No referencing Params, so show the SubPlan's name */ + if (subplan->isInitPlan) + nameprefix = "InitPlan "; + else + nameprefix = "SubPlan "; if (subplan->useHashTable) - appendStringInfo(buf, "hashed %s)", subplan->plan_name); + appendStringInfo(buf, "hashed %s%s)", + nameprefix, subplan->plan_name); else - appendStringInfo(buf, "%s)", subplan->plan_name); + appendStringInfo(buf, "%s%s)", + nameprefix, subplan->plan_name); } } break; @@ -9612,11 +9633,18 @@ get_rule_expr(Node *node, deparse_context *context, foreach(lc, asplan->subplans) { SubPlan *splan = lfirst_node(SubPlan, lc); + const char *nameprefix; + if (splan->isInitPlan) + nameprefix = "InitPlan "; + else + nameprefix = "SubPlan "; if (splan->useHashTable) - appendStringInfo(buf, "hashed %s", splan->plan_name); + appendStringInfo(buf, "hashed %s%s", nameprefix, + splan->plan_name); else - appendStringInfoString(buf, splan->plan_name); + appendStringInfo(buf, "%s%s", nameprefix, + splan->plan_name); if (lnext(asplan->subplans, lc)) appendStringInfoString(buf, " or "); } @@ -9911,7 +9939,7 @@ get_rule_expr(Node *node, deparse_context *context, Node *e = (Node *) lfirst(arg); if (tupdesc == NULL || - !TupleDescAttr(tupdesc, i)->attisdropped) + !TupleDescCompactAttr(tupdesc, i)->attisdropped) { appendStringInfoString(buf, sep); /* Whole-row Vars need special treatment here */ @@ -9924,7 +9952,7 @@ get_rule_expr(Node *node, deparse_context *context, { while (i < tupdesc->natts) { - if (!TupleDescAttr(tupdesc, i)->attisdropped) + if (!TupleDescCompactAttr(tupdesc, i)->attisdropped) { appendStringInfoString(buf, sep); appendStringInfoString(buf, "NULL"); @@ -10123,7 +10151,7 @@ get_rule_expr(Node *node, deparse_context *context, if (needcomma) appendStringInfoString(buf, ", "); - get_rule_expr((Node *) e, context, true); + get_rule_expr(e, context, true); appendStringInfo(buf, " AS %s", quote_identifier(map_xml_name_to_sql_identifier(argname))); needcomma = true; @@ -11090,7 +11118,12 @@ get_windowfunc_expr_helper(WindowFunc *wfunc, deparse_context *context, get_rule_expr((Node *) wfunc->aggfilter, context, false); } - appendStringInfoString(buf, ") OVER "); + appendStringInfoString(buf, ") "); + + if (wfunc->ignore_nulls == PARSER_IGNORE_NULLS) + appendStringInfoString(buf, "IGNORE NULLS "); + + appendStringInfoString(buf, "OVER "); if (context->windowClause) { @@ -13265,6 +13298,7 @@ generate_function_name(Oid funcid, int nargs, List *argnames, Oid *argtypes, bool use_variadic; char *nspname; FuncDetailCode p_result; + int fgc_flags; Oid p_funcid; Oid p_rettype; bool p_retset; @@ -13323,6 +13357,7 @@ generate_function_name(Oid funcid, int nargs, List *argnames, Oid *argtypes, p_result = func_get_detail(list_make1(makeString(proname)), NIL, argnames, nargs, argtypes, !use_variadic, true, false, + &fgc_flags, &p_funcid, &p_rettype, &p_retset, &p_nvargs, &p_vatype, &p_true_typeids, NULL); @@ -13676,25 +13711,26 @@ char * get_range_partbound_string(List *bound_datums) { deparse_context context; - StringInfo buf = makeStringInfo(); + StringInfoData buf; ListCell *cell; char *sep; + initStringInfo(&buf); memset(&context, 0, sizeof(deparse_context)); - context.buf = buf; + context.buf = &buf; - appendStringInfoChar(buf, '('); + appendStringInfoChar(&buf, '('); sep = ""; foreach(cell, bound_datums) { PartitionRangeDatum *datum = 
lfirst_node(PartitionRangeDatum, cell); - appendStringInfoString(buf, sep); + appendStringInfoString(&buf, sep); if (datum->kind == PARTITION_RANGE_DATUM_MINVALUE) - appendStringInfoString(buf, "MINVALUE"); + appendStringInfoString(&buf, "MINVALUE"); else if (datum->kind == PARTITION_RANGE_DATUM_MAXVALUE) - appendStringInfoString(buf, "MAXVALUE"); + appendStringInfoString(&buf, "MAXVALUE"); else { Const *val = castNode(Const, datum->value); @@ -13703,7 +13739,7 @@ get_range_partbound_string(List *bound_datums) } sep = ", "; } - appendStringInfoChar(buf, ')'); + appendStringInfoChar(&buf, ')'); - return buf->data; + return buf.data; } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index a96b1b9c0bc69..c760b19db55db 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -103,7 +103,6 @@ #include "access/table.h" #include "access/tableam.h" #include "access/visibilitymap.h" -#include "catalog/pg_am.h" #include "catalog/pg_collation.h" #include "catalog/pg_operator.h" #include "catalog/pg_statistic.h" @@ -144,26 +143,78 @@ #define DEFAULT_PAGE_CPU_MULTIPLIER 50.0 +/* + * In production builds, switch to hash-based MCV matching when the lists are + * large enough to amortize hash setup cost. (This threshold is compared to + * the sum of the lengths of the two MCV lists. This is simplistic but seems + * to work well enough.) In debug builds, we use a smaller threshold so that + * the regression tests cover both paths well. + */ +#ifndef USE_ASSERT_CHECKING +#define EQJOINSEL_MCV_HASH_THRESHOLD 200 +#else +#define EQJOINSEL_MCV_HASH_THRESHOLD 20 +#endif + +/* Entries in the simplehash hash table used by eqjoinsel_find_matches */ +typedef struct MCVHashEntry +{ + Datum value; /* the value represented by this entry */ + int index; /* its index in the relevant AttStatsSlot */ + uint32 hash; /* hash code for the Datum */ + char status; /* status code used by simplehash.h */ +} MCVHashEntry; + +/* private_data for the simplehash hash table */ +typedef struct MCVHashContext +{ + FunctionCallInfo equal_fcinfo; /* the equality join operator */ + FunctionCallInfo hash_fcinfo; /* the hash function to use */ + bool op_is_reversed; /* equality compares hash type to probe type */ + bool insert_mode; /* doing inserts or lookups? 
*/ + bool hash_typbyval; /* typbyval of hashed data type */ + int16 hash_typlen; /* typlen of hashed data type */ +} MCVHashContext; + +/* forward reference */ +typedef struct MCVHashTable_hash MCVHashTable_hash; + /* Hooks for plugins to get control when we ask for stats */ get_relation_stats_hook_type get_relation_stats_hook = NULL; get_index_stats_hook_type get_index_stats_hook = NULL; static double eqsel_internal(PG_FUNCTION_ARGS, bool negate); -static double eqjoinsel_inner(Oid opfuncoid, Oid collation, +static double eqjoinsel_inner(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, bool isdefault1, bool isdefault2, AttStatsSlot *sslot1, AttStatsSlot *sslot2, Form_pg_statistic stats1, Form_pg_statistic stats2, - bool have_mcvs1, bool have_mcvs2); -static double eqjoinsel_semi(Oid opfuncoid, Oid collation, + bool have_mcvs1, bool have_mcvs2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches); +static double eqjoinsel_semi(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, + bool op_is_reversed, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, bool isdefault1, bool isdefault2, AttStatsSlot *sslot1, AttStatsSlot *sslot2, Form_pg_statistic stats1, Form_pg_statistic stats2, bool have_mcvs1, bool have_mcvs2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches, RelOptInfo *inner_rel); +static void eqjoinsel_find_matches(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, + bool op_is_reversed, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + int nvalues1, int nvalues2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches, double *p_matchprodfreq); +static uint32 hash_mcv(MCVHashTable_hash *tab, Datum key); +static bool mcvs_equal(MCVHashTable_hash *tab, Datum key0, Datum key1); static bool estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, List **varinfos, double *ndistinct); static bool convert_to_scalar(Datum value, Oid valuetypid, Oid collid, @@ -219,6 +270,20 @@ static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids); static double btcost_correlation(IndexOptInfo *index, VariableStatData *vardata); +/* Define support routines for MCV hash tables */ +#define SH_PREFIX MCVHashTable +#define SH_ELEMENT_TYPE MCVHashEntry +#define SH_KEY_TYPE Datum +#define SH_KEY value +#define SH_HASH_KEY(tab,key) hash_mcv(tab, key) +#define SH_EQUAL(tab,key0,key1) mcvs_equal(tab, key0, key1) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tab,ent) (ent)->hash +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + /* * eqsel - Selectivity of "=" for any data types. @@ -1529,6 +1594,17 @@ boolvarsel(PlannerInfo *root, Node *arg, int varRelid) selec = var_eq_const(&vardata, BooleanEqualOperator, InvalidOid, BoolGetDatum(true), false, true, false); } + else if (is_funcclause(arg)) + { + /* + * If we have no stats and it's a function call, estimate 0.3333333. + * This seems a pretty unprincipled choice, but Postgres has been + * using that estimate for function calls since 1992. The hoariness + * of this behavior suggests that we should not be in too much hurry + * to use another value. 
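A note for readers who have not met simplehash.h: the SH_* macro block above stamps out a hash table specialized to MCVHashEntry elements keyed by Datum. Under the settings used here, the generated functions that this patch calls later have approximately these shapes (a sketch, not the exact generated declarations; see lib/simplehash.h):

    MCVHashTable_hash *MCVHashTable_create(MemoryContext ctx, uint32 nelements,
                                           void *private_data);
    MCVHashEntry *MCVHashTable_insert(MCVHashTable_hash *tb, Datum key,
                                      bool *found);
    MCVHashEntry *MCVHashTable_lookup(MCVHashTable_hash *tb, Datum key);
    void MCVHashTable_destroy(MCVHashTable_hash *tb);

Because SH_STORE_HASH/SH_GET_HASH are defined, each entry caches its hash code, which is what later lets the probe phase substitute a different hash function for cross-type joins without simplehash ever needing to re-hash stored entries.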
+ */ + selec = 0.3333333; + } else { /* Otherwise, the default estimate is 0.5 */ @@ -2294,12 +2370,18 @@ eqjoinsel(PG_FUNCTION_ARGS) bool isdefault1; bool isdefault2; Oid opfuncoid; + FmgrInfo eqproc; + Oid hashLeft = InvalidOid; + Oid hashRight = InvalidOid; AttStatsSlot sslot1; AttStatsSlot sslot2; Form_pg_statistic stats1 = NULL; Form_pg_statistic stats2 = NULL; bool have_mcvs1 = false; bool have_mcvs2 = false; + bool *hasmatch1 = NULL; + bool *hasmatch2 = NULL; + int nmatches = 0; bool get_mcv_stats; bool join_is_reversed; RelOptInfo *inner_rel; @@ -2350,14 +2432,34 @@ eqjoinsel(PG_FUNCTION_ARGS) ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); } + /* Prepare info usable by both eqjoinsel_inner and eqjoinsel_semi */ + if (have_mcvs1 && have_mcvs2) + { + fmgr_info(opfuncoid, &eqproc); + hasmatch1 = (bool *) palloc0(sslot1.nvalues * sizeof(bool)); + hasmatch2 = (bool *) palloc0(sslot2.nvalues * sizeof(bool)); + + /* + * If the MCV lists are long enough to justify hashing, try to look up + * hash functions for the join operator. + */ + if ((sslot1.nvalues + sslot2.nvalues) >= EQJOINSEL_MCV_HASH_THRESHOLD) + (void) get_op_hash_functions(operator, &hashLeft, &hashRight); + } + else + memset(&eqproc, 0, sizeof(eqproc)); /* silence uninit-var warnings */ + /* We need to compute the inner-join selectivity in all cases */ - selec_inner = eqjoinsel_inner(opfuncoid, collation, + selec_inner = eqjoinsel_inner(&eqproc, collation, + hashLeft, hashRight, &vardata1, &vardata2, nd1, nd2, isdefault1, isdefault2, &sslot1, &sslot2, stats1, stats2, - have_mcvs1, have_mcvs2); + have_mcvs1, have_mcvs2, + hasmatch1, hasmatch2, + &nmatches); switch (sjinfo->jointype) { @@ -2378,28 +2480,31 @@ eqjoinsel(PG_FUNCTION_ARGS) inner_rel = find_join_input_rel(root, sjinfo->min_righthand); if (!join_is_reversed) - selec = eqjoinsel_semi(opfuncoid, collation, + selec = eqjoinsel_semi(&eqproc, collation, + hashLeft, hashRight, + false, &vardata1, &vardata2, nd1, nd2, isdefault1, isdefault2, &sslot1, &sslot2, stats1, stats2, have_mcvs1, have_mcvs2, + hasmatch1, hasmatch2, + &nmatches, inner_rel); else - { - Oid commop = get_commutator(operator); - Oid commopfuncoid = OidIsValid(commop) ? get_opcode(commop) : InvalidOid; - - selec = eqjoinsel_semi(commopfuncoid, collation, + selec = eqjoinsel_semi(&eqproc, collation, + hashLeft, hashRight, + true, &vardata2, &vardata1, nd2, nd1, isdefault2, isdefault1, &sslot2, &sslot1, stats2, stats1, have_mcvs2, have_mcvs1, + hasmatch2, hasmatch1, + &nmatches, inner_rel); - } /* * We should never estimate the output of a semijoin to be more @@ -2427,6 +2532,11 @@ eqjoinsel(PG_FUNCTION_ARGS) ReleaseVariableStats(vardata1); ReleaseVariableStats(vardata2); + if (hasmatch1) + pfree(hasmatch1); + if (hasmatch2) + pfree(hasmatch2); + CLAMP_PROBABILITY(selec); PG_RETURN_FLOAT8((float8) selec); @@ -2435,17 +2545,24 @@ eqjoinsel(PG_FUNCTION_ARGS) /* * eqjoinsel_inner --- eqjoinsel for normal inner join * + * In addition to computing the selectivity estimate, this will fill + * hasmatch1[], hasmatch2[], and *p_nmatches (if have_mcvs1 && have_mcvs2). + * We may be able to re-use that data in eqjoinsel_semi. + * * We also use this for LEFT/FULL outer joins; it's not presently clear * that it's worth trying to distinguish them here. 
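Before the two function bodies, a quick sketch of how the refactored pieces cooperate, summarizing only what the hunks above already establish:

    /*
     * eqjoinsel()
     *   fmgr_info(opfuncoid, &eqproc)         -- set up once, shared
     *   palloc0 hasmatch1[] / hasmatch2[]     -- likewise shared
     *   get_op_hash_functions(...)            -- only if the combined MCV
     *                                            list length reaches
     *                                            EQJOINSEL_MCV_HASH_THRESHOLD
     *   eqjoinsel_inner(&eqproc, ...)         -- always; fills hasmatch1[],
     *                                            hasmatch2[], *p_nmatches
     *   eqjoinsel_semi(&eqproc, ...)          -- SEMI/ANTI only; re-uses the
     *                                            match data unless nd2
     *                                            clamping forces a re-match
     */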
*/ static double -eqjoinsel_inner(Oid opfuncoid, Oid collation, +eqjoinsel_inner(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, bool isdefault1, bool isdefault2, AttStatsSlot *sslot1, AttStatsSlot *sslot2, Form_pg_statistic stats1, Form_pg_statistic stats2, - bool have_mcvs1, bool have_mcvs2) + bool have_mcvs1, bool have_mcvs2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches) { double selec; @@ -2463,10 +2580,6 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, * results", Technical Report 1018, Computer Science Dept., University * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu). */ - LOCAL_FCINFO(fcinfo, 2); - FmgrInfo eqproc; - bool *hasmatch1; - bool *hasmatch2; double nullfrac1 = stats1->stanullfrac; double nullfrac2 = stats2->stanullfrac; double matchprodfreq, @@ -2481,55 +2594,17 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, int i, nmatches; - fmgr_info(opfuncoid, &eqproc); - - /* - * Save a few cycles by setting up the fcinfo struct just once. Using - * FunctionCallInvoke directly also avoids failure if the eqproc - * returns NULL, though really equality functions should never do - * that. - */ - InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, - NULL, NULL); - fcinfo->args[0].isnull = false; - fcinfo->args[1].isnull = false; - - hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); - hasmatch2 = (bool *) palloc0(sslot2->nvalues * sizeof(bool)); - - /* - * Note we assume that each MCV will match at most one member of the - * other MCV list. If the operator isn't really equality, there could - * be multiple matches --- but we don't look for them, both for speed - * and because the math wouldn't add up... - */ - matchprodfreq = 0.0; - nmatches = 0; - for (i = 0; i < sslot1->nvalues; i++) - { - int j; - - fcinfo->args[0].value = sslot1->values[i]; - - for (j = 0; j < sslot2->nvalues; j++) - { - Datum fresult; - - if (hasmatch2[j]) - continue; - fcinfo->args[1].value = sslot2->values[j]; - fcinfo->isnull = false; - fresult = FunctionCallInvoke(fcinfo); - if (!fcinfo->isnull && DatumGetBool(fresult)) - { - hasmatch1[i] = hasmatch2[j] = true; - matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j]; - nmatches++; - break; - } - } - } + /* Fill the match arrays */ + eqjoinsel_find_matches(eqproc, collation, + hashLeft, hashRight, + false, + sslot1, sslot2, + sslot1->nvalues, sslot2->nvalues, + hasmatch1, hasmatch2, + p_nmatches, &matchprodfreq); + nmatches = *p_nmatches; CLAMP_PROBABILITY(matchprodfreq); + /* Sum up frequencies of matched and unmatched MCVs */ matchfreq1 = unmatchfreq1 = 0.0; for (i = 0; i < sslot1->nvalues; i++) @@ -2551,8 +2626,6 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, } CLAMP_PROBABILITY(matchfreq2); CLAMP_PROBABILITY(unmatchfreq2); - pfree(hasmatch1); - pfree(hasmatch2); /* * Compute total frequency of non-null values that are not in the MCV @@ -2632,17 +2705,24 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, * eqjoinsel_semi --- eqjoinsel for semi join * * (Also used for anti join, which we are supposed to estimate the same way.) - * Caller has ensured that vardata1 is the LHS variable. - * Unlike eqjoinsel_inner, we have to cope with opfuncoid being InvalidOid. + * Caller has ensured that vardata1 is the LHS variable; however, eqproc + * is for the original join operator, which might now need to have the inputs + * swapped in order to apply correctly. 
Also, if have_mcvs1 && have_mcvs2 + * then hasmatch1[], hasmatch2[], and *p_nmatches were filled by + * eqjoinsel_inner. */ static double -eqjoinsel_semi(Oid opfuncoid, Oid collation, +eqjoinsel_semi(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, + bool op_is_reversed, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, bool isdefault1, bool isdefault2, AttStatsSlot *sslot1, AttStatsSlot *sslot2, Form_pg_statistic stats1, Form_pg_statistic stats2, bool have_mcvs1, bool have_mcvs2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches, RelOptInfo *inner_rel) { double selec; @@ -2680,7 +2760,7 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, isdefault2 = false; } - if (have_mcvs1 && have_mcvs2 && OidIsValid(opfuncoid)) + if (have_mcvs1 && have_mcvs2) { /* * We have most-common-value lists for both relations. Run through @@ -2690,12 +2770,9 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, * lists. We still have to estimate for the remaining population, but * in a skewed distribution this gives us a big leg up in accuracy. */ - LOCAL_FCINFO(fcinfo, 2); - FmgrInfo eqproc; - bool *hasmatch1; - bool *hasmatch2; double nullfrac1 = stats1->stanullfrac; - double matchfreq1, + double matchprodfreq, + matchfreq1, uncertainfrac, uncertain; int i, @@ -2711,52 +2788,32 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, */ clamped_nvalues2 = Min(sslot2->nvalues, nd2); - fmgr_info(opfuncoid, &eqproc); - /* - * Save a few cycles by setting up the fcinfo struct just once. Using - * FunctionCallInvoke directly also avoids failure if the eqproc - * returns NULL, though really equality functions should never do - * that. - */ - InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, - NULL, NULL); - fcinfo->args[0].isnull = false; - fcinfo->args[1].isnull = false; - - hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); - hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool)); - - /* - * Note we assume that each MCV will match at most one member of the - * other MCV list. If the operator isn't really equality, there could - * be multiple matches --- but we don't look for them, both for speed - * and because the math wouldn't add up... + * If we did not set clamped_nvalues2 to less than sslot2->nvalues, + * then the hasmatch1[] and hasmatch2[] match flags computed by + * eqjoinsel_inner are still perfectly applicable, so we need not + * re-do the matching work. Note that it does not matter if + * op_is_reversed: we'd get the same answers. + * + * If we did clamp, then a different set of sslot2 values is to be + * compared, so we have to re-do the matching. 
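The clamping rule just described, in concrete numbers (invented for illustration):

    /*
     * sslot2->nvalues = 100, nd2 = 60:
     *     clamped_nvalues2 = Min(100, 60) = 60
     *     -> re-zero the arrays and re-run eqjoinsel_find_matches()
     *        against only the 60 most common inner values
     *
     * sslot2->nvalues = 100, nd2 = 500:
     *     clamped_nvalues2 = 100 = sslot2->nvalues
     *     -> the match flags from eqjoinsel_inner() are re-used as-is
     */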
*/ - nmatches = 0; - for (i = 0; i < sslot1->nvalues; i++) + if (clamped_nvalues2 != sslot2->nvalues) { - int j; - - fcinfo->args[0].value = sslot1->values[i]; - - for (j = 0; j < clamped_nvalues2; j++) - { - Datum fresult; - - if (hasmatch2[j]) - continue; - fcinfo->args[1].value = sslot2->values[j]; - fcinfo->isnull = false; - fresult = FunctionCallInvoke(fcinfo); - if (!fcinfo->isnull && DatumGetBool(fresult)) - { - hasmatch1[i] = hasmatch2[j] = true; - nmatches++; - break; - } - } + /* Must re-zero the arrays */ + memset(hasmatch1, 0, sslot1->nvalues * sizeof(bool)); + memset(hasmatch2, 0, clamped_nvalues2 * sizeof(bool)); + /* Re-fill the match arrays */ + eqjoinsel_find_matches(eqproc, collation, + hashLeft, hashRight, + op_is_reversed, + sslot1, sslot2, + sslot1->nvalues, clamped_nvalues2, + hasmatch1, hasmatch2, + p_nmatches, &matchprodfreq); } + nmatches = *p_nmatches; + /* Sum up frequencies of matched MCVs */ matchfreq1 = 0.0; for (i = 0; i < sslot1->nvalues; i++) @@ -2765,8 +2822,6 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, matchfreq1 += sslot1->numbers[i]; } CLAMP_PROBABILITY(matchfreq1); - pfree(hasmatch1); - pfree(hasmatch2); /* * Now we need to estimate the fraction of relation 1 that has at @@ -2820,6 +2875,273 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, return selec; } +/* + * Identify matching MCVs for eqjoinsel_inner or eqjoinsel_semi. + * + * Inputs: + * eqproc: FmgrInfo for equality function to use (might be reversed) + * collation: OID of collation to use + * hashLeft, hashRight: OIDs of hash functions associated with equality op, + * or InvalidOid if we're not to use hashing + * op_is_reversed: indicates that eqproc compares right type to left type + * sslot1, sslot2: MCV values for the lefthand and righthand inputs + * nvalues1, nvalues2: number of values to be considered (can be less than + * sslotN->nvalues, but not more) + * Outputs: + * hasmatch1[], hasmatch2[]: pre-zeroed arrays of lengths nvalues1, nvalues2; + * entries are set to true if that MCV has a match on the other side + * *p_nmatches: receives number of MCV pairs that match + * *p_matchprodfreq: receives sum(sslot1->numbers[i] * sslot2->numbers[j]) + * for matching MCVs + * + * Note that hashLeft is for the eqproc's left-hand input type, hashRight + * for its right, regardless of op_is_reversed. + * + * Note we assume that each MCV will match at most one member of the other + * MCV list. If the operator isn't really equality, there could be multiple + * matches --- but we don't look for them, both for speed and because the + * math wouldn't add up... + */ +static void +eqjoinsel_find_matches(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, + bool op_is_reversed, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + int nvalues1, int nvalues2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches, double *p_matchprodfreq) +{ + LOCAL_FCINFO(fcinfo, 2); + double matchprodfreq = 0.0; + int nmatches = 0; + + /* + * Save a few cycles by setting up the fcinfo struct just once. Using + * FunctionCallInvoke directly also avoids failure if the eqproc returns + * NULL, though really equality functions should never do that. 
+ */ + InitFunctionCallInfoData(*fcinfo, eqproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + + if (OidIsValid(hashLeft) && OidIsValid(hashRight)) + { + /* Use a hash table to speed up the matching */ + LOCAL_FCINFO(hash_fcinfo, 1); + FmgrInfo hash_proc; + MCVHashContext hashContext; + MCVHashTable_hash *hashTable; + AttStatsSlot *statsProbe; + AttStatsSlot *statsHash; + bool *hasMatchProbe; + bool *hasMatchHash; + int nvaluesProbe; + int nvaluesHash; + + /* Make sure we build the hash table on the smaller array. */ + if (sslot1->nvalues >= sslot2->nvalues) + { + statsProbe = sslot1; + statsHash = sslot2; + hasMatchProbe = hasmatch1; + hasMatchHash = hasmatch2; + nvaluesProbe = nvalues1; + nvaluesHash = nvalues2; + } + else + { + /* We'll have to reverse the direction of use of the operator. */ + op_is_reversed = !op_is_reversed; + statsProbe = sslot2; + statsHash = sslot1; + hasMatchProbe = hasmatch2; + hasMatchHash = hasmatch1; + nvaluesProbe = nvalues2; + nvaluesHash = nvalues1; + } + + /* + * Build the hash table on the smaller array, using the appropriate + * hash function for its data type. + */ + fmgr_info(op_is_reversed ? hashLeft : hashRight, &hash_proc); + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + + hashContext.equal_fcinfo = fcinfo; + hashContext.hash_fcinfo = hash_fcinfo; + hashContext.op_is_reversed = op_is_reversed; + hashContext.insert_mode = true; + get_typlenbyval(statsHash->valuetype, + &hashContext.hash_typlen, + &hashContext.hash_typbyval); + + hashTable = MCVHashTable_create(CurrentMemoryContext, + nvaluesHash, + &hashContext); + + for (int i = 0; i < nvaluesHash; i++) + { + bool found = false; + MCVHashEntry *entry = MCVHashTable_insert(hashTable, + statsHash->values[i], + &found); + + /* + * MCVHashTable_insert will only report "found" if the new value + * is equal to some previous one per datum_image_eq(). That + * probably shouldn't happen, since we're not expecting duplicates + * in the MCV list. If we do find a dup, just ignore it, leaving + * the hash entry's index pointing at the first occurrence. That + * matches the behavior that the non-hashed code path would have. + */ + if (likely(!found)) + entry->index = i; + } + + /* + * Prepare to probe the hash table. If the probe values are of a + * different data type, then we need to change hash functions. (This + * code relies on the assumption that since we defined SH_STORE_HASH, + * simplehash.h will never need to compute hash values for existing + * hash table entries.) + */ + hashContext.insert_mode = false; + if (hashLeft != hashRight) + { + fmgr_info(op_is_reversed ? hashRight : hashLeft, &hash_proc); + /* Resetting hash_fcinfo is probably unnecessary, but be safe */ + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + } + + /* Look up each probe value in turn. 
*/ + for (int i = 0; i < nvaluesProbe; i++) + { + MCVHashEntry *entry = MCVHashTable_lookup(hashTable, + statsProbe->values[i]); + + /* As in the other code path, skip already-matched hash entries */ + if (entry != NULL && !hasMatchHash[entry->index]) + { + hasMatchHash[entry->index] = hasMatchProbe[i] = true; + nmatches++; + matchprodfreq += statsHash->numbers[entry->index] * statsProbe->numbers[i]; + } + } + + MCVHashTable_destroy(hashTable); + } + else + { + /* We're not to use hashing, so do it the O(N^2) way */ + int index1, + index2; + + /* Set up to supply the values in the order the operator expects */ + if (op_is_reversed) + { + index1 = 1; + index2 = 0; + } + else + { + index1 = 0; + index2 = 1; + } + + for (int i = 0; i < nvalues1; i++) + { + fcinfo->args[index1].value = sslot1->values[i]; + + for (int j = 0; j < nvalues2; j++) + { + Datum fresult; + + if (hasmatch2[j]) + continue; + fcinfo->args[index2].value = sslot2->values[j]; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + if (!fcinfo->isnull && DatumGetBool(fresult)) + { + hasmatch1[i] = hasmatch2[j] = true; + matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j]; + nmatches++; + break; + } + } + } + } + + *p_nmatches = nmatches; + *p_matchprodfreq = matchprodfreq; +} + +/* + * Support functions for the hash tables used by eqjoinsel_find_matches + */ +static uint32 +hash_mcv(MCVHashTable_hash *tab, Datum key) +{ + MCVHashContext *context = (MCVHashContext *) tab->private_data; + FunctionCallInfo fcinfo = context->hash_fcinfo; + Datum fresult; + + fcinfo->args[0].value = key; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + Assert(!fcinfo->isnull); + return DatumGetUInt32(fresult); +} + +static bool +mcvs_equal(MCVHashTable_hash *tab, Datum key0, Datum key1) +{ + MCVHashContext *context = (MCVHashContext *) tab->private_data; + + if (context->insert_mode) + { + /* + * During the insertion step, any comparisons will be between two + * Datums of the hash table's data type, so if the given operator is + * cross-type it will be the wrong thing to use. Fortunately, we can + * use datum_image_eq instead. The MCV values should all be distinct + * anyway, so it's mostly pro-forma to compare them at all. + */ + return datum_image_eq(key0, key1, + context->hash_typbyval, context->hash_typlen); + } + else + { + FunctionCallInfo fcinfo = context->equal_fcinfo; + Datum fresult; + + /* + * Apply the operator the correct way around. Although simplehash.h + * doesn't document this explicitly, during lookups key0 is from the + * hash table while key1 is the probe value, so we should compare them + * in that order only if op_is_reversed. 
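To ground op_is_reversed in a concrete case (hypothetical, not from the patch): consider a cross-type join on int4 = int8, where hashLeft and hashRight are the hash functions for int4 and int8 respectively. Hash opfamilies require that equal values hash identically across member types, which is what makes this division of labor safe:

    /*
     * int4 = int8, with the int8 side holding the smaller MCV list:
     *   - the build phase hashes the stored int8 Datums with hashRight
     *   - the probe phase hashes the int4 probe Datums with hashLeft
     *   - mcvs_equal() routes the stored Datum and the probe Datum to the
     *     operator argument of the matching type, as chosen by
     *     op_is_reversed
     */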
+ */ + if (context->op_is_reversed) + { + fcinfo->args[0].value = key0; + fcinfo->args[1].value = key1; + } + else + { + fcinfo->args[0].value = key1; + fcinfo->args[1].value = key0; + } + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + return (!fcinfo->isnull && DatumGetBool(fresult)); + } +} + /* * neqjoinsel - Join selectivity of "!=" */ @@ -3361,7 +3683,7 @@ add_unique_group_var(PlannerInfo *root, List *varinfos, } } - varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo)); + varinfo = palloc_object(GroupVarInfo); varinfo->var = var; varinfo->rel = vardata->rel; @@ -3799,18 +4121,25 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, List *hashclauses, Selectivity *innerbucketsize) { - List *clauses = list_copy(hashclauses); - List *otherclauses = NIL; - double ndistinct = 1.0; + List *clauses; + List *otherclauses; + double ndistinct; if (list_length(hashclauses) <= 1) - + { /* * Nothing to do for a single clause. Could we employ univariate * extended stat here? */ return hashclauses; + } + /* "clauses" is the list of hashclauses we've not dealt with yet */ + clauses = list_copy(hashclauses); + /* "otherclauses" holds clauses we are going to return to caller */ + otherclauses = NIL; + /* current estimate of ndistinct */ + ndistinct = 1.0; while (clauses != NIL) { ListCell *lc; @@ -3875,12 +4204,13 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, group_rel = root->simple_rel_array[relid]; } else if (group_relid != relid) - + { /* * Being in the group forming state we don't need other * clauses. */ continue; + } /* * We're going to add the new clause to the varinfos list. We @@ -3934,7 +4264,7 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, * estimate_multivariate_ndistinct(), which doesn't care about * ndistinct and isdefault fields. Thus, skip these fields. */ - varinfo = (GroupVarInfo *) palloc0(sizeof(GroupVarInfo)); + varinfo = palloc0_object(GroupVarInfo); varinfo->var = expr; varinfo->rel = root->simple_rel_array[relid]; varinfos = lappend(varinfos, varinfo); @@ -4620,6 +4950,7 @@ convert_to_scalar(Datum value, Oid valuetypid, Oid collid, double *scaledvalue, case REGDICTIONARYOID: case REGROLEOID: case REGNAMESPACEOID: + case REGDATABASEOID: *scaledvalue = convert_numeric_to_scalar(value, valuetypid, &failure); *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid, @@ -4752,6 +5083,7 @@ convert_numeric_to_scalar(Datum value, Oid typid, bool *failure) case REGDICTIONARYOID: case REGROLEOID: case REGNAMESPACEOID: + case REGDATABASEOID: /* we can treat OIDs as integers... */ return (double) DatumGetObjectId(value); } @@ -5279,8 +5611,8 @@ ReleaseDummy(HeapTuple tuple) * unique for this query. (Caution: this should be trusted for * statistical purposes only, since we do not check indimmediate nor * verify that the exact same definition of equality applies.) - * acl_ok: true if current user has permission to read the column(s) - * underlying the pg_statistic entry. This is consulted by + * acl_ok: true if current user has permission to read all table rows from + * the column(s) underlying the pg_statistic entry. This is consulted by * statistic_proc_security_check(). * * Caller is responsible for doing ReleaseVariableStats() before exiting. 
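The permission-check rewrite in the hunks below funnels several open-coded ACL tests into one helper. Its definition is outside this excerpt, so the following is only the contract inferable from the call sites shown (the third argument's type is not visible here; every call below passes NULL for it):

    /*
     * all_rows_selectable(root, relid, NULL)
     *
     * Apparently returns true only if the checking user may SELECT every
     * row of the relation (whole-table privilege, no securityQuals from
     * security barrier views or RLS), consulting the inheritance root
     * parent where appropriate -- i.e., the same conditions the deleted
     * open-coded checks used to verify.
     */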
@@ -5399,7 +5731,6 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, */ ListCell *ilist; ListCell *slist; - Oid userid; /* * The nullingrels bits within the expression could prevent us from @@ -5409,17 +5740,6 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, if (bms_overlap(varnos, root->outer_join_rels)) node = remove_nulling_relids(node, root->outer_join_rels, NULL); - /* - * Determine the user ID to use for privilege checks: either - * onerel->userid if it's set (e.g., in case we're accessing the table - * via a view), or the current user otherwise. - * - * If we drill down to child relations, we keep using the same userid: - * it's going to be the same anyway, due to how we set up the relation - * tree (q.v. build_simple_rel). - */ - userid = OidIsValid(onerel->userid) ? onerel->userid : GetUserId(); - foreach(ilist, onerel->indexlist) { IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist); @@ -5487,69 +5807,32 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, if (HeapTupleIsValid(vardata->statsTuple)) { - /* Get index's table for permission check */ - RangeTblEntry *rte; - - rte = planner_rt_fetch(index->rel->relid, root); - Assert(rte->rtekind == RTE_RELATION); - /* + * Test if user has permission to access all + * rows from the index's table. + * * For simplicity, we insist on the whole * table being selectable, rather than trying * to identify which column(s) the index - * depends on. Also require all rows to be - * selectable --- there must be no - * securityQuals from security barrier views - * or RLS policies. + * depends on. + * + * Note that for an inheritance child, + * permissions are checked on the inheritance + * root parent, and whole-table select + * privilege on the parent doesn't quite + * guarantee that the user could read all + * columns of the child. But in practice it's + * unlikely that any interesting security + * violation could result from allowing access + * to the expression index's stats, so we + * allow it anyway. See similar code in + * examine_simple_variable() for additional + * comments. */ vardata->acl_ok = - rte->securityQuals == NIL && - (pg_class_aclcheck(rte->relid, userid, - ACL_SELECT) == ACLCHECK_OK); - - /* - * If the user doesn't have permissions to - * access an inheritance child relation, check - * the permissions of the table actually - * mentioned in the query, since most likely - * the user does have that permission. Note - * that whole-table select privilege on the - * parent doesn't quite guarantee that the - * user could read all columns of the child. - * But in practice it's unlikely that any - * interesting security violation could result - * from allowing access to the expression - * index's stats, so we allow it anyway. See - * similar code in examine_simple_variable() - * for additional comments. 
- */ - if (!vardata->acl_ok && - root->append_rel_array != NULL) - { - AppendRelInfo *appinfo; - Index varno = index->rel->relid; - - appinfo = root->append_rel_array[varno]; - while (appinfo && - planner_rt_fetch(appinfo->parent_relid, - root)->rtekind == RTE_RELATION) - { - varno = appinfo->parent_relid; - appinfo = root->append_rel_array[varno]; - } - if (varno != index->rel->relid) - { - /* Repeat access check on this rel */ - rte = planner_rt_fetch(varno, root); - Assert(rte->rtekind == RTE_RELATION); - - vardata->acl_ok = - rte->securityQuals == NIL && - (pg_class_aclcheck(rte->relid, - userid, - ACL_SELECT) == ACLCHECK_OK); - } - } + all_rows_selectable(root, + index->rel->relid, + NULL); } else { @@ -5619,58 +5902,26 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, vardata->freefunc = ReleaseDummy; /* + * Test if user has permission to access all rows from the + * table. + * * For simplicity, we insist on the whole table being * selectable, rather than trying to identify which - * column(s) the statistics object depends on. Also - * require all rows to be selectable --- there must be no - * securityQuals from security barrier views or RLS - * policies. - */ - vardata->acl_ok = - rte->securityQuals == NIL && - (pg_class_aclcheck(rte->relid, userid, - ACL_SELECT) == ACLCHECK_OK); - - /* - * If the user doesn't have permissions to access an - * inheritance child relation, check the permissions of - * the table actually mentioned in the query, since most - * likely the user does have that permission. Note that - * whole-table select privilege on the parent doesn't - * quite guarantee that the user could read all columns of - * the child. But in practice it's unlikely that any - * interesting security violation could result from - * allowing access to the expression stats, so we allow it - * anyway. See similar code in examine_simple_variable() - * for additional comments. + * column(s) the statistics object depends on. + * + * Note that for an inheritance child, permissions are + * checked on the inheritance root parent, and whole-table + * select privilege on the parent doesn't quite guarantee + * that the user could read all columns of the child. But + * in practice it's unlikely that any interesting security + * violation could result from allowing access to the + * expression stats, so we allow it anyway. See similar + * code in examine_simple_variable() for additional + * comments. */ - if (!vardata->acl_ok && - root->append_rel_array != NULL) - { - AppendRelInfo *appinfo; - Index varno = onerel->relid; - - appinfo = root->append_rel_array[varno]; - while (appinfo && - planner_rt_fetch(appinfo->parent_relid, - root)->rtekind == RTE_RELATION) - { - varno = appinfo->parent_relid; - appinfo = root->append_rel_array[varno]; - } - if (varno != onerel->relid) - { - /* Repeat access check on this rel */ - rte = planner_rt_fetch(varno, root); - Assert(rte->rtekind == RTE_RELATION); - - vardata->acl_ok = - rte->securityQuals == NIL && - (pg_class_aclcheck(rte->relid, - userid, - ACL_SELECT) == ACLCHECK_OK); - } - } + vardata->acl_ok = all_rows_selectable(root, + onerel->relid, + NULL); break; } @@ -5725,109 +5976,20 @@ examine_simple_variable(PlannerInfo *root, Var *var, if (HeapTupleIsValid(vardata->statsTuple)) { - RelOptInfo *onerel = find_base_rel_noerr(root, var->varno); - Oid userid; - /* - * Check if user has permission to read this column. We require - * all rows to be accessible, so there must be no securityQuals - * from security barrier views or RLS policies. 
+ * Test if user has permission to read all rows from this column. * - * Normally the Var will have an associated RelOptInfo from which - * we can find out which userid to do the check as; but it might - * not if it's a RETURNING Var for an INSERT target relation. In - * that case use the RTEPermissionInfo associated with the RTE. + * This requires that the user has the appropriate SELECT + * privileges and that there are no securityQuals from security + * barrier views or RLS policies. If that's not the case, then we + * only permit leakproof functions to be passed pg_statistic data + * in vardata, otherwise the functions might reveal data that the + * user doesn't have permission to see --- see + * statistic_proc_security_check(). */ - if (onerel) - userid = onerel->userid; - else - { - RTEPermissionInfo *perminfo; - - perminfo = getRTEPermissionInfo(root->parse->rteperminfos, rte); - userid = perminfo->checkAsUser; - } - if (!OidIsValid(userid)) - userid = GetUserId(); - vardata->acl_ok = - rte->securityQuals == NIL && - ((pg_class_aclcheck(rte->relid, userid, - ACL_SELECT) == ACLCHECK_OK) || - (pg_attribute_aclcheck(rte->relid, var->varattno, userid, - ACL_SELECT) == ACLCHECK_OK)); - - /* - * If the user doesn't have permissions to access an inheritance - * child relation or specifically this attribute, check the - * permissions of the table/column actually mentioned in the - * query, since most likely the user does have that permission - * (else the query will fail at runtime), and if the user can read - * the column there then he can get the values of the child table - * too. To do that, we must find out which of the root parent's - * attributes the child relation's attribute corresponds to. - */ - if (!vardata->acl_ok && var->varattno > 0 && - root->append_rel_array != NULL) - { - AppendRelInfo *appinfo; - Index varno = var->varno; - int varattno = var->varattno; - bool found = false; - - appinfo = root->append_rel_array[varno]; - - /* - * Partitions are mapped to their immediate parent, not the - * root parent, so must be ready to walk up multiple - * AppendRelInfos. But stop if we hit a parent that is not - * RTE_RELATION --- that's a flattened UNION ALL subquery, not - * an inheritance parent. - */ - while (appinfo && - planner_rt_fetch(appinfo->parent_relid, - root)->rtekind == RTE_RELATION) - { - int parent_varattno; - - found = false; - if (varattno <= 0 || varattno > appinfo->num_child_cols) - break; /* safety check */ - parent_varattno = appinfo->parent_colnos[varattno - 1]; - if (parent_varattno == 0) - break; /* Var is local to child */ - - varno = appinfo->parent_relid; - varattno = parent_varattno; - found = true; - - /* If the parent is itself a child, continue up. */ - appinfo = root->append_rel_array[varno]; - } - - /* - * In rare cases, the Var may be local to the child table, in - * which case, we've got to live with having no access to this - * column's stats. - */ - if (!found) - return; - - /* Repeat the access check on this parent rel & column */ - rte = planner_rt_fetch(varno, root); - Assert(rte->rtekind == RTE_RELATION); - - /* - * Fine to use the same userid as it's the same in all - * relations of a given inheritance tree. 
- */ - vardata->acl_ok = - rte->securityQuals == NIL && - ((pg_class_aclcheck(rte->relid, userid, - ACL_SELECT) == ACLCHECK_OK) || - (pg_attribute_aclcheck(rte->relid, varattno, userid, - ACL_SELECT) == ACLCHECK_OK)); - } + all_rows_selectable(root, var->varno, + bms_make_singleton(var->varattno - FirstLowInvalidHeapAttributeNumber)); } else { @@ -6024,6 +6186,214 @@ examine_simple_variable(PlannerInfo *root, Var *var, } } +/* + * all_rows_selectable + * Test whether the user has permission to select all rows from a given + * relation. + * + * Inputs: + * root: the planner info + * varno: the index of the relation (assumed to be an RTE_RELATION) + * varattnos: the attributes for which permission is required, or NULL if + * whole-table access is required + * + * Returns true if the user has the required select permissions, and there are + * no securityQuals from security barrier views or RLS policies. + * + * Note that if the relation is an inheritance child relation, securityQuals + * and access permissions are checked against the inheritance root parent (the + * relation actually mentioned in the query) --- see the comments in + * expand_single_inheritance_child() for an explanation of why it has to be + * done this way. + * + * If varattnos is non-NULL, its attribute numbers should be offset by + * FirstLowInvalidHeapAttributeNumber so that system attributes can be + * checked. If varattnos is NULL, only table-level SELECT privileges are + * checked, not any column-level privileges. + * + * Note: if the relation is accessed via a view, this function actually tests + * whether the view owner has permission to select from the relation. To + * ensure that the current user has permission, it is also necessary to check + * that the current user has permission to select from the view, which we do + * at planner-startup --- see subquery_planner(). + * + * This is exported so that other estimation functions can use it. + */ +bool +all_rows_selectable(PlannerInfo *root, Index varno, Bitmapset *varattnos) +{ + RelOptInfo *rel = find_base_rel_noerr(root, varno); + RangeTblEntry *rte = planner_rt_fetch(varno, root); + Oid userid; + int varattno; + + Assert(rte->rtekind == RTE_RELATION); + + /* + * Determine the user ID to use for privilege checks (either the current + * user or the view owner, if we're accessing the table via a view). + * + * Normally the relation will have an associated RelOptInfo from which we + * can find the userid, but it might not if it's a RETURNING Var for an + * INSERT target relation. In that case use the RTEPermissionInfo + * associated with the RTE. + * + * If we navigate up to a parent relation, we keep using the same userid, + * since it's the same in all relations of a given inheritance tree. + */ + if (rel) + userid = rel->userid; + else + { + RTEPermissionInfo *perminfo; + + perminfo = getRTEPermissionInfo(root->parse->rteperminfos, rte); + userid = perminfo->checkAsUser; + } + if (!OidIsValid(userid)) + userid = GetUserId(); + + /* + * Permissions and securityQuals must be checked on the table actually + * mentioned in the query, so if this is an inheritance child, navigate up + * to the inheritance root parent. If the user can read the whole table + * or the required columns there, then they can read from the child table + * too. For per-column checks, we must find out which of the root + * parent's attributes the child relation's attributes correspond to. 
+ */ + if (root->append_rel_array != NULL) + { + AppendRelInfo *appinfo; + + appinfo = root->append_rel_array[varno]; + + /* + * Partitions are mapped to their immediate parent, not the root + * parent, so must be ready to walk up multiple AppendRelInfos. But + * stop if we hit a parent that is not RTE_RELATION --- that's a + * flattened UNION ALL subquery, not an inheritance parent. + */ + while (appinfo && + planner_rt_fetch(appinfo->parent_relid, + root)->rtekind == RTE_RELATION) + { + Bitmapset *parent_varattnos = NULL; + + /* + * For each child attribute, find the corresponding parent + * attribute. In rare cases, the attribute may be local to the + * child table, in which case, we've got to live with having no + * access to this column. + */ + varattno = -1; + while ((varattno = bms_next_member(varattnos, varattno)) >= 0) + { + AttrNumber attno; + AttrNumber parent_attno; + + attno = varattno + FirstLowInvalidHeapAttributeNumber; + + if (attno == InvalidAttrNumber) + { + /* + * Whole-row reference, so must map each column of the + * child to the parent table. + */ + for (attno = 1; attno <= appinfo->num_child_cols; attno++) + { + parent_attno = appinfo->parent_colnos[attno - 1]; + if (parent_attno == 0) + return false; /* attr is local to child */ + parent_varattnos = + bms_add_member(parent_varattnos, + parent_attno - FirstLowInvalidHeapAttributeNumber); + } + } + else + { + if (attno < 0) + { + /* System attnos are the same in all tables */ + parent_attno = attno; + } + else + { + if (attno > appinfo->num_child_cols) + return false; /* safety check */ + parent_attno = appinfo->parent_colnos[attno - 1]; + if (parent_attno == 0) + return false; /* attr is local to child */ + } + parent_varattnos = + bms_add_member(parent_varattnos, + parent_attno - FirstLowInvalidHeapAttributeNumber); + } + } + + /* If the parent is itself a child, continue up */ + varno = appinfo->parent_relid; + varattnos = parent_varattnos; + appinfo = root->append_rel_array[varno]; + } + + /* Perform the access check on this parent rel */ + rte = planner_rt_fetch(varno, root); + Assert(rte->rtekind == RTE_RELATION); + } + + /* + * For all rows to be accessible, there must be no securityQuals from + * security barrier views or RLS policies. + */ + if (rte->securityQuals != NIL) + return false; + + /* + * Test for table-level SELECT privilege. + * + * If varattnos is non-NULL, this is sufficient to give access to all + * requested attributes, even for a child table, since we have verified + * that all required child columns have matching parent columns. + * + * If varattnos is NULL (whole-table access requested), this doesn't + * necessarily guarantee that the user can read all columns of a child + * table, but we allow it anyway (see comments in examine_variable()) and + * don't bother checking any column privileges. + */ + if (pg_class_aclcheck(rte->relid, userid, ACL_SELECT) == ACLCHECK_OK) + return true; + + if (varattnos == NULL) + return false; /* whole-table access requested */ + + /* + * Don't have table-level SELECT privilege, so check per-column + * privileges. 
+ */ + varattno = -1; + while ((varattno = bms_next_member(varattnos, varattno)) >= 0) + { + AttrNumber attno = varattno + FirstLowInvalidHeapAttributeNumber; + + if (attno == InvalidAttrNumber) + { + /* Whole-row reference, so must have access to all columns */ + if (pg_attribute_aclcheck_all(rte->relid, userid, ACL_SELECT, + ACLMASK_ALL) != ACLCHECK_OK) + return false; + } + else + { + if (pg_attribute_aclcheck(rte->relid, attno, userid, + ACL_SELECT) != ACLCHECK_OK) + return false; + } + } + + /* If we reach here, have all required column privileges */ + return true; +} + /* * examine_indexcol_variable * Try to look up statistical data about an index column/expression. @@ -6112,15 +6482,17 @@ examine_indexcol_variable(PlannerInfo *root, IndexOptInfo *index, /* * Check whether it is permitted to call func_oid passing some of the - * pg_statistic data in vardata. We allow this either if the user has SELECT - * privileges on the table or column underlying the pg_statistic data or if - * the function is marked leakproof. + * pg_statistic data in vardata. We allow this if either of the following + * conditions is met: (1) the user has SELECT privileges on the table or + * column underlying the pg_statistic data and there are no securityQuals from + * security barrier views or RLS policies, or (2) the function is marked + * leakproof. */ bool statistic_proc_security_check(VariableStatData *vardata, Oid func_oid) { if (vardata->acl_ok) - return true; + return true; /* have SELECT privs and no securityQuals */ if (!OidIsValid(func_oid)) return false; @@ -6514,6 +6886,13 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata, if (index->hypothetical) continue; + /* + * get_actual_variable_endpoint uses the index-only-scan machinery, so + * ignore indexes that can't use it on their first column. + */ + if (!index->canreturn[0]) + continue; + /* * The first index column must match the desired variable, sortop, and * collation --- but we can use a descending-order index. diff --git a/src/backend/utils/adt/skipsupport.c b/src/backend/utils/adt/skipsupport.c index 2bd35d2d27221..2fcf5782ec810 100644 --- a/src/backend/utils/adt/skipsupport.c +++ b/src/backend/utils/adt/skipsupport.c @@ -38,7 +38,7 @@ PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, bool reverse) if (!OidIsValid(skipSupportFunction)) return NULL; - sksup = palloc(sizeof(SkipSupportData)); + sksup = palloc_object(SkipSupportData); OidFunctionCall1(skipSupportFunction, PointerGetDatum(sksup)); if (reverse) diff --git a/src/backend/utils/adt/tid.c b/src/backend/utils/adt/tid.c index 1b0df1117171a..435d40fee3e9a 100644 --- a/src/backend/utils/adt/tid.c +++ b/src/backend/utils/adt/tid.c @@ -42,7 +42,7 @@ #define DELIM ',' #define NTIDARGS 2 -static ItemPointer currtid_for_view(Relation viewrel, ItemPointer tid); +static ItemPointer currtid_for_view(Relation viewrel, const ItemPointerData *tid); /* ---------------------------------------------------------------- * tidin @@ -84,7 +84,7 @@ tidin(PG_FUNCTION_ARGS) /* * Cope with possibility that unsigned long is wider than BlockNumber, in * which case strtoul will not raise an error for some values that are out - * of the range of BlockNumber. (See similar code in oidin().) + * of the range of BlockNumber. (See similar code in uint32in_subr().) 
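/*
 * A minimal caller-side sketch of the new all_rows_selectable() helper,
 * mirroring its use in examine_simple_variable() above: attribute numbers
 * are offset by FirstLowInvalidHeapAttributeNumber so that system columns
 * can be represented in the Bitmapset.  The function name is hypothetical;
 * "root" and "var" are assumed to come from the surrounding planner
 * context.
 */
static bool
column_stats_accessible(PlannerInfo *root, Var *var)
{
	Bitmapset  *attnos;

	attnos = bms_make_singleton(var->varattno -
								FirstLowInvalidHeapAttributeNumber);
	/* passing NULL instead would request whole-table SELECT privilege */
	return all_rows_selectable(root, var->varno, attnos);
}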
*/ #if SIZEOF_LONG > 4 if (cvt != (unsigned long) blockNumber && @@ -104,7 +104,7 @@ tidin(PG_FUNCTION_ARGS) "tid", str))); offsetNumber = (OffsetNumber) cvt; - result = (ItemPointer) palloc(sizeof(ItemPointerData)); + result = (ItemPointer) palloc_object(ItemPointerData); ItemPointerSet(result, blockNumber, offsetNumber); @@ -146,7 +146,7 @@ tidrecv(PG_FUNCTION_ARGS) blockNumber = pq_getmsgint(buf, sizeof(blockNumber)); offsetNumber = pq_getmsgint(buf, sizeof(offsetNumber)); - result = (ItemPointer) palloc(sizeof(ItemPointerData)); + result = (ItemPointer) palloc_object(ItemPointerData); ItemPointerSet(result, blockNumber, offsetNumber); @@ -293,14 +293,14 @@ hashtidextended(PG_FUNCTION_ARGS) * relation "rel". */ static ItemPointer -currtid_internal(Relation rel, ItemPointer tid) +currtid_internal(Relation rel, const ItemPointerData *tid) { ItemPointer result; AclResult aclresult; Snapshot snapshot; TableScanDesc scan; - result = (ItemPointer) palloc(sizeof(ItemPointerData)); + result = (ItemPointer) palloc_object(ItemPointerData); aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), ACL_SELECT); @@ -335,7 +335,7 @@ currtid_internal(Relation rel, ItemPointer tid) * correspond to the CTID of a base relation. */ static ItemPointer -currtid_for_view(Relation viewrel, ItemPointer tid) +currtid_for_view(Relation viewrel, const ItemPointerData *tid) { TupleDesc att = RelationGetDescr(viewrel); RuleLock *rulelock; diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 347089b762646..3569d201ee151 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -937,7 +937,7 @@ interval_in(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); switch (dtype) { @@ -1004,7 +1004,7 @@ interval_recv(PG_FUNCTION_ARGS) int32 typmod = PG_GETARG_INT32(2); Interval *interval; - interval = (Interval *) palloc(sizeof(Interval)); + interval = palloc_object(Interval); interval->time = pq_getmsgint64(buf); interval->day = pq_getmsgint(buf, sizeof(interval->day)); @@ -1331,7 +1331,7 @@ interval_scale(PG_FUNCTION_ARGS) int32 typmod = PG_GETARG_INT32(1); Interval *result; - result = palloc(sizeof(Interval)); + result = palloc_object(Interval); *result = *interval; AdjustIntervalForTypmod(result, typmod, NULL); @@ -1545,7 +1545,7 @@ make_interval(PG_FUNCTION_ARGS) if (isinf(secs) || isnan(secs)) goto out_of_range; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); /* years and months -> months */ if (pg_mul_s32_overflow(years, MONTHS_PER_YEAR, &result->month) || @@ -2275,33 +2275,12 @@ timestamp_cmp(PG_FUNCTION_ARGS) PG_RETURN_INT32(timestamp_cmp_internal(dt1, dt2)); } -#if SIZEOF_DATUM < 8 -/* note: this is used for timestamptz also */ -static int -timestamp_fastcmp(Datum x, Datum y, SortSupport ssup) -{ - Timestamp a = DatumGetTimestamp(x); - Timestamp b = DatumGetTimestamp(y); - - return timestamp_cmp_internal(a, b); -} -#endif - Datum timestamp_sortsupport(PG_FUNCTION_ARGS) { SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); -#if SIZEOF_DATUM >= 8 - - /* - * If this build has pass-by-value timestamps, then we can use a standard - * comparator function. 
- */ ssup->comparator = ssup_datum_signed_cmp; -#else - ssup->comparator = timestamp_fastcmp; -#endif PG_RETURN_VOID(); } @@ -2384,18 +2363,21 @@ int32 timestamp_cmp_timestamptz_internal(Timestamp timestampVal, TimestampTz dt2) { TimestampTz dt1; - int overflow; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - dt1 = timestamp2timestamptz_opt_overflow(timestampVal, &overflow); - if (overflow > 0) - { - /* dt1 is larger than any finite timestamp, but less than infinity */ - return TIMESTAMP_IS_NOEND(dt2) ? -1 : +1; - } - if (overflow < 0) + dt1 = timestamp2timestamptz_safe(timestampVal, (Node *) &escontext); + if (escontext.error_occurred) { - /* dt1 is less than any finite timestamp, but more than -infinity */ - return TIMESTAMP_IS_NOBEGIN(dt2) ? +1 : -1; + if (TIMESTAMP_IS_NOEND(dt1)) + { + /* dt1 is larger than any finite timestamp, but less than infinity */ + return TIMESTAMP_IS_NOEND(dt2) ? -1 : +1; + } + if (TIMESTAMP_IS_NOBEGIN(dt1)) + { + /* dt1 is less than any finite timestamp, but more than -infinity */ + return TIMESTAMP_IS_NOBEGIN(dt2) ? +1 : -1; + } } return timestamptz_cmp_internal(dt1, dt2); @@ -2848,7 +2830,7 @@ timestamp_mi(PG_FUNCTION_ARGS) Timestamp dt2 = PG_GETARG_TIMESTAMP(1); Interval *result; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); /* * Handle infinities. @@ -2943,7 +2925,7 @@ interval_justify_interval(PG_FUNCTION_ARGS) TimeOffset wholeday; int32 wholemonth; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); result->month = span->month; result->day = span->day; result->time = span->time; @@ -3022,7 +3004,7 @@ interval_justify_hours(PG_FUNCTION_ARGS) Interval *result; TimeOffset wholeday; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); result->month = span->month; result->day = span->day; result->time = span->time; @@ -3064,7 +3046,7 @@ interval_justify_days(PG_FUNCTION_ARGS) Interval *result; int32 wholemonth; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); result->month = span->month; result->day = span->day; result->time = span->time; @@ -3466,7 +3448,7 @@ interval_um(PG_FUNCTION_ARGS) Interval *interval = PG_GETARG_INTERVAL_P(0); Interval *result; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); interval_um_internal(interval, result); PG_RETURN_INTERVAL_P(result); @@ -3524,7 +3506,7 @@ interval_pl(PG_FUNCTION_ARGS) Interval *span2 = PG_GETARG_INTERVAL_P(1); Interval *result; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); /* * Handle infinities. @@ -3580,7 +3562,7 @@ interval_mi(PG_FUNCTION_ARGS) Interval *span2 = PG_GETARG_INTERVAL_P(1); Interval *result; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); /* * Handle infinities. @@ -3634,7 +3616,7 @@ interval_mul(PG_FUNCTION_ARGS) orig_day = span->day; Interval *result; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); /* * Handle NaN and infinities. 
@@ -3764,7 +3746,7 @@ interval_div(PG_FUNCTION_ARGS) orig_day = span->day; Interval *result; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); if (factor == 0.0) ereport(ERROR, @@ -3993,7 +3975,7 @@ makeIntervalAggState(FunctionCallInfo fcinfo) old_context = MemoryContextSwitchTo(agg_context); - state = (IntervalAggState *) palloc0(sizeof(IntervalAggState)); + state = palloc0_object(IntervalAggState); MemoryContextSwitchTo(old_context); @@ -4180,7 +4162,7 @@ interval_avg_deserialize(PG_FUNCTION_ARGS) initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate), VARSIZE_ANY_EXHDR(sstate)); - result = (IntervalAggState *) palloc0(sizeof(IntervalAggState)); + result = palloc0_object(IntervalAggState); /* N */ result->N = pq_getmsgint64(&buf); @@ -4247,7 +4229,7 @@ interval_avg(PG_FUNCTION_ARGS) (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("interval out of range"))); - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); if (state->pInfcount > 0) INTERVAL_NOEND(result); else @@ -4284,7 +4266,7 @@ interval_sum(PG_FUNCTION_ARGS) (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("interval out of range"))); - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); if (state->pInfcount > 0) INTERVAL_NOEND(result); @@ -4317,7 +4299,7 @@ timestamp_age(PG_FUNCTION_ARGS) struct pg_tm tt2, *tm2 = &tt2; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); /* * Handle infinities. @@ -4465,7 +4447,7 @@ timestamptz_age(PG_FUNCTION_ARGS) int tz1; int tz2; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); /* * Handle infinities. @@ -4954,7 +4936,7 @@ timestamptz_trunc_internal(text *units, TimestampTz timestamp, pg_tz *tzp) case DTK_SECOND: case DTK_MILLISEC: case DTK_MICROSEC: - PG_RETURN_TIMESTAMPTZ(timestamp); + return timestamp; break; default: @@ -5138,7 +5120,7 @@ interval_trunc(PG_FUNCTION_ARGS) struct pg_itm tt, *tm = &tt; - result = (Interval *) palloc(sizeof(Interval)); + result = palloc_object(Interval); lowunits = downcase_truncate_identifier(VARDATA_ANY(units), VARSIZE_ANY_EXHDR(units), @@ -5179,7 +5161,7 @@ interval_trunc(PG_FUNCTION_ARGS) errmsg("unit \"%s\" not supported for type %s", lowunits, format_type_be(INTERVALOID)), (val == DTK_WEEK) ? errdetail("Months usually have fractional weeks.") : 0)); - result = 0; + result = NULL; } } @@ -5312,10 +5294,10 @@ isoweekdate2date(int isoweek, int wday, int *year, int *mon, int *mday) int date2isoweek(int year, int mon, int mday) { - float8 result; int day0, day4, - dayn; + dayn, + week; /* current day */ dayn = date2j(year, mon, mday); @@ -5338,13 +5320,13 @@ date2isoweek(int year, int mon, int mday) day0 = j2day(day4 - 1); } - result = (dayn - (day4 - day0)) / 7 + 1; + week = (dayn - (day4 - day0)) / 7 + 1; /* * Sometimes the last few days in a year will fall into the first week of * the next year, so check for this. 
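/*
 * On the timestamptz_trunc_internal() fix above: PG_RETURN_TIMESTAMPTZ()
 * is (approximately) defined in utils/timestamp.h as
 *
 *   #define PG_RETURN_TIMESTAMPTZ(x)  return TimestampTzGetDatum(x)
 *
 * i.e. it returns a Datum, which is only appropriate in an fmgr-callable
 * function.  timestamptz_trunc_internal() is declared to return
 * TimestampTz, so the plain "return timestamp;" is the type-correct form;
 * the macro version presumably only worked because TimestampTz and Datum
 * are both 64-bit integer types.  Similarly, "result = NULL;" in
 * interval_trunc() replaces an assignment of integer 0 to a pointer
 * variable (unreachable after ereport(ERROR), but kept to quiet compiler
 * warnings).
 */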
*/ - if (result >= 52) + if (week >= 52) { day4 = date2j(year + 1, 1, 4); @@ -5352,10 +5334,10 @@ date2isoweek(int year, int mon, int mday) day0 = j2day(day4 - 1); if (dayn >= day4 - day0) - result = (dayn - (day4 - day0)) / 7 + 1; + week = (dayn - (day4 - day0)) / 7 + 1; } - return (int) result; + return week; } @@ -5367,10 +5349,10 @@ date2isoweek(int year, int mon, int mday) int date2isoyear(int year, int mon, int mday) { - float8 result; int day0, day4, - dayn; + dayn, + week; /* current day */ dayn = date2j(year, mon, mday); @@ -5395,13 +5377,13 @@ date2isoyear(int year, int mon, int mday) year--; } - result = (dayn - (day4 - day0)) / 7 + 1; + week = (dayn - (day4 - day0)) / 7 + 1; /* * Sometimes the last few days in a year will fall into the first week of * the next year, so check for this. */ - if (result >= 52) + if (week >= 52) { day4 = date2j(year + 1, 1, 4); @@ -5650,11 +5632,11 @@ timestamp_part_common(PG_FUNCTION_ARGS, bool retnumeric) case DTK_JULIAN: if (retnumeric) - PG_RETURN_NUMERIC(numeric_add_opt_error(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), - numeric_div_opt_error(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), - int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), - NULL), - NULL)); + PG_RETURN_NUMERIC(numeric_add_safe(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), + numeric_div_safe(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), + int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), + NULL), + NULL)); else PG_RETURN_FLOAT8(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) + ((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + @@ -5706,11 +5688,11 @@ timestamp_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(timestamp - epoch, 6); else { - result = numeric_div_opt_error(numeric_sub_opt_error(int64_to_numeric(timestamp), - int64_to_numeric(epoch), - NULL), - int64_to_numeric(1000000), - NULL); + result = numeric_div_safe(numeric_sub_safe(int64_to_numeric(timestamp), + int64_to_numeric(epoch), + NULL), + int64_to_numeric(1000000), + NULL); result = DatumGetNumeric(DirectFunctionCall2(numeric_round, NumericGetDatum(result), Int32GetDatum(6))); @@ -5924,11 +5906,11 @@ timestamptz_part_common(PG_FUNCTION_ARGS, bool retnumeric) case DTK_JULIAN: if (retnumeric) - PG_RETURN_NUMERIC(numeric_add_opt_error(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), - numeric_div_opt_error(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), - int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), - NULL), - NULL)); + PG_RETURN_NUMERIC(numeric_add_safe(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), + numeric_div_safe(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), + int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), + NULL), + NULL)); else PG_RETURN_FLOAT8(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) + ((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + @@ -5977,11 +5959,11 @@ timestamptz_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(timestamp - epoch, 6); else { - result = numeric_div_opt_error(numeric_sub_opt_error(int64_to_numeric(timestamp), - int64_to_numeric(epoch), - NULL), - int64_to_numeric(1000000), - 
NULL); + result = numeric_div_safe(numeric_sub_safe(int64_to_numeric(timestamp), + int64_to_numeric(epoch), + NULL), + int64_to_numeric(1000000), + NULL); result = DatumGetNumeric(DirectFunctionCall2(numeric_round, NumericGetDatum(result), Int32GetDatum(6))); @@ -6268,9 +6250,9 @@ interval_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(val, 6); else result = - numeric_add_opt_error(int64_div_fast_to_numeric(interval->time, 6), - int64_to_numeric(secs_from_day_month), - NULL); + numeric_add_safe(int64_div_fast_to_numeric(interval->time, 6), + int64_to_numeric(secs_from_day_month), + NULL); PG_RETURN_NUMERIC(result); } @@ -6455,15 +6437,15 @@ timestamp_timestamptz(PG_FUNCTION_ARGS) /* * Convert timestamp to timestamp with time zone. * - * On successful conversion, *overflow is set to zero if it's not NULL. + * If the timestamp is finite but out of the valid range for timestamptz, + * error handling proceeds based on escontext. * - * If the timestamp is finite but out of the valid range for timestamptz, then: - * if overflow is NULL, we throw an out-of-range error. - * if overflow is not NULL, we store +1 or -1 there to indicate the sign - * of the overflow, and return the appropriate timestamptz infinity. + * If escontext is NULL, we throw an out-of-range error (hard error). + * If escontext is not NULL, we return NOBEGIN or NOEND for lower bound or + * upper bound overflow, respectively, and record a soft error. */ TimestampTz -timestamp2timestamptz_opt_overflow(Timestamp timestamp, int *overflow) +timestamp2timestamptz_safe(Timestamp timestamp, Node *escontext) { TimestampTz result; struct pg_tm tt, @@ -6471,13 +6453,10 @@ timestamp2timestamptz_opt_overflow(Timestamp timestamp, int *overflow) fsec_t fsec; int tz; - if (overflow) - *overflow = 0; - if (TIMESTAMP_NOT_FINITE(timestamp)) return timestamp; - /* We don't expect this to fail, but check it pro forma */ + /* timestamp2tm should not fail on valid timestamps, but cope */ if (timestamp2tm(timestamp, NULL, tm, &fsec, NULL, NULL) == 0) { tz = DetermineTimeZoneOffset(tm, session_timezone); @@ -6485,30 +6464,17 @@ timestamp2timestamptz_opt_overflow(Timestamp timestamp, int *overflow) result = dt2local(timestamp, -tz); if (IS_VALID_TIMESTAMP(result)) - { - return result; - } - else if (overflow) - { - if (result < MIN_TIMESTAMP) - { - *overflow = -1; - TIMESTAMP_NOBEGIN(result); - } - else - { - *overflow = 1; - TIMESTAMP_NOEND(result); - } return result; - } } - ereport(ERROR, + if (timestamp < 0) + TIMESTAMP_NOBEGIN(result); + else + TIMESTAMP_NOEND(result); + + ereturn(escontext, result, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); - - return 0; } /* @@ -6517,7 +6483,7 @@ timestamp2timestamptz_opt_overflow(Timestamp timestamp, int *overflow) static TimestampTz timestamp2timestamptz(Timestamp timestamp) { - return timestamp2timestamptz_opt_overflow(timestamp, NULL); + return timestamp2timestamptz_safe(timestamp, NULL); } /* timestamptz_timestamp() @@ -6531,8 +6497,27 @@ timestamptz_timestamp(PG_FUNCTION_ARGS) PG_RETURN_TIMESTAMP(timestamptz2timestamp(timestamp)); } +/* + * Convert timestamptz to timestamp, throwing error for overflow. + */ static Timestamp timestamptz2timestamp(TimestampTz timestamp) +{ + return timestamptz2timestamp_safe(timestamp, NULL); +} + +/* + * Convert timestamp with time zone to timestamp. + * + * If the timestamptz is finite but out of the valid range for timestamp, + * error handling proceeds based on escontext. 
+ * + * If escontext is NULL, we throw an out-of-range error (hard error). + * If escontext is not NULL, we return NOBEGIN or NOEND for lower bound or + * upper bound overflow, respectively, and record a soft error. + */ +Timestamp +timestamptz2timestamp_safe(TimestampTz timestamp, Node *escontext) { Timestamp result; struct pg_tm tt, @@ -6545,13 +6530,27 @@ timestamptz2timestamp(TimestampTz timestamp) else { if (timestamp2tm(timestamp, &tz, tm, &fsec, NULL, NULL) != 0) - ereport(ERROR, + { + if (timestamp < 0) + TIMESTAMP_NOBEGIN(result); + else + TIMESTAMP_NOEND(result); + + ereturn(escontext, result, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } if (tm2timestamp(tm, fsec, NULL, &result) != 0) - ereport(ERROR, + { + if (timestamp < 0) + TIMESTAMP_NOBEGIN(result); + else + TIMESTAMP_NOEND(result); + + ereturn(escontext, result, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } } return result; } @@ -6688,8 +6687,7 @@ generate_series_timestamp(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* allocate memory for user context */ - fctx = (generate_series_timestamp_fctx *) - palloc(sizeof(generate_series_timestamp_fctx)); + fctx = palloc_object(generate_series_timestamp_fctx); /* * Use fctx to keep state from call to call. Seed current with the @@ -6773,8 +6771,7 @@ generate_series_timestamptz_internal(FunctionCallInfo fcinfo) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* allocate memory for user context */ - fctx = (generate_series_timestamptz_fctx *) - palloc(sizeof(generate_series_timestamptz_fctx)); + fctx = palloc_object(generate_series_timestamptz_fctx); /* * Use fctx to keep state from call to call. Seed current with the diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c index 2712fd89df095..24a1cbe89ed9b 100644 --- a/src/backend/utils/adt/tsginidx.c +++ b/src/backend/utils/adt/tsginidx.c @@ -73,7 +73,7 @@ gin_extract_tsvector(PG_FUNCTION_ARGS) int i; WordEntry *we = ARRPTR(vector); - entries = (Datum *) palloc(sizeof(Datum) * vector->size); + entries = palloc_array(Datum, vector->size); for (i = 0; i < vector->size; i++) { @@ -133,16 +133,16 @@ gin_extract_tsquery(PG_FUNCTION_ARGS) } *nentries = j; - entries = (Datum *) palloc(sizeof(Datum) * j); - partialmatch = *ptr_partialmatch = (bool *) palloc(sizeof(bool) * j); + entries = palloc_array(Datum, j); + partialmatch = *ptr_partialmatch = palloc_array(bool, j); /* * Make map to convert item's number to corresponding operand's (the * same, entry's) number. Entry's number is used in check array in * consistent method. We use the same map for each entry. 
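/*
 * A minimal sketch of the soft-error convention the new *_safe() functions
 * above follow, assuming the standard ErrorSaveContext machinery from
 * nodes/miscnodes.h (plus postgres.h and utils/timestamp.h).  The callee
 * reports a range error with ereturn(), which throws if escontext is NULL
 * and otherwise records the error and returns the given dummy value; the
 * caller then checks error_occurred, as timestamp_cmp_timestamptz_internal()
 * does.  Both function names here are hypothetical.
 */
static Timestamp
clamp_example(TimestampTz value, Node *escontext)
{
	if (!IS_VALID_TIMESTAMP(value))
		ereturn(escontext, DT_NOEND,
				(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
				 errmsg("timestamp out of range")));
	return (Timestamp) value;
}

static bool
fits_in_timestamp(TimestampTz input)
{
	ErrorSaveContext escontext = {T_ErrorSaveContext};

	(void) clamp_example(input, (Node *) &escontext);
	/* soft error recorded instead of aborting the transaction */
	return !escontext.error_occurred;
}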
*/ - *extra_data = (Pointer *) palloc(sizeof(Pointer) * j); - map_item_operand = (int *) palloc0(sizeof(int) * query->size); + *extra_data = palloc_array(Pointer, j); + map_item_operand = palloc0_array(int, query->size); /* Now rescan the VAL items and fill in the arrays */ j = 0; diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c index 935187b37c749..01e43f862146b 100644 --- a/src/backend/utils/adt/tsgistidx.c +++ b/src/backend/utils/adt/tsgistidx.c @@ -212,7 +212,7 @@ gtsvector_compress(PG_FUNCTION_ARGS) res = ressign; } - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(res), entry->rel, entry->page, entry->offset, false); @@ -231,7 +231,7 @@ gtsvector_compress(PG_FUNCTION_ARGS) } res = gtsvector_alloc(SIGNKEY | ALLISTRUE, siglen, sign); - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(res), entry->rel, entry->page, entry->offset, false); @@ -251,7 +251,7 @@ gtsvector_decompress(PG_FUNCTION_ARGS) if (key != (SignTSVector *) DatumGetPointer(entry->key)) { - GISTENTRY *retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + GISTENTRY *retval = palloc_object(GISTENTRY); gistentryinit(*retval, PointerGetDatum(key), entry->rel, entry->page, @@ -641,7 +641,7 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) v->spl_left = (OffsetNumber *) palloc(nbytes); v->spl_right = (OffsetNumber *) palloc(nbytes); - cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 2)); + cache = palloc_array(CACHESIGN, maxoff + 2); cache_sign = palloc(siglen * (maxoff + 2)); for (j = 0; j < maxoff + 2; j++) @@ -688,7 +688,7 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) maxoff = OffsetNumberNext(maxoff); fillcache(&cache[maxoff], GETENTRY(entryvec, maxoff), siglen); /* sort before ... */ - costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + costvector = palloc_array(SPLITCOST, maxoff); for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) { costvector[j - 1].pos = j; diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 717de8073d58d..a0c990fdfa03d 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -534,7 +534,7 @@ pushOperator(TSQueryParserState state, int8 oper, int16 distance) Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE); - tmp = (QueryOperator *) palloc0(sizeof(QueryOperator)); + tmp = palloc0_object(QueryOperator); tmp->type = QI_OPR; tmp->oper = oper; tmp->distance = (oper == OP_PHRASE) ? 
distance : 0; @@ -559,7 +559,7 @@ pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int errmsg("operand is too long in tsquery: \"%s\"", state->buffer))); - tmp = (QueryOperand *) palloc0(sizeof(QueryOperand)); + tmp = palloc0_object(QueryOperand); tmp->type = QI_VAL; tmp->weight = weight; tmp->prefix = prefix; @@ -617,7 +617,7 @@ pushStop(TSQueryParserState state) { QueryOperand *tmp; - tmp = (QueryOperand *) palloc0(sizeof(QueryOperand)); + tmp = palloc0_object(QueryOperand); tmp->type = QI_VALSTOP; state->polstr = lcons(tmp, state->polstr); @@ -1101,7 +1101,7 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp) nrm.curpol = in->curpol; nrm.op = in->op; nrm.buflen = 16; - nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + nrm.cur = nrm.buf = palloc_array(char, nrm.buflen); /* get right operand */ infix(&nrm, priority, (op == OP_PHRASE)); @@ -1157,7 +1157,7 @@ tsqueryout(PG_FUNCTION_ARGS) } nrm.curpol = GETQUERY(query); nrm.buflen = 32; - nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + nrm.cur = nrm.buf = palloc_array(char, nrm.buflen); *(nrm.cur) = '\0'; nrm.op = GETOPERAND(query); infix(&nrm, -1 /* lowest priority */ , false); @@ -1385,7 +1385,7 @@ tsquerytree(PG_FUNCTION_ARGS) { nrm.curpol = q; nrm.buflen = 32; - nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + nrm.cur = nrm.buf = palloc_array(char, nrm.buflen); *(nrm.cur) = '\0'; nrm.op = GETOPERAND(query); infix(&nrm, -1, false); diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c index 590d7c7989c7e..45de2da900cf4 100644 --- a/src/backend/utils/adt/tsquery_cleanup.c +++ b/src/backend/utils/adt/tsquery_cleanup.c @@ -32,7 +32,7 @@ typedef struct NODE static NODE * maketree(QueryItem *in) { - NODE *node = (NODE *) palloc(sizeof(NODE)); + NODE *node = palloc_object(NODE); /* since this function recurses, it could be driven to stack overflow. 
*/ check_stack_depth(); diff --git a/src/backend/utils/adt/tsquery_gist.c b/src/backend/utils/adt/tsquery_gist.c index f7f94c1c760f5..55fc93ebef516 100644 --- a/src/backend/utils/adt/tsquery_gist.c +++ b/src/backend/utils/adt/tsquery_gist.c @@ -33,7 +33,7 @@ gtsquery_compress(PG_FUNCTION_ARGS) { TSQuerySign sign; - retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + retval = palloc_object(GISTENTRY); sign = makeTSQuerySign(DatumGetTSQuery(entry->key)); gistentryinit(*retval, TSQuerySignGetDatum(sign), @@ -213,7 +213,7 @@ gtsquery_picksplit(PG_FUNCTION_ARGS) datum_r = GETENTRY(entryvec, seed_2); maxoff = OffsetNumberNext(maxoff); - costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + costvector = palloc_array(SPLITCOST, maxoff); for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) { costvector[j - 1].pos = j; diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c index bb77e923062cf..84bf070dff2d0 100644 --- a/src/backend/utils/adt/tsquery_op.c +++ b/src/backend/utils/adt/tsquery_op.c @@ -32,17 +32,17 @@ tsquery_numnode(PG_FUNCTION_ARGS) static QTNode * join_tsqueries(TSQuery a, TSQuery b, int8 operator, uint16 distance) { - QTNode *res = (QTNode *) palloc0(sizeof(QTNode)); + QTNode *res = palloc0_object(QTNode); res->flags |= QTN_NEEDFREE; - res->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); + res->valnode = palloc0_object(QueryItem); res->valnode->type = QI_OPR; res->valnode->qoperator.oper = operator; if (operator == OP_PHRASE) res->valnode->qoperator.distance = distance; - res->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); + res->child = palloc0_array(QTNode *, 2); res->child[0] = QT2QTN(GETQUERY(b), GETOPERAND(b)); res->child[1] = QT2QTN(GETQUERY(a), GETOPERAND(a)); res->nchild = 2; @@ -165,15 +165,15 @@ tsquery_not(PG_FUNCTION_ARGS) if (a->size == 0) PG_RETURN_POINTER(a); - res = (QTNode *) palloc0(sizeof(QTNode)); + res = palloc0_object(QTNode); res->flags |= QTN_NEEDFREE; - res->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); + res->valnode = palloc0_object(QueryItem); res->valnode->type = QI_OPR; res->valnode->qoperator.oper = OP_NOT; - res->child = (QTNode **) palloc0(sizeof(QTNode *)); + res->child = palloc0_object(QTNode *); res->child[0] = QT2QTN(GETQUERY(a), GETOPERAND(a)); res->nchild = 1; @@ -272,7 +272,7 @@ collectTSQueryValues(TSQuery a, int *nvalues_p) int nvalues = 0; int i; - values = (char **) palloc(sizeof(char *) * a->size); + values = palloc_array(char *, a->size); for (i = 0; i < a->size; i++) { diff --git a/src/backend/utils/adt/tsquery_util.c b/src/backend/utils/adt/tsquery_util.c index 1c24b041aa29c..2ccfc9d3303f4 100644 --- a/src/backend/utils/adt/tsquery_util.c +++ b/src/backend/utils/adt/tsquery_util.c @@ -24,7 +24,7 @@ QTNode * QT2QTN(QueryItem *in, char *operand) { - QTNode *node = (QTNode *) palloc0(sizeof(QTNode)); + QTNode *node = palloc0_object(QTNode); /* since this function recurses, it could be driven to stack overflow. 
*/ check_stack_depth(); @@ -33,7 +33,7 @@ QT2QTN(QueryItem *in, char *operand) if (in->type == QI_OPR) { - node->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); + node->child = palloc0_array(QTNode *, 2); node->child[0] = QT2QTN(in + 1, operand); node->sign = node->child[0]->sign; if (in->qoperator.oper == OP_NOT) @@ -226,7 +226,7 @@ QTNTernary(QTNode *in) int oldnchild = in->nchild; in->nchild += cc->nchild - 1; - in->child = (QTNode **) repalloc(in->child, in->nchild * sizeof(QTNode *)); + in->child = repalloc_array(in->child, QTNode *, in->nchild); if (i + 1 != oldnchild) memmove(in->child + i + cc->nchild, in->child + i + 1, @@ -262,10 +262,10 @@ QTNBinary(QTNode *in) while (in->nchild > 2) { - QTNode *nn = (QTNode *) palloc0(sizeof(QTNode)); + QTNode *nn = palloc0_object(QTNode); - nn->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); - nn->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); + nn->valnode = palloc0_object(QueryItem); + nn->child = palloc0_array(QTNode *, 2); nn->nchild = 2; nn->flags = QTN_NEEDFREE; @@ -400,10 +400,10 @@ QTNCopy(QTNode *in) /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); - out = (QTNode *) palloc(sizeof(QTNode)); + out = palloc_object(QTNode); *out = *in; - out->valnode = (QueryItem *) palloc(sizeof(QueryItem)); + out->valnode = palloc_object(QueryItem); *(out->valnode) = *(in->valnode); out->flags |= QTN_NEEDFREE; @@ -418,7 +418,7 @@ QTNCopy(QTNode *in) { int i; - out->child = (QTNode **) palloc(sizeof(QTNode *) * in->nchild); + out->child = palloc_array(QTNode *, in->nchild); for (i = 0; i < in->nchild; i++) out->child[i] = QTNCopy(in->child[i]); diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c index e863aa586535d..4a3418486472e 100644 --- a/src/backend/utils/adt/tsrank.c +++ b/src/backend/utils/adt/tsrank.c @@ -160,7 +160,7 @@ SortAndUniqItems(TSQuery q, int *size) **ptr, **prevptr; - ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size); + ptr = res = palloc_array(QueryOperand *, *size); /* Collect all operands from the tree to res */ while ((*size)--) @@ -225,7 +225,7 @@ calc_rank_and(const float *w, TSVector t, TSQuery q) pfree(item); return calc_rank_or(w, t, q); } - pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size); + pos = palloc0_array(WordEntryPosVector *, q->size); /* A dummy WordEntryPos array to use when haspos is false */ posnull.npos = 1; @@ -743,7 +743,7 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) cur = 0; DocRepresentation *doc; - doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); + doc = palloc_array(DocRepresentation, len); /* * Iterate through query to make DocRepresentation for words and it's @@ -815,7 +815,7 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) * Join QueryItem per WordEntry and its position */ storage.pos = doc->pos; - storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); + storage.data.query.items = palloc_array(QueryItem *, qr->query->size); storage.data.query.items[0] = doc->data.map.item; storage.data.query.nitem = 1; @@ -832,7 +832,7 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) *wptr = storage; wptr++; storage.pos = rptr->pos; - storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); + storage.data.query.items = palloc_array(QueryItem *, qr->query->size); storage.data.query.items[0] = rptr->data.map.item; storage.data.query.nitem = 1; } @@ -878,8 +878,7 @@ calc_rank_cd(const 
float4 *arrdata, TSVector txt, TSQuery query, int method) } qr.query = query; - qr.operandData = (QueryRepresentationOperand *) - palloc0(sizeof(QueryRepresentationOperand) * query->size); + qr.operandData = palloc0_array(QueryRepresentationOperand, query->size); doc = get_docrep(txt, &qr, &doclen); if (!doc) diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 1fa2e3729bfab..d00c6032087c4 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -202,17 +202,17 @@ tsvectorin(PG_FUNCTION_ARGS) state = init_tsvector_parser(buf, 0, escontext); arrlen = 64; - arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen); - cur = tmpbuf = (char *) palloc(buflen); + arr = palloc_array(WordEntryIN, arrlen); + cur = tmpbuf = palloc_array(char, buflen); while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL)) { if (toklen >= MAXSTRLEN) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("word is too long (%ld bytes, max %ld bytes)", - (long) toklen, - (long) (MAXSTRLEN - 1)))); + errmsg("word is too long (%d bytes, max %d bytes)", + toklen, + MAXSTRLEN - 1))); if (cur - tmpbuf > MAXSTRPOS) ereturn(escontext, (Datum) 0, diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 1fa1275ca63b2..b809089ac5d3e 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -75,7 +75,7 @@ static bool TS_execute_locations_recurse(QueryItem *curitem, void *arg, TSExecuteCallback chkcond, List **locations); -static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len); +static int tsvector_bsearch(const TSVectorData *tsv, char *lexeme, int lexeme_len); static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); @@ -83,7 +83,7 @@ static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); * Order: haspos, len, word, for all positions (pos, weight) */ static int -silly_cmp_tsvector(const TSVector a, const TSVector b) +silly_cmp_tsvector(const TSVectorData *a, const TSVectorData *b) { if (VARSIZE(a) < VARSIZE(b)) return -1; @@ -95,8 +95,8 @@ silly_cmp_tsvector(const TSVector a, const TSVector b) return 1; else { - WordEntry *aptr = ARRPTR(a); - WordEntry *bptr = ARRPTR(b); + const WordEntry *aptr = ARRPTR(a); + const WordEntry *bptr = ARRPTR(b); int i = 0; int res; @@ -329,8 +329,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) if (nulls[i]) continue; - lex = VARDATA(dlexemes[i]); - lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + lex = VARDATA(DatumGetPointer(dlexemes[i])); + lex_len = VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ; lex_pos = tsvector_bsearch(tsout, lex, lex_len); if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0) @@ -397,9 +397,9 @@ add_pos(TSVector src, WordEntry *srcptr, * found. 
*/ static int -tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len) +tsvector_bsearch(const TSVectorData *tsv, char *lexeme, int lexeme_len) { - WordEntry *arrin = ARRPTR(tsv); + const WordEntry *arrin = ARRPTR(tsv); int StopLow = 0, StopHigh = tsv->size, StopMiddle, @@ -443,10 +443,10 @@ compare_text_lexemes(const void *va, const void *vb) { Datum a = *((const Datum *) va); Datum b = *((const Datum *) vb); - char *alex = VARDATA_ANY(a); - int alex_len = VARSIZE_ANY_EXHDR(a); - char *blex = VARDATA_ANY(b); - int blex_len = VARSIZE_ANY_EXHDR(b); + char *alex = VARDATA_ANY(DatumGetPointer(a)); + int alex_len = VARSIZE_ANY_EXHDR(DatumGetPointer(a)); + char *blex = VARDATA_ANY(DatumGetPointer(b)); + int blex_len = VARSIZE_ANY_EXHDR(DatumGetPointer(b)); return tsCompareString(alex, alex_len, blex, blex_len, false); } @@ -605,8 +605,8 @@ tsvector_delete_arr(PG_FUNCTION_ARGS) if (nulls[i]) continue; - lex = VARDATA(dlexemes[i]); - lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + lex = VARDATA(DatumGetPointer(dlexemes[i])); + lex_len = VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ; lex_pos = tsvector_bsearch(tsin, lex, lex_len); if (lex_pos >= 0) @@ -770,7 +770,7 @@ array_to_tsvector(PG_FUNCTION_ARGS) (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); - if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0) + if (VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ == 0) ereport(ERROR, (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING), errmsg("lexeme array may not contain empty strings"))); @@ -786,7 +786,7 @@ array_to_tsvector(PG_FUNCTION_ARGS) /* Calculate space needed for surviving lexemes. */ for (i = 0; i < nitems; i++) - datalen += VARSIZE(dlexemes[i]) - VARHDRSZ; + datalen += VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ; tslen = CALCDATASIZE(nitems, datalen); /* Allocate and fill tsvector. */ @@ -798,8 +798,8 @@ array_to_tsvector(PG_FUNCTION_ARGS) cur = STRPTR(tsout); for (i = 0; i < nitems; i++) { - char *lex = VARDATA(dlexemes[i]); - int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + char *lex = VARDATA(DatumGetPointer(dlexemes[i])); + int lex_len = VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ; memcpy(cur, lex, lex_len); arrout[i].haspos = 0; @@ -1212,7 +1212,7 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, /* * Filter position information by weights */ - dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos); + dptr = data->pos = palloc_array(WordEntryPos, posvec->npos); data->allocated = true; /* Is there a position with a matching weight? 
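/*
 * Why the signatures above change "const TSVector" to "const TSVectorData *"
 * (and why tid.c's currtid_for_view() now takes "const ItemPointerData *"):
 * TSVector and ItemPointer are pointer typedefs, so const applied to the
 * typedef qualifies the pointer itself, not the pointed-to data.  A
 * self-contained illustration with a hypothetical typedef:
 */
typedef struct Foo { int x; } Foo;
typedef Foo *FooPtr;

static void
takes_const_ptr_typedef(const FooPtr p)	/* p is "Foo * const" */
{
	p->x = 1;			/* compiles: the pointee is NOT const */
}

static void
takes_ptr_to_const(const Foo *p)	/* pointee is const */
{
	/* p->x = 1;  -- would be a compile error: read-only object */
	(void) p;
}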
*/ @@ -1391,12 +1391,12 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) if (totalpos == 0) { totalpos = 256; - allpos = palloc(sizeof(WordEntryPos) * totalpos); + allpos = palloc_array(WordEntryPos, totalpos); } else { totalpos *= 2; - allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos); + allpos = repalloc_array(allpos, WordEntryPos, totalpos); } } @@ -2456,7 +2456,7 @@ ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx, oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1)); + stat->stack = palloc0_array(StatEntry *, stat->maxdepth + 1); stat->stackpos = 0; node = stat->root; @@ -2839,7 +2839,7 @@ tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column) prs.lenwords = 32; prs.curwords = 0; prs.pos = 0; - prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); + prs.words = palloc_array(ParsedWord, prs.lenwords); /* find all words in indexable column(s) */ for (i = 2; i < trigger->tgnargs; i++) diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c index e1620d3ed1f2d..a1c374a04a40d 100644 --- a/src/backend/utils/adt/tsvector_parser.c +++ b/src/backend/utils/adt/tsvector_parser.c @@ -58,7 +58,7 @@ init_tsvector_parser(char *input, int flags, Node *escontext) { TSVectorParseState state; - state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData)); + state = palloc_object(struct TSVectorParseStateData); state->prsbuf = input; state->bufstart = input; state->len = 32; @@ -322,13 +322,13 @@ gettoken_tsvector(TSVectorParseState state, if (posalen == 0) { posalen = 4; - pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen); + pos = palloc_array(WordEntryPos, posalen); npos = 0; } else if (npos + 1 >= posalen) { posalen *= 2; - pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen); + pos = repalloc_array(pos, WordEntryPos, posalen); } npos++; WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf))); diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index bce7309c1833a..2bc915edc2e5c 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -71,7 +71,7 @@ static int uuid_fast_cmp(Datum x, Datum y, SortSupport ssup); static bool uuid_abbrev_abort(int memtupcount, SortSupport ssup); static Datum uuid_abbrev_convert(Datum original, SortSupport ssup); static inline void uuid_set_version(pg_uuid_t *uuid, unsigned char version); -static inline int64 get_real_time_ns_ascending(); +static inline int64 get_real_time_ns_ascending(void); static pg_uuid_t *generate_uuidv7(uint64 unix_ts_ms, uint32 sub_ms); Datum @@ -80,7 +80,7 @@ uuid_in(PG_FUNCTION_ARGS) char *uuid_str = PG_GETARG_CSTRING(0); pg_uuid_t *uuid; - uuid = (pg_uuid_t *) palloc(sizeof(*uuid)); + uuid = palloc_object(pg_uuid_t); string_to_uuid(uuid_str, uuid, fcinfo->context); PG_RETURN_UUID_P(uuid); } @@ -288,7 +288,7 @@ uuid_sortsupport(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - uss = palloc(sizeof(uuid_sortsupport_state)); + uss = palloc_object(uuid_sortsupport_state); uss->input_count = 0; uss->estimating = true; initHyperLogLog(&uss->abbr_card, 10); @@ -398,11 +398,7 @@ uuid_abbrev_convert(Datum original, SortSupport ssup) { uint32 tmp; -#if SIZEOF_DATUM == 8 - tmp = (uint32) res ^ (uint32) ((uint64) res >> 32); -#else /* SIZEOF_DATUM != 8 */ - tmp = (uint32) res; -#endif + tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); 
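/*
 * Two notes on the uuid.c hunks above.  First, with Datum now always
 * 64 bits wide, the abbreviated-key fold for HyperLogLog XORs the high and
 * low halves unconditionally; a standalone equivalent of that fold
 * (hypothetical helper name):
 */
static inline uint32
fold_to_uint32(uint64 abbrev_key)
{
	return (uint32) abbrev_key ^ (uint32) (abbrev_key >> 32);
}

/*
 * Second, uuid_extract_timestamp() holds milliseconds in "tms", so the
 * ms-to-us conversion wants US_PER_MS.  The old NS_PER_US constant has the
 * same numeric value (1000), so this is a readability fix rather than a
 * behavior change.
 */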
addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); } @@ -549,7 +545,7 @@ gen_random_uuid(PG_FUNCTION_ARGS) * than the previous returned timestamp (on this backend). */ static inline int64 -get_real_time_ns_ascending() +get_real_time_ns_ascending(void) { static int64 previous_ns = 0; int64 ns; @@ -752,7 +748,7 @@ uuid_extract_timestamp(PG_FUNCTION_ARGS) + (((uint64) uuid->data[0]) << 40); /* convert ms to us, then adjust */ - ts = (TimestampTz) (tms * NS_PER_US) - + ts = (TimestampTz) (tms * US_PER_MS) - (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY * USECS_PER_SEC; PG_RETURN_TIMESTAMPTZ(ts); diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index 3f40c9da1a0d5..39fc27e1f11f4 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -158,8 +158,8 @@ bpchar_input(const char *s, size_t len, int32 atttypmod, Node *escontext) if (s[j] != ' ') ereturn(escontext, NULL, (errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION), - errmsg("value too long for type character(%d)", - (int) maxlen))); + errmsg("value too long for type character(%zu)", + maxlen))); } /* @@ -472,8 +472,8 @@ varchar_input(const char *s, size_t len, int32 atttypmod, Node *escontext) if (s[j] != ' ') ereturn(escontext, NULL, (errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION), - errmsg("value too long for type character varying(%d)", - (int) maxlen))); + errmsg("value too long for type character varying(%zu)", + maxlen))); } len = mbmaxlen; diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 3e4d5568bde89..8adeb8dadc66e 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -35,7 +35,6 @@ #include "port/pg_bswap.h" #include "regex/regex.h" #include "utils/builtins.h" -#include "utils/bytea.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -43,10 +42,6 @@ #include "utils/sortsupport.h" #include "utils/varlena.h" - -/* GUC variable */ -int bytea_output = BYTEA_OUTPUT_HEX; - typedef struct varlena VarString; /* @@ -97,7 +92,7 @@ typedef struct int last_returned; /* Last comparison result (cache) */ bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? 
*/ bool collate_c; - Oid typid; /* Actual datatype (text/bpchar/bytea/name) */ + Oid typid; /* Actual datatype (text/bpchar/name) */ hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ hyperLogLogState full_card; /* Full key cardinality state */ double prop_card; /* Required cardinality proportion */ @@ -148,12 +143,6 @@ static int text_position_get_match_pos(TextPositionState *state); static void text_position_cleanup(TextPositionState *state); static void check_collation_set(Oid collid); static int text_cmp(text *arg1, text *arg2, Oid collid); -static bytea *bytea_catenate(bytea *t1, bytea *t2); -static bytea *bytea_substring(Datum str, - int S, - int L, - bool length_not_specified); -static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); static void appendStringInfoText(StringInfo str, const text *t); static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate); static void split_text_accum_result(SplitTextOutputData *tstate, @@ -279,307 +268,6 @@ text_to_cstring_buffer(const text *src, char *dst, size_t dst_len) * USER I/O ROUTINES * *****************************************************************************/ - -#define VAL(CH) ((CH) - '0') -#define DIG(VAL) ((VAL) + '0') - -/* - * byteain - converts from printable representation of byte array - * - * Non-printable characters must be passed as '\nnn' (octal) and are - * converted to internal form. '\' must be passed as '\\'. - * ereport(ERROR, ...) if bad form. - * - * BUGS: - * The input is scanned twice. - * The error checking of input is minimal. - */ -Datum -byteain(PG_FUNCTION_ARGS) -{ - char *inputText = PG_GETARG_CSTRING(0); - Node *escontext = fcinfo->context; - char *tp; - char *rp; - int bc; - bytea *result; - - /* Recognize hex input */ - if (inputText[0] == '\\' && inputText[1] == 'x') - { - size_t len = strlen(inputText); - - bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ - result = palloc(bc); - bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), - escontext); - SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */ - - PG_RETURN_BYTEA_P(result); - } - - /* Else, it's the traditional escaped style */ - for (bc = 0, tp = inputText; *tp != '\0'; bc++) - { - if (tp[0] != '\\') - tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - tp += 4; - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - tp += 2; - else - { - /* - * one backslash, not followed by another or ### valid octal - */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); - } - } - - bc += VARHDRSZ; - - result = (bytea *) palloc(bc); - SET_VARSIZE(result, bc); - - tp = inputText; - rp = VARDATA(result); - while (*tp != '\0') - { - if (tp[0] != '\\') - *rp++ = *tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - { - bc = VAL(tp[1]); - bc <<= 3; - bc += VAL(tp[2]); - bc <<= 3; - *rp++ = bc + VAL(tp[3]); - - tp += 4; - } - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - { - *rp++ = '\\'; - tp += 2; - } - else - { - /* - * We should never get here. The first pass should not allow it. 
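An aside on the byteain() second pass being deleted above: each \nnn escape decodes with plain base-8 arithmetic. A worked example for the input "\101", using the VAL() digit macro defined in the removed code:

int bc = VAL('1');          /* 1                    */
bc = (bc << 3) + VAL('0');  /* 1*8 + 0  =  8        */
bc = (bc << 3) + VAL('1');  /* 8*8 + 1  = 65 = 'A'  */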
- */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); - } - } - - PG_RETURN_BYTEA_P(result); -} - -/* - * byteaout - converts to printable representation of byte array - * - * In the traditional escaped format, non-printable characters are - * printed as '\nnn' (octal) and '\' as '\\'. - */ -Datum -byteaout(PG_FUNCTION_ARGS) -{ - bytea *vlena = PG_GETARG_BYTEA_PP(0); - char *result; - char *rp; - - if (bytea_output == BYTEA_OUTPUT_HEX) - { - /* Print hex format */ - rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); - *rp++ = '\\'; - *rp++ = 'x'; - rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); - } - else if (bytea_output == BYTEA_OUTPUT_ESCAPE) - { - /* Print traditional escaped format */ - char *vp; - uint64 len; - int i; - - len = 1; /* empty string has 1 char */ - vp = VARDATA_ANY(vlena); - for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) - { - if (*vp == '\\') - len += 2; - else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) - len += 4; - else - len++; - } - - /* - * In principle len can't overflow uint32 if the input fit in 1GB, but - * for safety let's check rather than relying on palloc's internal - * check. - */ - if (len > MaxAllocSize) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_internal("result of bytea output conversion is too large"))); - rp = result = (char *) palloc(len); - - vp = VARDATA_ANY(vlena); - for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) - { - if (*vp == '\\') - { - *rp++ = '\\'; - *rp++ = '\\'; - } - else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) - { - int val; /* holds unprintable chars */ - - val = *vp; - rp[0] = '\\'; - rp[3] = DIG(val & 07); - val >>= 3; - rp[2] = DIG(val & 07); - val >>= 3; - rp[1] = DIG(val & 03); - rp += 4; - } - else - *rp++ = *vp; - } - } - else - { - elog(ERROR, "unrecognized \"bytea_output\" setting: %d", - bytea_output); - rp = result = NULL; /* keep compiler quiet */ - } - *rp = '\0'; - PG_RETURN_CSTRING(result); -} - -/* - * bytearecv - converts external binary format to bytea - */ -Datum -bytearecv(PG_FUNCTION_ARGS) -{ - StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - bytea *result; - int nbytes; - - nbytes = buf->len - buf->cursor; - result = (bytea *) palloc(nbytes + VARHDRSZ); - SET_VARSIZE(result, nbytes + VARHDRSZ); - pq_copymsgbytes(buf, VARDATA(result), nbytes); - PG_RETURN_BYTEA_P(result); -} - -/* - * byteasend - converts bytea to binary format - * - * This is a special case: just copy the input... - */ -Datum -byteasend(PG_FUNCTION_ARGS) -{ - bytea *vlena = PG_GETARG_BYTEA_P_COPY(0); - - PG_RETURN_BYTEA_P(vlena); -} - -Datum -bytea_string_agg_transfn(PG_FUNCTION_ARGS) -{ - StringInfo state; - - state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); - - /* Append the value unless null, preceding it with the delimiter. */ - if (!PG_ARGISNULL(1)) - { - bytea *value = PG_GETARG_BYTEA_PP(1); - bool isfirst = false; - - /* - * You might think we can just throw away the first delimiter, however - * we must keep it as we may be a parallel worker doing partial - * aggregation building a state to send to the main process. We need - * to keep the delimiter of every aggregation so that the combine - * function can properly join up the strings of two separately - * partially aggregated results. The first delimiter is only stripped - * off in the final function. 
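On the byteaout() sizing loop removed above: every input byte contributes a fixed width to the escape-format output, which is why an exact allocation size can be computed in one pre-pass. Restating that per-byte cost as a standalone helper (the function name is mine, not the patch's):

static inline int
escaped_width(unsigned char c)
{
	if (c == '\\')
		return 2;				/* emitted as "\\"           */
	if (c < 0x20 || c > 0x7e)
		return 4;				/* emitted as "\nnn" (octal) */
	return 1;					/* printable, emitted as-is  */
}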
To know how much to strip off the front - * of the string, we store the length of the first delimiter in the - * StringInfo's cursor field, which we don't otherwise need here. - */ - if (state == NULL) - { - state = makeStringAggState(fcinfo); - isfirst = true; - } - - if (!PG_ARGISNULL(2)) - { - bytea *delim = PG_GETARG_BYTEA_PP(2); - - appendBinaryStringInfo(state, VARDATA_ANY(delim), - VARSIZE_ANY_EXHDR(delim)); - if (isfirst) - state->cursor = VARSIZE_ANY_EXHDR(delim); - } - - appendBinaryStringInfo(state, VARDATA_ANY(value), - VARSIZE_ANY_EXHDR(value)); - } - - /* - * The transition type for string_agg() is declared to be "internal", - * which is a pass-by-value type the same size as a pointer. - */ - if (state) - PG_RETURN_POINTER(state); - PG_RETURN_NULL(); -} - -Datum -bytea_string_agg_finalfn(PG_FUNCTION_ARGS) -{ - StringInfo state; - - /* cannot be called directly because of internal-type argument */ - Assert(AggCheckCallContext(fcinfo, NULL)); - - state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); - - if (state != NULL) - { - /* As per comment in transfn, strip data before the cursor position */ - bytea *result; - int strippedlen = state->len - state->cursor; - - result = (bytea *) palloc(strippedlen + VARHDRSZ); - SET_VARSIZE(result, strippedlen + VARHDRSZ); - memcpy(VARDATA(result), &state->data[state->cursor], strippedlen); - PG_RETURN_BYTEA_P(result); - } - else - PG_RETURN_NULL(); -} - /* * textin - converts cstring to internal representation */ @@ -720,13 +408,12 @@ text_length(Datum str) { /* fastpath when max encoding length is one */ if (pg_database_encoding_max_length() == 1) - PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); + return (toast_raw_datum_size(str) - VARHDRSZ); else { text *t = DatumGetTextPP(str); - PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t), - VARSIZE_ANY_EXHDR(t))); + return (pg_mbstrlen_with_len(VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t))); } } @@ -1424,6 +1111,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) const char *hptr; Assert(start_ptr >= haystack && start_ptr <= haystack_end); + Assert(needle_len > 0); state->last_match_len_tmp = needle_len; @@ -1436,19 +1124,26 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) * needle under the given collation. * * Note, the found substring could have a different length than the - * needle, including being empty. Callers that want to skip over the - * found string need to read the length of the found substring from - * last_match_len rather than just using the length of their needle. + * needle. Callers that want to skip over the found string need to + * read the length of the found substring from last_match_len rather + * than just using the length of their needle. * * Most callers will require "greedy" semantics, meaning that we need * to find the longest such substring, not the shortest. For callers * that don't need greedy semantics, we can finish on the first match. + * + * This loop depends on the assumption that the needle is nonempty and + * any matching substring must also be nonempty. (Even if the + * collation would accept an empty match, returning one would send + * callers that search for successive matches into an infinite loop.) */ const char *result_hptr = NULL; hptr = start_ptr; while (hptr < haystack_end) { + const char *test_end; + /* * First check the common case that there is a match in the * haystack of exactly the length of the needle. 
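A note on the text_length() hunk above, which replaces PG_RETURN_INT32 with a bare return. PG_RETURN_INT32 is an fmgr convenience that, paraphrasing fmgr.h, expands to

	return Int32GetDatum(x);

so it belongs only in functions whose return type is Datum. text_length() is an ordinary static helper returning int32; the old spelling compiled only because Int32GetDatum() is a trivial conversion, and the bare return is the type-correct form.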
@@ -1459,11 +1154,13 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) return (char *) hptr; /* - * Else check if any of the possible substrings starting at hptr - * are equal to the needle. + * Else check if any of the non-empty substrings starting at hptr + * compare equal to the needle. */ - for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end)) + test_end = hptr; + do { + test_end += pg_mblen(test_end); if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0) { state->last_match_len_tmp = (test_end - hptr); @@ -1471,7 +1168,8 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) if (!state->greedy) break; } - } + } while (test_end < haystack_end); + if (result_hptr) break; @@ -1919,10 +1617,8 @@ bttextsortsupport(PG_FUNCTION_ARGS) * Includes locale support, and support for BpChar semantics (i.e. removing * trailing spaces before comparison). * - * Relies on the assumption that text, VarChar, BpChar, and bytea all have the - * same representation. Callers that always use the C collation (e.g. - * non-collatable type callers like bytea) may have NUL bytes in their strings; - * this will not work with any other collation, though. + * Relies on the assumption that text, VarChar, and BpChar all have the + * same representation. */ void varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) @@ -1984,14 +1680,13 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) * * Even apart from the risk of broken locales, it's possible that * there are platforms where the use of abbreviated keys should be - * disabled at compile time. Having only 4 byte datums could make - * worst-case performance drastically more likely, for example. - * Moreover, macOS's strxfrm() implementation is known to not - * effectively concentrate a significant amount of entropy from the - * original string in earlier transformed blobs. It's possible that - * other supported platforms are similarly encumbered. So, if we ever - * get past disabling this categorically, we may still want or need to - * disable it for particular platforms. + * disabled at compile time. For example, macOS's strxfrm() + * implementation is known to not effectively concentrate a + * significant amount of entropy from the original string in earlier + * transformed blobs. It's possible that other supported platforms + * are similarly encumbered. So, if we ever get past disabling this + * categorically, we may still want or need to disable it for + * particular platforms. */ if (!pg_strxfrm_enabled(locale)) abbreviate = false; @@ -2006,7 +1701,7 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) */ if (abbreviate || !collate_c) { - sss = palloc(sizeof(VarStringSortSupport)); + sss = palloc_object(VarStringSortSupport); sss->buf1 = palloc(TEXTBUFLEN); sss->buflen1 = TEXTBUFLEN; sss->buf2 = palloc(TEXTBUFLEN); @@ -2286,7 +1981,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) * representation. Our encoding strategy is simple -- pack the first 8 bytes * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are * stored in reverse order), and treat it as an unsigned integer. When the "C" - * locale is used, or in case of bytea, just memcpy() from original instead. + * locale is used just memcpy() from original instead. 
*/ static Datum varstr_abbrev_convert(Datum original, SortSupport ssup) @@ -2313,30 +2008,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) /* * If we're using the C collation, use memcpy(), rather than strxfrm(), to - * abbreviate keys. The full comparator for the C locale is always - * memcmp(). It would be incorrect to allow bytea callers (callers that - * always force the C collation -- bytea isn't a collatable type, but this - * approach is convenient) to use strxfrm(). This is because bytea - * strings may contain NUL bytes. Besides, this should be faster, too. - * - * More generally, it's okay that bytea callers can have NUL bytes in - * strings because abbreviated cmp need not make a distinction between - * terminating NUL bytes, and NUL bytes representing actual NULs in the - * authoritative representation. Hopefully a comparison at or past one - * abbreviated key's terminating NUL byte will resolve the comparison - * without consulting the authoritative representation; specifically, some - * later non-NUL byte in the longer string can resolve the comparison - * against a subsequent terminating NUL in the shorter string. There will - * usually be what is effectively a "length-wise" resolution there and - * then. - * - * If that doesn't work out -- if all bytes in the longer string - * positioned at or past the offset of the smaller string's (first) - * terminating NUL are actually representative of NUL bytes in the - * authoritative binary string (perhaps with some *terminating* NUL bytes - * towards the end of the longer string iff it happens to still be small) - * -- then an authoritative tie-breaker will happen, and do the right - * thing: explicitly consider string length. + * abbreviate keys. The full comparator for the C locale is also + * memcmp(). This should be faster than strxfrm(). */ if (sss->collate_c) memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); @@ -2418,9 +2091,6 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * strxfrm() blob is itself NUL terminated, leaving no danger of * misinterpreting any NUL bytes not intended to be interpreted as * logically representing termination. - * - * (Actually, even if there were NUL bytes in the blob it would be - * okay. See remarks on bytea case above.) */ memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize)); } @@ -2445,18 +2115,12 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) addHyperLogLog(&sss->full_card, hash); /* Hash abbreviated key */ -#if SIZEOF_DATUM == 8 { - uint32 lohalf, - hihalf; + uint32 tmp; - lohalf = (uint32) res; - hihalf = (uint32) (res >> 32); - hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf)); + tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); + hash = DatumGetUInt32(hash_uint32(tmp)); } -#else /* SIZEOF_DATUM != 8 */ - hash = DatumGetUInt32(hash_uint32((uint32) res)); -#endif addHyperLogLog(&sss->abbr_card, hash); @@ -2507,10 +2171,10 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup) * NULLs are generally disregarded, if only NULL values were seen so far, * that might misrepresent costs if we failed to clamp. 
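The "Hash abbreviated key" hunk above drops the SIZEOF_DATUM == 8 conditional (mirroring the uuid.c change earlier in the patch): the replacement code unconditionally extracts both 32-bit halves of the key, which only works if Datum is now uniformly 8 bytes, as deleting the 32-bit branch outright evidently assumes. The fold itself, restated standalone (the helper name is mine):

/* Fold a 64-bit abbreviated key into the 32 bits hash_uint32() consumes;
 * XORing the halves lets entropy from either half survive the fold. */
static inline uint32
fold_abbrev_key(uint64 v)
{
	return (uint32) v ^ (uint32) (v >> 32);
}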
*/ - if (abbrev_distinct <= 1.0) + if (abbrev_distinct < 1.0) abbrev_distinct = 1.0; - if (key_distinct <= 1.0) + if (key_distinct < 1.0) key_distinct = 1.0; /* @@ -2959,547 +2623,86 @@ bttext_pattern_sortsupport(PG_FUNCTION_ARGS) } -/*------------------------------------------------------------- - * byteaoctetlen - * - * get the number of bytes contained in an instance of type 'bytea' - *------------------------------------------------------------- +/* text_name() + * Converts a text type to a Name type. */ Datum -byteaoctetlen(PG_FUNCTION_ARGS) +text_name(PG_FUNCTION_ARGS) { - Datum str = PG_GETARG_DATUM(0); + text *s = PG_GETARG_TEXT_PP(0); + Name result; + int len; - /* We need not detoast the input at all */ - PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); + len = VARSIZE_ANY_EXHDR(s); + + /* Truncate oversize input */ + if (len >= NAMEDATALEN) + len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1); + + /* We use palloc0 here to ensure result is zero-padded */ + result = (Name) palloc0(NAMEDATALEN); + memcpy(NameStr(*result), VARDATA_ANY(s), len); + + PG_RETURN_NAME(result); } -/* - * byteacat - - * takes two bytea* and returns a bytea* that is the concatenation of - * the two. - * - * Cloned from textcat and modified as required. +/* name_text() + * Converts a Name type to a text type. */ Datum -byteacat(PG_FUNCTION_ARGS) +name_text(PG_FUNCTION_ARGS) { - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); + Name s = PG_GETARG_NAME(0); - PG_RETURN_BYTEA_P(bytea_catenate(t1, t2)); + PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s))); } + /* - * bytea_catenate - * Guts of byteacat(), broken out so it can be used by other functions + * textToQualifiedNameList - convert a text object to list of names * - * Arguments can be in short-header form, but not compressed or out-of-line + * This implements the input parsing needed by nextval() and other + * functions that take a text parameter representing a qualified name. + * We split the name at dots, downcase if not double-quoted, and + * truncate names if they're too long. */ -static bytea * -bytea_catenate(bytea *t1, bytea *t2) +List * +textToQualifiedNameList(text *textval) { - bytea *result; - int len1, - len2, - len; - char *ptr; + char *rawname; + List *result = NIL; + List *namelist; + ListCell *l; - len1 = VARSIZE_ANY_EXHDR(t1); - len2 = VARSIZE_ANY_EXHDR(t2); + /* Convert to C string (handles possible detoasting). */ + /* Note we rely on being able to modify rawname below. */ + rawname = text_to_cstring(textval); - /* paranoia ... probably should throw error instead? */ - if (len1 < 0) - len1 = 0; - if (len2 < 0) - len2 = 0; + if (!SplitIdentifierString(rawname, '.', &namelist)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid name syntax"))); - len = len1 + len2 + VARHDRSZ; - result = (bytea *) palloc(len); + if (namelist == NIL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid name syntax"))); - /* Set size of result string... */ - SET_VARSIZE(result, len); + foreach(l, namelist) + { + char *curname = (char *) lfirst(l); - /* Fill data field of result string... 
*/ - ptr = VARDATA(result); - if (len1 > 0) - memcpy(ptr, VARDATA_ANY(t1), len1); - if (len2 > 0) - memcpy(ptr + len1, VARDATA_ANY(t2), len2); + result = lappend(result, makeString(pstrdup(curname))); + } + + pfree(rawname); + list_free(namelist); return result; } -#define PG_STR_GET_BYTEA(str_) \ - DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_))) - /* - * bytea_substr() - * Return a substring starting at the specified position. - * Cloned from text_substr and modified as required. - * - * Input: - * - string - * - starting position (is one-based) - * - string length (optional) - * - * If the starting position is zero or less, then return from the start of the string - * adjusting the length to be consistent with the "negative start" per SQL. - * If the length is less than zero, an ERROR is thrown. If no third argument - * (length) is provided, the length to the end of the string is assumed. - */ -Datum -bytea_substr(PG_FUNCTION_ARGS) -{ - PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), - PG_GETARG_INT32(1), - PG_GETARG_INT32(2), - false)); -} - -/* - * bytea_substr_no_len - - * Wrapper to avoid opr_sanity failure due to - * one function accepting a different number of args. - */ -Datum -bytea_substr_no_len(PG_FUNCTION_ARGS) -{ - PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), - PG_GETARG_INT32(1), - -1, - true)); -} - -static bytea * -bytea_substring(Datum str, - int S, - int L, - bool length_not_specified) -{ - int32 S1; /* adjusted start position */ - int32 L1; /* adjusted substring length */ - int32 E; /* end position */ - - /* - * The logic here should generally match text_substring(). - */ - S1 = Max(S, 1); - - if (length_not_specified) - { - /* - * Not passed a length - DatumGetByteaPSlice() grabs everything to the - * end of the string if we pass it a negative value for length. - */ - L1 = -1; - } - else if (L < 0) - { - /* SQL99 says to throw an error for E < S, i.e., negative length */ - ereport(ERROR, - (errcode(ERRCODE_SUBSTRING_ERROR), - errmsg("negative substring length not allowed"))); - L1 = -1; /* silence stupider compilers */ - } - else if (pg_add_s32_overflow(S, L, &E)) - { - /* - * L could be large enough for S + L to overflow, in which case the - * substring must run to end of string. - */ - L1 = -1; - } - else - { - /* - * A zero or negative value for the end position can happen if the - * start was negative or one. SQL99 says to return a zero-length - * string. - */ - if (E < 1) - return PG_STR_GET_BYTEA(""); - - L1 = E - S1; - } - - /* - * If the start position is past the end of the string, SQL99 says to - * return a zero-length string -- DatumGetByteaPSlice() will do that for - * us. We need only convert S1 to zero-based starting position. - */ - return DatumGetByteaPSlice(str, S1 - 1, L1); -} - -/* - * byteaoverlay - * Replace specified substring of first string with second - * - * The SQL standard defines OVERLAY() in terms of substring and concatenation. - * This code is a direct implementation of what the standard says. 
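A worked example of the negative-start arithmetic in the bytea_substring() code removed above, since the clamping is easy to misread. For substring(b from -2 for 5):

int	S = -2, L = 5;		/* caller's arguments                      */
int	S1 = Max(S, 1);		/* clamped start                   ->  1   */
int	E = S + L;			/* end position; the real code guards this */
						/* addition with pg_add_s32_overflow() -> 3 */
int	L1 = E - S1;		/* adjusted length                 ->  2   */

DatumGetByteaPSlice(str, S1 - 1, L1) then returns the first two bytes, matching the SQL-standard treatment of a negative start position.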
- */ -Datum -byteaoverlay(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - int sp = PG_GETARG_INT32(2); /* substring start position */ - int sl = PG_GETARG_INT32(3); /* substring length */ - - PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); -} - -Datum -byteaoverlay_no_len(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - int sp = PG_GETARG_INT32(2); /* substring start position */ - int sl; - - sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */ - PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); -} - -static bytea * -bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) -{ - bytea *result; - bytea *s1; - bytea *s2; - int sp_pl_sl; - - /* - * Check for possible integer-overflow cases. For negative sp, throw a - * "substring length" error because that's what should be expected - * according to the spec's definition of OVERLAY(). - */ - if (sp <= 0) - ereport(ERROR, - (errcode(ERRCODE_SUBSTRING_ERROR), - errmsg("negative substring length not allowed"))); - if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); - - s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false); - s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); - result = bytea_catenate(s1, t2); - result = bytea_catenate(result, s2); - - return result; -} - -/* - * bit_count - */ -Datum -bytea_bit_count(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - - PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1))); -} - -/* - * byteapos - - * Return the position of the specified substring. - * Implements the SQL POSITION() function. - * Cloned from textpos and modified as required. - */ -Datum -byteapos(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - int pos; - int px, - p; - int len1, - len2; - char *p1, - *p2; - - len1 = VARSIZE_ANY_EXHDR(t1); - len2 = VARSIZE_ANY_EXHDR(t2); - - if (len2 <= 0) - PG_RETURN_INT32(1); /* result for empty pattern */ - - p1 = VARDATA_ANY(t1); - p2 = VARDATA_ANY(t2); - - pos = 0; - px = (len1 - len2); - for (p = 0; p <= px; p++) - { - if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0)) - { - pos = p + 1; - break; - }; - p1++; - }; - - PG_RETURN_INT32(pos); -} - -/*------------------------------------------------------------- - * byteaGetByte - * - * this routine treats "bytea" as an array of bytes. - * It returns the Nth byte (a number between 0 and 255). - *------------------------------------------------------------- - */ -Datum -byteaGetByte(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int32 n = PG_GETARG_INT32(1); - int len; - int byte; - - len = VARSIZE_ANY_EXHDR(v); - - if (n < 0 || n >= len) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %d out of valid range, 0..%d", - n, len - 1))); - - byte = ((unsigned char *) VARDATA_ANY(v))[n]; - - PG_RETURN_INT32(byte); -} - -/*------------------------------------------------------------- - * byteaGetBit - * - * This routine treats a "bytea" type like an array of bits. - * It returns the value of the Nth bit (0 or 1). 
- * - *------------------------------------------------------------- - */ -Datum -byteaGetBit(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int64 n = PG_GETARG_INT64(1); - int byteNo, - bitNo; - int len; - int byte; - - len = VARSIZE_ANY_EXHDR(v); - - if (n < 0 || n >= (int64) len * 8) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, - n, (int64) len * 8 - 1))); - - /* n/8 is now known < len, so safe to cast to int */ - byteNo = (int) (n / 8); - bitNo = (int) (n % 8); - - byte = ((unsigned char *) VARDATA_ANY(v))[byteNo]; - - if (byte & (1 << bitNo)) - PG_RETURN_INT32(1); - else - PG_RETURN_INT32(0); -} - -/*------------------------------------------------------------- - * byteaSetByte - * - * Given an instance of type 'bytea' creates a new one with - * the Nth byte set to the given value. - * - *------------------------------------------------------------- - */ -Datum -byteaSetByte(PG_FUNCTION_ARGS) -{ - bytea *res = PG_GETARG_BYTEA_P_COPY(0); - int32 n = PG_GETARG_INT32(1); - int32 newByte = PG_GETARG_INT32(2); - int len; - - len = VARSIZE(res) - VARHDRSZ; - - if (n < 0 || n >= len) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %d out of valid range, 0..%d", - n, len - 1))); - - /* - * Now set the byte. - */ - ((unsigned char *) VARDATA(res))[n] = newByte; - - PG_RETURN_BYTEA_P(res); -} - -/*------------------------------------------------------------- - * byteaSetBit - * - * Given an instance of type 'bytea' creates a new one with - * the Nth bit set to the given value. - * - *------------------------------------------------------------- - */ -Datum -byteaSetBit(PG_FUNCTION_ARGS) -{ - bytea *res = PG_GETARG_BYTEA_P_COPY(0); - int64 n = PG_GETARG_INT64(1); - int32 newBit = PG_GETARG_INT32(2); - int len; - int oldByte, - newByte; - int byteNo, - bitNo; - - len = VARSIZE(res) - VARHDRSZ; - - if (n < 0 || n >= (int64) len * 8) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, - n, (int64) len * 8 - 1))); - - /* n/8 is now known < len, so safe to cast to int */ - byteNo = (int) (n / 8); - bitNo = (int) (n % 8); - - /* - * sanity check! - */ - if (newBit != 0 && newBit != 1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("new bit must be 0 or 1"))); - - /* - * Update the byte. - */ - oldByte = ((unsigned char *) VARDATA(res))[byteNo]; - - if (newBit == 0) - newByte = oldByte & (~(1 << bitNo)); - else - newByte = oldByte | (1 << bitNo); - - ((unsigned char *) VARDATA(res))[byteNo] = newByte; - - PG_RETURN_BYTEA_P(res); -} - -/* - * Return reversed bytea - */ -Datum -bytea_reverse(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - const char *p = VARDATA_ANY(v); - int len = VARSIZE_ANY_EXHDR(v); - const char *endp = p + len; - bytea *result = palloc(len + VARHDRSZ); - char *dst = (char *) VARDATA(result) + len; - - SET_VARSIZE(result, len + VARHDRSZ); - - while (p < endp) - *(--dst) = *p++; - - PG_RETURN_BYTEA_P(result); -} - - -/* text_name() - * Converts a text type to a Name type. 
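The byteaGetBit()/byteaSetBit() pair being deleted here (byteaGetBit's body just above, byteaSetBit's just below) addresses bits LSB-first within each byte. The index arithmetic in miniature; n is int64, but once the range check n < len*8 has passed, the byte offset safely fits an int:

int	byteNo = (int) (n / 8);
int	bitNo = (int) (n % 8);
int	bit = (data[byteNo] >> bitNo) & 1;	/* data: the bytea payload */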
- */ -Datum -text_name(PG_FUNCTION_ARGS) -{ - text *s = PG_GETARG_TEXT_PP(0); - Name result; - int len; - - len = VARSIZE_ANY_EXHDR(s); - - /* Truncate oversize input */ - if (len >= NAMEDATALEN) - len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1); - - /* We use palloc0 here to ensure result is zero-padded */ - result = (Name) palloc0(NAMEDATALEN); - memcpy(NameStr(*result), VARDATA_ANY(s), len); - - PG_RETURN_NAME(result); -} - -/* name_text() - * Converts a Name type to a text type. - */ -Datum -name_text(PG_FUNCTION_ARGS) -{ - Name s = PG_GETARG_NAME(0); - - PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s))); -} - - -/* - * textToQualifiedNameList - convert a text object to list of names - * - * This implements the input parsing needed by nextval() and other - * functions that take a text parameter representing a qualified name. - * We split the name at dots, downcase if not double-quoted, and - * truncate names if they're too long. - */ -List * -textToQualifiedNameList(text *textval) -{ - char *rawname; - List *result = NIL; - List *namelist; - ListCell *l; - - /* Convert to C string (handles possible detoasting). */ - /* Note we rely on being able to modify rawname below. */ - rawname = text_to_cstring(textval); - - if (!SplitIdentifierString(rawname, '.', &namelist)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_NAME), - errmsg("invalid name syntax"))); - - if (namelist == NIL) - ereport(ERROR, - (errcode(ERRCODE_INVALID_NAME), - errmsg("invalid name syntax"))); - - foreach(l, namelist) - { - char *curname = (char *) lfirst(l); - - result = lappend(result, makeString(pstrdup(curname))); - } - - pfree(rawname); - list_free(namelist); - - return result; -} - -/* - * SplitIdentifierString --- parse a string containing identifiers + * SplitIdentifierString --- parse a string containing identifiers * * This is the guts of textToQualifiedNameList, and is exported for use in * other situations such as parsing GUC variables. In the GUC case, it's @@ -3534,7 +2737,7 @@ SplitIdentifierString(char *rawstring, char separator, nextp++; /* skip leading whitespace */ if (*nextp == '\0') - return true; /* allow empty string */ + return true; /* empty string represents empty list */ /* At the top of the loop, we are at start of a new identifier. */ do @@ -3661,7 +2864,7 @@ SplitDirectoriesString(char *rawstring, char separator, nextp++; /* skip leading whitespace */ if (*nextp == '\0') - return true; /* allow empty string */ + return true; /* empty string represents empty list */ /* At the top of the loop, we are at start of a new directory. */ do @@ -3782,7 +2985,7 @@ SplitGUCList(char *rawstring, char separator, nextp++; /* skip leading whitespace */ if (*nextp == '\0') - return true; /* allow empty string */ + return true; /* empty string represents empty list */ /* At the top of the loop, we are at start of a new identifier. */ do @@ -3849,331 +3052,6 @@ SplitGUCList(char *rawstring, char separator, return true; } - -/***************************************************************************** - * Comparison Functions used for bytea - * - * Note: btree indexes need these routines not to leak memory; therefore, - * be careful to free working copies of toasted datums. Most places don't - * need to be so careful. 
- *****************************************************************************/ - -Datum -byteaeq(PG_FUNCTION_ARGS) -{ - Datum arg1 = PG_GETARG_DATUM(0); - Datum arg2 = PG_GETARG_DATUM(1); - bool result; - Size len1, - len2; - - /* - * We can use a fast path for unequal lengths, which might save us from - * having to detoast one or both values. - */ - len1 = toast_raw_datum_size(arg1); - len2 = toast_raw_datum_size(arg2); - if (len1 != len2) - result = false; - else - { - bytea *barg1 = DatumGetByteaPP(arg1); - bytea *barg2 = DatumGetByteaPP(arg2); - - result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), - len1 - VARHDRSZ) == 0); - - PG_FREE_IF_COPY(barg1, 0); - PG_FREE_IF_COPY(barg2, 1); - } - - PG_RETURN_BOOL(result); -} - -Datum -byteane(PG_FUNCTION_ARGS) -{ - Datum arg1 = PG_GETARG_DATUM(0); - Datum arg2 = PG_GETARG_DATUM(1); - bool result; - Size len1, - len2; - - /* - * We can use a fast path for unequal lengths, which might save us from - * having to detoast one or both values. - */ - len1 = toast_raw_datum_size(arg1); - len2 = toast_raw_datum_size(arg2); - if (len1 != len2) - result = true; - else - { - bytea *barg1 = DatumGetByteaPP(arg1); - bytea *barg2 = DatumGetByteaPP(arg2); - - result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), - len1 - VARHDRSZ) != 0); - - PG_FREE_IF_COPY(barg1, 0); - PG_FREE_IF_COPY(barg2, 1); - } - - PG_RETURN_BOOL(result); -} - -Datum -bytealt(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2))); -} - -Datum -byteale(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2))); -} - -Datum -byteagt(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2))); -} - -Datum -byteage(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2))); -} - -Datum -byteacmp(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - if ((cmp == 0) && (len1 != len2)) - cmp = (len1 < len2) ? 
-1 : 1; - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_INT32(cmp); -} - -Datum -bytea_larger(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - bytea *result; - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2); - - PG_RETURN_BYTEA_P(result); -} - -Datum -bytea_smaller(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - bytea *result; - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? arg1 : arg2); - - PG_RETURN_BYTEA_P(result); -} - -Datum -bytea_sortsupport(PG_FUNCTION_ARGS) -{ - SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - - /* Use generic string SortSupport, forcing "C" collation */ - varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); - - MemoryContextSwitchTo(oldcontext); - - PG_RETURN_VOID(); -} - -/* Cast bytea -> int2 */ -Datum -bytea_int2(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int len = VARSIZE_ANY_EXHDR(v); - uint16 result; - - /* Check that the byte array is not too long */ - if (len > sizeof(result)) - ereport(ERROR, - errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("smallint out of range")); - - /* Convert it to an integer; most significant bytes come first */ - result = 0; - for (int i = 0; i < len; i++) - { - result <<= BITS_PER_BYTE; - result |= ((unsigned char *) VARDATA_ANY(v))[i]; - } - - PG_RETURN_INT16(result); -} - -/* Cast bytea -> int4 */ -Datum -bytea_int4(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int len = VARSIZE_ANY_EXHDR(v); - uint32 result; - - /* Check that the byte array is not too long */ - if (len > sizeof(result)) - ereport(ERROR, - errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range")); - - /* Convert it to an integer; most significant bytes come first */ - result = 0; - for (int i = 0; i < len; i++) - { - result <<= BITS_PER_BYTE; - result |= ((unsigned char *) VARDATA_ANY(v))[i]; - } - - PG_RETURN_INT32(result); -} - -/* Cast bytea -> int8 */ -Datum -bytea_int8(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int len = VARSIZE_ANY_EXHDR(v); - uint64 result; - - /* Check that the byte array is not too long */ - if (len > sizeof(result)) - ereport(ERROR, - errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range")); - - /* Convert it to an integer; most significant bytes come first */ - result = 0; - for (int i = 0; i < len; i++) - { - result <<= BITS_PER_BYTE; - result |= ((unsigned char *) VARDATA_ANY(v))[i]; - } - - PG_RETURN_INT64(result); -} - -/* Cast int2 -> bytea; can just use int2send() */ -Datum -int2_bytea(PG_FUNCTION_ARGS) -{ - return int2send(fcinfo); -} - -/* Cast int4 -> bytea; can just use int4send() */ -Datum -int4_bytea(PG_FUNCTION_ARGS) -{ - return int4send(fcinfo); -} - -/* Cast int8 -> bytea; can just use int8send() */ -Datum -int8_bytea(PG_FUNCTION_ARGS) -{ - return int8send(fcinfo); -} - /* * appendStringInfoText * @@ -6525,12 +5403,12 @@ unicode_assigned(PG_FUNCTION_ARGS) ereport(ERROR, (errmsg("Unicode categorization can only be performed if server 
encoding is UTF8"))); - /* convert to pg_wchar */ + /* convert to char32_t */ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); p = (unsigned char *) VARDATA_ANY(input); for (int i = 0; i < size; i++) { - pg_wchar uchar = utf8_to_unicode(p); + char32_t uchar = utf8_to_unicode(p); int category = unicode_category(uchar); if (category == PG_U_UNASSIGNED) @@ -6549,24 +5427,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS) char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); UnicodeNormalizationForm form; int size; - pg_wchar *input_chars; - pg_wchar *output_chars; + char32_t *input_chars; + char32_t *output_chars; unsigned char *p; text *result; int i; form = unicode_norm_form_from_string(formstr); - /* convert to pg_wchar */ + /* convert to char32_t */ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); - input_chars = palloc((size + 1) * sizeof(pg_wchar)); + input_chars = palloc((size + 1) * sizeof(char32_t)); p = (unsigned char *) VARDATA_ANY(input); for (i = 0; i < size; i++) { input_chars[i] = utf8_to_unicode(p); p += pg_utf_mblen(p); } - input_chars[i] = (pg_wchar) '\0'; + input_chars[i] = (char32_t) '\0'; Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); /* action */ @@ -6574,7 +5452,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS) /* convert back to UTF-8 string */ size = 0; - for (pg_wchar *wp = output_chars; *wp; wp++) + for (char32_t *wp = output_chars; *wp; wp++) { unsigned char buf[4]; @@ -6586,7 +5464,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS) SET_VARSIZE(result, size + VARHDRSZ); p = (unsigned char *) VARDATA_ANY(result); - for (pg_wchar *wp = output_chars; *wp; wp++) + for (char32_t *wp = output_chars; *wp; wp++) { unicode_to_utf8(*wp, p); p += pg_utf_mblen(p); @@ -6615,8 +5493,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS) char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); UnicodeNormalizationForm form; int size; - pg_wchar *input_chars; - pg_wchar *output_chars; + char32_t *input_chars; + char32_t *output_chars; unsigned char *p; int i; UnicodeNormalizationQC quickcheck; @@ -6625,16 +5503,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS) form = unicode_norm_form_from_string(formstr); - /* convert to pg_wchar */ + /* convert to char32_t */ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); - input_chars = palloc((size + 1) * sizeof(pg_wchar)); + input_chars = palloc((size + 1) * sizeof(char32_t)); p = (unsigned char *) VARDATA_ANY(input); for (i = 0; i < size; i++) { input_chars[i] = utf8_to_unicode(p); p += pg_utf_mblen(p); } - input_chars[i] = (pg_wchar) '\0'; + input_chars[i] = (char32_t) '\0'; Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); /* quick check (see UAX #15) */ @@ -6648,11 +5526,11 @@ unicode_is_normalized(PG_FUNCTION_ARGS) output_chars = unicode_normalize(form, input_chars); output_size = 0; - for (pg_wchar *wp = output_chars; *wp; wp++) + for (char32_t *wp = output_chars; *wp; wp++) output_size++; result = (size == output_size) && - (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0); + (memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0); PG_RETURN_BOOL(result); } @@ -6708,7 +5586,7 @@ unistr(PG_FUNCTION_ARGS) int len; StringInfoData str; text *result; - pg_wchar pair_first = 0; + char16_t pair_first = 0; char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1]; instr = VARDATA_ANY(input_text); @@ -6732,7 +5610,7 @@ unistr(PG_FUNCTION_ARGS) else if ((len >= 5 && isxdigits_n(instr + 1, 4)) || (len >= 6 && instr[1] == 'u' && 
isxdigits_n(instr + 2, 4))) { - pg_wchar unicode; + char32_t unicode; int offset = instr[1] == 'u' ? 2 : 1; unicode = hexval_n(instr + offset, 4); @@ -6768,7 +5646,7 @@ unistr(PG_FUNCTION_ARGS) } else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6)) { - pg_wchar unicode; + char32_t unicode; unicode = hexval_n(instr + 2, 6); @@ -6803,7 +5681,7 @@ unistr(PG_FUNCTION_ARGS) } else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8)) { - pg_wchar unicode; + char32_t unicode; unicode = hexval_n(instr + 2, 8); diff --git a/src/backend/utils/adt/waitfuncs.c b/src/backend/utils/adt/waitfuncs.c index ddd0a57c0c597..f01cad72a0feb 100644 --- a/src/backend/utils/adt/waitfuncs.c +++ b/src/backend/utils/adt/waitfuncs.c @@ -73,7 +73,7 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) * acquire heavyweight locks. */ blocking_pids_a = - DatumGetArrayTypeP(DirectFunctionCall1(pg_blocking_pids, blocked_pid)); + DatumGetArrayTypeP(DirectFunctionCall1(pg_blocking_pids, Int32GetDatum(blocked_pid))); Assert(ARR_ELEMTYPE(blocking_pids_a) == INT4OID); Assert(!array_contains_nulls(blocking_pids_a)); diff --git a/src/backend/utils/adt/windowfuncs.c b/src/backend/utils/adt/windowfuncs.c index bb35f3bc4a981..969f02aa59b44 100644 --- a/src/backend/utils/adt/windowfuncs.c +++ b/src/backend/utils/adt/windowfuncs.c @@ -86,6 +86,7 @@ window_row_number(PG_FUNCTION_ARGS) WindowObject winobj = PG_WINDOW_OBJECT(); int64 curpos = WinGetCurrentPosition(winobj); + WinCheckAndInitializeNullTreatment(winobj, false, fcinfo); WinSetMarkPosition(winobj, curpos); PG_RETURN_INT64(curpos + 1); } @@ -141,6 +142,7 @@ window_rank(PG_FUNCTION_ARGS) rank_context *context; bool up; + WinCheckAndInitializeNullTreatment(winobj, false, fcinfo); up = rank_up(winobj); context = (rank_context *) WinGetPartitionLocalMemory(winobj, sizeof(rank_context)); @@ -203,6 +205,7 @@ window_dense_rank(PG_FUNCTION_ARGS) rank_context *context; bool up; + WinCheckAndInitializeNullTreatment(winobj, false, fcinfo); up = rank_up(winobj); context = (rank_context *) WinGetPartitionLocalMemory(winobj, sizeof(rank_context)); @@ -266,6 +269,7 @@ window_percent_rank(PG_FUNCTION_ARGS) int64 totalrows = WinGetPartitionRowCount(winobj); Assert(totalrows > 0); + WinCheckAndInitializeNullTreatment(winobj, false, fcinfo); up = rank_up(winobj); context = (rank_context *) @@ -335,6 +339,7 @@ window_cume_dist(PG_FUNCTION_ARGS) int64 totalrows = WinGetPartitionRowCount(winobj); Assert(totalrows > 0); + WinCheckAndInitializeNullTreatment(winobj, false, fcinfo); up = rank_up(winobj); context = (rank_context *) @@ -413,6 +418,7 @@ window_ntile(PG_FUNCTION_ARGS) WindowObject winobj = PG_WINDOW_OBJECT(); ntile_context *context; + WinCheckAndInitializeNullTreatment(winobj, false, fcinfo); context = (ntile_context *) WinGetPartitionLocalMemory(winobj, sizeof(ntile_context)); @@ -535,6 +541,7 @@ leadlag_common(FunctionCallInfo fcinfo, bool isnull; bool isout; + WinCheckAndInitializeNullTreatment(winobj, true, fcinfo); if (withoffset) { offset = DatumGetInt32(WinGetFuncArgCurrent(winobj, 1, &isnull)); @@ -652,6 +659,7 @@ window_first_value(PG_FUNCTION_ARGS) Datum result; bool isnull; + WinCheckAndInitializeNullTreatment(winobj, true, fcinfo); result = WinGetFuncArgInFrame(winobj, 0, 0, WINDOW_SEEK_HEAD, true, &isnull, NULL); @@ -673,6 +681,7 @@ window_last_value(PG_FUNCTION_ARGS) Datum result; bool isnull; + WinCheckAndInitializeNullTreatment(winobj, true, fcinfo); result = WinGetFuncArgInFrame(winobj, 0, 0, WINDOW_SEEK_TAIL, true, &isnull, NULL); @@ -696,6 
+705,7 @@ window_nth_value(PG_FUNCTION_ARGS) bool isnull; int32 nth; + WinCheckAndInitializeNullTreatment(winobj, true, fcinfo); nth = DatumGetInt32(WinGetFuncArgCurrent(winobj, 1, &isnull)); if (isnull) PG_RETURN_NULL(); diff --git a/src/backend/utils/adt/xid.c b/src/backend/utils/adt/xid.c index 3d0c48769cce8..8a6e0390709b8 100644 --- a/src/backend/utils/adt/xid.c +++ b/src/backend/utils/adt/xid.c @@ -45,7 +45,7 @@ xidout(PG_FUNCTION_ARGS) TransactionId transactionId = PG_GETARG_TRANSACTIONID(0); char *result = (char *) palloc(16); - snprintf(result, 16, "%lu", (unsigned long) transactionId); + snprintf(result, 16, "%u", transactionId); PG_RETURN_CSTRING(result); } @@ -362,7 +362,7 @@ cidout(PG_FUNCTION_ARGS) CommandId c = PG_GETARG_COMMANDID(0); char *result = (char *) palloc(16); - snprintf(result, 16, "%lu", (unsigned long) c); + snprintf(result, 16, "%u", c); PG_RETURN_CSTRING(result); } diff --git a/src/backend/utils/adt/xid8funcs.c b/src/backend/utils/adt/xid8funcs.c index 1da3964ca6fb8..4b3f7a69b3bcf 100644 --- a/src/backend/utils/adt/xid8funcs.c +++ b/src/backend/utils/adt/xid8funcs.c @@ -39,6 +39,7 @@ #include "utils/memutils.h" #include "utils/snapmgr.h" #include "utils/xid8.h" +#include "varatt.h" /* @@ -193,7 +194,7 @@ is_visible_fxid(FullTransactionId value, const pg_snapshot *snap) #ifdef USE_BSEARCH_IF_NXIP_GREATER else if (snap->nxip > USE_BSEARCH_IF_NXIP_GREATER) { - void *res; + const void *res; res = bsearch(&value, snap->xip, snap->nxip, sizeof(FullTransactionId), cmp_fxid); diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index a4150bff2eaea..c8ab9d61c6834 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -84,7 +84,6 @@ #include "catalog/namespace.h" #include "catalog/pg_class.h" #include "catalog/pg_type.h" -#include "commands/dbcommands.h" #include "executor/spi.h" #include "executor/tablefunc.h" #include "fmgr.h" @@ -529,14 +528,36 @@ xmltext(PG_FUNCTION_ARGS) #ifdef USE_LIBXML text *arg = PG_GETARG_TEXT_PP(0); text *result; - xmlChar *xmlbuf = NULL; + volatile xmlChar *xmlbuf = NULL; + PgXmlErrorContext *xmlerrcxt; + + /* First we gotta spin up some error handling. */ + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); + + PG_TRY(); + { + xmlbuf = xmlEncodeSpecialChars(NULL, xml_text2xmlChar(arg)); + + if (xmlbuf == NULL || xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlChar"); - xmlbuf = xmlEncodeSpecialChars(NULL, xml_text2xmlChar(arg)); + result = cstring_to_text_with_len((const char *) xmlbuf, + xmlStrlen((const xmlChar *) xmlbuf)); + } + PG_CATCH(); + { + if (xmlbuf) + xmlFree((xmlChar *) xmlbuf); - Assert(xmlbuf); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); + + xmlFree((xmlChar *) xmlbuf); + pg_xml_done(xmlerrcxt, false); - result = cstring_to_text_with_len((const char *) xmlbuf, xmlStrlen(xmlbuf)); - xmlFree(xmlbuf); PG_RETURN_XML_P(result); #else NO_XML_SUPPORT(); @@ -663,7 +684,7 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) volatile xmlBufferPtr buf = NULL; volatile xmlSaveCtxtPtr ctxt = NULL; ErrorSaveContext escontext = {T_ErrorSaveContext}; - PgXmlErrorContext *xmlerrcxt; + PgXmlErrorContext *volatile xmlerrcxt = NULL; #endif if (xmloption_arg != XMLOPTION_DOCUMENT && !indent) @@ -704,13 +725,18 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) return (text *) data; } - /* Otherwise, we gotta spin up some error handling. 
*/ - xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); - + /* + * Otherwise, we gotta spin up some error handling. Unlike most other + * routines in this module, we already have a libxml "doc" structure to + * free, so we need to call pg_xml_init() inside the PG_TRY and be + * prepared for it to fail (typically due to palloc OOM). + */ PG_TRY(); { size_t decl_len = 0; + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); + /* The serialized data will go into this buffer. */ buf = xmlBufferCreate(); @@ -770,7 +796,10 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) if (oldroot != NULL) xmlFreeNode(oldroot); - xmlAddChildList(root, content_nodes); + if (xmlAddChildList(root, content_nodes) == NULL || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not append xml node list"); /* * We use this node to insert newlines in the dump. Note: in at @@ -838,10 +867,10 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) xmlSaveClose(ctxt); if (buf) xmlBufferFree(buf); - if (doc) - xmlFreeDoc(doc); + xmlFreeDoc(doc); - pg_xml_done(xmlerrcxt, true); + if (xmlerrcxt) + pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } @@ -862,8 +891,8 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) xmltype * xmlelement(XmlExpr *xexpr, - Datum *named_argvalue, bool *named_argnull, - Datum *argvalue, bool *argnull) + const Datum *named_argvalue, const bool *named_argnull, + const Datum *argvalue, const bool *argnull) { #ifdef USE_LIBXML xmltype *result; @@ -931,7 +960,10 @@ xmlelement(XmlExpr *xexpr, xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate xmlTextWriter"); - xmlTextWriterStartElement(writer, (xmlChar *) xexpr->name); + if (xmlTextWriterStartElement(writer, (xmlChar *) xexpr->name) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not start xml element"); forboth(arg, named_arg_strings, narg, xexpr->arg_names) { @@ -939,19 +971,30 @@ xmlelement(XmlExpr *xexpr, char *argname = strVal(lfirst(narg)); if (str) - xmlTextWriterWriteAttribute(writer, - (xmlChar *) argname, - (xmlChar *) str); + { + if (xmlTextWriterWriteAttribute(writer, + (xmlChar *) argname, + (xmlChar *) str) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not write xml attribute"); + } } foreach(arg, arg_strings) { char *str = (char *) lfirst(arg); - xmlTextWriterWriteRaw(writer, (xmlChar *) str); + if (xmlTextWriterWriteRaw(writer, (xmlChar *) str) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not write raw xml text"); } - xmlTextWriterEndElement(writer); + if (xmlTextWriterEndElement(writer) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not end xml element"); /* we MUST do this now to flush data out to the buffer ... */ xmlFreeTextWriter(writer); @@ -1212,7 +1255,7 @@ pg_xml_init(PgXmlStrictness strictness) pg_xml_init_library(); /* Create error handling context structure */ - errcxt = (PgXmlErrorContext *) palloc(sizeof(PgXmlErrorContext)); + errcxt = palloc_object(PgXmlErrorContext); errcxt->magic = ERRCXT_MAGIC; errcxt->strictness = strictness; errcxt->err_occurred = false; @@ -1725,7 +1768,7 @@ xml_doctype_in_content(const xmlChar *str) * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode). 
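The xmltotext_with_options() rework above is a compact illustration of this module's cleanup idiom: PG_TRY/PG_CATCH are setjmp/longjmp-based, so a local that is assigned inside PG_TRY and examined in PG_CATCH must be volatile-qualified, and resources should be created in an order the catch block can unwind. Condensed to its skeleton (the elided libxml work is hypothetical):

PgXmlErrorContext *volatile xmlerrcxt = NULL;

PG_TRY();
{
	xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); /* may itself error out */
	/* ... libxml calls that may ereport() ... */
}
PG_CATCH();
{
	if (xmlerrcxt)				/* clean up only what was created */
		pg_xml_done(xmlerrcxt, true);
	PG_RE_THROW();
}
PG_END_TRY();

pg_xml_done(xmlerrcxt, false);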
* * If parsed_nodes isn't NULL and we parse in CONTENT mode, the list - * of parsed nodes from the xmlParseInNodeContext call will be returned + * of parsed nodes from the xmlParseBalancedChunkMemory call will be returned * to *parsed_nodes. (It is caller's responsibility to free that.) * * Errors normally result in ereport(ERROR), but if escontext is an @@ -1751,6 +1794,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PgXmlErrorContext *xmlerrcxt; volatile xmlParserCtxtPtr ctxt = NULL; volatile xmlDocPtr doc = NULL; + volatile int save_keep_blanks = -1; /* * This step looks annoyingly redundant, but we must do it to have a @@ -1778,7 +1822,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PG_TRY(); { bool parse_as_document = false; - int options; int res_code; size_t count = 0; xmlChar *version = NULL; @@ -1809,18 +1852,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, parse_as_document = true; } - /* - * Select parse options. - * - * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) - * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined by - * internal DTD are applied'. As for external DTDs, we try to support - * them too (see SQL/XML:2008 GR 10.16.7.e), but that doesn't really - * happen because xmlPgEntityLoader prevents it. - */ - options = XML_PARSE_NOENT | XML_PARSE_DTDATTR - | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); - /* initialize output parameters */ if (parsed_xmloptiontype != NULL) *parsed_xmloptiontype = parse_as_document ? XMLOPTION_DOCUMENT : @@ -1830,11 +1861,26 @@ xml_parse(text *data, XmlOptionType xmloption_arg, if (parse_as_document) { + int options; + + /* set up parser context used by xmlCtxtReadDoc */ ctxt = xmlNewParserCtxt(); if (ctxt == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); + /* + * Select parse options. + * + * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) + * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined + * by internal DTD are applied'. As for external DTDs, we try to + * support them too (see SQL/XML:2008 GR 10.16.7.e), but that + * doesn't really happen because xmlPgEntityLoader prevents it. + */ + options = XML_PARSE_NOENT | XML_PARSE_DTDATTR + | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); + doc = xmlCtxtReadDoc(ctxt, utf8string, NULL, /* no URL */ "UTF-8", @@ -1856,10 +1902,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } else { - xmlNodePtr root; - xmlNodePtr oldroot PG_USED_FOR_ASSERTS_ONLY; - - /* set up document with empty root node to be the context node */ + /* set up document that xmlParseBalancedChunkMemory will add to */ doc = xmlNewDoc(version); if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, @@ -1872,43 +1915,22 @@ xml_parse(text *data, XmlOptionType xmloption_arg, "could not allocate XML document"); doc->standalone = standalone; - root = xmlNewNode(NULL, (const xmlChar *) "content-root"); - if (root == NULL || xmlerrcxt->err_occurred) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, - "could not allocate xml node"); - - /* - * This attaches root to doc, so we need not free it separately; - * and there can't yet be any old root to free. - */ - oldroot = xmlDocSetRootElement(doc, root); - Assert(oldroot == NULL); + /* set parse options --- have to do this the ugly way */ + save_keep_blanks = xmlKeepBlanksDefault(preserve_whitespace ? 
1 : 0); /* allow empty content */ if (*(utf8string + count)) { - xmlNodePtr node_list = NULL; - xmlParserErrors res; - - res = xmlParseInNodeContext(root, - (char *) utf8string + count, - strlen((char *) utf8string + count), - options, - &node_list); - - if (res != XML_ERR_OK || xmlerrcxt->err_occurred) + res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, + utf8string + count, + parsed_nodes); + if (res_code != 0 || xmlerrcxt->err_occurred) { - xmlFreeNodeList(node_list); xml_errsave(escontext, xmlerrcxt, ERRCODE_INVALID_XML_CONTENT, "invalid XML content"); goto fail; } - - if (parsed_nodes != NULL) - *parsed_nodes = node_list; - else - xmlFreeNodeList(node_list); } } @@ -1917,6 +1939,8 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } PG_CATCH(); { + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); if (doc != NULL) xmlFreeDoc(doc); if (ctxt != NULL) @@ -1928,6 +1952,9 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } PG_END_TRY(); + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); + if (ctxt != NULL) xmlFreeParserCtxt(ctxt); @@ -2106,7 +2133,7 @@ xml_errorHandler(void *data, PgXmlErrorPtr error) node->type == XML_ELEMENT_NODE) ? node->name : NULL; int domain = error->domain; int level = error->level; - StringInfo errorBuf; + StringInfoData errorBuf; /* * Defend against someone passing us a bogus context struct. @@ -2183,16 +2210,16 @@ xml_errorHandler(void *data, PgXmlErrorPtr error) } /* Prepare error message in errorBuf */ - errorBuf = makeStringInfo(); + initStringInfo(&errorBuf); if (error->line > 0) - appendStringInfo(errorBuf, "line %d: ", error->line); + appendStringInfo(&errorBuf, "line %d: ", error->line); if (name != NULL) - appendStringInfo(errorBuf, "element %s: ", name); + appendStringInfo(&errorBuf, "element %s: ", name); if (error->message != NULL) - appendStringInfoString(errorBuf, error->message); + appendStringInfoString(&errorBuf, error->message); else - appendStringInfoString(errorBuf, "(no message provided)"); + appendStringInfoString(&errorBuf, "(no message provided)"); /* * Append context information to errorBuf. @@ -2210,11 +2237,11 @@ xml_errorHandler(void *data, PgXmlErrorPtr error) xmlGenericErrorFunc errFuncSaved = xmlGenericError; void *errCtxSaved = xmlGenericErrorContext; - xmlSetGenericErrorFunc(errorBuf, + xmlSetGenericErrorFunc(&errorBuf, (xmlGenericErrorFunc) appendStringInfo); /* Add context information to errorBuf */ - appendStringInfoLineSeparator(errorBuf); + appendStringInfoLineSeparator(&errorBuf); xmlParserPrintFileContext(input); @@ -2223,7 +2250,7 @@ xml_errorHandler(void *data, PgXmlErrorPtr error) } /* Get rid of any trailing newlines in errorBuf */ - chopStringInfoNewlines(errorBuf); + chopStringInfoNewlines(&errorBuf); /* * Legacy error handling mode. 
err_occurred is never set, we just add the @@ -2236,10 +2263,10 @@ xml_errorHandler(void *data, PgXmlErrorPtr error) if (xmlerrcxt->strictness == PG_XML_STRICTNESS_LEGACY) { appendStringInfoLineSeparator(&xmlerrcxt->err_buf); - appendBinaryStringInfo(&xmlerrcxt->err_buf, errorBuf->data, - errorBuf->len); + appendBinaryStringInfo(&xmlerrcxt->err_buf, errorBuf.data, + errorBuf.len); - destroyStringInfo(errorBuf); + pfree(errorBuf.data); return; } @@ -2254,23 +2281,23 @@ xml_errorHandler(void *data, PgXmlErrorPtr error) if (level >= XML_ERR_ERROR) { appendStringInfoLineSeparator(&xmlerrcxt->err_buf); - appendBinaryStringInfo(&xmlerrcxt->err_buf, errorBuf->data, - errorBuf->len); + appendBinaryStringInfo(&xmlerrcxt->err_buf, errorBuf.data, + errorBuf.len); xmlerrcxt->err_occurred = true; } else if (level >= XML_ERR_WARNING) { ereport(WARNING, - (errmsg_internal("%s", errorBuf->data))); + (errmsg_internal("%s", errorBuf.data))); } else { ereport(NOTICE, - (errmsg_internal("%s", errorBuf->data))); + (errmsg_internal("%s", errorBuf.data))); } - destroyStringInfo(errorBuf); + pfree(errorBuf.data); } @@ -4220,20 +4247,27 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt) } else { - xmlChar *str; + volatile xmlChar *str = NULL; - str = xmlXPathCastNodeToString(cur); PG_TRY(); { + char *escaped; + + str = xmlXPathCastNodeToString(cur); + if (str == NULL || xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlChar"); + /* Here we rely on XML having the same representation as TEXT */ - char *escaped = escape_xml((char *) str); + escaped = escape_xml((char *) str); result = (xmltype *) cstring_to_text(escaped); pfree(escaped); } PG_FINALLY(); { - xmlFree(str); + if (str) + xmlFree((xmlChar *) str); } PG_END_TRY(); } @@ -4699,10 +4733,10 @@ XmlTableInitOpaque(TableFuncScanState *state, int natts) XmlTableBuilderData *xtCxt; PgXmlErrorContext *xmlerrcxt; - xtCxt = palloc0(sizeof(XmlTableBuilderData)); + xtCxt = palloc0_object(XmlTableBuilderData); xtCxt->magic = XMLTABLE_CONTEXT_MAGIC; xtCxt->natts = natts; - xtCxt->xpathscomp = palloc0(sizeof(xmlXPathCompExprPtr) * natts); + xtCxt->xpathscomp = palloc0_array(xmlXPathCompExprPtr, natts); xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); @@ -4861,7 +4895,7 @@ XmlTableSetColumnFilter(TableFuncScanState *state, const char *path, int colnum) XmlTableBuilderData *xtCxt; xmlChar *xstr; - Assert(PointerIsValid(path)); + Assert(path); xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetColumnFilter"); diff --git a/src/backend/utils/cache/attoptcache.c b/src/backend/utils/cache/attoptcache.c index 5c8360c08b5f8..45d1e2be007ba 100644 --- a/src/backend/utils/cache/attoptcache.c +++ b/src/backend/utils/cache/attoptcache.c @@ -86,7 +86,7 @@ relatt_cache_syshash(const void *key, Size keysize) const AttoptCacheKey *ckey = key; Assert(keysize == sizeof(*ckey)); - return GetSysCacheHashValue2(ATTNUM, ckey->attrelid, ckey->attnum); + return GetSysCacheHashValue2(ATTNUM, ObjectIdGetDatum(ckey->attrelid), Int32GetDatum(ckey->attnum)); } /* diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 657648996c235..1d09c66ac9592 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -117,10 +117,10 @@ static CatCTup *CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, static void ReleaseCatCacheWithOwner(HeapTuple tuple, ResourceOwner resowner); static void ReleaseCatCacheListWithOwner(CatCList *list, ResourceOwner resowner); -static 
void CatCacheFreeKeys(TupleDesc tupdesc, int nkeys, int *attnos, - Datum *keys); -static void CatCacheCopyKeys(TupleDesc tupdesc, int nkeys, int *attnos, - Datum *srckeys, Datum *dstkeys); +static void CatCacheFreeKeys(TupleDesc tupdesc, int nkeys, const int *attnos, + const Datum *keys); +static void CatCacheCopyKeys(TupleDesc tupdesc, int nkeys, const int *attnos, + const Datum *srckeys, Datum *dstkeys); /* @@ -213,7 +213,7 @@ namehashfast(Datum datum) { char *key = NameStr(*DatumGetName(datum)); - return hash_any((unsigned char *) key, strlen(key)); + return hash_bytes((unsigned char *) key, strlen(key)); } static bool @@ -317,6 +317,7 @@ GetCCHashEqFuncs(Oid keytype, CCHashFN *hashfunc, RegProcedure *eqfunc, CCFastEq case REGDICTIONARYOID: case REGROLEOID: case REGNAMESPACEOID: + case REGDATABASEOID: *hashfunc = int4hashfast; *fasteqfunc = int4eqfast; *eqfunc = F_OIDEQ; @@ -460,14 +461,14 @@ static void CatCachePrintStats(int code, Datum arg) { slist_iter iter; - long cc_searches = 0; - long cc_hits = 0; - long cc_neg_hits = 0; - long cc_newloads = 0; - long cc_invals = 0; - long cc_nlists = 0; - long cc_lsearches = 0; - long cc_lhits = 0; + uint64 cc_searches = 0; + uint64 cc_hits = 0; + uint64 cc_neg_hits = 0; + uint64 cc_newloads = 0; + uint64 cc_invals = 0; + uint64 cc_nlists = 0; + uint64 cc_lsearches = 0; + uint64 cc_lhits = 0; slist_foreach(iter, &CacheHdr->ch_caches) { @@ -475,7 +476,10 @@ CatCachePrintStats(int code, Datum arg) if (cache->cc_ntup == 0 && cache->cc_searches == 0) continue; /* don't print unused caches */ - elog(DEBUG2, "catcache %s/%u: %d tup, %ld srch, %ld+%ld=%ld hits, %ld+%ld=%ld loads, %ld invals, %d lists, %ld lsrch, %ld lhits", + elog(DEBUG2, "catcache %s/%u: %d tup, %" PRIu64 " srch, %" PRIu64 "+%" + PRIu64 "=%" PRIu64 " hits, %" PRIu64 "+%" PRIu64 "=%" + PRIu64 " loads, %" PRIu64 " invals, %d lists, %" PRIu64 + " lsrch, %" PRIu64 " lhits", cache->cc_relname, cache->cc_indexoid, cache->cc_ntup, @@ -499,7 +503,10 @@ CatCachePrintStats(int code, Datum arg) cc_lsearches += cache->cc_lsearches; cc_lhits += cache->cc_lhits; } - elog(DEBUG2, "catcache totals: %d tup, %ld srch, %ld+%ld=%ld hits, %ld+%ld=%ld loads, %ld invals, %ld lists, %ld lsrch, %ld lhits", + elog(DEBUG2, "catcache totals: %d tup, %" PRIu64 " srch, %" PRIu64 "+%" + PRIu64 "=%" PRIu64 " hits, %" PRIu64 "+%" PRIu64 "=%" PRIu64 + " loads, %" PRIu64 " invals, %" PRIu64 " lists, %" PRIu64 + " lsrch, %" PRIu64 " lhits", CacheHdr->ch_ntup, cc_searches, cc_hits, @@ -828,7 +835,7 @@ ResetCatalogCachesExt(bool debug_discard) * kinds of trouble if a cache flush occurs while loading cache entries. * We now avoid the need to do it by copying cc_tupdesc out of the relcache, * rather than relying on the relcache to keep a tupdesc for us. Of course - * this assumes the tupdesc of a cachable system table will not change...) + * this assumes the tupdesc of a cacheable system table will not change...) 
*/ void CatalogCacheFlushCatalog(Oid catId) @@ -913,7 +920,7 @@ InitCatCache(int id, */ if (CacheHdr == NULL) { - CacheHdr = (CatCacheHeader *) palloc(sizeof(CatCacheHeader)); + CacheHdr = palloc_object(CatCacheHeader); slist_init(&CacheHdr->ch_caches); CacheHdr->ch_ntup = 0; #ifdef CATCACHE_STATS @@ -1661,7 +1668,7 @@ ReleaseCatCacheWithOwner(HeapTuple tuple, ResourceOwner resowner) ct->refcount--; if (resowner) - ResourceOwnerForgetCatCacheRef(CurrentResourceOwner, &ct->tuple); + ResourceOwnerForgetCatCacheRef(resowner, &ct->tuple); if ( #ifndef CATCACHE_FORCE_RELEASE @@ -2103,7 +2110,7 @@ ReleaseCatCacheListWithOwner(CatCList *list, ResourceOwner resowner) Assert(list->refcount > 0); list->refcount--; if (resowner) - ResourceOwnerForgetCatCacheListRef(CurrentResourceOwner, list); + ResourceOwnerForgetCatCacheListRef(resowner, list); if ( #ifndef CATCACHE_FORCE_RELEASE @@ -2236,7 +2243,7 @@ CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, Datum *arguments, { /* Set up keys for a negative cache entry */ oldcxt = MemoryContextSwitchTo(CacheMemoryContext); - ct = (CatCTup *) palloc(sizeof(CatCTup)); + ct = palloc_object(CatCTup); /* * Store keys - they'll point into separately allocated memory if not @@ -2278,21 +2285,18 @@ CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, Datum *arguments, * Helper routine that frees keys stored in the keys array. */ static void -CatCacheFreeKeys(TupleDesc tupdesc, int nkeys, int *attnos, Datum *keys) +CatCacheFreeKeys(TupleDesc tupdesc, int nkeys, const int *attnos, const Datum *keys) { int i; for (i = 0; i < nkeys; i++) { int attnum = attnos[i]; - Form_pg_attribute att; /* system attribute are not supported in caches */ Assert(attnum > 0); - att = TupleDescAttr(tupdesc, attnum - 1); - - if (!att->attbyval) + if (!TupleDescCompactAttr(tupdesc, attnum - 1)->attbyval) pfree(DatumGetPointer(keys[i])); } } @@ -2303,8 +2307,8 @@ CatCacheFreeKeys(TupleDesc tupdesc, int nkeys, int *attnos, Datum *keys) * context. */ static void -CatCacheCopyKeys(TupleDesc tupdesc, int nkeys, int *attnos, - Datum *srckeys, Datum *dstkeys) +CatCacheCopyKeys(TupleDesc tupdesc, int nkeys, const int *attnos, + const Datum *srckeys, Datum *dstkeys) { int i; @@ -2389,7 +2393,7 @@ PrepareToInvalidateCacheTuple(Relation relation, */ Assert(RelationIsValid(relation)); Assert(HeapTupleIsValid(tuple)); - Assert(PointerIsValid(function)); + Assert(function); Assert(CacheHdr != NULL); reloid = RelationGetRelid(relation); diff --git a/src/backend/utils/cache/evtcache.c b/src/backend/utils/cache/evtcache.c index ce596bf563856..7f8e246e8048b 100644 --- a/src/backend/utils/cache/evtcache.c +++ b/src/backend/utils/cache/evtcache.c @@ -78,7 +78,6 @@ BuildEventTriggerCache(void) { HASHCTL ctl; HTAB *cache; - MemoryContext oldcontext; Relation rel; Relation irel; SysScanDesc scan; @@ -110,9 +109,6 @@ BuildEventTriggerCache(void) (Datum) 0); } - /* Switch to correct memory context. */ - oldcontext = MemoryContextSwitchTo(EventTriggerCacheContext); - /* Prevent the memory context from being nuked while we're rebuilding. */ EventTriggerCacheState = ETCS_REBUILD_STARTED; @@ -145,6 +141,7 @@ BuildEventTriggerCache(void) bool evttags_isnull; EventTriggerCacheEntry *entry; bool found; + MemoryContext oldcontext; /* Get next tuple. */ tup = systable_getnext_ordered(scan, ForwardScanDirection); @@ -171,8 +168,11 @@ BuildEventTriggerCache(void) else continue; + /* Switch to correct memory context. */ + oldcontext = MemoryContextSwitchTo(EventTriggerCacheContext); + /* Allocate new cache item. 
*/ - item = palloc0(sizeof(EventTriggerCacheItem)); + item = palloc0_object(EventTriggerCacheItem); item->fnoid = form->evtfoid; item->enabled = form->evtenabled; @@ -188,6 +188,9 @@ BuildEventTriggerCache(void) entry->triggerlist = lappend(entry->triggerlist, item); else entry->triggerlist = list_make1(item); + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); } /* Done with pg_event_trigger scan. */ @@ -195,9 +198,6 @@ BuildEventTriggerCache(void) index_close(irel, AccessShareLock); relation_close(rel, AccessShareLock); - /* Restore previous memory context. */ - MemoryContextSwitchTo(oldcontext); - /* Install new cache. */ EventTriggerCache = cache; @@ -240,6 +240,8 @@ DecodeTextArrayToBitmapset(Datum array) } pfree(elems); + if (arr != DatumGetPointer(array)) + pfree(arr); return bms; } diff --git a/src/backend/utils/cache/funccache.c b/src/backend/utils/cache/funccache.c index 150c502a6121b..afc048a051ead 100644 --- a/src/backend/utils/cache/funccache.c +++ b/src/backend/utils/cache/funccache.c @@ -491,6 +491,7 @@ cached_function_compile(FunctionCallInfo fcinfo, CachedFunctionHashKey hashkey; bool function_valid = false; bool hashkey_valid = false; + bool new_function = false; /* * Lookup the pg_proc tuple by Oid; we'll need it in any case @@ -570,13 +571,15 @@ cached_function_compile(FunctionCallInfo fcinfo, /* * Create the new function struct, if not done already. The function - * structs are never thrown away, so keep them in TopMemoryContext. + * cache entry will be kept for the life of the backend, so put it in + * TopMemoryContext. */ Assert(cacheEntrySize >= sizeof(CachedFunction)); if (function == NULL) { function = (CachedFunction *) MemoryContextAllocZero(TopMemoryContext, cacheEntrySize); + new_function = true; } else { @@ -585,17 +588,36 @@ cached_function_compile(FunctionCallInfo fcinfo, } /* - * Fill in the CachedFunction part. fn_hashkey and use_count remain - * zeroes for now. + * However, if function compilation fails, we'd like not to leak the + * function struct, so use a PG_TRY block to prevent that. (It's up + * to the compile callback function to avoid its own internal leakage + * in such cases.) Unfortunately, freeing the struct is only safe if + * we just allocated it: otherwise there are probably fn_extra + * pointers to it. */ - function->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); - function->fn_tid = procTup->t_self; - function->dcallback = dcallback; + PG_TRY(); + { + /* + * Do the hard, language-specific part. + */ + ccallback(fcinfo, procTup, &hashkey, function, forValidator); + } + PG_CATCH(); + { + if (new_function) + pfree(function); + PG_RE_THROW(); + } + PG_END_TRY(); /* - * Do the hard, language-specific part. + * Fill in the CachedFunction part. (We do this last to prevent the + * function from looking valid before it's fully built.) fn_hashkey + * will be set by cfunc_hashtable_insert; use_count remains zero. */ - ccallback(fcinfo, procTup, &hashkey, function, forValidator); + function->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); + function->fn_tid = procTup->t_self; + function->dcallback = dcallback; /* * Add the completed struct to the hash table. diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 02505c88b8e4c..11ed876264c11 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -98,9 +98,9 @@ * likewise send the invalidation immediately, before ending the change's * critical section. 
This includes inplace heap updates, relmap, and smgr. * - * When wal_level=logical, write invalidations into WAL at each command end to - * support the decoding of the in-progress transactions. See - * CommandEndInvalidationMessages. + * When effective_wal_level is 'logical', write invalidations into WAL at + * each command end to support the decoding of the in-progress transactions. + * See CommandEndInvalidationMessages. * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -758,7 +758,7 @@ PrepareInplaceInvalidationState(void) Assert(inplaceInvalInfo == NULL); /* gone after WAL insertion CritSection ends, so use current context */ - myInfo = (InvalidationInfo *) palloc0(sizeof(InvalidationInfo)); + myInfo = palloc0_object(InvalidationInfo); /* Stash our messages past end of the transactional messages, if any. */ if (transInvalInfo != NULL) @@ -1419,7 +1419,7 @@ CommandEndInvalidationMessages(void) ProcessInvalidationMessages(&transInvalInfo->ii.CurrentCmdInvalidMsgs, LocalExecuteInvalidationMessage); - /* WAL Log per-command invalidation messages for wal_level=logical */ + /* WAL Log per-command invalidation messages for logical decoding */ if (XLogLogicalInfoActive()) LogLogicalInvalidations(); @@ -1480,7 +1480,7 @@ CacheInvalidateHeapTupleCommon(Relation relation, else PrepareToInvalidateCacheTuple(relation, tuple, newtuple, RegisterCatcacheInvalidation, - (void *) info); + info); /* * Now, is this tuple one of the primary definers of a relcache entry? See @@ -1583,13 +1583,17 @@ CacheInvalidateHeapTuple(Relation relation, * implied. * * Like CacheInvalidateHeapTuple(), but for inplace updates. + * + * Just before and just after the inplace update, the tuple's cache keys must + * match those in key_equivalent_tuple. Cache keys consist of catcache lookup + * key columns and columns referencing pg_class.oid values, + * e.g. pg_constraint.conrelid, which would trigger relcache inval. 
*/ void CacheInvalidateHeapTupleInplace(Relation relation, - HeapTuple tuple, - HeapTuple newtuple) + HeapTuple key_equivalent_tuple) { - CacheInvalidateHeapTupleCommon(relation, tuple, newtuple, + CacheInvalidateHeapTupleCommon(relation, key_equivalent_tuple, NULL, PrepareInplaceInvalidationState); } @@ -1753,7 +1757,7 @@ CacheInvalidateSmgr(RelFileLocatorBackend rlocator) SharedInvalidationMessage msg; /* verify optimization stated above stays valid */ - StaticAssertStmt(MAX_BACKENDS_BITS <= 23, + StaticAssertDecl(MAX_BACKENDS_BITS <= 23, "MAX_BACKENDS_BITS is too big for inval.c"); msg.sm.id = SHAREDINVALSMGR_ID; diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index c460a72b75d90..5aa7a26d95c3f 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -26,6 +26,7 @@ #include "catalog/pg_class.h" #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" +#include "catalog/pg_database.h" #include "catalog/pg_index.h" #include "catalog/pg_language.h" #include "catalog/pg_namespace.h" @@ -701,8 +702,7 @@ get_op_index_interpretation(Oid opno) if (cmptype == COMPARE_INVALID) continue; - thisresult = (OpIndexInterpretation *) - palloc(sizeof(OpIndexInterpretation)); + thisresult = palloc_object(OpIndexInterpretation); thisresult->opfamily_id = op_form->amopfamily; thisresult->cmptype = cmptype; thisresult->oplefttype = op_form->amoplefttype; @@ -747,8 +747,7 @@ get_op_index_interpretation(Oid opno) continue; /* OK, report it as COMPARE_NE */ - thisresult = (OpIndexInterpretation *) - palloc(sizeof(OpIndexInterpretation)); + thisresult = palloc_object(OpIndexInterpretation); thisresult->opfamily_id = op_form->amopfamily; thisresult->cmptype = COMPARE_NE; thisresult->oplefttype = op_form->amoplefttype; @@ -1247,6 +1246,32 @@ get_constraint_type(Oid conoid) return contype; } +/* ---------- DATABASE CACHE ---------- */ + +/* + * get_database_name - given a database OID, look up the name + * + * Returns a palloc'd string, or NULL if no such database. 
+ */ +char * +get_database_name(Oid dbid) +{ + HeapTuple dbtuple; + char *result; + + dbtuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(dbid)); + if (HeapTupleIsValid(dbtuple)) + { + result = pstrdup(NameStr(((Form_pg_database) GETSTRUCT(dbtuple))->datname)); + ReleaseSysCache(dbtuple); + } + else + result = NULL; + + return result; +} + + /* ---------- LANGUAGE CACHE ---------- */ char * @@ -3817,7 +3842,7 @@ get_subscription_oid(const char *subname, bool missing_ok) Oid oid; oid = GetSysCacheOid2(SUBSCRIPTIONNAME, Anum_pg_subscription_oid, - MyDatabaseId, CStringGetDatum(subname)); + ObjectIdGetDatum(MyDatabaseId), CStringGetDatum(subname)); if (!OidIsValid(oid) && !missing_ok) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c index f5d7d70def0e8..67e884400387e 100644 --- a/src/backend/utils/cache/partcache.c +++ b/src/backend/utils/cache/partcache.c @@ -167,18 +167,18 @@ RelationBuildPartitionKey(Relation relation) /* Allocate assorted arrays in the partkeycxt, which we'll fill below */ oldcxt = MemoryContextSwitchTo(partkeycxt); - key->partattrs = (AttrNumber *) palloc0(key->partnatts * sizeof(AttrNumber)); - key->partopfamily = (Oid *) palloc0(key->partnatts * sizeof(Oid)); - key->partopcintype = (Oid *) palloc0(key->partnatts * sizeof(Oid)); - key->partsupfunc = (FmgrInfo *) palloc0(key->partnatts * sizeof(FmgrInfo)); - - key->partcollation = (Oid *) palloc0(key->partnatts * sizeof(Oid)); - key->parttypid = (Oid *) palloc0(key->partnatts * sizeof(Oid)); - key->parttypmod = (int32 *) palloc0(key->partnatts * sizeof(int32)); - key->parttyplen = (int16 *) palloc0(key->partnatts * sizeof(int16)); - key->parttypbyval = (bool *) palloc0(key->partnatts * sizeof(bool)); - key->parttypalign = (char *) palloc0(key->partnatts * sizeof(char)); - key->parttypcoll = (Oid *) palloc0(key->partnatts * sizeof(Oid)); + key->partattrs = palloc0_array(AttrNumber, key->partnatts); + key->partopfamily = palloc0_array(Oid, key->partnatts); + key->partopcintype = palloc0_array(Oid, key->partnatts); + key->partsupfunc = palloc0_array(FmgrInfo, key->partnatts); + + key->partcollation = palloc0_array(Oid, key->partnatts); + key->parttypid = palloc0_array(Oid, key->partnatts); + key->parttypmod = palloc0_array(int32, key->partnatts); + key->parttyplen = palloc0_array(int16, key->partnatts); + key->parttypbyval = palloc0_array(bool, key->partnatts); + key->parttypalign = palloc0_array(char, key->partnatts); + key->parttypcoll = palloc0_array(Oid, key->partnatts); MemoryContextSwitchTo(oldcxt); /* determine support function number to search for */ diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 89a1c79e984d1..45261caaf477a 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -207,7 +207,7 @@ CreateCachedPlan(RawStmt *raw_parse_tree, */ oldcxt = MemoryContextSwitchTo(source_context); - plansource = (CachedPlanSource *) palloc0(sizeof(CachedPlanSource)); + plansource = palloc0_object(CachedPlanSource); plansource->magic = CACHEDPLANSOURCE_MAGIC; plansource->raw_parse_tree = copyObject(raw_parse_tree); plansource->analyzed_parse_tree = NULL; @@ -307,7 +307,7 @@ CreateOneShotCachedPlan(RawStmt *raw_parse_tree, * Create and fill the CachedPlanSource struct within the caller's memory * context. Most fields are just left empty for the moment. 
*/ - plansource = (CachedPlanSource *) palloc0(sizeof(CachedPlanSource)); + plansource = palloc0_object(CachedPlanSource); plansource->magic = CACHEDPLANSOURCE_MAGIC; plansource->raw_parse_tree = raw_parse_tree; plansource->analyzed_parse_tree = NULL; @@ -463,14 +463,13 @@ CompleteCachedPlan(CachedPlanSource *plansource, /* * Save the final parameter types (or other parameter specification data) - * into the source_context, as well as our other parameters. Also save - * the result tuple descriptor. + * into the source_context, as well as our other parameters. */ MemoryContextSwitchTo(source_context); if (num_params > 0) { - plansource->param_types = (Oid *) palloc(num_params * sizeof(Oid)); + plansource->param_types = palloc_array(Oid, num_params); memcpy(plansource->param_types, param_types, num_params * sizeof(Oid)); } else @@ -480,9 +479,25 @@ CompleteCachedPlan(CachedPlanSource *plansource, plansource->parserSetupArg = parserSetupArg; plansource->cursor_options = cursor_options; plansource->fixed_result = fixed_result; - plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); + /* + * Also save the result tuple descriptor. PlanCacheComputeResultDesc may + * leak some cruft; normally we just accept that to save a copy step, but + * in USE_VALGRIND mode be tidy by running it in the caller's context. + */ +#ifdef USE_VALGRIND + MemoryContextSwitchTo(oldcxt); + plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); + if (plansource->resultDesc) + { + MemoryContextSwitchTo(source_context); + plansource->resultDesc = CreateTupleDescCopy(plansource->resultDesc); + MemoryContextSwitchTo(oldcxt); + } +#else + plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); MemoryContextSwitchTo(oldcxt); +#endif plansource->is_complete = true; plansource->is_valid = true; @@ -1104,7 +1119,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, /* * Create and fill the CachedPlan struct within the new context. */ - plan = (CachedPlan *) palloc(sizeof(CachedPlan)); + plan = palloc_object(CachedPlan); plan->magic = CACHEDPLAN_MAGIC; plan->stmt_list = plist; @@ -1283,6 +1298,7 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, CachedPlan *plan = NULL; List *qlist; bool customplan; + ListCell *lc; /* Assert caller is doing things in a sane order */ Assert(plansource->magic == CACHEDPLANSOURCE_MAGIC); @@ -1385,6 +1401,13 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, plan->is_saved = true; } + foreach(lc, plan->stmt_list) + { + PlannedStmt *pstmt = (PlannedStmt *) lfirst(lc); + + pstmt->planOrigin = customplan ? 
PLAN_STMT_CACHE_CUSTOM : PLAN_STMT_CACHE_GENERIC; + } + return plan; } @@ -1668,7 +1691,7 @@ CopyCachedPlan(CachedPlanSource *plansource) oldcxt = MemoryContextSwitchTo(source_context); - newsource = (CachedPlanSource *) palloc0(sizeof(CachedPlanSource)); + newsource = palloc0_object(CachedPlanSource); newsource->magic = CACHEDPLANSOURCE_MAGIC; newsource->raw_parse_tree = copyObject(plansource->raw_parse_tree); newsource->analyzed_parse_tree = copyObject(plansource->analyzed_parse_tree); @@ -1677,8 +1700,7 @@ CopyCachedPlan(CachedPlanSource *plansource) newsource->commandTag = plansource->commandTag; if (plansource->num_params > 0) { - newsource->param_types = (Oid *) - palloc(plansource->num_params * sizeof(Oid)); + newsource->param_types = palloc_array(Oid, plansource->num_params); memcpy(newsource->param_types, plansource->param_types, plansource->num_params * sizeof(Oid)); } @@ -1817,7 +1839,7 @@ GetCachedExpression(Node *expr) oldcxt = MemoryContextSwitchTo(cexpr_context); - cexpr = (CachedExpression *) palloc(sizeof(CachedExpression)); + cexpr = palloc_object(CachedExpression); cexpr->magic = CACHEDEXPR_MAGIC; cexpr->expr = copyObject(expr); cexpr->is_valid = true; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 559ba9cdb2cde..32d548677e2b8 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -422,7 +422,7 @@ AllocateRelationDesc(Form_pg_class relp) /* * allocate and zero space for new relation descriptor */ - relation = (Relation) palloc0(sizeof(RelationData)); + relation = palloc0_object(RelationData); /* make sure relation is marked as having no open file yet */ relation->rd_smgr = NULL; @@ -1902,7 +1902,7 @@ formrdesc(const char *relationName, Oid relationReltype, /* * allocate new relation desc, clear all fields of reldesc */ - relation = (Relation) palloc0(sizeof(RelationData)); + relation = palloc0_object(RelationData); /* make sure relation is marked as having no open file yet */ relation->rd_smgr = NULL; @@ -1994,7 +1994,7 @@ formrdesc(const char *relationName, Oid relationReltype, /* mark not-null status */ if (has_not_null) { - TupleConstr *constr = (TupleConstr *) palloc0(sizeof(TupleConstr)); + TupleConstr *constr = palloc0_object(TupleConstr); constr->has_not_null = true; relation->rd_att->constr = constr; @@ -2132,6 +2132,10 @@ RelationIdGetRelation(Oid relationId) Assert(rd->rd_isvalid || (rd->rd_isnailed && !criticalRelcachesBuilt)); } + + /* Consistency check to be paranoid introducing parallel temp scan. */ + Assert(!(rd != NULL && RelationUsesLocalBuffers(rd) && IsParallelWorker() && dirtied_localbufs != 0)); + return rd; } @@ -2142,6 +2146,10 @@ RelationIdGetRelation(Oid relationId) rd = RelationBuildDesc(relationId, true); if (RelationIsValid(rd)) RelationIncrementReferenceCount(rd); + + /* Consistency check to be paranoid introducing parallel temp scan. 
*/ + Assert(!(rd != NULL && RelationUsesLocalBuffers(rd) && IsParallelWorker() && dirtied_localbufs != 0)); + return rd; } @@ -2896,7 +2904,7 @@ RelationForgetRelation(Oid rid) RelationIdCacheLookup(rid, relation); - if (!PointerIsValid(relation)) + if (!relation) return; /* not in cache, nothing to do */ if (!RelationHasReferenceCountZero(relation)) @@ -2941,7 +2949,7 @@ RelationCacheInvalidateEntry(Oid relationId) RelationIdCacheLookup(relationId, relation); - if (PointerIsValid(relation)) + if (relation) { relcacheInvalsReceived++; RelationFlushRelation(relation); @@ -3184,7 +3192,7 @@ AssertPendingSyncs_RelationCache(void) if ((LockTagType) locallock->tag.lock.locktag_type != LOCKTAG_RELATION) continue; - relid = ObjectIdGetDatum(locallock->tag.lock.locktag_field2); + relid = locallock->tag.lock.locktag_field2; r = RelationIdGetRelation(relid); if (!RelationIsValid(r)) continue; @@ -3579,7 +3587,7 @@ RelationBuildLocalRelation(const char *relname, /* * allocate a new relation descriptor and fill in basic state fields. */ - rel = (Relation) palloc0(sizeof(RelationData)); + rel = palloc0_object(RelationData); /* make sure relation is marked as having no open file yet */ rel->rd_smgr = NULL; @@ -3627,7 +3635,7 @@ RelationBuildLocalRelation(const char *relname, if (has_not_null) { - TupleConstr *constr = (TupleConstr *) palloc0(sizeof(TupleConstr)); + TupleConstr *constr = palloc0_object(TupleConstr); constr->has_not_null = true; rel->rd_att->constr = constr; @@ -4658,12 +4666,6 @@ CheckNNConstraintFetch(Relation relation) break; } - check[found].ccenforced = conform->conenforced; - check[found].ccvalid = conform->convalidated; - check[found].ccnoinherit = conform->connoinherit; - check[found].ccname = MemoryContextStrdup(CacheMemoryContext, - NameStr(conform->conname)); - /* Grab and test conbin is actually set */ val = fastgetattr(htup, Anum_pg_constraint_conbin, @@ -4676,7 +4678,13 @@ CheckNNConstraintFetch(Relation relation) /* detoast and convert to cstring in caller's context */ char *s = TextDatumGetCString(val); + check[found].ccenforced = conform->conenforced; + check[found].ccvalid = conform->convalidated; + check[found].ccnoinherit = conform->connoinherit; + check[found].ccname = MemoryContextStrdup(CacheMemoryContext, + NameStr(conform->conname)); check[found].ccbin = MemoryContextStrdup(CacheMemoryContext, s); + pfree(s); found++; } @@ -5643,7 +5651,7 @@ RelationGetIdentityKeyBitmap(Relation relation) * This should be called only for an index that is known to have an associated * exclusion constraint or primary key/unique constraint using WITHOUT * OVERLAPS. - + * * It returns arrays (palloc'd in caller's context) of the exclusion operator * OIDs, their underlying functions' OIDs, and their strategy numbers in the * index's opclasses. 
We cache all this information since it requires a fair @@ -5670,9 +5678,9 @@ RelationGetExclusionInfo(Relation indexRelation, indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRelation); /* Allocate result space in caller context */ - *operators = ops = (Oid *) palloc(sizeof(Oid) * indnkeyatts); - *procs = funcs = (Oid *) palloc(sizeof(Oid) * indnkeyatts); - *strategies = strats = (uint16 *) palloc(sizeof(uint16) * indnkeyatts); + *operators = ops = palloc_array(Oid, indnkeyatts); + *procs = funcs = palloc_array(Oid, indnkeyatts); + *strategies = strats = palloc_array(uint16, indnkeyatts); /* Quick exit if we have the data cached already */ if (indexRelation->rd_exclstrats != NULL) @@ -5763,9 +5771,9 @@ RelationGetExclusionInfo(Relation indexRelation, /* Save a copy of the results in the relcache entry. */ oldcxt = MemoryContextSwitchTo(indexRelation->rd_indexcxt); - indexRelation->rd_exclops = (Oid *) palloc(sizeof(Oid) * indnkeyatts); - indexRelation->rd_exclprocs = (Oid *) palloc(sizeof(Oid) * indnkeyatts); - indexRelation->rd_exclstrats = (uint16 *) palloc(sizeof(uint16) * indnkeyatts); + indexRelation->rd_exclops = palloc_array(Oid, indnkeyatts); + indexRelation->rd_exclprocs = palloc_array(Oid, indnkeyatts); + indexRelation->rd_exclstrats = palloc_array(uint16, indnkeyatts); memcpy(indexRelation->rd_exclops, ops, sizeof(Oid) * indnkeyatts); memcpy(indexRelation->rd_exclprocs, funcs, sizeof(Oid) * indnkeyatts); memcpy(indexRelation->rd_exclstrats, strats, sizeof(uint16) * indnkeyatts); @@ -5959,7 +5967,7 @@ RelationBuildPublicationDesc(Relation relation, PublicationDesc *pubdesc) /* Now save copy of the descriptor in the relcache entry. */ oldcxt = MemoryContextSwitchTo(CacheMemoryContext); - relation->rd_pubdesc = palloc(sizeof(PublicationDesc)); + relation->rd_pubdesc = palloc_object(PublicationDesc); memcpy(relation->rd_pubdesc, pubdesc, sizeof(PublicationDesc)); MemoryContextSwitchTo(oldcxt); } @@ -5967,7 +5975,7 @@ RelationBuildPublicationDesc(Relation relation, PublicationDesc *pubdesc) static bytea ** CopyIndexAttOptions(bytea **srcopts, int natts) { - bytea **opts = palloc(sizeof(*opts) * natts); + bytea **opts = palloc_array(bytea *, natts); for (int i = 0; i < natts; i++) { @@ -5999,7 +6007,7 @@ RelationGetIndexAttOptions(Relation relation, bool copy) return copy ? CopyIndexAttOptions(opts, natts) : opts; /* Get and parse opclass options. */ - opts = palloc0(sizeof(*opts) * natts); + opts = palloc0_array(bytea *, natts); for (i = 0; i < natts; i++) { @@ -6292,7 +6300,7 @@ load_relcache_init_file(bool shared) /* mark not-null status */ if (has_not_null) { - TupleConstr *constr = (TupleConstr *) palloc0(sizeof(TupleConstr)); + TupleConstr *constr = palloc0_object(TupleConstr); constr->has_not_null = true; rel->rd_att->constr = constr; @@ -6991,5 +6999,5 @@ ResOwnerReleaseRelation(Datum res) Assert(rel->rd_refcnt > 0); rel->rd_refcnt -= 1; - RelationCloseCleanup((Relation) res); + RelationCloseCleanup((Relation) DatumGetPointer(res)); } diff --git a/src/backend/utils/cache/relfilenumbermap.c b/src/backend/utils/cache/relfilenumbermap.c index 8a2f6f8c69318..0b6f9cf3fa191 100644 --- a/src/backend/utils/cache/relfilenumbermap.c +++ b/src/backend/utils/cache/relfilenumbermap.c @@ -130,6 +130,11 @@ InitializeRelfilenumberMap(void) * Map a relation's (tablespace, relfilenumber) to a relation's oid and cache * the result. * + * A temporary relation may share its relfilenumber with a permanent relation + * or temporary relations created in other backends. 
Being able to uniquely + * identify a temporary relation would require a backend's proc number, which + * we do not know about. Hence, this function ignores this case. + * * Returns InvalidOid if no relation matching the criteria could be found. */ Oid @@ -208,6 +213,9 @@ RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber) { Form_pg_class classform = (Form_pg_class) GETSTRUCT(ntp); + if (classform->relpersistence == RELPERSISTENCE_TEMP) + continue; + if (found) elog(ERROR, "unexpected duplicate for tablespace %u, relfilenumber %u", diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index f944453a1d884..0e70a8020b774 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -131,7 +131,7 @@ InitCatalogCache(void) cacheinfo[cacheId].nkeys, cacheinfo[cacheId].key, cacheinfo[cacheId].nbuckets); - if (!PointerIsValid(SysCache[cacheId])) + if (!SysCache[cacheId]) elog(ERROR, "could not initialize cache %u (%d)", cacheinfo[cacheId].reloid, cacheId); /* Accumulate data for OID lists, too */ @@ -211,8 +211,7 @@ SearchSysCache(int cacheId, Datum key3, Datum key4) { - Assert(cacheId >= 0 && cacheId < SysCacheSize && - PointerIsValid(SysCache[cacheId])); + Assert(cacheId >= 0 && cacheId < SysCacheSize && SysCache[cacheId]); return SearchCatCache(SysCache[cacheId], key1, key2, key3, key4); } @@ -221,8 +220,7 @@ HeapTuple SearchSysCache1(int cacheId, Datum key1) { - Assert(cacheId >= 0 && cacheId < SysCacheSize && - PointerIsValid(SysCache[cacheId])); + Assert(cacheId >= 0 && cacheId < SysCacheSize && SysCache[cacheId]); Assert(SysCache[cacheId]->cc_nkeys == 1); return SearchCatCache1(SysCache[cacheId], key1); @@ -232,8 +230,7 @@ HeapTuple SearchSysCache2(int cacheId, Datum key1, Datum key2) { - Assert(cacheId >= 0 && cacheId < SysCacheSize && - PointerIsValid(SysCache[cacheId])); + Assert(cacheId >= 0 && cacheId < SysCacheSize && SysCache[cacheId]); Assert(SysCache[cacheId]->cc_nkeys == 2); return SearchCatCache2(SysCache[cacheId], key1, key2); @@ -243,8 +240,7 @@ HeapTuple SearchSysCache3(int cacheId, Datum key1, Datum key2, Datum key3) { - Assert(cacheId >= 0 && cacheId < SysCacheSize && - PointerIsValid(SysCache[cacheId])); + Assert(cacheId >= 0 && cacheId < SysCacheSize && SysCache[cacheId]); Assert(SysCache[cacheId]->cc_nkeys == 3); return SearchCatCache3(SysCache[cacheId], key1, key2, key3); @@ -254,8 +250,7 @@ HeapTuple SearchSysCache4(int cacheId, Datum key1, Datum key2, Datum key3, Datum key4) { - Assert(cacheId >= 0 && cacheId < SysCacheSize && - PointerIsValid(SysCache[cacheId])); + Assert(cacheId >= 0 && cacheId < SysCacheSize && SysCache[cacheId]); Assert(SysCache[cacheId]->cc_nkeys == 4); return SearchCatCache4(SysCache[cacheId], key1, key2, key3, key4); @@ -459,9 +454,9 @@ GetSysCacheOid(int cacheId, tuple = SearchSysCache(cacheId, key1, key2, key3, key4); if (!HeapTupleIsValid(tuple)) return InvalidOid; - result = heap_getattr(tuple, oidcol, - SysCache[cacheId]->cc_tupdesc, - &isNull); + result = DatumGetObjectId(heap_getattr(tuple, oidcol, + SysCache[cacheId]->cc_tupdesc, + &isNull)); Assert(!isNull); /* columns used as oids should never be NULL */ ReleaseSysCache(tuple); return result; @@ -607,13 +602,12 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, * valid (because the caller recently fetched the tuple via this same * cache), but there are cases where we have to initialize the cache here. 
*/ - if (cacheId < 0 || cacheId >= SysCacheSize || - !PointerIsValid(SysCache[cacheId])) + if (cacheId < 0 || cacheId >= SysCacheSize || !SysCache[cacheId]) elog(ERROR, "invalid cache ID: %d", cacheId); - if (!PointerIsValid(SysCache[cacheId]->cc_tupdesc)) + if (!SysCache[cacheId]->cc_tupdesc) { InitCatCachePhase2(SysCache[cacheId], false); - Assert(PointerIsValid(SysCache[cacheId]->cc_tupdesc)); + Assert(SysCache[cacheId]->cc_tupdesc); } return heap_getattr(tup, attributeNumber, @@ -664,8 +658,7 @@ GetSysCacheHashValue(int cacheId, Datum key3, Datum key4) { - if (cacheId < 0 || cacheId >= SysCacheSize || - !PointerIsValid(SysCache[cacheId])) + if (cacheId < 0 || cacheId >= SysCacheSize || !SysCache[cacheId]) elog(ERROR, "invalid cache ID: %d", cacheId); return GetCatCacheHashValue(SysCache[cacheId], key1, key2, key3, key4); @@ -678,8 +671,7 @@ struct catclist * SearchSysCacheList(int cacheId, int nkeys, Datum key1, Datum key2, Datum key3) { - if (cacheId < 0 || cacheId >= SysCacheSize || - !PointerIsValid(SysCache[cacheId])) + if (cacheId < 0 || cacheId >= SysCacheSize || !SysCache[cacheId]) elog(ERROR, "invalid cache ID: %d", cacheId); return SearchCatCacheList(SysCache[cacheId], nkeys, @@ -701,7 +693,7 @@ SysCacheInvalidate(int cacheId, uint32 hashValue) elog(ERROR, "invalid cache ID: %d", cacheId); /* if this cache isn't initialized yet, no need to do anything */ - if (!PointerIsValid(SysCache[cacheId])) + if (!SysCache[cacheId]) return; CatCacheInvalidate(SysCache[cacheId], hashValue); diff --git a/src/backend/utils/cache/ts_cache.c b/src/backend/utils/cache/ts_cache.c index 18cccd778fd8c..e8ae53238d07a 100644 --- a/src/backend/utils/cache/ts_cache.c +++ b/src/backend/utils/cache/ts_cache.c @@ -321,7 +321,9 @@ lookup_ts_dictionary_cache(Oid dictId) /* * Init method runs in dictionary's private memory context, and we - * make sure the options are stored there too + * make sure the options are stored there too. This typically + * results in a small amount of memory leakage, but it's not worth + * complicating the API for tmplinit functions to avoid it. 
*/ oldcontext = MemoryContextSwitchTo(entry->dictCtx); diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index f9aec38a11fb3..0c17d99d0210f 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -1171,9 +1171,6 @@ load_domaintype_info(TypeCacheEntry *typentry) elog(ERROR, "domain \"%s\" constraint \"%s\" has NULL conbin", NameStr(typTup->typname), NameStr(c->conname)); - /* Convert conbin to C string in caller context */ - constring = TextDatumGetCString(val); - /* Create the DomainConstraintCache object and context if needed */ if (dcc == NULL) { @@ -1189,9 +1186,8 @@ load_domaintype_info(TypeCacheEntry *typentry) dcc->dccRefCount = 0; } - /* Create node trees in DomainConstraintCache's context */ - oldcxt = MemoryContextSwitchTo(dcc->dccContext); - + /* Convert conbin to a node tree, still in caller's context */ + constring = TextDatumGetCString(val); check_expr = (Expr *) stringToNode(constring); /* @@ -1206,10 +1202,13 @@ load_domaintype_info(TypeCacheEntry *typentry) */ check_expr = expression_planner(check_expr); + /* Create only the minimally needed stuff in dccContext */ + oldcxt = MemoryContextSwitchTo(dcc->dccContext); + r = makeNode(DomainConstraintState); r->constrainttype = DOM_CONSTRAINT_CHECK; r->name = pstrdup(NameStr(c->conname)); - r->check_expr = check_expr; + r->check_expr = copyObject(check_expr); r->check_exprstate = NULL; MemoryContextSwitchTo(oldcxt); @@ -2765,7 +2764,7 @@ load_enum_cache_data(TypeCacheEntry *tcache) * through. */ maxitems = 64; - items = (EnumItem *) palloc(sizeof(EnumItem) * maxitems); + items = palloc_array(EnumItem, maxitems); numitems = 0; /* Scan pg_enum for the members of the target enum type. */ diff --git a/src/backend/utils/error/assert.c b/src/backend/utils/error/assert.c index 84b94f5e5f472..9dbd9efe41a64 100644 --- a/src/backend/utils/error/assert.c +++ b/src/backend/utils/error/assert.c @@ -32,8 +32,7 @@ ExceptionalCondition(const char *conditionName, int lineNumber) { /* Report the failure on stderr (or local equivalent) */ - if (!PointerIsValid(conditionName) - || !PointerIsValid(fileName)) + if (!conditionName || !fileName) write_stderr("TRAP: ExceptionalCondition: bad arguments in PID %d\n", (int) getpid()); else diff --git a/src/backend/utils/error/csvlog.c b/src/backend/utils/error/csvlog.c index fdac3c048e36a..c3159ed7d979b 100644 --- a/src/backend/utils/error/csvlog.c +++ b/src/backend/utils/error/csvlog.c @@ -120,7 +120,7 @@ write_csvlog(ErrorData *edata) appendStringInfoChar(&buf, ','); /* session id */ - appendStringInfo(&buf, INT64_HEX_FORMAT ".%x", MyStartTime, MyProcPid); + appendStringInfo(&buf, "%" PRIx64 ".%x", MyStartTime, MyProcPid); appendStringInfoChar(&buf, ','); /* Line number */ diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 47af743990fe9..4c5a928320816 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -542,11 +542,20 @@ errfinish(const char *filename, int lineno, const char *funcname) /* Emit the message to the right places */ EmitErrorReport(); - /* Now free up subsidiary data attached to stack entry, and release it */ - FreeErrorDataContents(edata); - errordata_stack_depth--; + /* + * If this is the outermost recursion level, we can clean up by resetting + * ErrorContext altogether (compare FlushErrorState), which is good + * because it cleans up any random leakages that might have occurred in + * places such as context callback functions. 
If we're nested, we can + * only safely remove the subsidiary data of the current stack entry. + */ + if (errordata_stack_depth == 0 && recursion_depth == 1) + MemoryContextReset(ErrorContext); + else + FreeErrorDataContents(edata); - /* Exit error-handling context */ + /* Release stack entry and exit error-handling context */ + errordata_stack_depth--; MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -1128,12 +1137,15 @@ set_backtrace(ErrorData *edata, int num_skip) nframes = backtrace(buf, lengthof(buf)); strfrms = backtrace_symbols(buf, nframes); - if (strfrms == NULL) - return; - - for (int i = num_skip; i < nframes; i++) - appendStringInfo(&errtrace, "\n%s", strfrms[i]); - free(strfrms); + if (strfrms != NULL) + { + for (int i = num_skip; i < nframes; i++) + appendStringInfo(&errtrace, "\n%s", strfrms[i]); + free(strfrms); + } + else + appendStringInfoString(&errtrace, + "insufficient memory for backtrace generation"); } #else appendStringInfoString(&errtrace, @@ -1762,7 +1774,7 @@ CopyErrorData(void) Assert(CurrentMemoryContext != ErrorContext); /* Copy the struct itself */ - newedata = (ErrorData *) palloc(sizeof(ErrorData)); + newedata = palloc_object(ErrorData); memcpy(newedata, edata, sizeof(ErrorData)); /* @@ -2956,12 +2968,12 @@ log_status_format(StringInfo buf, const char *format, ErrorData *edata) { char strfbuf[128]; - snprintf(strfbuf, sizeof(strfbuf) - 1, INT64_HEX_FORMAT ".%x", + snprintf(strfbuf, sizeof(strfbuf) - 1, "%" PRIx64 ".%x", MyStartTime, MyProcPid); appendStringInfo(buf, "%*s", padding, strfbuf); } else - appendStringInfo(buf, INT64_HEX_FORMAT ".%x", MyStartTime, MyProcPid); + appendStringInfo(buf, "%" PRIx64 ".%x", MyStartTime, MyProcPid); break; case 'p': if (padding != 0) @@ -3783,13 +3795,24 @@ write_stderr(const char *fmt,...) { va_list ap; + va_start(ap, fmt); + vwrite_stderr(fmt, ap); + va_end(ap); +} + + +/* + * Write errors to stderr (or by equal means when stderr is + * not available) - va_list version + */ +void +vwrite_stderr(const char *fmt, va_list ap) +{ #ifdef WIN32 char errbuf[2048]; /* Arbitrary size? */ #endif fmt = _(fmt); - - va_start(ap, fmt); #ifndef WIN32 /* On Unix, we just fprintf to stderr */ vfprintf(stderr, fmt, ap); @@ -3812,5 +3835,4 @@ write_stderr(const char *fmt,...) fflush(stderr); } #endif - va_end(ap); } diff --git a/src/backend/utils/error/jsonlog.c b/src/backend/utils/error/jsonlog.c index 519eacf17f83c..2619f49904201 100644 --- a/src/backend/utils/error/jsonlog.c +++ b/src/backend/utils/error/jsonlog.c @@ -168,7 +168,7 @@ write_jsonlog(ErrorData *edata) } /* Session id */ - appendJSONKeyValueFmt(&buf, "session_id", true, INT64_HEX_FORMAT ".%x", + appendJSONKeyValueFmt(&buf, "session_id", true, "%" PRIx64 ".%x", MyStartTime, MyProcPid); /* Line number */ diff --git a/src/backend/utils/fmgr/dfmgr.c b/src/backend/utils/fmgr/dfmgr.c index 603632581d04a..1366521f471e2 100644 --- a/src/backend/utils/fmgr/dfmgr.c +++ b/src/backend/utils/fmgr/dfmgr.c @@ -99,6 +99,21 @@ load_external_function(const char *filename, const char *funcname, void *lib_handle; void *retval; + /* + * For extensions with hardcoded '$libdir/' library names, we strip the + * prefix to allow the library search path to be used. This is done only + * for simple names (e.g., "$libdir/foo"), not for nested paths (e.g., + * "$libdir/foo/bar"). + * + * For nested paths, 'expand_dynamic_library_name' directly expands the + * '$libdir' macro, so we leave them untouched. 
+ */ + if (strncmp(filename, "$libdir/", 8) == 0) + { + if (first_dir_separator(filename + 8) == NULL) + filename += 8; + } + /* Expand the possibly-abbreviated filename to an exact path name */ fullname = expand_dynamic_library_name(filename); @@ -456,14 +471,6 @@ expand_dynamic_library_name(const char *name) Assert(name); - /* - * If the value starts with "$libdir/", strip that. This is because many - * extensions have hardcoded '$libdir/foo' as their library name, which - * prevents using the path. - */ - if (strncmp(name, "$libdir/", 8) == 0) - name += 8; - have_slash = (first_dir_separator(name) != NULL); if (!have_slash) diff --git a/src/backend/utils/fmgr/fmgr.c b/src/backend/utils/fmgr/fmgr.c index 782291d999832..0fe63c6bb830c 100644 --- a/src/backend/utils/fmgr/fmgr.c +++ b/src/backend/utils/fmgr/fmgr.c @@ -16,6 +16,7 @@ #include "postgres.h" #include "access/detoast.h" +#include "access/htup_details.h" #include "catalog/pg_language.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" @@ -1570,7 +1571,6 @@ InputFunctionCall(FmgrInfo *flinfo, char *str, Oid typioparam, int32 typmod) * This is basically like InputFunctionCall, but the converted Datum is * returned into *result while the function result is true for success or * false for failure. Also, the caller may pass an ErrorSaveContext node. - * (We declare that as "fmNodePtr" to avoid including nodes.h in fmgr.h.) * * If escontext points to an ErrorSaveContext, any "soft" errors detected by * the input function will be reported by filling the escontext struct and @@ -1584,7 +1584,7 @@ InputFunctionCall(FmgrInfo *flinfo, char *str, Oid typioparam, int32 typmod) bool InputFunctionCallSafe(FmgrInfo *flinfo, char *str, Oid typioparam, int32 typmod, - fmNodePtr escontext, + Node *escontext, Datum *result) { LOCAL_FCINFO(fcinfo, 3); @@ -1639,7 +1639,7 @@ InputFunctionCallSafe(FmgrInfo *flinfo, char *str, bool DirectInputFunctionCallSafe(PGFunction func, char *str, Oid typioparam, int32 typmod, - fmNodePtr escontext, + Node *escontext, Datum *result) { LOCAL_FCINFO(fcinfo, 3); @@ -1788,41 +1788,6 @@ OidSendFunctionCall(Oid functionId, Datum val) } -/*------------------------------------------------------------------------- - * Support routines for standard maybe-pass-by-reference datatypes - * - * int8 and float8 can be passed by value if Datum is wide enough. - * (For backwards-compatibility reasons, we allow pass-by-ref to be chosen - * at compile time even if pass-by-val is possible.) - * - * Note: there is only one switch controlling the pass-by-value option for - * both int8 and float8; this is to avoid making things unduly complicated - * for the timestamp types, which might have either representation. 
- *------------------------------------------------------------------------- - */ - -#ifndef USE_FLOAT8_BYVAL /* controls int8 too */ - -Datum -Int64GetDatum(int64 X) -{ - int64 *retval = (int64 *) palloc(sizeof(int64)); - - *retval = X; - return PointerGetDatum(retval); -} - -Datum -Float8GetDatum(float8 X) -{ - float8 *retval = (float8 *) palloc(sizeof(float8)); - - *retval = X; - return PointerGetDatum(retval); -} -#endif /* USE_FLOAT8_BYVAL */ - - /*------------------------------------------------------------------------- * Support routines for toastable datatypes *------------------------------------------------------------------------- diff --git a/src/backend/utils/fmgr/funcapi.c b/src/backend/utils/fmgr/funcapi.c index 5f2317211c9d4..f40879f0617c4 100644 --- a/src/backend/utils/fmgr/funcapi.c +++ b/src/backend/utils/fmgr/funcapi.c @@ -1436,7 +1436,7 @@ get_func_arg_info(HeapTuple procTup, &elems, NULL, &nelems); if (nelems != numargs) /* should not happen */ elog(ERROR, "proargnames must have the same number of elements as the function has arguments"); - *p_argnames = (char **) palloc(sizeof(char *) * numargs); + *p_argnames = palloc_array(char *, numargs); for (i = 0; i < numargs; i++) (*p_argnames)[i] = TextDatumGetCString(elems[i]); } diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 1ad155d446e51..ac94b9e93c6e3 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -22,10 +22,11 @@ * lookup key's hash value as a partition number --- this will work because * of the way calc_bucket() maps hash values to bucket numbers. * - * For hash tables in shared memory, the memory allocator function should - * match malloc's semantics of returning NULL on failure. For hash tables - * in local memory, we typically use palloc() which will throw error on - * failure. The code in this file has to cope with both cases. + * The memory allocator function should match malloc's semantics of returning + * NULL on failure. (This is essential for hash tables in shared memory. + * For hash tables in local memory, we used to use palloc() which will throw + * error on failure; but we no longer do, so it's untested whether this + * module will still cope with that behavior.) * * dynahash.c provides support for these types of lookup keys: * @@ -79,9 +80,8 @@ * are not implemented; otherwise functionality is identical. * * Compilation controls: - * HASH_DEBUG controls some informative traces, mainly for debugging. - * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained; - * when combined with HASH_DEBUG, these are displayed by hdestroy(). + * HASH_STATISTICS causes some usage statistics to be maintained, which can be + * logged by calling hash_stats(). * * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor * concatenation property, in probably unnecessary code 'optimization'. 
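[Illustrative aside, not part of the patch: a minimal sketch of the dynahash calling convention whose statistics the hunks below rework. hash_create, hash_search, and the reworked hash_stats(caller, hashp) are the real entry points; the table name and the DemoEntry struct are purely hypothetical.]

    #include "postgres.h"
    #include "utils/hsearch.h"

    /* Hypothetical entry type; the key must be the first field. */
    typedef struct DemoEntry
    {
        Oid         key;            /* hash key */
        int         count;          /* payload */
    } DemoEntry;

    static HTAB *demo_htab = NULL;

    static void
    demo_bump(Oid oid)
    {
        bool        found;
        DemoEntry  *entry;

        if (demo_htab == NULL)
        {
            HASHCTL     ctl;

            ctl.keysize = sizeof(Oid);
            ctl.entrysize = sizeof(DemoEntry);
            /* HASH_BLOBS selects binary hashing/comparison of the key */
            demo_htab = hash_create("demo table", 128, &ctl,
                                    HASH_ELEM | HASH_BLOBS);
        }

        entry = (DemoEntry *) hash_search(demo_htab, &oid, HASH_ENTER, &found);
        if (!found)
            entry->count = 0;
        entry->count++;

        /* In a HASH_STATISTICS build, this logs the usage counters at DEBUG4 */
        hash_stats(__func__, demo_htab);
    }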
@@ -98,10 +98,10 @@
 
 #include "access/xact.h"
 #include "common/hashfn.h"
+#include "lib/ilist.h"
 #include "port/pg_bitutils.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
-#include "utils/dynahash.h"
 #include "utils/memutils.h"
 
 
@@ -153,7 +153,7 @@ typedef HASHBUCKET *HASHSEGMENT;
 typedef struct
 {
     slock_t     mutex;          /* spinlock for this freelist */
-    long        nentries;       /* number of entries in associated buckets */
+    int64       nentries;       /* number of entries in associated buckets */
     HASHELEMENT *freeList;      /* chain of free elements */
 } FreeListData;
 
@@ -181,8 +181,8 @@ struct HASHHDR
 
     /* These fields can change, but not in a partitioned table */
     /* Also, dsize can't change in a shared table, even if unpartitioned */
-    long        dsize;          /* directory size */
-    long        nsegs;          /* number of allocated segments (<= dsize) */
+    int64       dsize;          /* directory size */
+    int64       nsegs;          /* number of allocated segments (<= dsize) */
     uint32      max_bucket;     /* ID of maximum bucket in use */
     uint32      high_mask;      /* mask to modulo into entire table */
     uint32      low_mask;       /* mask to modulo into lower half of table */
@@ -190,11 +190,12 @@ struct HASHHDR
     /* These fields are fixed at hashtable creation */
     Size        keysize;        /* hash key length in bytes */
     Size        entrysize;      /* total user element size in bytes */
-    long        num_partitions; /* # partitions (must be power of 2), or 0 */
-    long        max_dsize;      /* 'dsize' limit if directory is fixed size */
-    long        ssize;          /* segment size --- must be power of 2 */
+    int64       num_partitions; /* # partitions (must be power of 2), or 0 */
+    int64       max_dsize;      /* 'dsize' limit if directory is fixed size */
+    int64       ssize;          /* segment size --- must be power of 2 */
     int         sshift;         /* segment shift = log2(ssize) */
     int         nelem_alloc;    /* number of entries to allocate at once */
+    bool        isfixed;        /* if true, don't enlarge */
 
 #ifdef HASH_STATISTICS
 
@@ -202,8 +203,9 @@ struct HASHHDR
     * Count statistics here. NB: stats code doesn't bother with mutex, so
     * counts could be corrupted a bit in a partitioned table.
     */
-    long        accesses;
-    long        collisions;
+    uint64      accesses;
+    uint64      collisions;
+    uint64      expansions;
 #endif
 };
 
@@ -227,15 +229,24 @@ struct HTAB
     MemoryContext hcxt;         /* memory context if default allocator used */
     char       *tabname;        /* table name (for error messages) */
     bool        isshared;       /* true if table is in shared memory */
-    bool        isfixed;        /* if true, don't enlarge */
 
     /* freezing a shared table isn't allowed, so we can keep state here */
     bool        frozen;         /* true = no more inserts allowed */
 
     /* We keep local copies of these fixed values to reduce contention */
     Size        keysize;        /* hash key length in bytes */
-    long        ssize;          /* segment size --- must be power of 2 */
+    int64       ssize;          /* segment size --- must be power of 2 */
     int         sshift;         /* segment shift = log2(ssize) */
+
+    /*
+     * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of
+     * all the element blocks they have allocated. This pacifies Valgrind,
+     * which would otherwise often claim that the element blocks are "possibly
+     * lost" for lack of any non-interior pointers to their starts.
+     */
+#ifdef USE_VALGRIND
+    slist_head  element_blocks;
+#endif
 };
 
 /*
@@ -254,12 +265,6 @@
 */
 #define MOD(x,y)               ((x) & ((y)-1))
 
-#ifdef HASH_STATISTICS
-static long hash_accesses,
-            hash_collisions,
-            hash_expansions;
-#endif
-
 /*
 * Private function prototypes
 */
@@ -271,12 +276,13 @@ static bool expand_table(HTAB *hashp);
 static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int  choose_nelem_alloc(Size entrysize);
-static bool init_htab(HTAB *hashp, long nelem);
+static bool init_htab(HTAB *hashp, int64 nelem);
 pg_noreturn static void hash_corrupted(HTAB *hashp);
 static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
                                   HASHBUCKET **bucketptr);
-static long next_pow2_long(long num);
-static int  next_pow2_int(long num);
+static int  my_log2(int64 num);
+static int64 next_pow2_int64(int64 num);
+static int  next_pow2_int(int64 num);
 static void register_seq_scan(HTAB *hashp);
 static void deregister_seq_scan(HTAB *hashp);
 static bool has_seq_scans(HTAB *hashp);
@@ -349,7 +355,7 @@ string_compare(const char *key1, const char *key2, Size keysize)
 * large nelem will penalize hash_seq_search speed without buying much.
 */
 HTAB *
-hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
+hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
 {
     HTAB       *hashp;
     HASHHDR    *hctl;
@@ -618,8 +624,10 @@ hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
         }
     }
 
+    /* Set isfixed if requested, but not till after we build initial entries */
     if (flags & HASH_FIXED_SIZE)
-        hashp->isfixed = true;
+        hctl->isfixed = true;
+
     return hashp;
 }
 
@@ -644,8 +652,10 @@ hdefault(HTAB *hashp)
     hctl->ssize = DEF_SEGSIZE;
     hctl->sshift = DEF_SEGSIZE_SHIFT;
 
+    hctl->isfixed = false;      /* can be enlarged */
+
 #ifdef HASH_STATISTICS
-    hctl->accesses = hctl->collisions = 0;
+    hctl->accesses = hctl->collisions = hctl->expansions = 0;
 #endif
 }
 
@@ -687,7 +697,7 @@ choose_nelem_alloc(Size entrysize)
 * arrays
 */
 static bool
-init_htab(HTAB *hashp, long nelem)
+init_htab(HTAB *hashp, int64 nelem)
 {
     HASHHDR    *hctl = hashp->hctl;
     HASHSEGMENT *segp;
@@ -759,17 +769,6 @@ init_htab(HTAB *hashp, int64 nelem)
     /* Choose number of entries to allocate at a time */
     hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
 
-#ifdef HASH_DEBUG
-    fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
-            "TABLE POINTER ", hashp,
-            "DIRECTORY SIZE ", hctl->dsize,
-            "SEGMENT SIZE ", hctl->ssize,
-            "SEGMENT SHIFT ", hctl->sshift,
-            "MAX BUCKET ", hctl->max_bucket,
-            "HIGH MASK ", hctl->high_mask,
-            "LOW MASK ", hctl->low_mask,
-            "NSEGS ", hctl->nsegs);
-#endif
     return true;
 }
 
@@ -781,10 +780,10 @@ init_htab(HTAB *hashp, int64 nelem)
 *
 * NB: assumes that all hash structure parameters have default values!
 */
 Size
-hash_estimate_size(long num_entries, Size entrysize)
+hash_estimate_size(int64 num_entries, Size entrysize)
 {
     Size        size;
-    long        nBuckets,
+    int64       nBuckets,
                 nSegments,
                 nDirEntries,
                 nElementAllocs,
@@ -792,9 +791,9 @@ hash_estimate_size(int64 num_entries, Size entrysize)
                 elementAllocCnt;
 
     /* estimate number of buckets wanted */
-    nBuckets = next_pow2_long(num_entries);
+    nBuckets = next_pow2_int64(num_entries);
     /* # of segments needed for nBuckets */
-    nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
+    nSegments = next_pow2_int64((nBuckets - 1) / DEF_SEGSIZE + 1);
     /* directory entries */
     nDirEntries = DEF_DIRSIZE;
     while (nDirEntries < nSegments)
@@ -827,17 +826,17 @@ hash_estimate_size(int64 num_entries, Size entrysize)
 *
 * XXX this had better agree with the behavior of init_htab()...
 */
-long
-hash_select_dirsize(long num_entries)
+int64
+hash_select_dirsize(int64 num_entries)
 {
-    long        nBuckets,
+    int64       nBuckets,
                 nSegments,
                 nDirEntries;
 
     /* estimate number of buckets wanted */
-    nBuckets = next_pow2_long(num_entries);
+    nBuckets = next_pow2_int64(num_entries);
     /* # of segments needed for nBuckets */
-    nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
+    nSegments = next_pow2_int64((nBuckets - 1) / DEF_SEGSIZE + 1);
     /* directory entries */
     nDirEntries = DEF_DIRSIZE;
     while (nDirEntries < nSegments)
@@ -872,7 +871,7 @@ hash_destroy(HTAB *hashp)
         /* so this hashtable must have its own context */
         Assert(hashp->hcxt != NULL);
 
-        hash_stats("destroy", hashp);
+        hash_stats(__func__, hashp);
 
         /*
         * Free everything by destroying the hash table's memory context.
@@ -882,19 +881,16 @@ hash_destroy(HTAB *hashp)
 }
 
 void
-hash_stats(const char *where, HTAB *hashp)
+hash_stats(const char *caller, HTAB *hashp)
 {
 #ifdef HASH_STATISTICS
-    fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
-            where, hashp->hctl->accesses, hashp->hctl->collisions);
-
-    fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-            hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
-            hashp->hctl->max_bucket, hashp->hctl->nsegs);
-    fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
-            where, hash_accesses, hash_collisions);
-    fprintf(stderr, "hash_stats: total expansions %ld\n",
-            hash_expansions);
+    HASHHDR    *hctl = hashp->hctl;
+
+    elog(DEBUG4,
+         "hash_stats: Caller: %s Table Name: \"%s\" Accesses: " UINT64_FORMAT " Collisions: " UINT64_FORMAT " Expansions: " UINT64_FORMAT " Entries: " INT64_FORMAT " Key Size: %zu Max Bucket: %u Segment Count: " INT64_FORMAT,
+         caller != NULL ? caller : "(unknown)", hashp->tabname, hctl->accesses,
+         hctl->collisions, hctl->expansions, hash_get_num_entries(hashp),
+         hctl->keysize, hctl->max_bucket, hctl->nsegs);
 #endif
 }
 
@@ -980,7 +976,6 @@ hash_search_with_hash_value(HTAB *hashp,
     HashCompareFunc match;
 
 #ifdef HASH_STATISTICS
-    hash_accesses++;
     hctl->accesses++;
 #endif
 
@@ -998,7 +993,7 @@ hash_search_with_hash_value(HTAB *hashp,
     * Can't split if running in partitioned mode, nor if frozen, nor if
     * table is the subject of any active hash_seq_search scans.
*/ - if (hctl->freeList[0].nentries > (long) hctl->max_bucket && + if (hctl->freeList[0].nentries > (int64) hctl->max_bucket && !IS_PARTITIONED(hctl) && !hashp->frozen && !has_seq_scans(hashp)) (void) expand_table(hashp); @@ -1024,7 +1019,6 @@ hash_search_with_hash_value(HTAB *hashp, prevBucketPtr = &(currBucket->link); currBucket = *prevBucketPtr; #ifdef HASH_STATISTICS - hash_collisions++; hctl->collisions++; #endif } @@ -1158,7 +1152,8 @@ hash_update_hash_key(HTAB *hashp, HashCompareFunc match; #ifdef HASH_STATISTICS - hash_accesses++; + HASHHDR *hctl = hashp->hctl; + hctl->accesses++; #endif @@ -1212,7 +1207,6 @@ hash_update_hash_key(HTAB *hashp, prevBucketPtr = &(currBucket->link); currBucket = *prevBucketPtr; #ifdef HASH_STATISTICS - hash_collisions++; hctl->collisions++; #endif } @@ -1338,11 +1332,11 @@ get_hash_entry(HTAB *hashp, int freelist_idx) /* * hash_get_num_entries -- get the number of entries in a hashtable */ -long +int64 hash_get_num_entries(HTAB *hashp) { int i; - long sum = hashp->hctl->freeList[0].nentries; + int64 sum = hashp->hctl->freeList[0].nentries; /* * We currently don't bother with acquiring the mutexes; it's only @@ -1423,9 +1417,9 @@ hash_seq_search(HASH_SEQ_STATUS *status) HTAB *hashp; HASHHDR *hctl; uint32 max_bucket; - long ssize; - long segment_num; - long segment_ndx; + int64 ssize; + int64 segment_num; + int64 segment_ndx; HASHSEGMENT segp; uint32 curBucket; HASHELEMENT *curElem; @@ -1554,11 +1548,11 @@ expand_table(HTAB *hashp) HASHHDR *hctl = hashp->hctl; HASHSEGMENT old_seg, new_seg; - long old_bucket, + int64 old_bucket, new_bucket; - long new_segnum, + int64 new_segnum, new_segndx; - long old_segnum, + int64 old_segnum, old_segndx; HASHBUCKET *oldlink, *newlink; @@ -1568,7 +1562,7 @@ expand_table(HTAB *hashp) Assert(!IS_PARTITIONED(hctl)); #ifdef HASH_STATISTICS - hash_expansions++; + hctl->expansions++; #endif new_bucket = hctl->max_bucket + 1; @@ -1626,7 +1620,7 @@ expand_table(HTAB *hashp) currElement = nextElement) { nextElement = currElement->link; - if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket) + if ((int64) calc_bucket(hctl, currElement->hashvalue) == old_bucket) { *oldlink = currElement; oldlink = &currElement->link; @@ -1650,9 +1644,9 @@ dir_realloc(HTAB *hashp) { HASHSEGMENT *p; HASHSEGMENT *old_p; - long new_dsize; - long old_dirsize; - long new_dirsize; + int64 new_dsize; + int64 old_dirsize; + int64 new_dirsize; if (hashp->hctl->max_dsize != NO_MAX_DSIZE) return false; @@ -1708,23 +1702,51 @@ element_alloc(HTAB *hashp, int nelem, int freelist_idx) { HASHHDR *hctl = hashp->hctl; Size elementSize; + Size requestSize; + char *allocedBlock; HASHELEMENT *firstElement; HASHELEMENT *tmpElement; HASHELEMENT *prevElement; int i; - if (hashp->isfixed) + if (hctl->isfixed) return false; /* Each element has a HASHELEMENT header plus user data. */ elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize); + requestSize = nelem * elementSize; + + /* Add space for slist_node list link if we need one. */ +#ifdef USE_VALGRIND + if (!hashp->isshared) + requestSize += MAXALIGN(sizeof(slist_node)); +#endif + + /* Allocate the memory. */ CurrentDynaHashCxt = hashp->hcxt; - firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize); + allocedBlock = hashp->alloc(requestSize); - if (!firstElement) + if (!allocedBlock) return false; + /* + * If USE_VALGRIND, each allocated block of elements of a non-shared + * hashtable is chained into a list, so that Valgrind won't think it's + * been leaked. 
+ */ +#ifdef USE_VALGRIND + if (hashp->isshared) + firstElement = (HASHELEMENT *) allocedBlock; + else + { + slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock); + firstElement = (HASHELEMENT *) (allocedBlock + MAXALIGN(sizeof(slist_node))); + } +#else + firstElement = (HASHELEMENT *) allocedBlock; +#endif + /* prepare to link all the new entries into the freelist */ prevElement = NULL; tmpElement = firstElement; @@ -1758,8 +1780,8 @@ hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr) { HASHHDR *hctl = hashp->hctl; HASHSEGMENT segp; - long segment_num; - long segment_ndx; + int64 segment_num; + int64 segment_ndx; uint32 bucket; bucket = calc_bucket(hctl, hashvalue); @@ -1791,26 +1813,22 @@ hash_corrupted(HTAB *hashp) } /* calculate ceil(log base 2) of num */ -int -my_log2(long num) +static int +my_log2(int64 num) { /* * guard against too-large input, which would be invalid for * pg_ceil_log2_*() */ - if (num > LONG_MAX / 2) - num = LONG_MAX / 2; + if (num > PG_INT64_MAX / 2) + num = PG_INT64_MAX / 2; -#if SIZEOF_LONG < 8 - return pg_ceil_log2_32(num); -#else return pg_ceil_log2_64(num); -#endif } -/* calculate first power of 2 >= num, bounded to what will fit in a long */ -static long -next_pow2_long(long num) +/* calculate first power of 2 >= num, bounded to what will fit in a int64 */ +static int64 +next_pow2_int64(int64 num) { /* my_log2's internal range check is sufficient */ return 1L << my_log2(num); @@ -1818,7 +1836,7 @@ next_pow2_long(long num) /* calculate first power of 2 >= num, bounded to what will fit in an int */ static int -next_pow2_int(long num) +next_pow2_int(int64 num) { if (num > INT_MAX / 2) num = INT_MAX / 2; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 43b4dbccc3de6..fec79992c8dea 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -266,62 +266,11 @@ GetBackendTypeDesc(BackendType backendType) switch (backendType) { - case B_INVALID: - backendDesc = gettext_noop("not initialized"); - break; - case B_ARCHIVER: - backendDesc = gettext_noop("archiver"); - break; - case B_AUTOVAC_LAUNCHER: - backendDesc = gettext_noop("autovacuum launcher"); - break; - case B_AUTOVAC_WORKER: - backendDesc = gettext_noop("autovacuum worker"); - break; - case B_BACKEND: - backendDesc = gettext_noop("client backend"); - break; - case B_DEAD_END_BACKEND: - backendDesc = gettext_noop("dead-end client backend"); - break; - case B_BG_WORKER: - backendDesc = gettext_noop("background worker"); - break; - case B_BG_WRITER: - backendDesc = gettext_noop("background writer"); - break; - case B_CHECKPOINTER: - backendDesc = gettext_noop("checkpointer"); - break; - case B_IO_WORKER: - backendDesc = gettext_noop("io worker"); - break; - case B_LOGGER: - backendDesc = gettext_noop("logger"); - break; - case B_SLOTSYNC_WORKER: - backendDesc = gettext_noop("slotsync worker"); - break; - case B_STANDALONE_BACKEND: - backendDesc = gettext_noop("standalone backend"); - break; - case B_STARTUP: - backendDesc = gettext_noop("startup"); - break; - case B_WAL_RECEIVER: - backendDesc = gettext_noop("walreceiver"); - break; - case B_WAL_SENDER: - backendDesc = gettext_noop("walsender"); - break; - case B_WAL_SUMMARIZER: - backendDesc = gettext_noop("walsummarizer"); - break; - case B_WAL_WRITER: - backendDesc = gettext_noop("walwriter"); - break; +#define PG_PROCTYPE(bktype, description, main_func, shmem_attach) \ + case bktype: backendDesc = description; break; +#include 
"postmaster/proctypelist.h" +#undef PG_PROCTYPE } - return backendDesc; } @@ -1099,7 +1048,8 @@ EstimateClientConnectionInfoSpace(void) * Serialize MyClientConnectionInfo for use by parallel workers. */ void -SerializeClientConnectionInfo(Size maxsize, char *start_address) +SerializeClientConnectionInfo(Size maxsize PG_USED_FOR_ASSERTS_ONLY, + char *start_address) { SerializedClientConnectionInfo serialized = {0}; @@ -1183,7 +1133,6 @@ UnlinkLockFiles(int status, Datum arg) /* Should we complain if the unlink fails? */ } /* Since we're about to exit, no need to reclaim storage */ - lock_files = NIL; /* * Lock file removal should always be the last externally visible action diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index c86ceefda940b..b7e94ca45bdb1 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -417,12 +417,11 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype); ctype = TextDatumGetCString(datum); - if (pg_perm_setlocale(LC_COLLATE, collate) == NULL) - ereport(FATAL, - (errmsg("database locale is incompatible with operating system"), - errdetail("The database was initialized with LC_COLLATE \"%s\", " - " which is not recognized by setlocale().", collate), - errhint("Recreate the database with another locale or install the missing locale."))); + /* + * Historcally, we set LC_COLLATE from datcollate, as well. That's no + * longer necessary because all collation behavior is handled through + * pg_locale_t. + */ if (pg_perm_setlocale(LC_CTYPE, ctype) == NULL) ereport(FATAL, @@ -431,10 +430,6 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect " which is not recognized by setlocale().", ctype), errhint("Recreate the database with another locale or install the missing locale."))); - if (strcmp(ctype, "C") == 0 || - strcmp(ctype, "POSIX") == 0) - database_ctype_is_c = true; - init_database_collation(); /* @@ -658,6 +653,9 @@ BaseInit(void) /* Initialize lock manager's local structs */ InitLockManagerAccess(); + /* Initialize logical info WAL logging state */ + InitializeProcessXLogLogicalInfo(); + /* * Initialize replication slots after pgstat. The exit hook might need to * drop ephemeral slots, which in turn triggers stats reporting. 
@@ -1265,7 +1263,7 @@ process_startup_options(Port *port, bool am_superuser)
 
 	maxac = 2 + (strlen(port->cmdline_options) + 1) / 2;
 
-	av = (char **) palloc(maxac * sizeof(char *));
+	av = palloc_array(char *, maxac);
 	ac = 0;
 
 	av[ac++] = "postgres";
diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile
index ad789b31e54b5..806e0e7f0dd46 100644
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -50,11 +50,11 @@ $(eval $(call map_rule,gbk,UCS_to_most.pl,CP936.TXT,GBK))
 $(eval $(call map_rule,johab,UCS_to_JOHAB.pl,JOHAB.TXT))
 $(eval $(call map_rule,uhc,UCS_to_UHC.pl,windows-949-2000.xml))
 $(eval $(call map_rule,euc_jp,UCS_to_EUC_JP.pl,CP932.TXT JIS0212.TXT))
-$(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml))
+$(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb18030-2022.ucm))
 $(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT))
 $(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT))
 $(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT))
-$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.xml))
+$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb18030-2022.ucm))
 $(eval $(call map_rule,big5,UCS_to_BIG5.pl,CP950.TXT BIG5.TXT CP950.TXT))
 $(eval $(call map_rule,euc_jis_2004,UCS_to_EUC_JIS_2004.pl,euc-jis-2004-std.txt))
 $(eval $(call map_rule,shift_jis_2004,UCS_to_SHIFT_JIS_2004.pl,sjis-0213-2004-std.txt))
@@ -75,9 +75,12 @@ BIG5.TXT CNS11643.TXT:
 euc-jis-2004-std.txt sjis-0213-2004-std.txt:
 	$(DOWNLOAD) http://x0213.org/codetable/$(@F)
 
-gb-18030-2000.xml windows-949-2000.xml:
+windows-949-2000.xml:
 	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
 
+gb18030-2022.ucm:
+	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu/refs/heads/main/icu4c/source/data/mappings/$(@F)
+
 GB2312.TXT:
 	$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
index f7776631e4c18..777c6c3d07f10 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
@@ -2,16 +2,17 @@
 #
 # Copyright (c) 2007-2025, PostgreSQL Global Development Group
 #
-# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
 #
-# Generate UTF-8 <--> GB18030 code conversion tables from
-# "gb-18030-2000.xml", obtained from
-# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
+# Generate UTF-8 <--> EUC_CN code conversion tables from
+# "gb18030-2022.ucm", obtained from
+# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/mappings/
 #
 # The lines we care about in the source file look like
-# <a u="009A" b="81 30 83 36"/>
-# where the "u" field is the Unicode code point in hex,
-# and the "b" field is the hex byte sequence for GB18030
+# <UXXXX> \xYY[\xYY...] |n
+# where XXXX is the Unicode code point in hex,
+# and the \xYY... is the hex byte sequence for GB18030,
+# and n is a flag indicating the type of mapping.
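For orientation, mapping lines of the shape described in the new header comment look like this (illustrative entries in the .ucm format, using the well-known GB mappings for U+00A4 and U+4E02; not quoted verbatim from the file):

    <U00A4> \xA1\xE8 |0
    <U4E02> \x81\x40 |0

The trailing |n flag classifies the mapping; |0 marks a round-trip mapping, which is why the rewritten scripts skip every other flag value.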
 use strict;
 use warnings FATAL => 'all';
@@ -22,7 +23,7 @@
 
 # Read the input
 
-my $in_file = "gb-18030-2000.xml";
+my $in_file = "gb18030-2022.ucm";
 
 open(my $in, '<', $in_file)
   || die("cannot open $in_file");
@@ -30,9 +31,18 @@
 
 while (<$in>)
 {
-	next if (!m/<a u="([0-9A-Fa-f]+)" b="([0-9A-Fa-f ]+)"/);
-	my ($u, $c) = ($1, $2);
-	$c =~ s/ //g;
+	next
+	  if (!m/<U([0-9A-Fa-f]+)>\s+
+			((?:\\x[0-9A-Fa-f]{2})+)\s+
+			\|(\d+)/x);
+	my ($u, $c, $flag) = ($1, $2, $3);
+	$c =~ s/\\x//g;
+
+	# We only want round-trip mappings
+	next if ($flag ne '0');
+
 	my $ucs = hex($u);
 	my $code = hex($c);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
index ddcbd6ef0c478..a3be0d5700dc3 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
@@ -5,13 +5,14 @@
 # src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
 #
 # Generate UTF-8 <--> GB18030 code conversion tables from
-# "gb-18030-2000.xml", obtained from
-# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
+# "gb18030-2022.ucm", obtained from
+# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/mappings/
 #
 # The lines we care about in the source file look like
-# <a u="009A" b="81 30 83 36"/>
-# where the "u" field is the Unicode code point in hex,
-# and the "b" field is the hex byte sequence for GB18030
+# <UXXXX> \xYY[\xYY...] |n
+# where XXXX is the Unicode code point in hex,
+# and the \xYY... is the hex byte sequence for GB18030,
+# and n is a flag indicating the type of mapping.
 
 use strict;
 use warnings FATAL => 'all';
@@ -22,7 +23,7 @@
 
 # Read the input
 
-my $in_file = "gb-18030-2000.xml";
+my $in_file = "gb18030-2022.ucm";
 
 open(my $in, '<', $in_file)
   || die("cannot open $in_file");
@@ -30,9 +31,18 @@
 
 while (<$in>)
 {
-	next if (!m/<a u="([0-9A-Fa-f]+)" b="([0-9A-Fa-f ]+)"/);
-	my ($u, $c) = ($1, $2);
-	$c =~ s/ //g;
+	next
+	  if (!m/<U([0-9A-Fa-f]+)>\s+
+			((?:\\x[0-9A-Fa-f]{2})+)\s+
+			\|(\d+)/x);
+	my ($u, $c, $flag) = ($1, $2, $3);
+	$c =~ s/\\x//g;
+
+	# We only want round-trip mappings
+	next if ($flag ne '0');
+
 	my $ucs = hex($u);
 	my $code = hex($c);
 	if ($code >= 0x80 && $ucs >= 0x0080)
diff --git a/src/backend/utils/mb/Unicode/gb-18030-2000.xml b/src/backend/utils/mb/Unicode/gb-18030-2000.xml
deleted file mode 100644
index fbbc9e334e9b3..0000000000000
--- a/src/backend/utils/mb/Unicode/gb-18030-2000.xml
+++ /dev/null
@@ -1,30916 +0,0 @@
-	0x80 appears to be a valid (and unassigned) single-byte code, added to the validity.
-	New mapping data, changing all four-byte mappings to the BMP.
-	Removed mappings to single surrogates.
-	Original table.
-[The remaining ~30,900 deleted lines were the file's <a u="..." b="..."/> character-mapping entries; their XML markup was lost in extraction, so they are not reproduced here.]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
diff --git a/src/backend/utils/mb/Unicode/gb18030_to_utf8.map b/src/backend/utils/mb/Unicode/gb18030_to_utf8.map
index 1c90d48fbaf76..da2b2047db52d 100644
--- a/src/backend/utils/mb/Unicode/gb18030_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/gb18030_to_utf8.map
@@ -1,7 +1,7 @@
 /* src/backend/utils/mb/Unicode/gb18030_to_utf8.map */
 /* This file is generated by src/backend/utils/mb/Unicode/UCS_to_GB18030.pl */
 
-static const uint32 gb18030_to_unicode_tree_table[32795];
+static const uint32 gb18030_to_unicode_tree_table[32911];
 
 static const pg_mb_radix_tree gb18030_to_unicode_tree =
 {
@@ -37,7 +37,7 @@ static const pg_mb_radix_tree gb18030_to_unicode_tree =
 	0x39 /* b4_4_upper */
 };
 
-static const uint32 gb18030_to_unicode_tree_table[32795] =
+static const uint32 gb18030_to_unicode_tree_table[32911] =
 {
 /*** Dummy map, for invalid values - offset 0x00000 ***/
 
@@ -1885,7 +1885,7 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* 94 */ 0xee9799, 0xee979a, 0xee979b, 0xee979c,
 /* 98 */ 0xee979d, 0xee979e, 0xee979f, 0xee97a0,
 /* 9c */ 0xee97a1, 0xee97a2, 0xee97a3, 0xee97a4,
- /* a0 */ 0xee97a5, 0xefbc81, 0xefbc82, 0xefbc83,
+ /* a0 */ 0x000000, 0xefbc81, 0xefbc82, 0xefbc83,
 /* a4 */ 0xefbfa5, 0xefbc85, 0xefbc86, 0xefbc87,
 /* a8 */ 0xefbc88, 0xefbc89, 0xefbc8a, 0xefbc8b,
 /* ac */ 0xefbc8c, 0xefbc8d, 0xefbc8e, 0xefbc8f,
@@ -2052,13 +2052,13 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* cc */ 0x00cebc, 0x00cebd, 0x00cebe, 0x00cebf,
 /* d0 */ 0x00cf80, 0x00cf81, 0x00cf83, 0x00cf84,
 /* d4 */ 0x00cf85, 0x00cf86, 0x00cf87, 0x00cf88,
- /* d8 */ 0x00cf89, 0xee9e8d, 0xee9e8e, 0xee9e8f,
- /* dc */ 0xee9e90, 0xee9e91, 0xee9e92, 0xee9e93,
+ /* d8 */ 0x00cf89, 0xefb890, 0xefb892, 0xefb891,
+ /* dc */ 0xefb893, 0xefb894, 0xefb895, 0xefb896,
 /* e0 */ 0xefb8b5, 0xefb8b6, 0xefb8b9, 0xefb8ba,
 /* e4 */ 0xefb8bf, 0xefb980, 0xefb8bd, 0xefb8be,
 /* e8 */ 0xefb981, 0xefb982, 0xefb983, 0xefb984,
- /* ec */ 0xee9e94, 0xee9e95, 0xefb8bb, 0xefb8bc,
- /* f0 */ 0xefb8b7, 0xefb8b8, 0xefb8b1, 0xee9e96,
+ /* ec */ 0xefb897, 0xefb898, 0xefb8bb, 0xefb8bc,
+ /* f0 */ 0xefb8b7, 0xefb8b8, 0xefb8b1, 0xefb899,
 /* f4 */ 0xefb8b3, 0xefb8b4, 0xee9e97, 0xee9e98,
 /* f8 */ 0xee9e99, 0xee9e9a, 0xee9e9b, 0xee9e9c,
 /* fc */ 0xee9e9d, 0xee9e9e, 0xee9e9f,
@@ -2147,7 +2147,7 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* b0 */ 0x00c3b2, 0x00c5ab, 0x00c3ba, 0x00c794,
 /* b4 */ 0x00c3b9, 0x00c796, 0x00c798, 0x00c79a,
 /* b8 */ 0x00c79c, 0x00c3bc, 0x00c3aa, 0x00c991,
- /* bc */ 0xee9f87, 0x00c584, 0x00c588, 0x00c7b9,
+ /* bc */ 0xe1b8bf, 0x00c584, 0x00c588, 0x00c7b9,
 /* c0 */ 0x00c9a1, 0xee9f89, 0xee9f8a, 0xee9f8b,
 /* c4 */ 0xee9f8c, 0xe38485, 0xe38486, 0xe38487,
 /* c8 */ 0xe38488, 0xe38489, 0xe3848a, 0xe3848b,
@@ -6508,25 +6508,25 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* 4c */ 0xefa8a4, 0xefa8a7, 0xefa8a8, 0xefa8a9,
 /* 50 */ 0xe2ba81, 0xeea096, 0xeea097, 0xeea098,
 /* 54 */ 0xe2ba84, 0xe391b3, 0xe39187, 0xe2ba88,
- /* 58 */ 0xe2ba8b, 0xeea09e, 0xe3969e, 0xe3989a,
+ /* 58 */ 0xe2ba8b, 0xe9beb4, 0xe3969e, 0xe3989a,
 /* 5c */ 0xe3988e, 0xe2ba8c, 0xe2ba97, 0xe3a5ae,
- /* 60 */ 0xe3a498, 0xeea0a6, 0xe3a78f, 0xe3a79f,
- /* 64 */ 0xe3a9b3, 0xe3a790, 0xeea0ab, 0xeea0ac,
+ /* 60 */ 0xe3a498, 0xe9beb5, 0xe3a78f, 0xe3a79f,
+ /* 64 */ 0xe3a9b3, 0xe3a790, 0xe9beb6, 0xe9beb7,
 /* 68 */ 0xe3ad8e, 0xe3b1ae, 0xe3b3a0, 0xe2baa7,
- /* 6c */ 0xeea0b1, 0xeea0b2, 0xe2baaa, 0xe48196,
+ /* 6c */ 0xeea0b1, 0xe9beb8, 0xe2baaa, 0xe48196,
 /* 70 */ 0xe4859f, 0xe2baae, 0xe48cb7, 0xe2bab3,
 /* 74 */ 0xe2bab6, 0xe2bab7, 0xeea0bb, 0xe48eb1,
 /* 78 */ 0xe48eac, 0xe2babb, 0xe48f9d, 0xe49396,
- /* 7c */ 0xe499a1, 0xe4998c, 0xeea183, 0x000000,
+ /* 7c */ 0xe499a1, 0xe4998c, 0xe9beb9, 0x000000,
 /* 80 */ 0xe49ca3, 0xe49ca9, 0xe49dbc, 0xe49e8d,
 /* 84 */ 0xe2bb8a, 0xe4a587, 0xe4a5ba, 0xe4a5bd,
 /* 88 */ 0xe4a682, 0xe4a683, 0xe4a685, 0xe4a686,
 /* 8c */ 0xe4a69f, 0xe4a69b, 0xe4a6b7, 0xe4a6b6,
- /* 90 */ 0xeea194, 0xeea195, 0xe4b2a3, 0xe4b29f,
+ /* 90 */ 0xe9beba, 0xeea195, 0xe4b2a3, 0xe4b29f,
 /* 94 */ 0xe4b2a0, 0xe4b2a1, 0xe4b1b7, 0xe4b2a2,
 /* 98 */ 0xe4b493, 0xe4b494, 0xe4b495, 0xe4b496,
 /* 9c */ 0xe4b497, 0xe4b498, 0xe4b499, 0xe4b6ae,
- /* a0 */ 0xeea1a4, 0xee91a8, 0xee91a9, 0xee91aa,
+ /* a0 */ 0xe9bebb, 0xee91a8, 0xee91a9, 0xee91aa,
 /* a4 */ 0xee91ab, 0xee91ac, 0xee91ad, 0xee91ae,
 /* a8 */ 0xee91af, 0xee91b0, 0xee91b1, 0xee91b2,
 /* ac */ 0xee91b3, 0xee91b4, 0xee91b5, 0xee91b6,
@@ -6558,55 +6558,86 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /*** Four byte table, byte #2: 81xx - offset 0x05f43 ***/
 
 /* 30 */ 0x005f67, 0x000000, 0x000000, 0x000000,
- /* 34 */ 0x000000, 0x000000, 0x005fc1, 0x00603f,
- /* 38 */ 0x006067, 0x0060e5,
+ /* 34 */ 0x000000, 0x005fb9, 0x00602d, 0x0060ab,
+ /* 38 */ 0x0060d3, 0x006151,
 
 /*** Four byte table, byte #2: 82xx - offset 0x05f4d ***/
 
- /* 30 */ 0x006163, 0x0061e1, 0x006235, 0x0062b3,
- /* 34 */ 0x00631c, 0x00639a,
+ /* 30 */ 0x0061cf, 0x00624d, 0x0062a1, 0x00631f,
+ /* 34 */ 0x006388, 0x006406,
 /* 4 trailing zero values shared with next segment */
 
 /*** Four byte table, byte #2: 83xx - offset 0x05f53 ***/
 
 /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* 34 */ 0x000000, 0x000000, 0x0063d2, 0x000000,
+ /* 34 */ 0x000000, 0x000000, 0x00643e, 0x000000,
 /* 38 */ 0x000000, 0x000000,
 
 /*** Four byte table, byte #2: 84xx - offset 0x05f5d ***/
 
- /* 30 */ 0x00644c, 0x0064c6, 0x000000, 0x000000,
+ /* 30 */ 0x0064b8, 0x006532, 0x000000, 0x000000,
 /* 34 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 38 */ 0x000000, 0x000000,
 
 /*** Four byte table, byte #3: 8130xx - offset 0x05f67 ***/
 
- /* 81 */ 0x006544, 0x00654e, 0x006558, 0x006562,
- /* 85 */ 0x00656c, 0x006576, 0x006580, 0x00658a,
- /* 89 */ 0x006594, 0x00659e, 0x0065a8, 0x0065b2,
- /* 8d */ 0x0065bc, 0x0065c6, 0x0065d0, 0x0065da,
- /* 91 */ 0x0065e4, 0x0065ee, 0x0065f8, 0x006602,
- /* 95 */ 0x00660c, 0x006616, 0x006620, 0x00662a,
- /* 99 */ 0x006634, 0x00663e, 0x006648, 0x006652,
- /* 9d */ 0x00665c, 0x006666, 0x006670, 0x00667a,
- /* a1 */ 0x006684, 0x00668e, 0x006698, 0x0066a2,
- /* a5 */ 0x0066ac, 0x0066b6, 0x0066c0, 0x0066ca,
- /* a9 */ 0x0066d4, 0x0066de, 0x0066e8, 0x0066f2,
- /* ad */ 0x0066fc, 0x006706, 0x006710, 0x00671a,
- /* b1 */ 0x006724, 0x00672e, 0x006738, 0x006742,
- /* b5 */ 0x00674c, 0x006756, 0x006760, 0x00676a,
- /* b9 */ 0x006774, 0x00677e, 0x006788, 0x006792,
- /* bd */ 0x00679c, 0x0067a6, 0x0067b0, 0x0067ba,
- /* c1 */ 0x0067c4, 0x0067ce, 0x0067d8, 0x0067e2,
- /* c5 */ 0x0067ec, 0x0067f6, 0x006800, 0x00680a,
- /* c9 */ 0x006814, 0x00681e, 0x006828, 0x006832,
- /* cd */ 0x00683c, 0x006846, 0x006850, 0x00685a,
- /* d1 */ 0x006864, 0x00686e, 0x000000, 0x000000,
+ /* 81 */ 0x0065b0, 0x0065ba, 0x0065c4, 0x0065ce,
+ /* 85 */ 0x0065d8, 0x0065e2, 0x0065ec, 0x0065f6,
+ /* 89 */ 0x006600, 0x00660a, 0x006614, 0x00661e,
+ /* 8d */ 0x006628, 0x006632, 0x00663c, 0x006646,
+ /* 91 */ 0x006650, 0x00665a, 0x006664, 0x00666e,
+ /* 95 */ 0x006678, 0x006682, 0x00668c, 0x006696,
+ /* 99 */ 0x0066a0, 0x0066aa, 0x0066b4, 0x0066be,
+ /* 9d */ 0x0066c8, 0x0066d2, 0x0066dc, 0x0066e6,
+ /* a1 */ 0x0066f0, 0x0066fa, 0x006704, 0x00670e,
+ /* a5 */ 0x006718, 0x006722, 0x00672c, 0x006736,
+ /* a9 */ 0x006740, 0x00674a, 0x006754, 0x00675e,
+ /* ad */ 0x006768, 0x006772, 0x00677c, 0x006786,
+ /* b1 */ 0x006790, 0x00679a, 0x0067a4, 0x0067ae,
+ /* b5 */ 0x0067b8, 0x0067c2, 0x0067cc, 0x0067d6,
+ /* b9 */ 0x0067e0, 0x0067ea, 0x0067f4, 0x0067fe,
+ /* bd */ 0x006808, 0x006812, 0x00681c, 0x006826,
+ /* c1 */ 0x006830, 0x00683a, 0x006844, 0x00684e,
+ /* c5 */ 0x006858, 0x006862, 0x00686c, 0x006876,
+ /* c9 */ 0x006880, 0x00688a, 0x006894, 0x00689e,
+ /* cd */ 0x0068a8, 0x0068b2, 0x0068bc, 0x0068c6,
+ /* d1 */ 0x0068d0, 0x0068da,
+ /* 44 trailing zero values shared with next segment */
+
+ /*** Four byte table, byte #3: 8135xx - offset 0x05fb9 ***/
+
+ /* 81 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* 85 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* 89 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* 8d */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* 91 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* 95 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* 99 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* 9d */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* a1 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* a5 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* a9 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* ad */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* b1 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* b5 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* b9 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* bd */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* c1 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* c5 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* c9 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* cd */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* d1 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* d5 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* d9 */ 0x000000, 0x000000,
- /* 36 trailing zero values shared with next segment */
+ /* d9 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* dd */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* e1 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* e5 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* e9 */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* ed */ 0x000000, 0x000000, 0x000000, 0x000000,
+ /* f1 */ 0x000000, 0x000000, 0x000000, 0x0068e4,
+ /* 10 trailing zero values shared with next segment */
 
- /*** Four byte table, byte #3: 8136xx - offset 0x05fc1 ***/
+ /*** Four byte table, byte #3: 8136xx - offset 0x0602d ***/
 
 /* 81 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 85 */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6617,45 +6648,45 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* 99 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 9d */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* a1 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* a5 */ 0x006878, 0x006882, 0x00688c, 0x006896,
- /* a9 */ 0x0068a0, 0x0068aa, 0x0068b4, 0x0068be,
- /* ad */ 0x0068c8, 0x0068d2, 0x0068dc, 0x0068e6,
- /* b1 */ 0x0068f0, 0x0068fa, 0x006904, 0x00690e,
- /* b5 */ 0x006918, 0x006922, 0x00692c, 0x006936,
- /* b9 */ 0x006940, 0x00694a, 0x006954, 0x00695e,
- /* bd */ 0x006968, 0x006972, 0x00697c, 0x006986,
- /* c1 */ 0x006990, 0x00699a, 0x0069a4, 0x0069ae,
- /* c5 */ 0x0069b8, 0x0069c2, 0x0069cc, 0x0069d6,
- /* c9 */ 0x0069e0, 0x0069ea, 0x0069f4, 0x0069fe,
- /* cd */ 0x006a08, 0x006a12, 0x006a1c, 0x006a26,
- /* d1 */ 0x006a30, 0x006a3a, 0x006a44, 0x006a4e,
- /* d5 */ 0x006a58, 0x006a62, 0x006a6c, 0x006a76,
- /* d9 */ 0x006a80, 0x006a8a, 0x006a94, 0x006a9e,
- /* dd */ 0x006aa8, 0x006ab2, 0x006abc, 0x006ac6,
- /* e1 */ 0x006ad0, 0x006ada, 0x006ae4, 0x006aee,
- /* e5 */ 0x006af8, 0x006b02, 0x006b0c, 0x006b16,
- /* e9 */ 0x006b20, 0x006b2a, 0x006b34, 0x006b3e,
- /* ed */ 0x006b48, 0x006b52, 0x006b5c, 0x006b66,
- /* f1 */ 0x006b70, 0x006b7a, 0x006b84, 0x006b8e,
- /* f5 */ 0x006b98, 0x006ba2, 0x006bac, 0x006bb6,
- /* f9 */ 0x006bc0, 0x006bca, 0x006bd4, 0x006bde,
- /* fd */ 0x006be8, 0x006bf2,
-
- /*** Four byte table, byte #3: 8137xx - offset 0x0603f ***/
-
- /* 81 */ 0x006bfc, 0x006c06, 0x006c10, 0x006c1a,
- /* 85 */ 0x006c24, 0x006c2e, 0x006c38, 0x006c42,
- /* 89 */ 0x006c4c, 0x006c56, 0x006c60, 0x006c6a,
- /* 8d */ 0x006c74, 0x006c7e, 0x006c88, 0x006c92,
- /* 91 */ 0x006c9c, 0x006ca6, 0x006cb0, 0x006cba,
- /* 95 */ 0x006cc4, 0x006cce, 0x006cd8, 0x006ce2,
- /* 99 */ 0x006cec, 0x006cf6, 0x006d00, 0x006d0a,
- /* 9d */ 0x006d14, 0x006d1e, 0x006d28, 0x006d32,
- /* a1 */ 0x006d3c, 0x006d46, 0x006d50, 0x006d5a,
- /* a5 */ 0x006d64, 0x006d6e, 0x006d78, 0x006d82,
+ /* a5 */ 0x0068ec, 0x0068f6, 0x006900, 0x00690a,
+ /* a9 */ 0x006914, 0x00691e, 0x006928, 0x006932,
+ /* ad */ 0x00693c, 0x006946, 0x006950, 0x00695a,
+ /* b1 */ 0x006964, 0x00696e, 0x006978, 0x006982,
+ /* b5 */ 0x00698c, 0x006996, 0x0069a0, 0x0069aa,
+ /* b9 */ 0x0069b4, 0x0069be, 0x0069c8, 0x0069d2,
+ /* bd */ 0x0069dc, 0x0069e6, 0x0069f0, 0x0069fa,
+ /* c1 */ 0x006a04, 0x006a0e, 0x006a18, 0x006a22,
+ /* c5 */ 0x006a2c, 0x006a36, 0x006a40, 0x006a4a,
+ /* c9 */ 0x006a54, 0x006a5e, 0x006a68, 0x006a72,
+ /* cd */ 0x006a7c, 0x006a86, 0x006a90, 0x006a9a,
+ /* d1 */ 0x006aa4, 0x006aae, 0x006ab8, 0x006ac2,
+ /* d5 */ 0x006acc, 0x006ad6, 0x006ae0, 0x006aea,
+ /* d9 */ 0x006af4, 0x006afe, 0x006b08, 0x006b12,
+ /* dd */ 0x006b1c, 0x006b26, 0x006b30, 0x006b3a,
+ /* e1 */ 0x006b44, 0x006b4e, 0x006b58, 0x006b62,
+ /* e5 */ 0x006b6c, 0x006b76, 0x006b80, 0x006b8a,
+ /* e9 */ 0x006b94, 0x006b9e, 0x006ba8, 0x006bb2,
+ /* ed */ 0x006bbc, 0x006bc6, 0x006bd0, 0x006bda,
+ /* f1 */ 0x006be4, 0x006bee, 0x006bf8, 0x006c02,
+ /* f5 */ 0x006c0c, 0x006c16, 0x006c20, 0x006c2a,
+ /* f9 */ 0x006c34, 0x006c3e, 0x006c48, 0x006c52,
+ /* fd */ 0x006c5c, 0x006c66,
+
+ /*** Four byte table, byte #3: 8137xx - offset 0x060ab ***/
+
+ /* 81 */ 0x006c70, 0x006c7a, 0x006c84, 0x006c8e,
+ /* 85 */ 0x006c98, 0x006ca2, 0x006cac, 0x006cb6,
+ /* 89 */ 0x006cc0, 0x006cca, 0x006cd4, 0x006cde,
+ /* 8d */ 0x006ce8, 0x006cf2, 0x006cfc, 0x006d06,
+ /* 91 */ 0x006d10, 0x006d1a, 0x006d24, 0x006d2e,
+ /* 95 */ 0x006d38, 0x006d42, 0x006d4c, 0x006d56,
+ /* 99 */ 0x006d60, 0x006d6a, 0x006d74, 0x006d7e,
+ /* 9d */ 0x006d88, 0x006d92, 0x006d9c, 0x006da6,
+ /* a1 */ 0x006db0, 0x006dba, 0x006dc4, 0x006dce,
+ /* a5 */ 0x006dd8, 0x006de2, 0x006dec, 0x006df6,
 /* 86 trailing zero values shared with next segment */
 
- /*** Four byte table, byte #3: 8138xx - offset 0x06067 ***/
+ /*** Four byte table, byte #3: 8138xx - offset 0x060d3 ***/
 
 /* 81 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 85 */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6688,55 +6719,55 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* f1 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* f5 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* f9 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* fd */ 0x006d8b, 0x006d95,
-
- /*** Four byte table, byte #3: 8139xx - offset 0x060e5 ***/
-
- /* 81 */ 0x006d9f, 0x006da9, 0x006db3, 0x006dbd,
- /* 85 */ 0x006dc7, 0x006dd1, 0x006ddb, 0x006de5,
- /* 89 */ 0x006def, 0x006df9, 0x006e03, 0x006e0d,
- /* 8d */ 0x006e17, 0x006e21, 0x006e2b, 0x006e35,
- /* 91 */ 0x006e3f, 0x006e49, 0x006e53, 0x006e5d,
- /* 95 */ 0x006e67, 0x006e71, 0x006e7b, 0x006e85,
- /* 99 */ 0x006e8f, 0x006e99, 0x006ea3, 0x006ead,
- /* 9d */ 0x006eb7, 0x006ec1, 0x006ecb, 0x006ed5,
- /* a1 */ 0x006edf, 0x006ee9, 0x006ef3, 0x006efd,
- /* a5 */ 0x006f07, 0x006f11, 0x006f1b, 0x006f25,
- /* a9 */ 0x006f2f, 0x006f39, 0x006f43, 0x006f4d,
- /* ad */ 0x006f57, 0x006f61, 0x006f6b, 0x006f75,
- /* b1 */ 0x006f7f, 0x006f89, 0x006f93, 0x006f9d,
- /* b5 */ 0x006fa7, 0x006fb1, 0x006fbb, 0x006fc5,
- /* b9 */ 0x006fcf, 0x006fd9, 0x006fe3, 0x006fed,
- /* bd */ 0x006ff7, 0x007001, 0x00700b, 0x007015,
- /* c1 */ 0x00701f, 0x007029, 0x007033, 0x00703d,
- /* c5 */ 0x007047, 0x007051, 0x00705b, 0x007065,
- /* c9 */ 0x00706f, 0x007079, 0x007083, 0x00708d,
- /* cd */ 0x007097, 0x0070a1, 0x0070ab, 0x0070b5,
- /* d1 */ 0x0070bf, 0x0070c9, 0x0070d3, 0x0070dd,
- /* d5 */ 0x0070e7, 0x0070f1, 0x0070fb, 0x007105,
- /* d9 */ 0x00710f, 0x007119, 0x007123, 0x00712d,
- /* dd */ 0x007137, 0x007141, 0x00714b, 0x007155,
- /* e1 */ 0x00715f, 0x007169, 0x007173, 0x00717d,
- /* e5 */ 0x007187, 0x007191, 0x00719b, 0x0071a5,
- /* e9 */ 0x0071af, 0x0071b9, 0x0071c3, 0x0071cd,
- /* ed */ 0x0071d7, 0x0071e1, 0x0071eb, 0x0071f5,
- /* f1 */ 0x0071ff, 0x007209, 0x007213, 0x00721d,
- /* f5 */ 0x007227, 0x007231, 0x00723b, 0x007245,
- /* f9 */ 0x00724f, 0x007259, 0x007263, 0x00726d,
- /* fd */ 0x007277, 0x007281,
-
- /*** Four byte table, byte #3: 8230xx - offset 0x06163 ***/
-
- /* 81 */ 0x00728b, 0x007295, 0x00729f, 0x0072a9,
- /* 85 */ 0x0072b3, 0x0072bd, 0x0072c7, 0x0072d1,
- /* 89 */ 0x0072db, 0x0072e5, 0x0072ef, 0x0072f9,
- /* 8d */ 0x007303, 0x00730d, 0x007317, 0x007321,
- /* 91 */ 0x00732b, 0x007335, 0x00733f, 0x007349,
- /* 95 */ 0x007353, 0x00735d, 0x007367, 0x007371,
- /* 99 */ 0x00737b, 0x007385, 0x00738f, 0x007399,
- /* 9d */ 0x0073a3, 0x0073ad, 0x0073b7, 0x0073c1,
- /* a1 */ 0x0073cb, 0x0073d5, 0x0073df, 0x0073e9,
- /* a5 */ 0x0073f3, 0x0073fd, 0x000000, 0x000000,
+ /* fd */ 0x006dff, 0x006e09,
+
+ /*** Four byte table, byte #3: 8139xx - offset 0x06151 ***/
+
+ /* 81 */ 0x006e13, 0x006e1d, 0x006e27, 0x006e31,
+ /* 85 */ 0x006e3b, 0x006e45, 0x006e4f, 0x006e59,
+ /* 89 */ 0x006e63, 0x006e6d, 0x006e77, 0x006e81,
+ /* 8d */ 0x006e8b, 0x006e95, 0x006e9f, 0x006ea9,
+ /* 91 */ 0x006eb3, 0x006ebd, 0x006ec7, 0x006ed1,
+ /* 95 */ 0x006edb, 0x006ee5, 0x006eef, 0x006ef9,
+ /* 99 */ 0x006f03, 0x006f0d, 0x006f17, 0x006f21,
+ /* 9d */ 0x006f2b, 0x006f35, 0x006f3f, 0x006f49,
+ /* a1 */ 0x006f53, 0x006f5d, 0x006f67, 0x006f71,
+ /* a5 */ 0x006f7b, 0x006f85, 0x006f8f, 0x006f99,
+ /* a9 */ 0x006fa3, 0x006fad, 0x006fb7, 0x006fc1,
+ /* ad */ 0x006fcb, 0x006fd5, 0x006fdf, 0x006fe9,
+ /* b1 */ 0x006ff3, 0x006ffd, 0x007007, 0x007011,
+ /* b5 */ 0x00701b, 0x007025, 0x00702f, 0x007039,
+ /* b9 */ 0x007043, 0x00704d, 0x007057, 0x007061,
+ /* bd */ 0x00706b, 0x007075, 0x00707f, 0x007089,
+ /* c1 */ 0x007093, 0x00709d, 0x0070a7, 0x0070b1,
+ /* c5 */ 0x0070bb, 0x0070c5, 0x0070cf, 0x0070d9,
+ /* c9 */ 0x0070e3, 0x0070ed, 0x0070f7, 0x007101,
+ /* cd */ 0x00710b, 0x007115, 0x00711f, 0x007129,
+ /* d1 */ 0x007133, 0x00713d, 0x007147, 0x007151,
+ /* d5 */ 0x00715b, 0x007165, 0x00716f, 0x007179,
+ /* d9 */ 0x007183, 0x00718d, 0x007197, 0x0071a1,
+ /* dd */ 0x0071ab, 0x0071b5, 0x0071bf, 0x0071c9,
+ /* e1 */ 0x0071d3, 0x0071dd, 0x0071e7, 0x0071f1,
+ /* e5 */ 0x0071fb, 0x007205, 0x00720f, 0x007219,
+ /* e9 */ 0x007223, 0x00722d, 0x007237, 0x007241,
+ /* ed */ 0x00724b, 0x007255, 0x00725f, 0x007269,
+ /* f1 */ 0x007273, 0x00727d, 0x007287, 0x007291,
+ /* f5 */ 0x00729b, 0x0072a5, 0x0072af, 0x0072b9,
+ /* f9 */ 0x0072c3, 0x0072cd, 0x0072d7, 0x0072e1,
+ /* fd */ 0x0072eb, 0x0072f5,
+
+ /*** Four byte table, byte #3: 8230xx - offset 0x061cf ***/
+
+ /* 81 */ 0x0072ff, 0x007309, 0x007313, 0x00731d,
+ /* 85 */ 0x007327, 0x007331, 0x00733b, 0x007345,
+ /* 89 */ 0x00734f, 0x007359, 0x007363, 0x00736d,
+ /* 8d */ 0x007377, 0x007381, 0x00738b, 0x007395,
+ /* 91 */ 0x00739f, 0x0073a9, 0x0073b3, 0x0073bd,
+ /* 95 */ 0x0073c7, 0x0073d1, 0x0073db, 0x0073e5,
+ /* 99 */ 0x0073ef, 0x0073f9, 0x007403, 0x00740d,
+ /* 9d */ 0x007417, 0x007421, 0x00742b, 0x007435,
+ /* a1 */ 0x00743f, 0x007449, 0x007453, 0x00745d,
+ /* a5 */ 0x007467, 0x007471, 0x000000, 0x000000,
 /* a9 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* ad */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* b1 */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6755,37 +6786,37 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* e5 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* e9 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* ed */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* f1 */ 0x000000, 0x007400, 0x00740a, 0x007414,
- /* f5 */ 0x00741e, 0x007428, 0x007432, 0x00743c,
- /* f9 */ 0x007446, 0x007450, 0x00745a, 0x007464,
- /* fd */ 0x00746e, 0x007478,
-
- /*** Four byte table, byte #3: 8231xx - offset 0x061e1 ***/
-
- /* 81 */ 0x007482, 0x00748c, 0x007496, 0x0074a0,
- /* 85 */ 0x0074aa, 0x0074b4, 0x0074be, 0x0074c8,
- /* 89 */ 0x0074d2, 0x0074dc, 0x0074e6, 0x0074f0,
- /* 8d */ 0x0074fa, 0x007504, 0x00750e, 0x007518,
- /* 91 */ 0x007522, 0x00752c, 0x007536, 0x007540,
- /* 95 */ 0x00754a, 0x007554, 0x00755e, 0x007568,
- /* 99 */ 0x007572, 0x00757c, 0x007586, 0x007590,
- /* 9d */ 0x00759a, 0x0075a4, 0x0075ae, 0x0075b8,
- /* a1 */ 0x0075c2, 0x0075cc, 0x0075d6, 0x0075e0,
- /* a5 */ 0x0075ea, 0x0075f4, 0x0075fe, 0x007608,
- /* a9 */ 0x007612, 0x00761c, 0x007626, 0x007630,
- /* ad */ 0x00763a, 0x007644, 0x00764e, 0x007658,
- /* b1 */ 0x007662, 0x00766c, 0x007676, 0x007680,
- /* b5 */ 0x00768a, 0x007694, 0x00769e, 0x0076a8,
- /* b9 */ 0x0076b2, 0x0076bc, 0x0076c6, 0x0076d0,
- /* bd */ 0x0076da, 0x0076e4, 0x0076ee, 0x0076f8,
- /* c1 */ 0x007702, 0x00770c, 0x007716, 0x007720,
- /* c5 */ 0x00772a, 0x007734, 0x00773e, 0x007748,
- /* c9 */ 0x007752, 0x00775c, 0x007766, 0x007770,
- /* cd */ 0x00777a, 0x007784, 0x00778e, 0x007798,
- /* d1 */ 0x0077a2, 0x0077ac, 0x0077b6, 0x0077c0,
+ /* f1 */ 0x000000, 0x007474, 0x00747e, 0x007488,
+ /* f5 */ 0x007492, 0x00749c, 0x0074a6, 0x0074b0,
+ /* f9 */ 0x0074ba, 0x0074c4, 0x0074ce, 0x0074d8,
+ /* fd */ 0x0074e2, 0x0074ec,
+
+ /*** Four byte table, byte #3: 8231xx - offset 0x0624d ***/
+
+ /* 81 */ 0x0074f6, 0x007500, 0x00750a, 0x007514,
+ /* 85 */ 0x00751e, 0x007528, 0x007532, 0x00753c,
+ /* 89 */ 0x007546, 0x007550, 0x00755a, 0x007564,
+ /* 8d */ 0x00756e, 0x007578, 0x007582, 0x00758c,
+ /* 91 */ 0x007596, 0x0075a0, 0x0075aa, 0x0075b4,
+ /* 95 */ 0x0075be, 0x0075c8, 0x0075d2, 0x0075dc,
+ /* 99 */ 0x0075e6, 0x0075f0, 0x0075fa, 0x007604,
+ /* 9d */ 0x00760e, 0x007618, 0x007622, 0x00762c,
+ /* a1 */ 0x007636, 0x007640, 0x00764a, 0x007654,
+ /* a5 */ 0x00765e, 0x007668, 0x007672, 0x00767c,
+ /* a9 */ 0x007686, 0x007690, 0x00769a, 0x0076a4,
+ /* ad */ 0x0076ae, 0x0076b8, 0x0076c2, 0x0076cc,
+ /* b1 */ 0x0076d6, 0x0076e0, 0x0076ea, 0x0076f4,
+ /* b5 */ 0x0076fe, 0x007708, 0x007712, 0x00771c,
+ /* b9 */ 0x007726, 0x007730, 0x00773a, 0x007744,
+ /* bd */ 0x00774e, 0x007758, 0x007762, 0x00776c,
+ /* c1 */ 0x007776, 0x007780, 0x00778a, 0x007794,
+ /* c5 */ 0x00779e, 0x0077a8, 0x0077b2, 0x0077bc,
+ /* c9 */ 0x0077c6, 0x0077d0, 0x0077da, 0x0077e4,
+ /* cd */ 0x0077ee, 0x0077f8, 0x007802, 0x00780c,
+ /* d1 */ 0x007816, 0x007820, 0x00782a, 0x007834,
 /* 42 trailing zero values shared with next segment */
 
- /*** Four byte table, byte #3: 8232xx - offset 0x06235 ***/
+ /*** Four byte table, byte #3: 8232xx - offset 0x062a1 ***/
 
 /* 81 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 85 */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6798,14 +6829,14 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* a1 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* a5 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* a9 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* ad */ 0x000000, 0x000000, 0x0077c8, 0x0077d2,
- /* b1 */ 0x0077dc, 0x0077e6, 0x0077f0, 0x0077fa,
- /* b5 */ 0x007804, 0x00780e, 0x007818, 0x007822,
- /* b9 */ 0x00782c, 0x007836, 0x007840, 0x00784a,
- /* bd */ 0x007854, 0x00785e, 0x007868, 0x007872,
- /* c1 */ 0x00787c, 0x007886, 0x007890, 0x00789a,
- /* c5 */ 0x0078a4, 0x0078ae, 0x0078b8, 0x0078c2,
- /* c9 */ 0x0078cc, 0x000000, 0x000000, 0x000000,
+ /* ad */ 0x000000, 0x000000, 0x00783c, 0x007846,
+ /* b1 */ 0x007850, 0x00785a, 0x007864, 0x00786e,
+ /* b5 */ 0x007878, 0x007882, 0x00788c, 0x007896,
+ /* b9 */ 0x0078a0, 0x0078aa, 0x0078b4, 0x0078be,
+ /* bd */ 0x0078c8, 0x0078d2, 0x0078dc, 0x0078e6,
+ /* c1 */ 0x0078f0, 0x0078fa, 0x007904, 0x00790e,
+ /* c5 */ 0x007918, 0x007922, 0x00792c, 0x007936,
+ /* c9 */ 0x007940, 0x000000, 0x000000, 0x000000,
 /* cd */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* d1 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* d5 */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6816,21 +6847,21 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* e9 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* ed */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* f1 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* f5 */ 0x000000, 0x000000, 0x000000, 0x0078d3,
- /* f9 */ 0x0078dd, 0x0078e7, 0x0078f1, 0x0078fb,
- /* fd */ 0x007905, 0x00790f,
-
- /*** Four byte table, byte #3: 8233xx - offset 0x062b3 ***/
-
- /* 81 */ 0x007919, 0x007923, 0x00792d, 0x007937,
- /* 85 */ 0x007941, 0x00794b, 0x007955, 0x00795f,
- /* 89 */ 0x007969, 0x007973, 0x00797d, 0x007987,
- /* 8d */ 0x007991, 0x00799b, 0x0079a5, 0x0079af,
- /* 91 */ 0x0079b9, 0x0079c3, 0x0079cd, 0x0079d7,
- /* 95 */ 0x0079e1, 0x0079eb, 0x0079f5, 0x0079ff,
- /* 99 */ 0x007a09, 0x007a13, 0x007a1d, 0x007a27,
- /* 9d */ 0x007a31, 0x007a3b, 0x007a45, 0x007a4f,
- /* a1 */ 0x007a59, 0x007a63, 0x007a6d, 0x000000,
+ /* f5 */ 0x000000, 0x000000, 0x000000, 0x007947,
+ /* f9 */ 0x007951, 0x00795b, 0x007965, 0x00796f,
+ /* fd */ 0x007979, 0x007983,
+
+ /*** Four byte table, byte #3: 8233xx - offset 0x0631f ***/
+
+ /* 81 */ 0x00798d, 0x007997, 0x0079a1, 0x0079ab,
+ /* 85 */ 0x0079b5, 0x0079bf, 0x0079c9, 0x0079d3,
+ /* 89 */ 0x0079dd, 0x0079e7, 0x0079f1, 0x0079fb,
+ /* 8d */ 0x007a05, 0x007a0f, 0x007a19, 0x007a23,
+ /* 91 */ 0x007a2d, 0x007a37, 0x007a41, 0x007a4b,
+ /* 95 */ 0x007a55, 0x007a5f, 0x007a69, 0x007a73,
+ /* 99 */ 0x007a7d, 0x007a87, 0x007a91, 0x007a9b,
+ /* 9d */ 0x007aa5, 0x007aaf, 0x007ab9, 0x007ac3,
+ /* a1 */ 0x007acd, 0x007ad7, 0x007ae1, 0x000000,
 /* a5 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* a9 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* ad */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6840,28 +6871,28 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* bd */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* c1 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* c5 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* c9 */ 0x007a76, 0x007a80, 0x007a8a, 0x007a94,
- /* cd */ 0x007a9e, 0x007aa8, 0x007ab2, 0x007abc,
- /* d1 */ 0x007ac6, 0x007ad0, 0x007ada, 0x007ae4,
- /* d5 */ 0x007aee, 0x007af8, 0x007b02, 0x007b0c,
- /* d9 */ 0x007b16, 0x007b20, 0x007b2a, 0x007b34,
- /* dd */ 0x007b3e, 0x007b48, 0x007b52, 0x007b5c,
- /* e1 */ 0x007b66, 0x007b70, 0x007b7a, 0x007b84,
- /* e5 */ 0x007b8e, 0x007b98, 0x007ba2, 0x007bac,
+ /* c9 */ 0x007aea, 0x007af4, 0x007afe, 0x007b08,
+ /* cd */ 0x007b12, 0x007b1c, 0x007b26, 0x007b30,
+ /* d1 */ 0x007b3a, 0x007b44, 0x007b4e, 0x007b58,
+ /* d5 */ 0x007b62, 0x007b6c, 0x007b76, 0x007b80,
+ /* d9 */ 0x007b8a, 0x007b94, 0x007b9e, 0x007ba8,
+ /* dd */ 0x007bb2, 0x007bbc, 0x007bc6, 0x007bd0,
+ /* e1 */ 0x007bda, 0x007be4, 0x007bee, 0x007bf8,
+ /* e5 */ 0x007c02, 0x007c0c, 0x007c16, 0x007c20,
 /* e9 */ 0x000000,
 /* 21 trailing zero values shared with next segment */
 
- /*** Four byte table, byte #3: 8234xx - offset 0x0631c ***/
+ /*** Four byte table, byte #3: 8234xx - offset 0x06388 ***/
 
 /* 81 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 85 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 89 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 8d */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 91 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* 95 */ 0x000000, 0x007bb4, 0x007bbe, 0x007bc8,
- /* 99 */ 0x007bd2, 0x007bdc, 0x007be6, 0x007bf0,
- /* 9d */ 0x007bfa, 0x007c04, 0x007c0e, 0x007c18,
- /* a1 */ 0x007c22, 0x000000, 0x000000, 0x000000,
+ /* 95 */ 0x000000, 0x007c28, 0x007c32, 0x007c3c,
+ /* 99 */ 0x007c46, 0x007c50, 0x007c5a, 0x007c64,
+ /* 9d */ 0x007c6e, 0x007c78, 0x007c82, 0x007c8c,
+ /* a1 */ 0x007c96, 0x000000, 0x000000, 0x000000,
 /* a5 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* a9 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* ad */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6878,20 +6909,20 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* d9 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* dd */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* e1 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* e5 */ 0x000000, 0x000000, 0x007c28, 0x007c32,
- /* e9 */ 0x007c3c, 0x007c46, 0x007c50, 0x007c5a,
- /* ed */ 0x007c64, 0x007c6e, 0x007c78, 0x007c82,
- /* f1 */ 0x007c8c, 0x007c96, 0x007ca0, 0x007caa,
- /* f5 */ 0x007cb4, 0x007cbe, 0x007cc8, 0x007cd2,
- /* f9 */ 0x007cdc, 0x007ce6, 0x007cf0, 0x007cfa,
- /* fd */ 0x007d04, 0x007d0e,
-
- /*** Four byte table, byte #3: 8235xx - offset 0x0639a ***/
-
- /* 81 */ 0x007d18, 0x007d22, 0x007d2c, 0x007d36,
- /* 85 */ 0x007d40, 0x007d4a, 0x007d54, 0x007d5e,
- /* 89 */ 0x007d68, 0x007d72, 0x007d7c, 0x007d86,
- /* 8d */ 0x007d90, 0x007d9a, 0x007da4, 0x000000,
+ /* e5 */ 0x000000, 0x000000, 0x007c9c, 0x007ca6,
+ /* e9 */ 0x007cb0, 0x007cba, 0x007cc4, 0x007cce,
+ /* ed */ 0x007cd8, 0x007ce2, 0x007cec, 0x007cf6,
+ /* f1 */ 0x007d00, 0x007d0a, 0x007d14, 0x007d1e,
+ /* f5 */ 0x007d28, 0x007d32, 0x007d3c, 0x007d46,
+ /* f9 */ 0x007d50, 0x007d5a, 0x007d64, 0x007d6e,
+ /* fd */ 0x007d78, 0x007d82,
+
+ /*** Four byte table, byte #3: 8235xx - offset 0x06406 ***/
+
+ /* 81 */ 0x007d8c, 0x007d96, 0x007da0, 0x007daa,
+ /* 85 */ 0x007db4, 0x007dbe, 0x007dc8, 0x007dd2,
+ /* 89 */ 0x007ddc, 0x007de6, 0x007df0, 0x007dfa,
+ /* 8d */ 0x007e04, 0x007e0e, 0x007e18, 0x000000,
 /* 91 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 95 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 99 */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6904,7 +6935,7 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* b5 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 70 trailing zero values shared with next segment */
 
- /*** Four byte table, byte #3: 8336xx - offset 0x063d2 ***/
+ /*** Four byte table, byte #3: 8336xx - offset 0x0643e ***/
 
 /* 81 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* 85 */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6923,9 +6954,9 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* b9 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* bd */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* c1 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* c5 */ 0x000000, 0x000000, 0x007da7, 0x007db1,
- /* c9 */ 0x007dbb, 0x007dc5, 0x007dcf, 0x007dd9,
- /* cd */ 0x007de3, 0x007ded, 0x007df7, 0x000000,
+ /* c5 */ 0x000000, 0x000000, 0x007e1b, 0x007e25,
+ /* c9 */ 0x007e2f, 0x007e39, 0x007e43, 0x007e4d,
+ /* cd */ 0x007e57, 0x007e61, 0x007e6b, 0x000000,
 /* d1 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* d5 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* d9 */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6939,15 +6970,15 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* f9 */ 0x000000, 0x000000,
 /* 4 trailing zero values shared with next segment */
 
- /*** Four byte table, byte #3: 8430xx - offset 0x0644c ***/
+ /*** Four byte table, byte #3: 8430xx - offset 0x064b8 ***/
 
 /* 81 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* 85 */ 0x007e01, 0x007e0b, 0x007e15, 0x007e1f,
- /* 89 */ 0x007e29, 0x007e33, 0x007e3d, 0x007e47,
- /* 8d */ 0x007e51, 0x007e5b, 0x007e65, 0x007e6f,
- /* 91 */ 0x007e79, 0x007e83, 0x007e8d, 0x007e97,
- /* 95 */ 0x007ea1, 0x007eab, 0x007eb5, 0x007ebf,
- /* 99 */ 0x007ec9, 0x007ed3, 0x007edd, 0x007ee7,
+ /* 85 */ 0x007e75, 0x007e7f, 0x007e89, 0x007e93,
+ /* 89 */ 0x007e9d, 0x007ea7, 0x007eb1, 0x007ebb,
+ /* 8d */ 0x007ec5, 0x007ecf, 0x007ed9, 0x007ee3,
+ /* 91 */ 0x007eed, 0x007ef7, 0x007f01, 0x007f0b,
+ /* 95 */ 0x007f15, 0x007f1f, 0x007f29, 0x007f33,
+ /* 99 */ 0x007f3d, 0x007f47, 0x007f51, 0x007f5b,
 /* 9d */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* a1 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* a5 */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -6974,17 +7005,17 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* f9 */ 0x000000, 0x000000,
 /* 4 trailing zero values shared with next segment */
 
- /*** Four byte table, byte #3: 8431xx - offset 0x064c6 ***/
+ /*** Four byte table, byte #3: 8431xx - offset 0x06532 ***/
 
 /* 81 */ 0x000000, 0x000000, 0x000000, 0x000000,
- /* 85 */ 0x007eef, 0x007ef9, 0x007f03, 0x007f0d,
- /* 89 */ 0x007f17, 0x007f21, 0x007f2b, 0x007f35,
- /* 8d */ 0x007f3f, 0x007f49, 0x007f53, 0x007f5d,
- /* 91 */ 0x007f67, 0x007f71, 0x007f7b, 0x007f85,
- /* 95 */ 0x007f8f, 0x007f99, 0x007fa3, 0x007fad,
- /* 99 */ 0x007fb7, 0x007fc1, 0x007fcb, 0x007fd5,
- /* 9d */ 0x007fdf, 0x007fe9, 0x007ff3, 0x007ffd,
- /* a1 */ 0x008007, 0x008011, 0x000000, 0x000000,
+ /* 85 */ 0x007f63, 0x007f6d, 0x007f77, 0x007f81,
+ /* 89 */ 0x007f8b, 0x007f95, 0x007f9f, 0x007fa9,
+ /* 8d */ 0x007fb3, 0x007fbd, 0x007fc7, 0x007fd1,
+ /* 91 */ 0x007fdb, 0x007fe5, 0x007fef, 0x007ff9,
+ /* 95 */ 0x008003, 0x00800d, 0x008017, 0x008021,
+ /* 99 */ 0x00802b, 0x008035, 0x00803f, 0x008049,
+ /* 9d */ 0x008053, 0x00805d, 0x008067, 0x008071,
+ /* a1 */ 0x00807b, 0x008085, 0x000000, 0x000000,
 /* a5 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* a9 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* ad */ 0x000000, 0x000000, 0x000000, 0x000000,
@@ -7009,4141 +7040,4147 @@ static const uint32 gb18030_to_unicode_tree_table[32795] =
 /* f9 */ 0x000000, 0x000000, 0x000000, 0x000000,
 /* fd */ 0x000000, 0x000000,
 
- /*** Four byte table, leaf: 813081xx - offset 0x06544 ***/
+ /*** Four byte table, leaf: 813081xx - offset 0x065b0 ***/
 
 /* 30 */ 0x00c280, 0x00c281, 0x00c282, 0x00c283,
 /* 34 */ 0x00c284, 0x00c285, 0x00c286, 0x00c287,
 /* 38 */ 0x00c288, 0x00c289,
 
- /*** Four byte table, leaf: 813082xx - offset 0x0654e ***/
+ /*** Four byte table, leaf: 813082xx - offset 0x065ba ***/
 
 /* 30 */ 0x00c28a, 0x00c28b, 0x00c28c, 0x00c28d,
 /* 34 */ 0x00c28e, 0x00c28f, 0x00c290, 0x00c291,
 /* 38 */ 0x00c292, 0x00c293,
 
- /*** Four byte table, leaf: 813083xx - offset 0x06558 ***/
+ /*** Four byte table, leaf: 813083xx - offset 0x065c4 ***/
 
 /* 30 */ 0x00c294, 0x00c295, 0x00c296, 0x00c297,
 /* 34 */ 0x00c298, 0x00c299, 0x00c29a, 0x00c29b,
 /* 38 */ 0x00c29c, 0x00c29d,
 
- /*** Four byte table, leaf: 813084xx - offset 0x06562 ***/
+ /*** Four byte table, leaf: 813084xx - offset 0x065ce ***/
 
 /* 30 */ 0x00c29e, 0x00c29f, 0x00c2a0, 0x00c2a1,
 /* 34 */ 0x00c2a2, 0x00c2a3, 0x00c2a5, 0x00c2a6,
 /* 38 */ 0x00c2a9, 0x00c2aa,
 
- /*** Four byte table, leaf: 813085xx - offset 0x0656c ***/
+ /*** Four byte table, leaf: 813085xx - offset 0x065d8 ***/
 
 /* 30 */ 0x00c2ab, 0x00c2ac, 0x00c2ad, 0x00c2ae,
 /* 34 */ 0x00c2af, 0x00c2b2, 0x00c2b3, 0x00c2b4,
 /* 38 */ 0x00c2b5, 0x00c2b6,
 
- /*** Four byte table, leaf: 813086xx - offset 0x06576 ***/
+ /*** Four byte table, leaf: 813086xx - offset 0x065e2 ***/
 
 /* 30 */ 0x00c2b8, 0x00c2b9, 0x00c2ba, 0x00c2bb,
 /* 34 */ 0x00c2bc, 0x00c2bd, 0x00c2be, 0x00c2bf,
 /* 38 */ 0x00c380, 0x00c381,
 
- /*** Four byte table, leaf: 813087xx - offset 0x06580 ***/
+ /*** Four byte table, leaf: 813087xx - offset 0x065ec ***/
 
 /* 30 */ 0x00c382, 0x00c383, 0x00c384, 0x00c385,
 /* 34 */ 0x00c386, 0x00c387, 0x00c388, 0x00c389,
 /* 38 */ 0x00c38a, 0x00c38b,
 
- /*** Four byte table, leaf: 813088xx - offset 0x0658a ***/
+ /*** Four byte table, leaf: 813088xx - offset 0x065f6 ***/
 
 /* 30 */ 0x00c38c, 0x00c38d, 0x00c38e, 0x00c38f,
 /* 34 */ 0x00c390, 0x00c391, 0x00c392, 0x00c393,
 /* 38 */ 0x00c394, 0x00c395,
 
- /*** Four byte table, leaf: 813089xx - offset 0x06594 ***/
+ /*** Four byte table, leaf: 813089xx - offset 0x06600 ***/
 
 /* 30 */ 0x00c396, 0x00c398, 0x00c399, 0x00c39a,
 /* 34 */ 0x00c39b, 0x00c39c, 0x00c39d, 0x00c39e,
 /* 38 */ 0x00c39f, 0x00c3a2,
 
- /*** Four byte table, leaf: 81308axx - offset 0x0659e ***/
+ /*** Four byte table, leaf: 81308axx - offset 0x0660a ***/
 
 /* 30 */ 0x00c3a3, 0x00c3a4, 0x00c3a5, 0x00c3a6,
 /* 34 */ 0x00c3a7, 0x00c3ab, 0x00c3ae, 0x00c3af,
 /* 38 */ 0x00c3b0, 0x00c3b1,
 
- /*** Four byte table, leaf: 81308bxx - offset 0x065a8 ***/
+ /*** Four byte table, leaf: 81308bxx - offset 0x06614 ***/
 
 /* 30 */ 0x00c3b4, 0x00c3b5, 0x00c3b6, 0x00c3b8,
 /* 34 */ 0x00c3bb, 0x00c3bd, 0x00c3be, 0x00c3bf,
 /* 38 */ 0x00c480, 0x00c482,
 
- /*** Four byte table, leaf: 81308cxx - offset 0x065b2 ***/
+ /*** Four byte table, leaf: 81308cxx - offset 0x0661e ***/
 
 /* 30 */ 0x00c483, 0x00c484, 0x00c485, 0x00c486,
 /* 34 */ 0x00c487, 0x00c488, 0x00c489, 0x00c48a,
 /* 38 */ 0x00c48b, 0x00c48c,
 
- /*** Four byte table, leaf: 81308dxx - offset 0x065bc ***/
+ /*** Four byte table, leaf: 81308dxx - offset 0x06628 ***/
 
 /* 30 */ 0x00c48d, 0x00c48e, 0x00c48f, 0x00c490,
 /* 34 */ 0x00c491, 0x00c492, 0x00c494, 0x00c495,
 /* 38 */ 0x00c496, 0x00c497,
 
- /*** Four byte table, leaf: 81308exx - offset 0x065c6 ***/
+ /*** Four byte table, leaf: 81308exx - offset 0x06632 ***/
 
 /* 30 */ 0x00c498, 0x00c499, 0x00c49a, 0x00c49c,
 /* 34 */ 0x00c49d, 0x00c49e, 0x00c49f, 0x00c4a0,
 /* 38 */ 0x00c4a1, 0x00c4a2,
 
- /*** Four byte table, leaf: 81308fxx - offset 0x065d0 ***/
+ /*** Four byte table, leaf: 81308fxx - offset 0x0663c ***/
 
 /* 30 */ 0x00c4a3, 0x00c4a4, 0x00c4a5, 0x00c4a6,
 /* 34 */ 0x00c4a7, 0x00c4a8, 0x00c4a9, 0x00c4aa,
 /* 38 */ 0x00c4ac, 0x00c4ad,
 
- /*** Four byte table, leaf: 813090xx - offset 0x065da ***/
+ /*** Four byte table, leaf: 813090xx - offset 0x06646 ***/
 
 /* 30 */ 0x00c4ae, 0x00c4af, 0x00c4b0, 0x00c4b1,
 /* 34 */ 0x00c4b2, 0x00c4b3, 0x00c4b4, 0x00c4b5,
 /* 38 */ 0x00c4b6, 0x00c4b7,
 
- /*** Four byte table, leaf: 813091xx - offset 0x065e4 ***/
+ /*** Four byte table, leaf: 813091xx - offset 0x06650 ***/
 
 /* 30 */ 0x00c4b8, 0x00c4b9, 0x00c4ba, 0x00c4bb,
 /* 34 */ 0x00c4bc, 0x00c4bd, 0x00c4be, 0x00c4bf,
 /* 38 */ 0x00c580, 0x00c581,
 
- /*** Four byte table, leaf: 813092xx - offset 0x065ee ***/
+ /*** Four byte table, leaf: 813092xx - offset 0x0665a ***/
 
 /* 30 */ 0x00c582, 0x00c583, 0x00c585, 0x00c586,
 /* 34 */ 0x00c587, 0x00c589, 0x00c58a, 0x00c58b,
 /* 38 */ 0x00c58c, 0x00c58e,
 
- /*** Four byte table, leaf: 813093xx - offset 0x065f8 ***/
+ /*** Four byte table, leaf: 813093xx - offset 0x06664 ***/
 
 /* 30 */ 0x00c58f, 0x00c590, 0x00c591, 0x00c592,
 /* 34 */ 0x00c593, 0x00c594, 0x00c595, 0x00c596,
 /* 38 */ 0x00c597, 0x00c598,
 
- /*** Four byte table, leaf: 813094xx - offset 0x06602 ***/
+ /*** Four byte table, leaf: 813094xx - offset 0x0666e ***/
 
 /* 30 */ 0x00c599, 0x00c59a, 0x00c59b, 0x00c59c,
 /* 34 */ 0x00c59d, 0x00c59e, 0x00c59f, 0x00c5a0,
 /* 38 */ 0x00c5a1, 0x00c5a2,
 
- /*** Four byte table, leaf: 813095xx - offset 0x0660c ***/
+ /*** Four byte table, leaf: 813095xx - offset 0x06678 ***/
 
 /* 30 */ 0x00c5a3, 0x00c5a4, 0x00c5a5, 0x00c5a6,
 /* 34 */ 0x00c5a7, 0x00c5a8, 0x00c5a9, 0x00c5aa,
 /* 38 */ 0x00c5ac, 0x00c5ad,
 
- /*** Four byte table, leaf: 813096xx - offset 0x06616 ***/
+ /*** Four byte table, leaf: 813096xx - offset 0x06682 ***/
 
 /* 30 */ 0x00c5ae, 0x00c5af, 0x00c5b0, 0x00c5b1,
 /* 34 */ 0x00c5b2, 0x00c5b3, 0x00c5b4, 0x00c5b5,
 /* 38 */ 0x00c5b6, 0x00c5b7,
 
- /*** Four byte table, leaf: 813097xx - offset 0x06620 ***/
+ /*** Four byte table, leaf: 813097xx - offset 0x0668c ***/
 
 /* 30 */ 0x00c5b8, 0x00c5b9, 0x00c5ba, 0x00c5bb,
 /* 34 */ 0x00c5bc, 0x00c5bd, 0x00c5be, 0x00c5bf,
 /* 38 */ 0x00c680, 0x00c681,
 
- /*** Four byte table, leaf: 813098xx - offset 0x0662a ***/
+ /*** Four byte table, leaf: 813098xx - offset 0x06696 ***/
 
 /* 30 */ 0x00c682, 0x00c683, 0x00c684, 0x00c685,
 /* 34 */ 0x00c686, 0x00c687, 0x00c688, 0x00c689,
 /* 38 */ 0x00c68a, 0x00c68b,
 
- /*** Four byte table, leaf: 813099xx - offset 0x06634 ***/
+ /*** Four byte table, leaf: 813099xx - offset 0x066a0 ***/
 
 /* 30 */ 0x00c68c, 0x00c68d, 0x00c68e, 0x00c68f,
 /* 34 */ 0x00c690, 0x00c691, 0x00c692, 0x00c693,
 /* 38 */ 0x00c694, 0x00c695,
 
- /*** Four byte table, leaf: 81309axx - offset 0x0663e ***/
+ /*** Four byte table, leaf: 81309axx - offset 0x066aa ***/
 
 /* 30 */ 0x00c696, 0x00c697, 0x00c698, 0x00c699,
 /* 34 */ 0x00c69a, 0x00c69b, 0x00c69c, 0x00c69d,
 /* 38 */ 0x00c69e, 0x00c69f,
 
- /*** Four byte table, leaf: 81309bxx - offset 0x06648 ***/
+ /*** Four byte table, leaf: 81309bxx - offset 0x066b4 ***/
 
 /* 30 */ 0x00c6a0, 0x00c6a1, 0x00c6a2, 0x00c6a3,
 /* 34 */ 0x00c6a4, 0x00c6a5, 0x00c6a6, 0x00c6a7,
 /* 38 */ 0x00c6a8, 0x00c6a9,
 
- /*** Four byte table, leaf: 81309cxx - offset 0x06652 ***/
+ /*** Four byte table, leaf: 81309cxx - offset 0x066be ***/
 
 /* 30 */ 0x00c6aa, 0x00c6ab, 0x00c6ac, 0x00c6ad,
 /* 34 */ 0x00c6ae, 0x00c6af, 0x00c6b0, 0x00c6b1,
 /* 38 */ 0x00c6b2, 0x00c6b3,
 
- /*** Four byte table, leaf: 81309dxx - offset 0x0665c ***/
+ /*** Four byte table, leaf: 81309dxx - offset 0x066c8 ***/
 
 /* 30 */ 0x00c6b4, 0x00c6b5, 0x00c6b6, 0x00c6b7,
 /* 34 */ 0x00c6b8, 0x00c6b9, 0x00c6ba, 0x00c6bb,
 /* 38 */ 0x00c6bc, 0x00c6bd,
 
- /*** Four byte table, leaf: 81309exx - offset 0x06666 ***/
+ /*** Four byte table, leaf: 81309exx - offset 0x066d2 ***/
 
 /* 30 */ 0x00c6be, 0x00c6bf, 0x00c780, 0x00c781,
 /* 34 */ 0x00c782, 0x00c783, 0x00c784, 0x00c785,
 /* 38 */ 0x00c786, 0x00c787,
 
- /*** Four byte table, leaf: 81309fxx - offset 0x06670 ***/
+ /*** Four byte table, leaf: 81309fxx - offset 0x066dc ***/
 
 /* 30 */ 0x00c788, 0x00c789, 0x00c78a, 0x00c78b,
 /* 34 */ 0x00c78c, 0x00c78d, 0x00c78f, 0x00c791,
 /* 38 */ 0x00c793, 0x00c795,
 
- /*** Four byte table, leaf: 8130a0xx - offset 0x0667a ***/
+ /*** Four byte table, leaf: 8130a0xx - offset 0x066e6 ***/
 
 /* 30 */ 0x00c797, 0x00c799, 0x00c79b, 0x00c79d,
 /* 34 */ 0x00c79e, 0x00c79f, 0x00c7a0, 0x00c7a1,
 /* 38 */ 0x00c7a2, 0x00c7a3,
 
- /*** Four byte table, leaf: 8130a1xx - offset 0x06684 ***/
+ /*** Four byte table, leaf: 8130a1xx - offset 0x066f0 ***/
 
 /* 30 */ 0x00c7a4, 0x00c7a5, 0x00c7a6, 0x00c7a7,
 /* 34 */ 0x00c7a8, 0x00c7a9, 0x00c7aa, 0x00c7ab,
 /* 38 */ 0x00c7ac, 0x00c7ad,
 
- /*** Four byte table, leaf: 8130a2xx - offset 0x0668e ***/
+ /*** Four byte table, leaf: 8130a2xx - offset 0x066fa ***/
 
 /* 30 */ 0x00c7ae, 0x00c7af, 0x00c7b0, 0x00c7b1,
 /* 34 */ 0x00c7b2, 0x00c7b3, 0x00c7b4, 0x00c7b5,
 /* 38 */ 0x00c7b6, 0x00c7b7,
 
- /*** Four byte table, leaf: 8130a3xx - offset 0x06698 ***/
+ /*** Four byte table, leaf: 8130a3xx - offset 0x06704 ***/
 
 /* 30 */ 0x00c7b8, 0x00c7ba, 0x00c7bb, 0x00c7bc,
 /* 34 */ 0x00c7bd, 0x00c7be, 0x00c7bf, 0x00c880,
 /* 38 */ 0x00c881, 0x00c882,
 
- /*** Four byte table, leaf: 8130a4xx - offset 0x066a2 ***/
+ /*** Four byte table, leaf: 8130a4xx - offset 0x0670e ***/
 
 /* 30 */ 0x00c883, 0x00c884, 0x00c885, 0x00c886,
 /* 34 */ 0x00c887, 0x00c888, 0x00c889, 0x00c88a,
 /* 38 */ 0x00c88b, 0x00c88c,
 
- /*** Four byte table, leaf: 8130a5xx - offset 0x066ac ***/
+ /*** Four byte table, leaf: 8130a5xx - offset 0x06718 ***/
 
 /* 30 */ 0x00c88d, 0x00c88e, 0x00c88f, 0x00c890,
 /* 34 */ 0x00c891, 0x00c892, 0x00c893, 0x00c894,
 /* 38 */ 0x00c895, 0x00c896,
 
- /*** Four byte table, leaf: 8130a6xx - offset
0x06722 ***/ /* 30 */ 0x00c897, 0x00c898, 0x00c899, 0x00c89a, /* 34 */ 0x00c89b, 0x00c89c, 0x00c89d, 0x00c89e, /* 38 */ 0x00c89f, 0x00c8a0, - /*** Four byte table, leaf: 8130a7xx - offset 0x066c0 ***/ + /*** Four byte table, leaf: 8130a7xx - offset 0x0672c ***/ /* 30 */ 0x00c8a1, 0x00c8a2, 0x00c8a3, 0x00c8a4, /* 34 */ 0x00c8a5, 0x00c8a6, 0x00c8a7, 0x00c8a8, /* 38 */ 0x00c8a9, 0x00c8aa, - /*** Four byte table, leaf: 8130a8xx - offset 0x066ca ***/ + /*** Four byte table, leaf: 8130a8xx - offset 0x06736 ***/ /* 30 */ 0x00c8ab, 0x00c8ac, 0x00c8ad, 0x00c8ae, /* 34 */ 0x00c8af, 0x00c8b0, 0x00c8b1, 0x00c8b2, /* 38 */ 0x00c8b3, 0x00c8b4, - /*** Four byte table, leaf: 8130a9xx - offset 0x066d4 ***/ + /*** Four byte table, leaf: 8130a9xx - offset 0x06740 ***/ /* 30 */ 0x00c8b5, 0x00c8b6, 0x00c8b7, 0x00c8b8, /* 34 */ 0x00c8b9, 0x00c8ba, 0x00c8bb, 0x00c8bc, /* 38 */ 0x00c8bd, 0x00c8be, - /*** Four byte table, leaf: 8130aaxx - offset 0x066de ***/ + /*** Four byte table, leaf: 8130aaxx - offset 0x0674a ***/ /* 30 */ 0x00c8bf, 0x00c980, 0x00c981, 0x00c982, /* 34 */ 0x00c983, 0x00c984, 0x00c985, 0x00c986, /* 38 */ 0x00c987, 0x00c988, - /*** Four byte table, leaf: 8130abxx - offset 0x066e8 ***/ + /*** Four byte table, leaf: 8130abxx - offset 0x06754 ***/ /* 30 */ 0x00c989, 0x00c98a, 0x00c98b, 0x00c98c, /* 34 */ 0x00c98d, 0x00c98e, 0x00c98f, 0x00c990, /* 38 */ 0x00c992, 0x00c993, - /*** Four byte table, leaf: 8130acxx - offset 0x066f2 ***/ + /*** Four byte table, leaf: 8130acxx - offset 0x0675e ***/ /* 30 */ 0x00c994, 0x00c995, 0x00c996, 0x00c997, /* 34 */ 0x00c998, 0x00c999, 0x00c99a, 0x00c99b, /* 38 */ 0x00c99c, 0x00c99d, - /*** Four byte table, leaf: 8130adxx - offset 0x066fc ***/ + /*** Four byte table, leaf: 8130adxx - offset 0x06768 ***/ /* 30 */ 0x00c99e, 0x00c99f, 0x00c9a0, 0x00c9a2, /* 34 */ 0x00c9a3, 0x00c9a4, 0x00c9a5, 0x00c9a6, /* 38 */ 0x00c9a7, 0x00c9a8, - /*** Four byte table, leaf: 8130aexx - offset 0x06706 ***/ + /*** Four byte table, leaf: 8130aexx - offset 0x06772 ***/ /* 30 */ 0x00c9a9, 0x00c9aa, 0x00c9ab, 0x00c9ac, /* 34 */ 0x00c9ad, 0x00c9ae, 0x00c9af, 0x00c9b0, /* 38 */ 0x00c9b1, 0x00c9b2, - /*** Four byte table, leaf: 8130afxx - offset 0x06710 ***/ + /*** Four byte table, leaf: 8130afxx - offset 0x0677c ***/ /* 30 */ 0x00c9b3, 0x00c9b4, 0x00c9b5, 0x00c9b6, /* 34 */ 0x00c9b7, 0x00c9b8, 0x00c9b9, 0x00c9ba, /* 38 */ 0x00c9bb, 0x00c9bc, - /*** Four byte table, leaf: 8130b0xx - offset 0x0671a ***/ + /*** Four byte table, leaf: 8130b0xx - offset 0x06786 ***/ /* 30 */ 0x00c9bd, 0x00c9be, 0x00c9bf, 0x00ca80, /* 34 */ 0x00ca81, 0x00ca82, 0x00ca83, 0x00ca84, /* 38 */ 0x00ca85, 0x00ca86, - /*** Four byte table, leaf: 8130b1xx - offset 0x06724 ***/ + /*** Four byte table, leaf: 8130b1xx - offset 0x06790 ***/ /* 30 */ 0x00ca87, 0x00ca88, 0x00ca89, 0x00ca8a, /* 34 */ 0x00ca8b, 0x00ca8c, 0x00ca8d, 0x00ca8e, /* 38 */ 0x00ca8f, 0x00ca90, - /*** Four byte table, leaf: 8130b2xx - offset 0x0672e ***/ + /*** Four byte table, leaf: 8130b2xx - offset 0x0679a ***/ /* 30 */ 0x00ca91, 0x00ca92, 0x00ca93, 0x00ca94, /* 34 */ 0x00ca95, 0x00ca96, 0x00ca97, 0x00ca98, /* 38 */ 0x00ca99, 0x00ca9a, - /*** Four byte table, leaf: 8130b3xx - offset 0x06738 ***/ + /*** Four byte table, leaf: 8130b3xx - offset 0x067a4 ***/ /* 30 */ 0x00ca9b, 0x00ca9c, 0x00ca9d, 0x00ca9e, /* 34 */ 0x00ca9f, 0x00caa0, 0x00caa1, 0x00caa2, /* 38 */ 0x00caa3, 0x00caa4, - /*** Four byte table, leaf: 8130b4xx - offset 0x06742 ***/ + /*** Four byte table, leaf: 8130b4xx - offset 0x067ae ***/ /* 30 */ 0x00caa5, 0x00caa6, 0x00caa7, 0x00caa8, /* 34 
*/ 0x00caa9, 0x00caaa, 0x00caab, 0x00caac, /* 38 */ 0x00caad, 0x00caae, - /*** Four byte table, leaf: 8130b5xx - offset 0x0674c ***/ + /*** Four byte table, leaf: 8130b5xx - offset 0x067b8 ***/ /* 30 */ 0x00caaf, 0x00cab0, 0x00cab1, 0x00cab2, /* 34 */ 0x00cab3, 0x00cab4, 0x00cab5, 0x00cab6, /* 38 */ 0x00cab7, 0x00cab8, - /*** Four byte table, leaf: 8130b6xx - offset 0x06756 ***/ + /*** Four byte table, leaf: 8130b6xx - offset 0x067c2 ***/ /* 30 */ 0x00cab9, 0x00caba, 0x00cabb, 0x00cabc, /* 34 */ 0x00cabd, 0x00cabe, 0x00cabf, 0x00cb80, /* 38 */ 0x00cb81, 0x00cb82, - /*** Four byte table, leaf: 8130b7xx - offset 0x06760 ***/ + /*** Four byte table, leaf: 8130b7xx - offset 0x067cc ***/ /* 30 */ 0x00cb83, 0x00cb84, 0x00cb85, 0x00cb86, /* 34 */ 0x00cb88, 0x00cb8c, 0x00cb8d, 0x00cb8e, /* 38 */ 0x00cb8f, 0x00cb90, - /*** Four byte table, leaf: 8130b8xx - offset 0x0676a ***/ + /*** Four byte table, leaf: 8130b8xx - offset 0x067d6 ***/ /* 30 */ 0x00cb91, 0x00cb92, 0x00cb93, 0x00cb94, /* 34 */ 0x00cb95, 0x00cb96, 0x00cb97, 0x00cb98, /* 38 */ 0x00cb9a, 0x00cb9b, - /*** Four byte table, leaf: 8130b9xx - offset 0x06774 ***/ + /*** Four byte table, leaf: 8130b9xx - offset 0x067e0 ***/ /* 30 */ 0x00cb9c, 0x00cb9d, 0x00cb9e, 0x00cb9f, /* 34 */ 0x00cba0, 0x00cba1, 0x00cba2, 0x00cba3, /* 38 */ 0x00cba4, 0x00cba5, - /*** Four byte table, leaf: 8130baxx - offset 0x0677e ***/ + /*** Four byte table, leaf: 8130baxx - offset 0x067ea ***/ /* 30 */ 0x00cba6, 0x00cba7, 0x00cba8, 0x00cba9, /* 34 */ 0x00cbaa, 0x00cbab, 0x00cbac, 0x00cbad, /* 38 */ 0x00cbae, 0x00cbaf, - /*** Four byte table, leaf: 8130bbxx - offset 0x06788 ***/ + /*** Four byte table, leaf: 8130bbxx - offset 0x067f4 ***/ /* 30 */ 0x00cbb0, 0x00cbb1, 0x00cbb2, 0x00cbb3, /* 34 */ 0x00cbb4, 0x00cbb5, 0x00cbb6, 0x00cbb7, /* 38 */ 0x00cbb8, 0x00cbb9, - /*** Four byte table, leaf: 8130bcxx - offset 0x06792 ***/ + /*** Four byte table, leaf: 8130bcxx - offset 0x067fe ***/ /* 30 */ 0x00cbba, 0x00cbbb, 0x00cbbc, 0x00cbbd, /* 34 */ 0x00cbbe, 0x00cbbf, 0x00cc80, 0x00cc81, /* 38 */ 0x00cc82, 0x00cc83, - /*** Four byte table, leaf: 8130bdxx - offset 0x0679c ***/ + /*** Four byte table, leaf: 8130bdxx - offset 0x06808 ***/ /* 30 */ 0x00cc84, 0x00cc85, 0x00cc86, 0x00cc87, /* 34 */ 0x00cc88, 0x00cc89, 0x00cc8a, 0x00cc8b, /* 38 */ 0x00cc8c, 0x00cc8d, - /*** Four byte table, leaf: 8130bexx - offset 0x067a6 ***/ + /*** Four byte table, leaf: 8130bexx - offset 0x06812 ***/ /* 30 */ 0x00cc8e, 0x00cc8f, 0x00cc90, 0x00cc91, /* 34 */ 0x00cc92, 0x00cc93, 0x00cc94, 0x00cc95, /* 38 */ 0x00cc96, 0x00cc97, - /*** Four byte table, leaf: 8130bfxx - offset 0x067b0 ***/ + /*** Four byte table, leaf: 8130bfxx - offset 0x0681c ***/ /* 30 */ 0x00cc98, 0x00cc99, 0x00cc9a, 0x00cc9b, /* 34 */ 0x00cc9c, 0x00cc9d, 0x00cc9e, 0x00cc9f, /* 38 */ 0x00cca0, 0x00cca1, - /*** Four byte table, leaf: 8130c0xx - offset 0x067ba ***/ + /*** Four byte table, leaf: 8130c0xx - offset 0x06826 ***/ /* 30 */ 0x00cca2, 0x00cca3, 0x00cca4, 0x00cca5, /* 34 */ 0x00cca6, 0x00cca7, 0x00cca8, 0x00cca9, /* 38 */ 0x00ccaa, 0x00ccab, - /*** Four byte table, leaf: 8130c1xx - offset 0x067c4 ***/ + /*** Four byte table, leaf: 8130c1xx - offset 0x06830 ***/ /* 30 */ 0x00ccac, 0x00ccad, 0x00ccae, 0x00ccaf, /* 34 */ 0x00ccb0, 0x00ccb1, 0x00ccb2, 0x00ccb3, /* 38 */ 0x00ccb4, 0x00ccb5, - /*** Four byte table, leaf: 8130c2xx - offset 0x067ce ***/ + /*** Four byte table, leaf: 8130c2xx - offset 0x0683a ***/ /* 30 */ 0x00ccb6, 0x00ccb7, 0x00ccb8, 0x00ccb9, /* 34 */ 0x00ccba, 0x00ccbb, 0x00ccbc, 0x00ccbd, /* 38 */ 0x00ccbe, 
0x00ccbf, - /*** Four byte table, leaf: 8130c3xx - offset 0x067d8 ***/ + /*** Four byte table, leaf: 8130c3xx - offset 0x06844 ***/ /* 30 */ 0x00cd80, 0x00cd81, 0x00cd82, 0x00cd83, /* 34 */ 0x00cd84, 0x00cd85, 0x00cd86, 0x00cd87, /* 38 */ 0x00cd88, 0x00cd89, - /*** Four byte table, leaf: 8130c4xx - offset 0x067e2 ***/ + /*** Four byte table, leaf: 8130c4xx - offset 0x0684e ***/ /* 30 */ 0x00cd8a, 0x00cd8b, 0x00cd8c, 0x00cd8d, /* 34 */ 0x00cd8e, 0x00cd8f, 0x00cd90, 0x00cd91, /* 38 */ 0x00cd92, 0x00cd93, - /*** Four byte table, leaf: 8130c5xx - offset 0x067ec ***/ + /*** Four byte table, leaf: 8130c5xx - offset 0x06858 ***/ /* 30 */ 0x00cd94, 0x00cd95, 0x00cd96, 0x00cd97, /* 34 */ 0x00cd98, 0x00cd99, 0x00cd9a, 0x00cd9b, /* 38 */ 0x00cd9c, 0x00cd9d, - /*** Four byte table, leaf: 8130c6xx - offset 0x067f6 ***/ + /*** Four byte table, leaf: 8130c6xx - offset 0x06862 ***/ /* 30 */ 0x00cd9e, 0x00cd9f, 0x00cda0, 0x00cda1, /* 34 */ 0x00cda2, 0x00cda3, 0x00cda4, 0x00cda5, /* 38 */ 0x00cda6, 0x00cda7, - /*** Four byte table, leaf: 8130c7xx - offset 0x06800 ***/ + /*** Four byte table, leaf: 8130c7xx - offset 0x0686c ***/ /* 30 */ 0x00cda8, 0x00cda9, 0x00cdaa, 0x00cdab, /* 34 */ 0x00cdac, 0x00cdad, 0x00cdae, 0x00cdaf, /* 38 */ 0x00cdb0, 0x00cdb1, - /*** Four byte table, leaf: 8130c8xx - offset 0x0680a ***/ + /*** Four byte table, leaf: 8130c8xx - offset 0x06876 ***/ /* 30 */ 0x00cdb2, 0x00cdb3, 0x00cdb4, 0x00cdb5, /* 34 */ 0x00cdb6, 0x00cdb7, 0x00cdb8, 0x00cdb9, /* 38 */ 0x00cdba, 0x00cdbb, - /*** Four byte table, leaf: 8130c9xx - offset 0x06814 ***/ + /*** Four byte table, leaf: 8130c9xx - offset 0x06880 ***/ /* 30 */ 0x00cdbc, 0x00cdbd, 0x00cdbe, 0x00cdbf, /* 34 */ 0x00ce80, 0x00ce81, 0x00ce82, 0x00ce83, /* 38 */ 0x00ce84, 0x00ce85, - /*** Four byte table, leaf: 8130caxx - offset 0x0681e ***/ + /*** Four byte table, leaf: 8130caxx - offset 0x0688a ***/ /* 30 */ 0x00ce86, 0x00ce87, 0x00ce88, 0x00ce89, /* 34 */ 0x00ce8a, 0x00ce8b, 0x00ce8c, 0x00ce8d, /* 38 */ 0x00ce8e, 0x00ce8f, - /*** Four byte table, leaf: 8130cbxx - offset 0x06828 ***/ + /*** Four byte table, leaf: 8130cbxx - offset 0x06894 ***/ /* 30 */ 0x00ce90, 0x00cea2, 0x00ceaa, 0x00ceab, /* 34 */ 0x00ceac, 0x00cead, 0x00ceae, 0x00ceaf, /* 38 */ 0x00ceb0, 0x00cf82, - /*** Four byte table, leaf: 8130ccxx - offset 0x06832 ***/ + /*** Four byte table, leaf: 8130ccxx - offset 0x0689e ***/ /* 30 */ 0x00cf8a, 0x00cf8b, 0x00cf8c, 0x00cf8d, /* 34 */ 0x00cf8e, 0x00cf8f, 0x00cf90, 0x00cf91, /* 38 */ 0x00cf92, 0x00cf93, - /*** Four byte table, leaf: 8130cdxx - offset 0x0683c ***/ + /*** Four byte table, leaf: 8130cdxx - offset 0x068a8 ***/ /* 30 */ 0x00cf94, 0x00cf95, 0x00cf96, 0x00cf97, /* 34 */ 0x00cf98, 0x00cf99, 0x00cf9a, 0x00cf9b, /* 38 */ 0x00cf9c, 0x00cf9d, - /*** Four byte table, leaf: 8130cexx - offset 0x06846 ***/ + /*** Four byte table, leaf: 8130cexx - offset 0x068b2 ***/ /* 30 */ 0x00cf9e, 0x00cf9f, 0x00cfa0, 0x00cfa1, /* 34 */ 0x00cfa2, 0x00cfa3, 0x00cfa4, 0x00cfa5, /* 38 */ 0x00cfa6, 0x00cfa7, - /*** Four byte table, leaf: 8130cfxx - offset 0x06850 ***/ + /*** Four byte table, leaf: 8130cfxx - offset 0x068bc ***/ /* 30 */ 0x00cfa8, 0x00cfa9, 0x00cfaa, 0x00cfab, /* 34 */ 0x00cfac, 0x00cfad, 0x00cfae, 0x00cfaf, /* 38 */ 0x00cfb0, 0x00cfb1, - /*** Four byte table, leaf: 8130d0xx - offset 0x0685a ***/ + /*** Four byte table, leaf: 8130d0xx - offset 0x068c6 ***/ /* 30 */ 0x00cfb2, 0x00cfb3, 0x00cfb4, 0x00cfb5, /* 34 */ 0x00cfb6, 0x00cfb7, 0x00cfb8, 0x00cfb9, /* 38 */ 0x00cfba, 0x00cfbb, - /*** Four byte table, leaf: 8130d1xx - offset 0x06864 
***/ + /*** Four byte table, leaf: 8130d1xx - offset 0x068d0 ***/ /* 30 */ 0x00cfbc, 0x00cfbd, 0x00cfbe, 0x00cfbf, /* 34 */ 0x00d080, 0x00d082, 0x00d083, 0x00d084, /* 38 */ 0x00d085, 0x00d086, - /*** Four byte table, leaf: 8130d2xx - offset 0x0686e ***/ + /*** Four byte table, leaf: 8130d2xx - offset 0x068da ***/ /* 30 */ 0x00d087, 0x00d088, 0x00d089, 0x00d08a, /* 34 */ 0x00d08b, 0x00d08c, 0x00d08d, 0x00d08e, /* 38 */ 0x00d08f, 0x00d190, - /*** Four byte table, leaf: 8136a5xx - offset 0x06878 ***/ + /*** Four byte table, leaf: 8135f4xx - offset 0x068e4 ***/ + + /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000, + /* 34 */ 0x000000, 0x000000, 0x000000, 0xee9f87, + /* 2 trailing zero values shared with next segment */ + + /*** Four byte table, leaf: 8136a5xx - offset 0x068ec ***/ /* 30 */ 0x000000, 0x000000, 0xe28091, 0xe28092, /* 34 */ 0xe28097, 0xe2809a, 0xe2809b, 0xe2809e, /* 38 */ 0xe2809f, 0xe280a0, - /*** Four byte table, leaf: 8136a6xx - offset 0x06882 ***/ + /*** Four byte table, leaf: 8136a6xx - offset 0x068f6 ***/ /* 30 */ 0xe280a1, 0xe280a2, 0xe280a3, 0xe280a4, /* 34 */ 0xe280a7, 0xe280a8, 0xe280a9, 0xe280aa, /* 38 */ 0xe280ab, 0xe280ac, - /*** Four byte table, leaf: 8136a7xx - offset 0x0688c ***/ + /*** Four byte table, leaf: 8136a7xx - offset 0x06900 ***/ /* 30 */ 0xe280ad, 0xe280ae, 0xe280af, 0xe280b1, /* 34 */ 0xe280b4, 0xe280b6, 0xe280b7, 0xe280b8, /* 38 */ 0xe280b9, 0xe280ba, - /*** Four byte table, leaf: 8136a8xx - offset 0x06896 ***/ + /*** Four byte table, leaf: 8136a8xx - offset 0x0690a ***/ /* 30 */ 0xe280bc, 0xe280bd, 0xe280be, 0xe280bf, /* 34 */ 0xe28180, 0xe28181, 0xe28182, 0xe28183, /* 38 */ 0xe28184, 0xe28185, - /*** Four byte table, leaf: 8136a9xx - offset 0x068a0 ***/ + /*** Four byte table, leaf: 8136a9xx - offset 0x06914 ***/ /* 30 */ 0xe28186, 0xe28187, 0xe28188, 0xe28189, /* 34 */ 0xe2818a, 0xe2818b, 0xe2818c, 0xe2818d, /* 38 */ 0xe2818e, 0xe2818f, - /*** Four byte table, leaf: 8136aaxx - offset 0x068aa ***/ + /*** Four byte table, leaf: 8136aaxx - offset 0x0691e ***/ /* 30 */ 0xe28190, 0xe28191, 0xe28192, 0xe28193, /* 34 */ 0xe28194, 0xe28195, 0xe28196, 0xe28197, /* 38 */ 0xe28198, 0xe28199, - /*** Four byte table, leaf: 8136abxx - offset 0x068b4 ***/ + /*** Four byte table, leaf: 8136abxx - offset 0x06928 ***/ /* 30 */ 0xe2819a, 0xe2819b, 0xe2819c, 0xe2819d, /* 34 */ 0xe2819e, 0xe2819f, 0xe281a0, 0xe281a1, /* 38 */ 0xe281a2, 0xe281a3, - /*** Four byte table, leaf: 8136acxx - offset 0x068be ***/ + /*** Four byte table, leaf: 8136acxx - offset 0x06932 ***/ /* 30 */ 0xe281a4, 0xe281a5, 0xe281a6, 0xe281a7, /* 34 */ 0xe281a8, 0xe281a9, 0xe281aa, 0xe281ab, /* 38 */ 0xe281ac, 0xe281ad, - /*** Four byte table, leaf: 8136adxx - offset 0x068c8 ***/ + /*** Four byte table, leaf: 8136adxx - offset 0x0693c ***/ /* 30 */ 0xe281ae, 0xe281af, 0xe281b0, 0xe281b1, /* 34 */ 0xe281b2, 0xe281b3, 0xe281b4, 0xe281b5, /* 38 */ 0xe281b6, 0xe281b7, - /*** Four byte table, leaf: 8136aexx - offset 0x068d2 ***/ + /*** Four byte table, leaf: 8136aexx - offset 0x06946 ***/ /* 30 */ 0xe281b8, 0xe281b9, 0xe281ba, 0xe281bb, /* 34 */ 0xe281bc, 0xe281bd, 0xe281be, 0xe281bf, /* 38 */ 0xe28280, 0xe28281, - /*** Four byte table, leaf: 8136afxx - offset 0x068dc ***/ + /*** Four byte table, leaf: 8136afxx - offset 0x06950 ***/ /* 30 */ 0xe28282, 0xe28283, 0xe28284, 0xe28285, /* 34 */ 0xe28286, 0xe28287, 0xe28288, 0xe28289, /* 38 */ 0xe2828a, 0xe2828b, - /*** Four byte table, leaf: 8136b0xx - offset 0x068e6 ***/ + /*** Four byte table, leaf: 8136b0xx - offset 0x0695a ***/ /* 30 */ 0xe2828c, 0xe2828d, 
0xe2828e, 0xe2828f, /* 34 */ 0xe28290, 0xe28291, 0xe28292, 0xe28293, /* 38 */ 0xe28294, 0xe28295, - /*** Four byte table, leaf: 8136b1xx - offset 0x068f0 ***/ + /*** Four byte table, leaf: 8136b1xx - offset 0x06964 ***/ /* 30 */ 0xe28296, 0xe28297, 0xe28298, 0xe28299, /* 34 */ 0xe2829a, 0xe2829b, 0xe2829c, 0xe2829d, /* 38 */ 0xe2829e, 0xe2829f, - /*** Four byte table, leaf: 8136b2xx - offset 0x068fa ***/ + /*** Four byte table, leaf: 8136b2xx - offset 0x0696e ***/ /* 30 */ 0xe282a0, 0xe282a1, 0xe282a2, 0xe282a3, /* 34 */ 0xe282a4, 0xe282a5, 0xe282a6, 0xe282a7, /* 38 */ 0xe282a8, 0xe282a9, - /*** Four byte table, leaf: 8136b3xx - offset 0x06904 ***/ + /*** Four byte table, leaf: 8136b3xx - offset 0x06978 ***/ /* 30 */ 0xe282aa, 0xe282ab, 0xe282ad, 0xe282ae, /* 34 */ 0xe282af, 0xe282b0, 0xe282b1, 0xe282b2, /* 38 */ 0xe282b3, 0xe282b4, - /*** Four byte table, leaf: 8136b4xx - offset 0x0690e ***/ + /*** Four byte table, leaf: 8136b4xx - offset 0x06982 ***/ /* 30 */ 0xe282b5, 0xe282b6, 0xe282b7, 0xe282b8, /* 34 */ 0xe282b9, 0xe282ba, 0xe282bb, 0xe282bc, /* 38 */ 0xe282bd, 0xe282be, - /*** Four byte table, leaf: 8136b5xx - offset 0x06918 ***/ + /*** Four byte table, leaf: 8136b5xx - offset 0x0698c ***/ /* 30 */ 0xe282bf, 0xe28380, 0xe28381, 0xe28382, /* 34 */ 0xe28383, 0xe28384, 0xe28385, 0xe28386, /* 38 */ 0xe28387, 0xe28388, - /*** Four byte table, leaf: 8136b6xx - offset 0x06922 ***/ + /*** Four byte table, leaf: 8136b6xx - offset 0x06996 ***/ /* 30 */ 0xe28389, 0xe2838a, 0xe2838b, 0xe2838c, /* 34 */ 0xe2838d, 0xe2838e, 0xe2838f, 0xe28390, /* 38 */ 0xe28391, 0xe28392, - /*** Four byte table, leaf: 8136b7xx - offset 0x0692c ***/ + /*** Four byte table, leaf: 8136b7xx - offset 0x069a0 ***/ /* 30 */ 0xe28393, 0xe28394, 0xe28395, 0xe28396, /* 34 */ 0xe28397, 0xe28398, 0xe28399, 0xe2839a, /* 38 */ 0xe2839b, 0xe2839c, - /*** Four byte table, leaf: 8136b8xx - offset 0x06936 ***/ + /*** Four byte table, leaf: 8136b8xx - offset 0x069aa ***/ /* 30 */ 0xe2839d, 0xe2839e, 0xe2839f, 0xe283a0, /* 34 */ 0xe283a1, 0xe283a2, 0xe283a3, 0xe283a4, /* 38 */ 0xe283a5, 0xe283a6, - /*** Four byte table, leaf: 8136b9xx - offset 0x06940 ***/ + /*** Four byte table, leaf: 8136b9xx - offset 0x069b4 ***/ /* 30 */ 0xe283a7, 0xe283a8, 0xe283a9, 0xe283aa, /* 34 */ 0xe283ab, 0xe283ac, 0xe283ad, 0xe283ae, /* 38 */ 0xe283af, 0xe283b0, - /*** Four byte table, leaf: 8136baxx - offset 0x0694a ***/ + /*** Four byte table, leaf: 8136baxx - offset 0x069be ***/ /* 30 */ 0xe283b1, 0xe283b2, 0xe283b3, 0xe283b4, /* 34 */ 0xe283b5, 0xe283b6, 0xe283b7, 0xe283b8, /* 38 */ 0xe283b9, 0xe283ba, - /*** Four byte table, leaf: 8136bbxx - offset 0x06954 ***/ + /*** Four byte table, leaf: 8136bbxx - offset 0x069c8 ***/ /* 30 */ 0xe283bb, 0xe283bc, 0xe283bd, 0xe283be, /* 34 */ 0xe283bf, 0xe28480, 0xe28481, 0xe28482, /* 38 */ 0xe28484, 0xe28486, - /*** Four byte table, leaf: 8136bcxx - offset 0x0695e ***/ + /*** Four byte table, leaf: 8136bcxx - offset 0x069d2 ***/ /* 30 */ 0xe28487, 0xe28488, 0xe2848a, 0xe2848b, /* 34 */ 0xe2848c, 0xe2848d, 0xe2848e, 0xe2848f, /* 38 */ 0xe28490, 0xe28491, - /*** Four byte table, leaf: 8136bdxx - offset 0x06968 ***/ + /*** Four byte table, leaf: 8136bdxx - offset 0x069dc ***/ /* 30 */ 0xe28492, 0xe28493, 0xe28494, 0xe28495, /* 34 */ 0xe28497, 0xe28498, 0xe28499, 0xe2849a, /* 38 */ 0xe2849b, 0xe2849c, - /*** Four byte table, leaf: 8136bexx - offset 0x06972 ***/ + /*** Four byte table, leaf: 8136bexx - offset 0x069e6 ***/ /* 30 */ 0xe2849d, 0xe2849e, 0xe2849f, 0xe284a0, /* 34 */ 0xe284a2, 0xe284a3, 0xe284a4, 0xe284a5, 
/* 38 */ 0xe284a6, 0xe284a7, - /*** Four byte table, leaf: 8136bfxx - offset 0x0697c ***/ + /*** Four byte table, leaf: 8136bfxx - offset 0x069f0 ***/ /* 30 */ 0xe284a8, 0xe284a9, 0xe284aa, 0xe284ab, /* 34 */ 0xe284ac, 0xe284ad, 0xe284ae, 0xe284af, /* 38 */ 0xe284b0, 0xe284b1, - /*** Four byte table, leaf: 8136c0xx - offset 0x06986 ***/ + /*** Four byte table, leaf: 8136c0xx - offset 0x069fa ***/ /* 30 */ 0xe284b2, 0xe284b3, 0xe284b4, 0xe284b5, /* 34 */ 0xe284b6, 0xe284b7, 0xe284b8, 0xe284b9, /* 38 */ 0xe284ba, 0xe284bb, - /*** Four byte table, leaf: 8136c1xx - offset 0x06990 ***/ + /*** Four byte table, leaf: 8136c1xx - offset 0x06a04 ***/ /* 30 */ 0xe284bc, 0xe284bd, 0xe284be, 0xe284bf, /* 34 */ 0xe28580, 0xe28581, 0xe28582, 0xe28583, /* 38 */ 0xe28584, 0xe28585, - /*** Four byte table, leaf: 8136c2xx - offset 0x0699a ***/ + /*** Four byte table, leaf: 8136c2xx - offset 0x06a0e ***/ /* 30 */ 0xe28586, 0xe28587, 0xe28588, 0xe28589, /* 34 */ 0xe2858a, 0xe2858b, 0xe2858c, 0xe2858d, /* 38 */ 0xe2858e, 0xe2858f, - /*** Four byte table, leaf: 8136c3xx - offset 0x069a4 ***/ + /*** Four byte table, leaf: 8136c3xx - offset 0x06a18 ***/ /* 30 */ 0xe28590, 0xe28591, 0xe28592, 0xe28593, /* 34 */ 0xe28594, 0xe28595, 0xe28596, 0xe28597, /* 38 */ 0xe28598, 0xe28599, - /*** Four byte table, leaf: 8136c4xx - offset 0x069ae ***/ + /*** Four byte table, leaf: 8136c4xx - offset 0x06a22 ***/ /* 30 */ 0xe2859a, 0xe2859b, 0xe2859c, 0xe2859d, /* 34 */ 0xe2859e, 0xe2859f, 0xe285ac, 0xe285ad, /* 38 */ 0xe285ae, 0xe285af, - /*** Four byte table, leaf: 8136c5xx - offset 0x069b8 ***/ + /*** Four byte table, leaf: 8136c5xx - offset 0x06a2c ***/ /* 30 */ 0xe285ba, 0xe285bb, 0xe285bc, 0xe285bd, /* 34 */ 0xe285be, 0xe285bf, 0xe28680, 0xe28681, /* 38 */ 0xe28682, 0xe28683, - /*** Four byte table, leaf: 8136c6xx - offset 0x069c2 ***/ + /*** Four byte table, leaf: 8136c6xx - offset 0x06a36 ***/ /* 30 */ 0xe28684, 0xe28685, 0xe28686, 0xe28687, /* 34 */ 0xe28688, 0xe28689, 0xe2868a, 0xe2868b, /* 38 */ 0xe2868c, 0xe2868d, - /*** Four byte table, leaf: 8136c7xx - offset 0x069cc ***/ + /*** Four byte table, leaf: 8136c7xx - offset 0x06a40 ***/ /* 30 */ 0xe2868e, 0xe2868f, 0xe28694, 0xe28695, /* 34 */ 0xe2869a, 0xe2869b, 0xe2869c, 0xe2869d, /* 38 */ 0xe2869e, 0xe2869f, - /*** Four byte table, leaf: 8136c8xx - offset 0x069d6 ***/ + /*** Four byte table, leaf: 8136c8xx - offset 0x06a4a ***/ /* 30 */ 0xe286a0, 0xe286a1, 0xe286a2, 0xe286a3, /* 34 */ 0xe286a4, 0xe286a5, 0xe286a6, 0xe286a7, /* 38 */ 0xe286a8, 0xe286a9, - /*** Four byte table, leaf: 8136c9xx - offset 0x069e0 ***/ + /*** Four byte table, leaf: 8136c9xx - offset 0x06a54 ***/ /* 30 */ 0xe286aa, 0xe286ab, 0xe286ac, 0xe286ad, /* 34 */ 0xe286ae, 0xe286af, 0xe286b0, 0xe286b1, /* 38 */ 0xe286b2, 0xe286b3, - /*** Four byte table, leaf: 8136caxx - offset 0x069ea ***/ + /*** Four byte table, leaf: 8136caxx - offset 0x06a5e ***/ /* 30 */ 0xe286b4, 0xe286b5, 0xe286b6, 0xe286b7, /* 34 */ 0xe286b8, 0xe286b9, 0xe286ba, 0xe286bb, /* 38 */ 0xe286bc, 0xe286bd, - /*** Four byte table, leaf: 8136cbxx - offset 0x069f4 ***/ + /*** Four byte table, leaf: 8136cbxx - offset 0x06a68 ***/ /* 30 */ 0xe286be, 0xe286bf, 0xe28780, 0xe28781, /* 34 */ 0xe28782, 0xe28783, 0xe28784, 0xe28785, /* 38 */ 0xe28786, 0xe28787, - /*** Four byte table, leaf: 8136ccxx - offset 0x069fe ***/ + /*** Four byte table, leaf: 8136ccxx - offset 0x06a72 ***/ /* 30 */ 0xe28788, 0xe28789, 0xe2878a, 0xe2878b, /* 34 */ 0xe2878c, 0xe2878d, 0xe2878e, 0xe2878f, /* 38 */ 0xe28790, 0xe28791, - /*** Four byte table, leaf: 8136cdxx 
- offset 0x06a08 ***/ + /*** Four byte table, leaf: 8136cdxx - offset 0x06a7c ***/ /* 30 */ 0xe28792, 0xe28793, 0xe28794, 0xe28795, /* 34 */ 0xe28796, 0xe28797, 0xe28798, 0xe28799, /* 38 */ 0xe2879a, 0xe2879b, - /*** Four byte table, leaf: 8136cexx - offset 0x06a12 ***/ + /*** Four byte table, leaf: 8136cexx - offset 0x06a86 ***/ /* 30 */ 0xe2879c, 0xe2879d, 0xe2879e, 0xe2879f, /* 34 */ 0xe287a0, 0xe287a1, 0xe287a2, 0xe287a3, /* 38 */ 0xe287a4, 0xe287a5, - /*** Four byte table, leaf: 8136cfxx - offset 0x06a1c ***/ + /*** Four byte table, leaf: 8136cfxx - offset 0x06a90 ***/ /* 30 */ 0xe287a6, 0xe287a7, 0xe287a8, 0xe287a9, /* 34 */ 0xe287aa, 0xe287ab, 0xe287ac, 0xe287ad, /* 38 */ 0xe287ae, 0xe287af, - /*** Four byte table, leaf: 8136d0xx - offset 0x06a26 ***/ + /*** Four byte table, leaf: 8136d0xx - offset 0x06a9a ***/ /* 30 */ 0xe287b0, 0xe287b1, 0xe287b2, 0xe287b3, /* 34 */ 0xe287b4, 0xe287b5, 0xe287b6, 0xe287b7, /* 38 */ 0xe287b8, 0xe287b9, - /*** Four byte table, leaf: 8136d1xx - offset 0x06a30 ***/ + /*** Four byte table, leaf: 8136d1xx - offset 0x06aa4 ***/ /* 30 */ 0xe287ba, 0xe287bb, 0xe287bc, 0xe287bd, /* 34 */ 0xe287be, 0xe287bf, 0xe28880, 0xe28881, /* 38 */ 0xe28882, 0xe28883, - /*** Four byte table, leaf: 8136d2xx - offset 0x06a3a ***/ + /*** Four byte table, leaf: 8136d2xx - offset 0x06aae ***/ /* 30 */ 0xe28884, 0xe28885, 0xe28886, 0xe28887, /* 34 */ 0xe28889, 0xe2888a, 0xe2888b, 0xe2888c, /* 38 */ 0xe2888d, 0xe2888e, - /*** Four byte table, leaf: 8136d3xx - offset 0x06a44 ***/ + /*** Four byte table, leaf: 8136d3xx - offset 0x06ab8 ***/ /* 30 */ 0xe28890, 0xe28892, 0xe28893, 0xe28894, /* 34 */ 0xe28896, 0xe28897, 0xe28898, 0xe28899, /* 38 */ 0xe2889b, 0xe2889c, - /*** Four byte table, leaf: 8136d4xx - offset 0x06a4e ***/ + /*** Four byte table, leaf: 8136d4xx - offset 0x06ac2 ***/ /* 30 */ 0xe288a1, 0xe288a2, 0xe288a4, 0xe288a6, /* 34 */ 0xe288ac, 0xe288ad, 0xe288af, 0xe288b0, /* 38 */ 0xe288b1, 0xe288b2, - /*** Four byte table, leaf: 8136d5xx - offset 0x06a58 ***/ + /*** Four byte table, leaf: 8136d5xx - offset 0x06acc ***/ /* 30 */ 0xe288b3, 0xe288b8, 0xe288b9, 0xe288ba, /* 34 */ 0xe288bb, 0xe288bc, 0xe288be, 0xe288bf, /* 38 */ 0xe28980, 0xe28981, - /*** Four byte table, leaf: 8136d6xx - offset 0x06a62 ***/ + /*** Four byte table, leaf: 8136d6xx - offset 0x06ad6 ***/ /* 30 */ 0xe28982, 0xe28983, 0xe28984, 0xe28985, /* 34 */ 0xe28986, 0xe28987, 0xe28989, 0xe2898a, /* 38 */ 0xe2898b, 0xe2898d, - /*** Four byte table, leaf: 8136d7xx - offset 0x06a6c ***/ + /*** Four byte table, leaf: 8136d7xx - offset 0x06ae0 ***/ /* 30 */ 0xe2898e, 0xe2898f, 0xe28990, 0xe28991, /* 34 */ 0xe28993, 0xe28994, 0xe28995, 0xe28996, /* 38 */ 0xe28997, 0xe28998, - /*** Four byte table, leaf: 8136d8xx - offset 0x06a76 ***/ + /*** Four byte table, leaf: 8136d8xx - offset 0x06aea ***/ /* 30 */ 0xe28999, 0xe2899a, 0xe2899b, 0xe2899c, /* 34 */ 0xe2899d, 0xe2899e, 0xe2899f, 0xe289a2, /* 38 */ 0xe289a3, 0xe289a8, - /*** Four byte table, leaf: 8136d9xx - offset 0x06a80 ***/ + /*** Four byte table, leaf: 8136d9xx - offset 0x06af4 ***/ /* 30 */ 0xe289a9, 0xe289aa, 0xe289ab, 0xe289ac, /* 34 */ 0xe289ad, 0xe289b0, 0xe289b1, 0xe289b2, /* 38 */ 0xe289b3, 0xe289b4, - /*** Four byte table, leaf: 8136daxx - offset 0x06a8a ***/ + /*** Four byte table, leaf: 8136daxx - offset 0x06afe ***/ /* 30 */ 0xe289b5, 0xe289b6, 0xe289b7, 0xe289b8, /* 34 */ 0xe289b9, 0xe289ba, 0xe289bb, 0xe289bc, /* 38 */ 0xe289bd, 0xe289be, - /*** Four byte table, leaf: 8136dbxx - offset 0x06a94 ***/ + /*** Four byte table, leaf: 8136dbxx - 
offset 0x06b08 ***/ /* 30 */ 0xe289bf, 0xe28a80, 0xe28a81, 0xe28a82, /* 34 */ 0xe28a83, 0xe28a84, 0xe28a85, 0xe28a86, /* 38 */ 0xe28a87, 0xe28a88, - /*** Four byte table, leaf: 8136dcxx - offset 0x06a9e ***/ + /*** Four byte table, leaf: 8136dcxx - offset 0x06b12 ***/ /* 30 */ 0xe28a89, 0xe28a8a, 0xe28a8b, 0xe28a8c, /* 34 */ 0xe28a8d, 0xe28a8e, 0xe28a8f, 0xe28a90, /* 38 */ 0xe28a91, 0xe28a92, - /*** Four byte table, leaf: 8136ddxx - offset 0x06aa8 ***/ + /*** Four byte table, leaf: 8136ddxx - offset 0x06b1c ***/ /* 30 */ 0xe28a93, 0xe28a94, 0xe28a96, 0xe28a97, /* 34 */ 0xe28a98, 0xe28a9a, 0xe28a9b, 0xe28a9c, /* 38 */ 0xe28a9d, 0xe28a9e, - /*** Four byte table, leaf: 8136dexx - offset 0x06ab2 ***/ + /*** Four byte table, leaf: 8136dexx - offset 0x06b26 ***/ /* 30 */ 0xe28a9f, 0xe28aa0, 0xe28aa1, 0xe28aa2, /* 34 */ 0xe28aa3, 0xe28aa4, 0xe28aa6, 0xe28aa7, /* 38 */ 0xe28aa8, 0xe28aa9, - /*** Four byte table, leaf: 8136dfxx - offset 0x06abc ***/ + /*** Four byte table, leaf: 8136dfxx - offset 0x06b30 ***/ /* 30 */ 0xe28aaa, 0xe28aab, 0xe28aac, 0xe28aad, /* 34 */ 0xe28aae, 0xe28aaf, 0xe28ab0, 0xe28ab1, /* 38 */ 0xe28ab2, 0xe28ab3, - /*** Four byte table, leaf: 8136e0xx - offset 0x06ac6 ***/ + /*** Four byte table, leaf: 8136e0xx - offset 0x06b3a ***/ /* 30 */ 0xe28ab4, 0xe28ab5, 0xe28ab6, 0xe28ab7, /* 34 */ 0xe28ab8, 0xe28ab9, 0xe28aba, 0xe28abb, /* 38 */ 0xe28abc, 0xe28abd, - /*** Four byte table, leaf: 8136e1xx - offset 0x06ad0 ***/ + /*** Four byte table, leaf: 8136e1xx - offset 0x06b44 ***/ /* 30 */ 0xe28abe, 0xe28b80, 0xe28b81, 0xe28b82, /* 34 */ 0xe28b83, 0xe28b84, 0xe28b85, 0xe28b86, /* 38 */ 0xe28b87, 0xe28b88, - /*** Four byte table, leaf: 8136e2xx - offset 0x06ada ***/ + /*** Four byte table, leaf: 8136e2xx - offset 0x06b4e ***/ /* 30 */ 0xe28b89, 0xe28b8a, 0xe28b8b, 0xe28b8c, /* 34 */ 0xe28b8d, 0xe28b8e, 0xe28b8f, 0xe28b90, /* 38 */ 0xe28b91, 0xe28b92, - /*** Four byte table, leaf: 8136e3xx - offset 0x06ae4 ***/ + /*** Four byte table, leaf: 8136e3xx - offset 0x06b58 ***/ /* 30 */ 0xe28b93, 0xe28b94, 0xe28b95, 0xe28b96, /* 34 */ 0xe28b97, 0xe28b98, 0xe28b99, 0xe28b9a, /* 38 */ 0xe28b9b, 0xe28b9c, - /*** Four byte table, leaf: 8136e4xx - offset 0x06aee ***/ + /*** Four byte table, leaf: 8136e4xx - offset 0x06b62 ***/ /* 30 */ 0xe28b9d, 0xe28b9e, 0xe28b9f, 0xe28ba0, /* 34 */ 0xe28ba1, 0xe28ba2, 0xe28ba3, 0xe28ba4, /* 38 */ 0xe28ba5, 0xe28ba6, - /*** Four byte table, leaf: 8136e5xx - offset 0x06af8 ***/ + /*** Four byte table, leaf: 8136e5xx - offset 0x06b6c ***/ /* 30 */ 0xe28ba7, 0xe28ba8, 0xe28ba9, 0xe28baa, /* 34 */ 0xe28bab, 0xe28bac, 0xe28bad, 0xe28bae, /* 38 */ 0xe28baf, 0xe28bb0, - /*** Four byte table, leaf: 8136e6xx - offset 0x06b02 ***/ + /*** Four byte table, leaf: 8136e6xx - offset 0x06b76 ***/ /* 30 */ 0xe28bb1, 0xe28bb2, 0xe28bb3, 0xe28bb4, /* 34 */ 0xe28bb5, 0xe28bb6, 0xe28bb7, 0xe28bb8, /* 38 */ 0xe28bb9, 0xe28bba, - /*** Four byte table, leaf: 8136e7xx - offset 0x06b0c ***/ + /*** Four byte table, leaf: 8136e7xx - offset 0x06b80 ***/ /* 30 */ 0xe28bbb, 0xe28bbc, 0xe28bbd, 0xe28bbe, /* 34 */ 0xe28bbf, 0xe28c80, 0xe28c81, 0xe28c82, /* 38 */ 0xe28c83, 0xe28c84, - /*** Four byte table, leaf: 8136e8xx - offset 0x06b16 ***/ + /*** Four byte table, leaf: 8136e8xx - offset 0x06b8a ***/ /* 30 */ 0xe28c85, 0xe28c86, 0xe28c87, 0xe28c88, /* 34 */ 0xe28c89, 0xe28c8a, 0xe28c8b, 0xe28c8c, /* 38 */ 0xe28c8d, 0xe28c8e, - /*** Four byte table, leaf: 8136e9xx - offset 0x06b20 ***/ + /*** Four byte table, leaf: 8136e9xx - offset 0x06b94 ***/ /* 30 */ 0xe28c8f, 0xe28c90, 0xe28c91, 0xe28c93, 
/* 34 */ 0xe28c94, 0xe28c95, 0xe28c96, 0xe28c97, /* 38 */ 0xe28c98, 0xe28c99, - /*** Four byte table, leaf: 8136eaxx - offset 0x06b2a ***/ + /*** Four byte table, leaf: 8136eaxx - offset 0x06b9e ***/ /* 30 */ 0xe28c9a, 0xe28c9b, 0xe28c9c, 0xe28c9d, /* 34 */ 0xe28c9e, 0xe28c9f, 0xe28ca0, 0xe28ca1, /* 38 */ 0xe28ca2, 0xe28ca3, - /*** Four byte table, leaf: 8136ebxx - offset 0x06b34 ***/ + /*** Four byte table, leaf: 8136ebxx - offset 0x06ba8 ***/ /* 30 */ 0xe28ca4, 0xe28ca5, 0xe28ca6, 0xe28ca7, /* 34 */ 0xe28ca8, 0xe28ca9, 0xe28caa, 0xe28cab, /* 38 */ 0xe28cac, 0xe28cad, - /*** Four byte table, leaf: 8136ecxx - offset 0x06b3e ***/ + /*** Four byte table, leaf: 8136ecxx - offset 0x06bb2 ***/ /* 30 */ 0xe28cae, 0xe28caf, 0xe28cb0, 0xe28cb1, /* 34 */ 0xe28cb2, 0xe28cb3, 0xe28cb4, 0xe28cb5, /* 38 */ 0xe28cb6, 0xe28cb7, - /*** Four byte table, leaf: 8136edxx - offset 0x06b48 ***/ + /*** Four byte table, leaf: 8136edxx - offset 0x06bbc ***/ /* 30 */ 0xe28cb8, 0xe28cb9, 0xe28cba, 0xe28cbb, /* 34 */ 0xe28cbc, 0xe28cbd, 0xe28cbe, 0xe28cbf, /* 38 */ 0xe28d80, 0xe28d81, - /*** Four byte table, leaf: 8136eexx - offset 0x06b52 ***/ + /*** Four byte table, leaf: 8136eexx - offset 0x06bc6 ***/ /* 30 */ 0xe28d82, 0xe28d83, 0xe28d84, 0xe28d85, /* 34 */ 0xe28d86, 0xe28d87, 0xe28d88, 0xe28d89, /* 38 */ 0xe28d8a, 0xe28d8b, - /*** Four byte table, leaf: 8136efxx - offset 0x06b5c ***/ + /*** Four byte table, leaf: 8136efxx - offset 0x06bd0 ***/ /* 30 */ 0xe28d8c, 0xe28d8d, 0xe28d8e, 0xe28d8f, /* 34 */ 0xe28d90, 0xe28d91, 0xe28d92, 0xe28d93, /* 38 */ 0xe28d94, 0xe28d95, - /*** Four byte table, leaf: 8136f0xx - offset 0x06b66 ***/ + /*** Four byte table, leaf: 8136f0xx - offset 0x06bda ***/ /* 30 */ 0xe28d96, 0xe28d97, 0xe28d98, 0xe28d99, /* 34 */ 0xe28d9a, 0xe28d9b, 0xe28d9c, 0xe28d9d, /* 38 */ 0xe28d9e, 0xe28d9f, - /*** Four byte table, leaf: 8136f1xx - offset 0x06b70 ***/ + /*** Four byte table, leaf: 8136f1xx - offset 0x06be4 ***/ /* 30 */ 0xe28da0, 0xe28da1, 0xe28da2, 0xe28da3, /* 34 */ 0xe28da4, 0xe28da5, 0xe28da6, 0xe28da7, /* 38 */ 0xe28da8, 0xe28da9, - /*** Four byte table, leaf: 8136f2xx - offset 0x06b7a ***/ + /*** Four byte table, leaf: 8136f2xx - offset 0x06bee ***/ /* 30 */ 0xe28daa, 0xe28dab, 0xe28dac, 0xe28dad, /* 34 */ 0xe28dae, 0xe28daf, 0xe28db0, 0xe28db1, /* 38 */ 0xe28db2, 0xe28db3, - /*** Four byte table, leaf: 8136f3xx - offset 0x06b84 ***/ + /*** Four byte table, leaf: 8136f3xx - offset 0x06bf8 ***/ /* 30 */ 0xe28db4, 0xe28db5, 0xe28db6, 0xe28db7, /* 34 */ 0xe28db8, 0xe28db9, 0xe28dba, 0xe28dbb, /* 38 */ 0xe28dbc, 0xe28dbd, - /*** Four byte table, leaf: 8136f4xx - offset 0x06b8e ***/ + /*** Four byte table, leaf: 8136f4xx - offset 0x06c02 ***/ /* 30 */ 0xe28dbe, 0xe28dbf, 0xe28e80, 0xe28e81, /* 34 */ 0xe28e82, 0xe28e83, 0xe28e84, 0xe28e85, /* 38 */ 0xe28e86, 0xe28e87, - /*** Four byte table, leaf: 8136f5xx - offset 0x06b98 ***/ + /*** Four byte table, leaf: 8136f5xx - offset 0x06c0c ***/ /* 30 */ 0xe28e88, 0xe28e89, 0xe28e8a, 0xe28e8b, /* 34 */ 0xe28e8c, 0xe28e8d, 0xe28e8e, 0xe28e8f, /* 38 */ 0xe28e90, 0xe28e91, - /*** Four byte table, leaf: 8136f6xx - offset 0x06ba2 ***/ + /*** Four byte table, leaf: 8136f6xx - offset 0x06c16 ***/ /* 30 */ 0xe28e92, 0xe28e93, 0xe28e94, 0xe28e95, /* 34 */ 0xe28e96, 0xe28e97, 0xe28e98, 0xe28e99, /* 38 */ 0xe28e9a, 0xe28e9b, - /*** Four byte table, leaf: 8136f7xx - offset 0x06bac ***/ + /*** Four byte table, leaf: 8136f7xx - offset 0x06c20 ***/ /* 30 */ 0xe28e9c, 0xe28e9d, 0xe28e9e, 0xe28e9f, /* 34 */ 0xe28ea0, 0xe28ea1, 0xe28ea2, 0xe28ea3, /* 38 */ 0xe28ea4, 
0xe28ea5, - /*** Four byte table, leaf: 8136f8xx - offset 0x06bb6 ***/ + /*** Four byte table, leaf: 8136f8xx - offset 0x06c2a ***/ /* 30 */ 0xe28ea6, 0xe28ea7, 0xe28ea8, 0xe28ea9, /* 34 */ 0xe28eaa, 0xe28eab, 0xe28eac, 0xe28ead, /* 38 */ 0xe28eae, 0xe28eaf, - /*** Four byte table, leaf: 8136f9xx - offset 0x06bc0 ***/ + /*** Four byte table, leaf: 8136f9xx - offset 0x06c34 ***/ /* 30 */ 0xe28eb0, 0xe28eb1, 0xe28eb2, 0xe28eb3, /* 34 */ 0xe28eb4, 0xe28eb5, 0xe28eb6, 0xe28eb7, /* 38 */ 0xe28eb8, 0xe28eb9, - /*** Four byte table, leaf: 8136faxx - offset 0x06bca ***/ + /*** Four byte table, leaf: 8136faxx - offset 0x06c3e ***/ /* 30 */ 0xe28eba, 0xe28ebb, 0xe28ebc, 0xe28ebd, /* 34 */ 0xe28ebe, 0xe28ebf, 0xe28f80, 0xe28f81, /* 38 */ 0xe28f82, 0xe28f83, - /*** Four byte table, leaf: 8136fbxx - offset 0x06bd4 ***/ + /*** Four byte table, leaf: 8136fbxx - offset 0x06c48 ***/ /* 30 */ 0xe28f84, 0xe28f85, 0xe28f86, 0xe28f87, /* 34 */ 0xe28f88, 0xe28f89, 0xe28f8a, 0xe28f8b, /* 38 */ 0xe28f8c, 0xe28f8d, - /*** Four byte table, leaf: 8136fcxx - offset 0x06bde ***/ + /*** Four byte table, leaf: 8136fcxx - offset 0x06c52 ***/ /* 30 */ 0xe28f8e, 0xe28f8f, 0xe28f90, 0xe28f91, /* 34 */ 0xe28f92, 0xe28f93, 0xe28f94, 0xe28f95, /* 38 */ 0xe28f96, 0xe28f97, - /*** Four byte table, leaf: 8136fdxx - offset 0x06be8 ***/ + /*** Four byte table, leaf: 8136fdxx - offset 0x06c5c ***/ /* 30 */ 0xe28f98, 0xe28f99, 0xe28f9a, 0xe28f9b, /* 34 */ 0xe28f9c, 0xe28f9d, 0xe28f9e, 0xe28f9f, /* 38 */ 0xe28fa0, 0xe28fa1, - /*** Four byte table, leaf: 8136fexx - offset 0x06bf2 ***/ + /*** Four byte table, leaf: 8136fexx - offset 0x06c66 ***/ /* 30 */ 0xe28fa2, 0xe28fa3, 0xe28fa4, 0xe28fa5, /* 34 */ 0xe28fa6, 0xe28fa7, 0xe28fa8, 0xe28fa9, /* 38 */ 0xe28faa, 0xe28fab, - /*** Four byte table, leaf: 813781xx - offset 0x06bfc ***/ + /*** Four byte table, leaf: 813781xx - offset 0x06c70 ***/ /* 30 */ 0xe28fac, 0xe28fad, 0xe28fae, 0xe28faf, /* 34 */ 0xe28fb0, 0xe28fb1, 0xe28fb2, 0xe28fb3, /* 38 */ 0xe28fb4, 0xe28fb5, - /*** Four byte table, leaf: 813782xx - offset 0x06c06 ***/ + /*** Four byte table, leaf: 813782xx - offset 0x06c7a ***/ /* 30 */ 0xe28fb6, 0xe28fb7, 0xe28fb8, 0xe28fb9, /* 34 */ 0xe28fba, 0xe28fbb, 0xe28fbc, 0xe28fbd, /* 38 */ 0xe28fbe, 0xe28fbf, - /*** Four byte table, leaf: 813783xx - offset 0x06c10 ***/ + /*** Four byte table, leaf: 813783xx - offset 0x06c84 ***/ /* 30 */ 0xe29080, 0xe29081, 0xe29082, 0xe29083, /* 34 */ 0xe29084, 0xe29085, 0xe29086, 0xe29087, /* 38 */ 0xe29088, 0xe29089, - /*** Four byte table, leaf: 813784xx - offset 0x06c1a ***/ + /*** Four byte table, leaf: 813784xx - offset 0x06c8e ***/ /* 30 */ 0xe2908a, 0xe2908b, 0xe2908c, 0xe2908d, /* 34 */ 0xe2908e, 0xe2908f, 0xe29090, 0xe29091, /* 38 */ 0xe29092, 0xe29093, - /*** Four byte table, leaf: 813785xx - offset 0x06c24 ***/ + /*** Four byte table, leaf: 813785xx - offset 0x06c98 ***/ /* 30 */ 0xe29094, 0xe29095, 0xe29096, 0xe29097, /* 34 */ 0xe29098, 0xe29099, 0xe2909a, 0xe2909b, /* 38 */ 0xe2909c, 0xe2909d, - /*** Four byte table, leaf: 813786xx - offset 0x06c2e ***/ + /*** Four byte table, leaf: 813786xx - offset 0x06ca2 ***/ /* 30 */ 0xe2909e, 0xe2909f, 0xe290a0, 0xe290a1, /* 34 */ 0xe290a2, 0xe290a3, 0xe290a4, 0xe290a5, /* 38 */ 0xe290a6, 0xe290a7, - /*** Four byte table, leaf: 813787xx - offset 0x06c38 ***/ + /*** Four byte table, leaf: 813787xx - offset 0x06cac ***/ /* 30 */ 0xe290a8, 0xe290a9, 0xe290aa, 0xe290ab, /* 34 */ 0xe290ac, 0xe290ad, 0xe290ae, 0xe290af, /* 38 */ 0xe290b0, 0xe290b1, - /*** Four byte table, leaf: 813788xx - offset 0x06c42 
***/ + /*** Four byte table, leaf: 813788xx - offset 0x06cb6 ***/ /* 30 */ 0xe290b2, 0xe290b3, 0xe290b4, 0xe290b5, /* 34 */ 0xe290b6, 0xe290b7, 0xe290b8, 0xe290b9, /* 38 */ 0xe290ba, 0xe290bb, - /*** Four byte table, leaf: 813789xx - offset 0x06c4c ***/ + /*** Four byte table, leaf: 813789xx - offset 0x06cc0 ***/ /* 30 */ 0xe290bc, 0xe290bd, 0xe290be, 0xe290bf, /* 34 */ 0xe29180, 0xe29181, 0xe29182, 0xe29183, /* 38 */ 0xe29184, 0xe29185, - /*** Four byte table, leaf: 81378axx - offset 0x06c56 ***/ + /*** Four byte table, leaf: 81378axx - offset 0x06cca ***/ /* 30 */ 0xe29186, 0xe29187, 0xe29188, 0xe29189, /* 34 */ 0xe2918a, 0xe2918b, 0xe2918c, 0xe2918d, /* 38 */ 0xe2918e, 0xe2918f, - /*** Four byte table, leaf: 81378bxx - offset 0x06c60 ***/ + /*** Four byte table, leaf: 81378bxx - offset 0x06cd4 ***/ /* 30 */ 0xe29190, 0xe29191, 0xe29192, 0xe29193, /* 34 */ 0xe29194, 0xe29195, 0xe29196, 0xe29197, /* 38 */ 0xe29198, 0xe29199, - /*** Four byte table, leaf: 81378cxx - offset 0x06c6a ***/ + /*** Four byte table, leaf: 81378cxx - offset 0x06cde ***/ /* 30 */ 0xe2919a, 0xe2919b, 0xe2919c, 0xe2919d, /* 34 */ 0xe2919e, 0xe2919f, 0xe291aa, 0xe291ab, /* 38 */ 0xe291ac, 0xe291ad, - /*** Four byte table, leaf: 81378dxx - offset 0x06c74 ***/ + /*** Four byte table, leaf: 81378dxx - offset 0x06ce8 ***/ /* 30 */ 0xe291ae, 0xe291af, 0xe291b0, 0xe291b1, /* 34 */ 0xe291b2, 0xe291b3, 0xe2929c, 0xe2929d, /* 38 */ 0xe2929e, 0xe2929f, - /*** Four byte table, leaf: 81378exx - offset 0x06c7e ***/ + /*** Four byte table, leaf: 81378exx - offset 0x06cf2 ***/ /* 30 */ 0xe292a0, 0xe292a1, 0xe292a2, 0xe292a3, /* 34 */ 0xe292a4, 0xe292a5, 0xe292a6, 0xe292a7, /* 38 */ 0xe292a8, 0xe292a9, - /*** Four byte table, leaf: 81378fxx - offset 0x06c88 ***/ + /*** Four byte table, leaf: 81378fxx - offset 0x06cfc ***/ /* 30 */ 0xe292aa, 0xe292ab, 0xe292ac, 0xe292ad, /* 34 */ 0xe292ae, 0xe292af, 0xe292b0, 0xe292b1, /* 38 */ 0xe292b2, 0xe292b3, - /*** Four byte table, leaf: 813790xx - offset 0x06c92 ***/ + /*** Four byte table, leaf: 813790xx - offset 0x06d06 ***/ /* 30 */ 0xe292b4, 0xe292b5, 0xe292b6, 0xe292b7, /* 34 */ 0xe292b8, 0xe292b9, 0xe292ba, 0xe292bb, /* 38 */ 0xe292bc, 0xe292bd, - /*** Four byte table, leaf: 813791xx - offset 0x06c9c ***/ + /*** Four byte table, leaf: 813791xx - offset 0x06d10 ***/ /* 30 */ 0xe292be, 0xe292bf, 0xe29380, 0xe29381, /* 34 */ 0xe29382, 0xe29383, 0xe29384, 0xe29385, /* 38 */ 0xe29386, 0xe29387, - /*** Four byte table, leaf: 813792xx - offset 0x06ca6 ***/ + /*** Four byte table, leaf: 813792xx - offset 0x06d1a ***/ /* 30 */ 0xe29388, 0xe29389, 0xe2938a, 0xe2938b, /* 34 */ 0xe2938c, 0xe2938d, 0xe2938e, 0xe2938f, /* 38 */ 0xe29390, 0xe29391, - /*** Four byte table, leaf: 813793xx - offset 0x06cb0 ***/ + /*** Four byte table, leaf: 813793xx - offset 0x06d24 ***/ /* 30 */ 0xe29392, 0xe29393, 0xe29394, 0xe29395, /* 34 */ 0xe29396, 0xe29397, 0xe29398, 0xe29399, /* 38 */ 0xe2939a, 0xe2939b, - /*** Four byte table, leaf: 813794xx - offset 0x06cba ***/ + /*** Four byte table, leaf: 813794xx - offset 0x06d2e ***/ /* 30 */ 0xe2939c, 0xe2939d, 0xe2939e, 0xe2939f, /* 34 */ 0xe293a0, 0xe293a1, 0xe293a2, 0xe293a3, /* 38 */ 0xe293a4, 0xe293a5, - /*** Four byte table, leaf: 813795xx - offset 0x06cc4 ***/ + /*** Four byte table, leaf: 813795xx - offset 0x06d38 ***/ /* 30 */ 0xe293a6, 0xe293a7, 0xe293a8, 0xe293a9, /* 34 */ 0xe293aa, 0xe293ab, 0xe293ac, 0xe293ad, /* 38 */ 0xe293ae, 0xe293af, - /*** Four byte table, leaf: 813796xx - offset 0x06cce ***/ + /*** Four byte table, leaf: 813796xx - offset 0x06d42 ***/ /* 
30 */ 0xe293b0, 0xe293b1, 0xe293b2, 0xe293b3, /* 34 */ 0xe293b4, 0xe293b5, 0xe293b6, 0xe293b7, /* 38 */ 0xe293b8, 0xe293b9, - /*** Four byte table, leaf: 813797xx - offset 0x06cd8 ***/ + /*** Four byte table, leaf: 813797xx - offset 0x06d4c ***/ /* 30 */ 0xe293ba, 0xe293bb, 0xe293bc, 0xe293bd, /* 34 */ 0xe293be, 0xe293bf, 0xe2958c, 0xe2958d, /* 38 */ 0xe2958e, 0xe2958f, - /*** Four byte table, leaf: 813798xx - offset 0x06ce2 ***/ + /*** Four byte table, leaf: 813798xx - offset 0x06d56 ***/ /* 30 */ 0xe295b4, 0xe295b5, 0xe295b6, 0xe295b7, /* 34 */ 0xe295b8, 0xe295b9, 0xe295ba, 0xe295bb, /* 38 */ 0xe295bc, 0xe295bd, - /*** Four byte table, leaf: 813799xx - offset 0x06cec ***/ + /*** Four byte table, leaf: 813799xx - offset 0x06d60 ***/ /* 30 */ 0xe295be, 0xe295bf, 0xe29680, 0xe29690, /* 34 */ 0xe29691, 0xe29692, 0xe29696, 0xe29697, /* 38 */ 0xe29698, 0xe29699, - /*** Four byte table, leaf: 81379axx - offset 0x06cf6 ***/ + /*** Four byte table, leaf: 81379axx - offset 0x06d6a ***/ /* 30 */ 0xe2969a, 0xe2969b, 0xe2969c, 0xe2969d, /* 34 */ 0xe2969e, 0xe2969f, 0xe296a2, 0xe296a3, /* 38 */ 0xe296a4, 0xe296a5, - /*** Four byte table, leaf: 81379bxx - offset 0x06d00 ***/ + /*** Four byte table, leaf: 81379bxx - offset 0x06d74 ***/ /* 30 */ 0xe296a6, 0xe296a7, 0xe296a8, 0xe296a9, /* 34 */ 0xe296aa, 0xe296ab, 0xe296ac, 0xe296ad, /* 38 */ 0xe296ae, 0xe296af, - /*** Four byte table, leaf: 81379cxx - offset 0x06d0a ***/ + /*** Four byte table, leaf: 81379cxx - offset 0x06d7e ***/ /* 30 */ 0xe296b0, 0xe296b1, 0xe296b4, 0xe296b5, /* 34 */ 0xe296b6, 0xe296b7, 0xe296b8, 0xe296b9, /* 38 */ 0xe296ba, 0xe296bb, - /*** Four byte table, leaf: 81379dxx - offset 0x06d14 ***/ + /*** Four byte table, leaf: 81379dxx - offset 0x06d88 ***/ /* 30 */ 0xe296be, 0xe296bf, 0xe29780, 0xe29781, /* 34 */ 0xe29782, 0xe29783, 0xe29784, 0xe29785, /* 38 */ 0xe29788, 0xe29789, - /*** Four byte table, leaf: 81379exx - offset 0x06d1e ***/ + /*** Four byte table, leaf: 81379exx - offset 0x06d92 ***/ /* 30 */ 0xe2978a, 0xe2978c, 0xe2978d, 0xe29790, /* 34 */ 0xe29791, 0xe29792, 0xe29793, 0xe29794, /* 38 */ 0xe29795, 0xe29796, - /*** Four byte table, leaf: 81379fxx - offset 0x06d28 ***/ + /*** Four byte table, leaf: 81379fxx - offset 0x06d9c ***/ /* 30 */ 0xe29797, 0xe29798, 0xe29799, 0xe2979a, /* 34 */ 0xe2979b, 0xe2979c, 0xe2979d, 0xe2979e, /* 38 */ 0xe2979f, 0xe297a0, - /*** Four byte table, leaf: 8137a0xx - offset 0x06d32 ***/ + /*** Four byte table, leaf: 8137a0xx - offset 0x06da6 ***/ /* 30 */ 0xe297a1, 0xe297a6, 0xe297a7, 0xe297a8, /* 34 */ 0xe297a9, 0xe297aa, 0xe297ab, 0xe297ac, /* 38 */ 0xe297ad, 0xe297ae, - /*** Four byte table, leaf: 8137a1xx - offset 0x06d3c ***/ + /*** Four byte table, leaf: 8137a1xx - offset 0x06db0 ***/ /* 30 */ 0xe297af, 0xe297b0, 0xe297b1, 0xe297b2, /* 34 */ 0xe297b3, 0xe297b4, 0xe297b5, 0xe297b6, /* 38 */ 0xe297b7, 0xe297b8, - /*** Four byte table, leaf: 8137a2xx - offset 0x06d46 ***/ + /*** Four byte table, leaf: 8137a2xx - offset 0x06dba ***/ /* 30 */ 0xe297b9, 0xe297ba, 0xe297bb, 0xe297bc, /* 34 */ 0xe297bd, 0xe297be, 0xe297bf, 0xe29880, /* 38 */ 0xe29881, 0xe29882, - /*** Four byte table, leaf: 8137a3xx - offset 0x06d50 ***/ + /*** Four byte table, leaf: 8137a3xx - offset 0x06dc4 ***/ /* 30 */ 0xe29883, 0xe29884, 0xe29887, 0xe29888, /* 34 */ 0xe2988a, 0xe2988b, 0xe2988c, 0xe2988d, /* 38 */ 0xe2988e, 0xe2988f, - /*** Four byte table, leaf: 8137a4xx - offset 0x06d5a ***/ + /*** Four byte table, leaf: 8137a4xx - offset 0x06dce ***/ /* 30 */ 0xe29890, 0xe29891, 0xe29892, 0xe29893, /* 34 */ 0xe29894, 
0xe29895, 0xe29896, 0xe29897, /* 38 */ 0xe29898, 0xe29899, - /*** Four byte table, leaf: 8137a5xx - offset 0x06d64 ***/ + /*** Four byte table, leaf: 8137a5xx - offset 0x06dd8 ***/ /* 30 */ 0xe2989a, 0xe2989b, 0xe2989c, 0xe2989d, /* 34 */ 0xe2989e, 0xe2989f, 0xe298a0, 0xe298a1, /* 38 */ 0xe298a2, 0xe298a3, - /*** Four byte table, leaf: 8137a6xx - offset 0x06d6e ***/ + /*** Four byte table, leaf: 8137a6xx - offset 0x06de2 ***/ /* 30 */ 0xe298a4, 0xe298a5, 0xe298a6, 0xe298a7, /* 34 */ 0xe298a8, 0xe298a9, 0xe298aa, 0xe298ab, /* 38 */ 0xe298ac, 0xe298ad, - /*** Four byte table, leaf: 8137a7xx - offset 0x06d78 ***/ + /*** Four byte table, leaf: 8137a7xx - offset 0x06dec ***/ /* 30 */ 0xe298ae, 0xe298af, 0xe298b0, 0xe298b1, /* 34 */ 0xe298b2, 0xe298b3, 0xe298b4, 0xe298b5, /* 38 */ 0xe298b6, 0xe298b7, - /*** Four byte table, leaf: 8137a8xx - offset 0x06d82 ***/ + /*** Four byte table, leaf: 8137a8xx - offset 0x06df6 ***/ /* 30 */ 0xe298b8, 0xe298b9, 0xe298ba, 0xe298bb, /* 34 */ 0xe298bc, 0xe298bd, 0xe298be, 0xe298bf, /* 38 */ 0xe29981, /* 1 trailing zero values shared with next segment */ - /*** Four byte table, leaf: 8138fdxx - offset 0x06d8b ***/ + /*** Four byte table, leaf: 8138fdxx - offset 0x06dff ***/ /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 34 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 38 */ 0x000000, 0xe2ba82, - /*** Four byte table, leaf: 8138fexx - offset 0x06d95 ***/ + /*** Four byte table, leaf: 8138fexx - offset 0x06e09 ***/ /* 30 */ 0xe2ba83, 0xe2ba85, 0xe2ba86, 0xe2ba87, /* 34 */ 0xe2ba89, 0xe2ba8a, 0xe2ba8d, 0xe2ba8e, /* 38 */ 0xe2ba8f, 0xe2ba90, - /*** Four byte table, leaf: 813981xx - offset 0x06d9f ***/ + /*** Four byte table, leaf: 813981xx - offset 0x06e13 ***/ /* 30 */ 0xe2ba91, 0xe2ba92, 0xe2ba93, 0xe2ba94, /* 34 */ 0xe2ba95, 0xe2ba96, 0xe2ba98, 0xe2ba99, /* 38 */ 0xe2ba9a, 0xe2ba9b, - /*** Four byte table, leaf: 813982xx - offset 0x06da9 ***/ + /*** Four byte table, leaf: 813982xx - offset 0x06e1d ***/ /* 30 */ 0xe2ba9c, 0xe2ba9d, 0xe2ba9e, 0xe2ba9f, /* 34 */ 0xe2baa0, 0xe2baa1, 0xe2baa2, 0xe2baa3, /* 38 */ 0xe2baa4, 0xe2baa5, - /*** Four byte table, leaf: 813983xx - offset 0x06db3 ***/ + /*** Four byte table, leaf: 813983xx - offset 0x06e27 ***/ /* 30 */ 0xe2baa6, 0xe2baa8, 0xe2baa9, 0xe2baab, /* 34 */ 0xe2baac, 0xe2baad, 0xe2baaf, 0xe2bab0, /* 38 */ 0xe2bab1, 0xe2bab2, - /*** Four byte table, leaf: 813984xx - offset 0x06dbd ***/ + /*** Four byte table, leaf: 813984xx - offset 0x06e31 ***/ /* 30 */ 0xe2bab4, 0xe2bab5, 0xe2bab8, 0xe2bab9, /* 34 */ 0xe2baba, 0xe2babc, 0xe2babd, 0xe2babe, /* 38 */ 0xe2babf, 0xe2bb80, - /*** Four byte table, leaf: 813985xx - offset 0x06dc7 ***/ + /*** Four byte table, leaf: 813985xx - offset 0x06e3b ***/ /* 30 */ 0xe2bb81, 0xe2bb82, 0xe2bb83, 0xe2bb84, /* 34 */ 0xe2bb85, 0xe2bb86, 0xe2bb87, 0xe2bb88, /* 38 */ 0xe2bb89, 0xe2bb8b, - /*** Four byte table, leaf: 813986xx - offset 0x06dd1 ***/ + /*** Four byte table, leaf: 813986xx - offset 0x06e45 ***/ /* 30 */ 0xe2bb8c, 0xe2bb8d, 0xe2bb8e, 0xe2bb8f, /* 34 */ 0xe2bb90, 0xe2bb91, 0xe2bb92, 0xe2bb93, /* 38 */ 0xe2bb94, 0xe2bb95, - /*** Four byte table, leaf: 813987xx - offset 0x06ddb ***/ + /*** Four byte table, leaf: 813987xx - offset 0x06e4f ***/ /* 30 */ 0xe2bb96, 0xe2bb97, 0xe2bb98, 0xe2bb99, /* 34 */ 0xe2bb9a, 0xe2bb9b, 0xe2bb9c, 0xe2bb9d, /* 38 */ 0xe2bb9e, 0xe2bb9f, - /*** Four byte table, leaf: 813988xx - offset 0x06de5 ***/ + /*** Four byte table, leaf: 813988xx - offset 0x06e59 ***/ /* 30 */ 0xe2bba0, 0xe2bba1, 0xe2bba2, 0xe2bba3, /* 34 */ 0xe2bba4, 0xe2bba5, 0xe2bba6, 
0xe2bba7, /* 38 */ 0xe2bba8, 0xe2bba9, - /*** Four byte table, leaf: 813989xx - offset 0x06def ***/ + /*** Four byte table, leaf: 813989xx - offset 0x06e63 ***/ /* 30 */ 0xe2bbaa, 0xe2bbab, 0xe2bbac, 0xe2bbad, /* 34 */ 0xe2bbae, 0xe2bbaf, 0xe2bbb0, 0xe2bbb1, /* 38 */ 0xe2bbb2, 0xe2bbb3, - /*** Four byte table, leaf: 81398axx - offset 0x06df9 ***/ + /*** Four byte table, leaf: 81398axx - offset 0x06e6d ***/ /* 30 */ 0xe2bbb4, 0xe2bbb5, 0xe2bbb6, 0xe2bbb7, /* 34 */ 0xe2bbb8, 0xe2bbb9, 0xe2bbba, 0xe2bbbb, /* 38 */ 0xe2bbbc, 0xe2bbbd, - /*** Four byte table, leaf: 81398bxx - offset 0x06e03 ***/ + /*** Four byte table, leaf: 81398bxx - offset 0x06e77 ***/ /* 30 */ 0xe2bbbe, 0xe2bbbf, 0xe2bc80, 0xe2bc81, /* 34 */ 0xe2bc82, 0xe2bc83, 0xe2bc84, 0xe2bc85, /* 38 */ 0xe2bc86, 0xe2bc87, - /*** Four byte table, leaf: 81398cxx - offset 0x06e0d ***/ + /*** Four byte table, leaf: 81398cxx - offset 0x06e81 ***/ /* 30 */ 0xe2bc88, 0xe2bc89, 0xe2bc8a, 0xe2bc8b, /* 34 */ 0xe2bc8c, 0xe2bc8d, 0xe2bc8e, 0xe2bc8f, /* 38 */ 0xe2bc90, 0xe2bc91, - /*** Four byte table, leaf: 81398dxx - offset 0x06e17 ***/ + /*** Four byte table, leaf: 81398dxx - offset 0x06e8b ***/ /* 30 */ 0xe2bc92, 0xe2bc93, 0xe2bc94, 0xe2bc95, /* 34 */ 0xe2bc96, 0xe2bc97, 0xe2bc98, 0xe2bc99, /* 38 */ 0xe2bc9a, 0xe2bc9b, - /*** Four byte table, leaf: 81398exx - offset 0x06e21 ***/ + /*** Four byte table, leaf: 81398exx - offset 0x06e95 ***/ /* 30 */ 0xe2bc9c, 0xe2bc9d, 0xe2bc9e, 0xe2bc9f, /* 34 */ 0xe2bca0, 0xe2bca1, 0xe2bca2, 0xe2bca3, /* 38 */ 0xe2bca4, 0xe2bca5, - /*** Four byte table, leaf: 81398fxx - offset 0x06e2b ***/ + /*** Four byte table, leaf: 81398fxx - offset 0x06e9f ***/ /* 30 */ 0xe2bca6, 0xe2bca7, 0xe2bca8, 0xe2bca9, /* 34 */ 0xe2bcaa, 0xe2bcab, 0xe2bcac, 0xe2bcad, /* 38 */ 0xe2bcae, 0xe2bcaf, - /*** Four byte table, leaf: 813990xx - offset 0x06e35 ***/ + /*** Four byte table, leaf: 813990xx - offset 0x06ea9 ***/ /* 30 */ 0xe2bcb0, 0xe2bcb1, 0xe2bcb2, 0xe2bcb3, /* 34 */ 0xe2bcb4, 0xe2bcb5, 0xe2bcb6, 0xe2bcb7, /* 38 */ 0xe2bcb8, 0xe2bcb9, - /*** Four byte table, leaf: 813991xx - offset 0x06e3f ***/ + /*** Four byte table, leaf: 813991xx - offset 0x06eb3 ***/ /* 30 */ 0xe2bcba, 0xe2bcbb, 0xe2bcbc, 0xe2bcbd, /* 34 */ 0xe2bcbe, 0xe2bcbf, 0xe2bd80, 0xe2bd81, /* 38 */ 0xe2bd82, 0xe2bd83, - /*** Four byte table, leaf: 813992xx - offset 0x06e49 ***/ + /*** Four byte table, leaf: 813992xx - offset 0x06ebd ***/ /* 30 */ 0xe2bd84, 0xe2bd85, 0xe2bd86, 0xe2bd87, /* 34 */ 0xe2bd88, 0xe2bd89, 0xe2bd8a, 0xe2bd8b, /* 38 */ 0xe2bd8c, 0xe2bd8d, - /*** Four byte table, leaf: 813993xx - offset 0x06e53 ***/ + /*** Four byte table, leaf: 813993xx - offset 0x06ec7 ***/ /* 30 */ 0xe2bd8e, 0xe2bd8f, 0xe2bd90, 0xe2bd91, /* 34 */ 0xe2bd92, 0xe2bd93, 0xe2bd94, 0xe2bd95, /* 38 */ 0xe2bd96, 0xe2bd97, - /*** Four byte table, leaf: 813994xx - offset 0x06e5d ***/ + /*** Four byte table, leaf: 813994xx - offset 0x06ed1 ***/ /* 30 */ 0xe2bd98, 0xe2bd99, 0xe2bd9a, 0xe2bd9b, /* 34 */ 0xe2bd9c, 0xe2bd9d, 0xe2bd9e, 0xe2bd9f, /* 38 */ 0xe2bda0, 0xe2bda1, - /*** Four byte table, leaf: 813995xx - offset 0x06e67 ***/ + /*** Four byte table, leaf: 813995xx - offset 0x06edb ***/ /* 30 */ 0xe2bda2, 0xe2bda3, 0xe2bda4, 0xe2bda5, /* 34 */ 0xe2bda6, 0xe2bda7, 0xe2bda8, 0xe2bda9, /* 38 */ 0xe2bdaa, 0xe2bdab, - /*** Four byte table, leaf: 813996xx - offset 0x06e71 ***/ + /*** Four byte table, leaf: 813996xx - offset 0x06ee5 ***/ /* 30 */ 0xe2bdac, 0xe2bdad, 0xe2bdae, 0xe2bdaf, /* 34 */ 0xe2bdb0, 0xe2bdb1, 0xe2bdb2, 0xe2bdb3, /* 38 */ 0xe2bdb4, 0xe2bdb5, - /*** Four byte table, leaf: 
813997xx - offset 0x06e7b ***/ + /*** Four byte table, leaf: 813997xx - offset 0x06eef ***/ /* 30 */ 0xe2bdb6, 0xe2bdb7, 0xe2bdb8, 0xe2bdb9, /* 34 */ 0xe2bdba, 0xe2bdbb, 0xe2bdbc, 0xe2bdbd, /* 38 */ 0xe2bdbe, 0xe2bdbf, - /*** Four byte table, leaf: 813998xx - offset 0x06e85 ***/ + /*** Four byte table, leaf: 813998xx - offset 0x06ef9 ***/ /* 30 */ 0xe2be80, 0xe2be81, 0xe2be82, 0xe2be83, /* 34 */ 0xe2be84, 0xe2be85, 0xe2be86, 0xe2be87, /* 38 */ 0xe2be88, 0xe2be89, - /*** Four byte table, leaf: 813999xx - offset 0x06e8f ***/ + /*** Four byte table, leaf: 813999xx - offset 0x06f03 ***/ /* 30 */ 0xe2be8a, 0xe2be8b, 0xe2be8c, 0xe2be8d, /* 34 */ 0xe2be8e, 0xe2be8f, 0xe2be90, 0xe2be91, /* 38 */ 0xe2be92, 0xe2be93, - /*** Four byte table, leaf: 81399axx - offset 0x06e99 ***/ + /*** Four byte table, leaf: 81399axx - offset 0x06f0d ***/ /* 30 */ 0xe2be94, 0xe2be95, 0xe2be96, 0xe2be97, /* 34 */ 0xe2be98, 0xe2be99, 0xe2be9a, 0xe2be9b, /* 38 */ 0xe2be9c, 0xe2be9d, - /*** Four byte table, leaf: 81399bxx - offset 0x06ea3 ***/ + /*** Four byte table, leaf: 81399bxx - offset 0x06f17 ***/ /* 30 */ 0xe2be9e, 0xe2be9f, 0xe2bea0, 0xe2bea1, /* 34 */ 0xe2bea2, 0xe2bea3, 0xe2bea4, 0xe2bea5, /* 38 */ 0xe2bea6, 0xe2bea7, - /*** Four byte table, leaf: 81399cxx - offset 0x06ead ***/ + /*** Four byte table, leaf: 81399cxx - offset 0x06f21 ***/ /* 30 */ 0xe2bea8, 0xe2bea9, 0xe2beaa, 0xe2beab, /* 34 */ 0xe2beac, 0xe2bead, 0xe2beae, 0xe2beaf, /* 38 */ 0xe2beb0, 0xe2beb1, - /*** Four byte table, leaf: 81399dxx - offset 0x06eb7 ***/ + /*** Four byte table, leaf: 81399dxx - offset 0x06f2b ***/ /* 30 */ 0xe2beb2, 0xe2beb3, 0xe2beb4, 0xe2beb5, /* 34 */ 0xe2beb6, 0xe2beb7, 0xe2beb8, 0xe2beb9, /* 38 */ 0xe2beba, 0xe2bebb, - /*** Four byte table, leaf: 81399exx - offset 0x06ec1 ***/ + /*** Four byte table, leaf: 81399exx - offset 0x06f35 ***/ /* 30 */ 0xe2bebc, 0xe2bebd, 0xe2bebe, 0xe2bebf, /* 34 */ 0xe2bf80, 0xe2bf81, 0xe2bf82, 0xe2bf83, /* 38 */ 0xe2bf84, 0xe2bf85, - /*** Four byte table, leaf: 81399fxx - offset 0x06ecb ***/ + /*** Four byte table, leaf: 81399fxx - offset 0x06f3f ***/ /* 30 */ 0xe2bf86, 0xe2bf87, 0xe2bf88, 0xe2bf89, /* 34 */ 0xe2bf8a, 0xe2bf8b, 0xe2bf8c, 0xe2bf8d, /* 38 */ 0xe2bf8e, 0xe2bf8f, - /*** Four byte table, leaf: 8139a0xx - offset 0x06ed5 ***/ + /*** Four byte table, leaf: 8139a0xx - offset 0x06f49 ***/ /* 30 */ 0xe2bf90, 0xe2bf91, 0xe2bf92, 0xe2bf93, /* 34 */ 0xe2bf94, 0xe2bf95, 0xe2bf96, 0xe2bf97, /* 38 */ 0xe2bf98, 0xe2bf99, - /*** Four byte table, leaf: 8139a1xx - offset 0x06edf ***/ + /*** Four byte table, leaf: 8139a1xx - offset 0x06f53 ***/ /* 30 */ 0xe2bf9a, 0xe2bf9b, 0xe2bf9c, 0xe2bf9d, /* 34 */ 0xe2bf9e, 0xe2bf9f, 0xe2bfa0, 0xe2bfa1, /* 38 */ 0xe2bfa2, 0xe2bfa3, - /*** Four byte table, leaf: 8139a2xx - offset 0x06ee9 ***/ + /*** Four byte table, leaf: 8139a2xx - offset 0x06f5d ***/ /* 30 */ 0xe2bfa4, 0xe2bfa5, 0xe2bfa6, 0xe2bfa7, /* 34 */ 0xe2bfa8, 0xe2bfa9, 0xe2bfaa, 0xe2bfab, /* 38 */ 0xe2bfac, 0xe2bfad, - /*** Four byte table, leaf: 8139a3xx - offset 0x06ef3 ***/ + /*** Four byte table, leaf: 8139a3xx - offset 0x06f67 ***/ /* 30 */ 0xe2bfae, 0xe2bfaf, 0xe2bfbc, 0xe2bfbd, /* 34 */ 0xe2bfbe, 0xe2bfbf, 0xe38084, 0xe38098, /* 38 */ 0xe38099, 0xe3809a, - /*** Four byte table, leaf: 8139a4xx - offset 0x06efd ***/ + /*** Four byte table, leaf: 8139a4xx - offset 0x06f71 ***/ /* 30 */ 0xe3809b, 0xe3809c, 0xe3809f, 0xe380a0, /* 34 */ 0xe380aa, 0xe380ab, 0xe380ac, 0xe380ad, /* 38 */ 0xe380ae, 0xe380af, - /*** Four byte table, leaf: 8139a5xx - offset 0x06f07 ***/ + /*** Four byte table, leaf: 
8139a5xx - offset 0x06f7b ***/ /* 30 */ 0xe380b0, 0xe380b1, 0xe380b2, 0xe380b3, /* 34 */ 0xe380b4, 0xe380b5, 0xe380b6, 0xe380b7, /* 38 */ 0xe380b8, 0xe380b9, - /*** Four byte table, leaf: 8139a6xx - offset 0x06f11 ***/ + /*** Four byte table, leaf: 8139a6xx - offset 0x06f85 ***/ /* 30 */ 0xe380ba, 0xe380bb, 0xe380bc, 0xe380bd, /* 34 */ 0xe380bf, 0xe38180, 0xe38294, 0xe38295, /* 38 */ 0xe38296, 0xe38297, - /*** Four byte table, leaf: 8139a7xx - offset 0x06f1b ***/ + /*** Four byte table, leaf: 8139a7xx - offset 0x06f8f ***/ /* 30 */ 0xe38298, 0xe38299, 0xe3829a, 0xe3829f, /* 34 */ 0xe382a0, 0xe383b7, 0xe383b8, 0xe383b9, /* 38 */ 0xe383ba, 0xe383bb, - /*** Four byte table, leaf: 8139a8xx - offset 0x06f25 ***/ + /*** Four byte table, leaf: 8139a8xx - offset 0x06f99 ***/ /* 30 */ 0xe383bf, 0xe38480, 0xe38481, 0xe38482, /* 34 */ 0xe38483, 0xe38484, 0xe384aa, 0xe384ab, /* 38 */ 0xe384ac, 0xe384ad, - /*** Four byte table, leaf: 8139a9xx - offset 0x06f2f ***/ + /*** Four byte table, leaf: 8139a9xx - offset 0x06fa3 ***/ /* 30 */ 0xe384ae, 0xe384af, 0xe384b0, 0xe384b1, /* 34 */ 0xe384b2, 0xe384b3, 0xe384b4, 0xe384b5, /* 38 */ 0xe384b6, 0xe384b7, - /*** Four byte table, leaf: 8139aaxx - offset 0x06f39 ***/ + /*** Four byte table, leaf: 8139aaxx - offset 0x06fad ***/ /* 30 */ 0xe384b8, 0xe384b9, 0xe384ba, 0xe384bb, /* 34 */ 0xe384bc, 0xe384bd, 0xe384be, 0xe384bf, /* 38 */ 0xe38580, 0xe38581, - /*** Four byte table, leaf: 8139abxx - offset 0x06f43 ***/ + /*** Four byte table, leaf: 8139abxx - offset 0x06fb7 ***/ /* 30 */ 0xe38582, 0xe38583, 0xe38584, 0xe38585, /* 34 */ 0xe38586, 0xe38587, 0xe38588, 0xe38589, /* 38 */ 0xe3858a, 0xe3858b, - /*** Four byte table, leaf: 8139acxx - offset 0x06f4d ***/ + /*** Four byte table, leaf: 8139acxx - offset 0x06fc1 ***/ /* 30 */ 0xe3858c, 0xe3858d, 0xe3858e, 0xe3858f, /* 34 */ 0xe38590, 0xe38591, 0xe38592, 0xe38593, /* 38 */ 0xe38594, 0xe38595, - /*** Four byte table, leaf: 8139adxx - offset 0x06f57 ***/ + /*** Four byte table, leaf: 8139adxx - offset 0x06fcb ***/ /* 30 */ 0xe38596, 0xe38597, 0xe38598, 0xe38599, /* 34 */ 0xe3859a, 0xe3859b, 0xe3859c, 0xe3859d, /* 38 */ 0xe3859e, 0xe3859f, - /*** Four byte table, leaf: 8139aexx - offset 0x06f61 ***/ + /*** Four byte table, leaf: 8139aexx - offset 0x06fd5 ***/ /* 30 */ 0xe385a0, 0xe385a1, 0xe385a2, 0xe385a3, /* 34 */ 0xe385a4, 0xe385a5, 0xe385a6, 0xe385a7, /* 38 */ 0xe385a8, 0xe385a9, - /*** Four byte table, leaf: 8139afxx - offset 0x06f6b ***/ + /*** Four byte table, leaf: 8139afxx - offset 0x06fdf ***/ /* 30 */ 0xe385aa, 0xe385ab, 0xe385ac, 0xe385ad, /* 34 */ 0xe385ae, 0xe385af, 0xe385b0, 0xe385b1, /* 38 */ 0xe385b2, 0xe385b3, - /*** Four byte table, leaf: 8139b0xx - offset 0x06f75 ***/ + /*** Four byte table, leaf: 8139b0xx - offset 0x06fe9 ***/ /* 30 */ 0xe385b4, 0xe385b5, 0xe385b6, 0xe385b7, /* 34 */ 0xe385b8, 0xe385b9, 0xe385ba, 0xe385bb, /* 38 */ 0xe385bc, 0xe385bd, - /*** Four byte table, leaf: 8139b1xx - offset 0x06f7f ***/ + /*** Four byte table, leaf: 8139b1xx - offset 0x06ff3 ***/ /* 30 */ 0xe385be, 0xe385bf, 0xe38680, 0xe38681, /* 34 */ 0xe38682, 0xe38683, 0xe38684, 0xe38685, /* 38 */ 0xe38686, 0xe38687, - /*** Four byte table, leaf: 8139b2xx - offset 0x06f89 ***/ + /*** Four byte table, leaf: 8139b2xx - offset 0x06ffd ***/ /* 30 */ 0xe38688, 0xe38689, 0xe3868a, 0xe3868b, /* 34 */ 0xe3868c, 0xe3868d, 0xe3868e, 0xe3868f, /* 38 */ 0xe38690, 0xe38691, - /*** Four byte table, leaf: 8139b3xx - offset 0x06f93 ***/ + /*** Four byte table, leaf: 8139b3xx - offset 0x07007 ***/ /* 30 */ 0xe38692, 0xe38693, 
0xe38694, 0xe38695, /* 34 */ 0xe38696, 0xe38697, 0xe38698, 0xe38699, /* 38 */ 0xe3869a, 0xe3869b, - /*** Four byte table, leaf: 8139b4xx - offset 0x06f9d ***/ + /*** Four byte table, leaf: 8139b4xx - offset 0x07011 ***/ /* 30 */ 0xe3869c, 0xe3869d, 0xe3869e, 0xe3869f, /* 34 */ 0xe386a0, 0xe386a1, 0xe386a2, 0xe386a3, /* 38 */ 0xe386a4, 0xe386a5, - /*** Four byte table, leaf: 8139b5xx - offset 0x06fa7 ***/ + /*** Four byte table, leaf: 8139b5xx - offset 0x0701b ***/ /* 30 */ 0xe386a6, 0xe386a7, 0xe386a8, 0xe386a9, /* 34 */ 0xe386aa, 0xe386ab, 0xe386ac, 0xe386ad, /* 38 */ 0xe386ae, 0xe386af, - /*** Four byte table, leaf: 8139b6xx - offset 0x06fb1 ***/ + /*** Four byte table, leaf: 8139b6xx - offset 0x07025 ***/ /* 30 */ 0xe386b0, 0xe386b1, 0xe386b2, 0xe386b3, /* 34 */ 0xe386b4, 0xe386b5, 0xe386b6, 0xe386b7, /* 38 */ 0xe386b8, 0xe386b9, - /*** Four byte table, leaf: 8139b7xx - offset 0x06fbb ***/ + /*** Four byte table, leaf: 8139b7xx - offset 0x0702f ***/ /* 30 */ 0xe386ba, 0xe386bb, 0xe386bc, 0xe386bd, /* 34 */ 0xe386be, 0xe386bf, 0xe38780, 0xe38781, /* 38 */ 0xe38782, 0xe38783, - /*** Four byte table, leaf: 8139b8xx - offset 0x06fc5 ***/ + /*** Four byte table, leaf: 8139b8xx - offset 0x07039 ***/ /* 30 */ 0xe38784, 0xe38785, 0xe38786, 0xe38787, /* 34 */ 0xe38788, 0xe38789, 0xe3878a, 0xe3878b, /* 38 */ 0xe3878c, 0xe3878d, - /*** Four byte table, leaf: 8139b9xx - offset 0x06fcf ***/ + /*** Four byte table, leaf: 8139b9xx - offset 0x07043 ***/ /* 30 */ 0xe3878e, 0xe3878f, 0xe38790, 0xe38791, /* 34 */ 0xe38792, 0xe38793, 0xe38794, 0xe38795, /* 38 */ 0xe38796, 0xe38797, - /*** Four byte table, leaf: 8139baxx - offset 0x06fd9 ***/ + /*** Four byte table, leaf: 8139baxx - offset 0x0704d ***/ /* 30 */ 0xe38798, 0xe38799, 0xe3879a, 0xe3879b, /* 34 */ 0xe3879c, 0xe3879d, 0xe3879e, 0xe3879f, /* 38 */ 0xe387a0, 0xe387a1, - /*** Four byte table, leaf: 8139bbxx - offset 0x06fe3 ***/ + /*** Four byte table, leaf: 8139bbxx - offset 0x07057 ***/ /* 30 */ 0xe387a2, 0xe387a3, 0xe387a4, 0xe387a5, /* 34 */ 0xe387a6, 0xe387a7, 0xe387a8, 0xe387a9, /* 38 */ 0xe387aa, 0xe387ab, - /*** Four byte table, leaf: 8139bcxx - offset 0x06fed ***/ + /*** Four byte table, leaf: 8139bcxx - offset 0x07061 ***/ /* 30 */ 0xe387ac, 0xe387ad, 0xe387ae, 0xe387af, /* 34 */ 0xe387b0, 0xe387b1, 0xe387b2, 0xe387b3, /* 38 */ 0xe387b4, 0xe387b5, - /*** Four byte table, leaf: 8139bdxx - offset 0x06ff7 ***/ + /*** Four byte table, leaf: 8139bdxx - offset 0x0706b ***/ /* 30 */ 0xe387b6, 0xe387b7, 0xe387b8, 0xe387b9, /* 34 */ 0xe387ba, 0xe387bb, 0xe387bc, 0xe387bd, /* 38 */ 0xe387be, 0xe387bf, - /*** Four byte table, leaf: 8139bexx - offset 0x07001 ***/ + /*** Four byte table, leaf: 8139bexx - offset 0x07075 ***/ /* 30 */ 0xe38880, 0xe38881, 0xe38882, 0xe38883, /* 34 */ 0xe38884, 0xe38885, 0xe38886, 0xe38887, /* 38 */ 0xe38888, 0xe38889, - /*** Four byte table, leaf: 8139bfxx - offset 0x0700b ***/ + /*** Four byte table, leaf: 8139bfxx - offset 0x0707f ***/ /* 30 */ 0xe3888a, 0xe3888b, 0xe3888c, 0xe3888d, /* 34 */ 0xe3888e, 0xe3888f, 0xe38890, 0xe38891, /* 38 */ 0xe38892, 0xe38893, - /*** Four byte table, leaf: 8139c0xx - offset 0x07015 ***/ + /*** Four byte table, leaf: 8139c0xx - offset 0x07089 ***/ /* 30 */ 0xe38894, 0xe38895, 0xe38896, 0xe38897, /* 34 */ 0xe38898, 0xe38899, 0xe3889a, 0xe3889b, /* 38 */ 0xe3889c, 0xe3889d, - /*** Four byte table, leaf: 8139c1xx - offset 0x0701f ***/ + /*** Four byte table, leaf: 8139c1xx - offset 0x07093 ***/ /* 30 */ 0xe3889e, 0xe3889f, 0xe388aa, 0xe388ab, /* 34 */ 0xe388ac, 0xe388ad, 0xe388ae, 0xe388af, 
/* 38 */ 0xe388b0, 0xe388b2, - /*** Four byte table, leaf: 8139c2xx - offset 0x07029 ***/ + /*** Four byte table, leaf: 8139c2xx - offset 0x0709d ***/ /* 30 */ 0xe388b3, 0xe388b4, 0xe388b5, 0xe388b6, /* 34 */ 0xe388b7, 0xe388b8, 0xe388b9, 0xe388ba, /* 38 */ 0xe388bb, 0xe388bc, - /*** Four byte table, leaf: 8139c3xx - offset 0x07033 ***/ + /*** Four byte table, leaf: 8139c3xx - offset 0x070a7 ***/ /* 30 */ 0xe388bd, 0xe388be, 0xe388bf, 0xe38980, /* 34 */ 0xe38981, 0xe38982, 0xe38983, 0xe38984, /* 38 */ 0xe38985, 0xe38986, - /*** Four byte table, leaf: 8139c4xx - offset 0x0703d ***/ + /*** Four byte table, leaf: 8139c4xx - offset 0x070b1 ***/ /* 30 */ 0xe38987, 0xe38988, 0xe38989, 0xe3898a, /* 34 */ 0xe3898b, 0xe3898c, 0xe3898d, 0xe3898e, /* 38 */ 0xe3898f, 0xe38990, - /*** Four byte table, leaf: 8139c5xx - offset 0x07047 ***/ + /*** Four byte table, leaf: 8139c5xx - offset 0x070bb ***/ /* 30 */ 0xe38991, 0xe38992, 0xe38993, 0xe38994, /* 34 */ 0xe38995, 0xe38996, 0xe38997, 0xe38998, /* 38 */ 0xe38999, 0xe3899a, - /*** Four byte table, leaf: 8139c6xx - offset 0x07051 ***/ + /*** Four byte table, leaf: 8139c6xx - offset 0x070c5 ***/ /* 30 */ 0xe3899b, 0xe3899c, 0xe3899d, 0xe3899e, /* 34 */ 0xe3899f, 0xe389a0, 0xe389a1, 0xe389a2, /* 38 */ 0xe389a3, 0xe389a4, - /*** Four byte table, leaf: 8139c7xx - offset 0x0705b ***/ + /*** Four byte table, leaf: 8139c7xx - offset 0x070cf ***/ /* 30 */ 0xe389a5, 0xe389a6, 0xe389a7, 0xe389a8, /* 34 */ 0xe389a9, 0xe389aa, 0xe389ab, 0xe389ac, /* 38 */ 0xe389ad, 0xe389ae, - /*** Four byte table, leaf: 8139c8xx - offset 0x07065 ***/ + /*** Four byte table, leaf: 8139c8xx - offset 0x070d9 ***/ /* 30 */ 0xe389af, 0xe389b0, 0xe389b1, 0xe389b2, /* 34 */ 0xe389b3, 0xe389b4, 0xe389b5, 0xe389b6, /* 38 */ 0xe389b7, 0xe389b8, - /*** Four byte table, leaf: 8139c9xx - offset 0x0706f ***/ + /*** Four byte table, leaf: 8139c9xx - offset 0x070e3 ***/ /* 30 */ 0xe389b9, 0xe389ba, 0xe389bb, 0xe389bc, /* 34 */ 0xe389bd, 0xe389be, 0xe389bf, 0xe38a80, /* 38 */ 0xe38a81, 0xe38a82, - /*** Four byte table, leaf: 8139caxx - offset 0x07079 ***/ + /*** Four byte table, leaf: 8139caxx - offset 0x070ed ***/ /* 30 */ 0xe38a83, 0xe38a84, 0xe38a85, 0xe38a86, /* 34 */ 0xe38a87, 0xe38a88, 0xe38a89, 0xe38a8a, /* 38 */ 0xe38a8b, 0xe38a8c, - /*** Four byte table, leaf: 8139cbxx - offset 0x07083 ***/ + /*** Four byte table, leaf: 8139cbxx - offset 0x070f7 ***/ /* 30 */ 0xe38a8d, 0xe38a8e, 0xe38a8f, 0xe38a90, /* 34 */ 0xe38a91, 0xe38a92, 0xe38a93, 0xe38a94, /* 38 */ 0xe38a95, 0xe38a96, - /*** Four byte table, leaf: 8139ccxx - offset 0x0708d ***/ + /*** Four byte table, leaf: 8139ccxx - offset 0x07101 ***/ /* 30 */ 0xe38a97, 0xe38a98, 0xe38a99, 0xe38a9a, /* 34 */ 0xe38a9b, 0xe38a9c, 0xe38a9d, 0xe38a9e, /* 38 */ 0xe38a9f, 0xe38aa0, - /*** Four byte table, leaf: 8139cdxx - offset 0x07097 ***/ + /*** Four byte table, leaf: 8139cdxx - offset 0x0710b ***/ /* 30 */ 0xe38aa1, 0xe38aa2, 0xe38aa4, 0xe38aa5, /* 34 */ 0xe38aa6, 0xe38aa7, 0xe38aa8, 0xe38aa9, /* 38 */ 0xe38aaa, 0xe38aab, - /*** Four byte table, leaf: 8139cexx - offset 0x070a1 ***/ + /*** Four byte table, leaf: 8139cexx - offset 0x07115 ***/ /* 30 */ 0xe38aac, 0xe38aad, 0xe38aae, 0xe38aaf, /* 34 */ 0xe38ab0, 0xe38ab1, 0xe38ab2, 0xe38ab3, /* 38 */ 0xe38ab4, 0xe38ab5, - /*** Four byte table, leaf: 8139cfxx - offset 0x070ab ***/ + /*** Four byte table, leaf: 8139cfxx - offset 0x0711f ***/ /* 30 */ 0xe38ab6, 0xe38ab7, 0xe38ab8, 0xe38ab9, /* 34 */ 0xe38aba, 0xe38abb, 0xe38abc, 0xe38abd, /* 38 */ 0xe38abe, 0xe38abf, - /*** Four byte table, leaf: 8139d0xx 
- offset 0x070b5 ***/ + /*** Four byte table, leaf: 8139d0xx - offset 0x07129 ***/ /* 30 */ 0xe38b80, 0xe38b81, 0xe38b82, 0xe38b83, /* 34 */ 0xe38b84, 0xe38b85, 0xe38b86, 0xe38b87, /* 38 */ 0xe38b88, 0xe38b89, - /*** Four byte table, leaf: 8139d1xx - offset 0x070bf ***/ + /*** Four byte table, leaf: 8139d1xx - offset 0x07133 ***/ /* 30 */ 0xe38b8a, 0xe38b8b, 0xe38b8c, 0xe38b8d, /* 34 */ 0xe38b8e, 0xe38b8f, 0xe38b90, 0xe38b91, /* 38 */ 0xe38b92, 0xe38b93, - /*** Four byte table, leaf: 8139d2xx - offset 0x070c9 ***/ + /*** Four byte table, leaf: 8139d2xx - offset 0x0713d ***/ /* 30 */ 0xe38b94, 0xe38b95, 0xe38b96, 0xe38b97, /* 34 */ 0xe38b98, 0xe38b99, 0xe38b9a, 0xe38b9b, /* 38 */ 0xe38b9c, 0xe38b9d, - /*** Four byte table, leaf: 8139d3xx - offset 0x070d3 ***/ + /*** Four byte table, leaf: 8139d3xx - offset 0x07147 ***/ /* 30 */ 0xe38b9e, 0xe38b9f, 0xe38ba0, 0xe38ba1, /* 34 */ 0xe38ba2, 0xe38ba3, 0xe38ba4, 0xe38ba5, /* 38 */ 0xe38ba6, 0xe38ba7, - /*** Four byte table, leaf: 8139d4xx - offset 0x070dd ***/ + /*** Four byte table, leaf: 8139d4xx - offset 0x07151 ***/ /* 30 */ 0xe38ba8, 0xe38ba9, 0xe38baa, 0xe38bab, /* 34 */ 0xe38bac, 0xe38bad, 0xe38bae, 0xe38baf, /* 38 */ 0xe38bb0, 0xe38bb1, - /*** Four byte table, leaf: 8139d5xx - offset 0x070e7 ***/ + /*** Four byte table, leaf: 8139d5xx - offset 0x0715b ***/ /* 30 */ 0xe38bb2, 0xe38bb3, 0xe38bb4, 0xe38bb5, /* 34 */ 0xe38bb6, 0xe38bb7, 0xe38bb8, 0xe38bb9, /* 38 */ 0xe38bba, 0xe38bbb, - /*** Four byte table, leaf: 8139d6xx - offset 0x070f1 ***/ + /*** Four byte table, leaf: 8139d6xx - offset 0x07165 ***/ /* 30 */ 0xe38bbc, 0xe38bbd, 0xe38bbe, 0xe38bbf, /* 34 */ 0xe38c80, 0xe38c81, 0xe38c82, 0xe38c83, /* 38 */ 0xe38c84, 0xe38c85, - /*** Four byte table, leaf: 8139d7xx - offset 0x070fb ***/ + /*** Four byte table, leaf: 8139d7xx - offset 0x0716f ***/ /* 30 */ 0xe38c86, 0xe38c87, 0xe38c88, 0xe38c89, /* 34 */ 0xe38c8a, 0xe38c8b, 0xe38c8c, 0xe38c8d, /* 38 */ 0xe38c8e, 0xe38c8f, - /*** Four byte table, leaf: 8139d8xx - offset 0x07105 ***/ + /*** Four byte table, leaf: 8139d8xx - offset 0x07179 ***/ /* 30 */ 0xe38c90, 0xe38c91, 0xe38c92, 0xe38c93, /* 34 */ 0xe38c94, 0xe38c95, 0xe38c96, 0xe38c97, /* 38 */ 0xe38c98, 0xe38c99, - /*** Four byte table, leaf: 8139d9xx - offset 0x0710f ***/ + /*** Four byte table, leaf: 8139d9xx - offset 0x07183 ***/ /* 30 */ 0xe38c9a, 0xe38c9b, 0xe38c9c, 0xe38c9d, /* 34 */ 0xe38c9e, 0xe38c9f, 0xe38ca0, 0xe38ca1, /* 38 */ 0xe38ca2, 0xe38ca3, - /*** Four byte table, leaf: 8139daxx - offset 0x07119 ***/ + /*** Four byte table, leaf: 8139daxx - offset 0x0718d ***/ /* 30 */ 0xe38ca4, 0xe38ca5, 0xe38ca6, 0xe38ca7, /* 34 */ 0xe38ca8, 0xe38ca9, 0xe38caa, 0xe38cab, /* 38 */ 0xe38cac, 0xe38cad, - /*** Four byte table, leaf: 8139dbxx - offset 0x07123 ***/ + /*** Four byte table, leaf: 8139dbxx - offset 0x07197 ***/ /* 30 */ 0xe38cae, 0xe38caf, 0xe38cb0, 0xe38cb1, /* 34 */ 0xe38cb2, 0xe38cb3, 0xe38cb4, 0xe38cb5, /* 38 */ 0xe38cb6, 0xe38cb7, - /*** Four byte table, leaf: 8139dcxx - offset 0x0712d ***/ + /*** Four byte table, leaf: 8139dcxx - offset 0x071a1 ***/ /* 30 */ 0xe38cb8, 0xe38cb9, 0xe38cba, 0xe38cbb, /* 34 */ 0xe38cbc, 0xe38cbd, 0xe38cbe, 0xe38cbf, /* 38 */ 0xe38d80, 0xe38d81, - /*** Four byte table, leaf: 8139ddxx - offset 0x07137 ***/ + /*** Four byte table, leaf: 8139ddxx - offset 0x071ab ***/ /* 30 */ 0xe38d82, 0xe38d83, 0xe38d84, 0xe38d85, /* 34 */ 0xe38d86, 0xe38d87, 0xe38d88, 0xe38d89, /* 38 */ 0xe38d8a, 0xe38d8b, - /*** Four byte table, leaf: 8139dexx - offset 0x07141 ***/ + /*** Four byte table, leaf: 8139dexx - 
offset 0x071b5 ***/ /* 30 */ 0xe38d8c, 0xe38d8d, 0xe38d8e, 0xe38d8f, /* 34 */ 0xe38d90, 0xe38d91, 0xe38d92, 0xe38d93, /* 38 */ 0xe38d94, 0xe38d95, - /*** Four byte table, leaf: 8139dfxx - offset 0x0714b ***/ + /*** Four byte table, leaf: 8139dfxx - offset 0x071bf ***/ /* 30 */ 0xe38d96, 0xe38d97, 0xe38d98, 0xe38d99, /* 34 */ 0xe38d9a, 0xe38d9b, 0xe38d9c, 0xe38d9d, /* 38 */ 0xe38d9e, 0xe38d9f, - /*** Four byte table, leaf: 8139e0xx - offset 0x07155 ***/ + /*** Four byte table, leaf: 8139e0xx - offset 0x071c9 ***/ /* 30 */ 0xe38da0, 0xe38da1, 0xe38da2, 0xe38da3, /* 34 */ 0xe38da4, 0xe38da5, 0xe38da6, 0xe38da7, /* 38 */ 0xe38da8, 0xe38da9, - /*** Four byte table, leaf: 8139e1xx - offset 0x0715f ***/ + /*** Four byte table, leaf: 8139e1xx - offset 0x071d3 ***/ /* 30 */ 0xe38daa, 0xe38dab, 0xe38dac, 0xe38dad, /* 34 */ 0xe38dae, 0xe38daf, 0xe38db0, 0xe38db1, /* 38 */ 0xe38db2, 0xe38db3, - /*** Four byte table, leaf: 8139e2xx - offset 0x07169 ***/ + /*** Four byte table, leaf: 8139e2xx - offset 0x071dd ***/ /* 30 */ 0xe38db4, 0xe38db5, 0xe38db6, 0xe38db7, /* 34 */ 0xe38db8, 0xe38db9, 0xe38dba, 0xe38dbb, /* 38 */ 0xe38dbc, 0xe38dbd, - /*** Four byte table, leaf: 8139e3xx - offset 0x07173 ***/ + /*** Four byte table, leaf: 8139e3xx - offset 0x071e7 ***/ /* 30 */ 0xe38dbe, 0xe38dbf, 0xe38e80, 0xe38e81, /* 34 */ 0xe38e82, 0xe38e83, 0xe38e84, 0xe38e85, /* 38 */ 0xe38e86, 0xe38e87, - /*** Four byte table, leaf: 8139e4xx - offset 0x0717d ***/ + /*** Four byte table, leaf: 8139e4xx - offset 0x071f1 ***/ /* 30 */ 0xe38e88, 0xe38e89, 0xe38e8a, 0xe38e8b, /* 34 */ 0xe38e8c, 0xe38e8d, 0xe38e90, 0xe38e91, /* 38 */ 0xe38e92, 0xe38e93, - /*** Four byte table, leaf: 8139e5xx - offset 0x07187 ***/ + /*** Four byte table, leaf: 8139e5xx - offset 0x071fb ***/ /* 30 */ 0xe38e94, 0xe38e95, 0xe38e96, 0xe38e97, /* 34 */ 0xe38e98, 0xe38e99, 0xe38e9a, 0xe38e9b, /* 38 */ 0xe38e9f, 0xe38ea0, - /*** Four byte table, leaf: 8139e6xx - offset 0x07191 ***/ + /*** Four byte table, leaf: 8139e6xx - offset 0x07205 ***/ /* 30 */ 0xe38ea2, 0xe38ea3, 0xe38ea4, 0xe38ea5, /* 34 */ 0xe38ea6, 0xe38ea7, 0xe38ea8, 0xe38ea9, /* 38 */ 0xe38eaa, 0xe38eab, - /*** Four byte table, leaf: 8139e7xx - offset 0x0719b ***/ + /*** Four byte table, leaf: 8139e7xx - offset 0x0720f ***/ /* 30 */ 0xe38eac, 0xe38ead, 0xe38eae, 0xe38eaf, /* 34 */ 0xe38eb0, 0xe38eb1, 0xe38eb2, 0xe38eb3, /* 38 */ 0xe38eb4, 0xe38eb5, - /*** Four byte table, leaf: 8139e8xx - offset 0x071a5 ***/ + /*** Four byte table, leaf: 8139e8xx - offset 0x07219 ***/ /* 30 */ 0xe38eb6, 0xe38eb7, 0xe38eb8, 0xe38eb9, /* 34 */ 0xe38eba, 0xe38ebb, 0xe38ebc, 0xe38ebd, /* 38 */ 0xe38ebe, 0xe38ebf, - /*** Four byte table, leaf: 8139e9xx - offset 0x071af ***/ + /*** Four byte table, leaf: 8139e9xx - offset 0x07223 ***/ /* 30 */ 0xe38f80, 0xe38f81, 0xe38f82, 0xe38f83, /* 34 */ 0xe38f85, 0xe38f86, 0xe38f87, 0xe38f88, /* 38 */ 0xe38f89, 0xe38f8a, - /*** Four byte table, leaf: 8139eaxx - offset 0x071b9 ***/ + /*** Four byte table, leaf: 8139eaxx - offset 0x0722d ***/ /* 30 */ 0xe38f8b, 0xe38f8c, 0xe38f8d, 0xe38f8f, /* 34 */ 0xe38f90, 0xe38f93, 0xe38f94, 0xe38f96, /* 38 */ 0xe38f97, 0xe38f98, - /*** Four byte table, leaf: 8139ebxx - offset 0x071c3 ***/ + /*** Four byte table, leaf: 8139ebxx - offset 0x07237 ***/ /* 30 */ 0xe38f99, 0xe38f9a, 0xe38f9b, 0xe38f9c, /* 34 */ 0xe38f9d, 0xe38f9e, 0xe38f9f, 0xe38fa0, /* 38 */ 0xe38fa1, 0xe38fa2, - /*** Four byte table, leaf: 8139ecxx - offset 0x071cd ***/ + /*** Four byte table, leaf: 8139ecxx - offset 0x07241 ***/ /* 30 */ 0xe38fa3, 0xe38fa4, 0xe38fa5, 0xe38fa6, 
/* 34 */ 0xe38fa7, 0xe38fa8, 0xe38fa9, 0xe38faa, /* 38 */ 0xe38fab, 0xe38fac, - /*** Four byte table, leaf: 8139edxx - offset 0x071d7 ***/ + /*** Four byte table, leaf: 8139edxx - offset 0x0724b ***/ /* 30 */ 0xe38fad, 0xe38fae, 0xe38faf, 0xe38fb0, /* 34 */ 0xe38fb1, 0xe38fb2, 0xe38fb3, 0xe38fb4, /* 38 */ 0xe38fb5, 0xe38fb6, - /*** Four byte table, leaf: 8139eexx - offset 0x071e1 ***/ + /*** Four byte table, leaf: 8139eexx - offset 0x07255 ***/ /* 30 */ 0xe38fb7, 0xe38fb8, 0xe38fb9, 0xe38fba, /* 34 */ 0xe38fbb, 0xe38fbc, 0xe38fbd, 0xe38fbe, /* 38 */ 0xe38fbf, 0xe39080, - /*** Four byte table, leaf: 8139efxx - offset 0x071eb ***/ + /*** Four byte table, leaf: 8139efxx - offset 0x0725f ***/ /* 30 */ 0xe39081, 0xe39082, 0xe39083, 0xe39084, /* 34 */ 0xe39085, 0xe39086, 0xe39087, 0xe39088, /* 38 */ 0xe39089, 0xe3908a, - /*** Four byte table, leaf: 8139f0xx - offset 0x071f5 ***/ + /*** Four byte table, leaf: 8139f0xx - offset 0x07269 ***/ /* 30 */ 0xe3908b, 0xe3908c, 0xe3908d, 0xe3908e, /* 34 */ 0xe3908f, 0xe39090, 0xe39091, 0xe39092, /* 38 */ 0xe39093, 0xe39094, - /*** Four byte table, leaf: 8139f1xx - offset 0x071ff ***/ + /*** Four byte table, leaf: 8139f1xx - offset 0x07273 ***/ /* 30 */ 0xe39095, 0xe39096, 0xe39097, 0xe39098, /* 34 */ 0xe39099, 0xe3909a, 0xe3909b, 0xe3909c, /* 38 */ 0xe3909d, 0xe3909e, - /*** Four byte table, leaf: 8139f2xx - offset 0x07209 ***/ + /*** Four byte table, leaf: 8139f2xx - offset 0x0727d ***/ /* 30 */ 0xe3909f, 0xe390a0, 0xe390a1, 0xe390a2, /* 34 */ 0xe390a3, 0xe390a4, 0xe390a5, 0xe390a6, /* 38 */ 0xe390a7, 0xe390a8, - /*** Four byte table, leaf: 8139f3xx - offset 0x07213 ***/ + /*** Four byte table, leaf: 8139f3xx - offset 0x07287 ***/ /* 30 */ 0xe390a9, 0xe390aa, 0xe390ab, 0xe390ac, /* 34 */ 0xe390ad, 0xe390ae, 0xe390af, 0xe390b0, /* 38 */ 0xe390b1, 0xe390b2, - /*** Four byte table, leaf: 8139f4xx - offset 0x0721d ***/ + /*** Four byte table, leaf: 8139f4xx - offset 0x07291 ***/ /* 30 */ 0xe390b3, 0xe390b4, 0xe390b5, 0xe390b6, /* 34 */ 0xe390b7, 0xe390b8, 0xe390b9, 0xe390ba, /* 38 */ 0xe390bb, 0xe390bc, - /*** Four byte table, leaf: 8139f5xx - offset 0x07227 ***/ + /*** Four byte table, leaf: 8139f5xx - offset 0x0729b ***/ /* 30 */ 0xe390bd, 0xe390be, 0xe390bf, 0xe39180, /* 34 */ 0xe39181, 0xe39182, 0xe39183, 0xe39184, /* 38 */ 0xe39185, 0xe39186, - /*** Four byte table, leaf: 8139f6xx - offset 0x07231 ***/ + /*** Four byte table, leaf: 8139f6xx - offset 0x072a5 ***/ /* 30 */ 0xe39188, 0xe39189, 0xe3918a, 0xe3918b, /* 34 */ 0xe3918c, 0xe3918d, 0xe3918e, 0xe3918f, /* 38 */ 0xe39190, 0xe39191, - /*** Four byte table, leaf: 8139f7xx - offset 0x0723b ***/ + /*** Four byte table, leaf: 8139f7xx - offset 0x072af ***/ /* 30 */ 0xe39192, 0xe39193, 0xe39194, 0xe39195, /* 34 */ 0xe39196, 0xe39197, 0xe39198, 0xe39199, /* 38 */ 0xe3919a, 0xe3919b, - /*** Four byte table, leaf: 8139f8xx - offset 0x07245 ***/ + /*** Four byte table, leaf: 8139f8xx - offset 0x072b9 ***/ /* 30 */ 0xe3919c, 0xe3919d, 0xe3919e, 0xe3919f, /* 34 */ 0xe391a0, 0xe391a1, 0xe391a2, 0xe391a3, /* 38 */ 0xe391a4, 0xe391a5, - /*** Four byte table, leaf: 8139f9xx - offset 0x0724f ***/ + /*** Four byte table, leaf: 8139f9xx - offset 0x072c3 ***/ /* 30 */ 0xe391a6, 0xe391a7, 0xe391a8, 0xe391a9, /* 34 */ 0xe391aa, 0xe391ab, 0xe391ac, 0xe391ad, /* 38 */ 0xe391ae, 0xe391af, - /*** Four byte table, leaf: 8139faxx - offset 0x07259 ***/ + /*** Four byte table, leaf: 8139faxx - offset 0x072cd ***/ /* 30 */ 0xe391b0, 0xe391b1, 0xe391b2, 0xe391b4, /* 34 */ 0xe391b5, 0xe391b6, 0xe391b7, 0xe391b8, /* 38 */ 0xe391b9, 
0xe391ba, - /*** Four byte table, leaf: 8139fbxx - offset 0x07263 ***/ + /*** Four byte table, leaf: 8139fbxx - offset 0x072d7 ***/ /* 30 */ 0xe391bb, 0xe391bc, 0xe391bd, 0xe391be, /* 34 */ 0xe391bf, 0xe39280, 0xe39281, 0xe39282, /* 38 */ 0xe39283, 0xe39284, - /*** Four byte table, leaf: 8139fcxx - offset 0x0726d ***/ + /*** Four byte table, leaf: 8139fcxx - offset 0x072e1 ***/ /* 30 */ 0xe39285, 0xe39286, 0xe39287, 0xe39288, /* 34 */ 0xe39289, 0xe3928a, 0xe3928b, 0xe3928c, /* 38 */ 0xe3928d, 0xe3928e, - /*** Four byte table, leaf: 8139fdxx - offset 0x07277 ***/ + /*** Four byte table, leaf: 8139fdxx - offset 0x072eb ***/ /* 30 */ 0xe3928f, 0xe39290, 0xe39291, 0xe39292, /* 34 */ 0xe39293, 0xe39294, 0xe39295, 0xe39296, /* 38 */ 0xe39297, 0xe39298, - /*** Four byte table, leaf: 8139fexx - offset 0x07281 ***/ + /*** Four byte table, leaf: 8139fexx - offset 0x072f5 ***/ /* 30 */ 0xe39299, 0xe3929a, 0xe3929b, 0xe3929c, /* 34 */ 0xe3929d, 0xe3929e, 0xe3929f, 0xe392a0, /* 38 */ 0xe392a1, 0xe392a2, - /*** Four byte table, leaf: 823081xx - offset 0x0728b ***/ + /*** Four byte table, leaf: 823081xx - offset 0x072ff ***/ /* 30 */ 0xe392a3, 0xe392a4, 0xe392a5, 0xe392a6, /* 34 */ 0xe392a7, 0xe392a8, 0xe392a9, 0xe392aa, /* 38 */ 0xe392ab, 0xe392ac, - /*** Four byte table, leaf: 823082xx - offset 0x07295 ***/ + /*** Four byte table, leaf: 823082xx - offset 0x07309 ***/ /* 30 */ 0xe392ad, 0xe392ae, 0xe392af, 0xe392b0, /* 34 */ 0xe392b1, 0xe392b2, 0xe392b3, 0xe392b4, /* 38 */ 0xe392b5, 0xe392b6, - /*** Four byte table, leaf: 823083xx - offset 0x0729f ***/ + /*** Four byte table, leaf: 823083xx - offset 0x07313 ***/ /* 30 */ 0xe392b7, 0xe392b8, 0xe392b9, 0xe392ba, /* 34 */ 0xe392bb, 0xe392bc, 0xe392bd, 0xe392be, /* 38 */ 0xe392bf, 0xe39380, - /*** Four byte table, leaf: 823084xx - offset 0x072a9 ***/ + /*** Four byte table, leaf: 823084xx - offset 0x0731d ***/ /* 30 */ 0xe39381, 0xe39382, 0xe39383, 0xe39384, /* 34 */ 0xe39385, 0xe39386, 0xe39387, 0xe39388, /* 38 */ 0xe39389, 0xe3938a, - /*** Four byte table, leaf: 823085xx - offset 0x072b3 ***/ + /*** Four byte table, leaf: 823085xx - offset 0x07327 ***/ /* 30 */ 0xe3938b, 0xe3938c, 0xe3938d, 0xe3938e, /* 34 */ 0xe3938f, 0xe39390, 0xe39391, 0xe39392, /* 38 */ 0xe39393, 0xe39394, - /*** Four byte table, leaf: 823086xx - offset 0x072bd ***/ + /*** Four byte table, leaf: 823086xx - offset 0x07331 ***/ /* 30 */ 0xe39395, 0xe39396, 0xe39397, 0xe39398, /* 34 */ 0xe39399, 0xe3939a, 0xe3939b, 0xe3939c, /* 38 */ 0xe3939d, 0xe3939e, - /*** Four byte table, leaf: 823087xx - offset 0x072c7 ***/ + /*** Four byte table, leaf: 823087xx - offset 0x0733b ***/ /* 30 */ 0xe3939f, 0xe393a0, 0xe393a1, 0xe393a2, /* 34 */ 0xe393a3, 0xe393a4, 0xe393a5, 0xe393a6, /* 38 */ 0xe393a7, 0xe393a8, - /*** Four byte table, leaf: 823088xx - offset 0x072d1 ***/ + /*** Four byte table, leaf: 823088xx - offset 0x07345 ***/ /* 30 */ 0xe393a9, 0xe393aa, 0xe393ab, 0xe393ac, /* 34 */ 0xe393ad, 0xe393ae, 0xe393af, 0xe393b0, /* 38 */ 0xe393b1, 0xe393b2, - /*** Four byte table, leaf: 823089xx - offset 0x072db ***/ + /*** Four byte table, leaf: 823089xx - offset 0x0734f ***/ /* 30 */ 0xe393b3, 0xe393b4, 0xe393b5, 0xe393b6, /* 34 */ 0xe393b7, 0xe393b8, 0xe393b9, 0xe393ba, /* 38 */ 0xe393bb, 0xe393bc, - /*** Four byte table, leaf: 82308axx - offset 0x072e5 ***/ + /*** Four byte table, leaf: 82308axx - offset 0x07359 ***/ /* 30 */ 0xe393bd, 0xe393be, 0xe393bf, 0xe39480, /* 34 */ 0xe39481, 0xe39482, 0xe39483, 0xe39484, /* 38 */ 0xe39485, 0xe39486, - /*** Four byte table, leaf: 82308bxx - offset 0x072ef 
***/ + /*** Four byte table, leaf: 82308bxx - offset 0x07363 ***/ /* 30 */ 0xe39487, 0xe39488, 0xe39489, 0xe3948a, /* 34 */ 0xe3948b, 0xe3948c, 0xe3948d, 0xe3948e, /* 38 */ 0xe3948f, 0xe39490, - /*** Four byte table, leaf: 82308cxx - offset 0x072f9 ***/ + /*** Four byte table, leaf: 82308cxx - offset 0x0736d ***/ /* 30 */ 0xe39491, 0xe39492, 0xe39493, 0xe39494, /* 34 */ 0xe39495, 0xe39496, 0xe39497, 0xe39498, /* 38 */ 0xe39499, 0xe3949a, - /*** Four byte table, leaf: 82308dxx - offset 0x07303 ***/ + /*** Four byte table, leaf: 82308dxx - offset 0x07377 ***/ /* 30 */ 0xe3949b, 0xe3949c, 0xe3949d, 0xe3949e, /* 34 */ 0xe3949f, 0xe394a0, 0xe394a1, 0xe394a2, /* 38 */ 0xe394a3, 0xe394a4, - /*** Four byte table, leaf: 82308exx - offset 0x0730d ***/ + /*** Four byte table, leaf: 82308exx - offset 0x07381 ***/ /* 30 */ 0xe394a5, 0xe394a6, 0xe394a7, 0xe394a8, /* 34 */ 0xe394a9, 0xe394aa, 0xe394ab, 0xe394ac, /* 38 */ 0xe394ad, 0xe394ae, - /*** Four byte table, leaf: 82308fxx - offset 0x07317 ***/ + /*** Four byte table, leaf: 82308fxx - offset 0x0738b ***/ /* 30 */ 0xe394af, 0xe394b0, 0xe394b1, 0xe394b2, /* 34 */ 0xe394b3, 0xe394b4, 0xe394b5, 0xe394b6, /* 38 */ 0xe394b7, 0xe394b8, - /*** Four byte table, leaf: 823090xx - offset 0x07321 ***/ + /*** Four byte table, leaf: 823090xx - offset 0x07395 ***/ /* 30 */ 0xe394b9, 0xe394ba, 0xe394bb, 0xe394bc, /* 34 */ 0xe394bd, 0xe394be, 0xe394bf, 0xe39580, /* 38 */ 0xe39581, 0xe39582, - /*** Four byte table, leaf: 823091xx - offset 0x0732b ***/ + /*** Four byte table, leaf: 823091xx - offset 0x0739f ***/ /* 30 */ 0xe39583, 0xe39584, 0xe39585, 0xe39586, /* 34 */ 0xe39587, 0xe39588, 0xe39589, 0xe3958a, /* 38 */ 0xe3958b, 0xe3958c, - /*** Four byte table, leaf: 823092xx - offset 0x07335 ***/ + /*** Four byte table, leaf: 823092xx - offset 0x073a9 ***/ /* 30 */ 0xe3958d, 0xe3958e, 0xe3958f, 0xe39590, /* 34 */ 0xe39591, 0xe39592, 0xe39593, 0xe39594, /* 38 */ 0xe39595, 0xe39596, - /*** Four byte table, leaf: 823093xx - offset 0x0733f ***/ + /*** Four byte table, leaf: 823093xx - offset 0x073b3 ***/ /* 30 */ 0xe39597, 0xe39598, 0xe39599, 0xe3959a, /* 34 */ 0xe3959b, 0xe3959c, 0xe3959d, 0xe3959e, /* 38 */ 0xe3959f, 0xe395a0, - /*** Four byte table, leaf: 823094xx - offset 0x07349 ***/ + /*** Four byte table, leaf: 823094xx - offset 0x073bd ***/ /* 30 */ 0xe395a1, 0xe395a2, 0xe395a3, 0xe395a4, /* 34 */ 0xe395a5, 0xe395a6, 0xe395a7, 0xe395a8, /* 38 */ 0xe395a9, 0xe395aa, - /*** Four byte table, leaf: 823095xx - offset 0x07353 ***/ + /*** Four byte table, leaf: 823095xx - offset 0x073c7 ***/ /* 30 */ 0xe395ab, 0xe395ac, 0xe395ad, 0xe395ae, /* 34 */ 0xe395af, 0xe395b0, 0xe395b1, 0xe395b2, /* 38 */ 0xe395b3, 0xe395b4, - /*** Four byte table, leaf: 823096xx - offset 0x0735d ***/ + /*** Four byte table, leaf: 823096xx - offset 0x073d1 ***/ /* 30 */ 0xe395b5, 0xe395b6, 0xe395b7, 0xe395b8, /* 34 */ 0xe395b9, 0xe395ba, 0xe395bb, 0xe395bc, /* 38 */ 0xe395bd, 0xe395be, - /*** Four byte table, leaf: 823097xx - offset 0x07367 ***/ + /*** Four byte table, leaf: 823097xx - offset 0x073db ***/ /* 30 */ 0xe395bf, 0xe39680, 0xe39681, 0xe39682, /* 34 */ 0xe39683, 0xe39684, 0xe39685, 0xe39686, /* 38 */ 0xe39687, 0xe39688, - /*** Four byte table, leaf: 823098xx - offset 0x07371 ***/ + /*** Four byte table, leaf: 823098xx - offset 0x073e5 ***/ /* 30 */ 0xe39689, 0xe3968a, 0xe3968b, 0xe3968c, /* 34 */ 0xe3968d, 0xe3968e, 0xe3968f, 0xe39690, /* 38 */ 0xe39691, 0xe39692, - /*** Four byte table, leaf: 823099xx - offset 0x0737b ***/ + /*** Four byte table, leaf: 823099xx - offset 0x073ef ***/ /* 
30 */ 0xe39693, 0xe39694, 0xe39695, 0xe39696, /* 34 */ 0xe39697, 0xe39698, 0xe39699, 0xe3969a, /* 38 */ 0xe3969b, 0xe3969c, - /*** Four byte table, leaf: 82309axx - offset 0x07385 ***/ + /*** Four byte table, leaf: 82309axx - offset 0x073f9 ***/ /* 30 */ 0xe3969d, 0xe3969f, 0xe396a0, 0xe396a1, /* 34 */ 0xe396a2, 0xe396a3, 0xe396a4, 0xe396a5, /* 38 */ 0xe396a6, 0xe396a7, - /*** Four byte table, leaf: 82309bxx - offset 0x0738f ***/ + /*** Four byte table, leaf: 82309bxx - offset 0x07403 ***/ /* 30 */ 0xe396a8, 0xe396a9, 0xe396aa, 0xe396ab, /* 34 */ 0xe396ac, 0xe396ad, 0xe396ae, 0xe396af, /* 38 */ 0xe396b0, 0xe396b1, - /*** Four byte table, leaf: 82309cxx - offset 0x07399 ***/ + /*** Four byte table, leaf: 82309cxx - offset 0x0740d ***/ /* 30 */ 0xe396b2, 0xe396b3, 0xe396b4, 0xe396b5, /* 34 */ 0xe396b6, 0xe396b7, 0xe396b8, 0xe396b9, /* 38 */ 0xe396ba, 0xe396bb, - /*** Four byte table, leaf: 82309dxx - offset 0x073a3 ***/ + /*** Four byte table, leaf: 82309dxx - offset 0x07417 ***/ /* 30 */ 0xe396bc, 0xe396bd, 0xe396be, 0xe396bf, /* 34 */ 0xe39780, 0xe39781, 0xe39782, 0xe39783, /* 38 */ 0xe39784, 0xe39785, - /*** Four byte table, leaf: 82309exx - offset 0x073ad ***/ + /*** Four byte table, leaf: 82309exx - offset 0x07421 ***/ /* 30 */ 0xe39786, 0xe39787, 0xe39788, 0xe39789, /* 34 */ 0xe3978a, 0xe3978b, 0xe3978c, 0xe3978d, /* 38 */ 0xe3978e, 0xe3978f, - /*** Four byte table, leaf: 82309fxx - offset 0x073b7 ***/ + /*** Four byte table, leaf: 82309fxx - offset 0x0742b ***/ /* 30 */ 0xe39790, 0xe39791, 0xe39792, 0xe39793, /* 34 */ 0xe39794, 0xe39795, 0xe39796, 0xe39797, /* 38 */ 0xe39798, 0xe39799, - /*** Four byte table, leaf: 8230a0xx - offset 0x073c1 ***/ + /*** Four byte table, leaf: 8230a0xx - offset 0x07435 ***/ /* 30 */ 0xe3979a, 0xe3979b, 0xe3979c, 0xe3979d, /* 34 */ 0xe3979e, 0xe3979f, 0xe397a0, 0xe397a1, /* 38 */ 0xe397a2, 0xe397a3, - /*** Four byte table, leaf: 8230a1xx - offset 0x073cb ***/ + /*** Four byte table, leaf: 8230a1xx - offset 0x0743f ***/ /* 30 */ 0xe397a4, 0xe397a5, 0xe397a6, 0xe397a7, /* 34 */ 0xe397a8, 0xe397a9, 0xe397aa, 0xe397ab, /* 38 */ 0xe397ac, 0xe397ad, - /*** Four byte table, leaf: 8230a2xx - offset 0x073d5 ***/ + /*** Four byte table, leaf: 8230a2xx - offset 0x07449 ***/ /* 30 */ 0xe397ae, 0xe397af, 0xe397b0, 0xe397b1, /* 34 */ 0xe397b2, 0xe397b3, 0xe397b4, 0xe397b5, /* 38 */ 0xe397b6, 0xe397b7, - /*** Four byte table, leaf: 8230a3xx - offset 0x073df ***/ + /*** Four byte table, leaf: 8230a3xx - offset 0x07453 ***/ /* 30 */ 0xe397b8, 0xe397b9, 0xe397ba, 0xe397bb, /* 34 */ 0xe397bc, 0xe397bd, 0xe397be, 0xe397bf, /* 38 */ 0xe39880, 0xe39881, - /*** Four byte table, leaf: 8230a4xx - offset 0x073e9 ***/ + /*** Four byte table, leaf: 8230a4xx - offset 0x0745d ***/ /* 30 */ 0xe39882, 0xe39883, 0xe39884, 0xe39885, /* 34 */ 0xe39886, 0xe39887, 0xe39888, 0xe39889, /* 38 */ 0xe3988a, 0xe3988b, - /*** Four byte table, leaf: 8230a5xx - offset 0x073f3 ***/ + /*** Four byte table, leaf: 8230a5xx - offset 0x07467 ***/ /* 30 */ 0xe3988c, 0xe3988d, 0xe3988f, 0xe39890, /* 34 */ 0xe39891, 0xe39892, 0xe39893, 0xe39894, /* 38 */ 0xe39895, 0xe39896, - /*** Four byte table, leaf: 8230a6xx - offset 0x073fd ***/ + /*** Four byte table, leaf: 8230a6xx - offset 0x07471 ***/ /* 30 */ 0xe39897, 0xe39898, 0xe39899, /* 7 trailing zero values shared with next segment */ - /*** Four byte table, leaf: 8230f2xx - offset 0x07400 ***/ + /*** Four byte table, leaf: 8230f2xx - offset 0x07474 ***/ /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 34 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 38 
*/ 0xe3a499, 0xe3a49a, - /*** Four byte table, leaf: 8230f3xx - offset 0x0740a ***/ + /*** Four byte table, leaf: 8230f3xx - offset 0x0747e ***/ /* 30 */ 0xe3a49b, 0xe3a49c, 0xe3a49d, 0xe3a49e, /* 34 */ 0xe3a49f, 0xe3a4a0, 0xe3a4a1, 0xe3a4a2, /* 38 */ 0xe3a4a3, 0xe3a4a4, - /*** Four byte table, leaf: 8230f4xx - offset 0x07414 ***/ + /*** Four byte table, leaf: 8230f4xx - offset 0x07488 ***/ /* 30 */ 0xe3a4a5, 0xe3a4a6, 0xe3a4a7, 0xe3a4a8, /* 34 */ 0xe3a4a9, 0xe3a4aa, 0xe3a4ab, 0xe3a4ac, /* 38 */ 0xe3a4ad, 0xe3a4ae, - /*** Four byte table, leaf: 8230f5xx - offset 0x0741e ***/ + /*** Four byte table, leaf: 8230f5xx - offset 0x07492 ***/ /* 30 */ 0xe3a4af, 0xe3a4b0, 0xe3a4b1, 0xe3a4b2, /* 34 */ 0xe3a4b3, 0xe3a4b4, 0xe3a4b5, 0xe3a4b6, /* 38 */ 0xe3a4b7, 0xe3a4b8, - /*** Four byte table, leaf: 8230f6xx - offset 0x07428 ***/ + /*** Four byte table, leaf: 8230f6xx - offset 0x0749c ***/ /* 30 */ 0xe3a4b9, 0xe3a4ba, 0xe3a4bb, 0xe3a4bc, /* 34 */ 0xe3a4bd, 0xe3a4be, 0xe3a4bf, 0xe3a580, /* 38 */ 0xe3a581, 0xe3a582, - /*** Four byte table, leaf: 8230f7xx - offset 0x07432 ***/ + /*** Four byte table, leaf: 8230f7xx - offset 0x074a6 ***/ /* 30 */ 0xe3a583, 0xe3a584, 0xe3a585, 0xe3a586, /* 34 */ 0xe3a587, 0xe3a588, 0xe3a589, 0xe3a58a, /* 38 */ 0xe3a58b, 0xe3a58c, - /*** Four byte table, leaf: 8230f8xx - offset 0x0743c ***/ + /*** Four byte table, leaf: 8230f8xx - offset 0x074b0 ***/ /* 30 */ 0xe3a58d, 0xe3a58e, 0xe3a58f, 0xe3a590, /* 34 */ 0xe3a591, 0xe3a592, 0xe3a593, 0xe3a594, /* 38 */ 0xe3a595, 0xe3a596, - /*** Four byte table, leaf: 8230f9xx - offset 0x07446 ***/ + /*** Four byte table, leaf: 8230f9xx - offset 0x074ba ***/ /* 30 */ 0xe3a597, 0xe3a598, 0xe3a599, 0xe3a59a, /* 34 */ 0xe3a59b, 0xe3a59c, 0xe3a59d, 0xe3a59e, /* 38 */ 0xe3a59f, 0xe3a5a0, - /*** Four byte table, leaf: 8230faxx - offset 0x07450 ***/ + /*** Four byte table, leaf: 8230faxx - offset 0x074c4 ***/ /* 30 */ 0xe3a5a1, 0xe3a5a2, 0xe3a5a3, 0xe3a5a4, /* 34 */ 0xe3a5a5, 0xe3a5a6, 0xe3a5a7, 0xe3a5a8, /* 38 */ 0xe3a5a9, 0xe3a5aa, - /*** Four byte table, leaf: 8230fbxx - offset 0x0745a ***/ + /*** Four byte table, leaf: 8230fbxx - offset 0x074ce ***/ /* 30 */ 0xe3a5ab, 0xe3a5ac, 0xe3a5ad, 0xe3a5af, /* 34 */ 0xe3a5b0, 0xe3a5b1, 0xe3a5b2, 0xe3a5b3, /* 38 */ 0xe3a5b4, 0xe3a5b5, - /*** Four byte table, leaf: 8230fcxx - offset 0x07464 ***/ + /*** Four byte table, leaf: 8230fcxx - offset 0x074d8 ***/ /* 30 */ 0xe3a5b6, 0xe3a5b7, 0xe3a5b8, 0xe3a5b9, /* 34 */ 0xe3a5ba, 0xe3a5bb, 0xe3a5bc, 0xe3a5bd, /* 38 */ 0xe3a5be, 0xe3a5bf, - /*** Four byte table, leaf: 8230fdxx - offset 0x0746e ***/ + /*** Four byte table, leaf: 8230fdxx - offset 0x074e2 ***/ /* 30 */ 0xe3a680, 0xe3a681, 0xe3a682, 0xe3a683, /* 34 */ 0xe3a684, 0xe3a685, 0xe3a686, 0xe3a687, /* 38 */ 0xe3a688, 0xe3a689, - /*** Four byte table, leaf: 8230fexx - offset 0x07478 ***/ + /*** Four byte table, leaf: 8230fexx - offset 0x074ec ***/ /* 30 */ 0xe3a68a, 0xe3a68b, 0xe3a68c, 0xe3a68d, /* 34 */ 0xe3a68e, 0xe3a68f, 0xe3a690, 0xe3a691, /* 38 */ 0xe3a692, 0xe3a693, - /*** Four byte table, leaf: 823181xx - offset 0x07482 ***/ + /*** Four byte table, leaf: 823181xx - offset 0x074f6 ***/ /* 30 */ 0xe3a694, 0xe3a695, 0xe3a696, 0xe3a697, /* 34 */ 0xe3a698, 0xe3a699, 0xe3a69a, 0xe3a69b, /* 38 */ 0xe3a69c, 0xe3a69d, - /*** Four byte table, leaf: 823182xx - offset 0x0748c ***/ + /*** Four byte table, leaf: 823182xx - offset 0x07500 ***/ /* 30 */ 0xe3a69e, 0xe3a69f, 0xe3a6a0, 0xe3a6a1, /* 34 */ 0xe3a6a2, 0xe3a6a3, 0xe3a6a4, 0xe3a6a5, /* 38 */ 0xe3a6a6, 0xe3a6a7, - /*** Four byte table, leaf: 823183xx - 
offset 0x07496 ***/ + /*** Four byte table, leaf: 823183xx - offset 0x0750a ***/ /* 30 */ 0xe3a6a8, 0xe3a6a9, 0xe3a6aa, 0xe3a6ab, /* 34 */ 0xe3a6ac, 0xe3a6ad, 0xe3a6ae, 0xe3a6af, /* 38 */ 0xe3a6b0, 0xe3a6b1, - /*** Four byte table, leaf: 823184xx - offset 0x074a0 ***/ + /*** Four byte table, leaf: 823184xx - offset 0x07514 ***/ /* 30 */ 0xe3a6b2, 0xe3a6b3, 0xe3a6b4, 0xe3a6b5, /* 34 */ 0xe3a6b6, 0xe3a6b7, 0xe3a6b8, 0xe3a6b9, /* 38 */ 0xe3a6ba, 0xe3a6bb, - /*** Four byte table, leaf: 823185xx - offset 0x074aa ***/ + /*** Four byte table, leaf: 823185xx - offset 0x0751e ***/ /* 30 */ 0xe3a6bc, 0xe3a6bd, 0xe3a6be, 0xe3a6bf, /* 34 */ 0xe3a780, 0xe3a781, 0xe3a782, 0xe3a783, /* 38 */ 0xe3a784, 0xe3a785, - /*** Four byte table, leaf: 823186xx - offset 0x074b4 ***/ + /*** Four byte table, leaf: 823186xx - offset 0x07528 ***/ /* 30 */ 0xe3a786, 0xe3a787, 0xe3a788, 0xe3a789, /* 34 */ 0xe3a78a, 0xe3a78b, 0xe3a78c, 0xe3a78d, /* 38 */ 0xe3a78e, 0xe3a791, - /*** Four byte table, leaf: 823187xx - offset 0x074be ***/ + /*** Four byte table, leaf: 823187xx - offset 0x07532 ***/ /* 30 */ 0xe3a792, 0xe3a793, 0xe3a794, 0xe3a795, /* 34 */ 0xe3a796, 0xe3a797, 0xe3a798, 0xe3a799, /* 38 */ 0xe3a79a, 0xe3a79b, - /*** Four byte table, leaf: 823188xx - offset 0x074c8 ***/ + /*** Four byte table, leaf: 823188xx - offset 0x0753c ***/ /* 30 */ 0xe3a79c, 0xe3a79d, 0xe3a79e, 0xe3a7a0, /* 34 */ 0xe3a7a1, 0xe3a7a2, 0xe3a7a3, 0xe3a7a4, /* 38 */ 0xe3a7a5, 0xe3a7a6, - /*** Four byte table, leaf: 823189xx - offset 0x074d2 ***/ + /*** Four byte table, leaf: 823189xx - offset 0x07546 ***/ /* 30 */ 0xe3a7a7, 0xe3a7a8, 0xe3a7a9, 0xe3a7aa, /* 34 */ 0xe3a7ab, 0xe3a7ac, 0xe3a7ad, 0xe3a7ae, /* 38 */ 0xe3a7af, 0xe3a7b0, - /*** Four byte table, leaf: 82318axx - offset 0x074dc ***/ + /*** Four byte table, leaf: 82318axx - offset 0x07550 ***/ /* 30 */ 0xe3a7b1, 0xe3a7b2, 0xe3a7b3, 0xe3a7b4, /* 34 */ 0xe3a7b5, 0xe3a7b6, 0xe3a7b7, 0xe3a7b8, /* 38 */ 0xe3a7b9, 0xe3a7ba, - /*** Four byte table, leaf: 82318bxx - offset 0x074e6 ***/ + /*** Four byte table, leaf: 82318bxx - offset 0x0755a ***/ /* 30 */ 0xe3a7bb, 0xe3a7bc, 0xe3a7bd, 0xe3a7be, /* 34 */ 0xe3a7bf, 0xe3a880, 0xe3a881, 0xe3a882, /* 38 */ 0xe3a883, 0xe3a884, - /*** Four byte table, leaf: 82318cxx - offset 0x074f0 ***/ + /*** Four byte table, leaf: 82318cxx - offset 0x07564 ***/ /* 30 */ 0xe3a885, 0xe3a886, 0xe3a887, 0xe3a888, /* 34 */ 0xe3a889, 0xe3a88a, 0xe3a88b, 0xe3a88c, /* 38 */ 0xe3a88d, 0xe3a88e, - /*** Four byte table, leaf: 82318dxx - offset 0x074fa ***/ + /*** Four byte table, leaf: 82318dxx - offset 0x0756e ***/ /* 30 */ 0xe3a88f, 0xe3a890, 0xe3a891, 0xe3a892, /* 34 */ 0xe3a893, 0xe3a894, 0xe3a895, 0xe3a896, /* 38 */ 0xe3a897, 0xe3a898, - /*** Four byte table, leaf: 82318exx - offset 0x07504 ***/ + /*** Four byte table, leaf: 82318exx - offset 0x07578 ***/ /* 30 */ 0xe3a899, 0xe3a89a, 0xe3a89b, 0xe3a89c, /* 34 */ 0xe3a89d, 0xe3a89e, 0xe3a89f, 0xe3a8a0, /* 38 */ 0xe3a8a1, 0xe3a8a2, - /*** Four byte table, leaf: 82318fxx - offset 0x0750e ***/ + /*** Four byte table, leaf: 82318fxx - offset 0x07582 ***/ /* 30 */ 0xe3a8a3, 0xe3a8a4, 0xe3a8a5, 0xe3a8a6, /* 34 */ 0xe3a8a7, 0xe3a8a8, 0xe3a8a9, 0xe3a8aa, /* 38 */ 0xe3a8ab, 0xe3a8ac, - /*** Four byte table, leaf: 823190xx - offset 0x07518 ***/ + /*** Four byte table, leaf: 823190xx - offset 0x0758c ***/ /* 30 */ 0xe3a8ad, 0xe3a8ae, 0xe3a8af, 0xe3a8b0, /* 34 */ 0xe3a8b1, 0xe3a8b2, 0xe3a8b3, 0xe3a8b4, /* 38 */ 0xe3a8b5, 0xe3a8b6, - /*** Four byte table, leaf: 823191xx - offset 0x07522 ***/ + /*** Four byte table, leaf: 823191xx - offset 
0x07596 ***/ /* 30 */ 0xe3a8b7, 0xe3a8b8, 0xe3a8b9, 0xe3a8ba, /* 34 */ 0xe3a8bb, 0xe3a8bc, 0xe3a8bd, 0xe3a8be, /* 38 */ 0xe3a8bf, 0xe3a980, - /*** Four byte table, leaf: 823192xx - offset 0x0752c ***/ + /*** Four byte table, leaf: 823192xx - offset 0x075a0 ***/ /* 30 */ 0xe3a981, 0xe3a982, 0xe3a983, 0xe3a984, /* 34 */ 0xe3a985, 0xe3a986, 0xe3a987, 0xe3a988, /* 38 */ 0xe3a989, 0xe3a98a, - /*** Four byte table, leaf: 823193xx - offset 0x07536 ***/ + /*** Four byte table, leaf: 823193xx - offset 0x075aa ***/ /* 30 */ 0xe3a98b, 0xe3a98c, 0xe3a98d, 0xe3a98e, /* 34 */ 0xe3a98f, 0xe3a990, 0xe3a991, 0xe3a992, /* 38 */ 0xe3a993, 0xe3a994, - /*** Four byte table, leaf: 823194xx - offset 0x07540 ***/ + /*** Four byte table, leaf: 823194xx - offset 0x075b4 ***/ /* 30 */ 0xe3a995, 0xe3a996, 0xe3a997, 0xe3a998, /* 34 */ 0xe3a999, 0xe3a99a, 0xe3a99b, 0xe3a99c, /* 38 */ 0xe3a99d, 0xe3a99e, - /*** Four byte table, leaf: 823195xx - offset 0x0754a ***/ + /*** Four byte table, leaf: 823195xx - offset 0x075be ***/ /* 30 */ 0xe3a99f, 0xe3a9a0, 0xe3a9a1, 0xe3a9a2, /* 34 */ 0xe3a9a3, 0xe3a9a4, 0xe3a9a5, 0xe3a9a6, /* 38 */ 0xe3a9a7, 0xe3a9a8, - /*** Four byte table, leaf: 823196xx - offset 0x07554 ***/ + /*** Four byte table, leaf: 823196xx - offset 0x075c8 ***/ /* 30 */ 0xe3a9a9, 0xe3a9aa, 0xe3a9ab, 0xe3a9ac, /* 34 */ 0xe3a9ad, 0xe3a9ae, 0xe3a9af, 0xe3a9b0, /* 38 */ 0xe3a9b1, 0xe3a9b2, - /*** Four byte table, leaf: 823197xx - offset 0x0755e ***/ + /*** Four byte table, leaf: 823197xx - offset 0x075d2 ***/ /* 30 */ 0xe3a9b4, 0xe3a9b5, 0xe3a9b6, 0xe3a9b7, /* 34 */ 0xe3a9b8, 0xe3a9b9, 0xe3a9ba, 0xe3a9bb, /* 38 */ 0xe3a9bc, 0xe3a9bd, - /*** Four byte table, leaf: 823198xx - offset 0x07568 ***/ + /*** Four byte table, leaf: 823198xx - offset 0x075dc ***/ /* 30 */ 0xe3a9be, 0xe3a9bf, 0xe3aa80, 0xe3aa81, /* 34 */ 0xe3aa82, 0xe3aa83, 0xe3aa84, 0xe3aa85, /* 38 */ 0xe3aa86, 0xe3aa87, - /*** Four byte table, leaf: 823199xx - offset 0x07572 ***/ + /*** Four byte table, leaf: 823199xx - offset 0x075e6 ***/ /* 30 */ 0xe3aa88, 0xe3aa89, 0xe3aa8a, 0xe3aa8b, /* 34 */ 0xe3aa8c, 0xe3aa8d, 0xe3aa8e, 0xe3aa8f, /* 38 */ 0xe3aa90, 0xe3aa91, - /*** Four byte table, leaf: 82319axx - offset 0x0757c ***/ + /*** Four byte table, leaf: 82319axx - offset 0x075f0 ***/ /* 30 */ 0xe3aa92, 0xe3aa93, 0xe3aa94, 0xe3aa95, /* 34 */ 0xe3aa96, 0xe3aa97, 0xe3aa98, 0xe3aa99, /* 38 */ 0xe3aa9a, 0xe3aa9b, - /*** Four byte table, leaf: 82319bxx - offset 0x07586 ***/ + /*** Four byte table, leaf: 82319bxx - offset 0x075fa ***/ /* 30 */ 0xe3aa9c, 0xe3aa9d, 0xe3aa9e, 0xe3aa9f, /* 34 */ 0xe3aaa0, 0xe3aaa1, 0xe3aaa2, 0xe3aaa3, /* 38 */ 0xe3aaa4, 0xe3aaa5, - /*** Four byte table, leaf: 82319cxx - offset 0x07590 ***/ + /*** Four byte table, leaf: 82319cxx - offset 0x07604 ***/ /* 30 */ 0xe3aaa6, 0xe3aaa7, 0xe3aaa8, 0xe3aaa9, /* 34 */ 0xe3aaaa, 0xe3aaab, 0xe3aaac, 0xe3aaad, /* 38 */ 0xe3aaae, 0xe3aaaf, - /*** Four byte table, leaf: 82319dxx - offset 0x0759a ***/ + /*** Four byte table, leaf: 82319dxx - offset 0x0760e ***/ /* 30 */ 0xe3aab0, 0xe3aab1, 0xe3aab2, 0xe3aab3, /* 34 */ 0xe3aab4, 0xe3aab5, 0xe3aab6, 0xe3aab7, /* 38 */ 0xe3aab8, 0xe3aab9, - /*** Four byte table, leaf: 82319exx - offset 0x075a4 ***/ + /*** Four byte table, leaf: 82319exx - offset 0x07618 ***/ /* 30 */ 0xe3aaba, 0xe3aabb, 0xe3aabc, 0xe3aabd, /* 34 */ 0xe3aabe, 0xe3aabf, 0xe3ab80, 0xe3ab81, /* 38 */ 0xe3ab82, 0xe3ab83, - /*** Four byte table, leaf: 82319fxx - offset 0x075ae ***/ + /*** Four byte table, leaf: 82319fxx - offset 0x07622 ***/ /* 30 */ 0xe3ab84, 0xe3ab85, 0xe3ab86, 0xe3ab87, /* 34 
*/ 0xe3ab88, 0xe3ab89, 0xe3ab8a, 0xe3ab8b, /* 38 */ 0xe3ab8c, 0xe3ab8d, - /*** Four byte table, leaf: 8231a0xx - offset 0x075b8 ***/ + /*** Four byte table, leaf: 8231a0xx - offset 0x0762c ***/ /* 30 */ 0xe3ab8e, 0xe3ab8f, 0xe3ab90, 0xe3ab91, /* 34 */ 0xe3ab92, 0xe3ab93, 0xe3ab94, 0xe3ab95, /* 38 */ 0xe3ab96, 0xe3ab97, - /*** Four byte table, leaf: 8231a1xx - offset 0x075c2 ***/ + /*** Four byte table, leaf: 8231a1xx - offset 0x07636 ***/ /* 30 */ 0xe3ab98, 0xe3ab99, 0xe3ab9a, 0xe3ab9b, /* 34 */ 0xe3ab9c, 0xe3ab9d, 0xe3ab9e, 0xe3ab9f, /* 38 */ 0xe3aba0, 0xe3aba1, - /*** Four byte table, leaf: 8231a2xx - offset 0x075cc ***/ + /*** Four byte table, leaf: 8231a2xx - offset 0x07640 ***/ /* 30 */ 0xe3aba2, 0xe3aba3, 0xe3aba4, 0xe3aba5, /* 34 */ 0xe3aba6, 0xe3aba7, 0xe3aba8, 0xe3aba9, /* 38 */ 0xe3abaa, 0xe3abab, - /*** Four byte table, leaf: 8231a3xx - offset 0x075d6 ***/ + /*** Four byte table, leaf: 8231a3xx - offset 0x0764a ***/ /* 30 */ 0xe3abac, 0xe3abad, 0xe3abae, 0xe3abaf, /* 34 */ 0xe3abb0, 0xe3abb1, 0xe3abb2, 0xe3abb3, /* 38 */ 0xe3abb4, 0xe3abb5, - /*** Four byte table, leaf: 8231a4xx - offset 0x075e0 ***/ + /*** Four byte table, leaf: 8231a4xx - offset 0x07654 ***/ /* 30 */ 0xe3abb6, 0xe3abb7, 0xe3abb8, 0xe3abb9, /* 34 */ 0xe3abba, 0xe3abbb, 0xe3abbc, 0xe3abbd, /* 38 */ 0xe3abbe, 0xe3abbf, - /*** Four byte table, leaf: 8231a5xx - offset 0x075ea ***/ + /*** Four byte table, leaf: 8231a5xx - offset 0x0765e ***/ /* 30 */ 0xe3ac80, 0xe3ac81, 0xe3ac82, 0xe3ac83, /* 34 */ 0xe3ac84, 0xe3ac85, 0xe3ac86, 0xe3ac87, /* 38 */ 0xe3ac88, 0xe3ac89, - /*** Four byte table, leaf: 8231a6xx - offset 0x075f4 ***/ + /*** Four byte table, leaf: 8231a6xx - offset 0x07668 ***/ /* 30 */ 0xe3ac8a, 0xe3ac8b, 0xe3ac8c, 0xe3ac8d, /* 34 */ 0xe3ac8e, 0xe3ac8f, 0xe3ac90, 0xe3ac91, /* 38 */ 0xe3ac92, 0xe3ac93, - /*** Four byte table, leaf: 8231a7xx - offset 0x075fe ***/ + /*** Four byte table, leaf: 8231a7xx - offset 0x07672 ***/ /* 30 */ 0xe3ac94, 0xe3ac95, 0xe3ac96, 0xe3ac97, /* 34 */ 0xe3ac98, 0xe3ac99, 0xe3ac9a, 0xe3ac9b, /* 38 */ 0xe3ac9c, 0xe3ac9d, - /*** Four byte table, leaf: 8231a8xx - offset 0x07608 ***/ + /*** Four byte table, leaf: 8231a8xx - offset 0x0767c ***/ /* 30 */ 0xe3ac9e, 0xe3ac9f, 0xe3aca0, 0xe3aca1, /* 34 */ 0xe3aca2, 0xe3aca3, 0xe3aca4, 0xe3aca5, /* 38 */ 0xe3aca6, 0xe3aca7, - /*** Four byte table, leaf: 8231a9xx - offset 0x07612 ***/ + /*** Four byte table, leaf: 8231a9xx - offset 0x07686 ***/ /* 30 */ 0xe3aca8, 0xe3aca9, 0xe3acaa, 0xe3acab, /* 34 */ 0xe3acac, 0xe3acad, 0xe3acae, 0xe3acaf, /* 38 */ 0xe3acb0, 0xe3acb1, - /*** Four byte table, leaf: 8231aaxx - offset 0x0761c ***/ + /*** Four byte table, leaf: 8231aaxx - offset 0x07690 ***/ /* 30 */ 0xe3acb2, 0xe3acb3, 0xe3acb4, 0xe3acb5, /* 34 */ 0xe3acb6, 0xe3acb7, 0xe3acb8, 0xe3acb9, /* 38 */ 0xe3acba, 0xe3acbb, - /*** Four byte table, leaf: 8231abxx - offset 0x07626 ***/ + /*** Four byte table, leaf: 8231abxx - offset 0x0769a ***/ /* 30 */ 0xe3acbc, 0xe3acbd, 0xe3acbe, 0xe3acbf, /* 34 */ 0xe3ad80, 0xe3ad81, 0xe3ad82, 0xe3ad83, /* 38 */ 0xe3ad84, 0xe3ad85, - /*** Four byte table, leaf: 8231acxx - offset 0x07630 ***/ + /*** Four byte table, leaf: 8231acxx - offset 0x076a4 ***/ /* 30 */ 0xe3ad86, 0xe3ad87, 0xe3ad88, 0xe3ad89, /* 34 */ 0xe3ad8a, 0xe3ad8b, 0xe3ad8c, 0xe3ad8d, /* 38 */ 0xe3ad8f, 0xe3ad90, - /*** Four byte table, leaf: 8231adxx - offset 0x0763a ***/ + /*** Four byte table, leaf: 8231adxx - offset 0x076ae ***/ /* 30 */ 0xe3ad91, 0xe3ad92, 0xe3ad93, 0xe3ad94, /* 34 */ 0xe3ad95, 0xe3ad96, 0xe3ad97, 0xe3ad98, /* 38 */ 0xe3ad99, 
0xe3ad9a, - /*** Four byte table, leaf: 8231aexx - offset 0x07644 ***/ + /*** Four byte table, leaf: 8231aexx - offset 0x076b8 ***/ /* 30 */ 0xe3ad9b, 0xe3ad9c, 0xe3ad9d, 0xe3ad9e, /* 34 */ 0xe3ad9f, 0xe3ada0, 0xe3ada1, 0xe3ada2, /* 38 */ 0xe3ada3, 0xe3ada4, - /*** Four byte table, leaf: 8231afxx - offset 0x0764e ***/ + /*** Four byte table, leaf: 8231afxx - offset 0x076c2 ***/ /* 30 */ 0xe3ada5, 0xe3ada6, 0xe3ada7, 0xe3ada8, /* 34 */ 0xe3ada9, 0xe3adaa, 0xe3adab, 0xe3adac, /* 38 */ 0xe3adad, 0xe3adae, - /*** Four byte table, leaf: 8231b0xx - offset 0x07658 ***/ + /*** Four byte table, leaf: 8231b0xx - offset 0x076cc ***/ /* 30 */ 0xe3adaf, 0xe3adb0, 0xe3adb1, 0xe3adb2, /* 34 */ 0xe3adb3, 0xe3adb4, 0xe3adb5, 0xe3adb6, /* 38 */ 0xe3adb7, 0xe3adb8, - /*** Four byte table, leaf: 8231b1xx - offset 0x07662 ***/ + /*** Four byte table, leaf: 8231b1xx - offset 0x076d6 ***/ /* 30 */ 0xe3adb9, 0xe3adba, 0xe3adbb, 0xe3adbc, /* 34 */ 0xe3adbd, 0xe3adbe, 0xe3adbf, 0xe3ae80, /* 38 */ 0xe3ae81, 0xe3ae82, - /*** Four byte table, leaf: 8231b2xx - offset 0x0766c ***/ + /*** Four byte table, leaf: 8231b2xx - offset 0x076e0 ***/ /* 30 */ 0xe3ae83, 0xe3ae84, 0xe3ae85, 0xe3ae86, /* 34 */ 0xe3ae87, 0xe3ae88, 0xe3ae89, 0xe3ae8a, /* 38 */ 0xe3ae8b, 0xe3ae8c, - /*** Four byte table, leaf: 8231b3xx - offset 0x07676 ***/ + /*** Four byte table, leaf: 8231b3xx - offset 0x076ea ***/ /* 30 */ 0xe3ae8d, 0xe3ae8e, 0xe3ae8f, 0xe3ae90, /* 34 */ 0xe3ae91, 0xe3ae92, 0xe3ae93, 0xe3ae94, /* 38 */ 0xe3ae95, 0xe3ae96, - /*** Four byte table, leaf: 8231b4xx - offset 0x07680 ***/ + /*** Four byte table, leaf: 8231b4xx - offset 0x076f4 ***/ /* 30 */ 0xe3ae97, 0xe3ae98, 0xe3ae99, 0xe3ae9a, /* 34 */ 0xe3ae9b, 0xe3ae9c, 0xe3ae9d, 0xe3ae9e, /* 38 */ 0xe3ae9f, 0xe3aea0, - /*** Four byte table, leaf: 8231b5xx - offset 0x0768a ***/ + /*** Four byte table, leaf: 8231b5xx - offset 0x076fe ***/ /* 30 */ 0xe3aea1, 0xe3aea2, 0xe3aea3, 0xe3aea4, /* 34 */ 0xe3aea5, 0xe3aea6, 0xe3aea7, 0xe3aea8, /* 38 */ 0xe3aea9, 0xe3aeaa, - /*** Four byte table, leaf: 8231b6xx - offset 0x07694 ***/ + /*** Four byte table, leaf: 8231b6xx - offset 0x07708 ***/ /* 30 */ 0xe3aeab, 0xe3aeac, 0xe3aead, 0xe3aeae, /* 34 */ 0xe3aeaf, 0xe3aeb0, 0xe3aeb1, 0xe3aeb2, /* 38 */ 0xe3aeb3, 0xe3aeb4, - /*** Four byte table, leaf: 8231b7xx - offset 0x0769e ***/ + /*** Four byte table, leaf: 8231b7xx - offset 0x07712 ***/ /* 30 */ 0xe3aeb5, 0xe3aeb6, 0xe3aeb7, 0xe3aeb8, /* 34 */ 0xe3aeb9, 0xe3aeba, 0xe3aebb, 0xe3aebc, /* 38 */ 0xe3aebd, 0xe3aebe, - /*** Four byte table, leaf: 8231b8xx - offset 0x076a8 ***/ + /*** Four byte table, leaf: 8231b8xx - offset 0x0771c ***/ /* 30 */ 0xe3aebf, 0xe3af80, 0xe3af81, 0xe3af82, /* 34 */ 0xe3af83, 0xe3af84, 0xe3af85, 0xe3af86, /* 38 */ 0xe3af87, 0xe3af88, - /*** Four byte table, leaf: 8231b9xx - offset 0x076b2 ***/ + /*** Four byte table, leaf: 8231b9xx - offset 0x07726 ***/ /* 30 */ 0xe3af89, 0xe3af8a, 0xe3af8b, 0xe3af8c, /* 34 */ 0xe3af8d, 0xe3af8e, 0xe3af8f, 0xe3af90, /* 38 */ 0xe3af91, 0xe3af92, - /*** Four byte table, leaf: 8231baxx - offset 0x076bc ***/ + /*** Four byte table, leaf: 8231baxx - offset 0x07730 ***/ /* 30 */ 0xe3af93, 0xe3af94, 0xe3af95, 0xe3af96, /* 34 */ 0xe3af97, 0xe3af98, 0xe3af99, 0xe3af9a, /* 38 */ 0xe3af9b, 0xe3af9c, - /*** Four byte table, leaf: 8231bbxx - offset 0x076c6 ***/ + /*** Four byte table, leaf: 8231bbxx - offset 0x0773a ***/ /* 30 */ 0xe3af9d, 0xe3af9e, 0xe3af9f, 0xe3afa0, /* 34 */ 0xe3afa1, 0xe3afa2, 0xe3afa3, 0xe3afa4, /* 38 */ 0xe3afa5, 0xe3afa6, - /*** Four byte table, leaf: 8231bcxx - offset 0x076d0 
***/ + /*** Four byte table, leaf: 8231bcxx - offset 0x07744 ***/ /* 30 */ 0xe3afa7, 0xe3afa8, 0xe3afa9, 0xe3afaa, /* 34 */ 0xe3afab, 0xe3afac, 0xe3afad, 0xe3afae, /* 38 */ 0xe3afaf, 0xe3afb0, - /*** Four byte table, leaf: 8231bdxx - offset 0x076da ***/ + /*** Four byte table, leaf: 8231bdxx - offset 0x0774e ***/ /* 30 */ 0xe3afb1, 0xe3afb2, 0xe3afb3, 0xe3afb4, /* 34 */ 0xe3afb5, 0xe3afb6, 0xe3afb7, 0xe3afb8, /* 38 */ 0xe3afb9, 0xe3afba, - /*** Four byte table, leaf: 8231bexx - offset 0x076e4 ***/ + /*** Four byte table, leaf: 8231bexx - offset 0x07758 ***/ /* 30 */ 0xe3afbb, 0xe3afbc, 0xe3afbd, 0xe3afbe, /* 34 */ 0xe3afbf, 0xe3b080, 0xe3b081, 0xe3b082, /* 38 */ 0xe3b083, 0xe3b084, - /*** Four byte table, leaf: 8231bfxx - offset 0x076ee ***/ + /*** Four byte table, leaf: 8231bfxx - offset 0x07762 ***/ /* 30 */ 0xe3b085, 0xe3b086, 0xe3b087, 0xe3b088, /* 34 */ 0xe3b089, 0xe3b08a, 0xe3b08b, 0xe3b08c, /* 38 */ 0xe3b08d, 0xe3b08e, - /*** Four byte table, leaf: 8231c0xx - offset 0x076f8 ***/ + /*** Four byte table, leaf: 8231c0xx - offset 0x0776c ***/ /* 30 */ 0xe3b08f, 0xe3b090, 0xe3b091, 0xe3b092, /* 34 */ 0xe3b093, 0xe3b094, 0xe3b095, 0xe3b096, /* 38 */ 0xe3b097, 0xe3b098, - /*** Four byte table, leaf: 8231c1xx - offset 0x07702 ***/ + /*** Four byte table, leaf: 8231c1xx - offset 0x07776 ***/ /* 30 */ 0xe3b099, 0xe3b09a, 0xe3b09b, 0xe3b09c, /* 34 */ 0xe3b09d, 0xe3b09e, 0xe3b09f, 0xe3b0a0, /* 38 */ 0xe3b0a1, 0xe3b0a2, - /*** Four byte table, leaf: 8231c2xx - offset 0x0770c ***/ + /*** Four byte table, leaf: 8231c2xx - offset 0x07780 ***/ /* 30 */ 0xe3b0a3, 0xe3b0a4, 0xe3b0a5, 0xe3b0a6, /* 34 */ 0xe3b0a7, 0xe3b0a8, 0xe3b0a9, 0xe3b0aa, /* 38 */ 0xe3b0ab, 0xe3b0ac, - /*** Four byte table, leaf: 8231c3xx - offset 0x07716 ***/ + /*** Four byte table, leaf: 8231c3xx - offset 0x0778a ***/ /* 30 */ 0xe3b0ad, 0xe3b0ae, 0xe3b0af, 0xe3b0b0, /* 34 */ 0xe3b0b1, 0xe3b0b2, 0xe3b0b3, 0xe3b0b4, /* 38 */ 0xe3b0b5, 0xe3b0b6, - /*** Four byte table, leaf: 8231c4xx - offset 0x07720 ***/ + /*** Four byte table, leaf: 8231c4xx - offset 0x07794 ***/ /* 30 */ 0xe3b0b7, 0xe3b0b8, 0xe3b0b9, 0xe3b0ba, /* 34 */ 0xe3b0bb, 0xe3b0bc, 0xe3b0bd, 0xe3b0be, /* 38 */ 0xe3b0bf, 0xe3b180, - /*** Four byte table, leaf: 8231c5xx - offset 0x0772a ***/ + /*** Four byte table, leaf: 8231c5xx - offset 0x0779e ***/ /* 30 */ 0xe3b181, 0xe3b182, 0xe3b183, 0xe3b184, /* 34 */ 0xe3b185, 0xe3b186, 0xe3b187, 0xe3b188, /* 38 */ 0xe3b189, 0xe3b18a, - /*** Four byte table, leaf: 8231c6xx - offset 0x07734 ***/ + /*** Four byte table, leaf: 8231c6xx - offset 0x077a8 ***/ /* 30 */ 0xe3b18b, 0xe3b18c, 0xe3b18d, 0xe3b18e, /* 34 */ 0xe3b18f, 0xe3b190, 0xe3b191, 0xe3b192, /* 38 */ 0xe3b193, 0xe3b194, - /*** Four byte table, leaf: 8231c7xx - offset 0x0773e ***/ + /*** Four byte table, leaf: 8231c7xx - offset 0x077b2 ***/ /* 30 */ 0xe3b195, 0xe3b196, 0xe3b197, 0xe3b198, /* 34 */ 0xe3b199, 0xe3b19a, 0xe3b19b, 0xe3b19c, /* 38 */ 0xe3b19d, 0xe3b19e, - /*** Four byte table, leaf: 8231c8xx - offset 0x07748 ***/ + /*** Four byte table, leaf: 8231c8xx - offset 0x077bc ***/ /* 30 */ 0xe3b19f, 0xe3b1a0, 0xe3b1a1, 0xe3b1a2, /* 34 */ 0xe3b1a3, 0xe3b1a4, 0xe3b1a5, 0xe3b1a6, /* 38 */ 0xe3b1a7, 0xe3b1a8, - /*** Four byte table, leaf: 8231c9xx - offset 0x07752 ***/ + /*** Four byte table, leaf: 8231c9xx - offset 0x077c6 ***/ /* 30 */ 0xe3b1a9, 0xe3b1aa, 0xe3b1ab, 0xe3b1ac, /* 34 */ 0xe3b1ad, 0xe3b1af, 0xe3b1b0, 0xe3b1b1, /* 38 */ 0xe3b1b2, 0xe3b1b3, - /*** Four byte table, leaf: 8231caxx - offset 0x0775c ***/ + /*** Four byte table, leaf: 8231caxx - offset 0x077d0 ***/ /* 
30 */ 0xe3b1b4, 0xe3b1b5, 0xe3b1b6, 0xe3b1b7, /* 34 */ 0xe3b1b8, 0xe3b1b9, 0xe3b1ba, 0xe3b1bb, /* 38 */ 0xe3b1bc, 0xe3b1bd, - /*** Four byte table, leaf: 8231cbxx - offset 0x07766 ***/ + /*** Four byte table, leaf: 8231cbxx - offset 0x077da ***/ /* 30 */ 0xe3b1be, 0xe3b1bf, 0xe3b280, 0xe3b281, /* 34 */ 0xe3b282, 0xe3b283, 0xe3b284, 0xe3b285, /* 38 */ 0xe3b286, 0xe3b287, - /*** Four byte table, leaf: 8231ccxx - offset 0x07770 ***/ + /*** Four byte table, leaf: 8231ccxx - offset 0x077e4 ***/ /* 30 */ 0xe3b288, 0xe3b289, 0xe3b28a, 0xe3b28b, /* 34 */ 0xe3b28c, 0xe3b28d, 0xe3b28e, 0xe3b28f, /* 38 */ 0xe3b290, 0xe3b291, - /*** Four byte table, leaf: 8231cdxx - offset 0x0777a ***/ + /*** Four byte table, leaf: 8231cdxx - offset 0x077ee ***/ /* 30 */ 0xe3b292, 0xe3b293, 0xe3b294, 0xe3b295, /* 34 */ 0xe3b296, 0xe3b297, 0xe3b298, 0xe3b299, /* 38 */ 0xe3b29a, 0xe3b29b, - /*** Four byte table, leaf: 8231cexx - offset 0x07784 ***/ + /*** Four byte table, leaf: 8231cexx - offset 0x077f8 ***/ /* 30 */ 0xe3b29c, 0xe3b29d, 0xe3b29e, 0xe3b29f, /* 34 */ 0xe3b2a0, 0xe3b2a1, 0xe3b2a2, 0xe3b2a3, /* 38 */ 0xe3b2a4, 0xe3b2a5, - /*** Four byte table, leaf: 8231cfxx - offset 0x0778e ***/ + /*** Four byte table, leaf: 8231cfxx - offset 0x07802 ***/ /* 30 */ 0xe3b2a6, 0xe3b2a7, 0xe3b2a8, 0xe3b2a9, /* 34 */ 0xe3b2aa, 0xe3b2ab, 0xe3b2ac, 0xe3b2ad, /* 38 */ 0xe3b2ae, 0xe3b2af, - /*** Four byte table, leaf: 8231d0xx - offset 0x07798 ***/ + /*** Four byte table, leaf: 8231d0xx - offset 0x0780c ***/ /* 30 */ 0xe3b2b0, 0xe3b2b1, 0xe3b2b2, 0xe3b2b3, /* 34 */ 0xe3b2b4, 0xe3b2b5, 0xe3b2b6, 0xe3b2b7, /* 38 */ 0xe3b2b8, 0xe3b2b9, - /*** Four byte table, leaf: 8231d1xx - offset 0x077a2 ***/ + /*** Four byte table, leaf: 8231d1xx - offset 0x07816 ***/ /* 30 */ 0xe3b2ba, 0xe3b2bb, 0xe3b2bc, 0xe3b2bd, /* 34 */ 0xe3b2be, 0xe3b2bf, 0xe3b380, 0xe3b381, /* 38 */ 0xe3b382, 0xe3b383, - /*** Four byte table, leaf: 8231d2xx - offset 0x077ac ***/ + /*** Four byte table, leaf: 8231d2xx - offset 0x07820 ***/ /* 30 */ 0xe3b384, 0xe3b385, 0xe3b386, 0xe3b387, /* 34 */ 0xe3b388, 0xe3b389, 0xe3b38a, 0xe3b38b, /* 38 */ 0xe3b38c, 0xe3b38d, - /*** Four byte table, leaf: 8231d3xx - offset 0x077b6 ***/ + /*** Four byte table, leaf: 8231d3xx - offset 0x0782a ***/ /* 30 */ 0xe3b38e, 0xe3b38f, 0xe3b390, 0xe3b391, /* 34 */ 0xe3b392, 0xe3b393, 0xe3b394, 0xe3b395, /* 38 */ 0xe3b396, 0xe3b397, - /*** Four byte table, leaf: 8231d4xx - offset 0x077c0 ***/ + /*** Four byte table, leaf: 8231d4xx - offset 0x07834 ***/ /* 30 */ 0xe3b398, 0xe3b399, 0xe3b39a, 0xe3b39b, /* 34 */ 0xe3b39c, 0xe3b39d, 0xe3b39e, 0xe3b39f, /* 2 trailing zero values shared with next segment */ - /*** Four byte table, leaf: 8232afxx - offset 0x077c8 ***/ + /*** Four byte table, leaf: 8232afxx - offset 0x0783c ***/ /* 30 */ 0x000000, 0x000000, 0x000000, 0xe48197, /* 34 */ 0xe48198, 0xe48199, 0xe4819a, 0xe4819b, /* 38 */ 0xe4819c, 0xe4819d, - /*** Four byte table, leaf: 8232b0xx - offset 0x077d2 ***/ + /*** Four byte table, leaf: 8232b0xx - offset 0x07846 ***/ /* 30 */ 0xe4819e, 0xe4819f, 0xe481a0, 0xe481a1, /* 34 */ 0xe481a2, 0xe481a3, 0xe481a4, 0xe481a5, /* 38 */ 0xe481a6, 0xe481a7, - /*** Four byte table, leaf: 8232b1xx - offset 0x077dc ***/ + /*** Four byte table, leaf: 8232b1xx - offset 0x07850 ***/ /* 30 */ 0xe481a8, 0xe481a9, 0xe481aa, 0xe481ab, /* 34 */ 0xe481ac, 0xe481ad, 0xe481ae, 0xe481af, /* 38 */ 0xe481b0, 0xe481b1, - /*** Four byte table, leaf: 8232b2xx - offset 0x077e6 ***/ + /*** Four byte table, leaf: 8232b2xx - offset 0x0785a ***/ /* 30 */ 0xe481b2, 0xe481b3, 0xe481b4, 
0xe481b5, /* 34 */ 0xe481b6, 0xe481b7, 0xe481b8, 0xe481b9, /* 38 */ 0xe481ba, 0xe481bb, - /*** Four byte table, leaf: 8232b3xx - offset 0x077f0 ***/ + /*** Four byte table, leaf: 8232b3xx - offset 0x07864 ***/ /* 30 */ 0xe481bc, 0xe481bd, 0xe481be, 0xe481bf, /* 34 */ 0xe48280, 0xe48281, 0xe48282, 0xe48283, /* 38 */ 0xe48284, 0xe48285, - /*** Four byte table, leaf: 8232b4xx - offset 0x077fa ***/ + /*** Four byte table, leaf: 8232b4xx - offset 0x0786e ***/ /* 30 */ 0xe48286, 0xe48287, 0xe48288, 0xe48289, /* 34 */ 0xe4828a, 0xe4828b, 0xe4828c, 0xe4828d, /* 38 */ 0xe4828e, 0xe4828f, - /*** Four byte table, leaf: 8232b5xx - offset 0x07804 ***/ + /*** Four byte table, leaf: 8232b5xx - offset 0x07878 ***/ /* 30 */ 0xe48290, 0xe48291, 0xe48292, 0xe48293, /* 34 */ 0xe48294, 0xe48295, 0xe48296, 0xe48297, /* 38 */ 0xe48298, 0xe48299, - /*** Four byte table, leaf: 8232b6xx - offset 0x0780e ***/ + /*** Four byte table, leaf: 8232b6xx - offset 0x07882 ***/ /* 30 */ 0xe4829a, 0xe4829b, 0xe4829c, 0xe4829d, /* 34 */ 0xe4829e, 0xe4829f, 0xe482a0, 0xe482a1, /* 38 */ 0xe482a2, 0xe482a3, - /*** Four byte table, leaf: 8232b7xx - offset 0x07818 ***/ + /*** Four byte table, leaf: 8232b7xx - offset 0x0788c ***/ /* 30 */ 0xe482a4, 0xe482a5, 0xe482a6, 0xe482a7, /* 34 */ 0xe482a8, 0xe482a9, 0xe482aa, 0xe482ab, /* 38 */ 0xe482ac, 0xe482ad, - /*** Four byte table, leaf: 8232b8xx - offset 0x07822 ***/ + /*** Four byte table, leaf: 8232b8xx - offset 0x07896 ***/ /* 30 */ 0xe482ae, 0xe482af, 0xe482b0, 0xe482b1, /* 34 */ 0xe482b2, 0xe482b3, 0xe482b4, 0xe482b5, /* 38 */ 0xe482b6, 0xe482b7, - /*** Four byte table, leaf: 8232b9xx - offset 0x0782c ***/ + /*** Four byte table, leaf: 8232b9xx - offset 0x078a0 ***/ /* 30 */ 0xe482b8, 0xe482b9, 0xe482ba, 0xe482bb, /* 34 */ 0xe482bc, 0xe482bd, 0xe482be, 0xe482bf, /* 38 */ 0xe48380, 0xe48381, - /*** Four byte table, leaf: 8232baxx - offset 0x07836 ***/ + /*** Four byte table, leaf: 8232baxx - offset 0x078aa ***/ /* 30 */ 0xe48382, 0xe48383, 0xe48384, 0xe48385, /* 34 */ 0xe48386, 0xe48387, 0xe48388, 0xe48389, /* 38 */ 0xe4838a, 0xe4838b, - /*** Four byte table, leaf: 8232bbxx - offset 0x07840 ***/ + /*** Four byte table, leaf: 8232bbxx - offset 0x078b4 ***/ /* 30 */ 0xe4838c, 0xe4838d, 0xe4838e, 0xe4838f, /* 34 */ 0xe48390, 0xe48391, 0xe48392, 0xe48393, /* 38 */ 0xe48394, 0xe48395, - /*** Four byte table, leaf: 8232bcxx - offset 0x0784a ***/ + /*** Four byte table, leaf: 8232bcxx - offset 0x078be ***/ /* 30 */ 0xe48396, 0xe48397, 0xe48398, 0xe48399, /* 34 */ 0xe4839a, 0xe4839b, 0xe4839c, 0xe4839d, /* 38 */ 0xe4839e, 0xe4839f, - /*** Four byte table, leaf: 8232bdxx - offset 0x07854 ***/ + /*** Four byte table, leaf: 8232bdxx - offset 0x078c8 ***/ /* 30 */ 0xe483a0, 0xe483a1, 0xe483a2, 0xe483a3, /* 34 */ 0xe483a4, 0xe483a5, 0xe483a6, 0xe483a7, /* 38 */ 0xe483a8, 0xe483a9, - /*** Four byte table, leaf: 8232bexx - offset 0x0785e ***/ + /*** Four byte table, leaf: 8232bexx - offset 0x078d2 ***/ /* 30 */ 0xe483aa, 0xe483ab, 0xe483ac, 0xe483ad, /* 34 */ 0xe483ae, 0xe483af, 0xe483b0, 0xe483b1, /* 38 */ 0xe483b2, 0xe483b3, - /*** Four byte table, leaf: 8232bfxx - offset 0x07868 ***/ + /*** Four byte table, leaf: 8232bfxx - offset 0x078dc ***/ /* 30 */ 0xe483b4, 0xe483b5, 0xe483b6, 0xe483b7, /* 34 */ 0xe483b8, 0xe483b9, 0xe483ba, 0xe483bb, /* 38 */ 0xe483bc, 0xe483bd, - /*** Four byte table, leaf: 8232c0xx - offset 0x07872 ***/ + /*** Four byte table, leaf: 8232c0xx - offset 0x078e6 ***/ /* 30 */ 0xe483be, 0xe483bf, 0xe48480, 0xe48481, /* 34 */ 0xe48482, 0xe48483, 0xe48484, 0xe48485, /* 38 */ 
0xe48486, 0xe48487, - /*** Four byte table, leaf: 8232c1xx - offset 0x0787c ***/ + /*** Four byte table, leaf: 8232c1xx - offset 0x078f0 ***/ /* 30 */ 0xe48488, 0xe48489, 0xe4848a, 0xe4848b, /* 34 */ 0xe4848c, 0xe4848d, 0xe4848e, 0xe4848f, /* 38 */ 0xe48490, 0xe48491, - /*** Four byte table, leaf: 8232c2xx - offset 0x07886 ***/ + /*** Four byte table, leaf: 8232c2xx - offset 0x078fa ***/ /* 30 */ 0xe48492, 0xe48493, 0xe48494, 0xe48495, /* 34 */ 0xe48496, 0xe48497, 0xe48498, 0xe48499, /* 38 */ 0xe4849a, 0xe4849b, - /*** Four byte table, leaf: 8232c3xx - offset 0x07890 ***/ + /*** Four byte table, leaf: 8232c3xx - offset 0x07904 ***/ /* 30 */ 0xe4849c, 0xe4849d, 0xe4849e, 0xe4849f, /* 34 */ 0xe484a0, 0xe484a1, 0xe484a2, 0xe484a3, /* 38 */ 0xe484a4, 0xe484a5, - /*** Four byte table, leaf: 8232c4xx - offset 0x0789a ***/ + /*** Four byte table, leaf: 8232c4xx - offset 0x0790e ***/ /* 30 */ 0xe484a6, 0xe484a7, 0xe484a8, 0xe484a9, /* 34 */ 0xe484aa, 0xe484ab, 0xe484ac, 0xe484ad, /* 38 */ 0xe484ae, 0xe484af, - /*** Four byte table, leaf: 8232c5xx - offset 0x078a4 ***/ + /*** Four byte table, leaf: 8232c5xx - offset 0x07918 ***/ /* 30 */ 0xe484b0, 0xe484b1, 0xe484b2, 0xe484b3, /* 34 */ 0xe484b4, 0xe484b5, 0xe484b6, 0xe484b7, /* 38 */ 0xe484b8, 0xe484b9, - /*** Four byte table, leaf: 8232c6xx - offset 0x078ae ***/ + /*** Four byte table, leaf: 8232c6xx - offset 0x07922 ***/ /* 30 */ 0xe484ba, 0xe484bb, 0xe484bc, 0xe484bd, /* 34 */ 0xe484be, 0xe484bf, 0xe48580, 0xe48581, /* 38 */ 0xe48582, 0xe48583, - /*** Four byte table, leaf: 8232c7xx - offset 0x078b8 ***/ + /*** Four byte table, leaf: 8232c7xx - offset 0x0792c ***/ /* 30 */ 0xe48584, 0xe48585, 0xe48586, 0xe48587, /* 34 */ 0xe48588, 0xe48589, 0xe4858a, 0xe4858b, /* 38 */ 0xe4858c, 0xe4858d, - /*** Four byte table, leaf: 8232c8xx - offset 0x078c2 ***/ + /*** Four byte table, leaf: 8232c8xx - offset 0x07936 ***/ /* 30 */ 0xe4858e, 0xe4858f, 0xe48590, 0xe48591, /* 34 */ 0xe48592, 0xe48593, 0xe48594, 0xe48595, /* 38 */ 0xe48596, 0xe48597, - /*** Four byte table, leaf: 8232c9xx - offset 0x078cc ***/ + /*** Four byte table, leaf: 8232c9xx - offset 0x07940 ***/ /* 30 */ 0xe48598, 0xe48599, 0xe4859a, 0xe4859b, /* 34 */ 0xe4859c, 0xe4859d, 0xe4859e, /* 3 trailing zero values shared with next segment */ - /*** Four byte table, leaf: 8232f8xx - offset 0x078d3 ***/ + /*** Four byte table, leaf: 8232f8xx - offset 0x07947 ***/ /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 34 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 38 */ 0xe48cb8, 0xe48cb9, - /*** Four byte table, leaf: 8232f9xx - offset 0x078dd ***/ + /*** Four byte table, leaf: 8232f9xx - offset 0x07951 ***/ /* 30 */ 0xe48cba, 0xe48cbb, 0xe48cbc, 0xe48cbd, /* 34 */ 0xe48cbe, 0xe48cbf, 0xe48d80, 0xe48d81, /* 38 */ 0xe48d82, 0xe48d83, - /*** Four byte table, leaf: 8232faxx - offset 0x078e7 ***/ + /*** Four byte table, leaf: 8232faxx - offset 0x0795b ***/ /* 30 */ 0xe48d84, 0xe48d85, 0xe48d86, 0xe48d87, /* 34 */ 0xe48d88, 0xe48d89, 0xe48d8a, 0xe48d8b, /* 38 */ 0xe48d8c, 0xe48d8d, - /*** Four byte table, leaf: 8232fbxx - offset 0x078f1 ***/ + /*** Four byte table, leaf: 8232fbxx - offset 0x07965 ***/ /* 30 */ 0xe48d8e, 0xe48d8f, 0xe48d90, 0xe48d91, /* 34 */ 0xe48d92, 0xe48d93, 0xe48d94, 0xe48d95, /* 38 */ 0xe48d96, 0xe48d97, - /*** Four byte table, leaf: 8232fcxx - offset 0x078fb ***/ + /*** Four byte table, leaf: 8232fcxx - offset 0x0796f ***/ /* 30 */ 0xe48d98, 0xe48d99, 0xe48d9a, 0xe48d9b, /* 34 */ 0xe48d9c, 0xe48d9d, 0xe48d9e, 0xe48d9f, /* 38 */ 0xe48da0, 0xe48da1, - /*** Four byte table, leaf: 
8232fdxx - offset 0x07905 ***/ + /*** Four byte table, leaf: 8232fdxx - offset 0x07979 ***/ /* 30 */ 0xe48da2, 0xe48da3, 0xe48da4, 0xe48da5, /* 34 */ 0xe48da6, 0xe48da7, 0xe48da8, 0xe48da9, /* 38 */ 0xe48daa, 0xe48dab, - /*** Four byte table, leaf: 8232fexx - offset 0x0790f ***/ + /*** Four byte table, leaf: 8232fexx - offset 0x07983 ***/ /* 30 */ 0xe48dac, 0xe48dad, 0xe48dae, 0xe48daf, /* 34 */ 0xe48db0, 0xe48db1, 0xe48db2, 0xe48db3, /* 38 */ 0xe48db4, 0xe48db5, - /*** Four byte table, leaf: 823381xx - offset 0x07919 ***/ + /*** Four byte table, leaf: 823381xx - offset 0x0798d ***/ /* 30 */ 0xe48db6, 0xe48db7, 0xe48db8, 0xe48db9, /* 34 */ 0xe48dba, 0xe48dbb, 0xe48dbc, 0xe48dbd, /* 38 */ 0xe48dbe, 0xe48dbf, - /*** Four byte table, leaf: 823382xx - offset 0x07923 ***/ + /*** Four byte table, leaf: 823382xx - offset 0x07997 ***/ /* 30 */ 0xe48e80, 0xe48e81, 0xe48e82, 0xe48e83, /* 34 */ 0xe48e84, 0xe48e85, 0xe48e86, 0xe48e87, /* 38 */ 0xe48e88, 0xe48e89, - /*** Four byte table, leaf: 823383xx - offset 0x0792d ***/ + /*** Four byte table, leaf: 823383xx - offset 0x079a1 ***/ /* 30 */ 0xe48e8a, 0xe48e8b, 0xe48e8c, 0xe48e8d, /* 34 */ 0xe48e8e, 0xe48e8f, 0xe48e90, 0xe48e91, /* 38 */ 0xe48e92, 0xe48e93, - /*** Four byte table, leaf: 823384xx - offset 0x07937 ***/ + /*** Four byte table, leaf: 823384xx - offset 0x079ab ***/ /* 30 */ 0xe48e94, 0xe48e95, 0xe48e96, 0xe48e97, /* 34 */ 0xe48e98, 0xe48e99, 0xe48e9a, 0xe48e9b, /* 38 */ 0xe48e9c, 0xe48e9d, - /*** Four byte table, leaf: 823385xx - offset 0x07941 ***/ + /*** Four byte table, leaf: 823385xx - offset 0x079b5 ***/ /* 30 */ 0xe48e9e, 0xe48e9f, 0xe48ea0, 0xe48ea1, /* 34 */ 0xe48ea2, 0xe48ea3, 0xe48ea4, 0xe48ea5, /* 38 */ 0xe48ea6, 0xe48ea7, - /*** Four byte table, leaf: 823386xx - offset 0x0794b ***/ + /*** Four byte table, leaf: 823386xx - offset 0x079bf ***/ /* 30 */ 0xe48ea8, 0xe48ea9, 0xe48eaa, 0xe48eab, /* 34 */ 0xe48ead, 0xe48eae, 0xe48eaf, 0xe48eb0, /* 38 */ 0xe48eb2, 0xe48eb3, - /*** Four byte table, leaf: 823387xx - offset 0x07955 ***/ + /*** Four byte table, leaf: 823387xx - offset 0x079c9 ***/ /* 30 */ 0xe48eb4, 0xe48eb5, 0xe48eb6, 0xe48eb7, /* 34 */ 0xe48eb8, 0xe48eb9, 0xe48eba, 0xe48ebb, /* 38 */ 0xe48ebc, 0xe48ebd, - /*** Four byte table, leaf: 823388xx - offset 0x0795f ***/ + /*** Four byte table, leaf: 823388xx - offset 0x079d3 ***/ /* 30 */ 0xe48ebe, 0xe48ebf, 0xe48f80, 0xe48f81, /* 34 */ 0xe48f82, 0xe48f83, 0xe48f84, 0xe48f85, /* 38 */ 0xe48f86, 0xe48f87, - /*** Four byte table, leaf: 823389xx - offset 0x07969 ***/ + /*** Four byte table, leaf: 823389xx - offset 0x079dd ***/ /* 30 */ 0xe48f88, 0xe48f89, 0xe48f8a, 0xe48f8b, /* 34 */ 0xe48f8c, 0xe48f8d, 0xe48f8e, 0xe48f8f, /* 38 */ 0xe48f90, 0xe48f91, - /*** Four byte table, leaf: 82338axx - offset 0x07973 ***/ + /*** Four byte table, leaf: 82338axx - offset 0x079e7 ***/ /* 30 */ 0xe48f92, 0xe48f93, 0xe48f94, 0xe48f95, /* 34 */ 0xe48f96, 0xe48f97, 0xe48f98, 0xe48f99, /* 38 */ 0xe48f9a, 0xe48f9b, - /*** Four byte table, leaf: 82338bxx - offset 0x0797d ***/ + /*** Four byte table, leaf: 82338bxx - offset 0x079f1 ***/ /* 30 */ 0xe48f9c, 0xe48f9e, 0xe48f9f, 0xe48fa0, /* 34 */ 0xe48fa1, 0xe48fa2, 0xe48fa3, 0xe48fa4, /* 38 */ 0xe48fa5, 0xe48fa6, - /*** Four byte table, leaf: 82338cxx - offset 0x07987 ***/ + /*** Four byte table, leaf: 82338cxx - offset 0x079fb ***/ /* 30 */ 0xe48fa7, 0xe48fa8, 0xe48fa9, 0xe48faa, /* 34 */ 0xe48fab, 0xe48fac, 0xe48fad, 0xe48fae, /* 38 */ 0xe48faf, 0xe48fb0, - /*** Four byte table, leaf: 82338dxx - offset 0x07991 ***/ + /*** Four byte table, leaf: 
82338dxx - offset 0x07a05 ***/ /* 30 */ 0xe48fb1, 0xe48fb2, 0xe48fb3, 0xe48fb4, /* 34 */ 0xe48fb5, 0xe48fb6, 0xe48fb7, 0xe48fb8, /* 38 */ 0xe48fb9, 0xe48fba, - /*** Four byte table, leaf: 82338exx - offset 0x0799b ***/ + /*** Four byte table, leaf: 82338exx - offset 0x07a0f ***/ /* 30 */ 0xe48fbb, 0xe48fbc, 0xe48fbd, 0xe48fbe, /* 34 */ 0xe48fbf, 0xe49080, 0xe49081, 0xe49082, /* 38 */ 0xe49083, 0xe49084, - /*** Four byte table, leaf: 82338fxx - offset 0x079a5 ***/ + /*** Four byte table, leaf: 82338fxx - offset 0x07a19 ***/ /* 30 */ 0xe49085, 0xe49086, 0xe49087, 0xe49088, /* 34 */ 0xe49089, 0xe4908a, 0xe4908b, 0xe4908c, /* 38 */ 0xe4908d, 0xe4908e, - /*** Four byte table, leaf: 823390xx - offset 0x079af ***/ + /*** Four byte table, leaf: 823390xx - offset 0x07a23 ***/ /* 30 */ 0xe4908f, 0xe49090, 0xe49091, 0xe49092, /* 34 */ 0xe49093, 0xe49094, 0xe49095, 0xe49096, /* 38 */ 0xe49097, 0xe49098, - /*** Four byte table, leaf: 823391xx - offset 0x079b9 ***/ + /*** Four byte table, leaf: 823391xx - offset 0x07a2d ***/ /* 30 */ 0xe49099, 0xe4909a, 0xe4909b, 0xe4909c, /* 34 */ 0xe4909d, 0xe4909e, 0xe4909f, 0xe490a0, /* 38 */ 0xe490a1, 0xe490a2, - /*** Four byte table, leaf: 823392xx - offset 0x079c3 ***/ + /*** Four byte table, leaf: 823392xx - offset 0x07a37 ***/ /* 30 */ 0xe490a3, 0xe490a4, 0xe490a5, 0xe490a6, /* 34 */ 0xe490a7, 0xe490a8, 0xe490a9, 0xe490aa, /* 38 */ 0xe490ab, 0xe490ac, - /*** Four byte table, leaf: 823393xx - offset 0x079cd ***/ + /*** Four byte table, leaf: 823393xx - offset 0x07a41 ***/ /* 30 */ 0xe490ad, 0xe490ae, 0xe490af, 0xe490b0, /* 34 */ 0xe490b1, 0xe490b2, 0xe490b3, 0xe490b4, /* 38 */ 0xe490b5, 0xe490b6, - /*** Four byte table, leaf: 823394xx - offset 0x079d7 ***/ + /*** Four byte table, leaf: 823394xx - offset 0x07a4b ***/ /* 30 */ 0xe490b7, 0xe490b8, 0xe490b9, 0xe490ba, /* 34 */ 0xe490bb, 0xe490bc, 0xe490bd, 0xe490be, /* 38 */ 0xe490bf, 0xe49180, - /*** Four byte table, leaf: 823395xx - offset 0x079e1 ***/ + /*** Four byte table, leaf: 823395xx - offset 0x07a55 ***/ /* 30 */ 0xe49181, 0xe49182, 0xe49183, 0xe49184, /* 34 */ 0xe49185, 0xe49186, 0xe49187, 0xe49188, /* 38 */ 0xe49189, 0xe4918a, - /*** Four byte table, leaf: 823396xx - offset 0x079eb ***/ + /*** Four byte table, leaf: 823396xx - offset 0x07a5f ***/ /* 30 */ 0xe4918b, 0xe4918c, 0xe4918d, 0xe4918e, /* 34 */ 0xe4918f, 0xe49190, 0xe49191, 0xe49192, /* 38 */ 0xe49193, 0xe49194, - /*** Four byte table, leaf: 823397xx - offset 0x079f5 ***/ + /*** Four byte table, leaf: 823397xx - offset 0x07a69 ***/ /* 30 */ 0xe49195, 0xe49196, 0xe49197, 0xe49198, /* 34 */ 0xe49199, 0xe4919a, 0xe4919b, 0xe4919c, /* 38 */ 0xe4919d, 0xe4919e, - /*** Four byte table, leaf: 823398xx - offset 0x079ff ***/ + /*** Four byte table, leaf: 823398xx - offset 0x07a73 ***/ /* 30 */ 0xe4919f, 0xe491a0, 0xe491a1, 0xe491a2, /* 34 */ 0xe491a3, 0xe491a4, 0xe491a5, 0xe491a6, /* 38 */ 0xe491a7, 0xe491a8, - /*** Four byte table, leaf: 823399xx - offset 0x07a09 ***/ + /*** Four byte table, leaf: 823399xx - offset 0x07a7d ***/ /* 30 */ 0xe491a9, 0xe491aa, 0xe491ab, 0xe491ac, /* 34 */ 0xe491ad, 0xe491ae, 0xe491af, 0xe491b0, /* 38 */ 0xe491b1, 0xe491b2, - /*** Four byte table, leaf: 82339axx - offset 0x07a13 ***/ + /*** Four byte table, leaf: 82339axx - offset 0x07a87 ***/ /* 30 */ 0xe491b3, 0xe491b4, 0xe491b5, 0xe491b6, /* 34 */ 0xe491b7, 0xe491b8, 0xe491b9, 0xe491ba, /* 38 */ 0xe491bb, 0xe491bc, - /*** Four byte table, leaf: 82339bxx - offset 0x07a1d ***/ + /*** Four byte table, leaf: 82339bxx - offset 0x07a91 ***/ /* 30 */ 0xe491bd, 0xe491be, 
0xe491bf, 0xe49280, /* 34 */ 0xe49281, 0xe49282, 0xe49283, 0xe49284, /* 38 */ 0xe49285, 0xe49286, - /*** Four byte table, leaf: 82339cxx - offset 0x07a27 ***/ + /*** Four byte table, leaf: 82339cxx - offset 0x07a9b ***/ /* 30 */ 0xe49287, 0xe49288, 0xe49289, 0xe4928a, /* 34 */ 0xe4928b, 0xe4928c, 0xe4928d, 0xe4928e, /* 38 */ 0xe4928f, 0xe49290, - /*** Four byte table, leaf: 82339dxx - offset 0x07a31 ***/ + /*** Four byte table, leaf: 82339dxx - offset 0x07aa5 ***/ /* 30 */ 0xe49291, 0xe49292, 0xe49293, 0xe49294, /* 34 */ 0xe49295, 0xe49296, 0xe49297, 0xe49298, /* 38 */ 0xe49299, 0xe4929a, - /*** Four byte table, leaf: 82339exx - offset 0x07a3b ***/ + /*** Four byte table, leaf: 82339exx - offset 0x07aaf ***/ /* 30 */ 0xe4929b, 0xe4929c, 0xe4929d, 0xe4929e, /* 34 */ 0xe4929f, 0xe492a0, 0xe492a1, 0xe492a2, /* 38 */ 0xe492a3, 0xe492a4, - /*** Four byte table, leaf: 82339fxx - offset 0x07a45 ***/ + /*** Four byte table, leaf: 82339fxx - offset 0x07ab9 ***/ /* 30 */ 0xe492a5, 0xe492a6, 0xe492a7, 0xe492a8, /* 34 */ 0xe492a9, 0xe492aa, 0xe492ab, 0xe492ac, /* 38 */ 0xe492ad, 0xe492ae, - /*** Four byte table, leaf: 8233a0xx - offset 0x07a4f ***/ + /*** Four byte table, leaf: 8233a0xx - offset 0x07ac3 ***/ /* 30 */ 0xe492af, 0xe492b0, 0xe492b1, 0xe492b2, /* 34 */ 0xe492b3, 0xe492b4, 0xe492b5, 0xe492b6, /* 38 */ 0xe492b7, 0xe492b8, - /*** Four byte table, leaf: 8233a1xx - offset 0x07a59 ***/ + /*** Four byte table, leaf: 8233a1xx - offset 0x07acd ***/ /* 30 */ 0xe492b9, 0xe492ba, 0xe492bb, 0xe492bc, /* 34 */ 0xe492bd, 0xe492be, 0xe492bf, 0xe49380, /* 38 */ 0xe49381, 0xe49382, - /*** Four byte table, leaf: 8233a2xx - offset 0x07a63 ***/ + /*** Four byte table, leaf: 8233a2xx - offset 0x07ad7 ***/ /* 30 */ 0xe49383, 0xe49384, 0xe49385, 0xe49386, /* 34 */ 0xe49387, 0xe49388, 0xe49389, 0xe4938a, /* 38 */ 0xe4938b, 0xe4938c, - /*** Four byte table, leaf: 8233a3xx - offset 0x07a6d ***/ + /*** Four byte table, leaf: 8233a3xx - offset 0x07ae1 ***/ /* 30 */ 0xe4938d, 0xe4938e, 0xe4938f, 0xe49390, /* 34 */ 0xe49391, 0xe49392, 0xe49393, 0xe49394, /* 38 */ 0xe49395, /* 1 trailing zero values shared with next segment */ - /*** Four byte table, leaf: 8233c9xx - offset 0x07a76 ***/ + /*** Four byte table, leaf: 8233c9xx - offset 0x07aea ***/ /* 30 */ 0x000000, 0x000000, 0xe4998d, 0xe4998e, /* 34 */ 0xe4998f, 0xe49990, 0xe49991, 0xe49992, /* 38 */ 0xe49993, 0xe49994, - /*** Four byte table, leaf: 8233caxx - offset 0x07a80 ***/ + /*** Four byte table, leaf: 8233caxx - offset 0x07af4 ***/ /* 30 */ 0xe49995, 0xe49996, 0xe49997, 0xe49998, /* 34 */ 0xe49999, 0xe4999a, 0xe4999b, 0xe4999c, /* 38 */ 0xe4999d, 0xe4999e, - /*** Four byte table, leaf: 8233cbxx - offset 0x07a8a ***/ + /*** Four byte table, leaf: 8233cbxx - offset 0x07afe ***/ /* 30 */ 0xe4999f, 0xe499a0, 0xe499a2, 0xe499a3, /* 34 */ 0xe499a4, 0xe499a5, 0xe499a6, 0xe499a7, /* 38 */ 0xe499a8, 0xe499a9, - /*** Four byte table, leaf: 8233ccxx - offset 0x07a94 ***/ + /*** Four byte table, leaf: 8233ccxx - offset 0x07b08 ***/ /* 30 */ 0xe499aa, 0xe499ab, 0xe499ac, 0xe499ad, /* 34 */ 0xe499ae, 0xe499af, 0xe499b0, 0xe499b1, /* 38 */ 0xe499b2, 0xe499b3, - /*** Four byte table, leaf: 8233cdxx - offset 0x07a9e ***/ + /*** Four byte table, leaf: 8233cdxx - offset 0x07b12 ***/ /* 30 */ 0xe499b4, 0xe499b5, 0xe499b6, 0xe499b7, /* 34 */ 0xe499b8, 0xe499b9, 0xe499ba, 0xe499bb, /* 38 */ 0xe499bc, 0xe499bd, - /*** Four byte table, leaf: 8233cexx - offset 0x07aa8 ***/ + /*** Four byte table, leaf: 8233cexx - offset 0x07b1c ***/ /* 30 */ 0xe499be, 0xe499bf, 0xe49a80, 0xe49a81, /* 
34 */ 0xe49a82, 0xe49a83, 0xe49a84, 0xe49a85, /* 38 */ 0xe49a86, 0xe49a87, - /*** Four byte table, leaf: 8233cfxx - offset 0x07ab2 ***/ + /*** Four byte table, leaf: 8233cfxx - offset 0x07b26 ***/ /* 30 */ 0xe49a88, 0xe49a89, 0xe49a8a, 0xe49a8b, /* 34 */ 0xe49a8c, 0xe49a8d, 0xe49a8e, 0xe49a8f, /* 38 */ 0xe49a90, 0xe49a91, - /*** Four byte table, leaf: 8233d0xx - offset 0x07abc ***/ + /*** Four byte table, leaf: 8233d0xx - offset 0x07b30 ***/ /* 30 */ 0xe49a92, 0xe49a93, 0xe49a94, 0xe49a95, /* 34 */ 0xe49a96, 0xe49a97, 0xe49a98, 0xe49a99, /* 38 */ 0xe49a9a, 0xe49a9b, - /*** Four byte table, leaf: 8233d1xx - offset 0x07ac6 ***/ + /*** Four byte table, leaf: 8233d1xx - offset 0x07b3a ***/ /* 30 */ 0xe49a9c, 0xe49a9d, 0xe49a9e, 0xe49a9f, /* 34 */ 0xe49aa0, 0xe49aa1, 0xe49aa2, 0xe49aa3, /* 38 */ 0xe49aa4, 0xe49aa5, - /*** Four byte table, leaf: 8233d2xx - offset 0x07ad0 ***/ + /*** Four byte table, leaf: 8233d2xx - offset 0x07b44 ***/ /* 30 */ 0xe49aa6, 0xe49aa7, 0xe49aa8, 0xe49aa9, /* 34 */ 0xe49aaa, 0xe49aab, 0xe49aac, 0xe49aad, /* 38 */ 0xe49aae, 0xe49aaf, - /*** Four byte table, leaf: 8233d3xx - offset 0x07ada ***/ + /*** Four byte table, leaf: 8233d3xx - offset 0x07b4e ***/ /* 30 */ 0xe49ab0, 0xe49ab1, 0xe49ab2, 0xe49ab3, /* 34 */ 0xe49ab4, 0xe49ab5, 0xe49ab6, 0xe49ab7, /* 38 */ 0xe49ab8, 0xe49ab9, - /*** Four byte table, leaf: 8233d4xx - offset 0x07ae4 ***/ + /*** Four byte table, leaf: 8233d4xx - offset 0x07b58 ***/ /* 30 */ 0xe49aba, 0xe49abb, 0xe49abc, 0xe49abd, /* 34 */ 0xe49abe, 0xe49abf, 0xe49b80, 0xe49b81, /* 38 */ 0xe49b82, 0xe49b83, - /*** Four byte table, leaf: 8233d5xx - offset 0x07aee ***/ + /*** Four byte table, leaf: 8233d5xx - offset 0x07b62 ***/ /* 30 */ 0xe49b84, 0xe49b85, 0xe49b86, 0xe49b87, /* 34 */ 0xe49b88, 0xe49b89, 0xe49b8a, 0xe49b8b, /* 38 */ 0xe49b8c, 0xe49b8d, - /*** Four byte table, leaf: 8233d6xx - offset 0x07af8 ***/ + /*** Four byte table, leaf: 8233d6xx - offset 0x07b6c ***/ /* 30 */ 0xe49b8e, 0xe49b8f, 0xe49b90, 0xe49b91, /* 34 */ 0xe49b92, 0xe49b93, 0xe49b94, 0xe49b95, /* 38 */ 0xe49b96, 0xe49b97, - /*** Four byte table, leaf: 8233d7xx - offset 0x07b02 ***/ + /*** Four byte table, leaf: 8233d7xx - offset 0x07b76 ***/ /* 30 */ 0xe49b98, 0xe49b99, 0xe49b9a, 0xe49b9b, /* 34 */ 0xe49b9c, 0xe49b9d, 0xe49b9e, 0xe49b9f, /* 38 */ 0xe49ba0, 0xe49ba1, - /*** Four byte table, leaf: 8233d8xx - offset 0x07b0c ***/ + /*** Four byte table, leaf: 8233d8xx - offset 0x07b80 ***/ /* 30 */ 0xe49ba2, 0xe49ba3, 0xe49ba4, 0xe49ba5, /* 34 */ 0xe49ba6, 0xe49ba7, 0xe49ba8, 0xe49ba9, /* 38 */ 0xe49baa, 0xe49bab, - /*** Four byte table, leaf: 8233d9xx - offset 0x07b16 ***/ + /*** Four byte table, leaf: 8233d9xx - offset 0x07b8a ***/ /* 30 */ 0xe49bac, 0xe49bad, 0xe49bae, 0xe49baf, /* 34 */ 0xe49bb0, 0xe49bb1, 0xe49bb2, 0xe49bb3, /* 38 */ 0xe49bb4, 0xe49bb5, - /*** Four byte table, leaf: 8233daxx - offset 0x07b20 ***/ + /*** Four byte table, leaf: 8233daxx - offset 0x07b94 ***/ /* 30 */ 0xe49bb6, 0xe49bb7, 0xe49bb8, 0xe49bb9, /* 34 */ 0xe49bba, 0xe49bbb, 0xe49bbc, 0xe49bbd, /* 38 */ 0xe49bbe, 0xe49bbf, - /*** Four byte table, leaf: 8233dbxx - offset 0x07b2a ***/ + /*** Four byte table, leaf: 8233dbxx - offset 0x07b9e ***/ /* 30 */ 0xe49c80, 0xe49c81, 0xe49c82, 0xe49c83, /* 34 */ 0xe49c84, 0xe49c85, 0xe49c86, 0xe49c87, /* 38 */ 0xe49c88, 0xe49c89, - /*** Four byte table, leaf: 8233dcxx - offset 0x07b34 ***/ + /*** Four byte table, leaf: 8233dcxx - offset 0x07ba8 ***/ /* 30 */ 0xe49c8a, 0xe49c8b, 0xe49c8c, 0xe49c8d, /* 34 */ 0xe49c8e, 0xe49c8f, 0xe49c90, 0xe49c91, /* 38 */ 0xe49c92, 
0xe49c93, - /*** Four byte table, leaf: 8233ddxx - offset 0x07b3e ***/ + /*** Four byte table, leaf: 8233ddxx - offset 0x07bb2 ***/ /* 30 */ 0xe49c94, 0xe49c95, 0xe49c96, 0xe49c97, /* 34 */ 0xe49c98, 0xe49c99, 0xe49c9a, 0xe49c9b, /* 38 */ 0xe49c9c, 0xe49c9d, - /*** Four byte table, leaf: 8233dexx - offset 0x07b48 ***/ + /*** Four byte table, leaf: 8233dexx - offset 0x07bbc ***/ /* 30 */ 0xe49c9e, 0xe49c9f, 0xe49ca0, 0xe49ca1, /* 34 */ 0xe49ca2, 0xe49ca4, 0xe49ca5, 0xe49ca6, /* 38 */ 0xe49ca7, 0xe49ca8, - /*** Four byte table, leaf: 8233dfxx - offset 0x07b52 ***/ + /*** Four byte table, leaf: 8233dfxx - offset 0x07bc6 ***/ /* 30 */ 0xe49caa, 0xe49cab, 0xe49cac, 0xe49cad, /* 34 */ 0xe49cae, 0xe49caf, 0xe49cb0, 0xe49cb1, /* 38 */ 0xe49cb2, 0xe49cb3, - /*** Four byte table, leaf: 8233e0xx - offset 0x07b5c ***/ + /*** Four byte table, leaf: 8233e0xx - offset 0x07bd0 ***/ /* 30 */ 0xe49cb4, 0xe49cb5, 0xe49cb6, 0xe49cb7, /* 34 */ 0xe49cb8, 0xe49cb9, 0xe49cba, 0xe49cbb, /* 38 */ 0xe49cbc, 0xe49cbd, - /*** Four byte table, leaf: 8233e1xx - offset 0x07b66 ***/ + /*** Four byte table, leaf: 8233e1xx - offset 0x07bda ***/ /* 30 */ 0xe49cbe, 0xe49cbf, 0xe49d80, 0xe49d81, /* 34 */ 0xe49d82, 0xe49d83, 0xe49d84, 0xe49d85, /* 38 */ 0xe49d86, 0xe49d87, - /*** Four byte table, leaf: 8233e2xx - offset 0x07b70 ***/ + /*** Four byte table, leaf: 8233e2xx - offset 0x07be4 ***/ /* 30 */ 0xe49d88, 0xe49d89, 0xe49d8a, 0xe49d8b, /* 34 */ 0xe49d8c, 0xe49d8d, 0xe49d8e, 0xe49d8f, /* 38 */ 0xe49d90, 0xe49d91, - /*** Four byte table, leaf: 8233e3xx - offset 0x07b7a ***/ + /*** Four byte table, leaf: 8233e3xx - offset 0x07bee ***/ /* 30 */ 0xe49d92, 0xe49d93, 0xe49d94, 0xe49d95, /* 34 */ 0xe49d96, 0xe49d97, 0xe49d98, 0xe49d99, /* 38 */ 0xe49d9a, 0xe49d9b, - /*** Four byte table, leaf: 8233e4xx - offset 0x07b84 ***/ + /*** Four byte table, leaf: 8233e4xx - offset 0x07bf8 ***/ /* 30 */ 0xe49d9c, 0xe49d9d, 0xe49d9e, 0xe49d9f, /* 34 */ 0xe49da0, 0xe49da1, 0xe49da2, 0xe49da3, /* 38 */ 0xe49da4, 0xe49da5, - /*** Four byte table, leaf: 8233e5xx - offset 0x07b8e ***/ + /*** Four byte table, leaf: 8233e5xx - offset 0x07c02 ***/ /* 30 */ 0xe49da6, 0xe49da7, 0xe49da8, 0xe49da9, /* 34 */ 0xe49daa, 0xe49dab, 0xe49dac, 0xe49dad, /* 38 */ 0xe49dae, 0xe49daf, - /*** Four byte table, leaf: 8233e6xx - offset 0x07b98 ***/ + /*** Four byte table, leaf: 8233e6xx - offset 0x07c0c ***/ /* 30 */ 0xe49db0, 0xe49db1, 0xe49db2, 0xe49db3, /* 34 */ 0xe49db4, 0xe49db5, 0xe49db6, 0xe49db7, /* 38 */ 0xe49db8, 0xe49db9, - /*** Four byte table, leaf: 8233e7xx - offset 0x07ba2 ***/ + /*** Four byte table, leaf: 8233e7xx - offset 0x07c16 ***/ /* 30 */ 0xe49dba, 0xe49dbb, 0xe49dbd, 0xe49dbe, /* 34 */ 0xe49dbf, 0xe49e80, 0xe49e81, 0xe49e82, /* 38 */ 0xe49e83, 0xe49e84, - /*** Four byte table, leaf: 8233e8xx - offset 0x07bac ***/ + /*** Four byte table, leaf: 8233e8xx - offset 0x07c20 ***/ /* 30 */ 0xe49e85, 0xe49e86, 0xe49e87, 0xe49e88, /* 34 */ 0xe49e89, 0xe49e8a, 0xe49e8b, 0xe49e8c, /* 2 trailing zero values shared with next segment */ - /*** Four byte table, leaf: 823496xx - offset 0x07bb4 ***/ + /*** Four byte table, leaf: 823496xx - offset 0x07c28 ***/ /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 34 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 38 */ 0x000000, 0xe4a588, - /*** Four byte table, leaf: 823497xx - offset 0x07bbe ***/ + /*** Four byte table, leaf: 823497xx - offset 0x07c32 ***/ /* 30 */ 0xe4a589, 0xe4a58a, 0xe4a58b, 0xe4a58c, /* 34 */ 0xe4a58d, 0xe4a58e, 0xe4a58f, 0xe4a590, /* 38 */ 0xe4a591, 0xe4a592, - /*** Four byte table, leaf: 
823498xx - offset 0x07bc8 ***/ + /*** Four byte table, leaf: 823498xx - offset 0x07c3c ***/ /* 30 */ 0xe4a593, 0xe4a594, 0xe4a595, 0xe4a596, /* 34 */ 0xe4a597, 0xe4a598, 0xe4a599, 0xe4a59a, /* 38 */ 0xe4a59b, 0xe4a59c, - /*** Four byte table, leaf: 823499xx - offset 0x07bd2 ***/ + /*** Four byte table, leaf: 823499xx - offset 0x07c46 ***/ /* 30 */ 0xe4a59d, 0xe4a59e, 0xe4a59f, 0xe4a5a0, /* 34 */ 0xe4a5a1, 0xe4a5a2, 0xe4a5a3, 0xe4a5a4, /* 38 */ 0xe4a5a5, 0xe4a5a6, - /*** Four byte table, leaf: 82349axx - offset 0x07bdc ***/ + /*** Four byte table, leaf: 82349axx - offset 0x07c50 ***/ /* 30 */ 0xe4a5a7, 0xe4a5a8, 0xe4a5a9, 0xe4a5aa, /* 34 */ 0xe4a5ab, 0xe4a5ac, 0xe4a5ad, 0xe4a5ae, /* 38 */ 0xe4a5af, 0xe4a5b0, - /*** Four byte table, leaf: 82349bxx - offset 0x07be6 ***/ + /*** Four byte table, leaf: 82349bxx - offset 0x07c5a ***/ /* 30 */ 0xe4a5b1, 0xe4a5b2, 0xe4a5b3, 0xe4a5b4, /* 34 */ 0xe4a5b5, 0xe4a5b6, 0xe4a5b7, 0xe4a5b8, /* 38 */ 0xe4a5b9, 0xe4a5bb, - /*** Four byte table, leaf: 82349cxx - offset 0x07bf0 ***/ + /*** Four byte table, leaf: 82349cxx - offset 0x07c64 ***/ /* 30 */ 0xe4a5bc, 0xe4a5be, 0xe4a5bf, 0xe4a680, /* 34 */ 0xe4a681, 0xe4a684, 0xe4a687, 0xe4a688, /* 38 */ 0xe4a689, 0xe4a68a, - /*** Four byte table, leaf: 82349dxx - offset 0x07bfa ***/ + /*** Four byte table, leaf: 82349dxx - offset 0x07c6e ***/ /* 30 */ 0xe4a68b, 0xe4a68c, 0xe4a68d, 0xe4a68e, /* 34 */ 0xe4a68f, 0xe4a690, 0xe4a691, 0xe4a692, /* 38 */ 0xe4a693, 0xe4a694, - /*** Four byte table, leaf: 82349exx - offset 0x07c04 ***/ + /*** Four byte table, leaf: 82349exx - offset 0x07c78 ***/ /* 30 */ 0xe4a695, 0xe4a696, 0xe4a697, 0xe4a698, /* 34 */ 0xe4a699, 0xe4a69a, 0xe4a69c, 0xe4a69d, /* 38 */ 0xe4a69e, 0xe4a6a0, - /*** Four byte table, leaf: 82349fxx - offset 0x07c0e ***/ + /*** Four byte table, leaf: 82349fxx - offset 0x07c82 ***/ /* 30 */ 0xe4a6a1, 0xe4a6a2, 0xe4a6a3, 0xe4a6a4, /* 34 */ 0xe4a6a5, 0xe4a6a6, 0xe4a6a7, 0xe4a6a8, /* 38 */ 0xe4a6a9, 0xe4a6aa, - /*** Four byte table, leaf: 8234a0xx - offset 0x07c18 ***/ + /*** Four byte table, leaf: 8234a0xx - offset 0x07c8c ***/ /* 30 */ 0xe4a6ab, 0xe4a6ac, 0xe4a6ad, 0xe4a6ae, /* 34 */ 0xe4a6af, 0xe4a6b0, 0xe4a6b1, 0xe4a6b2, /* 38 */ 0xe4a6b3, 0xe4a6b4, - /*** Four byte table, leaf: 8234a1xx - offset 0x07c22 ***/ + /*** Four byte table, leaf: 8234a1xx - offset 0x07c96 ***/ /* 30 */ 0xe4a6b5, 0x000000, 0x000000, 0x000000, /* 34 */ 0x000000, 0x000000, /* 4 trailing zero values shared with next segment */ - /*** Four byte table, leaf: 8234e7xx - offset 0x07c28 ***/ + /*** Four byte table, leaf: 8234e7xx - offset 0x07c9c ***/ /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 34 */ 0xe4b1b8, 0xe4b1b9, 0xe4b1ba, 0xe4b1bb, /* 38 */ 0xe4b1bc, 0xe4b1bd, - /*** Four byte table, leaf: 8234e8xx - offset 0x07c32 ***/ + /*** Four byte table, leaf: 8234e8xx - offset 0x07ca6 ***/ /* 30 */ 0xe4b1be, 0xe4b1bf, 0xe4b280, 0xe4b281, /* 34 */ 0xe4b282, 0xe4b283, 0xe4b284, 0xe4b285, /* 38 */ 0xe4b286, 0xe4b287, - /*** Four byte table, leaf: 8234e9xx - offset 0x07c3c ***/ + /*** Four byte table, leaf: 8234e9xx - offset 0x07cb0 ***/ /* 30 */ 0xe4b288, 0xe4b289, 0xe4b28a, 0xe4b28b, /* 34 */ 0xe4b28c, 0xe4b28d, 0xe4b28e, 0xe4b28f, /* 38 */ 0xe4b290, 0xe4b291, - /*** Four byte table, leaf: 8234eaxx - offset 0x07c46 ***/ + /*** Four byte table, leaf: 8234eaxx - offset 0x07cba ***/ /* 30 */ 0xe4b292, 0xe4b293, 0xe4b294, 0xe4b295, /* 34 */ 0xe4b296, 0xe4b297, 0xe4b298, 0xe4b299, /* 38 */ 0xe4b29a, 0xe4b29b, - /*** Four byte table, leaf: 8234ebxx - offset 0x07c50 ***/ + /*** Four byte table, leaf: 
8234ebxx - offset 0x07cc4 ***/ /* 30 */ 0xe4b29c, 0xe4b29d, 0xe4b29e, 0xe4b2a4, /* 34 */ 0xe4b2a5, 0xe4b2a6, 0xe4b2a7, 0xe4b2a8, /* 38 */ 0xe4b2a9, 0xe4b2aa, - /*** Four byte table, leaf: 8234ecxx - offset 0x07c5a ***/ + /*** Four byte table, leaf: 8234ecxx - offset 0x07cce ***/ /* 30 */ 0xe4b2ab, 0xe4b2ac, 0xe4b2ad, 0xe4b2ae, /* 34 */ 0xe4b2af, 0xe4b2b0, 0xe4b2b1, 0xe4b2b2, /* 38 */ 0xe4b2b3, 0xe4b2b4, - /*** Four byte table, leaf: 8234edxx - offset 0x07c64 ***/ + /*** Four byte table, leaf: 8234edxx - offset 0x07cd8 ***/ /* 30 */ 0xe4b2b5, 0xe4b2b6, 0xe4b2b7, 0xe4b2b8, /* 34 */ 0xe4b2b9, 0xe4b2ba, 0xe4b2bb, 0xe4b2bc, /* 38 */ 0xe4b2bd, 0xe4b2be, - /*** Four byte table, leaf: 8234eexx - offset 0x07c6e ***/ + /*** Four byte table, leaf: 8234eexx - offset 0x07ce2 ***/ /* 30 */ 0xe4b2bf, 0xe4b380, 0xe4b381, 0xe4b382, /* 34 */ 0xe4b383, 0xe4b384, 0xe4b385, 0xe4b386, /* 38 */ 0xe4b387, 0xe4b388, - /*** Four byte table, leaf: 8234efxx - offset 0x07c78 ***/ + /*** Four byte table, leaf: 8234efxx - offset 0x07cec ***/ /* 30 */ 0xe4b389, 0xe4b38a, 0xe4b38b, 0xe4b38c, /* 34 */ 0xe4b38d, 0xe4b38e, 0xe4b38f, 0xe4b390, /* 38 */ 0xe4b391, 0xe4b392, - /*** Four byte table, leaf: 8234f0xx - offset 0x07c82 ***/ + /*** Four byte table, leaf: 8234f0xx - offset 0x07cf6 ***/ /* 30 */ 0xe4b393, 0xe4b394, 0xe4b395, 0xe4b396, /* 34 */ 0xe4b397, 0xe4b398, 0xe4b399, 0xe4b39a, /* 38 */ 0xe4b39b, 0xe4b39c, - /*** Four byte table, leaf: 8234f1xx - offset 0x07c8c ***/ + /*** Four byte table, leaf: 8234f1xx - offset 0x07d00 ***/ /* 30 */ 0xe4b39d, 0xe4b39e, 0xe4b39f, 0xe4b3a0, /* 34 */ 0xe4b3a1, 0xe4b3a2, 0xe4b3a3, 0xe4b3a4, /* 38 */ 0xe4b3a5, 0xe4b3a6, - /*** Four byte table, leaf: 8234f2xx - offset 0x07c96 ***/ + /*** Four byte table, leaf: 8234f2xx - offset 0x07d0a ***/ /* 30 */ 0xe4b3a7, 0xe4b3a8, 0xe4b3a9, 0xe4b3aa, /* 34 */ 0xe4b3ab, 0xe4b3ac, 0xe4b3ad, 0xe4b3ae, /* 38 */ 0xe4b3af, 0xe4b3b0, - /*** Four byte table, leaf: 8234f3xx - offset 0x07ca0 ***/ + /*** Four byte table, leaf: 8234f3xx - offset 0x07d14 ***/ /* 30 */ 0xe4b3b1, 0xe4b3b2, 0xe4b3b3, 0xe4b3b4, /* 34 */ 0xe4b3b5, 0xe4b3b6, 0xe4b3b7, 0xe4b3b8, /* 38 */ 0xe4b3b9, 0xe4b3ba, - /*** Four byte table, leaf: 8234f4xx - offset 0x07caa ***/ + /*** Four byte table, leaf: 8234f4xx - offset 0x07d1e ***/ /* 30 */ 0xe4b3bb, 0xe4b3bc, 0xe4b3bd, 0xe4b3be, /* 34 */ 0xe4b3bf, 0xe4b480, 0xe4b481, 0xe4b482, /* 38 */ 0xe4b483, 0xe4b484, - /*** Four byte table, leaf: 8234f5xx - offset 0x07cb4 ***/ + /*** Four byte table, leaf: 8234f5xx - offset 0x07d28 ***/ /* 30 */ 0xe4b485, 0xe4b486, 0xe4b487, 0xe4b488, /* 34 */ 0xe4b489, 0xe4b48a, 0xe4b48b, 0xe4b48c, /* 38 */ 0xe4b48d, 0xe4b48e, - /*** Four byte table, leaf: 8234f6xx - offset 0x07cbe ***/ + /*** Four byte table, leaf: 8234f6xx - offset 0x07d32 ***/ /* 30 */ 0xe4b48f, 0xe4b490, 0xe4b491, 0xe4b492, /* 34 */ 0xe4b49a, 0xe4b49b, 0xe4b49c, 0xe4b49d, /* 38 */ 0xe4b49e, 0xe4b49f, - /*** Four byte table, leaf: 8234f7xx - offset 0x07cc8 ***/ + /*** Four byte table, leaf: 8234f7xx - offset 0x07d3c ***/ /* 30 */ 0xe4b4a0, 0xe4b4a1, 0xe4b4a2, 0xe4b4a3, /* 34 */ 0xe4b4a4, 0xe4b4a5, 0xe4b4a6, 0xe4b4a7, /* 38 */ 0xe4b4a8, 0xe4b4a9, - /*** Four byte table, leaf: 8234f8xx - offset 0x07cd2 ***/ + /*** Four byte table, leaf: 8234f8xx - offset 0x07d46 ***/ /* 30 */ 0xe4b4aa, 0xe4b4ab, 0xe4b4ac, 0xe4b4ad, /* 34 */ 0xe4b4ae, 0xe4b4af, 0xe4b4b0, 0xe4b4b1, /* 38 */ 0xe4b4b2, 0xe4b4b3, - /*** Four byte table, leaf: 8234f9xx - offset 0x07cdc ***/ + /*** Four byte table, leaf: 8234f9xx - offset 0x07d50 ***/ /* 30 */ 0xe4b4b4, 0xe4b4b5, 
0xe4b4b6, 0xe4b4b7, /* 34 */ 0xe4b4b8, 0xe4b4b9, 0xe4b4ba, 0xe4b4bb, /* 38 */ 0xe4b4bc, 0xe4b4bd, - /*** Four byte table, leaf: 8234faxx - offset 0x07ce6 ***/ + /*** Four byte table, leaf: 8234faxx - offset 0x07d5a ***/ /* 30 */ 0xe4b4be, 0xe4b4bf, 0xe4b580, 0xe4b581, /* 34 */ 0xe4b582, 0xe4b583, 0xe4b584, 0xe4b585, /* 38 */ 0xe4b586, 0xe4b587, - /*** Four byte table, leaf: 8234fbxx - offset 0x07cf0 ***/ + /*** Four byte table, leaf: 8234fbxx - offset 0x07d64 ***/ /* 30 */ 0xe4b588, 0xe4b589, 0xe4b58a, 0xe4b58b, /* 34 */ 0xe4b58c, 0xe4b58d, 0xe4b58e, 0xe4b58f, /* 38 */ 0xe4b590, 0xe4b591, - /*** Four byte table, leaf: 8234fcxx - offset 0x07cfa ***/ + /*** Four byte table, leaf: 8234fcxx - offset 0x07d6e ***/ /* 30 */ 0xe4b592, 0xe4b593, 0xe4b594, 0xe4b595, /* 34 */ 0xe4b596, 0xe4b597, 0xe4b598, 0xe4b599, /* 38 */ 0xe4b59a, 0xe4b59b, - /*** Four byte table, leaf: 8234fdxx - offset 0x07d04 ***/ + /*** Four byte table, leaf: 8234fdxx - offset 0x07d78 ***/ /* 30 */ 0xe4b59c, 0xe4b59d, 0xe4b59e, 0xe4b59f, /* 34 */ 0xe4b5a0, 0xe4b5a1, 0xe4b5a2, 0xe4b5a3, /* 38 */ 0xe4b5a4, 0xe4b5a5, - /*** Four byte table, leaf: 8234fexx - offset 0x07d0e ***/ + /*** Four byte table, leaf: 8234fexx - offset 0x07d82 ***/ /* 30 */ 0xe4b5a6, 0xe4b5a7, 0xe4b5a8, 0xe4b5a9, /* 34 */ 0xe4b5aa, 0xe4b5ab, 0xe4b5ac, 0xe4b5ad, /* 38 */ 0xe4b5ae, 0xe4b5af, - /*** Four byte table, leaf: 823581xx - offset 0x07d18 ***/ + /*** Four byte table, leaf: 823581xx - offset 0x07d8c ***/ /* 30 */ 0xe4b5b0, 0xe4b5b1, 0xe4b5b2, 0xe4b5b3, /* 34 */ 0xe4b5b4, 0xe4b5b5, 0xe4b5b6, 0xe4b5b7, /* 38 */ 0xe4b5b8, 0xe4b5b9, - /*** Four byte table, leaf: 823582xx - offset 0x07d22 ***/ + /*** Four byte table, leaf: 823582xx - offset 0x07d96 ***/ /* 30 */ 0xe4b5ba, 0xe4b5bb, 0xe4b5bc, 0xe4b5bd, /* 34 */ 0xe4b5be, 0xe4b5bf, 0xe4b680, 0xe4b681, /* 38 */ 0xe4b682, 0xe4b683, - /*** Four byte table, leaf: 823583xx - offset 0x07d2c ***/ + /*** Four byte table, leaf: 823583xx - offset 0x07da0 ***/ /* 30 */ 0xe4b684, 0xe4b685, 0xe4b686, 0xe4b687, /* 34 */ 0xe4b688, 0xe4b689, 0xe4b68a, 0xe4b68b, /* 38 */ 0xe4b68c, 0xe4b68d, - /*** Four byte table, leaf: 823584xx - offset 0x07d36 ***/ + /*** Four byte table, leaf: 823584xx - offset 0x07daa ***/ /* 30 */ 0xe4b68e, 0xe4b68f, 0xe4b690, 0xe4b691, /* 34 */ 0xe4b692, 0xe4b693, 0xe4b694, 0xe4b695, /* 38 */ 0xe4b696, 0xe4b697, - /*** Four byte table, leaf: 823585xx - offset 0x07d40 ***/ + /*** Four byte table, leaf: 823585xx - offset 0x07db4 ***/ /* 30 */ 0xe4b698, 0xe4b699, 0xe4b69a, 0xe4b69b, /* 34 */ 0xe4b69c, 0xe4b69d, 0xe4b69e, 0xe4b69f, /* 38 */ 0xe4b6a0, 0xe4b6a1, - /*** Four byte table, leaf: 823586xx - offset 0x07d4a ***/ + /*** Four byte table, leaf: 823586xx - offset 0x07dbe ***/ /* 30 */ 0xe4b6a2, 0xe4b6a3, 0xe4b6a4, 0xe4b6a5, /* 34 */ 0xe4b6a6, 0xe4b6a7, 0xe4b6a8, 0xe4b6a9, /* 38 */ 0xe4b6aa, 0xe4b6ab, - /*** Four byte table, leaf: 823587xx - offset 0x07d54 ***/ + /*** Four byte table, leaf: 823587xx - offset 0x07dc8 ***/ /* 30 */ 0xe4b6ac, 0xe4b6ad, 0xe4b6af, 0xe4b6b0, /* 34 */ 0xe4b6b1, 0xe4b6b2, 0xe4b6b3, 0xe4b6b4, /* 38 */ 0xe4b6b5, 0xe4b6b6, - /*** Four byte table, leaf: 823588xx - offset 0x07d5e ***/ + /*** Four byte table, leaf: 823588xx - offset 0x07dd2 ***/ /* 30 */ 0xe4b6b7, 0xe4b6b8, 0xe4b6b9, 0xe4b6ba, /* 34 */ 0xe4b6bb, 0xe4b6bc, 0xe4b6bd, 0xe4b6be, /* 38 */ 0xe4b6bf, 0xe4b780, - /*** Four byte table, leaf: 823589xx - offset 0x07d68 ***/ + /*** Four byte table, leaf: 823589xx - offset 0x07ddc ***/ /* 30 */ 0xe4b781, 0xe4b782, 0xe4b783, 0xe4b784, /* 34 */ 0xe4b785, 0xe4b786, 0xe4b787, 0xe4b788, 
/* 38 */ 0xe4b789, 0xe4b78a, - /*** Four byte table, leaf: 82358axx - offset 0x07d72 ***/ + /*** Four byte table, leaf: 82358axx - offset 0x07de6 ***/ /* 30 */ 0xe4b78b, 0xe4b78c, 0xe4b78d, 0xe4b78e, /* 34 */ 0xe4b78f, 0xe4b790, 0xe4b791, 0xe4b792, /* 38 */ 0xe4b793, 0xe4b794, - /*** Four byte table, leaf: 82358bxx - offset 0x07d7c ***/ + /*** Four byte table, leaf: 82358bxx - offset 0x07df0 ***/ /* 30 */ 0xe4b795, 0xe4b796, 0xe4b797, 0xe4b798, /* 34 */ 0xe4b799, 0xe4b79a, 0xe4b79b, 0xe4b79c, /* 38 */ 0xe4b79d, 0xe4b79e, - /*** Four byte table, leaf: 82358cxx - offset 0x07d86 ***/ + /*** Four byte table, leaf: 82358cxx - offset 0x07dfa ***/ /* 30 */ 0xe4b79f, 0xe4b7a0, 0xe4b7a1, 0xe4b7a2, /* 34 */ 0xe4b7a3, 0xe4b7a4, 0xe4b7a5, 0xe4b7a6, /* 38 */ 0xe4b7a7, 0xe4b7a8, - /*** Four byte table, leaf: 82358dxx - offset 0x07d90 ***/ + /*** Four byte table, leaf: 82358dxx - offset 0x07e04 ***/ /* 30 */ 0xe4b7a9, 0xe4b7aa, 0xe4b7ab, 0xe4b7ac, /* 34 */ 0xe4b7ad, 0xe4b7ae, 0xe4b7af, 0xe4b7b0, /* 38 */ 0xe4b7b1, 0xe4b7b2, - /*** Four byte table, leaf: 82358exx - offset 0x07d9a ***/ + /*** Four byte table, leaf: 82358exx - offset 0x07e0e ***/ /* 30 */ 0xe4b7b3, 0xe4b7b4, 0xe4b7b5, 0xe4b7b6, /* 34 */ 0xe4b7b7, 0xe4b7b8, 0xe4b7b9, 0xe4b7ba, /* 38 */ 0xe4b7bb, 0xe4b7bc, - /*** Four byte table, leaf: 82358fxx - offset 0x07da4 ***/ + /*** Four byte table, leaf: 82358fxx - offset 0x07e18 ***/ /* 30 */ 0xe4b7bd, 0xe4b7be, 0xe4b7bf, /* 7 trailing zero values shared with next segment */ - /*** Four byte table, leaf: 8336c7xx - offset 0x07da7 ***/ + /*** Four byte table, leaf: 8336c7xx - offset 0x07e1b ***/ /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 34 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 38 */ 0x000000, 0xee9dac, - /*** Four byte table, leaf: 8336c8xx - offset 0x07db1 ***/ + /*** Four byte table, leaf: 8336c8xx - offset 0x07e25 ***/ /* 30 */ 0xee9f88, 0xee9fa7, 0xee9fa8, 0xee9fa9, /* 34 */ 0xee9faa, 0xee9fab, 0xee9fac, 0xee9fad, /* 38 */ 0xee9fae, 0xee9faf, - /*** Four byte table, leaf: 8336c9xx - offset 0x07dbb ***/ + /*** Four byte table, leaf: 8336c9xx - offset 0x07e2f ***/ /* 30 */ 0xee9fb0, 0xee9fb1, 0xee9fb2, 0xee9fb3, /* 34 */ 0xeea095, 0xeea099, 0xeea09a, 0xeea09b, /* 38 */ 0xeea09c, 0xeea09d, - /*** Four byte table, leaf: 8336caxx - offset 0x07dc5 ***/ + /*** Four byte table, leaf: 8336caxx - offset 0x07e39 ***/ /* 30 */ 0xeea09f, 0xeea0a0, 0xeea0a1, 0xeea0a2, /* 34 */ 0xeea0a3, 0xeea0a4, 0xeea0a5, 0xeea0a7, /* 38 */ 0xeea0a8, 0xeea0a9, - /*** Four byte table, leaf: 8336cbxx - offset 0x07dcf ***/ + /*** Four byte table, leaf: 8336cbxx - offset 0x07e43 ***/ /* 30 */ 0xeea0aa, 0xeea0ad, 0xeea0ae, 0xeea0af, /* 34 */ 0xeea0b0, 0xeea0b3, 0xeea0b4, 0xeea0b5, /* 38 */ 0xeea0b6, 0xeea0b7, - /*** Four byte table, leaf: 8336ccxx - offset 0x07dd9 ***/ + /*** Four byte table, leaf: 8336ccxx - offset 0x07e4d ***/ /* 30 */ 0xeea0b8, 0xeea0b9, 0xeea0ba, 0xeea0bc, /* 34 */ 0xeea0bd, 0xeea0be, 0xeea0bf, 0xeea180, /* 38 */ 0xeea181, 0xeea182, - /*** Four byte table, leaf: 8336cdxx - offset 0x07de3 ***/ + /*** Four byte table, leaf: 8336cdxx - offset 0x07e57 ***/ /* 30 */ 0xeea184, 0xeea185, 0xeea186, 0xeea187, /* 34 */ 0xeea188, 0xeea189, 0xeea18a, 0xeea18b, /* 38 */ 0xeea18c, 0xeea18d, - /*** Four byte table, leaf: 8336cexx - offset 0x07ded ***/ + /*** Four byte table, leaf: 8336cexx - offset 0x07e61 ***/ /* 30 */ 0xeea18e, 0xeea18f, 0xeea190, 0xeea191, /* 34 */ 0xeea192, 0xeea193, 0xeea196, 0xeea197, /* 38 */ 0xeea198, 0xeea199, - /*** Four byte table, leaf: 8336cfxx - offset 0x07df7 ***/ + /*** Four 
byte table, leaf: 8336cfxx - offset 0x07e6b ***/ /* 30 */ 0xeea19a, 0xeea19b, 0xeea19c, 0xeea19d, /* 34 */ 0xeea19e, 0xeea19f, 0xeea1a0, 0xeea1a1, /* 38 */ 0xeea1a2, 0xeea1a3, - /*** Four byte table, leaf: 843085xx - offset 0x07e01 ***/ + /*** Four byte table, leaf: 843085xx - offset 0x07e75 ***/ /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 34 */ 0x000000, 0xefa4ad, 0xefa4ae, 0xefa4af, /* 38 */ 0xefa4b0, 0xefa4b1, - /*** Four byte table, leaf: 843086xx - offset 0x07e0b ***/ + /*** Four byte table, leaf: 843086xx - offset 0x07e7f ***/ /* 30 */ 0xefa4b2, 0xefa4b3, 0xefa4b4, 0xefa4b5, /* 34 */ 0xefa4b6, 0xefa4b7, 0xefa4b8, 0xefa4b9, /* 38 */ 0xefa4ba, 0xefa4bb, - /*** Four byte table, leaf: 843087xx - offset 0x07e15 ***/ + /*** Four byte table, leaf: 843087xx - offset 0x07e89 ***/ /* 30 */ 0xefa4bc, 0xefa4bd, 0xefa4be, 0xefa4bf, /* 34 */ 0xefa580, 0xefa581, 0xefa582, 0xefa583, /* 38 */ 0xefa584, 0xefa585, - /*** Four byte table, leaf: 843088xx - offset 0x07e1f ***/ + /*** Four byte table, leaf: 843088xx - offset 0x07e93 ***/ /* 30 */ 0xefa586, 0xefa587, 0xefa588, 0xefa589, /* 34 */ 0xefa58a, 0xefa58b, 0xefa58c, 0xefa58d, /* 38 */ 0xefa58e, 0xefa58f, - /*** Four byte table, leaf: 843089xx - offset 0x07e29 ***/ + /*** Four byte table, leaf: 843089xx - offset 0x07e9d ***/ /* 30 */ 0xefa590, 0xefa591, 0xefa592, 0xefa593, /* 34 */ 0xefa594, 0xefa595, 0xefa596, 0xefa597, /* 38 */ 0xefa598, 0xefa599, - /*** Four byte table, leaf: 84308axx - offset 0x07e33 ***/ + /*** Four byte table, leaf: 84308axx - offset 0x07ea7 ***/ /* 30 */ 0xefa59a, 0xefa59b, 0xefa59c, 0xefa59d, /* 34 */ 0xefa59e, 0xefa59f, 0xefa5a0, 0xefa5a1, /* 38 */ 0xefa5a2, 0xefa5a3, - /*** Four byte table, leaf: 84308bxx - offset 0x07e3d ***/ + /*** Four byte table, leaf: 84308bxx - offset 0x07eb1 ***/ /* 30 */ 0xefa5a4, 0xefa5a5, 0xefa5a6, 0xefa5a7, /* 34 */ 0xefa5a8, 0xefa5a9, 0xefa5aa, 0xefa5ab, /* 38 */ 0xefa5ac, 0xefa5ad, - /*** Four byte table, leaf: 84308cxx - offset 0x07e47 ***/ + /*** Four byte table, leaf: 84308cxx - offset 0x07ebb ***/ /* 30 */ 0xefa5ae, 0xefa5af, 0xefa5b0, 0xefa5b1, /* 34 */ 0xefa5b2, 0xefa5b3, 0xefa5b4, 0xefa5b5, /* 38 */ 0xefa5b6, 0xefa5b7, - /*** Four byte table, leaf: 84308dxx - offset 0x07e51 ***/ + /*** Four byte table, leaf: 84308dxx - offset 0x07ec5 ***/ /* 30 */ 0xefa5b8, 0xefa5ba, 0xefa5bb, 0xefa5bc, /* 34 */ 0xefa5bd, 0xefa5be, 0xefa5bf, 0xefa680, /* 38 */ 0xefa681, 0xefa682, - /*** Four byte table, leaf: 84308exx - offset 0x07e5b ***/ + /*** Four byte table, leaf: 84308exx - offset 0x07ecf ***/ /* 30 */ 0xefa683, 0xefa684, 0xefa685, 0xefa686, /* 34 */ 0xefa687, 0xefa688, 0xefa689, 0xefa68a, /* 38 */ 0xefa68b, 0xefa68c, - /*** Four byte table, leaf: 84308fxx - offset 0x07e65 ***/ + /*** Four byte table, leaf: 84308fxx - offset 0x07ed9 ***/ /* 30 */ 0xefa68d, 0xefa68e, 0xefa68f, 0xefa690, /* 34 */ 0xefa691, 0xefa692, 0xefa693, 0xefa694, /* 38 */ 0xefa696, 0xefa697, - /*** Four byte table, leaf: 843090xx - offset 0x07e6f ***/ + /*** Four byte table, leaf: 843090xx - offset 0x07ee3 ***/ /* 30 */ 0xefa698, 0xefa699, 0xefa69a, 0xefa69b, /* 34 */ 0xefa69c, 0xefa69d, 0xefa69e, 0xefa69f, /* 38 */ 0xefa6a0, 0xefa6a1, - /*** Four byte table, leaf: 843091xx - offset 0x07e79 ***/ + /*** Four byte table, leaf: 843091xx - offset 0x07eed ***/ /* 30 */ 0xefa6a2, 0xefa6a3, 0xefa6a4, 0xefa6a5, /* 34 */ 0xefa6a6, 0xefa6a7, 0xefa6a8, 0xefa6a9, /* 38 */ 0xefa6aa, 0xefa6ab, - /*** Four byte table, leaf: 843092xx - offset 0x07e83 ***/ + /*** Four byte table, leaf: 843092xx - offset 0x07ef7 ***/ /* 30 */ 0xefa6ac, 
0xefa6ad, 0xefa6ae, 0xefa6af, /* 34 */ 0xefa6b0, 0xefa6b1, 0xefa6b2, 0xefa6b3, /* 38 */ 0xefa6b4, 0xefa6b5, - /*** Four byte table, leaf: 843093xx - offset 0x07e8d ***/ + /*** Four byte table, leaf: 843093xx - offset 0x07f01 ***/ /* 30 */ 0xefa6b6, 0xefa6b7, 0xefa6b8, 0xefa6b9, /* 34 */ 0xefa6ba, 0xefa6bb, 0xefa6bc, 0xefa6bd, /* 38 */ 0xefa6be, 0xefa6bf, - /*** Four byte table, leaf: 843094xx - offset 0x07e97 ***/ + /*** Four byte table, leaf: 843094xx - offset 0x07f0b ***/ /* 30 */ 0xefa780, 0xefa781, 0xefa782, 0xefa783, /* 34 */ 0xefa784, 0xefa785, 0xefa786, 0xefa787, /* 38 */ 0xefa788, 0xefa789, - /*** Four byte table, leaf: 843095xx - offset 0x07ea1 ***/ + /*** Four byte table, leaf: 843095xx - offset 0x07f15 ***/ /* 30 */ 0xefa78a, 0xefa78b, 0xefa78c, 0xefa78d, /* 34 */ 0xefa78e, 0xefa78f, 0xefa790, 0xefa791, /* 38 */ 0xefa792, 0xefa793, - /*** Four byte table, leaf: 843096xx - offset 0x07eab ***/ + /*** Four byte table, leaf: 843096xx - offset 0x07f1f ***/ /* 30 */ 0xefa794, 0xefa795, 0xefa796, 0xefa797, /* 34 */ 0xefa798, 0xefa799, 0xefa79a, 0xefa79b, /* 38 */ 0xefa79c, 0xefa79d, - /*** Four byte table, leaf: 843097xx - offset 0x07eb5 ***/ + /*** Four byte table, leaf: 843097xx - offset 0x07f29 ***/ /* 30 */ 0xefa79e, 0xefa79f, 0xefa7a0, 0xefa7a1, /* 34 */ 0xefa7a2, 0xefa7a3, 0xefa7a4, 0xefa7a5, /* 38 */ 0xefa7a6, 0xefa7a8, - /*** Four byte table, leaf: 843098xx - offset 0x07ebf ***/ + /*** Four byte table, leaf: 843098xx - offset 0x07f33 ***/ /* 30 */ 0xefa7a9, 0xefa7aa, 0xefa7ab, 0xefa7ac, /* 34 */ 0xefa7ad, 0xefa7ae, 0xefa7af, 0xefa7b0, /* 38 */ 0xefa7b2, 0xefa7b3, - /*** Four byte table, leaf: 843099xx - offset 0x07ec9 ***/ + /*** Four byte table, leaf: 843099xx - offset 0x07f3d ***/ /* 30 */ 0xefa7b4, 0xefa7b5, 0xefa7b6, 0xefa7b7, /* 34 */ 0xefa7b8, 0xefa7b9, 0xefa7ba, 0xefa7bb, /* 38 */ 0xefa7bc, 0xefa7bd, - /*** Four byte table, leaf: 84309axx - offset 0x07ed3 ***/ + /*** Four byte table, leaf: 84309axx - offset 0x07f47 ***/ /* 30 */ 0xefa7be, 0xefa7bf, 0xefa880, 0xefa881, /* 34 */ 0xefa882, 0xefa883, 0xefa884, 0xefa885, /* 38 */ 0xefa886, 0xefa887, - /*** Four byte table, leaf: 84309bxx - offset 0x07edd ***/ + /*** Four byte table, leaf: 84309bxx - offset 0x07f51 ***/ /* 30 */ 0xefa888, 0xefa889, 0xefa88a, 0xefa88b, /* 34 */ 0xefa890, 0xefa892, 0xefa895, 0xefa896, /* 38 */ 0xefa897, 0xefa899, - /*** Four byte table, leaf: 84309cxx - offset 0x07ee7 ***/ + /*** Four byte table, leaf: 84309cxx - offset 0x07f5b ***/ /* 30 */ 0xefa89a, 0xefa89b, 0xefa89c, 0xefa89d, /* 34 */ 0xefa89e, 0xefa8a2, 0xefa8a5, 0xefa8a6, /* 2 trailing zero values shared with next segment */ - /*** Four byte table, leaf: 843185xx - offset 0x07eef ***/ + /*** Four byte table, leaf: 843185xx - offset 0x07f63 ***/ /* 30 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 34 */ 0x000000, 0x000000, 0x000000, 0x000000, /* 38 */ 0xefb8b2, 0xefb985, - /*** Four byte table, leaf: 843186xx - offset 0x07ef9 ***/ + /*** Four byte table, leaf: 843186xx - offset 0x07f6d ***/ /* 30 */ 0xefb986, 0xefb987, 0xefb988, 0xefb993, /* 34 */ 0xefb998, 0xefb9a7, 0xefb9ac, 0xefb9ad, /* 38 */ 0xefb9ae, 0xefb9af, - /*** Four byte table, leaf: 843187xx - offset 0x07f03 ***/ + /*** Four byte table, leaf: 843187xx - offset 0x07f77 ***/ /* 30 */ 0xefb9b0, 0xefb9b1, 0xefb9b2, 0xefb9b3, /* 34 */ 0xefb9b4, 0xefb9b5, 0xefb9b6, 0xefb9b7, /* 38 */ 0xefb9b8, 0xefb9b9, - /*** Four byte table, leaf: 843188xx - offset 0x07f0d ***/ + /*** Four byte table, leaf: 843188xx - offset 0x07f81 ***/ /* 30 */ 0xefb9ba, 0xefb9bb, 0xefb9bc, 0xefb9bd, /* 34 */ 
0xefb9be, 0xefb9bf, 0xefba80, 0xefba81, /* 38 */ 0xefba82, 0xefba83, - /*** Four byte table, leaf: 843189xx - offset 0x07f17 ***/ + /*** Four byte table, leaf: 843189xx - offset 0x07f8b ***/ /* 30 */ 0xefba84, 0xefba85, 0xefba86, 0xefba87, /* 34 */ 0xefba88, 0xefba89, 0xefba8a, 0xefba8b, /* 38 */ 0xefba8c, 0xefba8d, - /*** Four byte table, leaf: 84318axx - offset 0x07f21 ***/ + /*** Four byte table, leaf: 84318axx - offset 0x07f95 ***/ /* 30 */ 0xefba8e, 0xefba8f, 0xefba90, 0xefba91, /* 34 */ 0xefba92, 0xefba93, 0xefba94, 0xefba95, /* 38 */ 0xefba96, 0xefba97, - /*** Four byte table, leaf: 84318bxx - offset 0x07f2b ***/ + /*** Four byte table, leaf: 84318bxx - offset 0x07f9f ***/ /* 30 */ 0xefba98, 0xefba99, 0xefba9a, 0xefba9b, /* 34 */ 0xefba9c, 0xefba9d, 0xefba9e, 0xefba9f, /* 38 */ 0xefbaa0, 0xefbaa1, - /*** Four byte table, leaf: 84318cxx - offset 0x07f35 ***/ + /*** Four byte table, leaf: 84318cxx - offset 0x07fa9 ***/ /* 30 */ 0xefbaa2, 0xefbaa3, 0xefbaa4, 0xefbaa5, /* 34 */ 0xefbaa6, 0xefbaa7, 0xefbaa8, 0xefbaa9, /* 38 */ 0xefbaaa, 0xefbaab, - /*** Four byte table, leaf: 84318dxx - offset 0x07f3f ***/ + /*** Four byte table, leaf: 84318dxx - offset 0x07fb3 ***/ /* 30 */ 0xefbaac, 0xefbaad, 0xefbaae, 0xefbaaf, /* 34 */ 0xefbab0, 0xefbab1, 0xefbab2, 0xefbab3, /* 38 */ 0xefbab4, 0xefbab5, - /*** Four byte table, leaf: 84318exx - offset 0x07f49 ***/ + /*** Four byte table, leaf: 84318exx - offset 0x07fbd ***/ /* 30 */ 0xefbab6, 0xefbab7, 0xefbab8, 0xefbab9, /* 34 */ 0xefbaba, 0xefbabb, 0xefbabc, 0xefbabd, /* 38 */ 0xefbabe, 0xefbabf, - /*** Four byte table, leaf: 84318fxx - offset 0x07f53 ***/ + /*** Four byte table, leaf: 84318fxx - offset 0x07fc7 ***/ /* 30 */ 0xefbb80, 0xefbb81, 0xefbb82, 0xefbb83, /* 34 */ 0xefbb84, 0xefbb85, 0xefbb86, 0xefbb87, /* 38 */ 0xefbb88, 0xefbb89, - /*** Four byte table, leaf: 843190xx - offset 0x07f5d ***/ + /*** Four byte table, leaf: 843190xx - offset 0x07fd1 ***/ /* 30 */ 0xefbb8a, 0xefbb8b, 0xefbb8c, 0xefbb8d, /* 34 */ 0xefbb8e, 0xefbb8f, 0xefbb90, 0xefbb91, /* 38 */ 0xefbb92, 0xefbb93, - /*** Four byte table, leaf: 843191xx - offset 0x07f67 ***/ + /*** Four byte table, leaf: 843191xx - offset 0x07fdb ***/ /* 30 */ 0xefbb94, 0xefbb95, 0xefbb96, 0xefbb97, /* 34 */ 0xefbb98, 0xefbb99, 0xefbb9a, 0xefbb9b, /* 38 */ 0xefbb9c, 0xefbb9d, - /*** Four byte table, leaf: 843192xx - offset 0x07f71 ***/ + /*** Four byte table, leaf: 843192xx - offset 0x07fe5 ***/ /* 30 */ 0xefbb9e, 0xefbb9f, 0xefbba0, 0xefbba1, /* 34 */ 0xefbba2, 0xefbba3, 0xefbba4, 0xefbba5, /* 38 */ 0xefbba6, 0xefbba7, - /*** Four byte table, leaf: 843193xx - offset 0x07f7b ***/ + /*** Four byte table, leaf: 843193xx - offset 0x07fef ***/ /* 30 */ 0xefbba8, 0xefbba9, 0xefbbaa, 0xefbbab, /* 34 */ 0xefbbac, 0xefbbad, 0xefbbae, 0xefbbaf, /* 38 */ 0xefbbb0, 0xefbbb1, - /*** Four byte table, leaf: 843194xx - offset 0x07f85 ***/ + /*** Four byte table, leaf: 843194xx - offset 0x07ff9 ***/ /* 30 */ 0xefbbb2, 0xefbbb3, 0xefbbb4, 0xefbbb5, /* 34 */ 0xefbbb6, 0xefbbb7, 0xefbbb8, 0xefbbb9, /* 38 */ 0xefbbba, 0xefbbbb, - /*** Four byte table, leaf: 843195xx - offset 0x07f8f ***/ + /*** Four byte table, leaf: 843195xx - offset 0x08003 ***/ /* 30 */ 0xefbbbc, 0xefbbbd, 0xefbbbe, 0xefbbbf, /* 34 */ 0xefbc80, 0xefbd9f, 0xefbda0, 0xefbda1, /* 38 */ 0xefbda2, 0xefbda3, - /*** Four byte table, leaf: 843196xx - offset 0x07f99 ***/ + /*** Four byte table, leaf: 843196xx - offset 0x0800d ***/ /* 30 */ 0xefbda4, 0xefbda5, 0xefbda6, 0xefbda7, /* 34 */ 0xefbda8, 0xefbda9, 0xefbdaa, 0xefbdab, /* 38 */ 0xefbdac, 0xefbdad, 
- /*** Four byte table, leaf: 843197xx - offset 0x07fa3 ***/ + /*** Four byte table, leaf: 843197xx - offset 0x08017 ***/ /* 30 */ 0xefbdae, 0xefbdaf, 0xefbdb0, 0xefbdb1, /* 34 */ 0xefbdb2, 0xefbdb3, 0xefbdb4, 0xefbdb5, /* 38 */ 0xefbdb6, 0xefbdb7, - /*** Four byte table, leaf: 843198xx - offset 0x07fad ***/ + /*** Four byte table, leaf: 843198xx - offset 0x08021 ***/ /* 30 */ 0xefbdb8, 0xefbdb9, 0xefbdba, 0xefbdbb, /* 34 */ 0xefbdbc, 0xefbdbd, 0xefbdbe, 0xefbdbf, /* 38 */ 0xefbe80, 0xefbe81, - /*** Four byte table, leaf: 843199xx - offset 0x07fb7 ***/ + /*** Four byte table, leaf: 843199xx - offset 0x0802b ***/ /* 30 */ 0xefbe82, 0xefbe83, 0xefbe84, 0xefbe85, /* 34 */ 0xefbe86, 0xefbe87, 0xefbe88, 0xefbe89, /* 38 */ 0xefbe8a, 0xefbe8b, - /*** Four byte table, leaf: 84319axx - offset 0x07fc1 ***/ + /*** Four byte table, leaf: 84319axx - offset 0x08035 ***/ /* 30 */ 0xefbe8c, 0xefbe8d, 0xefbe8e, 0xefbe8f, /* 34 */ 0xefbe90, 0xefbe91, 0xefbe92, 0xefbe93, /* 38 */ 0xefbe94, 0xefbe95, - /*** Four byte table, leaf: 84319bxx - offset 0x07fcb ***/ + /*** Four byte table, leaf: 84319bxx - offset 0x0803f ***/ /* 30 */ 0xefbe96, 0xefbe97, 0xefbe98, 0xefbe99, /* 34 */ 0xefbe9a, 0xefbe9b, 0xefbe9c, 0xefbe9d, /* 38 */ 0xefbe9e, 0xefbe9f, - /*** Four byte table, leaf: 84319cxx - offset 0x07fd5 ***/ + /*** Four byte table, leaf: 84319cxx - offset 0x08049 ***/ /* 30 */ 0xefbea0, 0xefbea1, 0xefbea2, 0xefbea3, /* 34 */ 0xefbea4, 0xefbea5, 0xefbea6, 0xefbea7, /* 38 */ 0xefbea8, 0xefbea9, - /*** Four byte table, leaf: 84319dxx - offset 0x07fdf ***/ + /*** Four byte table, leaf: 84319dxx - offset 0x08053 ***/ /* 30 */ 0xefbeaa, 0xefbeab, 0xefbeac, 0xefbead, /* 34 */ 0xefbeae, 0xefbeaf, 0xefbeb0, 0xefbeb1, /* 38 */ 0xefbeb2, 0xefbeb3, - /*** Four byte table, leaf: 84319exx - offset 0x07fe9 ***/ + /*** Four byte table, leaf: 84319exx - offset 0x0805d ***/ /* 30 */ 0xefbeb4, 0xefbeb5, 0xefbeb6, 0xefbeb7, /* 34 */ 0xefbeb8, 0xefbeb9, 0xefbeba, 0xefbebb, /* 38 */ 0xefbebc, 0xefbebd, - /*** Four byte table, leaf: 84319fxx - offset 0x07ff3 ***/ + /*** Four byte table, leaf: 84319fxx - offset 0x08067 ***/ /* 30 */ 0xefbebe, 0xefbebf, 0xefbf80, 0xefbf81, /* 34 */ 0xefbf82, 0xefbf83, 0xefbf84, 0xefbf85, /* 38 */ 0xefbf86, 0xefbf87, - /*** Four byte table, leaf: 8431a0xx - offset 0x07ffd ***/ + /*** Four byte table, leaf: 8431a0xx - offset 0x08071 ***/ /* 30 */ 0xefbf88, 0xefbf89, 0xefbf8a, 0xefbf8b, /* 34 */ 0xefbf8c, 0xefbf8d, 0xefbf8e, 0xefbf8f, /* 38 */ 0xefbf90, 0xefbf91, - /*** Four byte table, leaf: 8431a1xx - offset 0x08007 ***/ + /*** Four byte table, leaf: 8431a1xx - offset 0x0807b ***/ /* 30 */ 0xefbf92, 0xefbf93, 0xefbf94, 0xefbf95, /* 34 */ 0xefbf96, 0xefbf97, 0xefbf98, 0xefbf99, /* 38 */ 0xefbf9a, 0xefbf9b, - /*** Four byte table, leaf: 8431a2xx - offset 0x08011 ***/ + /*** Four byte table, leaf: 8431a2xx - offset 0x08085 ***/ /* 30 */ 0xefbf9c, 0xefbf9d, 0xefbf9e, 0xefbf9f, /* 34 */ 0x000000, 0x000000, 0x000000, 0x000000, diff --git a/src/backend/utils/mb/Unicode/utf8_to_gb18030.map b/src/backend/utils/mb/Unicode/utf8_to_gb18030.map index 6c6b660bebe87..0b76e72b5832d 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_gb18030.map +++ b/src/backend/utils/mb/Unicode/utf8_to_gb18030.map @@ -1,7 +1,7 @@ /* src/backend/utils/mb/Unicode/utf8_to_gb18030.map */ /* This file is generated by src/backend/utils/mb/Unicode/UCS_to_GB18030.pl */ -static const uint32 gb18030_from_unicode_tree_table[31972]; +static const uint32 gb18030_from_unicode_tree_table[32106]; static const pg_mb_radix_tree gb18030_from_unicode_tree = 
{ @@ -19,7 +19,7 @@ static const pg_mb_radix_tree gb18030_from_unicode_tree = 0xbf, /* b2_2_upper */ 0x0450, /* offset of table for 3-byte inputs */ - 0xe2, /* b3_1_lower */ + 0xe1, /* b3_1_lower */ 0xef, /* b3_1_upper */ 0x80, /* b3_2_lower */ 0xbf, /* b3_2_upper */ @@ -37,7 +37,7 @@ static const pg_mb_radix_tree gb18030_from_unicode_tree = 0x00 /* b4_4_upper */ }; -static const uint32 gb18030_from_unicode_tree_table[31972] = +static const uint32 gb18030_from_unicode_tree_table[32106] = { /*** Dummy map, for invalid values - offset 0x00000 ***/ @@ -371,20 +371,20 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /*** Three byte table, byte #1: xx - offset 0x00450 ***/ - /* e2 */ 0x0000045e, 0x0000049e, 0x000004dd, 0x0000051d, - /* e6 */ 0x0000055d, 0x0000059d, 0x000005dd, 0x0000061d, - /* ea */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* ee */ 0x0000065d, 0x0000067f, + /* e1 */ 0x0000045f, 0x0000049f, 0x000004df, 0x0000051e, + /* e5 */ 0x0000055e, 0x0000059e, 0x000005de, 0x0000061e, + /* e9 */ 0x0000065e, 0x00000000, 0x00000000, 0x00000000, + /* ed */ 0x00000000, 0x0000069e, 0x000006c0, - /*** Three byte table, byte #2: e2xx - offset 0x0045e ***/ + /*** Three byte table, byte #2: e1xx - offset 0x0045f ***/ - /* 80 */ 0x000006bf, 0x000006ff, 0x0000073f, 0x0000077f, - /* 84 */ 0x000007bf, 0x000007ff, 0x0000083f, 0x0000087f, - /* 88 */ 0x000008bf, 0x000008ff, 0x0000093f, 0x0000097f, - /* 8c */ 0x000009bf, 0x000009ff, 0x00000a3f, 0x00000a7f, - /* 90 */ 0x00000abf, 0x00000aff, 0x00000b3f, 0x00000b7f, - /* 94 */ 0x00000bbf, 0x00000bff, 0x00000c3f, 0x00000c7f, - /* 98 */ 0x00000cbf, 0x00000cff, 0x00000000, 0x00000000, + /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 88 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 8c */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 90 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 94 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 98 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 9c */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* a0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* a4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -392,157 +392,195 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* ac */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* b0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* b4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* b8 */ 0x00000000, 0x00000000, 0x00000d3e, 0x00000d7e, - /* bc */ 0x00000dbe, 0x00000dfe, 0x00000e3e, 0x00000e7e, - - /*** Three byte table, byte #2: e3xx - offset 0x0049e ***/ - - /* 80 */ 0x00000ebe, 0x00000efe, 0x00000f3e, 0x00000f7e, - /* 84 */ 0x00000fbe, 0x00000ffe, 0x0000103e, 0x0000107e, - /* 88 */ 0x000010be, 0x000010fe, 0x0000113e, 0x0000117e, - /* 8c */ 0x000011be, 0x000011fe, 0x0000123e, 0x0000127e, - /* 90 */ 0x000012be, 0x000012fe, 0x0000133e, 0x0000137e, - /* 94 */ 0x000013be, 0x000013fe, 0x0000143e, 0x0000147e, - /* 98 */ 0x000014be, 0x00000000, 0x00000000, 0x00000000, + /* b8 */ 0x00000700, 0x00000000, 0x00000000, 0x00000000, + /* bc */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + + /*** Three byte table, byte #2: e2xx - offset 0x0049f ***/ + + /* 80 */ 0x00000740, 0x00000780, 0x000007c0, 0x00000800, + /* 84 */ 0x00000840, 0x00000880, 0x000008c0, 0x00000900, + /* 88 */ 0x00000940, 0x00000980, 0x000009c0, 0x00000a00, + /* 8c */ 0x00000a40, 0x00000a80, 0x00000ac0, 0x00000b00, + /* 90 */ 0x00000b40, 0x00000b80, 
0x00000bc0, 0x00000c00, + /* 94 */ 0x00000c40, 0x00000c80, 0x00000cc0, 0x00000d00, + /* 98 */ 0x00000d40, 0x00000d80, 0x00000000, 0x00000000, /* 9c */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* a0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* a4 */ 0x000014e6, 0x00001526, 0x00001566, 0x000015a6, - /* a8 */ 0x000015e6, 0x00001626, 0x00001666, 0x000016a6, - /* ac */ 0x000016e6, 0x00001726, 0x00001766, 0x000017a6, - /* b0 */ 0x000017e6, 0x00001826, 0x00001866, 0x000018a6, + /* a4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* a8 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* ac */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* b0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* b4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* b8 */ 0x00000000, 0x00000000, 0x00000dbf, 0x00000dff, + /* bc */ 0x00000e3f, 0x00000e7f, 0x00000ebf, 0x00000eff, + + /*** Three byte table, byte #2: e3xx - offset 0x004df ***/ + + /* 80 */ 0x00000f3f, 0x00000f7f, 0x00000fbf, 0x00000fff, + /* 84 */ 0x0000103f, 0x0000107f, 0x000010bf, 0x000010ff, + /* 88 */ 0x0000113f, 0x0000117f, 0x000011bf, 0x000011ff, + /* 8c */ 0x0000123f, 0x0000127f, 0x000012bf, 0x000012ff, + /* 90 */ 0x0000133f, 0x0000137f, 0x000013bf, 0x000013ff, + /* 94 */ 0x0000143f, 0x0000147f, 0x000014bf, 0x000014ff, + /* 98 */ 0x0000153f, 0x00000000, 0x00000000, 0x00000000, + /* 9c */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* a0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* a4 */ 0x00001567, 0x000015a7, 0x000015e7, 0x00001627, + /* a8 */ 0x00001667, 0x000016a7, 0x000016e7, 0x00001727, + /* ac */ 0x00001767, 0x000017a7, 0x000017e7, 0x00001827, + /* b0 */ 0x00001867, 0x000018a7, 0x000018e7, 0x00001927, /* b4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* b8 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* bc */ 0x00000000, 0x00000000, 0x00000000, /* 1 trailing zero values shared with next segment */ - /*** Three byte table, byte #2: e4xx - offset 0x004dd ***/ + /*** Three byte table, byte #2: e4xx - offset 0x0051e ***/ - /* 80 */ 0x00000000, 0x000018d0, 0x00001910, 0x00001950, - /* 84 */ 0x00001990, 0x000019d0, 0x00000000, 0x00000000, + /* 80 */ 0x00000000, 0x00001951, 0x00001991, 0x000019d1, + /* 84 */ 0x00001a11, 0x00001a51, 0x00000000, 0x00000000, /* 88 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* 8c */ 0x000019f0, 0x00001a30, 0x00001a70, 0x00001ab0, - /* 90 */ 0x00001af0, 0x00001b30, 0x00001b70, 0x00001bb0, + /* 8c */ 0x00001a71, 0x00001ab1, 0x00001af1, 0x00001b31, + /* 90 */ 0x00001b71, 0x00001bb1, 0x00001bf1, 0x00001c31, /* 94 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* 98 */ 0x00000000, 0x00001be4, 0x00001c24, 0x00001c64, - /* 9c */ 0x00001ca4, 0x00001ce4, 0x00001d24, 0x00000000, + /* 98 */ 0x00000000, 0x00001c65, 0x00001ca5, 0x00001ce5, + /* 9c */ 0x00001d25, 0x00001d65, 0x00001da5, 0x00000000, /* a0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* a4 */ 0x00000000, 0x00001d5d, 0x00001d9d, 0x00000000, + /* a4 */ 0x00000000, 0x00001dde, 0x00001e1e, 0x00000000, /* a8 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* ac */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* b0 */ 0x00000000, 0x00001dd5, 0x00001e15, 0x00001e55, - /* b4 */ 0x00001e95, 0x00001ed5, 0x00001f15, 0x00001f55, - /* b8 */ 0x00001f95, 0x00001fd5, 0x00002015, 0x00002055, - /* bc */ 0x00002095, 0x000020d5, 0x00002115, 0x00002155, - - /*** Three byte table, byte #2: e5xx - offset 0x0051d ***/ - - /* 80 */ 0x00002195, 0x000021d5, 0x00002215, 0x00002255, - /* 84 */ 
0x00002295, 0x000022d5, 0x00002315, 0x00002355, - /* 88 */ 0x00002395, 0x000023d5, 0x00002415, 0x00002455, - /* 8c */ 0x00002495, 0x000024d5, 0x00002515, 0x00002555, - /* 90 */ 0x00002595, 0x000025d5, 0x00002615, 0x00002655, - /* 94 */ 0x00002695, 0x000026d5, 0x00002715, 0x00002755, - /* 98 */ 0x00002795, 0x000027d5, 0x00002815, 0x00002855, - /* 9c */ 0x00002895, 0x000028d5, 0x00002915, 0x00002955, - /* a0 */ 0x00002995, 0x000029d5, 0x00002a15, 0x00002a55, - /* a4 */ 0x00002a95, 0x00002ad5, 0x00002b15, 0x00002b55, - /* a8 */ 0x00002b95, 0x00002bd5, 0x00002c15, 0x00002c55, - /* ac */ 0x00002c95, 0x00002cd5, 0x00002d15, 0x00002d55, - /* b0 */ 0x00002d95, 0x00002dd5, 0x00002e15, 0x00002e55, - /* b4 */ 0x00002e95, 0x00002ed5, 0x00002f15, 0x00002f55, - /* b8 */ 0x00002f95, 0x00002fd5, 0x00003015, 0x00003055, - /* bc */ 0x00003095, 0x000030d5, 0x00003115, 0x00003155, - - /*** Three byte table, byte #2: e6xx - offset 0x0055d ***/ - - /* 80 */ 0x00003195, 0x000031d5, 0x00003215, 0x00003255, - /* 84 */ 0x00003295, 0x000032d5, 0x00003315, 0x00003355, - /* 88 */ 0x00003395, 0x000033d5, 0x00003415, 0x00003455, - /* 8c */ 0x00003495, 0x000034d5, 0x00003515, 0x00003555, - /* 90 */ 0x00003595, 0x000035d5, 0x00003615, 0x00003655, - /* 94 */ 0x00003695, 0x000036d5, 0x00003715, 0x00003755, - /* 98 */ 0x00003795, 0x000037d5, 0x00003815, 0x00003855, - /* 9c */ 0x00003895, 0x000038d5, 0x00003915, 0x00003955, - /* a0 */ 0x00003995, 0x000039d5, 0x00003a15, 0x00003a55, - /* a4 */ 0x00003a95, 0x00003ad5, 0x00003b15, 0x00003b55, - /* a8 */ 0x00003b95, 0x00003bd5, 0x00003c15, 0x00003c55, - /* ac */ 0x00003c95, 0x00003cd5, 0x00003d15, 0x00003d55, - /* b0 */ 0x00003d95, 0x00003dd5, 0x00003e15, 0x00003e55, - /* b4 */ 0x00003e95, 0x00003ed5, 0x00003f15, 0x00003f55, - /* b8 */ 0x00003f95, 0x00003fd5, 0x00004015, 0x00004055, - /* bc */ 0x00004095, 0x000040d5, 0x00004115, 0x00004155, - - /*** Three byte table, byte #2: e7xx - offset 0x0059d ***/ - - /* 80 */ 0x00004195, 0x000041d5, 0x00004215, 0x00004255, - /* 84 */ 0x00004295, 0x000042d5, 0x00004315, 0x00004355, - /* 88 */ 0x00004395, 0x000043d5, 0x00004415, 0x00004455, - /* 8c */ 0x00004495, 0x000044d5, 0x00004515, 0x00004555, - /* 90 */ 0x00004595, 0x000045d5, 0x00004615, 0x00004655, - /* 94 */ 0x00004695, 0x000046d5, 0x00004715, 0x00004755, - /* 98 */ 0x00004795, 0x000047d5, 0x00004815, 0x00004855, - /* 9c */ 0x00004895, 0x000048d5, 0x00004915, 0x00004955, - /* a0 */ 0x00004995, 0x000049d5, 0x00004a15, 0x00004a55, - /* a4 */ 0x00004a95, 0x00004ad5, 0x00004b15, 0x00004b55, - /* a8 */ 0x00004b95, 0x00004bd5, 0x00004c15, 0x00004c55, - /* ac */ 0x00004c95, 0x00004cd5, 0x00004d15, 0x00004d55, - /* b0 */ 0x00004d95, 0x00004dd5, 0x00004e15, 0x00004e55, - /* b4 */ 0x00004e95, 0x00004ed5, 0x00004f15, 0x00004f55, - /* b8 */ 0x00004f95, 0x00004fd5, 0x00005015, 0x00005055, - /* bc */ 0x00005095, 0x000050d5, 0x00005115, 0x00005155, - - /*** Three byte table, byte #2: e8xx - offset 0x005dd ***/ - - /* 80 */ 0x00005195, 0x000051d5, 0x00005215, 0x00005255, - /* 84 */ 0x00005295, 0x000052d5, 0x00005315, 0x00005355, - /* 88 */ 0x00005395, 0x000053d5, 0x00005415, 0x00005455, - /* 8c */ 0x00005495, 0x000054d5, 0x00005515, 0x00005555, - /* 90 */ 0x00005595, 0x000055d5, 0x00005615, 0x00005655, - /* 94 */ 0x00005695, 0x000056d5, 0x00005715, 0x00005755, - /* 98 */ 0x00005795, 0x000057d5, 0x00005815, 0x00005855, - /* 9c */ 0x00005895, 0x000058d5, 0x00005915, 0x00005955, - /* a0 */ 0x00005995, 0x000059d5, 0x00005a15, 0x00005a55, - /* a4 */ 0x00005a95, 0x00005ad5, 0x00005b15, 0x00005b55, - /* a8 
*/ 0x00005b95, 0x00005bd5, 0x00005c15, 0x00005c55, - /* ac */ 0x00005c95, 0x00005cd5, 0x00005d15, 0x00005d55, - /* b0 */ 0x00005d95, 0x00005dd5, 0x00005e15, 0x00005e55, - /* b4 */ 0x00005e95, 0x00005ed5, 0x00005f15, 0x00005f55, - /* b8 */ 0x00005f95, 0x00005fd5, 0x00006015, 0x00006055, - /* bc */ 0x00006095, 0x000060d5, 0x00006115, 0x00006155, - - /*** Three byte table, byte #2: e9xx - offset 0x0061d ***/ - - /* 80 */ 0x00006195, 0x000061d5, 0x00006215, 0x00006255, - /* 84 */ 0x00006295, 0x000062d5, 0x00006315, 0x00006355, - /* 88 */ 0x00006395, 0x000063d5, 0x00006415, 0x00006455, - /* 8c */ 0x00006495, 0x000064d5, 0x00006515, 0x00006555, - /* 90 */ 0x00006595, 0x000065d5, 0x00006615, 0x00006655, - /* 94 */ 0x00006695, 0x000066d5, 0x00006715, 0x00006755, - /* 98 */ 0x00006795, 0x000067d5, 0x00006815, 0x00006855, - /* 9c */ 0x00006895, 0x000068d5, 0x00006915, 0x00006955, - /* a0 */ 0x00006995, 0x000069d5, 0x00006a15, 0x00006a55, - /* a4 */ 0x00006a95, 0x00006ad5, 0x00006b15, 0x00006b55, - /* a8 */ 0x00006b95, 0x00006bd5, 0x00006c15, 0x00006c55, - /* ac */ 0x00006c95, 0x00006cd5, 0x00006d15, 0x00006d55, - /* b0 */ 0x00006d95, 0x00006dd5, 0x00006e15, 0x00006e55, - /* b4 */ 0x00006e95, 0x00006ed5, 0x00006f15, 0x00006f55, - /* b8 */ 0x00006f95, 0x00006fd5, 0x00007015, 0x00007055, - /* bc */ 0x00007095, 0x000070d5, 0x00007115, 0x00000000, - - /*** Three byte table, byte #2: eexx - offset 0x0065d ***/ - - /* 80 */ 0x00007155, 0x00007195, 0x000071d5, 0x00007215, - /* 84 */ 0x00007255, 0x00007295, 0x000072d5, 0x00007315, - /* 88 */ 0x00007355, 0x00007395, 0x000073d5, 0x00007415, - /* 8c */ 0x00007455, 0x00007495, 0x000074d5, 0x00007515, - /* 90 */ 0x00007555, 0x00007595, 0x000075d5, 0x00007615, - /* 94 */ 0x00007655, 0x00007695, 0x000076d5, 0x00007715, - /* 98 */ 0x00007755, 0x00007795, 0x000077d5, 0x00007815, - /* 9c */ 0x00007855, 0x00007895, 0x000078d5, 0x00007915, - /* a0 */ 0x00007955, 0x00007995, + /* b0 */ 0x00000000, 0x00001e56, 0x00001e96, 0x00001ed6, + /* b4 */ 0x00001f16, 0x00001f56, 0x00001f96, 0x00001fd6, + /* b8 */ 0x00002016, 0x00002056, 0x00002096, 0x000020d6, + /* bc */ 0x00002116, 0x00002156, 0x00002196, 0x000021d6, + + /*** Three byte table, byte #2: e5xx - offset 0x0055e ***/ + + /* 80 */ 0x00002216, 0x00002256, 0x00002296, 0x000022d6, + /* 84 */ 0x00002316, 0x00002356, 0x00002396, 0x000023d6, + /* 88 */ 0x00002416, 0x00002456, 0x00002496, 0x000024d6, + /* 8c */ 0x00002516, 0x00002556, 0x00002596, 0x000025d6, + /* 90 */ 0x00002616, 0x00002656, 0x00002696, 0x000026d6, + /* 94 */ 0x00002716, 0x00002756, 0x00002796, 0x000027d6, + /* 98 */ 0x00002816, 0x00002856, 0x00002896, 0x000028d6, + /* 9c */ 0x00002916, 0x00002956, 0x00002996, 0x000029d6, + /* a0 */ 0x00002a16, 0x00002a56, 0x00002a96, 0x00002ad6, + /* a4 */ 0x00002b16, 0x00002b56, 0x00002b96, 0x00002bd6, + /* a8 */ 0x00002c16, 0x00002c56, 0x00002c96, 0x00002cd6, + /* ac */ 0x00002d16, 0x00002d56, 0x00002d96, 0x00002dd6, + /* b0 */ 0x00002e16, 0x00002e56, 0x00002e96, 0x00002ed6, + /* b4 */ 0x00002f16, 0x00002f56, 0x00002f96, 0x00002fd6, + /* b8 */ 0x00003016, 0x00003056, 0x00003096, 0x000030d6, + /* bc */ 0x00003116, 0x00003156, 0x00003196, 0x000031d6, + + /*** Three byte table, byte #2: e6xx - offset 0x0059e ***/ + + /* 80 */ 0x00003216, 0x00003256, 0x00003296, 0x000032d6, + /* 84 */ 0x00003316, 0x00003356, 0x00003396, 0x000033d6, + /* 88 */ 0x00003416, 0x00003456, 0x00003496, 0x000034d6, + /* 8c */ 0x00003516, 0x00003556, 0x00003596, 0x000035d6, + /* 90 */ 0x00003616, 0x00003656, 0x00003696, 0x000036d6, + /* 94 */ 0x00003716, 
0x00003756, 0x00003796, 0x000037d6, + /* 98 */ 0x00003816, 0x00003856, 0x00003896, 0x000038d6, + /* 9c */ 0x00003916, 0x00003956, 0x00003996, 0x000039d6, + /* a0 */ 0x00003a16, 0x00003a56, 0x00003a96, 0x00003ad6, + /* a4 */ 0x00003b16, 0x00003b56, 0x00003b96, 0x00003bd6, + /* a8 */ 0x00003c16, 0x00003c56, 0x00003c96, 0x00003cd6, + /* ac */ 0x00003d16, 0x00003d56, 0x00003d96, 0x00003dd6, + /* b0 */ 0x00003e16, 0x00003e56, 0x00003e96, 0x00003ed6, + /* b4 */ 0x00003f16, 0x00003f56, 0x00003f96, 0x00003fd6, + /* b8 */ 0x00004016, 0x00004056, 0x00004096, 0x000040d6, + /* bc */ 0x00004116, 0x00004156, 0x00004196, 0x000041d6, + + /*** Three byte table, byte #2: e7xx - offset 0x005de ***/ + + /* 80 */ 0x00004216, 0x00004256, 0x00004296, 0x000042d6, + /* 84 */ 0x00004316, 0x00004356, 0x00004396, 0x000043d6, + /* 88 */ 0x00004416, 0x00004456, 0x00004496, 0x000044d6, + /* 8c */ 0x00004516, 0x00004556, 0x00004596, 0x000045d6, + /* 90 */ 0x00004616, 0x00004656, 0x00004696, 0x000046d6, + /* 94 */ 0x00004716, 0x00004756, 0x00004796, 0x000047d6, + /* 98 */ 0x00004816, 0x00004856, 0x00004896, 0x000048d6, + /* 9c */ 0x00004916, 0x00004956, 0x00004996, 0x000049d6, + /* a0 */ 0x00004a16, 0x00004a56, 0x00004a96, 0x00004ad6, + /* a4 */ 0x00004b16, 0x00004b56, 0x00004b96, 0x00004bd6, + /* a8 */ 0x00004c16, 0x00004c56, 0x00004c96, 0x00004cd6, + /* ac */ 0x00004d16, 0x00004d56, 0x00004d96, 0x00004dd6, + /* b0 */ 0x00004e16, 0x00004e56, 0x00004e96, 0x00004ed6, + /* b4 */ 0x00004f16, 0x00004f56, 0x00004f96, 0x00004fd6, + /* b8 */ 0x00005016, 0x00005056, 0x00005096, 0x000050d6, + /* bc */ 0x00005116, 0x00005156, 0x00005196, 0x000051d6, + + /*** Three byte table, byte #2: e8xx - offset 0x0061e ***/ + + /* 80 */ 0x00005216, 0x00005256, 0x00005296, 0x000052d6, + /* 84 */ 0x00005316, 0x00005356, 0x00005396, 0x000053d6, + /* 88 */ 0x00005416, 0x00005456, 0x00005496, 0x000054d6, + /* 8c */ 0x00005516, 0x00005556, 0x00005596, 0x000055d6, + /* 90 */ 0x00005616, 0x00005656, 0x00005696, 0x000056d6, + /* 94 */ 0x00005716, 0x00005756, 0x00005796, 0x000057d6, + /* 98 */ 0x00005816, 0x00005856, 0x00005896, 0x000058d6, + /* 9c */ 0x00005916, 0x00005956, 0x00005996, 0x000059d6, + /* a0 */ 0x00005a16, 0x00005a56, 0x00005a96, 0x00005ad6, + /* a4 */ 0x00005b16, 0x00005b56, 0x00005b96, 0x00005bd6, + /* a8 */ 0x00005c16, 0x00005c56, 0x00005c96, 0x00005cd6, + /* ac */ 0x00005d16, 0x00005d56, 0x00005d96, 0x00005dd6, + /* b0 */ 0x00005e16, 0x00005e56, 0x00005e96, 0x00005ed6, + /* b4 */ 0x00005f16, 0x00005f56, 0x00005f96, 0x00005fd6, + /* b8 */ 0x00006016, 0x00006056, 0x00006096, 0x000060d6, + /* bc */ 0x00006116, 0x00006156, 0x00006196, 0x000061d6, + + /*** Three byte table, byte #2: e9xx - offset 0x0065e ***/ + + /* 80 */ 0x00006216, 0x00006256, 0x00006296, 0x000062d6, + /* 84 */ 0x00006316, 0x00006356, 0x00006396, 0x000063d6, + /* 88 */ 0x00006416, 0x00006456, 0x00006496, 0x000064d6, + /* 8c */ 0x00006516, 0x00006556, 0x00006596, 0x000065d6, + /* 90 */ 0x00006616, 0x00006656, 0x00006696, 0x000066d6, + /* 94 */ 0x00006716, 0x00006756, 0x00006796, 0x000067d6, + /* 98 */ 0x00006816, 0x00006856, 0x00006896, 0x000068d6, + /* 9c */ 0x00006916, 0x00006956, 0x00006996, 0x000069d6, + /* a0 */ 0x00006a16, 0x00006a56, 0x00006a96, 0x00006ad6, + /* a4 */ 0x00006b16, 0x00006b56, 0x00006b96, 0x00006bd6, + /* a8 */ 0x00006c16, 0x00006c56, 0x00006c96, 0x00006cd6, + /* ac */ 0x00006d16, 0x00006d56, 0x00006d96, 0x00006dd6, + /* b0 */ 0x00006e16, 0x00006e56, 0x00006e96, 0x00006ed6, + /* b4 */ 0x00006f16, 0x00006f56, 0x00006f96, 0x00006fd6, + /* b8 */ 
0x00007016, 0x00007056, 0x00007096, 0x000070d6, + /* bc */ 0x00007116, 0x00007156, 0x00007196, 0x00000000, + + /*** Three byte table, byte #2: eexx - offset 0x0069e ***/ + + /* 80 */ 0x000071d6, 0x00007216, 0x00007256, 0x00007296, + /* 84 */ 0x000072d6, 0x00007316, 0x00007356, 0x00007396, + /* 88 */ 0x000073d6, 0x00007416, 0x00007456, 0x00007496, + /* 8c */ 0x000074d6, 0x00007516, 0x00007556, 0x00007596, + /* 90 */ 0x000075d6, 0x00007616, 0x00007656, 0x00007696, + /* 94 */ 0x000076d6, 0x00007716, 0x00007756, 0x00007796, + /* 98 */ 0x000077d6, 0x00007816, 0x00007856, 0x00007896, + /* 9c */ 0x000078d6, 0x00007916, 0x00007956, 0x00007996, + /* a0 */ 0x000079d6, 0x00007a16, /* 30 trailing zero values shared with next segment */ - /*** Three byte table, byte #2: efxx - offset 0x0067f ***/ + /*** Three byte table, byte #2: efxx - offset 0x006c0 ***/ + + /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 88 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 8c */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 90 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 94 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 98 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 9c */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* a0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* a4 */ 0x00007a3a, 0x00007a7a, 0x00007aba, 0x00007afa, + /* a8 */ 0x00007b3a, 0x00000000, 0x00000000, 0x00000000, + /* ac */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* b0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* b4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* b8 */ 0x00007b6a, 0x00007baa, 0x00007bea, 0x00007c2a, + /* bc */ 0x00007c6a, 0x00007caa, 0x00007cea, 0x00007d2a, + + /*** Three byte table, leaf: e1b8xx - offset 0x00700 ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -553,15 +591,15 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* 98 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 9c */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* a0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* a4 */ 0x000079ba, 0x000079fa, 0x00007a3a, 0x00007a7a, - /* a8 */ 0x00007aba, 0x00000000, 0x00000000, 0x00000000, + /* a4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* a8 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* ac */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* b0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* b4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* b8 */ 0x00007ae4, 0x00007b24, 0x00007b64, 0x00007ba4, - /* bc */ 0x00007be4, 0x00007c24, 0x00007c64, 0x00007ca4, + /* b8 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* bc */ 0x00000000, 0x00000000, 0x00000000, 0x0000a8bc, - /*** Three byte table, leaf: e280xx - offset 0x006bf ***/ + /*** Three byte table, leaf: e280xx - offset 0x00740 ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -580,7 +618,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136a737, 0x8136a738, 0x8136a739, 0x0000a1f9, /* bc */ 0x8136a830, 0x8136a831, 0x8136a832, 0x8136a833, - /*** Three byte table, leaf: e281xx - offset 0x006ff ***/ + /*** Three byte table, leaf: e281xx - offset 0x00780 ***/ /* 80 */ 0x8136a834, 0x8136a835, 0x8136a836, 0x8136a837, /* 84 */ 0x8136a838, 0x8136a839, 0x8136a930, 0x8136a931, @@ -599,7 +637,7 
@@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136ae30, 0x8136ae31, 0x8136ae32, 0x8136ae33, /* bc */ 0x8136ae34, 0x8136ae35, 0x8136ae36, 0x8136ae37, - /*** Three byte table, leaf: e282xx - offset 0x0073f ***/ + /*** Three byte table, leaf: e282xx - offset 0x007c0 ***/ /* 80 */ 0x8136ae38, 0x8136ae39, 0x8136af30, 0x8136af31, /* 84 */ 0x8136af32, 0x8136af33, 0x8136af34, 0x8136af35, @@ -618,7 +656,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136b433, 0x8136b434, 0x8136b435, 0x8136b436, /* bc */ 0x8136b437, 0x8136b438, 0x8136b439, 0x8136b530, - /*** Three byte table, leaf: e283xx - offset 0x0077f ***/ + /*** Three byte table, leaf: e283xx - offset 0x00800 ***/ /* 80 */ 0x8136b531, 0x8136b532, 0x8136b533, 0x8136b534, /* 84 */ 0x8136b535, 0x8136b536, 0x8136b537, 0x8136b538, @@ -637,7 +675,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136ba37, 0x8136ba38, 0x8136ba39, 0x8136bb30, /* bc */ 0x8136bb31, 0x8136bb32, 0x8136bb33, 0x8136bb34, - /*** Three byte table, leaf: e284xx - offset 0x007bf ***/ + /*** Three byte table, leaf: e284xx - offset 0x00840 ***/ /* 80 */ 0x8136bb35, 0x8136bb36, 0x8136bb37, 0x0000a1e6, /* 84 */ 0x8136bb38, 0x0000a847, 0x8136bb39, 0x8136bc30, @@ -656,7 +694,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136c036, 0x8136c037, 0x8136c038, 0x8136c039, /* bc */ 0x8136c130, 0x8136c131, 0x8136c132, 0x8136c133, - /*** Three byte table, leaf: e285xx - offset 0x007ff ***/ + /*** Three byte table, leaf: e285xx - offset 0x00880 ***/ /* 80 */ 0x8136c134, 0x8136c135, 0x8136c136, 0x8136c137, /* 84 */ 0x8136c138, 0x8136c139, 0x8136c230, 0x8136c231, @@ -675,7 +713,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a2a9, 0x0000a2aa, 0x8136c530, 0x8136c531, /* bc */ 0x8136c532, 0x8136c533, 0x8136c534, 0x8136c535, - /*** Three byte table, leaf: e286xx - offset 0x0083f ***/ + /*** Three byte table, leaf: e286xx - offset 0x008c0 ***/ /* 80 */ 0x8136c536, 0x8136c537, 0x8136c538, 0x8136c539, /* 84 */ 0x8136c630, 0x8136c631, 0x8136c632, 0x8136c633, @@ -694,7 +732,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136ca34, 0x8136ca35, 0x8136ca36, 0x8136ca37, /* bc */ 0x8136ca38, 0x8136ca39, 0x8136cb30, 0x8136cb31, - /*** Three byte table, leaf: e287xx - offset 0x0087f ***/ + /*** Three byte table, leaf: e287xx - offset 0x00900 ***/ /* 80 */ 0x8136cb32, 0x8136cb33, 0x8136cb34, 0x8136cb35, /* 84 */ 0x8136cb36, 0x8136cb37, 0x8136cb38, 0x8136cb39, @@ -713,7 +751,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136d038, 0x8136d039, 0x8136d130, 0x8136d131, /* bc */ 0x8136d132, 0x8136d133, 0x8136d134, 0x8136d135, - /*** Three byte table, leaf: e288xx - offset 0x008bf ***/ + /*** Three byte table, leaf: e288xx - offset 0x00940 ***/ /* 80 */ 0x8136d136, 0x8136d137, 0x8136d138, 0x8136d139, /* 84 */ 0x8136d230, 0x8136d231, 0x8136d232, 0x8136d233, @@ -732,7 +770,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136d531, 0x8136d532, 0x8136d533, 0x8136d534, /* bc */ 0x8136d535, 0x0000a1d7, 0x8136d536, 0x8136d537, - /*** Three byte table, leaf: e289xx - offset 0x008ff ***/ + /*** Three byte table, leaf: e289xx - offset 0x00980 ***/ /* 80 */ 0x8136d538, 0x8136d539, 0x8136d630, 0x8136d631, /* 84 */ 0x8136d632, 0x8136d633, 0x8136d634, 0x8136d635, @@ -751,7 +789,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136da33, 0x8136da34, 0x8136da35, 0x8136da36, 
/* bc */ 0x8136da37, 0x8136da38, 0x8136da39, 0x8136db30, - /*** Three byte table, leaf: e28axx - offset 0x0093f ***/ + /*** Three byte table, leaf: e28axx - offset 0x009c0 ***/ /* 80 */ 0x8136db31, 0x8136db32, 0x8136db33, 0x8136db34, /* 84 */ 0x8136db35, 0x8136db36, 0x8136db37, 0x8136db38, @@ -770,7 +808,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136e034, 0x8136e035, 0x8136e036, 0x8136e037, /* bc */ 0x8136e038, 0x8136e039, 0x8136e130, 0x0000a853, - /*** Three byte table, leaf: e28bxx - offset 0x0097f ***/ + /*** Three byte table, leaf: e28bxx - offset 0x00a00 ***/ /* 80 */ 0x8136e131, 0x8136e132, 0x8136e133, 0x8136e134, /* 84 */ 0x8136e135, 0x8136e136, 0x8136e137, 0x8136e138, @@ -789,7 +827,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136e637, 0x8136e638, 0x8136e639, 0x8136e730, /* bc */ 0x8136e731, 0x8136e732, 0x8136e733, 0x8136e734, - /*** Three byte table, leaf: e28cxx - offset 0x009bf ***/ + /*** Three byte table, leaf: e28cxx - offset 0x00a40 ***/ /* 80 */ 0x8136e735, 0x8136e736, 0x8136e737, 0x8136e738, /* 84 */ 0x8136e739, 0x8136e830, 0x8136e831, 0x8136e832, @@ -808,7 +846,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136ed30, 0x8136ed31, 0x8136ed32, 0x8136ed33, /* bc */ 0x8136ed34, 0x8136ed35, 0x8136ed36, 0x8136ed37, - /*** Three byte table, leaf: e28dxx - offset 0x009ff ***/ + /*** Three byte table, leaf: e28dxx - offset 0x00a80 ***/ /* 80 */ 0x8136ed38, 0x8136ed39, 0x8136ee30, 0x8136ee31, /* 84 */ 0x8136ee32, 0x8136ee33, 0x8136ee34, 0x8136ee35, @@ -827,7 +865,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136f334, 0x8136f335, 0x8136f336, 0x8136f337, /* bc */ 0x8136f338, 0x8136f339, 0x8136f430, 0x8136f431, - /*** Three byte table, leaf: e28exx - offset 0x00a3f ***/ + /*** Three byte table, leaf: e28exx - offset 0x00ac0 ***/ /* 80 */ 0x8136f432, 0x8136f433, 0x8136f434, 0x8136f435, /* 84 */ 0x8136f436, 0x8136f437, 0x8136f438, 0x8136f439, @@ -846,7 +884,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8136f938, 0x8136f939, 0x8136fa30, 0x8136fa31, /* bc */ 0x8136fa32, 0x8136fa33, 0x8136fa34, 0x8136fa35, - /*** Three byte table, leaf: e28fxx - offset 0x00a7f ***/ + /*** Three byte table, leaf: e28fxx - offset 0x00b00 ***/ /* 80 */ 0x8136fa36, 0x8136fa37, 0x8136fa38, 0x8136fa39, /* 84 */ 0x8136fb30, 0x8136fb31, 0x8136fb32, 0x8136fb33, @@ -865,7 +903,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81378232, 0x81378233, 0x81378234, 0x81378235, /* bc */ 0x81378236, 0x81378237, 0x81378238, 0x81378239, - /*** Three byte table, leaf: e290xx - offset 0x00abf ***/ + /*** Three byte table, leaf: e290xx - offset 0x00b40 ***/ /* 80 */ 0x81378330, 0x81378331, 0x81378332, 0x81378333, /* 84 */ 0x81378334, 0x81378335, 0x81378336, 0x81378337, @@ -884,7 +922,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81378836, 0x81378837, 0x81378838, 0x81378839, /* bc */ 0x81378930, 0x81378931, 0x81378932, 0x81378933, - /*** Three byte table, leaf: e291xx - offset 0x00aff ***/ + /*** Three byte table, leaf: e291xx - offset 0x00b80 ***/ /* 80 */ 0x81378934, 0x81378935, 0x81378936, 0x81378937, /* 84 */ 0x81378938, 0x81378939, 0x81378a30, 0x81378a31, @@ -903,7 +941,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a2c9, 0x0000a2ca, 0x0000a2cb, 0x0000a2cc, /* bc */ 0x0000a2cd, 0x0000a2ce, 0x0000a2cf, 0x0000a2d0, - /*** Three byte table, leaf: e292xx - offset 0x00b3f ***/ + 
/*** Three byte table, leaf: e292xx - offset 0x00bc0 ***/ /* 80 */ 0x0000a2d1, 0x0000a2d2, 0x0000a2d3, 0x0000a2d4, /* 84 */ 0x0000a2d5, 0x0000a2d6, 0x0000a2d7, 0x0000a2d8, @@ -922,7 +960,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81379034, 0x81379035, 0x81379036, 0x81379037, /* bc */ 0x81379038, 0x81379039, 0x81379130, 0x81379131, - /*** Three byte table, leaf: e293xx - offset 0x00b7f ***/ + /*** Three byte table, leaf: e293xx - offset 0x00c00 ***/ /* 80 */ 0x81379132, 0x81379133, 0x81379134, 0x81379135, /* 84 */ 0x81379136, 0x81379137, 0x81379138, 0x81379139, @@ -941,7 +979,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81379638, 0x81379639, 0x81379730, 0x81379731, /* bc */ 0x81379732, 0x81379733, 0x81379734, 0x81379735, - /*** Three byte table, leaf: e294xx - offset 0x00bbf ***/ + /*** Three byte table, leaf: e294xx - offset 0x00c40 ***/ /* 80 */ 0x0000a9a4, 0x0000a9a5, 0x0000a9a6, 0x0000a9a7, /* 84 */ 0x0000a9a8, 0x0000a9a9, 0x0000a9aa, 0x0000a9ab, @@ -960,7 +998,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a9dc, 0x0000a9dd, 0x0000a9de, 0x0000a9df, /* bc */ 0x0000a9e0, 0x0000a9e1, 0x0000a9e2, 0x0000a9e3, - /*** Three byte table, leaf: e295xx - offset 0x00bff ***/ + /*** Three byte table, leaf: e295xx - offset 0x00c80 ***/ /* 80 */ 0x0000a9e4, 0x0000a9e5, 0x0000a9e6, 0x0000a9e7, /* 84 */ 0x0000a9e8, 0x0000a9e9, 0x0000a9ea, 0x0000a9eb, @@ -979,7 +1017,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81379834, 0x81379835, 0x81379836, 0x81379837, /* bc */ 0x81379838, 0x81379839, 0x81379930, 0x81379931, - /*** Three byte table, leaf: e296xx - offset 0x00c3f ***/ + /*** Three byte table, leaf: e296xx - offset 0x00cc0 ***/ /* 80 */ 0x81379932, 0x0000a878, 0x0000a879, 0x0000a87a, /* 84 */ 0x0000a87b, 0x0000a87c, 0x0000a87d, 0x0000a87e, @@ -998,7 +1036,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81379c36, 0x81379c37, 0x81379c38, 0x81379c39, /* bc */ 0x0000a88b, 0x0000a88c, 0x81379d30, 0x81379d31, - /*** Three byte table, leaf: e297xx - offset 0x00c7f ***/ + /*** Three byte table, leaf: e297xx - offset 0x00d00 ***/ /* 80 */ 0x81379d32, 0x81379d33, 0x81379d34, 0x81379d35, /* 84 */ 0x81379d36, 0x81379d37, 0x0000a1f4, 0x0000a1f3, @@ -1017,7 +1055,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8137a139, 0x8137a230, 0x8137a231, 0x8137a232, /* bc */ 0x8137a233, 0x8137a234, 0x8137a235, 0x8137a236, - /*** Three byte table, leaf: e298xx - offset 0x00cbf ***/ + /*** Three byte table, leaf: e298xx - offset 0x00d40 ***/ /* 80 */ 0x8137a237, 0x8137a238, 0x8137a239, 0x8137a330, /* 84 */ 0x8137a331, 0x0000a1ef, 0x0000a1ee, 0x8137a332, @@ -1036,7 +1074,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8137a830, 0x8137a831, 0x8137a832, 0x8137a833, /* bc */ 0x8137a834, 0x8137a835, 0x8137a836, 0x8137a837, - /*** Three byte table, leaf: e299xx - offset 0x00cff ***/ + /*** Three byte table, leaf: e299xx - offset 0x00d80 ***/ /* 80 */ 0x0000a1e2, 0x8137a838, 0x0000a1e1, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -1056,7 +1094,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* bc */ 0x00000000, 0x00000000, 0x00000000, /* 1 trailing zero values shared with next segment */ - /*** Three byte table, leaf: e2baxx - offset 0x00d3e ***/ + /*** Three byte table, leaf: e2baxx - offset 0x00dbf ***/ /* 80 */ 0x00000000, 0x0000fe50, 0x8138fd39, 0x8138fe30, /* 84 */ 
0x0000fe54, 0x8138fe31, 0x8138fe32, 0x8138fe33, @@ -1075,7 +1113,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81398432, 0x81398433, 0x81398434, 0x0000fe79, /* bc */ 0x81398435, 0x81398436, 0x81398437, 0x81398438, - /*** Three byte table, leaf: e2bbxx - offset 0x00d7e ***/ + /*** Three byte table, leaf: e2bbxx - offset 0x00dff ***/ /* 80 */ 0x81398439, 0x81398530, 0x81398531, 0x81398532, /* 84 */ 0x81398533, 0x81398534, 0x81398535, 0x81398536, @@ -1094,7 +1132,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81398a34, 0x81398a35, 0x81398a36, 0x81398a37, /* bc */ 0x81398a38, 0x81398a39, 0x81398b30, 0x81398b31, - /*** Three byte table, leaf: e2bcxx - offset 0x00dbe ***/ + /*** Three byte table, leaf: e2bcxx - offset 0x00e3f ***/ /* 80 */ 0x81398b32, 0x81398b33, 0x81398b34, 0x81398b35, /* 84 */ 0x81398b36, 0x81398b37, 0x81398b38, 0x81398b39, @@ -1113,7 +1151,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81399038, 0x81399039, 0x81399130, 0x81399131, /* bc */ 0x81399132, 0x81399133, 0x81399134, 0x81399135, - /*** Three byte table, leaf: e2bdxx - offset 0x00dfe ***/ + /*** Three byte table, leaf: e2bdxx - offset 0x00e7f ***/ /* 80 */ 0x81399136, 0x81399137, 0x81399138, 0x81399139, /* 84 */ 0x81399230, 0x81399231, 0x81399232, 0x81399233, @@ -1132,7 +1170,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81399732, 0x81399733, 0x81399734, 0x81399735, /* bc */ 0x81399736, 0x81399737, 0x81399738, 0x81399739, - /*** Three byte table, leaf: e2bexx - offset 0x00e3e ***/ + /*** Three byte table, leaf: e2bexx - offset 0x00ebf ***/ /* 80 */ 0x81399830, 0x81399831, 0x81399832, 0x81399833, /* 84 */ 0x81399834, 0x81399835, 0x81399836, 0x81399837, @@ -1151,7 +1189,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x81399d36, 0x81399d37, 0x81399d38, 0x81399d39, /* bc */ 0x81399e30, 0x81399e31, 0x81399e32, 0x81399e33, - /*** Three byte table, leaf: e2bfxx - offset 0x00e7e ***/ + /*** Three byte table, leaf: e2bfxx - offset 0x00eff ***/ /* 80 */ 0x81399e34, 0x81399e35, 0x81399e36, 0x81399e37, /* 84 */ 0x81399e38, 0x81399e39, 0x81399f30, 0x81399f31, @@ -1170,7 +1208,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a992, 0x0000a993, 0x0000a994, 0x0000a995, /* bc */ 0x8139a332, 0x8139a333, 0x8139a334, 0x8139a335, - /*** Three byte table, leaf: e380xx - offset 0x00ebe ***/ + /*** Three byte table, leaf: e380xx - offset 0x00f3f ***/ /* 80 */ 0x0000a1a1, 0x0000a1a2, 0x0000a1a3, 0x0000a1a8, /* 84 */ 0x8139a336, 0x0000a1a9, 0x0000a965, 0x0000a996, @@ -1189,7 +1227,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139a538, 0x8139a539, 0x8139a630, 0x8139a631, /* bc */ 0x8139a632, 0x8139a633, 0x0000a989, 0x8139a634, - /*** Three byte table, leaf: e381xx - offset 0x00efe ***/ + /*** Three byte table, leaf: e381xx - offset 0x00f7f ***/ /* 80 */ 0x8139a635, 0x0000a4a1, 0x0000a4a2, 0x0000a4a3, /* 84 */ 0x0000a4a4, 0x0000a4a5, 0x0000a4a6, 0x0000a4a7, @@ -1208,7 +1246,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a4d8, 0x0000a4d9, 0x0000a4da, 0x0000a4db, /* bc */ 0x0000a4dc, 0x0000a4dd, 0x0000a4de, 0x0000a4df, - /*** Three byte table, leaf: e382xx - offset 0x00f3e ***/ + /*** Three byte table, leaf: e382xx - offset 0x00fbf ***/ /* 80 */ 0x0000a4e0, 0x0000a4e1, 0x0000a4e2, 0x0000a4e3, /* 84 */ 0x0000a4e4, 0x0000a4e5, 0x0000a4e6, 0x0000a4e7, @@ -1227,7 +1265,7 @@ static const uint32 
gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a5b8, 0x0000a5b9, 0x0000a5ba, 0x0000a5bb, /* bc */ 0x0000a5bc, 0x0000a5bd, 0x0000a5be, 0x0000a5bf, - /*** Three byte table, leaf: e383xx - offset 0x00f7e ***/ + /*** Three byte table, leaf: e383xx - offset 0x00fff ***/ /* 80 */ 0x0000a5c0, 0x0000a5c1, 0x0000a5c2, 0x0000a5c3, /* 84 */ 0x0000a5c4, 0x0000a5c5, 0x0000a5c6, 0x0000a5c7, @@ -1246,7 +1284,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139a736, 0x8139a737, 0x8139a738, 0x8139a739, /* bc */ 0x0000a960, 0x0000a963, 0x0000a964, 0x8139a830, - /*** Three byte table, leaf: e384xx - offset 0x00fbe ***/ + /*** Three byte table, leaf: e384xx - offset 0x0103f ***/ /* 80 */ 0x8139a831, 0x8139a832, 0x8139a833, 0x8139a834, /* 84 */ 0x8139a835, 0x0000a8c5, 0x0000a8c6, 0x0000a8c7, @@ -1265,7 +1303,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139aa30, 0x8139aa31, 0x8139aa32, 0x8139aa33, /* bc */ 0x8139aa34, 0x8139aa35, 0x8139aa36, 0x8139aa37, - /*** Three byte table, leaf: e385xx - offset 0x00ffe ***/ + /*** Three byte table, leaf: e385xx - offset 0x0107f ***/ /* 80 */ 0x8139aa38, 0x8139aa39, 0x8139ab30, 0x8139ab31, /* 84 */ 0x8139ab32, 0x8139ab33, 0x8139ab34, 0x8139ab35, @@ -1284,7 +1322,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139b034, 0x8139b035, 0x8139b036, 0x8139b037, /* bc */ 0x8139b038, 0x8139b039, 0x8139b130, 0x8139b131, - /*** Three byte table, leaf: e386xx - offset 0x0103e ***/ + /*** Three byte table, leaf: e386xx - offset 0x010bf ***/ /* 80 */ 0x8139b132, 0x8139b133, 0x8139b134, 0x8139b135, /* 84 */ 0x8139b136, 0x8139b137, 0x8139b138, 0x8139b139, @@ -1303,7 +1341,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139b638, 0x8139b639, 0x8139b730, 0x8139b731, /* bc */ 0x8139b732, 0x8139b733, 0x8139b734, 0x8139b735, - /*** Three byte table, leaf: e387xx - offset 0x0107e ***/ + /*** Three byte table, leaf: e387xx - offset 0x010ff ***/ /* 80 */ 0x8139b736, 0x8139b737, 0x8139b738, 0x8139b739, /* 84 */ 0x8139b830, 0x8139b831, 0x8139b832, 0x8139b833, @@ -1322,7 +1360,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139bd32, 0x8139bd33, 0x8139bd34, 0x8139bd35, /* bc */ 0x8139bd36, 0x8139bd37, 0x8139bd38, 0x8139bd39, - /*** Three byte table, leaf: e388xx - offset 0x010be ***/ + /*** Three byte table, leaf: e388xx - offset 0x0113f ***/ /* 80 */ 0x8139be30, 0x8139be31, 0x8139be32, 0x8139be33, /* 84 */ 0x8139be34, 0x8139be35, 0x8139be36, 0x8139be37, @@ -1341,7 +1379,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139c235, 0x8139c236, 0x8139c237, 0x8139c238, /* bc */ 0x8139c239, 0x8139c330, 0x8139c331, 0x8139c332, - /*** Three byte table, leaf: e389xx - offset 0x010fe ***/ + /*** Three byte table, leaf: e389xx - offset 0x0117f ***/ /* 80 */ 0x8139c333, 0x8139c334, 0x8139c335, 0x8139c336, /* 84 */ 0x8139c337, 0x8139c338, 0x8139c339, 0x8139c430, @@ -1360,7 +1398,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139c839, 0x8139c930, 0x8139c931, 0x8139c932, /* bc */ 0x8139c933, 0x8139c934, 0x8139c935, 0x8139c936, - /*** Three byte table, leaf: e38axx - offset 0x0113e ***/ + /*** Three byte table, leaf: e38axx - offset 0x011bf ***/ /* 80 */ 0x8139c937, 0x8139c938, 0x8139c939, 0x8139ca30, /* 84 */ 0x8139ca31, 0x8139ca32, 0x8139ca33, 0x8139ca34, @@ -1379,7 +1417,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139cf32, 0x8139cf33, 0x8139cf34, 0x8139cf35, /* bc */ 
0x8139cf36, 0x8139cf37, 0x8139cf38, 0x8139cf39, - /*** Three byte table, leaf: e38bxx - offset 0x0117e ***/ + /*** Three byte table, leaf: e38bxx - offset 0x011ff ***/ /* 80 */ 0x8139d030, 0x8139d031, 0x8139d032, 0x8139d033, /* 84 */ 0x8139d034, 0x8139d035, 0x8139d036, 0x8139d037, @@ -1398,7 +1436,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139d536, 0x8139d537, 0x8139d538, 0x8139d539, /* bc */ 0x8139d630, 0x8139d631, 0x8139d632, 0x8139d633, - /*** Three byte table, leaf: e38cxx - offset 0x011be ***/ + /*** Three byte table, leaf: e38cxx - offset 0x0123f ***/ /* 80 */ 0x8139d634, 0x8139d635, 0x8139d636, 0x8139d637, /* 84 */ 0x8139d638, 0x8139d639, 0x8139d730, 0x8139d731, @@ -1417,7 +1455,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139dc30, 0x8139dc31, 0x8139dc32, 0x8139dc33, /* bc */ 0x8139dc34, 0x8139dc35, 0x8139dc36, 0x8139dc37, - /*** Three byte table, leaf: e38dxx - offset 0x011fe ***/ + /*** Three byte table, leaf: e38dxx - offset 0x0127f ***/ /* 80 */ 0x8139dc38, 0x8139dc39, 0x8139dd30, 0x8139dd31, /* 84 */ 0x8139dd32, 0x8139dd33, 0x8139dd34, 0x8139dd35, @@ -1436,7 +1474,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139e234, 0x8139e235, 0x8139e236, 0x8139e237, /* bc */ 0x8139e238, 0x8139e239, 0x8139e330, 0x8139e331, - /*** Three byte table, leaf: e38exx - offset 0x0123e ***/ + /*** Three byte table, leaf: e38exx - offset 0x012bf ***/ /* 80 */ 0x8139e332, 0x8139e333, 0x8139e334, 0x8139e335, /* 84 */ 0x8139e336, 0x8139e337, 0x8139e338, 0x8139e339, @@ -1455,7 +1493,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139e832, 0x8139e833, 0x8139e834, 0x8139e835, /* bc */ 0x8139e836, 0x8139e837, 0x8139e838, 0x8139e839, - /*** Three byte table, leaf: e38fxx - offset 0x0127e ***/ + /*** Three byte table, leaf: e38fxx - offset 0x012ff ***/ /* 80 */ 0x8139e930, 0x8139e931, 0x8139e932, 0x8139e933, /* 84 */ 0x0000a950, 0x8139e934, 0x8139e935, 0x8139e936, @@ -1474,7 +1512,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139ee31, 0x8139ee32, 0x8139ee33, 0x8139ee34, /* bc */ 0x8139ee35, 0x8139ee36, 0x8139ee37, 0x8139ee38, - /*** Three byte table, leaf: e390xx - offset 0x012be ***/ + /*** Three byte table, leaf: e390xx - offset 0x0133f ***/ /* 80 */ 0x8139ee39, 0x8139ef30, 0x8139ef31, 0x8139ef32, /* 84 */ 0x8139ef33, 0x8139ef34, 0x8139ef35, 0x8139ef36, @@ -1493,7 +1531,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139f435, 0x8139f436, 0x8139f437, 0x8139f438, /* bc */ 0x8139f439, 0x8139f530, 0x8139f531, 0x8139f532, - /*** Three byte table, leaf: e391xx - offset 0x012fe ***/ + /*** Three byte table, leaf: e391xx - offset 0x0137f ***/ /* 80 */ 0x8139f533, 0x8139f534, 0x8139f535, 0x8139f536, /* 84 */ 0x8139f537, 0x8139f538, 0x8139f539, 0x0000fe56, @@ -1512,7 +1550,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8139fa37, 0x8139fa38, 0x8139fa39, 0x8139fb30, /* bc */ 0x8139fb31, 0x8139fb32, 0x8139fb33, 0x8139fb34, - /*** Three byte table, leaf: e392xx - offset 0x0133e ***/ + /*** Three byte table, leaf: e392xx - offset 0x013bf ***/ /* 80 */ 0x8139fb35, 0x8139fb36, 0x8139fb37, 0x8139fb38, /* 84 */ 0x8139fb39, 0x8139fc30, 0x8139fc31, 0x8139fc32, @@ -1531,7 +1569,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82308331, 0x82308332, 0x82308333, 0x82308334, /* bc */ 0x82308335, 0x82308336, 0x82308337, 0x82308338, - /*** Three byte table, leaf: e393xx - offset 0x0137e 
***/ + /*** Three byte table, leaf: e393xx - offset 0x013ff ***/ /* 80 */ 0x82308339, 0x82308430, 0x82308431, 0x82308432, /* 84 */ 0x82308433, 0x82308434, 0x82308435, 0x82308436, @@ -1550,7 +1588,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82308935, 0x82308936, 0x82308937, 0x82308938, /* bc */ 0x82308939, 0x82308a30, 0x82308a31, 0x82308a32, - /*** Three byte table, leaf: e394xx - offset 0x013be ***/ + /*** Three byte table, leaf: e394xx - offset 0x0143f ***/ /* 80 */ 0x82308a33, 0x82308a34, 0x82308a35, 0x82308a36, /* 84 */ 0x82308a37, 0x82308a38, 0x82308a39, 0x82308b30, @@ -1569,7 +1607,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82308f39, 0x82309030, 0x82309031, 0x82309032, /* bc */ 0x82309033, 0x82309034, 0x82309035, 0x82309036, - /*** Three byte table, leaf: e395xx - offset 0x013fe ***/ + /*** Three byte table, leaf: e395xx - offset 0x0147f ***/ /* 80 */ 0x82309037, 0x82309038, 0x82309039, 0x82309130, /* 84 */ 0x82309131, 0x82309132, 0x82309133, 0x82309134, @@ -1588,7 +1626,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82309633, 0x82309634, 0x82309635, 0x82309636, /* bc */ 0x82309637, 0x82309638, 0x82309639, 0x82309730, - /*** Three byte table, leaf: e396xx - offset 0x0143e ***/ + /*** Three byte table, leaf: e396xx - offset 0x014bf ***/ /* 80 */ 0x82309731, 0x82309732, 0x82309733, 0x82309734, /* 84 */ 0x82309735, 0x82309736, 0x82309737, 0x82309738, @@ -1607,7 +1645,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82309c36, 0x82309c37, 0x82309c38, 0x82309c39, /* bc */ 0x82309d30, 0x82309d31, 0x82309d32, 0x82309d33, - /*** Three byte table, leaf: e397xx - offset 0x0147e ***/ + /*** Three byte table, leaf: e397xx - offset 0x014ff ***/ /* 80 */ 0x82309d34, 0x82309d35, 0x82309d36, 0x82309d37, /* 84 */ 0x82309d38, 0x82309d39, 0x82309e30, 0x82309e31, @@ -1626,7 +1664,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8230a330, 0x8230a331, 0x8230a332, 0x8230a333, /* bc */ 0x8230a334, 0x8230a335, 0x8230a336, 0x8230a337, - /*** Three byte table, leaf: e398xx - offset 0x014be ***/ + /*** Three byte table, leaf: e398xx - offset 0x0153f ***/ /* 80 */ 0x8230a338, 0x8230a339, 0x8230a430, 0x8230a431, /* 84 */ 0x8230a432, 0x8230a433, 0x8230a434, 0x8230a435, @@ -1640,7 +1678,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* a4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 24 trailing zero values shared with next segment */ - /*** Three byte table, leaf: e3a4xx - offset 0x014e6 ***/ + /*** Three byte table, leaf: e3a4xx - offset 0x01567 ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -1659,7 +1697,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8230f539, 0x8230f630, 0x8230f631, 0x8230f632, /* bc */ 0x8230f633, 0x8230f634, 0x8230f635, 0x8230f636, - /*** Three byte table, leaf: e3a5xx - offset 0x01526 ***/ + /*** Three byte table, leaf: e3a5xx - offset 0x015a7 ***/ /* 80 */ 0x8230f637, 0x8230f638, 0x8230f639, 0x8230f730, /* 84 */ 0x8230f731, 0x8230f732, 0x8230f733, 0x8230f734, @@ -1678,7 +1716,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8230fc32, 0x8230fc33, 0x8230fc34, 0x8230fc35, /* bc */ 0x8230fc36, 0x8230fc37, 0x8230fc38, 0x8230fc39, - /*** Three byte table, leaf: e3a6xx - offset 0x01566 ***/ + /*** Three byte table, leaf: e3a6xx - offset 0x015e7 ***/ /* 80 */ 0x8230fd30, 0x8230fd31, 
0x8230fd32, 0x8230fd33, /* 84 */ 0x8230fd34, 0x8230fd35, 0x8230fd36, 0x8230fd37, @@ -1697,7 +1735,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82318436, 0x82318437, 0x82318438, 0x82318439, /* bc */ 0x82318530, 0x82318531, 0x82318532, 0x82318533, - /*** Three byte table, leaf: e3a7xx - offset 0x015a6 ***/ + /*** Three byte table, leaf: e3a7xx - offset 0x01627 ***/ /* 80 */ 0x82318534, 0x82318535, 0x82318536, 0x82318537, /* 84 */ 0x82318538, 0x82318539, 0x82318630, 0x82318631, @@ -1716,7 +1754,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82318a37, 0x82318a38, 0x82318a39, 0x82318b30, /* bc */ 0x82318b31, 0x82318b32, 0x82318b33, 0x82318b34, - /*** Three byte table, leaf: e3a8xx - offset 0x015e6 ***/ + /*** Three byte table, leaf: e3a8xx - offset 0x01667 ***/ /* 80 */ 0x82318b35, 0x82318b36, 0x82318b37, 0x82318b38, /* 84 */ 0x82318b39, 0x82318c30, 0x82318c31, 0x82318c32, @@ -1735,7 +1773,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82319131, 0x82319132, 0x82319133, 0x82319134, /* bc */ 0x82319135, 0x82319136, 0x82319137, 0x82319138, - /*** Three byte table, leaf: e3a9xx - offset 0x01626 ***/ + /*** Three byte table, leaf: e3a9xx - offset 0x016a7 ***/ /* 80 */ 0x82319139, 0x82319230, 0x82319231, 0x82319232, /* 84 */ 0x82319233, 0x82319234, 0x82319235, 0x82319236, @@ -1754,7 +1792,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82319734, 0x82319735, 0x82319736, 0x82319737, /* bc */ 0x82319738, 0x82319739, 0x82319830, 0x82319831, - /*** Three byte table, leaf: e3aaxx - offset 0x01666 ***/ + /*** Three byte table, leaf: e3aaxx - offset 0x016e7 ***/ /* 80 */ 0x82319832, 0x82319833, 0x82319834, 0x82319835, /* 84 */ 0x82319836, 0x82319837, 0x82319838, 0x82319839, @@ -1773,7 +1811,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82319d38, 0x82319d39, 0x82319e30, 0x82319e31, /* bc */ 0x82319e32, 0x82319e33, 0x82319e34, 0x82319e35, - /*** Three byte table, leaf: e3abxx - offset 0x016a6 ***/ + /*** Three byte table, leaf: e3abxx - offset 0x01727 ***/ /* 80 */ 0x82319e36, 0x82319e37, 0x82319e38, 0x82319e39, /* 84 */ 0x82319f30, 0x82319f31, 0x82319f32, 0x82319f33, @@ -1792,7 +1830,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8231a432, 0x8231a433, 0x8231a434, 0x8231a435, /* bc */ 0x8231a436, 0x8231a437, 0x8231a438, 0x8231a439, - /*** Three byte table, leaf: e3acxx - offset 0x016e6 ***/ + /*** Three byte table, leaf: e3acxx - offset 0x01767 ***/ /* 80 */ 0x8231a530, 0x8231a531, 0x8231a532, 0x8231a533, /* 84 */ 0x8231a534, 0x8231a535, 0x8231a536, 0x8231a537, @@ -1811,7 +1849,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8231aa36, 0x8231aa37, 0x8231aa38, 0x8231aa39, /* bc */ 0x8231ab30, 0x8231ab31, 0x8231ab32, 0x8231ab33, - /*** Three byte table, leaf: e3adxx - offset 0x01726 ***/ + /*** Three byte table, leaf: e3adxx - offset 0x017a7 ***/ /* 80 */ 0x8231ab34, 0x8231ab35, 0x8231ab36, 0x8231ab37, /* 84 */ 0x8231ab38, 0x8231ab39, 0x8231ac30, 0x8231ac31, @@ -1830,7 +1868,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8231b039, 0x8231b130, 0x8231b131, 0x8231b132, /* bc */ 0x8231b133, 0x8231b134, 0x8231b135, 0x8231b136, - /*** Three byte table, leaf: e3aexx - offset 0x01766 ***/ + /*** Three byte table, leaf: e3aexx - offset 0x017e7 ***/ /* 80 */ 0x8231b137, 0x8231b138, 0x8231b139, 0x8231b230, /* 84 */ 0x8231b231, 0x8231b232, 0x8231b233, 0x8231b234, @@ -1849,7 +1887,7 @@ 
static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8231b733, 0x8231b734, 0x8231b735, 0x8231b736, /* bc */ 0x8231b737, 0x8231b738, 0x8231b739, 0x8231b830, - /*** Three byte table, leaf: e3afxx - offset 0x017a6 ***/ + /*** Three byte table, leaf: e3afxx - offset 0x01827 ***/ /* 80 */ 0x8231b831, 0x8231b832, 0x8231b833, 0x8231b834, /* 84 */ 0x8231b835, 0x8231b836, 0x8231b837, 0x8231b838, @@ -1868,7 +1906,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8231bd37, 0x8231bd38, 0x8231bd39, 0x8231be30, /* bc */ 0x8231be31, 0x8231be32, 0x8231be33, 0x8231be34, - /*** Three byte table, leaf: e3b0xx - offset 0x017e6 ***/ + /*** Three byte table, leaf: e3b0xx - offset 0x01867 ***/ /* 80 */ 0x8231be35, 0x8231be36, 0x8231be37, 0x8231be38, /* 84 */ 0x8231be39, 0x8231bf30, 0x8231bf31, 0x8231bf32, @@ -1887,7 +1925,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8231c431, 0x8231c432, 0x8231c433, 0x8231c434, /* bc */ 0x8231c435, 0x8231c436, 0x8231c437, 0x8231c438, - /*** Three byte table, leaf: e3b1xx - offset 0x01826 ***/ + /*** Three byte table, leaf: e3b1xx - offset 0x018a7 ***/ /* 80 */ 0x8231c439, 0x8231c530, 0x8231c531, 0x8231c532, /* 84 */ 0x8231c533, 0x8231c534, 0x8231c535, 0x8231c536, @@ -1906,7 +1944,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8231ca34, 0x8231ca35, 0x8231ca36, 0x8231ca37, /* bc */ 0x8231ca38, 0x8231ca39, 0x8231cb30, 0x8231cb31, - /*** Three byte table, leaf: e3b2xx - offset 0x01866 ***/ + /*** Three byte table, leaf: e3b2xx - offset 0x018e7 ***/ /* 80 */ 0x8231cb32, 0x8231cb33, 0x8231cb34, 0x8231cb35, /* 84 */ 0x8231cb36, 0x8231cb37, 0x8231cb38, 0x8231cb39, @@ -1925,7 +1963,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8231d038, 0x8231d039, 0x8231d130, 0x8231d131, /* bc */ 0x8231d132, 0x8231d133, 0x8231d134, 0x8231d135, - /*** Three byte table, leaf: e3b3xx - offset 0x018a6 ***/ + /*** Three byte table, leaf: e3b3xx - offset 0x01927 ***/ /* 80 */ 0x8231d136, 0x8231d137, 0x8231d138, 0x8231d139, /* 84 */ 0x8231d230, 0x8231d231, 0x8231d232, 0x8231d233, @@ -1940,7 +1978,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* a8 */ 0x00000000, 0x00000000, /* 22 trailing zero values shared with next segment */ - /*** Three byte table, leaf: e481xx - offset 0x018d0 ***/ + /*** Three byte table, leaf: e481xx - offset 0x01951 ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -1959,7 +1997,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8232b236, 0x8232b237, 0x8232b238, 0x8232b239, /* bc */ 0x8232b330, 0x8232b331, 0x8232b332, 0x8232b333, - /*** Three byte table, leaf: e482xx - offset 0x01910 ***/ + /*** Three byte table, leaf: e482xx - offset 0x01991 ***/ /* 80 */ 0x8232b334, 0x8232b335, 0x8232b336, 0x8232b337, /* 84 */ 0x8232b338, 0x8232b339, 0x8232b430, 0x8232b431, @@ -1978,7 +2016,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8232b930, 0x8232b931, 0x8232b932, 0x8232b933, /* bc */ 0x8232b934, 0x8232b935, 0x8232b936, 0x8232b937, - /*** Three byte table, leaf: e483xx - offset 0x01950 ***/ + /*** Three byte table, leaf: e483xx - offset 0x019d1 ***/ /* 80 */ 0x8232b938, 0x8232b939, 0x8232ba30, 0x8232ba31, /* 84 */ 0x8232ba32, 0x8232ba33, 0x8232ba34, 0x8232ba35, @@ -1997,7 +2035,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8232bf34, 0x8232bf35, 0x8232bf36, 0x8232bf37, /* bc */ 
0x8232bf38, 0x8232bf39, 0x8232c030, 0x8232c031, - /*** Three byte table, leaf: e484xx - offset 0x01990 ***/ + /*** Three byte table, leaf: e484xx - offset 0x01a11 ***/ /* 80 */ 0x8232c032, 0x8232c033, 0x8232c034, 0x8232c035, /* 84 */ 0x8232c036, 0x8232c037, 0x8232c038, 0x8232c039, @@ -2016,7 +2054,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8232c538, 0x8232c539, 0x8232c630, 0x8232c631, /* bc */ 0x8232c632, 0x8232c633, 0x8232c634, 0x8232c635, - /*** Three byte table, leaf: e485xx - offset 0x019d0 ***/ + /*** Three byte table, leaf: e485xx - offset 0x01a51 ***/ /* 80 */ 0x8232c636, 0x8232c637, 0x8232c638, 0x8232c639, /* 84 */ 0x8232c730, 0x8232c731, 0x8232c732, 0x8232c733, @@ -2028,7 +2066,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* 9c */ 0x8232c934, 0x8232c935, 0x8232c936, 0x0000fe70, /* 32 trailing zero values shared with next segment */ - /*** Three byte table, leaf: e48cxx - offset 0x019f0 ***/ + /*** Three byte table, leaf: e48cxx - offset 0x01a71 ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -2047,7 +2085,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8232f838, 0x8232f839, 0x8232f930, 0x8232f931, /* bc */ 0x8232f932, 0x8232f933, 0x8232f934, 0x8232f935, - /*** Three byte table, leaf: e48dxx - offset 0x01a30 ***/ + /*** Three byte table, leaf: e48dxx - offset 0x01ab1 ***/ /* 80 */ 0x8232f936, 0x8232f937, 0x8232f938, 0x8232f939, /* 84 */ 0x8232fa30, 0x8232fa31, 0x8232fa32, 0x8232fa33, @@ -2066,7 +2104,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82338132, 0x82338133, 0x82338134, 0x82338135, /* bc */ 0x82338136, 0x82338137, 0x82338138, 0x82338139, - /*** Three byte table, leaf: e48exx - offset 0x01a70 ***/ + /*** Three byte table, leaf: e48exx - offset 0x01af1 ***/ /* 80 */ 0x82338230, 0x82338231, 0x82338232, 0x82338233, /* 84 */ 0x82338234, 0x82338235, 0x82338236, 0x82338237, @@ -2085,7 +2123,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82338734, 0x82338735, 0x82338736, 0x82338737, /* bc */ 0x82338738, 0x82338739, 0x82338830, 0x82338831, - /*** Three byte table, leaf: e48fxx - offset 0x01ab0 ***/ + /*** Three byte table, leaf: e48fxx - offset 0x01b31 ***/ /* 80 */ 0x82338832, 0x82338833, 0x82338834, 0x82338835, /* 84 */ 0x82338836, 0x82338837, 0x82338838, 0x82338839, @@ -2104,7 +2142,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82338d37, 0x82338d38, 0x82338d39, 0x82338e30, /* bc */ 0x82338e31, 0x82338e32, 0x82338e33, 0x82338e34, - /*** Three byte table, leaf: e490xx - offset 0x01af0 ***/ + /*** Three byte table, leaf: e490xx - offset 0x01b71 ***/ /* 80 */ 0x82338e35, 0x82338e36, 0x82338e37, 0x82338e38, /* 84 */ 0x82338e39, 0x82338f30, 0x82338f31, 0x82338f32, @@ -2123,7 +2161,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82339431, 0x82339432, 0x82339433, 0x82339434, /* bc */ 0x82339435, 0x82339436, 0x82339437, 0x82339438, - /*** Three byte table, leaf: e491xx - offset 0x01b30 ***/ + /*** Three byte table, leaf: e491xx - offset 0x01bb1 ***/ /* 80 */ 0x82339439, 0x82339530, 0x82339531, 0x82339532, /* 84 */ 0x82339533, 0x82339534, 0x82339535, 0x82339536, @@ -2142,7 +2180,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82339a35, 0x82339a36, 0x82339a37, 0x82339a38, /* bc */ 0x82339a39, 0x82339b30, 0x82339b31, 0x82339b32, - /*** Three byte table, leaf: e492xx - offset 0x01b70 ***/ 
+ /*** Three byte table, leaf: e492xx - offset 0x01bf1 ***/ /* 80 */ 0x82339b33, 0x82339b34, 0x82339b35, 0x82339b36, /* 84 */ 0x82339b37, 0x82339b38, 0x82339b39, 0x82339c30, @@ -2161,7 +2199,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8233a039, 0x8233a130, 0x8233a131, 0x8233a132, /* bc */ 0x8233a133, 0x8233a134, 0x8233a135, 0x8233a136, - /*** Three byte table, leaf: e493xx - offset 0x01bb0 ***/ + /*** Three byte table, leaf: e493xx - offset 0x01c31 ***/ /* 80 */ 0x8233a137, 0x8233a138, 0x8233a139, 0x8233a230, /* 84 */ 0x8233a231, 0x8233a232, 0x8233a233, 0x8233a234, @@ -2178,7 +2216,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 12 trailing zero values shared with next segment */ - /*** Three byte table, leaf: e499xx - offset 0x01be4 ***/ + /*** Three byte table, leaf: e499xx - offset 0x01c65 ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -2197,7 +2235,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8233cd34, 0x8233cd35, 0x8233cd36, 0x8233cd37, /* bc */ 0x8233cd38, 0x8233cd39, 0x8233ce30, 0x8233ce31, - /*** Three byte table, leaf: e49axx - offset 0x01c24 ***/ + /*** Three byte table, leaf: e49axx - offset 0x01ca5 ***/ /* 80 */ 0x8233ce32, 0x8233ce33, 0x8233ce34, 0x8233ce35, /* 84 */ 0x8233ce36, 0x8233ce37, 0x8233ce38, 0x8233ce39, @@ -2216,7 +2254,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8233d338, 0x8233d339, 0x8233d430, 0x8233d431, /* bc */ 0x8233d432, 0x8233d433, 0x8233d434, 0x8233d435, - /*** Three byte table, leaf: e49bxx - offset 0x01c64 ***/ + /*** Three byte table, leaf: e49bxx - offset 0x01ce5 ***/ /* 80 */ 0x8233d436, 0x8233d437, 0x8233d438, 0x8233d439, /* 84 */ 0x8233d530, 0x8233d531, 0x8233d532, 0x8233d533, @@ -2235,7 +2273,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8233da32, 0x8233da33, 0x8233da34, 0x8233da35, /* bc */ 0x8233da36, 0x8233da37, 0x8233da38, 0x8233da39, - /*** Three byte table, leaf: e49cxx - offset 0x01ca4 ***/ + /*** Three byte table, leaf: e49cxx - offset 0x01d25 ***/ /* 80 */ 0x8233db30, 0x8233db31, 0x8233db32, 0x8233db33, /* 84 */ 0x8233db34, 0x8233db35, 0x8233db36, 0x8233db37, @@ -2254,7 +2292,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8233e034, 0x8233e035, 0x8233e036, 0x8233e037, /* bc */ 0x8233e038, 0x8233e039, 0x8233e130, 0x8233e131, - /*** Three byte table, leaf: e49dxx - offset 0x01ce4 ***/ + /*** Three byte table, leaf: e49dxx - offset 0x01d65 ***/ /* 80 */ 0x8233e132, 0x8233e133, 0x8233e134, 0x8233e135, /* 84 */ 0x8233e136, 0x8233e137, 0x8233e138, 0x8233e139, @@ -2273,7 +2311,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8233e638, 0x8233e639, 0x8233e730, 0x8233e731, /* bc */ 0x0000fe82, 0x8233e732, 0x8233e733, 0x8233e734, - /*** Three byte table, leaf: e49exx - offset 0x01d24 ***/ + /*** Three byte table, leaf: e49exx - offset 0x01da5 ***/ /* 80 */ 0x8233e735, 0x8233e736, 0x8233e737, 0x8233e738, /* 84 */ 0x8233e739, 0x8233e830, 0x8233e831, 0x8233e832, @@ -2292,7 +2330,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00000000, /* 7 trailing zero values shared with next segment */ - /*** Three byte table, leaf: e4a5xx - offset 0x01d5d ***/ + /*** Three byte table, leaf: e4a5xx - offset 0x01dde ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 
0x00000000, 0x00000000, 0x0000fe85, @@ -2311,7 +2349,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82349b37, 0x82349b38, 0x0000fe86, 0x82349b39, /* bc */ 0x82349c30, 0x0000fe87, 0x82349c31, 0x82349c32, - /*** Three byte table, leaf: e4a6xx - offset 0x01d9d ***/ + /*** Three byte table, leaf: e4a6xx - offset 0x01e1e ***/ /* 80 */ 0x82349c33, 0x82349c34, 0x0000fe88, 0x0000fe89, /* 84 */ 0x82349c35, 0x0000fe8a, 0x0000fe8b, 0x82349c36, @@ -2329,7 +2367,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b4 */ 0x8234a039, 0x8234a130, 0x0000fe8f, 0x0000fe8e, /* 8 trailing zero values shared with next segment */ - /*** Three byte table, leaf: e4b1xx - offset 0x01dd5 ***/ + /*** Three byte table, leaf: e4b1xx - offset 0x01e56 ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -2348,7 +2386,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8234e734, 0x8234e735, 0x8234e736, 0x8234e737, /* bc */ 0x8234e738, 0x8234e739, 0x8234e830, 0x8234e831, - /*** Three byte table, leaf: e4b2xx - offset 0x01e15 ***/ + /*** Three byte table, leaf: e4b2xx - offset 0x01e96 ***/ /* 80 */ 0x8234e832, 0x8234e833, 0x8234e834, 0x8234e835, /* 84 */ 0x8234e836, 0x8234e837, 0x8234e838, 0x8234e839, @@ -2367,7 +2405,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8234ed33, 0x8234ed34, 0x8234ed35, 0x8234ed36, /* bc */ 0x8234ed37, 0x8234ed38, 0x8234ed39, 0x8234ee30, - /*** Three byte table, leaf: e4b3xx - offset 0x01e55 ***/ + /*** Three byte table, leaf: e4b3xx - offset 0x01ed6 ***/ /* 80 */ 0x8234ee31, 0x8234ee32, 0x8234ee33, 0x8234ee34, /* 84 */ 0x8234ee35, 0x8234ee36, 0x8234ee37, 0x8234ee38, @@ -2386,7 +2424,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8234f337, 0x8234f338, 0x8234f339, 0x8234f430, /* bc */ 0x8234f431, 0x8234f432, 0x8234f433, 0x8234f434, - /*** Three byte table, leaf: e4b4xx - offset 0x01e95 ***/ + /*** Three byte table, leaf: e4b4xx - offset 0x01f16 ***/ /* 80 */ 0x8234f435, 0x8234f436, 0x8234f437, 0x8234f438, /* 84 */ 0x8234f439, 0x8234f530, 0x8234f531, 0x8234f532, @@ -2405,7 +2443,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x8234f934, 0x8234f935, 0x8234f936, 0x8234f937, /* bc */ 0x8234f938, 0x8234f939, 0x8234fa30, 0x8234fa31, - /*** Three byte table, leaf: e4b5xx - offset 0x01ed5 ***/ + /*** Three byte table, leaf: e4b5xx - offset 0x01f56 ***/ /* 80 */ 0x8234fa32, 0x8234fa33, 0x8234fa34, 0x8234fa35, /* 84 */ 0x8234fa36, 0x8234fa37, 0x8234fa38, 0x8234fa39, @@ -2424,7 +2462,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82358138, 0x82358139, 0x82358230, 0x82358231, /* bc */ 0x82358232, 0x82358233, 0x82358234, 0x82358235, - /*** Three byte table, leaf: e4b6xx - offset 0x01f15 ***/ + /*** Three byte table, leaf: e4b6xx - offset 0x01f96 ***/ /* 80 */ 0x82358236, 0x82358237, 0x82358238, 0x82358239, /* 84 */ 0x82358330, 0x82358331, 0x82358332, 0x82358333, @@ -2443,7 +2481,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82358831, 0x82358832, 0x82358833, 0x82358834, /* bc */ 0x82358835, 0x82358836, 0x82358837, 0x82358838, - /*** Three byte table, leaf: e4b7xx - offset 0x01f55 ***/ + /*** Three byte table, leaf: e4b7xx - offset 0x01fd6 ***/ /* 80 */ 0x82358839, 0x82358930, 0x82358931, 0x82358932, /* 84 */ 0x82358933, 0x82358934, 0x82358935, 0x82358936, @@ -2462,7 +2500,7 @@ static const uint32 
gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x82358e35, 0x82358e36, 0x82358e37, 0x82358e38, /* bc */ 0x82358e39, 0x82358f30, 0x82358f31, 0x82358f32, - /*** Three byte table, leaf: e4b8xx - offset 0x01f95 ***/ + /*** Three byte table, leaf: e4b8xx - offset 0x02016 ***/ /* 80 */ 0x0000d2bb, 0x0000b6a1, 0x00008140, 0x0000c6df, /* 84 */ 0x00008141, 0x00008142, 0x00008143, 0x0000cdf2, @@ -2481,7 +2519,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000cde8, 0x0000b5a4, 0x0000ceaa, 0x0000d6f7, /* bc */ 0x00008153, 0x0000c0f6, 0x0000bed9, 0x0000d8af, - /*** Three byte table, leaf: e4b9xx - offset 0x01fd5 ***/ + /*** Three byte table, leaf: e4b9xx - offset 0x02056 ***/ /* 80 */ 0x00008154, 0x00008155, 0x00008156, 0x0000c4cb, /* 84 */ 0x00008157, 0x0000bec3, 0x00008158, 0x0000d8b1, @@ -2500,7 +2538,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008170, 0x00008171, 0x00008172, 0x00008173, /* bc */ 0x00008174, 0x00008175, 0x0000c7ac, 0x00008176, - /*** Three byte table, leaf: e4baxx - offset 0x02015 ***/ + /*** Three byte table, leaf: e4baxx - offset 0x02096 ***/ /* 80 */ 0x00008177, 0x00008178, 0x00008179, 0x0000817a, /* 84 */ 0x0000817b, 0x0000817c, 0x0000c1cb, 0x0000817d, @@ -2519,7 +2557,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000818f, 0x00008190, 0x0000c8cb, 0x0000d8e9, /* bc */ 0x00008191, 0x00008192, 0x00008193, 0x0000d2da, - /*** Three byte table, leaf: e4bbxx - offset 0x02055 ***/ + /*** Three byte table, leaf: e4bbxx - offset 0x020d6 ***/ /* 80 */ 0x0000cab2, 0x0000c8ca, 0x0000d8ec, 0x0000d8ea, /* 84 */ 0x0000d8c6, 0x0000bdf6, 0x0000c6cd, 0x0000b3f0, @@ -2538,7 +2576,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000081a6, 0x000081a7, 0x000081a8, 0x0000c8ce, /* bc */ 0x000081a9, 0x0000b7dd, 0x000081aa, 0x0000b7c2, - /*** Three byte table, leaf: e4bcxx - offset 0x02095 ***/ + /*** Three byte table, leaf: e4bcxx - offset 0x02116 ***/ /* 80 */ 0x000081ab, 0x0000c6f3, 0x000081ac, 0x000081ad, /* 84 */ 0x000081ae, 0x000081af, 0x000081b0, 0x000081b1, @@ -2557,7 +2595,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000c9ec, 0x000081c7, 0x0000cbc5, 0x000081c8, /* bc */ 0x0000cbc6, 0x0000d9a4, 0x000081c9, 0x000081ca, - /*** Three byte table, leaf: e4bdxx - offset 0x020d5 ***/ + /*** Three byte table, leaf: e4bdxx - offset 0x02156 ***/ /* 80 */ 0x000081cb, 0x000081cc, 0x000081cd, 0x0000b5e8, /* 84 */ 0x000081ce, 0x000081cf, 0x0000b5ab, 0x000081d0, @@ -2576,7 +2614,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000081e5, 0x000081e6, 0x000081e7, 0x0000d9ac, /* bc */ 0x0000d9ae, 0x000081e8, 0x0000d9ab, 0x0000cab9, - /*** Three byte table, leaf: e4bexx - offset 0x02115 ***/ + /*** Three byte table, leaf: e4bexx - offset 0x02196 ***/ /* 80 */ 0x000081e9, 0x000081ea, 0x000081eb, 0x0000d9a9, /* 84 */ 0x0000d6b6, 0x000081ec, 0x000081ed, 0x000081ee, @@ -2595,7 +2633,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000824a, 0x0000824b, 0x0000824c, 0x0000824d, /* bc */ 0x0000824e, 0x0000824f, 0x00008250, 0x0000b1e3, - /*** Three byte table, leaf: e4bfxx - offset 0x02155 ***/ + /*** Three byte table, leaf: e4bfxx - offset 0x021d6 ***/ /* 80 */ 0x00008251, 0x00008252, 0x00008253, 0x0000b4d9, /* 84 */ 0x0000b6ed, 0x0000d9b4, 0x00008254, 0x00008255, @@ -2614,7 +2652,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000d9ba, 0x0000826f, 0x0000b0b3, 0x00008270, /* bc */ 
0x00008271, 0x00008272, 0x0000d9c2, 0x00008273, - /*** Three byte table, leaf: e580xx - offset 0x02195 ***/ + /*** Three byte table, leaf: e580xx - offset 0x02216 ***/ /* 80 */ 0x00008274, 0x00008275, 0x00008276, 0x00008277, /* 84 */ 0x00008278, 0x00008279, 0x0000827a, 0x0000827b, @@ -2633,7 +2671,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000829a, 0x0000829b, 0x0000d5ae, 0x0000829c, /* bc */ 0x0000d6b5, 0x0000829d, 0x0000c7e3, 0x0000829e, - /*** Three byte table, leaf: e581xx - offset 0x021d5 ***/ + /*** Three byte table, leaf: e581xx - offset 0x02256 ***/ /* 80 */ 0x0000829f, 0x000082a0, 0x000082a1, 0x0000d9c8, /* 84 */ 0x000082a2, 0x000082a3, 0x000082a4, 0x0000bcd9, @@ -2652,7 +2690,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000082ca, 0x000082cb, 0x000082cc, 0x0000d9cd, /* bc */ 0x000082cd, 0x000082ce, 0x0000d9c7, 0x0000b3a5, - /*** Three byte table, leaf: e582xx - offset 0x02215 ***/ + /*** Three byte table, leaf: e582xx - offset 0x02296 ***/ /* 80 */ 0x0000bffe, 0x000082cf, 0x000082d0, 0x000082d1, /* 84 */ 0x000082d2, 0x0000b8b5, 0x000082d3, 0x000082d4, @@ -2671,7 +2709,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000082fc, 0x000082fd, 0x0000d9d1, 0x0000c9b5, /* bc */ 0x000082fe, 0x00008340, 0x00008341, 0x00008342, - /*** Three byte table, leaf: e583xx - offset 0x02255 ***/ + /*** Three byte table, leaf: e583xx - offset 0x022d6 ***/ /* 80 */ 0x00008343, 0x00008344, 0x00008345, 0x00008346, /* 84 */ 0x00008347, 0x00008348, 0x00008349, 0x0000834a, @@ -2690,7 +2728,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008371, 0x00008372, 0x00008373, 0x0000c6a7, /* bc */ 0x00008374, 0x00008375, 0x00008376, 0x00008377, - /*** Three byte table, leaf: e584xx - offset 0x02295 ***/ + /*** Three byte table, leaf: e584xx - offset 0x02316 ***/ /* 80 */ 0x00008378, 0x00008379, 0x0000837a, 0x0000837b, /* 84 */ 0x0000837c, 0x0000837d, 0x0000d9d3, 0x0000d9d8, @@ -2709,7 +2747,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000083ac, 0x000083ad, 0x000083ae, 0x000083af, /* bc */ 0x000083b0, 0x000083b1, 0x000083b2, 0x0000b6f9, - /*** Three byte table, leaf: e585xx - offset 0x022d5 ***/ + /*** Three byte table, leaf: e585xx - offset 0x02356 ***/ /* 80 */ 0x0000d8a3, 0x0000d4ca, 0x000083b3, 0x0000d4aa, /* 84 */ 0x0000d0d6, 0x0000b3e4, 0x0000d5d7, 0x000083b4, @@ -2728,7 +2766,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000b5e4, 0x0000d7c8, 0x000083cd, 0x0000d1f8, /* bc */ 0x0000bce6, 0x0000cade, 0x000083ce, 0x000083cf, - /*** Three byte table, leaf: e586xx - offset 0x02315 ***/ + /*** Three byte table, leaf: e586xx - offset 0x02396 ***/ /* 80 */ 0x0000bcbd, 0x0000d9e6, 0x0000d8e7, 0x000083d0, /* 84 */ 0x000083d1, 0x0000c4da, 0x000083d2, 0x000083d3, @@ -2747,7 +2785,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000083eb, 0x000083ec, 0x000083ed, 0x0000b6b3, /* bc */ 0x0000d9fe, 0x0000d9fd, 0x000083ee, 0x000083ef, - /*** Three byte table, leaf: e587xx - offset 0x02355 ***/ + /*** Three byte table, leaf: e587xx - offset 0x023d6 ***/ /* 80 */ 0x0000bebb, 0x000083f0, 0x000083f1, 0x000083f2, /* 84 */ 0x0000c6e0, 0x000083f3, 0x0000d7bc, 0x0000daa1, @@ -2766,7 +2804,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000cdb9, 0x0000b0bc, 0x0000b3f6, 0x0000bbf7, /* bc */ 0x0000dbca, 0x0000baaf, 0x00008454, 0x0000d4e4, - /*** Three byte table, leaf: e588xx - offset 0x02395 
***/ + /*** Three byte table, leaf: e588xx - offset 0x02416 ***/ /* 80 */ 0x0000b5b6, 0x0000b5f3, 0x0000d8d6, 0x0000c8d0, /* 84 */ 0x00008455, 0x00008456, 0x0000b7d6, 0x0000c7d0, @@ -2785,7 +2823,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000c8af, 0x0000c9b2, 0x0000b4cc, 0x0000bfcc, /* bc */ 0x0000846f, 0x0000b9f4, 0x00008470, 0x0000d8db, - /*** Three byte table, leaf: e589xx - offset 0x023d5 ***/ + /*** Three byte table, leaf: e589xx - offset 0x02456 ***/ /* 80 */ 0x0000d8dc, 0x0000b6e7, 0x0000bcc1, 0x0000ccea, /* 84 */ 0x00008471, 0x00008472, 0x00008473, 0x00008474, @@ -2804,7 +2842,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008496, 0x00008497, 0x00008498, 0x00008499, /* bc */ 0x0000849a, 0x0000d8e2, 0x0000849b, 0x0000bdcb, - /*** Three byte table, leaf: e58axx - offset 0x02415 ***/ + /*** Three byte table, leaf: e58axx - offset 0x02496 ***/ /* 80 */ 0x0000849c, 0x0000d8e4, 0x0000d8e3, 0x0000849d, /* 84 */ 0x0000849e, 0x0000849f, 0x000084a0, 0x000084a1, @@ -2823,7 +2861,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000084be, 0x000084bf, 0x000084c0, 0x000084c1, /* bc */ 0x000084c2, 0x000084c3, 0x0000dbc0, 0x0000cac6, - /*** Three byte table, leaf: e58bxx - offset 0x02455 ***/ + /*** Three byte table, leaf: e58bxx - offset 0x024d6 ***/ /* 80 */ 0x000084c4, 0x000084c5, 0x000084c6, 0x0000b2aa, /* 84 */ 0x000084c7, 0x000084c8, 0x000084c9, 0x0000d3c2, @@ -2842,7 +2880,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000084f1, 0x0000d9e8, 0x0000c9d7, 0x000084f2, /* bc */ 0x000084f3, 0x000084f4, 0x0000b9b4, 0x0000cef0, - /*** Three byte table, leaf: e58cxx - offset 0x02495 ***/ + /*** Three byte table, leaf: e58cxx - offset 0x02516 ***/ /* 80 */ 0x0000d4c8, 0x000084f5, 0x000084f6, 0x000084f7, /* 84 */ 0x000084f8, 0x0000b0fc, 0x0000b4d2, 0x000084f9, @@ -2861,7 +2899,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000855b, 0x0000c6a5, 0x0000c7f8, 0x0000d2bd, /* bc */ 0x0000855c, 0x0000855d, 0x0000d8d2, 0x0000c4e4, - /*** Three byte table, leaf: e58dxx - offset 0x024d5 ***/ + /*** Three byte table, leaf: e58dxx - offset 0x02556 ***/ /* 80 */ 0x0000855e, 0x0000caae, 0x0000855f, 0x0000c7a7, /* 84 */ 0x00008560, 0x0000d8a6, 0x00008561, 0x0000c9fd, @@ -2880,7 +2918,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000d0b6, 0x00008572, 0x0000dae1, 0x00008573, /* bc */ 0x00008574, 0x00008575, 0x00008576, 0x0000c7e4, - /*** Three byte table, leaf: e58exx - offset 0x02515 ***/ + /*** Three byte table, leaf: e58exx - offset 0x02596 ***/ /* 80 */ 0x00008577, 0x00008578, 0x0000b3a7, 0x00008579, /* 84 */ 0x0000b6f2, 0x0000ccfc, 0x0000c0fa, 0x0000857a, @@ -2899,7 +2937,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000859b, 0x0000859c, 0x0000859d, 0x0000c8a5, /* bc */ 0x0000859e, 0x0000859f, 0x000085a0, 0x0000cfd8, - /*** Three byte table, leaf: e58fxx - offset 0x02555 ***/ + /*** Three byte table, leaf: e58fxx - offset 0x025d6 ***/ /* 80 */ 0x000085a1, 0x0000c8fe, 0x0000b2ce, 0x000085a2, /* 84 */ 0x000085a3, 0x000085a4, 0x000085a5, 0x000085a6, @@ -2918,7 +2956,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000cbbe, 0x0000ccbe, 0x000085b5, 0x0000dfb7, /* bc */ 0x0000b5f0, 0x0000dfb4, 0x000085b6, 0x000085b7, - /*** Three byte table, leaf: e590xx - offset 0x02595 ***/ + /*** Three byte table, leaf: e590xx - offset 0x02616 ***/ /* 80 */ 0x000085b8, 0x0000d3f5, 
0x000085b9, 0x0000b3d4, /* 84 */ 0x0000b8f7, 0x000085ba, 0x0000dfba, 0x000085bb, @@ -2937,7 +2975,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000cefc, 0x0000b4b5, 0x000085ca, 0x0000cec7, /* bc */ 0x0000baf0, 0x000085cb, 0x0000cee1, 0x000085cc, - /*** Three byte table, leaf: e591xx - offset 0x025d5 ***/ + /*** Three byte table, leaf: e591xx - offset 0x02656 ***/ /* 80 */ 0x0000d1bd, 0x000085cd, 0x000085ce, 0x0000dfc0, /* 84 */ 0x000085cf, 0x000085d0, 0x0000b4f4, 0x000085d1, @@ -2956,7 +2994,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000c5de, 0x000085ea, 0x000085eb, 0x0000c9eb, /* bc */ 0x0000baf4, 0x0000c3fc, 0x000085ec, 0x000085ed, - /*** Three byte table, leaf: e592xx - offset 0x02615 ***/ + /*** Three byte table, leaf: e592xx - offset 0x02696 ***/ /* 80 */ 0x0000bed7, 0x000085ee, 0x0000dfc6, 0x000085ef, /* 84 */ 0x0000dfcd, 0x000085f0, 0x0000c5d8, 0x000085f1, @@ -2975,7 +3013,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000cfcc, 0x00008648, 0x00008649, 0x0000dfdd, /* bc */ 0x0000864a, 0x0000d1ca, 0x0000864b, 0x0000dfde, - /*** Three byte table, leaf: e593xx - offset 0x02655 ***/ + /*** Three byte table, leaf: e593xx - offset 0x026d6 ***/ /* 80 */ 0x0000b0a7, 0x0000c6b7, 0x0000dfd3, 0x0000864c, /* 84 */ 0x0000bae5, 0x0000864d, 0x0000b6df, 0x0000cddb, @@ -2994,7 +3032,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008661, 0x00008662, 0x0000b2b8, 0x00008663, /* bc */ 0x0000badf, 0x0000dfec, 0x00008664, 0x0000dbc1, - /*** Three byte table, leaf: e594xx - offset 0x02695 ***/ + /*** Three byte table, leaf: e594xx - offset 0x02716 ***/ /* 80 */ 0x00008665, 0x0000d1e4, 0x00008666, 0x00008667, /* 84 */ 0x00008668, 0x00008669, 0x0000cbf4, 0x0000b4bd, @@ -3013,7 +3051,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008688, 0x00008689, 0x0000868a, 0x0000868b, /* bc */ 0x0000dffe, 0x0000868c, 0x0000cdd9, 0x0000dffc, - /*** Three byte table, leaf: e595xx - offset 0x026d5 ***/ + /*** Three byte table, leaf: e595xx - offset 0x02756 ***/ /* 80 */ 0x0000868d, 0x0000dffa, 0x0000868e, 0x0000bfd0, /* 84 */ 0x0000d7c4, 0x0000868f, 0x0000c9cc, 0x00008690, @@ -3032,7 +3070,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000d0a5, 0x000086af, 0x000086b0, 0x0000e0b4, /* bc */ 0x0000cce4, 0x000086b1, 0x0000e0b1, 0x000086b2, - /*** Three byte table, leaf: e596xx - offset 0x02715 ***/ + /*** Three byte table, leaf: e596xx - offset 0x02796 ***/ /* 80 */ 0x0000bfa6, 0x0000e0af, 0x0000ceb9, 0x0000e0ab, /* 84 */ 0x0000c9c6, 0x000086b3, 0x000086b4, 0x0000c0ae, @@ -3051,7 +3089,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000086d4, 0x0000e0ad, 0x000086d5, 0x0000d3f7, /* bc */ 0x000086d6, 0x0000e0b6, 0x0000e0b7, 0x000086d7, - /*** Three byte table, leaf: e597xx - offset 0x02755 ***/ + /*** Three byte table, leaf: e597xx - offset 0x027d6 ***/ /* 80 */ 0x000086d8, 0x000086d9, 0x000086da, 0x000086db, /* 84 */ 0x0000e0c4, 0x0000d0e1, 0x000086dc, 0x000086dd, @@ -3070,7 +3108,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000086f5, 0x000086f6, 0x000086f7, 0x000086f8, /* bc */ 0x000086f9, 0x0000cbd4, 0x0000e0d5, 0x000086fa, - /*** Three byte table, leaf: e598xx - offset 0x02795 ***/ + /*** Three byte table, leaf: e598xx - offset 0x02816 ***/ /* 80 */ 0x0000e0d6, 0x0000e0d2, 0x000086fb, 0x000086fc, /* 84 */ 0x000086fd, 0x000086fe, 0x00008740, 0x00008741, @@ -3089,7 +3127,7 @@ 
static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008760, 0x0000e0da, 0x00008761, 0x0000cefb, /* bc */ 0x00008762, 0x00008763, 0x00008764, 0x0000bad9, - /*** Three byte table, leaf: e599xx - offset 0x027d5 ***/ + /*** Three byte table, leaf: e599xx - offset 0x02856 ***/ /* 80 */ 0x00008765, 0x00008766, 0x00008767, 0x00008768, /* 84 */ 0x00008769, 0x0000876a, 0x0000876b, 0x0000876c, @@ -3108,7 +3146,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000878d, 0x0000878e, 0x0000878f, 0x0000e0e7, /* bc */ 0x0000e0e8, 0x00008790, 0x00008791, 0x00008792, - /*** Three byte table, leaf: e59axx - offset 0x02815 ***/ + /*** Three byte table, leaf: e59axx - offset 0x02896 ***/ /* 80 */ 0x00008793, 0x00008794, 0x00008795, 0x00008796, /* 84 */ 0x00008797, 0x0000e0e9, 0x0000e0e3, 0x00008798, @@ -3127,7 +3165,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000087c3, 0x000087c4, 0x000087c5, 0x000087c6, /* bc */ 0x0000bdc0, 0x000087c7, 0x000087c8, 0x000087c9, - /*** Three byte table, leaf: e59bxx - offset 0x02855 ***/ + /*** Three byte table, leaf: e59bxx - offset 0x028d6 ***/ /* 80 */ 0x000087ca, 0x000087cb, 0x000087cc, 0x000087cd, /* 84 */ 0x000087ce, 0x000087cf, 0x000087d0, 0x000087d1, @@ -3146,7 +3184,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000087f0, 0x0000e0f2, 0x0000b9cc, 0x000087f1, /* bc */ 0x000087f2, 0x0000b9fa, 0x0000cdbc, 0x0000e0f3, - /*** Three byte table, leaf: e59cxx - offset 0x02895 ***/ + /*** Three byte table, leaf: e59cxx - offset 0x02916 ***/ /* 80 */ 0x000087f3, 0x000087f4, 0x000087f5, 0x0000c6d4, /* 84 */ 0x0000e0f4, 0x000087f6, 0x0000d4b2, 0x000087f7, @@ -3165,7 +3203,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000885a, 0x0000dbdb, 0x0000b3a1, 0x0000dbdf, /* bc */ 0x0000885b, 0x0000885c, 0x0000bbf8, 0x0000885d, - /*** Three byte table, leaf: e59dxx - offset 0x028d5 ***/ + /*** Three byte table, leaf: e59dxx - offset 0x02956 ***/ /* 80 */ 0x0000d6b7, 0x0000885e, 0x0000dbe0, 0x0000885f, /* 84 */ 0x00008860, 0x00008861, 0x00008862, 0x0000bef9, @@ -3184,7 +3222,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008878, 0x00008879, 0x0000887a, 0x0000dbe6, /* bc */ 0x0000dbe5, 0x0000887b, 0x0000887c, 0x0000887d, - /*** Three byte table, leaf: e59exx - offset 0x02915 ***/ + /*** Three byte table, leaf: e59exx - offset 0x02996 ***/ /* 80 */ 0x0000887e, 0x00008880, 0x0000b4b9, 0x0000c0ac, /* 84 */ 0x0000c2a2, 0x0000dbe2, 0x0000dbe4, 0x00008881, @@ -3203,7 +3241,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000dbf9, 0x000088a0, 0x000088a1, 0x000088a2, /* bc */ 0x000088a3, 0x000088a4, 0x000088a5, 0x000088a6, - /*** Three byte table, leaf: e59fxx - offset 0x02955 ***/ + /*** Three byte table, leaf: e59fxx - offset 0x029d6 ***/ /* 80 */ 0x000088a7, 0x000088a8, 0x0000b9a1, 0x0000b0a3, /* 84 */ 0x000088a9, 0x000088aa, 0x000088ab, 0x000088ac, @@ -3222,7 +3260,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000dbfc, 0x0000c5e0, 0x0000bbf9, 0x000088cd, /* bc */ 0x000088ce, 0x0000dca3, 0x000088cf, 0x000088d0, - /*** Three byte table, leaf: e5a0xx - offset 0x02995 ***/ + /*** Three byte table, leaf: e5a0xx - offset 0x02a16 ***/ /* 80 */ 0x0000dca5, 0x000088d1, 0x0000ccc3, 0x000088d2, /* 84 */ 0x000088d3, 0x000088d4, 0x0000b6d1, 0x0000ddc0, @@ -3241,7 +3279,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000088f9, 0x000088fa, 0x000088fb, 
0x000088fc, /* bc */ 0x000088fd, 0x000088fe, 0x00008940, 0x00008941, - /*** Three byte table, leaf: e5a1xx - offset 0x029d5 ***/ + /*** Three byte table, leaf: e5a1xx - offset 0x02a56 ***/ /* 80 */ 0x00008942, 0x00008943, 0x00008944, 0x00008945, /* 84 */ 0x0000dca8, 0x00008946, 0x00008947, 0x00008948, @@ -3260,7 +3298,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008970, 0x00008971, 0x00008972, 0x00008973, /* bc */ 0x00008974, 0x00008975, 0x0000dbd3, 0x00008976, - /*** Three byte table, leaf: e5a2xx - offset 0x02a15 ***/ + /*** Three byte table, leaf: e5a2xx - offset 0x02a96 ***/ /* 80 */ 0x0000dcaf, 0x0000dcac, 0x00008977, 0x0000beb3, /* 84 */ 0x00008978, 0x0000cafb, 0x00008979, 0x0000897a, @@ -3279,7 +3317,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000089a3, 0x000089a4, 0x000089a5, 0x000089a6, /* bc */ 0x0000dbd4, 0x000089a7, 0x000089a8, 0x000089a9, - /*** Three byte table, leaf: e5a3xx - offset 0x02a55 ***/ + /*** Three byte table, leaf: e5a3xx - offset 0x02ad6 ***/ /* 80 */ 0x000089aa, 0x0000b1da, 0x000089ab, 0x000089ac, /* 84 */ 0x000089ad, 0x0000dbd5, 0x000089ae, 0x000089af, @@ -3298,7 +3336,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000089d7, 0x0000d2bc, 0x000089d8, 0x000089d9, /* bc */ 0x000089da, 0x000089db, 0x000089dc, 0x000089dd, - /*** Three byte table, leaf: e5a4xx - offset 0x02a95 ***/ + /*** Three byte table, leaf: e5a4xx - offset 0x02b16 ***/ /* 80 */ 0x000089de, 0x000089df, 0x0000e2ba, 0x000089e0, /* 84 */ 0x0000b4a6, 0x000089e1, 0x000089e2, 0x0000b1b8, @@ -3317,7 +3355,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000bfe4, 0x0000bcd0, 0x0000b6e1, 0x000089fe, /* bc */ 0x0000dec5, 0x00008a40, 0x00008a41, 0x00008a42, - /*** Three byte table, leaf: e5a5xx - offset 0x02ad5 ***/ + /*** Three byte table, leaf: e5a5xx - offset 0x02b56 ***/ /* 80 */ 0x00008a43, 0x0000dec6, 0x0000dbbc, 0x00008a44, /* 84 */ 0x0000d1d9, 0x00008a45, 0x00008a46, 0x0000c6e6, @@ -3336,7 +3374,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000bce9, 0x0000cbfd, 0x00008a65, 0x00008a66, /* bc */ 0x00008a67, 0x0000bac3, 0x00008a68, 0x00008a69, - /*** Three byte table, leaf: e5a6xx - offset 0x02b15 ***/ + /*** Three byte table, leaf: e5a6xx - offset 0x02b96 ***/ /* 80 */ 0x00008a6a, 0x0000e5f9, 0x0000c8e7, 0x0000e5fa, /* 84 */ 0x0000cdfd, 0x00008a6b, 0x0000d7b1, 0x0000b8be, @@ -3355,7 +3393,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008a8a, 0x0000c3c3, 0x00008a8b, 0x0000c6de, /* bc */ 0x00008a8c, 0x00008a8d, 0x0000e6aa, 0x00008a8e, - /*** Three byte table, leaf: e5a7xx - offset 0x02b55 ***/ + /*** Three byte table, leaf: e5a7xx - offset 0x02bd6 ***/ /* 80 */ 0x00008a8f, 0x00008a90, 0x00008a91, 0x00008a92, /* 84 */ 0x00008a93, 0x00008a94, 0x0000c4b7, 0x00008a95, @@ -3374,7 +3412,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008ab6, 0x0000e6b1, 0x00008ab7, 0x0000d2f6, /* bc */ 0x00008ab8, 0x00008ab9, 0x00008aba, 0x0000d7cb, - /*** Three byte table, leaf: e5a8xx - offset 0x02b95 ***/ + /*** Three byte table, leaf: e5a8xx - offset 0x02c16 ***/ /* 80 */ 0x00008abb, 0x0000cdfe, 0x00008abc, 0x0000cdde, /* 84 */ 0x0000c2a6, 0x0000e6ab, 0x0000e6ac, 0x0000bdbf, @@ -3393,7 +3431,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008add, 0x00008ade, 0x00008adf, 0x00008ae0, /* bc */ 0x0000e6bd, 0x00008ae1, 0x00008ae2, 0x00008ae3, - /*** Three byte table, leaf: e5a9xx 
- offset 0x02bd5 ***/ + /*** Three byte table, leaf: e5a9xx - offset 0x02c56 ***/ /* 80 */ 0x0000e6b9, 0x00008ae4, 0x00008ae5, 0x00008ae6, /* 84 */ 0x00008ae7, 0x00008ae8, 0x0000c6c5, 0x00008ae9, @@ -3412,7 +3450,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008b50, 0x00008b51, 0x0000e6c4, 0x00008b52, /* bc */ 0x00008b53, 0x00008b54, 0x00008b55, 0x0000d0f6, - /*** Three byte table, leaf: e5aaxx - offset 0x02c15 ***/ + /*** Three byte table, leaf: e5aaxx - offset 0x02c96 ***/ /* 80 */ 0x00008b56, 0x00008b57, 0x00008b58, 0x00008b59, /* 84 */ 0x00008b5a, 0x00008b5b, 0x00008b5c, 0x00008b5d, @@ -3431,7 +3469,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000e6ca, 0x00008b88, 0x00008b89, 0x00008b8a, /* bc */ 0x00008b8b, 0x00008b8c, 0x0000e6c5, 0x00008b8d, - /*** Three byte table, leaf: e5abxx - offset 0x02c55 ***/ + /*** Three byte table, leaf: e5abxx - offset 0x02cd6 ***/ /* 80 */ 0x00008b8e, 0x0000bcde, 0x0000c9a9, 0x00008b8f, /* 84 */ 0x00008b90, 0x00008b91, 0x00008b92, 0x00008b93, @@ -3450,7 +3488,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008bb6, 0x00008bb7, 0x00008bb8, 0x00008bb9, /* bc */ 0x00008bba, 0x00008bbb, 0x00008bbc, 0x00008bbd, - /*** Three byte table, leaf: e5acxx - offset 0x02c95 ***/ + /*** Three byte table, leaf: e5acxx - offset 0x02d16 ***/ /* 80 */ 0x00008bbe, 0x00008bbf, 0x00008bc0, 0x00008bc1, /* 84 */ 0x00008bc2, 0x00008bc3, 0x00008bc4, 0x00008bc5, @@ -3469,7 +3507,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008bf0, 0x00008bf1, 0x00008bf2, 0x00008bf3, /* bc */ 0x00008bf4, 0x00008bf5, 0x00008bf6, 0x00008bf7, - /*** Three byte table, leaf: e5adxx - offset 0x02cd5 ***/ + /*** Three byte table, leaf: e5adxx - offset 0x02d56 ***/ /* 80 */ 0x0000e6d7, 0x00008bf8, 0x00008bf9, 0x00008bfa, /* 84 */ 0x00008bfb, 0x00008bfc, 0x00008bfd, 0x00008bfe, @@ -3488,7 +3526,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008c57, 0x00008c58, 0x0000c8e6, 0x00008c59, /* bc */ 0x00008c5a, 0x0000c4f5, 0x00008c5b, 0x00008c5c, - /*** Three byte table, leaf: e5aexx - offset 0x02d15 ***/ + /*** Three byte table, leaf: e5aexx - offset 0x02d96 ***/ /* 80 */ 0x0000e5b2, 0x0000c4fe, 0x00008c5d, 0x0000cbfc, /* 84 */ 0x0000e5b3, 0x0000d5ac, 0x00008c5e, 0x0000d3ee, @@ -3507,7 +3545,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000e5b7, 0x0000c8dd, 0x00008c72, 0x00008c73, /* bc */ 0x00008c74, 0x0000bfed, 0x0000b1f6, 0x0000cbde, - /*** Three byte table, leaf: e5afxx - offset 0x02d55 ***/ + /*** Three byte table, leaf: e5afxx - offset 0x02dd6 ***/ /* 80 */ 0x00008c75, 0x00008c76, 0x0000bcc5, 0x00008c77, /* 84 */ 0x0000bcc4, 0x0000d2fa, 0x0000c3dc, 0x0000bfdc, @@ -3526,7 +3564,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000b4e7, 0x0000b6d4, 0x0000cbc2, 0x0000d1b0, /* bc */ 0x0000b5bc, 0x00008c9c, 0x00008c9d, 0x0000cad9, - /*** Three byte table, leaf: e5b0xx - offset 0x02d95 ***/ + /*** Three byte table, leaf: e5b0xx - offset 0x02e16 ***/ /* 80 */ 0x00008c9e, 0x0000b7e2, 0x00008c9f, 0x00008ca0, /* 84 */ 0x0000c9e4, 0x00008ca1, 0x0000bdab, 0x00008ca2, @@ -3545,7 +3583,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000caac, 0x0000d2fc, 0x0000b3df, 0x0000e5ea, /* bc */ 0x0000c4e1, 0x0000bea1, 0x0000ceb2, 0x0000c4f2, - /*** Three byte table, leaf: e5b1xx - offset 0x02dd5 ***/ + /*** Three byte table, leaf: e5b1xx - offset 0x02e56 ***/ /* 80 */ 0x0000bed6, 
0x0000c6a8, 0x0000b2e3, 0x00008cc1, /* 84 */ 0x00008cc2, 0x0000bed3, 0x00008cc3, 0x00008cc4, @@ -3564,7 +3602,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008ce2, 0x0000d2d9, 0x0000e1a8, 0x00008ce3, /* bc */ 0x00008ce4, 0x00008ce5, 0x00008ce6, 0x0000d3ec, - /*** Three byte table, leaf: e5b2xx - offset 0x02e15 ***/ + /*** Three byte table, leaf: e5b2xx - offset 0x02e96 ***/ /* 80 */ 0x00008ce7, 0x0000cbea, 0x0000c6f1, 0x00008ce8, /* 84 */ 0x00008ce9, 0x00008cea, 0x00008ceb, 0x00008cec, @@ -3583,7 +3621,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000b0b6, 0x00008d47, 0x00008d48, 0x00008d49, /* bc */ 0x00008d4a, 0x0000e1b4, 0x00008d4b, 0x0000bff9, - /*** Three byte table, leaf: e5b3xx - offset 0x02e55 ***/ + /*** Three byte table, leaf: e5b3xx - offset 0x02ed6 ***/ /* 80 */ 0x00008d4c, 0x0000e1b9, 0x00008d4d, 0x00008d4e, /* 84 */ 0x0000e1bb, 0x00008d4f, 0x00008d50, 0x00008d51, @@ -3602,7 +3640,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008d77, 0x00008d78, 0x00008d79, 0x0000befe, /* bc */ 0x00008d7a, 0x00008d7b, 0x00008d7c, 0x00008d7d, - /*** Three byte table, leaf: e5b4xx - offset 0x02e95 ***/ + /*** Three byte table, leaf: e5b4xx - offset 0x02f16 ***/ /* 80 */ 0x00008d7e, 0x00008d80, 0x0000e1c0, 0x0000e1c1, /* 84 */ 0x00008d81, 0x00008d82, 0x0000e1c7, 0x0000b3e7, @@ -3621,7 +3659,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008da7, 0x00008da8, 0x00008da9, 0x00008daa, /* bc */ 0x00008dab, 0x0000e1cc, 0x0000e1ca, 0x00008dac, - /*** Three byte table, leaf: e5b5xx - offset 0x02ed5 ***/ + /*** Three byte table, leaf: e5b5xx - offset 0x02f56 ***/ /* 80 */ 0x00008dad, 0x00008dae, 0x00008daf, 0x00008db0, /* 84 */ 0x00008db1, 0x00008db2, 0x00008db3, 0x0000effa, @@ -3640,7 +3678,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008dd9, 0x00008dda, 0x00008ddb, 0x00008ddc, /* bc */ 0x00008ddd, 0x00008dde, 0x00008ddf, 0x00008de0, - /*** Three byte table, leaf: e5b6xx - offset 0x02f15 ***/ + /*** Three byte table, leaf: e5b6xx - offset 0x02f96 ***/ /* 80 */ 0x00008de1, 0x00008de2, 0x0000e1d6, 0x00008de3, /* 84 */ 0x00008de4, 0x00008de5, 0x00008de6, 0x00008de7, @@ -3659,7 +3697,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008e56, 0x00008e57, 0x00008e58, 0x00008e59, /* bc */ 0x00008e5a, 0x00008e5b, 0x00008e5c, 0x00008e5d, - /*** Three byte table, leaf: e5b7xx - offset 0x02f55 ***/ + /*** Three byte table, leaf: e5b7xx - offset 0x02fd6 ***/ /* 80 */ 0x00008e5e, 0x00008e5f, 0x00008e60, 0x00008e61, /* 84 */ 0x00008e62, 0x0000e1db, 0x00008e63, 0x00008e64, @@ -3678,7 +3716,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008e83, 0x00008e84, 0x00008e85, 0x00008e86, /* bc */ 0x00008e87, 0x0000d9e3, 0x0000bded, 0x00008e88, - /*** Three byte table, leaf: e5b8xx - offset 0x02f95 ***/ + /*** Three byte table, leaf: e5b8xx - offset 0x03016 ***/ /* 80 */ 0x00008e89, 0x0000b1d2, 0x0000cad0, 0x0000b2bc, /* 84 */ 0x00008e8a, 0x0000cba7, 0x0000b7ab, 0x00008e8b, @@ -3697,7 +3735,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000b3a3, 0x00008ea8, 0x00008ea9, 0x0000e0fd, /* bc */ 0x0000e0fe, 0x0000c3b1, 0x00008eaa, 0x00008eab, - /*** Three byte table, leaf: e5b9xx - offset 0x02fd5 ***/ + /*** Three byte table, leaf: e5b9xx - offset 0x03056 ***/ /* 80 */ 0x00008eac, 0x00008ead, 0x0000c3dd, 0x00008eae, /* 84 */ 0x0000e1a2, 0x0000b7f9, 0x00008eaf, 0x00008eb0, @@ -3716,7 
+3754,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000d0d2, 0x00008ed6, 0x0000e7db, 0x0000bbc3, /* bc */ 0x0000d3d7, 0x0000d3c4, 0x00008ed7, 0x0000b9e3, - /*** Three byte table, leaf: e5baxx - offset 0x03015 ***/ + /*** Three byte table, leaf: e5baxx - offset 0x03096 ***/ /* 80 */ 0x0000e2cf, 0x00008ed8, 0x00008ed9, 0x00008eda, /* 84 */ 0x0000d7af, 0x00008edb, 0x0000c7ec, 0x0000b1d3, @@ -3735,7 +3773,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000d3b9, 0x0000e2d5, 0x00008ef4, 0x00008ef5, /* bc */ 0x00008ef6, 0x00008ef7, 0x0000e2d7, 0x00008ef8, - /*** Three byte table, leaf: e5bbxx - offset 0x03055 ***/ + /*** Three byte table, leaf: e5bbxx - offset 0x030d6 ***/ /* 80 */ 0x00008ef9, 0x00008efa, 0x00008efb, 0x00008efc, /* 84 */ 0x00008efd, 0x00008efe, 0x00008f40, 0x00008f41, @@ -3754,7 +3792,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008f66, 0x00008f67, 0x0000bda8, 0x00008f68, /* bc */ 0x00008f69, 0x00008f6a, 0x0000dec3, 0x0000d8a5, - /*** Three byte table, leaf: e5bcxx - offset 0x03095 ***/ + /*** Three byte table, leaf: e5bcxx - offset 0x03116 ***/ /* 80 */ 0x0000bfaa, 0x0000dbcd, 0x0000d2ec, 0x0000c6fa, /* 84 */ 0x0000c5aa, 0x00008f6b, 0x00008f6c, 0x00008f6d, @@ -3773,7 +3811,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008f8b, 0x0000b5af, 0x0000c7bf, 0x00008f8c, /* bc */ 0x0000e5f6, 0x00008f8d, 0x00008f8e, 0x00008f8f, - /*** Three byte table, leaf: e5bdxx - offset 0x030d5 ***/ + /*** Three byte table, leaf: e5bdxx - offset 0x03156 ***/ /* 80 */ 0x0000ecb0, 0x00008f90, 0x00008f91, 0x00008f92, /* 84 */ 0x00008f93, 0x00008f94, 0x00008f95, 0x00008f96, @@ -3792,7 +3830,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008fb3, 0x0000d2db, 0x00008fb4, 0x0000b3b9, /* bc */ 0x0000b1cb, 0x00008fb5, 0x00008fb6, 0x00008fb7, - /*** Three byte table, leaf: e5bexx - offset 0x03115 ***/ + /*** Three byte table, leaf: e5bexx - offset 0x03196 ***/ /* 80 */ 0x0000cdf9, 0x0000d5f7, 0x0000e1de, 0x00008fb8, /* 84 */ 0x0000beb6, 0x0000b4fd, 0x00008fb9, 0x0000e1df, @@ -3811,7 +3849,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00008fd7, 0x00008fd8, 0x00008fd9, 0x00008fda, /* bc */ 0x0000e1e8, 0x0000bbd5, 0x00008fdb, 0x00008fdc, - /*** Three byte table, leaf: e5bfxx - offset 0x03155 ***/ + /*** Three byte table, leaf: e5bfxx - offset 0x031d6 ***/ /* 80 */ 0x00008fdd, 0x00008fde, 0x00008fdf, 0x0000d0c4, /* 84 */ 0x0000e2e0, 0x0000b1d8, 0x0000d2e4, 0x00008fe0, @@ -3830,7 +3868,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000e2ee, 0x00008ffb, 0x00008ffc, 0x0000d0c3, /* bc */ 0x00008ffd, 0x0000baf6, 0x0000e2e9, 0x0000b7de, - /*** Three byte table, leaf: e680xx - offset 0x03195 ***/ + /*** Three byte table, leaf: e680xx - offset 0x03216 ***/ /* 80 */ 0x0000bbb3, 0x0000ccac, 0x0000cbcb, 0x0000e2e4, /* 84 */ 0x0000e2e6, 0x0000e2ea, 0x0000e2eb, 0x00008ffe, @@ -3849,7 +3887,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009059, 0x0000905a, 0x0000905b, 0x0000d7dc, /* bc */ 0x0000eda1, 0x0000905c, 0x0000905d, 0x0000e2f8, - /*** Three byte table, leaf: e681xx - offset 0x031d5 ***/ + /*** Three byte table, leaf: e681xx - offset 0x03256 ***/ /* 80 */ 0x0000905e, 0x0000eda5, 0x0000e2fe, 0x0000cad1, /* 84 */ 0x0000905f, 0x00009060, 0x00009061, 0x00009062, @@ -3868,7 +3906,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000e2fa, 0x0000e2fb, 
0x0000e2fd, 0x0000e2fc, /* bc */ 0x0000c4d5, 0x0000e3a2, 0x0000907d, 0x0000d3c1, - /*** Three byte table, leaf: e682xx - offset 0x03215 ***/ + /*** Three byte table, leaf: e682xx - offset 0x03296 ***/ /* 80 */ 0x0000907e, 0x00009080, 0x00009081, 0x0000e3a7, /* 84 */ 0x0000c7c4, 0x00009082, 0x00009083, 0x00009084, @@ -3887,7 +3925,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000bcc2, 0x000090a0, 0x000090a1, 0x0000e3ac, /* bc */ 0x0000b5bf, 0x000090a2, 0x000090a3, 0x000090a4, - /*** Three byte table, leaf: e683xx - offset 0x03255 ***/ + /*** Three byte table, leaf: e683xx - offset 0x032d6 ***/ /* 80 */ 0x000090a5, 0x000090a6, 0x000090a7, 0x000090a8, /* 84 */ 0x000090a9, 0x0000c7e9, 0x0000e3b0, 0x000090aa, @@ -3906,7 +3944,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000090c4, 0x0000c8c7, 0x0000d0ca, 0x000090c5, /* bc */ 0x000090c6, 0x000090c7, 0x000090c8, 0x000090c9, - /*** Three byte table, leaf: e684xx - offset 0x03295 ***/ + /*** Three byte table, leaf: e684xx - offset 0x03316 ***/ /* 80 */ 0x0000e3b8, 0x0000b3ee, 0x000090ca, 0x000090cb, /* 84 */ 0x000090cc, 0x000090cd, 0x0000eda9, 0x000090ce, @@ -3925,7 +3963,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000090f1, 0x000090f2, 0x000090f3, 0x000090f4, /* bc */ 0x000090f5, 0x000090f6, 0x000090f7, 0x0000d4b8, - /*** Three byte table, leaf: e685xx - offset 0x032d5 ***/ + /*** Three byte table, leaf: e685xx - offset 0x03356 ***/ /* 80 */ 0x000090f8, 0x000090f9, 0x000090fa, 0x000090fb, /* 84 */ 0x000090fc, 0x000090fd, 0x000090fe, 0x00009140, @@ -3944,7 +3982,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009164, 0x00009165, 0x00009166, 0x00009167, /* bc */ 0x00009168, 0x00009169, 0x0000916a, 0x0000916b, - /*** Three byte table, leaf: e686xx - offset 0x03315 ***/ + /*** Three byte table, leaf: e686xx - offset 0x03396 ***/ /* 80 */ 0x0000916c, 0x0000916d, 0x0000916e, 0x0000916f, /* 84 */ 0x00009170, 0x00009171, 0x00009172, 0x00009173, @@ -3963,7 +4001,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000919c, 0x0000919d, 0x0000919e, 0x0000919f, /* bc */ 0x000091a0, 0x000091a1, 0x0000bab6, 0x000091a2, - /*** Three byte table, leaf: e687xx - offset 0x03355 ***/ + /*** Three byte table, leaf: e687xx - offset 0x033d6 ***/ /* 80 */ 0x000091a3, 0x000091a4, 0x0000b6ae, 0x000091a5, /* 84 */ 0x000091a6, 0x000091a7, 0x000091a8, 0x000091a9, @@ -3982,7 +4020,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000091d2, 0x000091d3, 0x000091d4, 0x000091d5, /* bc */ 0x000091d6, 0x000091d7, 0x000091d8, 0x0000dcb2, - /*** Three byte table, leaf: e688xx - offset 0x03395 ***/ + /*** Three byte table, leaf: e688xx - offset 0x03416 ***/ /* 80 */ 0x000091d9, 0x000091da, 0x000091db, 0x000091dc, /* 84 */ 0x000091dd, 0x000091de, 0x0000edb0, 0x000091df, @@ -4001,7 +4039,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000091f5, 0x000091f6, 0x000091f7, 0x000091f8, /* bc */ 0x000091f9, 0x0000ece6, 0x0000ece5, 0x0000b7bf, - /*** Three byte table, leaf: e689xx - offset 0x033d5 ***/ + /*** Three byte table, leaf: e689xx - offset 0x03456 ***/ /* 80 */ 0x0000cbf9, 0x0000b1e2, 0x000091fa, 0x0000ece7, /* 84 */ 0x000091fb, 0x000091fc, 0x000091fd, 0x0000c9c8, @@ -4020,7 +4058,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009256, 0x0000c5fa, 0x00009257, 0x00009258, /* bc */ 0x0000b6f3, 0x00009259, 0x0000d5d2, 0x0000b3d0, - /*** Three byte table, 
leaf: e68axx - offset 0x03415 ***/ + /*** Three byte table, leaf: e68axx - offset 0x03496 ***/ /* 80 */ 0x0000bcbc, 0x0000925a, 0x0000925b, 0x0000925c, /* 84 */ 0x0000b3ad, 0x0000925d, 0x0000925e, 0x0000925f, @@ -4039,7 +4077,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000927a, 0x0000c4a8, 0x0000927b, 0x0000ded3, /* bc */ 0x0000d1ba, 0x0000b3e9, 0x0000927c, 0x0000c3f2, - /*** Three byte table, leaf: e68bxx - offset 0x03455 ***/ + /*** Three byte table, leaf: e68bxx - offset 0x034d6 ***/ /* 80 */ 0x0000927d, 0x0000927e, 0x0000b7f7, 0x00009280, /* 84 */ 0x0000d6f4, 0x0000b5a3, 0x0000b2f0, 0x0000c4b4, @@ -4058,7 +4096,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000928f, 0x00009290, 0x00009291, 0x00009292, /* bc */ 0x0000c6b4, 0x0000d7a7, 0x0000cab0, 0x0000c4c3, - /*** Three byte table, leaf: e68cxx - offset 0x03495 ***/ + /*** Three byte table, leaf: e68cxx - offset 0x03516 ***/ /* 80 */ 0x00009293, 0x0000b3d6, 0x0000b9d2, 0x00009294, /* 84 */ 0x00009295, 0x00009296, 0x00009297, 0x0000d6b8, @@ -4077,7 +4115,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000092b3, 0x0000deda, 0x0000cda6, 0x000092b4, /* bc */ 0x000092b5, 0x0000cdec, 0x000092b6, 0x000092b7, - /*** Three byte table, leaf: e68dxx - offset 0x034d5 ***/ + /*** Three byte table, leaf: e68dxx - offset 0x03556 ***/ /* 80 */ 0x000092b8, 0x000092b9, 0x0000cee6, 0x0000dedc, /* 84 */ 0x000092ba, 0x0000cdb1, 0x0000c0a6, 0x000092bb, @@ -4096,7 +4134,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000092d8, 0x000092d9, 0x0000dee0, 0x0000c4ed, /* bc */ 0x000092da, 0x000092db, 0x000092dc, 0x000092dd, - /*** Three byte table, leaf: e68exx - offset 0x03515 ***/ + /*** Three byte table, leaf: e68exx - offset 0x03596 ***/ /* 80 */ 0x0000cfc6, 0x000092de, 0x0000b5e0, 0x000092df, /* 84 */ 0x000092e0, 0x000092e1, 0x000092e2, 0x0000b6de, @@ -4115,7 +4153,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000b5a7, 0x000092fa, 0x0000b2f4, 0x000092fb, /* bc */ 0x0000dee8, 0x000092fc, 0x0000def2, 0x000092fd, - /*** Three byte table, leaf: e68fxx - offset 0x03555 ***/ + /*** Three byte table, leaf: e68fxx - offset 0x035d6 ***/ /* 80 */ 0x000092fe, 0x00009340, 0x00009341, 0x00009342, /* 84 */ 0x0000deed, 0x00009343, 0x0000def1, 0x00009344, @@ -4134,7 +4172,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000deea, 0x00009364, 0x00009365, 0x00009366, /* bc */ 0x00009367, 0x0000c0bf, 0x00009368, 0x0000deec, - /*** Three byte table, leaf: e690xx - offset 0x03595 ***/ + /*** Three byte table, leaf: e690xx - offset 0x03616 ***/ /* 80 */ 0x0000b2f3, 0x0000b8e9, 0x0000c2a7, 0x00009369, /* 84 */ 0x0000936a, 0x0000bdc1, 0x0000936b, 0x0000936c, @@ -4153,7 +4191,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000938e, 0x0000938f, 0x0000d0af, 0x00009390, /* bc */ 0x00009391, 0x0000b2eb, 0x00009392, 0x0000eba1, - /*** Three byte table, leaf: e691xx - offset 0x035d5 ***/ + /*** Three byte table, leaf: e691xx - offset 0x03656 ***/ /* 80 */ 0x00009393, 0x0000def4, 0x00009394, 0x00009395, /* 84 */ 0x0000c9e3, 0x0000def3, 0x0000b0da, 0x0000d2a1, @@ -4172,7 +4210,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000c3fe, 0x0000c4a1, 0x0000dfa1, 0x000093bd, /* bc */ 0x000093be, 0x000093bf, 0x000093c0, 0x000093c1, - /*** Three byte table, leaf: e692xx - offset 0x03615 ***/ + /*** Three byte table, leaf: e692xx - offset 0x03696 ***/ /* 80 */ 
0x000093c2, 0x000093c3, 0x0000c1cc, 0x000093c4, /* 84 */ 0x0000defc, 0x0000beef, 0x000093c5, 0x0000c6b2, @@ -4191,7 +4229,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000dfa3, 0x000093e8, 0x0000dfa5, 0x000093e9, /* bc */ 0x0000bab3, 0x000093ea, 0x000093eb, 0x000093ec, - /*** Three byte table, leaf: e693xx - offset 0x03655 ***/ + /*** Three byte table, leaf: e693xx - offset 0x036d6 ***/ /* 80 */ 0x0000dfa6, 0x000093ed, 0x0000c0de, 0x000093ee, /* 84 */ 0x000093ef, 0x0000c9c3, 0x000093f0, 0x000093f1, @@ -4210,7 +4248,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009459, 0x0000945a, 0x0000945b, 0x0000945c, /* bc */ 0x0000945d, 0x0000945e, 0x0000945f, 0x00009460, - /*** Three byte table, leaf: e694xx - offset 0x03695 ***/ + /*** Three byte table, leaf: e694xx - offset 0x03716 ***/ /* 80 */ 0x0000c5ca, 0x00009461, 0x00009462, 0x00009463, /* 84 */ 0x00009464, 0x00009465, 0x00009466, 0x00009467, @@ -4229,7 +4267,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000d8fc, 0x0000b8c4, 0x0000948f, 0x0000b9a5, /* bc */ 0x00009490, 0x00009491, 0x0000b7c5, 0x0000d5fe, - /*** Three byte table, leaf: e695xx - offset 0x036d5 ***/ + /*** Three byte table, leaf: e695xx - offset 0x03756 ***/ /* 80 */ 0x00009492, 0x00009493, 0x00009494, 0x00009495, /* 84 */ 0x00009496, 0x0000b9ca, 0x00009497, 0x00009498, @@ -4248,7 +4286,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000094b5, 0x000094b6, 0x000094b7, 0x000094b8, /* bc */ 0x000094b9, 0x000094ba, 0x000094bb, 0x000094bc, - /*** Three byte table, leaf: e696xx - offset 0x03715 ***/ + /*** Three byte table, leaf: e696xx - offset 0x03796 ***/ /* 80 */ 0x000094bd, 0x000094be, 0x000094bf, 0x000094c0, /* 84 */ 0x000094c1, 0x000094c2, 0x000094c3, 0x0000cec4, @@ -4267,7 +4305,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000094e1, 0x0000b7bd, 0x000094e2, 0x000094e3, /* bc */ 0x0000ecb6, 0x0000caa9, 0x000094e4, 0x000094e5, - /*** Three byte table, leaf: e697xx - offset 0x03755 ***/ + /*** Three byte table, leaf: e697xx - offset 0x037d6 ***/ /* 80 */ 0x000094e6, 0x0000c5d4, 0x000094e7, 0x0000ecb9, /* 84 */ 0x0000ecb8, 0x0000c2c3, 0x0000ecb7, 0x000094e8, @@ -4286,7 +4324,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009544, 0x00009545, 0x0000cdfa, 0x00009546, /* bc */ 0x00009547, 0x00009548, 0x00009549, 0x0000954a, - /*** Three byte table, leaf: e698xx - offset 0x03795 ***/ + /*** Three byte table, leaf: e698xx - offset 0x03816 ***/ /* 80 */ 0x0000eac0, 0x0000954b, 0x0000b0ba, 0x0000eabe, /* 84 */ 0x0000954c, 0x0000954d, 0x0000c0a5, 0x0000954e, @@ -4305,7 +4343,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000956b, 0x0000956c, 0x0000956d, 0x0000956e, /* bc */ 0x0000d6e7, 0x0000956f, 0x0000cfd4, 0x00009570, - /*** Three byte table, leaf: e699xx - offset 0x037d5 ***/ + /*** Three byte table, leaf: e699xx - offset 0x03856 ***/ /* 80 */ 0x00009571, 0x0000eacb, 0x00009572, 0x0000bbce, /* 84 */ 0x00009573, 0x00009574, 0x00009575, 0x00009576, @@ -4324,7 +4362,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009593, 0x00009594, 0x0000d6c7, 0x00009595, /* bc */ 0x00009596, 0x00009597, 0x0000c1c0, 0x00009598, - /*** Three byte table, leaf: e69axx - offset 0x03815 ***/ + /*** Three byte table, leaf: e69axx - offset 0x03896 ***/ /* 80 */ 0x00009599, 0x0000959a, 0x0000d4dd, 0x0000959b, /* 84 */ 0x0000ead1, 0x0000959c, 0x0000959d, 0x0000cfbe, 
@@ -4343,7 +4381,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000095c5, 0x0000e5df, 0x000095c6, 0x000095c7, /* bc */ 0x000095c8, 0x000095c9, 0x0000ead5, 0x000095ca, - /*** Three byte table, leaf: e69bxx - offset 0x03855 ***/ + /*** Three byte table, leaf: e69bxx - offset 0x038d6 ***/ /* 80 */ 0x000095cb, 0x000095cc, 0x000095cd, 0x000095ce, /* 84 */ 0x000095cf, 0x000095d0, 0x000095d1, 0x000095d2, @@ -4362,7 +4400,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000095f8, 0x0000b2dc, 0x000095f9, 0x000095fa, /* bc */ 0x0000c2fc, 0x000095fb, 0x0000d4f8, 0x0000cce6, - /*** Three byte table, leaf: e69cxx - offset 0x03895 ***/ + /*** Three byte table, leaf: e69cxx - offset 0x03916 ***/ /* 80 */ 0x0000d7ee, 0x000095fc, 0x000095fd, 0x000095fe, /* 84 */ 0x00009640, 0x00009641, 0x00009642, 0x00009643, @@ -4381,7 +4419,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000965e, 0x0000965f, 0x0000bbfa, 0x00009660, /* bc */ 0x00009661, 0x0000d0e0, 0x00009662, 0x00009663, - /*** Three byte table, leaf: e69dxx - offset 0x038d5 ***/ + /*** Three byte table, leaf: e69dxx - offset 0x03956 ***/ /* 80 */ 0x0000c9b1, 0x00009664, 0x0000d4d3, 0x0000c8a8, /* 84 */ 0x00009665, 0x00009666, 0x0000b8cb, 0x00009667, @@ -4400,7 +4438,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009680, 0x00009681, 0x00009682, 0x00009683, /* bc */ 0x0000e8cc, 0x00009684, 0x0000cbc9, 0x0000b0e5, - /*** Three byte table, leaf: e69exx - offset 0x03915 ***/ + /*** Three byte table, leaf: e69exx - offset 0x03996 ***/ /* 80 */ 0x00009685, 0x0000bcab, 0x00009686, 0x00009687, /* 84 */ 0x0000b9b9, 0x00009688, 0x00009689, 0x0000e8c1, @@ -4419,7 +4457,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000e8db, 0x000096a2, 0x000096a3, 0x000096a4, /* bc */ 0x000096a5, 0x000096a6, 0x000096a7, 0x000096a8, - /*** Three byte table, leaf: e69fxx - offset 0x03955 ***/ + /*** Three byte table, leaf: e69fxx - offset 0x039d6 ***/ /* 80 */ 0x000096a9, 0x0000e8de, 0x000096aa, 0x0000e8da, /* 84 */ 0x0000b1fa, 0x000096ab, 0x000096ac, 0x000096ad, @@ -4438,7 +4476,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000096c8, 0x000096c9, 0x000096ca, 0x000096cb, /* bc */ 0x000096cc, 0x0000e8df, 0x000096cd, 0x0000cac1, - /*** Three byte table, leaf: e6a0xx - offset 0x03995 ***/ + /*** Three byte table, leaf: e6a0xx - offset 0x03a16 ***/ /* 80 */ 0x0000e8d9, 0x000096ce, 0x000096cf, 0x000096d0, /* 84 */ 0x000096d1, 0x0000d5a4, 0x000096d2, 0x0000b1ea, @@ -4457,7 +4495,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000bacb, 0x0000b8f9, 0x000096f1, 0x000096f2, /* bc */ 0x0000b8f1, 0x0000d4d4, 0x0000e8ef, 0x000096f3, - /*** Three byte table, leaf: e6a1xx - offset 0x039d5 ***/ + /*** Three byte table, leaf: e6a1xx - offset 0x03a56 ***/ /* 80 */ 0x0000e8ee, 0x0000e8ec, 0x0000b9f0, 0x0000ccd2, /* 84 */ 0x0000e8e6, 0x0000cea6, 0x0000bff2, 0x000096f4, @@ -4476,7 +4514,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000974e, 0x0000974f, 0x00009750, 0x00009751, /* bc */ 0x00009752, 0x00009753, 0x00009754, 0x00009755, - /*** Three byte table, leaf: e6a2xx - offset 0x03a15 ***/ + /*** Three byte table, leaf: e6a2xx - offset 0x03a96 ***/ /* 80 */ 0x00009756, 0x0000c1ba, 0x00009757, 0x0000e8e8, /* 84 */ 0x00009758, 0x0000c3b7, 0x0000b0f0, 0x00009759, @@ -4495,7 +4533,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000977e, 
0x00009780, 0x00009781, 0x00009782, /* bc */ 0x00009783, 0x00009784, 0x00009785, 0x00009786, - /*** Three byte table, leaf: e6a3xx - offset 0x03a55 ***/ + /*** Three byte table, leaf: e6a3xx - offset 0x03ad6 ***/ /* 80 */ 0x0000bcec, 0x00009787, 0x0000e8f9, 0x00009788, /* 84 */ 0x00009789, 0x0000978a, 0x0000978b, 0x0000978c, @@ -4514,7 +4552,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000097b0, 0x0000e8fe, 0x0000b9d7, 0x000097b1, /* bc */ 0x0000e8fb, 0x000097b2, 0x000097b3, 0x000097b4, - /*** Three byte table, leaf: e6a4xx - offset 0x03a95 ***/ + /*** Three byte table, leaf: e6a4xx - offset 0x03b16 ***/ /* 80 */ 0x000097b5, 0x0000e9a4, 0x000097b6, 0x000097b7, /* 84 */ 0x000097b8, 0x0000d2ce, 0x000097b9, 0x000097ba, @@ -4533,7 +4571,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000097e0, 0x0000e9a9, 0x000097e1, 0x000097e2, /* bc */ 0x000097e3, 0x0000b4aa, 0x000097e4, 0x0000b4bb, - /*** Three byte table, leaf: e6a5xx - offset 0x03ad5 ***/ + /*** Three byte table, leaf: e6a5xx - offset 0x03b56 ***/ /* 80 */ 0x000097e5, 0x000097e6, 0x0000e9ab, 0x000097e7, /* 84 */ 0x000097e8, 0x000097e9, 0x000097ea, 0x000097eb, @@ -4552,7 +4590,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000e9b1, 0x0000e9ba, 0x00009851, 0x00009852, /* bc */ 0x0000c2a5, 0x00009853, 0x00009854, 0x00009855, - /*** Three byte table, leaf: e6a6xx - offset 0x03b15 ***/ + /*** Three byte table, leaf: e6a6xx - offset 0x03b96 ***/ /* 80 */ 0x0000e9af, 0x00009856, 0x0000b8c5, 0x00009857, /* 84 */ 0x0000e9ad, 0x00009858, 0x0000d3dc, 0x0000e9b4, @@ -4571,7 +4609,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000987a, 0x0000987b, 0x0000987c, 0x0000e9bd, /* bc */ 0x0000987d, 0x0000987e, 0x00009880, 0x00009881, - /*** Three byte table, leaf: e6a7xx - offset 0x03b55 ***/ + /*** Three byte table, leaf: e6a7xx - offset 0x03bd6 ***/ /* 80 */ 0x00009882, 0x0000e9c2, 0x00009883, 0x00009884, /* 84 */ 0x00009885, 0x00009886, 0x00009887, 0x00009888, @@ -4590,7 +4628,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000098af, 0x000098b0, 0x000098b1, 0x000098b2, /* bc */ 0x000098b3, 0x0000b2db, 0x000098b4, 0x0000e9c8, - /*** Three byte table, leaf: e6a8xx - offset 0x03b95 ***/ + /*** Three byte table, leaf: e6a8xx - offset 0x03c16 ***/ /* 80 */ 0x000098b5, 0x000098b6, 0x000098b7, 0x000098b8, /* 84 */ 0x000098b9, 0x000098ba, 0x000098bb, 0x000098bc, @@ -4609,7 +4647,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000098e3, 0x000098e4, 0x000098e5, 0x000098e6, /* bc */ 0x000098e7, 0x0000e9d7, 0x0000e9d0, 0x000098e8, - /*** Three byte table, leaf: e6a9xx - offset 0x03bd5 ***/ + /*** Three byte table, leaf: e6a9xx - offset 0x03c56 ***/ /* 80 */ 0x000098e9, 0x000098ea, 0x000098eb, 0x000098ec, /* 84 */ 0x0000e9cf, 0x000098ed, 0x000098ee, 0x0000c7c1, @@ -4628,7 +4666,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009959, 0x0000e9d6, 0x0000995a, 0x0000995b, /* bc */ 0x0000e9da, 0x0000995c, 0x0000995d, 0x0000995e, - /*** Three byte table, leaf: e6aaxx - offset 0x03c15 ***/ + /*** Three byte table, leaf: e6aaxx - offset 0x03c96 ***/ /* 80 */ 0x0000ccb4, 0x0000995f, 0x00009960, 0x00009961, /* 84 */ 0x0000cfad, 0x00009962, 0x00009963, 0x00009964, @@ -4647,7 +4685,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000998e, 0x0000998f, 0x00009990, 0x00009991, /* bc */ 0x00009992, 0x00009993, 0x00009994, 0x00009995, - /*** Three 
byte table, leaf: e6abxx - offset 0x03c55 ***/ + /*** Three byte table, leaf: e6abxx - offset 0x03cd6 ***/ /* 80 */ 0x00009996, 0x00009997, 0x00009998, 0x00009999, /* 84 */ 0x0000999a, 0x0000999b, 0x0000999c, 0x0000999d, @@ -4666,7 +4704,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x000099ce, 0x000099cf, 0x000099d0, 0x000099d1, /* bc */ 0x000099d2, 0x000099d3, 0x000099d4, 0x000099d5, - /*** Three byte table, leaf: e6acxx - offset 0x03c95 ***/ + /*** Three byte table, leaf: e6acxx - offset 0x03d16 ***/ /* 80 */ 0x000099d6, 0x000099d7, 0x000099d8, 0x000099d9, /* 84 */ 0x000099da, 0x000099db, 0x000099dc, 0x000099dd, @@ -4685,7 +4723,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009a47, 0x0000eca5, 0x0000c6db, 0x00009a48, /* bc */ 0x00009a49, 0x00009a4a, 0x0000bfee, 0x00009a4b, - /*** Three byte table, leaf: e6adxx - offset 0x03cd5 ***/ + /*** Three byte table, leaf: e6adxx - offset 0x03d56 ***/ /* 80 */ 0x00009a4c, 0x00009a4d, 0x00009a4e, 0x0000eca6, /* 84 */ 0x00009a4f, 0x00009a50, 0x0000eca7, 0x0000d0aa, @@ -4704,7 +4742,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009a77, 0x0000b4f5, 0x00009a78, 0x0000cbc0, /* bc */ 0x0000bcdf, 0x00009a79, 0x00009a7a, 0x00009a7b, - /*** Three byte table, leaf: e6aexx - offset 0x03d15 ***/ + /*** Three byte table, leaf: e6aexx - offset 0x03d96 ***/ /* 80 */ 0x00009a7c, 0x0000e9e2, 0x0000e9e3, 0x0000d1ea, /* 84 */ 0x0000e9e5, 0x00009a7d, 0x0000b4f9, 0x0000e9e4, @@ -4723,7 +4761,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009aa0, 0x00009aa1, 0x00009aa2, 0x00009aa3, /* bc */ 0x00009aa4, 0x00009aa5, 0x00009aa6, 0x0000b5ee, - /*** Three byte table, leaf: e6afxx - offset 0x03d55 ***/ + /*** Three byte table, leaf: e6afxx - offset 0x03dd6 ***/ /* 80 */ 0x00009aa7, 0x0000bbd9, 0x0000ecb1, 0x00009aa8, /* 84 */ 0x00009aa9, 0x0000d2e3, 0x00009aaa, 0x00009aab, @@ -4742,7 +4780,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009acb, 0x0000eba8, 0x00009acc, 0x00009acd, /* bc */ 0x00009ace, 0x0000eba6, 0x00009acf, 0x00009ad0, - /*** Three byte table, leaf: e6b0xx - offset 0x03d95 ***/ + /*** Three byte table, leaf: e6b0xx - offset 0x03e16 ***/ /* 80 */ 0x00009ad1, 0x00009ad2, 0x00009ad3, 0x00009ad4, /* 84 */ 0x00009ad5, 0x0000eba9, 0x0000ebab, 0x0000ebaa, @@ -4761,7 +4799,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000d3c0, 0x00009aeb, 0x00009aec, 0x00009aed, /* bc */ 0x00009aee, 0x0000d9db, 0x00009aef, 0x00009af0, - /*** Three byte table, leaf: e6b1xx - offset 0x03dd5 ***/ + /*** Three byte table, leaf: e6b1xx - offset 0x03e56 ***/ /* 80 */ 0x0000cda1, 0x0000d6ad, 0x0000c7f3, 0x00009af1, /* 84 */ 0x00009af2, 0x00009af3, 0x0000d9e0, 0x0000bbe3, @@ -4780,7 +4818,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009b50, 0x0000d0da, 0x00009b51, 0x00009b52, /* bc */ 0x00009b53, 0x0000c6fb, 0x0000b7da, 0x00009b54, - /*** Three byte table, leaf: e6b2xx - offset 0x03e15 ***/ + /*** Three byte table, leaf: e6b2xx - offset 0x03e96 ***/ /* 80 */ 0x00009b55, 0x0000c7df, 0x0000d2ca, 0x0000ced6, /* 84 */ 0x00009b56, 0x0000e3e4, 0x0000e3ec, 0x00009b57, @@ -4799,7 +4837,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000b7d0, 0x0000d3cd, 0x00009b70, 0x0000d6ce, /* bc */ 0x0000d5d3, 0x0000b9c1, 0x0000d5b4, 0x0000d1d8, - /*** Three byte table, leaf: e6b3xx - offset 0x03e55 ***/ + /*** Three byte table, leaf: e6b3xx - offset 0x03ed6 ***/ 
/* 80 */ 0x00009b71, 0x00009b72, 0x00009b73, 0x00009b74, /* 84 */ 0x0000d0b9, 0x0000c7f6, 0x00009b75, 0x00009b76, @@ -4818,7 +4856,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000e3f2, 0x00009b8d, 0x0000e3f8, 0x0000d0ba, /* bc */ 0x0000c6c3, 0x0000d4f3, 0x0000e3fe, 0x00009b8e, - /*** Three byte table, leaf: e6b4xx - offset 0x03e95 ***/ + /*** Three byte table, leaf: e6b4xx - offset 0x03f16 ***/ /* 80 */ 0x00009b8f, 0x0000bde0, 0x00009b90, 0x00009b91, /* 84 */ 0x0000e4a7, 0x00009b92, 0x00009b93, 0x0000e4a6, @@ -4837,7 +4875,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009bb2, 0x0000e4a1, 0x00009bb3, 0x0000bbee, /* bc */ 0x0000cddd, 0x0000c7a2, 0x0000c5c9, 0x00009bb4, - /*** Three byte table, leaf: e6b5xx - offset 0x03ed5 ***/ + /*** Three byte table, leaf: e6b5xx - offset 0x03f56 ***/ /* 80 */ 0x00009bb5, 0x0000c1f7, 0x00009bb6, 0x0000e4a4, /* 84 */ 0x00009bb7, 0x0000c7b3, 0x0000bdac, 0x0000bdbd, @@ -4856,7 +4894,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000bdfe, 0x00009bd1, 0x00009bd2, 0x00009bd3, /* bc */ 0x0000e4bc, 0x00009bd4, 0x00009bd5, 0x00009bd6, - /*** Three byte table, leaf: e6b6xx - offset 0x03f15 ***/ + /*** Three byte table, leaf: e6b6xx - offset 0x03f96 ***/ /* 80 */ 0x00009bd7, 0x00009bd8, 0x0000cdbf, 0x00009bd9, /* 84 */ 0x00009bda, 0x0000c4f9, 0x00009bdb, 0x00009bdc, @@ -4875,7 +4913,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000bad4, 0x00009bf3, 0x00009bf4, 0x00009bf5, /* bc */ 0x00009bf6, 0x00009bf7, 0x00009bf8, 0x0000e4c3, - /*** Three byte table, leaf: e6b7xx - offset 0x03f55 ***/ + /*** Three byte table, leaf: e6b7xx - offset 0x03fd6 ***/ /* 80 */ 0x0000b5ed, 0x00009bf9, 0x00009bfa, 0x00009bfb, /* 84 */ 0x0000d7cd, 0x0000e4c0, 0x0000cffd, 0x0000e4bf, @@ -4894,7 +4932,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009c5b, 0x0000d1cd, 0x00009c5c, 0x0000cced, /* bc */ 0x0000edb5, 0x00009c5d, 0x00009c5e, 0x00009c5f, - /*** Three byte table, leaf: e6b8xx - offset 0x03f95 ***/ + /*** Three byte table, leaf: e6b8xx - offset 0x04016 ***/ /* 80 */ 0x00009c60, 0x00009c61, 0x00009c62, 0x00009c63, /* 84 */ 0x00009c64, 0x0000c7e5, 0x00009c65, 0x00009c66, @@ -4913,7 +4951,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000d3ce, 0x00009c82, 0x0000c3ec, 0x00009c83, /* bc */ 0x00009c84, 0x00009c85, 0x00009c86, 0x00009c87, - /*** Three byte table, leaf: e6b9xx - offset 0x03fd5 ***/ + /*** Three byte table, leaf: e6b9xx - offset 0x04056 ***/ /* 80 */ 0x00009c88, 0x00009c89, 0x00009c8a, 0x0000c5c8, /* 84 */ 0x0000e4d8, 0x00009c8b, 0x00009c8c, 0x00009c8d, @@ -4932,7 +4970,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009cb4, 0x00009cb5, 0x00009cb6, 0x00009cb7, /* bc */ 0x00009cb8, 0x00009cb9, 0x0000cde5, 0x0000caaa, - /*** Three byte table, leaf: e6baxx - offset 0x04015 ***/ + /*** Three byte table, leaf: e6baxx - offset 0x04096 ***/ /* 80 */ 0x00009cba, 0x00009cbb, 0x00009cbc, 0x0000c0a3, /* 84 */ 0x00009cbd, 0x0000bda6, 0x0000e4d3, 0x00009cbe, @@ -4951,7 +4989,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009cdf, 0x00009ce0, 0x0000c4e7, 0x0000e4e2, /* bc */ 0x00009ce1, 0x0000e4e1, 0x00009ce2, 0x00009ce3, - /*** Three byte table, leaf: e6bbxx - offset 0x04055 ***/ + /*** Three byte table, leaf: e6bbxx - offset 0x040d6 ***/ /* 80 */ 0x00009ce4, 0x0000b3fc, 0x0000e4e8, 0x00009ce5, /* 84 */ 0x00009ce6, 0x00009ce7, 0x00009ce8, 
0x0000b5e1, @@ -4970,7 +5008,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009d47, 0x0000e4ef, 0x00009d48, 0x00009d49, /* bc */ 0x00009d4a, 0x00009d4b, 0x00009d4c, 0x00009d4d, - /*** Three byte table, leaf: e6bcxx - offset 0x04095 ***/ + /*** Three byte table, leaf: e6bcxx - offset 0x04116 ***/ /* 80 */ 0x00009d4e, 0x00009d4f, 0x0000c6af, 0x00009d50, /* 84 */ 0x00009d51, 0x00009d52, 0x0000c6e1, 0x00009d53, @@ -4989,7 +5027,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009d75, 0x00009d76, 0x00009d77, 0x00009d78, /* bc */ 0x00009d79, 0x00009d7a, 0x0000d1fa, 0x00009d7b, - /*** Three byte table, leaf: e6bdxx - offset 0x040d5 ***/ + /*** Three byte table, leaf: e6bdxx - offset 0x04156 ***/ /* 80 */ 0x00009d7c, 0x00009d7d, 0x00009d7e, 0x00009d80, /* 84 */ 0x00009d81, 0x00009d82, 0x0000e4eb, 0x0000e4ec, @@ -5008,7 +5046,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000e4fa, 0x00009da8, 0x0000e4fd, 0x00009da9, /* bc */ 0x0000e4fc, 0x00009daa, 0x00009dab, 0x00009dac, - /*** Three byte table, leaf: e6bexx - offset 0x04115 ***/ + /*** Three byte table, leaf: e6bexx - offset 0x04196 ***/ /* 80 */ 0x00009dad, 0x00009dae, 0x00009daf, 0x00009db0, /* 84 */ 0x0000b3ce, 0x00009db1, 0x00009db2, 0x00009db3, @@ -5027,7 +5065,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009dda, 0x0000e5a3, 0x00009ddb, 0x00009ddc, /* bc */ 0x00009ddd, 0x00009dde, 0x00009ddf, 0x00009de0, - /*** Three byte table, leaf: e6bfxx - offset 0x04155 ***/ + /*** Three byte table, leaf: e6bfxx - offset 0x041d6 ***/ /* 80 */ 0x0000bca4, 0x00009de1, 0x0000e5a5, 0x00009de2, /* 84 */ 0x00009de3, 0x00009de4, 0x00009de5, 0x00009de6, @@ -5046,7 +5084,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009e50, 0x00009e51, 0x00009e52, 0x00009e53, /* bc */ 0x00009e54, 0x00009e55, 0x00009e56, 0x00009e57, - /*** Three byte table, leaf: e780xx - offset 0x04195 ***/ + /*** Three byte table, leaf: e780xx - offset 0x04216 ***/ /* 80 */ 0x00009e58, 0x00009e59, 0x00009e5a, 0x00009e5b, /* 84 */ 0x00009e5c, 0x00009e5d, 0x00009e5e, 0x00009e5f, @@ -5065,7 +5103,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009e8c, 0x0000e5ae, 0x00009e8d, 0x00009e8e, /* bc */ 0x00009e8f, 0x00009e90, 0x00009e91, 0x00009e92, - /*** Three byte table, leaf: e781xx - offset 0x041d5 ***/ + /*** Three byte table, leaf: e781xx - offset 0x04256 ***/ /* 80 */ 0x00009e93, 0x00009e94, 0x00009e95, 0x00009e96, /* 84 */ 0x00009e97, 0x00009e98, 0x00009e99, 0x00009e9a, @@ -5084,7 +5122,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000bec4, 0x00009ec1, 0x00009ec2, 0x00009ec3, /* bc */ 0x0000d7c6, 0x00009ec4, 0x0000d4d6, 0x0000b2d3, - /*** Three byte table, leaf: e782xx - offset 0x04215 ***/ + /*** Three byte table, leaf: e782xx - offset 0x04296 ***/ /* 80 */ 0x0000ecbe, 0x00009ec5, 0x00009ec6, 0x00009ec7, /* 84 */ 0x00009ec8, 0x0000eac1, 0x00009ec9, 0x00009eca, @@ -5103,7 +5141,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000d5a8, 0x0000b5e3, 0x00009ee9, 0x0000ecc2, /* bc */ 0x0000c1b6, 0x0000b3e3, 0x00009eea, 0x00009eeb, - /*** Three byte table, leaf: e783xx - offset 0x04255 ***/ + /*** Three byte table, leaf: e783xx - offset 0x042d6 ***/ /* 80 */ 0x0000ecc3, 0x0000cbb8, 0x0000c0c3, 0x0000ccfe, /* 84 */ 0x00009eec, 0x00009eed, 0x00009eee, 0x00009eef, @@ -5122,7 +5160,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 
0x00009f51, 0x0000c5eb, 0x00009f52, 0x00009f53, /* bc */ 0x00009f54, 0x0000b7e9, 0x00009f55, 0x00009f56, - /*** Three byte table, leaf: e784xx - offset 0x04295 ***/ + /*** Three byte table, leaf: e784xx - offset 0x04316 ***/ /* 80 */ 0x00009f57, 0x00009f58, 0x00009f59, 0x00009f5a, /* 84 */ 0x00009f5b, 0x00009f5c, 0x00009f5d, 0x00009f5e, @@ -5141,7 +5179,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009f82, 0x00009f83, 0x00009f84, 0x00009f85, /* bc */ 0x00009f86, 0x00009f87, 0x00009f88, 0x00009f89, - /*** Three byte table, leaf: e785xx - offset 0x042d5 ***/ + /*** Three byte table, leaf: e785xx - offset 0x04356 ***/ /* 80 */ 0x00009f8a, 0x00009f8b, 0x00009f8c, 0x00009f8d, /* 84 */ 0x00009f8e, 0x0000ecd1, 0x00009f8f, 0x00009f90, @@ -5160,7 +5198,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000ecd4, 0x00009fb5, 0x0000ecd5, 0x00009fb6, /* bc */ 0x00009fb7, 0x0000c9bf, 0x00009fb8, 0x00009fb9, - /*** Three byte table, leaf: e786xx - offset 0x04315 ***/ + /*** Three byte table, leaf: e786xx - offset 0x04396 ***/ /* 80 */ 0x00009fba, 0x00009fbb, 0x00009fbc, 0x00009fbd, /* 84 */ 0x0000cfa8, 0x00009fbe, 0x00009fbf, 0x00009fc0, @@ -5179,7 +5217,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x00009fe6, 0x0000ece4, 0x00009fe7, 0x00009fe8, /* bc */ 0x00009fe9, 0x00009fea, 0x00009feb, 0x00009fec, - /*** Three byte table, leaf: e787xx - offset 0x04355 ***/ + /*** Three byte table, leaf: e787xx - offset 0x043d6 ***/ /* 80 */ 0x00009fed, 0x00009fee, 0x00009fef, 0x0000c8bc, /* 84 */ 0x00009ff0, 0x00009ff1, 0x00009ff2, 0x00009ff3, @@ -5198,7 +5236,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a05e, 0x0000ecde, 0x0000a05f, 0x0000a060, /* bc */ 0x0000a061, 0x0000a062, 0x0000a063, 0x0000a064, - /*** Three byte table, leaf: e788xx - offset 0x04395 ***/ + /*** Three byte table, leaf: e788xx - offset 0x04416 ***/ /* 80 */ 0x0000a065, 0x0000a066, 0x0000a067, 0x0000a068, /* 84 */ 0x0000a069, 0x0000a06a, 0x0000b1ac, 0x0000a06b, @@ -5217,7 +5255,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000b0d6, 0x0000b5f9, 0x0000a094, 0x0000d8b3, /* bc */ 0x0000a095, 0x0000cbac, 0x0000a096, 0x0000e3dd, - /*** Three byte table, leaf: e789xx - offset 0x043d5 ***/ + /*** Three byte table, leaf: e789xx - offset 0x04456 ***/ /* 80 */ 0x0000a097, 0x0000a098, 0x0000a099, 0x0000a09a, /* 84 */ 0x0000a09b, 0x0000a09c, 0x0000a09d, 0x0000c6ac, @@ -5236,7 +5274,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a0bc, 0x0000ccd8, 0x0000cefe, 0x0000a0bd, /* bc */ 0x0000a0be, 0x0000a0bf, 0x0000eaf5, 0x0000eaf6, - /*** Three byte table, leaf: e78axx - offset 0x04415 ***/ + /*** Three byte table, leaf: e78axx - offset 0x04496 ***/ /* 80 */ 0x0000cfac, 0x0000c0e7, 0x0000a0c0, 0x0000a0c1, /* 84 */ 0x0000eaf7, 0x0000a0c2, 0x0000a0c3, 0x0000a0c4, @@ -5255,7 +5293,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000e1ef, 0x0000d3cc, 0x0000a0e8, 0x0000a0e9, /* bc */ 0x0000a0ea, 0x0000a0eb, 0x0000a0ec, 0x0000a0ed, - /*** Three byte table, leaf: e78bxx - offset 0x04455 ***/ + /*** Three byte table, leaf: e78bxx - offset 0x044d6 ***/ /* 80 */ 0x0000a0ee, 0x0000e1f1, 0x0000bff1, 0x0000e1f0, /* 84 */ 0x0000b5d2, 0x0000a0ef, 0x0000a0f0, 0x0000a0f1, @@ -5274,7 +5312,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000c0ea, 0x0000aa4d, 0x0000e1fe, 0x0000e2a1, /* bc */ 0x0000c0c7, 0x0000aa4e, 0x0000aa4f, 0x0000aa50, - 
	/*** Three byte table, leaf: e78cxx - offset 0x04495 ***/
+	/*** Three byte table, leaf: e78cxx - offset 0x04516 ***/

 /* 80 */	0x0000aa51, 0x0000e1fb, 0x0000aa52, 0x0000e1fd,
 /* 84 */	0x0000aa53, 0x0000aa54, 0x0000aa55, 0x0000aa56,
@@ -5293,7 +5331,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] =
 /* b8 */	0x0000e2ad, 0x0000e2aa, 0x0000aa72, 0x0000aa73,
 /* bc */	0x0000aa74, 0x0000aa75, 0x0000bbab, 0x0000d4b3,

[... remaining hunks of the regenerated gb18030_from_unicode_tree_table, all of the same shape: for each three-byte leaf from e78dxx through e9b8xx (continuing past this excerpt), the only changed lines are the generated offset comments, each advanced by 0x81 (e.g. "leaf: e78dxx - offset 0x044d5" becomes "offset 0x04556"); every hunk header shifts by +38 lines and all hex mapping rows are unchanged context ...]

- /*** Three
byte table, leaf: e9b9xx - offset 0x06fd5 ***/ + /*** Three byte table, leaf: e9b9xx - offset 0x07056 ***/ /* 80 */ 0x0000fb63, 0x0000f0be, 0x0000f0bf, 0x0000bee9, /* 84 */ 0x0000f0c0, 0x0000b6ec, 0x0000f0c1, 0x0000f0c2, @@ -8580,7 +8618,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fb78, 0x0000fb79, 0x0000fb7a, 0x0000fb7b, /* bc */ 0x0000fb7c, 0x0000fb7d, 0x0000f5ba, 0x0000c2b9, - /*** Three byte table, leaf: e9baxx - offset 0x07015 ***/ + /*** Three byte table, leaf: e9baxx - offset 0x07096 ***/ /* 80 */ 0x0000fb7e, 0x0000fb80, 0x0000f7e4, 0x0000fb81, /* 84 */ 0x0000fb82, 0x0000fb83, 0x0000fb84, 0x0000f7e5, @@ -8599,7 +8637,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000f4ef, 0x0000fc4c, 0x0000fc4d, 0x0000c2e9, /* bc */ 0x0000fc4e, 0x0000f7e1, 0x0000f7e2, 0x0000fc4f, - /*** Three byte table, leaf: e9bbxx - offset 0x07055 ***/ + /*** Three byte table, leaf: e9bbxx - offset 0x070d6 ***/ /* 80 */ 0x0000fc50, 0x0000fc51, 0x0000fc52, 0x0000fc53, /* 84 */ 0x0000bbc6, 0x0000fc54, 0x0000fc55, 0x0000fc56, @@ -8618,7 +8656,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fc75, 0x0000ede9, 0x0000fc76, 0x0000edea, /* bc */ 0x0000edeb, 0x0000fc77, 0x0000f6bc, 0x0000fc78, - /*** Three byte table, leaf: e9bcxx - offset 0x07095 ***/ + /*** Three byte table, leaf: e9bcxx - offset 0x07116 ***/ /* 80 */ 0x0000fc79, 0x0000fc7a, 0x0000fc7b, 0x0000fc7c, /* 84 */ 0x0000fc7d, 0x0000fc7e, 0x0000fc80, 0x0000fc81, @@ -8637,7 +8675,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fd45, 0x0000f7fa, 0x0000fd46, 0x0000b1c7, /* bc */ 0x0000fd47, 0x0000f7fc, 0x0000f7fd, 0x0000fd48, - /*** Three byte table, leaf: e9bdxx - offset 0x070d5 ***/ + /*** Three byte table, leaf: e9bdxx - offset 0x07156 ***/ /* 80 */ 0x0000fd49, 0x0000fd4a, 0x0000fd4b, 0x0000fd4c, /* 84 */ 0x0000f7fe, 0x0000fd4d, 0x0000fd4e, 0x0000fd4f, @@ -8656,7 +8694,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fd7e, 0x0000fd80, 0x0000fd81, 0x0000fd82, /* bc */ 0x0000fd83, 0x0000fd84, 0x0000fd85, 0x0000b3dd, - /*** Three byte table, leaf: e9bexx - offset 0x07115 ***/ + /*** Three byte table, leaf: e9bexx - offset 0x07196 ***/ /* 80 */ 0x0000f6b3, 0x0000fd86, 0x0000fd87, 0x0000f6b4, /* 84 */ 0x0000c1e4, 0x0000f6b5, 0x0000f6b6, 0x0000f6b7, @@ -8671,11 +8709,11 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* a8 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* ac */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* b0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* b4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* b8 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* b4 */ 0x0000fe59, 0x0000fe61, 0x0000fe66, 0x0000fe67, + /* b8 */ 0x0000fe6d, 0x0000fe7e, 0x0000fe90, 0x0000fea0, /* bc */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /*** Three byte table, leaf: ee80xx - offset 0x07155 ***/ + /*** Three byte table, leaf: ee80xx - offset 0x071d6 ***/ /* 80 */ 0x0000aaa1, 0x0000aaa2, 0x0000aaa3, 0x0000aaa4, /* 84 */ 0x0000aaa5, 0x0000aaa6, 0x0000aaa7, 0x0000aaa8, @@ -8694,7 +8732,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000aad9, 0x0000aada, 0x0000aadb, 0x0000aadc, /* bc */ 0x0000aadd, 0x0000aade, 0x0000aadf, 0x0000aae0, - /*** Three byte table, leaf: ee81xx - offset 0x07195 ***/ + /*** Three byte table, leaf: ee81xx - offset 0x07216 ***/ /* 80 */ 0x0000aae1, 0x0000aae2, 0x0000aae3, 0x0000aae4, /* 84 */ 0x0000aae5, 
0x0000aae6, 0x0000aae7, 0x0000aae8, @@ -8713,7 +8751,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000abbb, 0x0000abbc, 0x0000abbd, 0x0000abbe, /* bc */ 0x0000abbf, 0x0000abc0, 0x0000abc1, 0x0000abc2, - /*** Three byte table, leaf: ee82xx - offset 0x071d5 ***/ + /*** Three byte table, leaf: ee82xx - offset 0x07256 ***/ /* 80 */ 0x0000abc3, 0x0000abc4, 0x0000abc5, 0x0000abc6, /* 84 */ 0x0000abc7, 0x0000abc8, 0x0000abc9, 0x0000abca, @@ -8732,7 +8770,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000abfb, 0x0000abfc, 0x0000abfd, 0x0000abfe, /* bc */ 0x0000aca1, 0x0000aca2, 0x0000aca3, 0x0000aca4, - /*** Three byte table, leaf: ee83xx - offset 0x07215 ***/ + /*** Three byte table, leaf: ee83xx - offset 0x07296 ***/ /* 80 */ 0x0000aca5, 0x0000aca6, 0x0000aca7, 0x0000aca8, /* 84 */ 0x0000aca9, 0x0000acaa, 0x0000acab, 0x0000acac, @@ -8751,7 +8789,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000acdd, 0x0000acde, 0x0000acdf, 0x0000ace0, /* bc */ 0x0000ace1, 0x0000ace2, 0x0000ace3, 0x0000ace4, - /*** Three byte table, leaf: ee84xx - offset 0x07255 ***/ + /*** Three byte table, leaf: ee84xx - offset 0x072d6 ***/ /* 80 */ 0x0000ace5, 0x0000ace6, 0x0000ace7, 0x0000ace8, /* 84 */ 0x0000ace9, 0x0000acea, 0x0000aceb, 0x0000acec, @@ -8770,7 +8808,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000adbf, 0x0000adc0, 0x0000adc1, 0x0000adc2, /* bc */ 0x0000adc3, 0x0000adc4, 0x0000adc5, 0x0000adc6, - /*** Three byte table, leaf: ee85xx - offset 0x07295 ***/ + /*** Three byte table, leaf: ee85xx - offset 0x07316 ***/ /* 80 */ 0x0000adc7, 0x0000adc8, 0x0000adc9, 0x0000adca, /* 84 */ 0x0000adcb, 0x0000adcc, 0x0000adcd, 0x0000adce, @@ -8789,7 +8827,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000aea1, 0x0000aea2, 0x0000aea3, 0x0000aea4, /* bc */ 0x0000aea5, 0x0000aea6, 0x0000aea7, 0x0000aea8, - /*** Three byte table, leaf: ee86xx - offset 0x072d5 ***/ + /*** Three byte table, leaf: ee86xx - offset 0x07356 ***/ /* 80 */ 0x0000aea9, 0x0000aeaa, 0x0000aeab, 0x0000aeac, /* 84 */ 0x0000aead, 0x0000aeae, 0x0000aeaf, 0x0000aeb0, @@ -8808,7 +8846,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000aee1, 0x0000aee2, 0x0000aee3, 0x0000aee4, /* bc */ 0x0000aee5, 0x0000aee6, 0x0000aee7, 0x0000aee8, - /*** Three byte table, leaf: ee87xx - offset 0x07315 ***/ + /*** Three byte table, leaf: ee87xx - offset 0x07396 ***/ /* 80 */ 0x0000aee9, 0x0000aeea, 0x0000aeeb, 0x0000aeec, /* 84 */ 0x0000aeed, 0x0000aeee, 0x0000aeef, 0x0000aef0, @@ -8827,7 +8865,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000afc3, 0x0000afc4, 0x0000afc5, 0x0000afc6, /* bc */ 0x0000afc7, 0x0000afc8, 0x0000afc9, 0x0000afca, - /*** Three byte table, leaf: ee88xx - offset 0x07355 ***/ + /*** Three byte table, leaf: ee88xx - offset 0x073d6 ***/ /* 80 */ 0x0000afcb, 0x0000afcc, 0x0000afcd, 0x0000afce, /* 84 */ 0x0000afcf, 0x0000afd0, 0x0000afd1, 0x0000afd2, @@ -8846,7 +8884,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000f8a5, 0x0000f8a6, 0x0000f8a7, 0x0000f8a8, /* bc */ 0x0000f8a9, 0x0000f8aa, 0x0000f8ab, 0x0000f8ac, - /*** Three byte table, leaf: ee89xx - offset 0x07395 ***/ + /*** Three byte table, leaf: ee89xx - offset 0x07416 ***/ /* 80 */ 0x0000f8ad, 0x0000f8ae, 0x0000f8af, 0x0000f8b0, /* 84 */ 0x0000f8b1, 0x0000f8b2, 0x0000f8b3, 0x0000f8b4, @@ -8865,7 +8903,7 @@ static const uint32 
gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000f8e5, 0x0000f8e6, 0x0000f8e7, 0x0000f8e8, /* bc */ 0x0000f8e9, 0x0000f8ea, 0x0000f8eb, 0x0000f8ec, - /*** Three byte table, leaf: ee8axx - offset 0x073d5 ***/ + /*** Three byte table, leaf: ee8axx - offset 0x07456 ***/ /* 80 */ 0x0000f8ed, 0x0000f8ee, 0x0000f8ef, 0x0000f8f0, /* 84 */ 0x0000f8f1, 0x0000f8f2, 0x0000f8f3, 0x0000f8f4, @@ -8884,7 +8922,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000f9c7, 0x0000f9c8, 0x0000f9c9, 0x0000f9ca, /* bc */ 0x0000f9cb, 0x0000f9cc, 0x0000f9cd, 0x0000f9ce, - /*** Three byte table, leaf: ee8bxx - offset 0x07415 ***/ + /*** Three byte table, leaf: ee8bxx - offset 0x07496 ***/ /* 80 */ 0x0000f9cf, 0x0000f9d0, 0x0000f9d1, 0x0000f9d2, /* 84 */ 0x0000f9d3, 0x0000f9d4, 0x0000f9d5, 0x0000f9d6, @@ -8903,7 +8941,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000faa9, 0x0000faaa, 0x0000faab, 0x0000faac, /* bc */ 0x0000faad, 0x0000faae, 0x0000faaf, 0x0000fab0, - /*** Three byte table, leaf: ee8cxx - offset 0x07455 ***/ + /*** Three byte table, leaf: ee8cxx - offset 0x074d6 ***/ /* 80 */ 0x0000fab1, 0x0000fab2, 0x0000fab3, 0x0000fab4, /* 84 */ 0x0000fab5, 0x0000fab6, 0x0000fab7, 0x0000fab8, @@ -8922,7 +8960,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fae9, 0x0000faea, 0x0000faeb, 0x0000faec, /* bc */ 0x0000faed, 0x0000faee, 0x0000faef, 0x0000faf0, - /*** Three byte table, leaf: ee8dxx - offset 0x07495 ***/ + /*** Three byte table, leaf: ee8dxx - offset 0x07516 ***/ /* 80 */ 0x0000faf1, 0x0000faf2, 0x0000faf3, 0x0000faf4, /* 84 */ 0x0000faf5, 0x0000faf6, 0x0000faf7, 0x0000faf8, @@ -8941,7 +8979,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fbcb, 0x0000fbcc, 0x0000fbcd, 0x0000fbce, /* bc */ 0x0000fbcf, 0x0000fbd0, 0x0000fbd1, 0x0000fbd2, - /*** Three byte table, leaf: ee8exx - offset 0x074d5 ***/ + /*** Three byte table, leaf: ee8exx - offset 0x07556 ***/ /* 80 */ 0x0000fbd3, 0x0000fbd4, 0x0000fbd5, 0x0000fbd6, /* 84 */ 0x0000fbd7, 0x0000fbd8, 0x0000fbd9, 0x0000fbda, @@ -8960,7 +8998,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fcad, 0x0000fcae, 0x0000fcaf, 0x0000fcb0, /* bc */ 0x0000fcb1, 0x0000fcb2, 0x0000fcb3, 0x0000fcb4, - /*** Three byte table, leaf: ee8fxx - offset 0x07515 ***/ + /*** Three byte table, leaf: ee8fxx - offset 0x07596 ***/ /* 80 */ 0x0000fcb5, 0x0000fcb6, 0x0000fcb7, 0x0000fcb8, /* 84 */ 0x0000fcb9, 0x0000fcba, 0x0000fcbb, 0x0000fcbc, @@ -8979,7 +9017,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fced, 0x0000fcee, 0x0000fcef, 0x0000fcf0, /* bc */ 0x0000fcf1, 0x0000fcf2, 0x0000fcf3, 0x0000fcf4, - /*** Three byte table, leaf: ee90xx - offset 0x07555 ***/ + /*** Three byte table, leaf: ee90xx - offset 0x075d6 ***/ /* 80 */ 0x0000fcf5, 0x0000fcf6, 0x0000fcf7, 0x0000fcf8, /* 84 */ 0x0000fcf9, 0x0000fcfa, 0x0000fcfb, 0x0000fcfc, @@ -8998,7 +9036,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fdcf, 0x0000fdd0, 0x0000fdd1, 0x0000fdd2, /* bc */ 0x0000fdd3, 0x0000fdd4, 0x0000fdd5, 0x0000fdd6, - /*** Three byte table, leaf: ee91xx - offset 0x07595 ***/ + /*** Three byte table, leaf: ee91xx - offset 0x07616 ***/ /* 80 */ 0x0000fdd7, 0x0000fdd8, 0x0000fdd9, 0x0000fdda, /* 84 */ 0x0000fddb, 0x0000fddc, 0x0000fddd, 0x0000fdde, @@ -9017,7 +9055,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000feb1, 0x0000feb2, 0x0000feb3, 0x0000feb4, /* bc */ 
0x0000feb5, 0x0000feb6, 0x0000feb7, 0x0000feb8, - /*** Three byte table, leaf: ee92xx - offset 0x075d5 ***/ + /*** Three byte table, leaf: ee92xx - offset 0x07656 ***/ /* 80 */ 0x0000feb9, 0x0000feba, 0x0000febb, 0x0000febc, /* 84 */ 0x0000febd, 0x0000febe, 0x0000febf, 0x0000fec0, @@ -9036,7 +9074,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000fef1, 0x0000fef2, 0x0000fef3, 0x0000fef4, /* bc */ 0x0000fef5, 0x0000fef6, 0x0000fef7, 0x0000fef8, - /*** Three byte table, leaf: ee93xx - offset 0x07615 ***/ + /*** Three byte table, leaf: ee93xx - offset 0x07696 ***/ /* 80 */ 0x0000fef9, 0x0000fefa, 0x0000fefb, 0x0000fefc, /* 84 */ 0x0000fefd, 0x0000fefe, 0x0000a140, 0x0000a141, @@ -9055,7 +9093,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a172, 0x0000a173, 0x0000a174, 0x0000a175, /* bc */ 0x0000a176, 0x0000a177, 0x0000a178, 0x0000a179, - /*** Three byte table, leaf: ee94xx - offset 0x07655 ***/ + /*** Three byte table, leaf: ee94xx - offset 0x076d6 ***/ /* 80 */ 0x0000a17a, 0x0000a17b, 0x0000a17c, 0x0000a17d, /* 84 */ 0x0000a17e, 0x0000a180, 0x0000a181, 0x0000a182, @@ -9074,7 +9112,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a252, 0x0000a253, 0x0000a254, 0x0000a255, /* bc */ 0x0000a256, 0x0000a257, 0x0000a258, 0x0000a259, - /*** Three byte table, leaf: ee95xx - offset 0x07695 ***/ + /*** Three byte table, leaf: ee95xx - offset 0x07716 ***/ /* 80 */ 0x0000a25a, 0x0000a25b, 0x0000a25c, 0x0000a25d, /* 84 */ 0x0000a25e, 0x0000a25f, 0x0000a260, 0x0000a261, @@ -9093,7 +9131,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a293, 0x0000a294, 0x0000a295, 0x0000a296, /* bc */ 0x0000a297, 0x0000a298, 0x0000a299, 0x0000a29a, - /*** Three byte table, leaf: ee96xx - offset 0x076d5 ***/ + /*** Three byte table, leaf: ee96xx - offset 0x07756 ***/ /* 80 */ 0x0000a29b, 0x0000a29c, 0x0000a29d, 0x0000a29e, /* 84 */ 0x0000a29f, 0x0000a2a0, 0x0000a340, 0x0000a341, @@ -9112,7 +9150,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a372, 0x0000a373, 0x0000a374, 0x0000a375, /* bc */ 0x0000a376, 0x0000a377, 0x0000a378, 0x0000a379, - /*** Three byte table, leaf: ee97xx - offset 0x07715 ***/ + /*** Three byte table, leaf: ee97xx - offset 0x07796 ***/ /* 80 */ 0x0000a37a, 0x0000a37b, 0x0000a37c, 0x0000a37d, /* 84 */ 0x0000a37e, 0x0000a380, 0x0000a381, 0x0000a382, @@ -9123,7 +9161,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* 98 */ 0x0000a393, 0x0000a394, 0x0000a395, 0x0000a396, /* 9c */ 0x0000a397, 0x0000a398, 0x0000a399, 0x0000a39a, /* a0 */ 0x0000a39b, 0x0000a39c, 0x0000a39d, 0x0000a39e, - /* a4 */ 0x0000a39f, 0x0000a3a0, 0x0000a440, 0x0000a441, + /* a4 */ 0x0000a39f, 0x00000000, 0x0000a440, 0x0000a441, /* a8 */ 0x0000a442, 0x0000a443, 0x0000a444, 0x0000a445, /* ac */ 0x0000a446, 0x0000a447, 0x0000a448, 0x0000a449, /* b0 */ 0x0000a44a, 0x0000a44b, 0x0000a44c, 0x0000a44d, @@ -9131,7 +9169,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a452, 0x0000a453, 0x0000a454, 0x0000a455, /* bc */ 0x0000a456, 0x0000a457, 0x0000a458, 0x0000a459, - /*** Three byte table, leaf: ee98xx - offset 0x07755 ***/ + /*** Three byte table, leaf: ee98xx - offset 0x077d6 ***/ /* 80 */ 0x0000a45a, 0x0000a45b, 0x0000a45c, 0x0000a45d, /* 84 */ 0x0000a45e, 0x0000a45f, 0x0000a460, 0x0000a461, @@ -9150,7 +9188,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a493, 0x0000a494, 0x0000a495, 0x0000a496, /* 
bc */ 0x0000a497, 0x0000a498, 0x0000a499, 0x0000a49a, - /*** Three byte table, leaf: ee99xx - offset 0x07795 ***/ + /*** Three byte table, leaf: ee99xx - offset 0x07816 ***/ /* 80 */ 0x0000a49b, 0x0000a49c, 0x0000a49d, 0x0000a49e, /* 84 */ 0x0000a49f, 0x0000a4a0, 0x0000a540, 0x0000a541, @@ -9169,7 +9207,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a572, 0x0000a573, 0x0000a574, 0x0000a575, /* bc */ 0x0000a576, 0x0000a577, 0x0000a578, 0x0000a579, - /*** Three byte table, leaf: ee9axx - offset 0x077d5 ***/ + /*** Three byte table, leaf: ee9axx - offset 0x07856 ***/ /* 80 */ 0x0000a57a, 0x0000a57b, 0x0000a57c, 0x0000a57d, /* 84 */ 0x0000a57e, 0x0000a580, 0x0000a581, 0x0000a582, @@ -9188,7 +9226,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a652, 0x0000a653, 0x0000a654, 0x0000a655, /* bc */ 0x0000a656, 0x0000a657, 0x0000a658, 0x0000a659, - /*** Three byte table, leaf: ee9bxx - offset 0x07815 ***/ + /*** Three byte table, leaf: ee9bxx - offset 0x07896 ***/ /* 80 */ 0x0000a65a, 0x0000a65b, 0x0000a65c, 0x0000a65d, /* 84 */ 0x0000a65e, 0x0000a65f, 0x0000a660, 0x0000a661, @@ -9207,7 +9245,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a693, 0x0000a694, 0x0000a695, 0x0000a696, /* bc */ 0x0000a697, 0x0000a698, 0x0000a699, 0x0000a69a, - /*** Three byte table, leaf: ee9cxx - offset 0x07855 ***/ + /*** Three byte table, leaf: ee9cxx - offset 0x078d6 ***/ /* 80 */ 0x0000a69b, 0x0000a69c, 0x0000a69d, 0x0000a69e, /* 84 */ 0x0000a69f, 0x0000a6a0, 0x0000a740, 0x0000a741, @@ -9226,7 +9264,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a772, 0x0000a773, 0x0000a774, 0x0000a775, /* bc */ 0x0000a776, 0x0000a777, 0x0000a778, 0x0000a779, - /*** Three byte table, leaf: ee9dxx - offset 0x07895 ***/ + /*** Three byte table, leaf: ee9dxx - offset 0x07916 ***/ /* 80 */ 0x0000a77a, 0x0000a77b, 0x0000a77c, 0x0000a77d, /* 84 */ 0x0000a77e, 0x0000a780, 0x0000a781, 0x0000a782, @@ -9245,14 +9283,14 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a4fa, 0x0000a4fb, 0x0000a4fc, 0x0000a4fd, /* bc */ 0x0000a4fe, 0x0000a5f7, 0x0000a5f8, 0x0000a5f9, - /*** Three byte table, leaf: ee9exx - offset 0x078d5 ***/ + /*** Three byte table, leaf: ee9exx - offset 0x07956 ***/ /* 80 */ 0x0000a5fa, 0x0000a5fb, 0x0000a5fc, 0x0000a5fd, /* 84 */ 0x0000a5fe, 0x0000a6b9, 0x0000a6ba, 0x0000a6bb, /* 88 */ 0x0000a6bc, 0x0000a6bd, 0x0000a6be, 0x0000a6bf, - /* 8c */ 0x0000a6c0, 0x0000a6d9, 0x0000a6da, 0x0000a6db, - /* 90 */ 0x0000a6dc, 0x0000a6dd, 0x0000a6de, 0x0000a6df, - /* 94 */ 0x0000a6ec, 0x0000a6ed, 0x0000a6f3, 0x0000a6f6, + /* 8c */ 0x0000a6c0, 0x00000000, 0x00000000, 0x00000000, + /* 90 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 94 */ 0x00000000, 0x00000000, 0x00000000, 0x0000a6f6, /* 98 */ 0x0000a6f7, 0x0000a6f8, 0x0000a6f9, 0x0000a6fa, /* 9c */ 0x0000a6fb, 0x0000a6fc, 0x0000a6fd, 0x0000a6fe, /* a0 */ 0x0000a7c2, 0x0000a7c3, 0x0000a7c4, 0x0000a7c5, @@ -9264,10 +9302,10 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a7fb, 0x0000a7fc, 0x0000a7fd, 0x0000a7fe, /* bc */ 0x0000a896, 0x0000a897, 0x0000a898, 0x0000a899, - /*** Three byte table, leaf: ee9fxx - offset 0x07915 ***/ + /*** Three byte table, leaf: ee9fxx - offset 0x07996 ***/ /* 80 */ 0x0000a89a, 0x0000a89b, 0x0000a89c, 0x0000a89d, - /* 84 */ 0x0000a89e, 0x0000a89f, 0x0000a8a0, 0x0000a8bc, + /* 84 */ 0x0000a89e, 0x0000a89f, 0x0000a8a0, 0x8135f437, /* 88 */ 0x8336c830, 0x0000a8c1, 
0x0000a8c2, 0x0000a8c3, /* 8c */ 0x0000a8c4, 0x0000a8ea, 0x0000a8eb, 0x0000a8ec, /* 90 */ 0x0000a8ed, 0x0000a8ee, 0x0000a8ef, 0x0000a8f0, @@ -9283,7 +9321,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a99b, 0x0000a99c, 0x0000a99d, 0x0000a99e, /* bc */ 0x0000a99f, 0x0000a9a0, 0x0000a9a1, 0x0000a9a2, - /*** Three byte table, leaf: eea0xx - offset 0x07955 ***/ + /*** Three byte table, leaf: eea0xx - offset 0x079d6 ***/ /* 80 */ 0x0000a9a3, 0x0000a9f0, 0x0000a9f1, 0x0000a9f2, /* 84 */ 0x0000a9f3, 0x0000a9f4, 0x0000a9f5, 0x0000a9f6, @@ -9292,31 +9330,30 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* 90 */ 0x0000d7fa, 0x0000d7fb, 0x0000d7fc, 0x0000d7fd, /* 94 */ 0x0000d7fe, 0x8336c934, 0x0000fe51, 0x0000fe52, /* 98 */ 0x0000fe53, 0x8336c935, 0x8336c936, 0x8336c937, - /* 9c */ 0x8336c938, 0x8336c939, 0x0000fe59, 0x8336ca30, + /* 9c */ 0x8336c938, 0x8336c939, 0x00000000, 0x8336ca30, /* a0 */ 0x8336ca31, 0x8336ca32, 0x8336ca33, 0x8336ca34, - /* a4 */ 0x8336ca35, 0x8336ca36, 0x0000fe61, 0x8336ca37, - /* a8 */ 0x8336ca38, 0x8336ca39, 0x8336cb30, 0x0000fe66, - /* ac */ 0x0000fe67, 0x8336cb31, 0x8336cb32, 0x8336cb33, - /* b0 */ 0x8336cb34, 0x0000fe6c, 0x0000fe6d, 0x8336cb35, + /* a4 */ 0x8336ca35, 0x8336ca36, 0x00000000, 0x8336ca37, + /* a8 */ 0x8336ca38, 0x8336ca39, 0x8336cb30, 0x00000000, + /* ac */ 0x00000000, 0x8336cb31, 0x8336cb32, 0x8336cb33, + /* b0 */ 0x8336cb34, 0x0000fe6c, 0x00000000, 0x8336cb35, /* b4 */ 0x8336cb36, 0x8336cb37, 0x8336cb38, 0x8336cb39, /* b8 */ 0x8336cc30, 0x8336cc31, 0x8336cc32, 0x0000fe76, /* bc */ 0x8336cc33, 0x8336cc34, 0x8336cc35, 0x8336cc36, - /*** Three byte table, leaf: eea1xx - offset 0x07995 ***/ + /*** Three byte table, leaf: eea1xx - offset 0x07a16 ***/ - /* 80 */ 0x8336cc37, 0x8336cc38, 0x8336cc39, 0x0000fe7e, + /* 80 */ 0x8336cc37, 0x8336cc38, 0x8336cc39, 0x00000000, /* 84 */ 0x8336cd30, 0x8336cd31, 0x8336cd32, 0x8336cd33, /* 88 */ 0x8336cd34, 0x8336cd35, 0x8336cd36, 0x8336cd37, /* 8c */ 0x8336cd38, 0x8336cd39, 0x8336ce30, 0x8336ce31, /* 90 */ 0x8336ce32, 0x8336ce33, 0x8336ce34, 0x8336ce35, - /* 94 */ 0x0000fe90, 0x0000fe91, 0x8336ce36, 0x8336ce37, + /* 94 */ 0x00000000, 0x0000fe91, 0x8336ce36, 0x8336ce37, /* 98 */ 0x8336ce38, 0x8336ce39, 0x8336cf30, 0x8336cf31, /* 9c */ 0x8336cf32, 0x8336cf33, 0x8336cf34, 0x8336cf35, /* a0 */ 0x8336cf36, 0x8336cf37, 0x8336cf38, 0x8336cf39, - /* a4 */ 0x0000fea0, - /* 27 trailing zero values shared with next segment */ + /* 28 trailing zero values shared with next segment */ - /*** Three byte table, leaf: efa4xx - offset 0x079ba ***/ + /*** Three byte table, leaf: efa4xx - offset 0x07a3a ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -9335,7 +9372,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x84308636, 0x84308637, 0x84308638, 0x84308639, /* bc */ 0x84308730, 0x84308731, 0x84308732, 0x84308733, - /*** Three byte table, leaf: efa5xx - offset 0x079fa ***/ + /*** Three byte table, leaf: efa5xx - offset 0x07a7a ***/ /* 80 */ 0x84308734, 0x84308735, 0x84308736, 0x84308737, /* 84 */ 0x84308738, 0x84308739, 0x84308830, 0x84308831, @@ -9354,7 +9391,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x84308d30, 0x0000fd9d, 0x84308d31, 0x84308d32, /* bc */ 0x84308d33, 0x84308d34, 0x84308d35, 0x84308d36, - /*** Three byte table, leaf: efa6xx - offset 0x07a3a ***/ + /*** Three byte table, leaf: efa6xx - offset 0x07aba ***/ /* 80 */ 0x84308d37, 0x84308d38, 
0x84308d39, 0x84308e30, /* 84 */ 0x84308e31, 0x84308e32, 0x84308e33, 0x84308e34, @@ -9373,7 +9410,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x84309332, 0x84309333, 0x84309334, 0x84309335, /* bc */ 0x84309336, 0x84309337, 0x84309338, 0x84309339, - /*** Three byte table, leaf: efa7xx - offset 0x07a7a ***/ + /*** Three byte table, leaf: efa7xx - offset 0x07afa ***/ /* 80 */ 0x84309430, 0x84309431, 0x84309432, 0x84309433, /* 84 */ 0x84309434, 0x84309435, 0x84309436, 0x84309437, @@ -9392,7 +9429,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x84309934, 0x84309935, 0x84309936, 0x84309937, /* bc */ 0x84309938, 0x84309939, 0x84309a30, 0x84309a31, - /*** Three byte table, leaf: efa8xx - offset 0x07aba ***/ + /*** Three byte table, leaf: efa8xx - offset 0x07b3a ***/ /* 80 */ 0x84309a32, 0x84309a33, 0x84309a34, 0x84309a35, /* 84 */ 0x84309a36, 0x84309a37, 0x84309a38, 0x84309a39, @@ -9404,18 +9441,19 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* 9c */ 0x84309c32, 0x84309c33, 0x84309c34, 0x0000fe48, /* a0 */ 0x0000fe49, 0x0000fe4a, 0x84309c35, 0x0000fe4b, /* a4 */ 0x0000fe4c, 0x84309c36, 0x84309c37, 0x0000fe4d, - /* a8 */ 0x0000fe4e, 0x0000fe4f, - /* 22 trailing zero values shared with next segment */ + /* a8 */ 0x0000fe4e, 0x0000fe4f, 0x00000000, 0x00000000, + /* ac */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 16 trailing zero values shared with next segment */ - /*** Three byte table, leaf: efb8xx - offset 0x07ae4 ***/ + /*** Three byte table, leaf: efb8xx - offset 0x07b6a ***/ /* 80 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 84 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 88 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 8c */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* 90 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* 94 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, - /* 98 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, + /* 90 */ 0x0000a6d9, 0x0000a6db, 0x0000a6da, 0x0000a6dc, + /* 94 */ 0x0000a6dd, 0x0000a6de, 0x0000a6df, 0x0000a6ec, + /* 98 */ 0x0000a6ed, 0x0000a6f3, 0x00000000, 0x00000000, /* 9c */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* a0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* a4 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -9426,7 +9464,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a6f1, 0x0000a6e2, 0x0000a6e3, 0x0000a6ee, /* bc */ 0x0000a6ef, 0x0000a6e6, 0x0000a6e7, 0x0000a6e4, - /*** Three byte table, leaf: efb9xx - offset 0x07b24 ***/ + /*** Three byte table, leaf: efb9xx - offset 0x07baa ***/ /* 80 */ 0x0000a6e5, 0x0000a6e8, 0x0000a6e9, 0x0000a6ea, /* 84 */ 0x0000a6eb, 0x84318539, 0x84318630, 0x84318631, @@ -9445,7 +9483,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x84318738, 0x84318739, 0x84318830, 0x84318831, /* bc */ 0x84318832, 0x84318833, 0x84318834, 0x84318835, - /*** Three byte table, leaf: efbaxx - offset 0x07b64 ***/ + /*** Three byte table, leaf: efbaxx - offset 0x07bea ***/ /* 80 */ 0x84318836, 0x84318837, 0x84318838, 0x84318839, /* 84 */ 0x84318930, 0x84318931, 0x84318932, 0x84318933, @@ -9464,7 +9502,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x84318e32, 0x84318e33, 0x84318e34, 0x84318e35, /* bc */ 0x84318e36, 0x84318e37, 0x84318e38, 0x84318e39, - /*** Three byte table, leaf: efbbxx - offset 0x07ba4 ***/ + /*** Three byte table, leaf: efbbxx - offset 0x07c2a ***/ /* 80 */ 0x84318f30, 0x84318f31, 
0x84318f32, 0x84318f33, /* 84 */ 0x84318f34, 0x84318f35, 0x84318f36, 0x84318f37, @@ -9483,7 +9521,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x84319436, 0x84319437, 0x84319438, 0x84319439, /* bc */ 0x84319530, 0x84319531, 0x84319532, 0x84319533, - /*** Three byte table, leaf: efbcxx - offset 0x07be4 ***/ + /*** Three byte table, leaf: efbcxx - offset 0x07c6a ***/ /* 80 */ 0x84319534, 0x0000a3a1, 0x0000a3a2, 0x0000a3a3, /* 84 */ 0x0000a1e7, 0x0000a3a5, 0x0000a3a6, 0x0000a3a7, @@ -9502,7 +9540,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x0000a3d8, 0x0000a3d9, 0x0000a3da, 0x0000a3db, /* bc */ 0x0000a3dc, 0x0000a3dd, 0x0000a3de, 0x0000a3df, - /*** Three byte table, leaf: efbdxx - offset 0x07c24 ***/ + /*** Three byte table, leaf: efbdxx - offset 0x07caa ***/ /* 80 */ 0x0000a3e0, 0x0000a3e1, 0x0000a3e2, 0x0000a3e3, /* 84 */ 0x0000a3e4, 0x0000a3e5, 0x0000a3e6, 0x0000a3e7, @@ -9521,7 +9559,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x84319830, 0x84319831, 0x84319832, 0x84319833, /* bc */ 0x84319834, 0x84319835, 0x84319836, 0x84319837, - /*** Three byte table, leaf: efbexx - offset 0x07c64 ***/ + /*** Three byte table, leaf: efbexx - offset 0x07cea ***/ /* 80 */ 0x84319838, 0x84319839, 0x84319930, 0x84319931, /* 84 */ 0x84319932, 0x84319933, 0x84319934, 0x84319935, @@ -9540,7 +9578,7 @@ static const uint32 gb18030_from_unicode_tree_table[31972] = /* b8 */ 0x84319e34, 0x84319e35, 0x84319e36, 0x84319e37, /* bc */ 0x84319e38, 0x84319e39, 0x84319f30, 0x84319f31, - /*** Three byte table, leaf: efbfxx - offset 0x07ca4 ***/ + /*** Three byte table, leaf: efbfxx - offset 0x07d2a ***/ /* 80 */ 0x84319f32, 0x84319f33, 0x84319f34, 0x84319f35, /* 84 */ 0x84319f36, 0x84319f37, 0x84319f38, 0x84319f39, diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index 4a312ab429b6f..d53e885b067e4 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -484,7 +484,7 @@ pg_mb_radix_conv(const pg_mb_radix_tree *rt, * utf: input string in UTF8 encoding (need not be null-terminated) * len: length of input string (in bytes) * iso: pointer to the output area (must be large enough!) - (output string will be null-terminated) + * (output string will be null-terminated) * map: conversion map for single characters * cmap: conversion map for combined characters * (optional, pass NULL if none) @@ -694,7 +694,7 @@ UtfToLocal(const unsigned char *utf, int len, * iso: input string in local encoding (need not be null-terminated) * len: length of input string (in bytes) * utf: pointer to the output area (must be large enough!) - (output string will be null-terminated) + * (output string will be null-terminated) * map: conversion map for single characters * cmap: conversion map for combined characters * (optional, pass NULL if none) diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c index ffc9c58cd130b..a512df935777d 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c @@ -124,7 +124,12 @@ utf8word_to_unicode(uint32 c) /* * Perform mapping of GB18030 ranges to UTF8 * - * The ranges we need to convert are specified in gb-18030-2000.xml. 
+ * General description, and the range we need to convert for U+10000 and up: + * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/gb18030.html + * + * Ranges up to U+FFFF: + * https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/ranges.txt + * * All are ranges of 4-byte GB18030 codes. */ static uint32 diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 886ecbad87183..d1701d69b16e2 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -497,7 +497,8 @@ pg_do_encoding_conversion_buf(Oid proc, * Convert string to encoding encoding_name. The source * encoding is the DB encoding. * - * BYTEA convert_to(TEXT string, NAME encoding_name) */ + * BYTEA convert_to(TEXT string, NAME encoding_name) + */ Datum pg_convert_to(PG_FUNCTION_ARGS) { @@ -522,7 +523,8 @@ pg_convert_to(PG_FUNCTION_ARGS) * Convert string from encoding encoding_name. The destination * encoding is the DB encoding. * - * TEXT convert_from(BYTEA string, NAME encoding_name) */ + * TEXT convert_from(BYTEA string, NAME encoding_name) + */ Datum pg_convert_from(PG_FUNCTION_ARGS) { @@ -862,7 +864,7 @@ perform_default_encoding_conversion(const char *src, int len, * may call this outside any transaction, or in an aborted transaction. */ void -pg_unicode_to_server(pg_wchar c, unsigned char *s) +pg_unicode_to_server(char32_t c, unsigned char *s) { unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1]; int c_as_utf8_len; @@ -924,7 +926,7 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s) * but simply return false on conversion failure. */ bool -pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s) +pg_unicode_to_server_noerror(char32_t c, unsigned char *s) { unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1]; int c_as_utf8_len; @@ -1792,7 +1794,7 @@ pgwin32_message_to_UTF16(const char *str, int len, int *utf16len) */ if (codepage != 0) { - utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1)); + utf16 = palloc_array(WCHAR, len + 1); dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len); utf16[dstlen] = (WCHAR) 0; } @@ -1816,7 +1818,7 @@ pgwin32_message_to_UTF16(const char *str, int len, int *utf16len) else utf8 = (char *) str; - utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1)); + utf16 = palloc_array(WCHAR, len + 1); dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len); utf16[dstlen] = (WCHAR) 0; diff --git a/src/backend/utils/misc/Makefile b/src/backend/utils/misc/Makefile index b362ae437710d..f142d17178bdd 100644 --- a/src/backend/utils/misc/Makefile +++ b/src/backend/utils/misc/Makefile @@ -40,6 +40,9 @@ ifdef krb_srvtab override CPPFLAGS += -DPG_KRB_SRVTAB='"$(krb_srvtab)"' endif +# Force this dependency to be known even without dependency info built: +guc_tables.o: guc_tables.c $(top_builddir)/src/backend/utils/guc_tables.inc.c + include $(top_srcdir)/src/backend/common.mk clean: diff --git a/src/backend/utils/misc/conffiles.c b/src/backend/utils/misc/conffiles.c index 23ebad4749b59..e702d1d8e310a 100644 --- a/src/backend/utils/misc/conffiles.c +++ b/src/backend/utils/misc/conffiles.c @@ -108,7 +108,7 @@ GetConfFilesInDir(const char *includedir, const char *calling_file, * them prior to caller processing the contents. 
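For reference, the 4-byte GB18030 ranges documented in the utf8_and_gb18030.c comment above map onto Unicode code points linearly, which is why the converter can handle them with arithmetic instead of table lookups. A minimal sketch of that arithmetic follows; it is illustrative only and is not code from this patch:

#include <stdint.h>

/*
 * Linear index of a 4-byte GB18030 sequence b1 b2 b3 b4: bytes 1 and 3
 * run over 0x81..0xFE (126 values), bytes 2 and 4 over 0x30..0x39
 * (10 values).
 */
static uint32_t
gb18030_linear(const unsigned char *s)
{
	return ((uint32_t) (s[0] - 0x81)) * 12600 +
		((uint32_t) (s[1] - 0x30)) * 1260 +
		((uint32_t) (s[2] - 0x81)) * 10 +
		((uint32_t) (s[3] - 0x30));
}

/*
 * U+10000..U+10FFFF map linearly from GB18030 0x90308130, whose linear
 * index is 189000; this is the "range we need to convert for U+10000
 * and up" that the rewritten comment points at.
 */
static uint32_t
gb18030_supplementary_to_codepoint(const unsigned char *s)
{
	return 0x10000 + (gb18030_linear(s) - 189000);
}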
*/ size_filenames = 32; - filenames = (char **) palloc(size_filenames * sizeof(char *)); + filenames = palloc_array(char *, size_filenames); *num_filenames = 0; while ((de = ReadDir(d, directory)) != NULL) diff --git a/src/backend/utils/misc/gen_guc_tables.pl b/src/backend/utils/misc/gen_guc_tables.pl new file mode 100644 index 0000000000000..64932e102ba3a --- /dev/null +++ b/src/backend/utils/misc/gen_guc_tables.pl @@ -0,0 +1,183 @@ +#!/usr/bin/perl +#---------------------------------------------------------------------- +# +# Generate guc_tables.c from guc_parameters.dat. +# +# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/backend/utils/misc/gen_guc_tables.pl +# +#---------------------------------------------------------------------- + +use strict; +use warnings FATAL => 'all'; + +use FindBin; +use lib "$FindBin::RealBin/../../catalog"; +use Catalog; + +die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" unless @ARGV == 2; +my ($input_fname, $output_fname) = @ARGV; + +my $parse = Catalog::ParseData($input_fname); + +open my $ofh, '>', $output_fname or die; + +print_boilerplate($ofh, $output_fname, 'GUC tables'); +print_table($ofh); + +close $ofh; + + +# Adds double quotes and escapes as necessary for C strings. +sub dquote +{ + my ($s) = @_; + + return q{"} . $s =~ s/"/\\"/gr . q{"}; +} + +sub validate_guc_entry +{ + my ($entry) = @_; + + my @required_common = + qw(name type context group short_desc variable boot_val); + + my %required_by_type = ( + int => [qw(min max)], + real => [qw(min max)], + enum => [qw(options)], + bool => [], # no extra required fields + string => [], # no extra required fields + ); + + for my $f (@required_common) + { + unless (defined $entry->{$f}) + { + die sprintf( + qq{%s:%d: error: entry "%s" is missing required field "%s"\n}, + $input_fname, $entry->{line_number}, + $entry->{name} // '', $f); + } + } + + unless (exists $required_by_type{ $entry->{type} }) + { + die sprintf( + qq{%s:%d: error: entry "%s" has unrecognized GUC type "%s"\n}, + $input_fname, $entry->{line_number}, + $entry->{name}, $entry->{type} // ''); + } + + for my $f (@{ $required_by_type{ $entry->{type} } }) + { + unless (defined $entry->{$f}) + { + die sprintf( + qq{%s:%d: error: entry "%s" of type "%s" is missing required field "%s"\n}, + $input_fname, $entry->{line_number}, $entry->{name}, + $entry->{type}, $f); + } + } +} + +# Print GUC table. 
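To make the generator easier to follow: the print_table routine just below writes one designated-initializer entry per guc_parameters.dat record into the unified ConfigureNames[] array. A hypothetical bool entry would come out roughly as follows (the parameter name and values are invented for illustration):

struct config_generic ConfigureNames[] =
{
	{
		.name = "example_bool_guc",
		.context = PGC_USERSET,
		.group = DEVELOPER_OPTIONS,
		/* translator: GUC parameter "example_bool_guc" short description */
		.short_desc = gettext_noop("An invented example parameter."),
		.vartype = PGC_BOOL,
		._bool = {
			.variable = &example_bool_guc,
			.boot_val = false,
		},
	},

	/* End-of-list marker */
	{0}
};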
+sub print_table
+{
+	my ($ofh) = @_;
+	my $prev_name = undef;
+
+	print $ofh "\n\n";
+	print $ofh "struct config_generic ConfigureNames[] =\n";
+	print $ofh "{\n";
+
+	foreach my $entry (@{$parse})
+	{
+		validate_guc_entry($entry);
+
+		if (defined($prev_name) && lc($prev_name) ge lc($entry->{name}))
+		{
+			die sprintf(
+				"entries are not in alphabetical order: \"%s\", \"%s\"\n",
+				$prev_name, $entry->{name});
+		}
+
+		print $ofh "#ifdef $entry->{ifdef}\n" if $entry->{ifdef};
+		print $ofh "\t{\n";
+		printf $ofh "\t\t.name = %s,\n", dquote($entry->{name});
+		printf $ofh "\t\t.context = %s,\n", $entry->{context};
+		printf $ofh "\t\t.group = %s,\n", $entry->{group};
+		printf $ofh
+			"\t\t/* translator: GUC parameter \"%s\" short description */\n",
+			$entry->{name};
+		printf $ofh "\t\t.short_desc = gettext_noop(%s),\n",
+			dquote($entry->{short_desc});
+
+		if ($entry->{long_desc})
+		{
+			printf $ofh
+				"\t\t/* translator: GUC parameter \"%s\" long description */\n",
+				$entry->{name};
+			printf $ofh "\t\t.long_desc = gettext_noop(%s),\n",
+				dquote($entry->{long_desc});
+		}
+		printf $ofh "\t\t.flags = %s,\n", $entry->{flags} if $entry->{flags};
+		printf $ofh "\t\t.vartype = %s,\n", ('PGC_' . uc($entry->{type}));
+		printf $ofh "\t\t._%s = {\n", $entry->{type};
+		printf $ofh "\t\t\t.variable = &%s,\n", $entry->{variable};
+		printf $ofh "\t\t\t.boot_val = %s,\n", $entry->{boot_val};
+		printf $ofh "\t\t\t.min = %s,\n", $entry->{min}
+			if $entry->{type} eq 'int' || $entry->{type} eq 'real';
+		printf $ofh "\t\t\t.max = %s,\n", $entry->{max}
+			if $entry->{type} eq 'int' || $entry->{type} eq 'real';
+		printf $ofh "\t\t\t.options = %s,\n", $entry->{options}
+			if $entry->{type} eq 'enum';
+		printf $ofh "\t\t\t.check_hook = %s,\n", $entry->{check_hook}
+			if $entry->{check_hook};
+		printf $ofh "\t\t\t.assign_hook = %s,\n", $entry->{assign_hook}
+			if $entry->{assign_hook};
+		printf $ofh "\t\t\t.show_hook = %s,\n", $entry->{show_hook}
+			if $entry->{show_hook};
+		print $ofh "\t\t},\n";
+		print $ofh "\t},\n";
+		print $ofh "#endif\n" if $entry->{ifdef};
+		print $ofh "\n";
+
+		$prev_name = $entry->{name};
+	}
+
+	print $ofh "\t/* End-of-list marker */\n";
+	print $ofh "\t{0}\n";
+	print $ofh "};\n";
+
+	return;
+}
+
+sub print_boilerplate
+{
+	my ($fh, $fname, $descr) = @_;
+	printf $fh <<EOM, $fname, $descr;
[...]
+EOM
+}
diff --git a/src/backend/utils/misc/guc-file.l b/src/backend/utils/misc/guc-file.l
--- a/src/backend/utils/misc/guc-file.l
+++ b/src/backend/utils/misc/guc-file.l
@@ ... @@
-	item = palloc(sizeof *item);
+	item = palloc_object(ConfigVariable);
 	item->name = NULL;
 	item->value = NULL;
 	item->errmsg = pstrdup(errmsg);
@@ -482,7 +482,7 @@ ParseConfigFp(FILE *fp, const char *config_file, int depth, int elevel,
 	else
 	{
 		/* ordinary variable, append to list */
-		item = palloc(sizeof *item);
+		item = palloc_object(ConfigVariable);
 		item->name = opt_name;
 		item->value = opt_value;
 		item->errmsg = NULL;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 667df448732f2..935c235e1b39d 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -34,6 +34,7 @@
 #include "catalog/objectaccess.h"
 #include "catalog/pg_authid.h"
 #include "catalog/pg_parameter_acl.h"
+#include "catalog/pg_type.h"
 #include "guc_internal.h"
 #include "libpq/pqformat.h"
 #include "libpq/protocol.h"
@@ -244,11 +245,12 @@ static void ReportGUCOption(struct config_generic *record);
 static void set_config_sourcefile(const char *name, char *sourcefile,
 								  int sourceline);
 static void reapply_stacked_values(struct config_generic *variable,
-								   struct config_string *pHolder,
+								   struct config_generic *pHolder,
 								   GucStack *stack,
 								   const char *curvalue,
 								   GucContext curscontext, GucSource cursource,
 								   Oid cursrole);
+static void free_placeholder(struct config_generic *pHolder);
 static bool
validate_option_array_item(const char *name, const char *value, bool skipIfNoPermissions); static void write_auto_conf_file(int fd, const char *filename, ConfigVariable *head); @@ -259,15 +261,15 @@ static bool assignable_custom_variable_name(const char *name, bool skip_errors, int elevel); static void do_serialize(char **destptr, Size *maxbytes, const char *fmt,...) pg_attribute_printf(3, 4); -static bool call_bool_check_hook(struct config_bool *conf, bool *newval, +static bool call_bool_check_hook(const struct config_generic *conf, bool *newval, void **extra, GucSource source, int elevel); -static bool call_int_check_hook(struct config_int *conf, int *newval, +static bool call_int_check_hook(const struct config_generic *conf, int *newval, void **extra, GucSource source, int elevel); -static bool call_real_check_hook(struct config_real *conf, double *newval, +static bool call_real_check_hook(const struct config_generic *conf, double *newval, void **extra, GucSource source, int elevel); -static bool call_string_check_hook(struct config_string *conf, char **newval, +static bool call_string_check_hook(const struct config_generic *conf, char **newval, void **extra, GucSource source, int elevel); -static bool call_enum_check_hook(struct config_enum *conf, int *newval, +static bool call_enum_check_hook(const struct config_generic *conf, int *newval, void **extra, GucSource source, int elevel); @@ -284,8 +286,7 @@ ProcessConfigFileInternal(GucContext context, bool applySettings, int elevel) bool error = false; bool applying = false; const char *ConfFileWithError; - ConfigVariable *item, - *head, + ConfigVariable *head, *tail; HASH_SEQ_STATUS status; GUCHashEntry *hentry; @@ -336,7 +337,7 @@ ProcessConfigFileInternal(GucContext context, bool applySettings, int elevel) /* * Prune all items except the last "data_directory" from the list. */ - for (item = head; item; item = item->next) + for (ConfigVariable *item = head; item; item = item->next) { if (!item->ignore && strcmp(item->name, "data_directory") == 0) @@ -384,7 +385,7 @@ ProcessConfigFileInternal(GucContext context, bool applySettings, int elevel) * variable mentioned in the file; and we detect duplicate entries in the * file and mark the earlier occurrences as ignorable. */ - for (item = head; item; item = item->next) + for (ConfigVariable *item = head; item; item = item->next) { struct config_generic *record; @@ -408,9 +409,7 @@ ProcessConfigFileInternal(GucContext context, bool applySettings, int elevel) * avoid the O(N^2) behavior here with some additional state, * but it seems unlikely to be worth the trouble. 
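Most of the guc.c churn from here on follows from one structural change: config_bool, config_int, config_real, config_string and config_enum no longer embed a config_generic as their gen member; instead config_generic itself carries the per-type payload, reached as _bool, _int, _real, _string and _enum. The actual declaration lives in a header this diff does not include, so the following is an assumed shape, inferred from the accessor rewrites (conf->_string.variable replacing conf->variable, and so on):

/* Assumed shape, inferred from usage in this patch; not the real header. */
struct config_generic
{
	const char *name;
	GucContext	context;
	enum config_group group;
	const char *short_desc;
	const char *long_desc;
	int			flags;
	enum config_type vartype;	/* PGC_BOOL, PGC_INT, ... */
	/* ... runtime state: source, reset_source, status, stack, extra ... */
	union
	{
		struct config_bool _bool;
		struct config_int _int;
		struct config_real _real;
		struct config_string _string;
		struct config_enum _enum;
	};							/* anonymous union, selected by vartype */
};

The later rewrite of guc_var_compare fits the same picture: it stops relying on name being the first member of whatever struct the sort array points at and simply casts to struct config_generic *.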
*/ - ConfigVariable *pitem; - - for (pitem = head; pitem != item; pitem = pitem->next) + for (ConfigVariable *pitem = head; pitem != item; pitem = pitem->next) { if (!pitem->ignore && strcmp(pitem->name, item->name) == 0) @@ -454,7 +453,6 @@ ProcessConfigFileInternal(GucContext context, bool applySettings, int elevel) while ((hentry = (GUCHashEntry *) hash_seq_search(&status)) != NULL) { struct config_generic *gconf = hentry->gucvar; - GucStack *stack; if (gconf->reset_source != PGC_S_FILE || (gconf->status & GUC_IS_IN_FILE)) @@ -487,7 +485,7 @@ ProcessConfigFileInternal(GucContext context, bool applySettings, int elevel) gconf->reset_source = PGC_S_DEFAULT; if (gconf->source == PGC_S_FILE) set_guc_source(gconf, PGC_S_DEFAULT); - for (stack = gconf->stack; stack; stack = stack->prev) + for (GucStack *stack = gconf->stack; stack; stack = stack->prev) { if (stack->source == PGC_S_FILE) stack->source = PGC_S_DEFAULT; @@ -531,7 +529,7 @@ ProcessConfigFileInternal(GucContext context, bool applySettings, int elevel) /* * Now apply the values from the config file. */ - for (item = head; item; item = item->next) + for (ConfigVariable *item = head; item; item = item->next) { char *pre_value = NULL; int scres; @@ -705,15 +703,13 @@ guc_free(void *ptr) * Detect whether strval is referenced anywhere in a GUC string item */ static bool -string_field_used(struct config_string *conf, char *strval) +string_field_used(struct config_generic *conf, char *strval) { - GucStack *stack; - - if (strval == *(conf->variable) || - strval == conf->reset_val || - strval == conf->boot_val) + if (strval == *(conf->_string.variable) || + strval == conf->_string.reset_val || + strval == conf->_string.boot_val) return true; - for (stack = conf->gen.stack; stack; stack = stack->prev) + for (GucStack *stack = conf->stack; stack; stack = stack->prev) { if (strval == stack->prior.val.stringval || strval == stack->masked.val.stringval) @@ -728,7 +724,7 @@ string_field_used(struct config_string *conf, char *strval) * states). 
*/ static void -set_string_field(struct config_string *conf, char **field, char *newval) +set_string_field(struct config_generic *conf, char **field, char *newval) { char *oldval = *field; @@ -746,34 +742,11 @@ set_string_field(struct config_string *conf, char **field, char *newval) static bool extra_field_used(struct config_generic *gconf, void *extra) { - GucStack *stack; - if (extra == gconf->extra) return true; - switch (gconf->vartype) - { - case PGC_BOOL: - if (extra == ((struct config_bool *) gconf)->reset_extra) - return true; - break; - case PGC_INT: - if (extra == ((struct config_int *) gconf)->reset_extra) - return true; - break; - case PGC_REAL: - if (extra == ((struct config_real *) gconf)->reset_extra) - return true; - break; - case PGC_STRING: - if (extra == ((struct config_string *) gconf)->reset_extra) - return true; - break; - case PGC_ENUM: - if (extra == ((struct config_enum *) gconf)->reset_extra) - return true; - break; - } - for (stack = gconf->stack; stack; stack = stack->prev) + if (extra == gconf->reset_extra) + return true; + for (GucStack *stack = gconf->stack; stack; stack = stack->prev) { if (extra == stack->prior.extra || extra == stack->masked.extra) @@ -814,25 +787,19 @@ set_stack_value(struct config_generic *gconf, config_var_value *val) switch (gconf->vartype) { case PGC_BOOL: - val->val.boolval = - *((struct config_bool *) gconf)->variable; + val->val.boolval = *gconf->_bool.variable; break; case PGC_INT: - val->val.intval = - *((struct config_int *) gconf)->variable; + val->val.intval = *gconf->_int.variable; break; case PGC_REAL: - val->val.realval = - *((struct config_real *) gconf)->variable; + val->val.realval = *gconf->_real.variable; break; case PGC_STRING: - set_string_field((struct config_string *) gconf, - &(val->val.stringval), - *((struct config_string *) gconf)->variable); + set_string_field(gconf, &(val->val.stringval), *gconf->_string.variable); break; case PGC_ENUM: - val->val.enumval = - *((struct config_enum *) gconf)->variable; + val->val.enumval = *gconf->_enum.variable; break; } set_extra_field(gconf, &(val->extra), gconf->extra); @@ -854,7 +821,7 @@ discard_stack_value(struct config_generic *gconf, config_var_value *val) /* no need to do anything */ break; case PGC_STRING: - set_string_field((struct config_string *) gconf, + set_string_field(gconf, &(val->val.stringval), NULL); break; @@ -877,7 +844,7 @@ get_guc_variables(int *num_vars) int i; *num_vars = hash_get_num_entries(guc_hashtab); - result = palloc(sizeof(struct config_generic *) * *num_vars); + result = palloc_array(struct config_generic *, *num_vars); /* Extract pointers from the hash table */ i = 0; @@ -907,7 +874,6 @@ build_guc_variables(void) HASHCTL hash_ctl; GUCHashEntry *hentry; bool found; - int i; /* * Create the memory context that will hold all GUC-related data. @@ -918,48 +884,10 @@ build_guc_variables(void) ALLOCSET_DEFAULT_SIZES); /* - * Count all the built-in variables, and set their vartypes correctly. + * Count all the built-in variables. 
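The palloc_array() and palloc_object() calls substituted above (and earlier in mbutils.c, conffiles.c and guc-file.l) are PostgreSQL's type-safe allocation wrappers; from memory they expand roughly as below, but see palloc.h for the authoritative definitions:

#define palloc_object(type)			((type *) palloc(sizeof(type)))
#define palloc_array(type, count)	((type *) palloc(sizeof(type) * (count)))

/* so that, for example, in get_guc_variables() above: */
result = palloc_array(struct config_generic *, *num_vars);
/* is the typed spelling of: */
result = (struct config_generic **) palloc(sizeof(struct config_generic *) * *num_vars);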
*/ - for (i = 0; ConfigureNamesBool[i].gen.name; i++) - { - struct config_bool *conf = &ConfigureNamesBool[i]; - - /* Rather than requiring vartype to be filled in by hand, do this: */ - conf->gen.vartype = PGC_BOOL; - num_vars++; - } - - for (i = 0; ConfigureNamesInt[i].gen.name; i++) - { - struct config_int *conf = &ConfigureNamesInt[i]; - - conf->gen.vartype = PGC_INT; - num_vars++; - } - - for (i = 0; ConfigureNamesReal[i].gen.name; i++) - { - struct config_real *conf = &ConfigureNamesReal[i]; - - conf->gen.vartype = PGC_REAL; - num_vars++; - } - - for (i = 0; ConfigureNamesString[i].gen.name; i++) - { - struct config_string *conf = &ConfigureNamesString[i]; - - conf->gen.vartype = PGC_STRING; - num_vars++; - } - - for (i = 0; ConfigureNamesEnum[i].gen.name; i++) - { - struct config_enum *conf = &ConfigureNamesEnum[i]; - - conf->gen.vartype = PGC_ENUM; + for (int i = 0; ConfigureNames[i].name; i++) num_vars++; - } /* * Create hash table with 20% slack @@ -976,57 +904,9 @@ build_guc_variables(void) &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); - for (i = 0; ConfigureNamesBool[i].gen.name; i++) - { - struct config_generic *gucvar = &ConfigureNamesBool[i].gen; - - hentry = (GUCHashEntry *) hash_search(guc_hashtab, - &gucvar->name, - HASH_ENTER, - &found); - Assert(!found); - hentry->gucvar = gucvar; - } - - for (i = 0; ConfigureNamesInt[i].gen.name; i++) - { - struct config_generic *gucvar = &ConfigureNamesInt[i].gen; - - hentry = (GUCHashEntry *) hash_search(guc_hashtab, - &gucvar->name, - HASH_ENTER, - &found); - Assert(!found); - hentry->gucvar = gucvar; - } - - for (i = 0; ConfigureNamesReal[i].gen.name; i++) - { - struct config_generic *gucvar = &ConfigureNamesReal[i].gen; - - hentry = (GUCHashEntry *) hash_search(guc_hashtab, - &gucvar->name, - HASH_ENTER, - &found); - Assert(!found); - hentry->gucvar = gucvar; - } - - for (i = 0; ConfigureNamesString[i].gen.name; i++) + for (int i = 0; ConfigureNames[i].name; i++) { - struct config_generic *gucvar = &ConfigureNamesString[i].gen; - - hentry = (GUCHashEntry *) hash_search(guc_hashtab, - &gucvar->name, - HASH_ENTER, - &found); - Assert(!found); - hentry->gucvar = gucvar; - } - - for (i = 0; ConfigureNamesEnum[i].gen.name; i++) - { - struct config_generic *gucvar = &ConfigureNamesEnum[i].gen; + struct config_generic *gucvar = &ConfigureNames[i]; hentry = (GUCHashEntry *) hash_search(guc_hashtab, &gucvar->name, @@ -1176,44 +1056,42 @@ assignable_custom_variable_name(const char *name, bool skip_errors, int elevel) static struct config_generic * add_placeholder_variable(const char *name, int elevel) { - size_t sz = sizeof(struct config_string) + sizeof(char *); - struct config_string *var; - struct config_generic *gen; + size_t sz = sizeof(struct config_generic) + sizeof(char *); + struct config_generic *var; - var = (struct config_string *) guc_malloc(elevel, sz); + var = (struct config_generic *) guc_malloc(elevel, sz); if (var == NULL) return NULL; memset(var, 0, sz); - gen = &var->gen; - gen->name = guc_strdup(elevel, name); - if (gen->name == NULL) + var->name = guc_strdup(elevel, name); + if (var->name == NULL) { guc_free(var); return NULL; } - gen->context = PGC_USERSET; - gen->group = CUSTOM_OPTIONS; - gen->short_desc = "GUC placeholder variable"; - gen->flags = GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE | GUC_CUSTOM_PLACEHOLDER; - gen->vartype = PGC_STRING; + var->context = PGC_USERSET; + var->group = CUSTOM_OPTIONS; + var->short_desc = "GUC placeholder variable"; + var->flags = GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE 
@@ -1176,44 +1056,42 @@ assignable_custom_variable_name(const char *name, bool skip_errors, int elevel)
 static struct config_generic *
 add_placeholder_variable(const char *name, int elevel)
 {
-	size_t		sz = sizeof(struct config_string) + sizeof(char *);
-	struct config_string *var;
-	struct config_generic *gen;
+	size_t		sz = sizeof(struct config_generic) + sizeof(char *);
+	struct config_generic *var;

-	var = (struct config_string *) guc_malloc(elevel, sz);
+	var = (struct config_generic *) guc_malloc(elevel, sz);
 	if (var == NULL)
 		return NULL;

 	memset(var, 0, sz);

-	gen = &var->gen;
-	gen->name = guc_strdup(elevel, name);
-	if (gen->name == NULL)
+	var->name = guc_strdup(elevel, name);
+	if (var->name == NULL)
 	{
 		guc_free(var);
 		return NULL;
 	}

-	gen->context = PGC_USERSET;
-	gen->group = CUSTOM_OPTIONS;
-	gen->short_desc = "GUC placeholder variable";
-	gen->flags = GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE | GUC_CUSTOM_PLACEHOLDER;
-	gen->vartype = PGC_STRING;
+	var->context = PGC_USERSET;
+	var->group = CUSTOM_OPTIONS;
+	var->short_desc = "GUC placeholder variable";
+	var->flags = GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE | GUC_CUSTOM_PLACEHOLDER;
+	var->vartype = PGC_STRING;

 	/*
 	 * The char* is allocated at the end of the struct since we have no
 	 * 'static' place to point to.  Note that the current value, as well as
 	 * the boot and reset values, start out NULL.
 	 */
-	var->variable = (char **) (var + 1);
+	var->_string.variable = (char **) (var + 1);

-	if (!add_guc_variable((struct config_generic *) var, elevel))
+	if (!add_guc_variable(var, elevel))
 	{
-		guc_free(unconstify(char *, gen->name));
+		guc_free(unconstify(char *, var->name));
 		guc_free(var);
 		return NULL;
 	}

-	return gen;
+	return var;
 }

 /*
@@ -1236,7 +1114,6 @@ find_option(const char *name, bool create_placeholders, bool skip_errors,
 			int elevel)
 {
 	GUCHashEntry *hentry;
-	int			i;

 	Assert(name);

@@ -1253,7 +1130,7 @@ find_option(const char *name, bool create_placeholders, bool skip_errors,
 	 * set of supported old names is short enough that a brute-force search is
 	 * the best way.
 	 */
-	for (i = 0; map_old_guc_names[i] != NULL; i += 2)
+	for (int i = 0; map_old_guc_names[i] != NULL; i += 2)
 	{
 		if (guc_name_compare(name, map_old_guc_names[i]) == 0)
 			return find_option(map_old_guc_names[i + 1], false,
@@ -1287,10 +1164,10 @@ find_option(const char *name, bool create_placeholders, bool skip_errors,
 static int
 guc_var_compare(const void *a, const void *b)
 {
-	const char *namea = **(const char **const *) a;
-	const char *nameb = **(const char **const *) b;
+	const struct config_generic *ca = *(const struct config_generic *const *) a;
+	const struct config_generic *cb = *(const struct config_generic *const *) b;

-	return guc_name_compare(namea, nameb);
+	return guc_name_compare(ca->name, cb->name);
 }

 /*
@@ -1433,69 +1310,69 @@ check_GUC_name_for_parameter_acl(const char *name)
 */
 #ifdef USE_ASSERT_CHECKING
 static bool
-check_GUC_init(struct config_generic *gconf)
+check_GUC_init(const struct config_generic *gconf)
 {
 	/* Checks on values */
 	switch (gconf->vartype)
 	{
 		case PGC_BOOL:
 			{
-				struct config_bool *conf = (struct config_bool *) gconf;
+				const struct config_bool *conf = &gconf->_bool;

 				if (*conf->variable && !conf->boot_val)
 				{
 					elog(LOG, "GUC (PGC_BOOL) %s, boot_val=%d, C-var=%d",
-						 conf->gen.name, conf->boot_val, *conf->variable);
+						 gconf->name, conf->boot_val, *conf->variable);
 					return false;
 				}
 				break;
 			}
 		case PGC_INT:
 			{
-				struct config_int *conf = (struct config_int *) gconf;
+				const struct config_int *conf = &gconf->_int;

 				if (*conf->variable != 0 && *conf->variable != conf->boot_val)
 				{
 					elog(LOG, "GUC (PGC_INT) %s, boot_val=%d, C-var=%d",
-						 conf->gen.name, conf->boot_val, *conf->variable);
+						 gconf->name, conf->boot_val, *conf->variable);
 					return false;
 				}
 				break;
 			}
 		case PGC_REAL:
 			{
-				struct config_real *conf = (struct config_real *) gconf;
+				const struct config_real *conf = &gconf->_real;

 				if (*conf->variable != 0.0 && *conf->variable != conf->boot_val)
 				{
 					elog(LOG, "GUC (PGC_REAL) %s, boot_val=%g, C-var=%g",
-						 conf->gen.name, conf->boot_val, *conf->variable);
+						 gconf->name, conf->boot_val, *conf->variable);
 					return false;
 				}
 				break;
 			}
 		case PGC_STRING:
 			{
-				struct config_string *conf = (struct config_string *) gconf;
+				const struct config_string *conf = &gconf->_string;

 				if (*conf->variable != NULL &&
 					(conf->boot_val == NULL ||
 					 strcmp(*conf->variable, conf->boot_val) != 0))
 				{
 					elog(LOG, "GUC (PGC_STRING) %s, boot_val=%s, C-var=%s",
-						 conf->gen.name, conf->boot_val ? conf->boot_val : "", *conf->variable);
+						 gconf->name, conf->boot_val ? conf->boot_val : "", *conf->variable);
 					return false;
 				}
 				break;
 			}
 		case PGC_ENUM:
 			{
-				struct config_enum *conf = (struct config_enum *) gconf;
+				const struct config_enum *conf = &gconf->_enum;

 				if (*conf->variable != conf->boot_val)
 				{
 					elog(LOG, "GUC (PGC_ENUM) %s, boot_val=%d, C-var=%d",
-						 conf->gen.name, conf->boot_val, *conf->variable);
+						 gconf->name, conf->boot_val, *conf->variable);
 					return false;
 				}
 				break;
@@ -1627,7 +1504,7 @@ InitializeGUCOptionsFromEnvironment(void)
 		new_limit = 2048;
 		source = PGC_S_DYNAMIC_DEFAULT;
 	}
-	snprintf(limbuf, sizeof(limbuf), "%d", (int) new_limit);
+	snprintf(limbuf, sizeof(limbuf), "%zd", new_limit);
 	SetConfigOption("max_stack_depth", limbuf, PGC_POSTMASTER, source);
 }

@@ -1643,6 +1520,8 @@ InitializeGUCOptionsFromEnvironment(void)
 static void
 InitializeOneGUCOption(struct config_generic *gconf)
 {
+	void	   *extra = NULL;
+
 	gconf->status = 0;
 	gconf->source = PGC_S_DEFAULT;
 	gconf->reset_source = PGC_S_DEFAULT;
@@ -1660,61 +1539,54 @@ InitializeOneGUCOption(struct config_generic *gconf)
 	{
 		case PGC_BOOL:
 			{
-				struct config_bool *conf = (struct config_bool *) gconf;
+				struct config_bool *conf = &gconf->_bool;
 				bool		newval = conf->boot_val;
-				void	   *extra = NULL;

-				if (!call_bool_check_hook(conf, &newval, &extra,
+				if (!call_bool_check_hook(gconf, &newval, &extra,
 										  PGC_S_DEFAULT, LOG))
 					elog(FATAL, "failed to initialize %s to %d",
-						 conf->gen.name, (int) newval);
+						 gconf->name, (int) newval);
 				if (conf->assign_hook)
 					conf->assign_hook(newval, extra);
 				*conf->variable = conf->reset_val = newval;
-				conf->gen.extra = conf->reset_extra = extra;
 				break;
 			}
 		case PGC_INT:
 			{
-				struct config_int *conf = (struct config_int *) gconf;
+				struct config_int *conf = &gconf->_int;
 				int			newval = conf->boot_val;
-				void	   *extra = NULL;

 				Assert(newval >= conf->min);
 				Assert(newval <= conf->max);
-				if (!call_int_check_hook(conf, &newval, &extra,
+				if (!call_int_check_hook(gconf, &newval, &extra,
 										 PGC_S_DEFAULT, LOG))
 					elog(FATAL, "failed to initialize %s to %d",
-						 conf->gen.name, newval);
+						 gconf->name, newval);
 				if (conf->assign_hook)
 					conf->assign_hook(newval, extra);
 				*conf->variable = conf->reset_val = newval;
-				conf->gen.extra = conf->reset_extra = extra;
 				break;
 			}
 		case PGC_REAL:
 			{
-				struct config_real *conf = (struct config_real *) gconf;
+				struct config_real *conf = &gconf->_real;
 				double		newval = conf->boot_val;
-				void	   *extra = NULL;

 				Assert(newval >= conf->min);
 				Assert(newval <= conf->max);
-				if (!call_real_check_hook(conf, &newval, &extra,
+				if (!call_real_check_hook(gconf, &newval, &extra,
 										  PGC_S_DEFAULT, LOG))
 					elog(FATAL, "failed to initialize %s to %g",
-						 conf->gen.name, newval);
+						 gconf->name, newval);
 				if (conf->assign_hook)
 					conf->assign_hook(newval, extra);
 				*conf->variable = conf->reset_val = newval;
-				conf->gen.extra = conf->reset_extra = extra;
 				break;
 			}
 		case PGC_STRING:
 			{
-				struct config_string *conf = (struct config_string *) gconf;
+				struct config_string *conf = &gconf->_string;
 				char	   *newval;
-				void	   *extra = NULL;

 				/* non-NULL boot_val must always get strdup'd */
 				if (conf->boot_val != NULL)
@@ -1722,33 +1594,32 @@ InitializeOneGUCOption(struct config_generic *gconf)
 				else
 					newval = NULL;

-				if (!call_string_check_hook(conf, &newval, &extra,
+				if (!call_string_check_hook(gconf, &newval, &extra,
 											PGC_S_DEFAULT, LOG))
 					elog(FATAL, "failed to initialize %s to \"%s\"",
-						 conf->gen.name, newval ? newval : "");
+						 gconf->name, newval ? newval : "");
 				if (conf->assign_hook)
 					conf->assign_hook(newval, extra);
 				*conf->variable = conf->reset_val = newval;
-				conf->gen.extra = conf->reset_extra = extra;
 				break;
 			}
 		case PGC_ENUM:
 			{
-				struct config_enum *conf = (struct config_enum *) gconf;
+				struct config_enum *conf = &gconf->_enum;
 				int			newval = conf->boot_val;
-				void	   *extra = NULL;

-				if (!call_enum_check_hook(conf, &newval, &extra,
+				if (!call_enum_check_hook(gconf, &newval, &extra,
 										  PGC_S_DEFAULT, LOG))
 					elog(FATAL, "failed to initialize %s to %d",
-						 conf->gen.name, newval);
+						 gconf->name, newval);
 				if (conf->assign_hook)
 					conf->assign_hook(newval, extra);
 				*conf->variable = conf->reset_val = newval;
-				conf->gen.extra = conf->reset_extra = extra;
 				break;
 			}
 	}
+
+	gconf->extra = gconf->reset_extra = extra;
 }
 /*
@@ -1787,7 +1658,7 @@ SelectConfigFiles(const char *userDoption, const char *progname)
 	char	   *fname;
 	bool		fname_is_malloced;
 	struct stat stat_buf;
-	struct config_string *data_directory_rec;
+	struct config_generic *data_directory_rec;

 	/* configdir is -D option, or $PGDATA if no -D */
 	if (userDoption)
@@ -1802,7 +1673,7 @@ SelectConfigFiles(const char *userDoption, const char *progname)
 				configdir);
 		if (errno == ENOENT)
 			write_stderr("Run initdb or pg_basebackup to initialize a PostgreSQL data directory.\n");
-		return false;
+		goto fail;
 	}

 	/*
@@ -1829,7 +1700,7 @@ SelectConfigFiles(const char *userDoption, const char *progname)
 					 "You must specify the --config-file or -D invocation "
 					 "option or set the PGDATA environment variable.\n",
 					 progname);
-		return false;
+		goto fail;
 	}

 	/*
@@ -1850,8 +1721,7 @@ SelectConfigFiles(const char *userDoption, const char *progname)
 	{
 		write_stderr("%s: could not access the server configuration file \"%s\": %m\n",
 					 progname, ConfigFileName);
-		free(configdir);
-		return false;
+		goto fail;
 	}

 	/*
@@ -1868,10 +1738,10 @@ SelectConfigFiles(const char *userDoption, const char *progname)
 	 * Note: SetDataDir will copy and absolute-ize its argument, so we don't
 	 * have to.
 	 */
-	data_directory_rec = (struct config_string *)
+	data_directory_rec =
 		find_option("data_directory", false, false, PANIC);
-	if (*data_directory_rec->variable)
-		SetDataDir(*data_directory_rec->variable);
+	if (*data_directory_rec->_string.variable)
+		SetDataDir(*data_directory_rec->_string.variable);
 	else if (configdir)
 		SetDataDir(configdir);
 	else
@@ -1881,7 +1751,7 @@ SelectConfigFiles(const char *userDoption, const char *progname)
 				 "or by the -D invocation option, or by the "
 				 "PGDATA environment variable.\n",
 				 progname, ConfigFileName);
-		return false;
+		goto fail;
 	}

 	/*
@@ -1933,7 +1803,7 @@ SelectConfigFiles(const char *userDoption, const char *progname)
 				 "or by the -D invocation option, or by the "
 				 "PGDATA environment variable.\n",
 				 progname, ConfigFileName);
-		return false;
+		goto fail;
 	}

 	SetConfigOption("hba_file", fname, PGC_POSTMASTER, PGC_S_OVERRIDE);
@@ -1964,7 +1834,7 @@ SelectConfigFiles(const char *userDoption, const char *progname)
 				 "or by the -D invocation option, or by the "
 				 "PGDATA environment variable.\n",
 				 progname, ConfigFileName);
-		return false;
+		goto fail;
 	}

 	SetConfigOption("ident_file", fname, PGC_POSTMASTER, PGC_S_OVERRIDE);
@@ -1976,6 +1846,11 @@ SelectConfigFiles(const char *userDoption, const char *progname)
 	free(configdir);

 	return true;
+
+fail:
+	free(configdir);
+
+	return false;
 }
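The SelectConfigFiles() hunks above replace five scattered cleanup-and-return exits with one fail: label, the classic single-exit cleanup idiom in C. A self-contained sketch of the pattern, with hypothetical step_one/step_two helpers standing in for the real checks:

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

static bool step_one(const char *buf) { return buf[0] != '\0'; }
static bool step_two(const char *buf) { return strlen(buf) < 4096; }

static bool
load_thing(const char *path)
{
	char	   *buf = strdup(path);	/* resource that must be released on every path */

	if (buf == NULL)
		return false;				/* nothing allocated yet, plain return is fine */

	if (!step_one(buf))
		goto fail;					/* every later error exit funnels through one label */
	if (!step_two(buf))
		goto fail;

	free(buf);
	return true;

fail:
	free(buf);
	return false;
}

The payoff is the same as in the patch: adding a new early-exit can no longer forget the free(configdir).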
 /*
@@ -2028,62 +1903,62 @@ ResetAllOptions(void)
 		{
 			case PGC_BOOL:
 				{
-					struct config_bool *conf = (struct config_bool *) gconf;
+					struct config_bool *conf = &gconf->_bool;

 					if (conf->assign_hook)
 						conf->assign_hook(conf->reset_val,
-										  conf->reset_extra);
+										  gconf->reset_extra);
 					*conf->variable = conf->reset_val;
-					set_extra_field(&conf->gen, &conf->gen.extra,
-									conf->reset_extra);
+					set_extra_field(gconf, &gconf->extra,
+									gconf->reset_extra);
 					break;
 				}
 			case PGC_INT:
 				{
-					struct config_int *conf = (struct config_int *) gconf;
+					struct config_int *conf = &gconf->_int;

 					if (conf->assign_hook)
 						conf->assign_hook(conf->reset_val,
-										  conf->reset_extra);
+										  gconf->reset_extra);
 					*conf->variable = conf->reset_val;
-					set_extra_field(&conf->gen, &conf->gen.extra,
-									conf->reset_extra);
+					set_extra_field(gconf, &gconf->extra,
+									gconf->reset_extra);
 					break;
 				}
 			case PGC_REAL:
 				{
-					struct config_real *conf = (struct config_real *) gconf;
+					struct config_real *conf = &gconf->_real;

 					if (conf->assign_hook)
 						conf->assign_hook(conf->reset_val,
-										  conf->reset_extra);
+										  gconf->reset_extra);
 					*conf->variable = conf->reset_val;
-					set_extra_field(&conf->gen, &conf->gen.extra,
-									conf->reset_extra);
+					set_extra_field(gconf, &gconf->extra,
+									gconf->reset_extra);
 					break;
 				}
 			case PGC_STRING:
 				{
-					struct config_string *conf = (struct config_string *) gconf;
+					struct config_string *conf = &gconf->_string;

 					if (conf->assign_hook)
 						conf->assign_hook(conf->reset_val,
-										  conf->reset_extra);
-					set_string_field(conf, conf->variable, conf->reset_val);
-					set_extra_field(&conf->gen, &conf->gen.extra,
-									conf->reset_extra);
+										  gconf->reset_extra);
+					set_string_field(gconf, conf->variable, conf->reset_val);
+					set_extra_field(gconf, &gconf->extra,
+									gconf->reset_extra);
 					break;
 				}
 			case PGC_ENUM:
 				{
-					struct config_enum *conf = (struct config_enum *) gconf;
+					struct config_enum *conf = &gconf->_enum;

 					if (conf->assign_hook)
 						conf->assign_hook(conf->reset_val,
-										  conf->reset_extra);
+										  gconf->reset_extra);
 					*conf->variable = conf->reset_val;
-					set_extra_field(&conf->gen, &conf->gen.extra,
-									conf->reset_extra);
+					set_extra_field(gconf, &gconf->extra,
+									gconf->reset_extra);
 					break;
 				}
 		}
@@ -2403,17 +2278,17 @@ AtEOXact_GUC(bool isCommit, int nestLevel)
 					{
 						case PGC_BOOL:
 							{
-								struct config_bool *conf = (struct config_bool *) gconf;
+								struct config_bool *conf = &gconf->_bool;
 								bool		newval = newvalue.val.boolval;
 								void	   *newextra = newvalue.extra;

 								if (*conf->variable != newval ||
-									conf->gen.extra != newextra)
+									gconf->extra != newextra)
 								{
 									if (conf->assign_hook)
 										conf->assign_hook(newval, newextra);
 									*conf->variable = newval;
-									set_extra_field(&conf->gen, &conf->gen.extra,
+									set_extra_field(gconf, &gconf->extra,
 													newextra);
 									changed = true;
 								}
@@ -2421,17 +2296,17 @@ AtEOXact_GUC(bool isCommit, int nestLevel)
 							}
 						case PGC_INT:
 							{
-								struct config_int *conf = (struct config_int *) gconf;
+								struct config_int *conf = &gconf->_int;
 								int			newval = newvalue.val.intval;
 								void	   *newextra = newvalue.extra;

 								if (*conf->variable != newval ||
-									conf->gen.extra != newextra)
+									gconf->extra != newextra)
 								{
 									if (conf->assign_hook)
 										conf->assign_hook(newval, newextra);
 									*conf->variable = newval;
-									set_extra_field(&conf->gen, &conf->gen.extra,
+									set_extra_field(gconf, &gconf->extra,
 													newextra);
 									changed = true;
 								}
@@ -2439,17 +2314,17 @@ AtEOXact_GUC(bool isCommit, int nestLevel)
 							}
 						case PGC_REAL:
 							{
-								struct config_real *conf = (struct config_real *) gconf;
+								struct config_real *conf = &gconf->_real;
 								double		newval = newvalue.val.realval;
 								void	   *newextra = newvalue.extra;

 								if (*conf->variable != newval ||
-									conf->gen.extra != newextra)
+									gconf->extra != newextra)
 								{
 									if (conf->assign_hook)
 										conf->assign_hook(newval, newextra);
 									*conf->variable = newval;
-									set_extra_field(&conf->gen, &conf->gen.extra,
+									set_extra_field(gconf, &gconf->extra,
 													newextra);
 									changed = true;
 								}
@@ -2457,17 +2332,17 @@ AtEOXact_GUC(bool isCommit, int nestLevel)
 							}
 						case PGC_STRING:
 							{
-								struct config_string *conf = (struct config_string *) gconf;
+								struct config_string *conf = &gconf->_string;
 								char	   *newval = newvalue.val.stringval;
 								void	   *newextra = newvalue.extra;

 								if (*conf->variable != newval ||
-									conf->gen.extra != newextra)
+									gconf->extra != newextra)
 								{
 									if (conf->assign_hook)
 										conf->assign_hook(newval, newextra);
-									set_string_field(conf, conf->variable, newval);
-									set_extra_field(&conf->gen, &conf->gen.extra,
+									set_string_field(gconf, conf->variable, newval);
+									set_extra_field(gconf, &gconf->extra,
 													newextra);
 									changed = true;
 								}
@@ -2478,23 +2353,23 @@ AtEOXact_GUC(bool isCommit, int nestLevel)
 								 * we have type-specific code anyway, might as
 								 * well inline it.
 								 */
-								set_string_field(conf, &stack->prior.val.stringval, NULL);
-								set_string_field(conf, &stack->masked.val.stringval, NULL);
+								set_string_field(gconf, &stack->prior.val.stringval, NULL);
+								set_string_field(gconf, &stack->masked.val.stringval, NULL);
 								break;
 							}
 						case PGC_ENUM:
 							{
-								struct config_enum *conf = (struct config_enum *) gconf;
+								struct config_enum *conf = &gconf->_enum;
 								int			newval = newvalue.val.enumval;
 								void	   *newextra = newvalue.extra;

 								if (*conf->variable != newval ||
-									conf->gen.extra != newextra)
+									gconf->extra != newextra)
 								{
 									if (conf->assign_hook)
 										conf->assign_hook(newval, newextra);
 									*conf->variable = newval;
-									set_extra_field(&conf->gen, &conf->gen.extra,
+									set_extra_field(gconf, &gconf->extra,
 													newextra);
 									changed = true;
 								}
@@ -2674,7 +2549,6 @@ convert_to_base_unit(double value, const char *unit,
 	char		unitstr[MAX_UNIT_LEN + 1];
 	int			unitlen;
 	const unit_conversion *table;
-	int			i;

 	/* extract unit string to compare to table entries */
 	unitlen = 0;
@@ -2694,7 +2568,7 @@ convert_to_base_unit(double value, const char *unit,
 	else
 		table = time_unit_conversion_table;

-	for (i = 0; *table[i].unit; i++)
+	for (int i = 0; *table[i].unit; i++)
 	{
 		if (base_unit == table[i].base_unit &&
 			strcmp(unitstr, table[i].unit) == 0)
@@ -2730,7 +2604,6 @@ convert_int_from_base_unit(int64 base_value, int base_unit,
 						   int64 *value, const char **unit)
 {
 	const unit_conversion *table;
-	int			i;

 	*unit = NULL;

@@ -2739,7 +2612,7 @@ convert_int_from_base_unit(int64 base_value, int base_unit,
 	else
 		table = time_unit_conversion_table;

-	for (i = 0; *table[i].unit; i++)
+	for (int i = 0; *table[i].unit; i++)
 	{
 		if (base_unit == table[i].base_unit)
 		{
@@ -2772,7 +2645,6 @@ convert_real_from_base_unit(double base_value, int base_unit,
 							double *value, const char **unit)
 {
 	const unit_conversion *table;
-	int			i;

 	*unit = NULL;

@@ -2781,7 +2653,7 @@ convert_real_from_base_unit(double base_value, int base_unit,
 	else
 		table = time_unit_conversion_table;

-	for (i = 0; *table[i].unit; i++)
+	for (int i = 0; *table[i].unit; i++)
 	{
 		if (base_unit == table[i].base_unit)
 		{
@@ -3020,18 +2892,16 @@ parse_real(const char *value, double *result, int flags, const char **hintmsg)
 * allocated for modification.
 */
 const char *
-config_enum_lookup_by_value(struct config_enum *record, int val)
+config_enum_lookup_by_value(const struct config_generic *record, int val)
 {
-	const struct config_enum_entry *entry;
-
-	for (entry = record->options; entry && entry->name; entry++)
+	for (const struct config_enum_entry *entry = record->_enum.options; entry && entry->name; entry++)
 	{
 		if (entry->val == val)
 			return entry->name;
 	}

 	elog(ERROR, "could not find enum option %d for %s",
-		 val, record->gen.name);
+		 val, record->name);
 	return NULL;				/* silence compiler */
 }

@@ -3043,12 +2913,10 @@ config_enum_lookup_by_value(struct config_enum *record, int val)
 * true.  If it's not found, return false and retval is set to 0.
 */
 bool
-config_enum_lookup_by_name(struct config_enum *record, const char *value,
+config_enum_lookup_by_name(const struct config_enum *record, const char *value,
 						   int *retval)
 {
-	const struct config_enum_entry *entry;
-
-	for (entry = record->options; entry && entry->name; entry++)
+	for (const struct config_enum_entry *entry = record->options; entry && entry->name; entry++)
 	{
 		if (pg_strcasecmp(value, entry->name) == 0)
 		{
 		{
@@ -3069,10 +2937,9 @@ config_enum_lookup_by_name(struct config_enum *record, const char *value,
 * If suffix is non-NULL, it is added to the end of the string.
 */
 char *
-config_enum_get_options(struct config_enum *record, const char *prefix,
+config_enum_get_options(const struct config_enum *record, const char *prefix,
 						const char *suffix, const char *separator)
 {
-	const struct config_enum_entry *entry;
 	StringInfoData retstr;
 	int			seplen;

@@ -3080,7 +2947,7 @@ config_enum_get_options(struct config_enum *record, const char *prefix,
 	appendStringInfoString(&retstr, prefix);

 	seplen = strlen(separator);
-	for (entry = record->options; entry && entry->name; entry++)
+	for (const struct config_enum_entry *entry = record->options; entry && entry->name; entry++)
 	{
 		if (!entry->hidden)
 		{
@@ -3126,7 +2993,7 @@ config_enum_get_options(struct config_enum *record, const char *prefix,
 * Returns true if OK, false if not (or throws error, if elevel >= ERROR)
 */
 static bool
-parse_and_validate_value(struct config_generic *record,
+parse_and_validate_value(const struct config_generic *record,
 						 const char *value, GucSource source, int elevel,
 						 union config_var_val *newval, void **newextra)
@@ -3135,41 +3002,39 @@ parse_and_validate_value(struct config_generic *record,
 	{
 		case PGC_BOOL:
 			{
-				struct config_bool *conf = (struct config_bool *) record;
-
 				if (!parse_bool(value, &newval->boolval))
 				{
 					ereport(elevel,
 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							 errmsg("parameter \"%s\" requires a Boolean value",
-									conf->gen.name)));
+									record->name)));
 					return false;
 				}

-				if (!call_bool_check_hook(conf, &newval->boolval, newextra,
+				if (!call_bool_check_hook(record, &newval->boolval, newextra,
 										  source, elevel))
 					return false;
 			}
 			break;
 		case PGC_INT:
 			{
-				struct config_int *conf = (struct config_int *) record;
+				const struct config_int *conf = &record->_int;
 				const char *hintmsg;

 				if (!parse_int(value, &newval->intval,
-							   conf->gen.flags, &hintmsg))
+							   record->flags, &hintmsg))
 				{
 					ereport(elevel,
 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							 errmsg("invalid value for parameter \"%s\": \"%s\"",
-									conf->gen.name, value),
+									record->name, value),
 							 hintmsg ? errhint("%s", _(hintmsg)) : 0));
 					return false;
 				}

 				if (newval->intval < conf->min || newval->intval > conf->max)
 				{
-					const char *unit = get_config_unit_name(conf->gen.flags);
+					const char *unit = get_config_unit_name(record->flags);
 					const char *unitspace;

 					if (unit)
@@ -3181,36 +3046,36 @@ parse_and_validate_value(struct config_generic *record,
 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							 errmsg("%d%s%s is outside the valid range for parameter \"%s\" (%d%s%s .. %d%s%s)",
 									newval->intval, unitspace, unit,
-									conf->gen.name,
+									record->name,
 									conf->min, unitspace, unit,
 									conf->max, unitspace, unit)));
 					return false;
 				}

-				if (!call_int_check_hook(conf, &newval->intval, newextra,
+				if (!call_int_check_hook(record, &newval->intval, newextra,
 										 source, elevel))
 					return false;
 			}
 			break;
 		case PGC_REAL:
 			{
-				struct config_real *conf = (struct config_real *) record;
+				const struct config_real *conf = &record->_real;
 				const char *hintmsg;

 				if (!parse_real(value, &newval->realval,
-								conf->gen.flags, &hintmsg))
+								record->flags, &hintmsg))
 				{
 					ereport(elevel,
 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							 errmsg("invalid value for parameter \"%s\": \"%s\"",
-									conf->gen.name, value),
+									record->name, value),
 							 hintmsg ? errhint("%s", _(hintmsg)) : 0));
 					return false;
 				}

 				if (newval->realval < conf->min || newval->realval > conf->max)
 				{
-					const char *unit = get_config_unit_name(conf->gen.flags);
+					const char *unit = get_config_unit_name(record->flags);
 					const char *unitspace;

 					if (unit)
@@ -3222,21 +3087,19 @@ parse_and_validate_value(struct config_generic *record,
 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							 errmsg("%g%s%s is outside the valid range for parameter \"%s\" (%g%s%s .. %g%s%s)",
 									newval->realval, unitspace, unit,
-									conf->gen.name,
+									record->name,
 									conf->min, unitspace, unit,
 									conf->max, unitspace, unit)));
 					return false;
 				}

-				if (!call_real_check_hook(conf, &newval->realval, newextra,
+				if (!call_real_check_hook(record, &newval->realval, newextra,
 										  source, elevel))
 					return false;
 			}
 			break;
 		case PGC_STRING:
 			{
-				struct config_string *conf = (struct config_string *) record;
-
 				/*
 				 * The value passed by the caller could be transient, so we
 				 * always strdup it.
@@ -3249,12 +3112,12 @@ parse_and_validate_value(struct config_generic *record,
 				 * The only built-in "parsing" check we have is to apply
 				 * truncation if GUC_IS_NAME.
 				 */
-				if (conf->gen.flags & GUC_IS_NAME)
+				if (record->flags & GUC_IS_NAME)
 					truncate_identifier(newval->stringval,
 										strlen(newval->stringval),
 										true);

-				if (!call_string_check_hook(conf, &newval->stringval, newextra,
+				if (!call_string_check_hook(record, &newval->stringval, newextra,
 											source, elevel))
 				{
 					guc_free(newval->stringval);
@@ -3265,28 +3128,39 @@ parse_and_validate_value(struct config_generic *record,
 			break;
 		case PGC_ENUM:
 			{
-				struct config_enum *conf = (struct config_enum *) record;
+				const struct config_enum *conf = &record->_enum;

 				if (!config_enum_lookup_by_name(conf, value, &newval->enumval))
 				{
 					char	   *hintmsg;

 					hintmsg = config_enum_get_options(conf,
-													  "Available values: ",
-													  ".", ", ");
+													  _("Available values: "),
+
+					/*
+					 * translator: This is the terminator of a list of entity
+					 * names.
+					 */
+													  _("."),
+
+					/*
+					 * translator: This is a separator in a list of entity
+					 * names.
+					 */
+													  _(", "));

 					ereport(elevel,
 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							 errmsg("invalid value for parameter \"%s\": \"%s\"",
-									conf->gen.name, value),
-							 hintmsg ? errhint("%s", _(hintmsg)) : 0));
+									record->name, value),
+							 hintmsg ? errhint("%s", hintmsg) : 0));

 					if (hintmsg)
 						pfree(hintmsg);
 					return false;
 				}

-				if (!call_enum_check_hook(conf, &newval->enumval, newextra,
+				if (!call_enum_check_hook(record, &newval->enumval, newextra,
 										  source, elevel))
 					return false;
 			}
 			break;
@@ -3704,7 +3578,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 	{
 		case PGC_BOOL:
 			{
-				struct config_bool *conf = (struct config_bool *) record;
+				struct config_bool *conf = &record->_bool;

#define newval (newval_union.boolval)

@@ -3718,23 +3592,23 @@ set_config_with_handle(const char *name, config_handle *handle,
 				else if (source == PGC_S_DEFAULT)
 				{
 					newval = conf->boot_val;
-					if (!call_bool_check_hook(conf, &newval, &newextra,
+					if (!call_bool_check_hook(record, &newval, &newextra,
 											  source, elevel))
 						return 0;
 				}
 				else
 				{
 					newval = conf->reset_val;
-					newextra = conf->reset_extra;
-					source = conf->gen.reset_source;
-					context = conf->gen.reset_scontext;
-					srole = conf->gen.reset_srole;
+					newextra = record->reset_extra;
+					source = record->reset_source;
+					context = record->reset_scontext;
+					srole = record->reset_srole;
 				}

 				if (prohibitValueChange)
 				{
 					/* Release newextra, unless it's reset_extra */
-					if (newextra && !extra_field_used(&conf->gen, newextra))
+					if (newextra && !extra_field_used(record, newextra))
 						guc_free(newextra);

 					if (*conf->variable != newval)
@@ -3743,7 +3617,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 						ereport(elevel,
 								(errcode(ERRCODE_CANT_CHANGE_RUNTIME_PARAM),
 								 errmsg("parameter \"%s\" cannot be changed without restarting the server",
-										conf->gen.name)));
+										record->name)));
 						return 0;
 					}
 					record->status &= ~GUC_PENDING_RESTART;
@@ -3754,36 +3628,34 @@ set_config_with_handle(const char *name, config_handle *handle,
 				{
 					/* Save old value to support transaction abort */
 					if (!makeDefault)
-						push_old_value(&conf->gen, action);
+						push_old_value(record, action);

 					if (conf->assign_hook)
 						conf->assign_hook(newval, newextra);
 					*conf->variable = newval;
-					set_extra_field(&conf->gen, &conf->gen.extra,
+					set_extra_field(record, &record->extra,
 									newextra);
-					set_guc_source(&conf->gen, source);
-					conf->gen.scontext = context;
-					conf->gen.srole = srole;
+					set_guc_source(record, source);
+					record->scontext = context;
+					record->srole = srole;
 				}

 				if (makeDefault)
 				{
-					GucStack   *stack;
-
-					if (conf->gen.reset_source <= source)
+					if (record->reset_source <= source)
 					{
 						conf->reset_val = newval;
-						set_extra_field(&conf->gen, &conf->reset_extra,
+						set_extra_field(record, &record->reset_extra,
 										newextra);
-						conf->gen.reset_source = source;
-						conf->gen.reset_scontext = context;
-						conf->gen.reset_srole = srole;
+						record->reset_source = source;
+						record->reset_scontext = context;
+						record->reset_srole = srole;
 					}
-					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					for (GucStack *stack = record->stack; stack; stack = stack->prev)
 					{
 						if (stack->source <= source)
 						{
 							stack->prior.val.boolval = newval;
-							set_extra_field(&conf->gen, &stack->prior.extra,
+							set_extra_field(record, &stack->prior.extra,
 											newextra);
 							stack->source = source;
 							stack->scontext = context;
@@ -3793,7 +3665,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 				}

 				/* Perhaps we didn't install newextra anywhere */
-				if (newextra && !extra_field_used(&conf->gen, newextra))
+				if (newextra && !extra_field_used(record, newextra))
 					guc_free(newextra);
 				break;

@@ -3802,7 +3674,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 		case PGC_INT:
 			{
-				struct config_int *conf = (struct config_int *) record;
+				struct config_int *conf = &record->_int;

#define newval (newval_union.intval)

@@ -3816,23 +3688,23 @@ set_config_with_handle(const char *name, config_handle *handle,
 				else if (source == PGC_S_DEFAULT)
 				{
 					newval = conf->boot_val;
-					if (!call_int_check_hook(conf, &newval, &newextra,
+					if (!call_int_check_hook(record, &newval, &newextra,
 											 source, elevel))
 						return 0;
 				}
 				else
 				{
 					newval = conf->reset_val;
-					newextra = conf->reset_extra;
-					source = conf->gen.reset_source;
-					context = conf->gen.reset_scontext;
-					srole = conf->gen.reset_srole;
+					newextra = record->reset_extra;
+					source = record->reset_source;
+					context = record->reset_scontext;
+					srole = record->reset_srole;
 				}

 				if (prohibitValueChange)
 				{
 					/* Release newextra, unless it's reset_extra */
-					if (newextra && !extra_field_used(&conf->gen, newextra))
+					if (newextra && !extra_field_used(record, newextra))
 						guc_free(newextra);

 					if (*conf->variable != newval)
@@ -3841,7 +3713,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 						ereport(elevel,
 								(errcode(ERRCODE_CANT_CHANGE_RUNTIME_PARAM),
 								 errmsg("parameter \"%s\" cannot be changed without restarting the server",
-										conf->gen.name)));
+										record->name)));
 						return 0;
 					}
 					record->status &= ~GUC_PENDING_RESTART;
@@ -3852,36 +3724,34 @@ set_config_with_handle(const char *name, config_handle *handle,
 				{
 					/* Save old value to support transaction abort */
 					if (!makeDefault)
-						push_old_value(&conf->gen, action);
+						push_old_value(record, action);

 					if (conf->assign_hook)
 						conf->assign_hook(newval, newextra);
 					*conf->variable = newval;
-					set_extra_field(&conf->gen, &conf->gen.extra,
+					set_extra_field(record, &record->extra,
 									newextra);
-					set_guc_source(&conf->gen, source);
-					conf->gen.scontext = context;
-					conf->gen.srole = srole;
+					set_guc_source(record, source);
+					record->scontext = context;
+					record->srole = srole;
 				}

 				if (makeDefault)
 				{
-					GucStack   *stack;
-
-					if (conf->gen.reset_source <= source)
+					if (record->reset_source <= source)
 					{
 						conf->reset_val = newval;
-						set_extra_field(&conf->gen, &conf->reset_extra,
+						set_extra_field(record, &record->reset_extra,
 										newextra);
-						conf->gen.reset_source = source;
-						conf->gen.reset_scontext = context;
-						conf->gen.reset_srole = srole;
+						record->reset_source = source;
+						record->reset_scontext = context;
+						record->reset_srole = srole;
 					}
-					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					for (GucStack *stack = record->stack; stack; stack = stack->prev)
 					{
 						if (stack->source <= source)
 						{
 							stack->prior.val.intval = newval;
-							set_extra_field(&conf->gen, &stack->prior.extra,
+							set_extra_field(record, &stack->prior.extra,
 											newextra);
 							stack->source = source;
 							stack->scontext = context;
@@ -3891,7 +3761,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 				}

 				/* Perhaps we didn't install newextra anywhere */
-				if (newextra && !extra_field_used(&conf->gen, newextra))
+				if (newextra && !extra_field_used(record, newextra))
 					guc_free(newextra);
 				break;

@@ -3900,7 +3770,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 		case PGC_REAL:
 			{
-				struct config_real *conf = (struct config_real *) record;
+				struct config_real *conf = &record->_real;

#define newval (newval_union.realval)

@@ -3914,23 +3784,23 @@ set_config_with_handle(const char *name, config_handle *handle,
 				else if (source == PGC_S_DEFAULT)
 				{
 					newval = conf->boot_val;
-					if (!call_real_check_hook(conf, &newval, &newextra,
+					if (!call_real_check_hook(record, &newval, &newextra,
 											  source, elevel))
 						return 0;
 				}
 				else
 				{
 					newval = conf->reset_val;
-					newextra = conf->reset_extra;
-					source = conf->gen.reset_source;
-					context = conf->gen.reset_scontext;
-					srole = conf->gen.reset_srole;
+					newextra = record->reset_extra;
+					source = record->reset_source;
+					context = record->reset_scontext;
+					srole = record->reset_srole;
 				}

 				if (prohibitValueChange)
 				{
 					/* Release newextra, unless it's reset_extra */
-					if (newextra && !extra_field_used(&conf->gen, newextra))
+					if (newextra && !extra_field_used(record, newextra))
 						guc_free(newextra);

 					if (*conf->variable != newval)
@@ -3939,7 +3809,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 						ereport(elevel,
 								(errcode(ERRCODE_CANT_CHANGE_RUNTIME_PARAM),
 								 errmsg("parameter \"%s\" cannot be changed without restarting the server",
-										conf->gen.name)));
+										record->name)));
 						return 0;
 					}
 					record->status &= ~GUC_PENDING_RESTART;
@@ -3950,36 +3820,34 @@ set_config_with_handle(const char *name, config_handle *handle,
 				{
 					/* Save old value to support transaction abort */
 					if (!makeDefault)
-						push_old_value(&conf->gen, action);
+						push_old_value(record, action);

 					if (conf->assign_hook)
 						conf->assign_hook(newval, newextra);
 					*conf->variable = newval;
-					set_extra_field(&conf->gen, &conf->gen.extra,
+					set_extra_field(record, &record->extra,
 									newextra);
-					set_guc_source(&conf->gen, source);
-					conf->gen.scontext = context;
-					conf->gen.srole = srole;
+					set_guc_source(record, source);
+					record->scontext = context;
+					record->srole = srole;
 				}

 				if (makeDefault)
 				{
-					GucStack   *stack;
-
-					if (conf->gen.reset_source <= source)
+					if (record->reset_source <= source)
 					{
 						conf->reset_val = newval;
-						set_extra_field(&conf->gen, &conf->reset_extra,
+						set_extra_field(record, &record->reset_extra,
 										newextra);
-						conf->gen.reset_source = source;
-						conf->gen.reset_scontext = context;
-						conf->gen.reset_srole = srole;
+						record->reset_source = source;
+						record->reset_scontext = context;
+						record->reset_srole = srole;
 					}
-					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					for (GucStack *stack = record->stack; stack; stack = stack->prev)
 					{
 						if (stack->source <= source)
 						{
 							stack->prior.val.realval = newval;
-							set_extra_field(&conf->gen, &stack->prior.extra,
+							set_extra_field(record, &stack->prior.extra,
 											newextra);
 							stack->source = source;
 							stack->scontext = context;
@@ -3989,7 +3857,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 				}

 				/* Perhaps we didn't install newextra anywhere */
-				if (newextra && !extra_field_used(&conf->gen, newextra))
+				if (newextra && !extra_field_used(record, newextra))
 					guc_free(newextra);
 				break;

@@ -3998,7 +3866,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 		case PGC_STRING:
 			{
-				struct config_string *conf = (struct config_string *) record;
+				struct config_string *conf = &record->_string;
 				GucContext	orig_context = context;
 				GucSource	orig_source = source;
 				Oid			orig_srole = srole;
@@ -4024,7 +3892,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 				else
 					newval = NULL;

-				if (!call_string_check_hook(conf, &newval, &newextra,
+				if (!call_string_check_hook(record, &newval, &newextra,
 											source, elevel))
 				{
 					guc_free(newval);
@@ -4038,10 +3906,10 @@ set_config_with_handle(const char *name, config_handle *handle,
 					 * guc.c's control
 					 */
 					newval = conf->reset_val;
-					newextra = conf->reset_extra;
-					source = conf->gen.reset_source;
-					context = conf->gen.reset_scontext;
-					srole = conf->gen.reset_srole;
+					newextra = record->reset_extra;
+					source = record->reset_source;
+					context = record->reset_scontext;
+					srole = record->reset_srole;
 				}

 				if (prohibitValueChange)
@@ -4054,10 +3922,10 @@ set_config_with_handle(const char *name, config_handle *handle,
 								 strcmp(*conf->variable, newval) != 0);

 					/* Release newval, unless it's reset_val */
-					if (newval && !string_field_used(conf, newval))
+					if (newval && !string_field_used(record, newval))
 						guc_free(newval);
 					/* Release newextra, unless it's reset_extra */
-					if (newextra && !extra_field_used(&conf->gen, newextra))
+					if (newextra && !extra_field_used(record, newextra))
 						guc_free(newextra);

 					if (newval_different)
@@ -4066,7 +3934,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 						ereport(elevel,
 								(errcode(ERRCODE_CANT_CHANGE_RUNTIME_PARAM),
 								 errmsg("parameter \"%s\" cannot be changed without restarting the server",
-										conf->gen.name)));
+										record->name)));
 						return 0;
 					}
 					record->status &= ~GUC_PENDING_RESTART;
@@ -4077,16 +3945,16 @@ set_config_with_handle(const char *name, config_handle *handle,
 				{
 					/* Save old value to support transaction abort */
 					if (!makeDefault)
-						push_old_value(&conf->gen, action);
+						push_old_value(record, action);

 					if (conf->assign_hook)
 						conf->assign_hook(newval, newextra);
-					set_string_field(conf, conf->variable, newval);
-					set_extra_field(&conf->gen, &conf->gen.extra,
+					set_string_field(record, conf->variable, newval);
+					set_extra_field(record, &record->extra,
 									newextra);
-					set_guc_source(&conf->gen, source);
-					conf->gen.scontext = context;
-					conf->gen.srole = srole;
+					set_guc_source(record, source);
+					record->scontext = context;
+					record->srole = srole;

 					/*
 					 * Ugly hack: during SET session_authorization, forcibly
@@ -4113,7 +3981,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 					 * that.
 					 */
 					if (!is_reload &&
-						strcmp(conf->gen.name, "session_authorization") == 0)
+						strcmp(record->name, "session_authorization") == 0)
 						(void) set_config_with_handle("role", NULL,
 													  value ? "none" : NULL,
 													  orig_context,
@@ -4129,24 +3997,22 @@ set_config_with_handle(const char *name, config_handle *handle,

 				if (makeDefault)
 				{
-					GucStack   *stack;
-
-					if (conf->gen.reset_source <= source)
+					if (record->reset_source <= source)
 					{
-						set_string_field(conf, &conf->reset_val, newval);
-						set_extra_field(&conf->gen, &conf->reset_extra,
+						set_string_field(record, &conf->reset_val, newval);
+						set_extra_field(record, &record->reset_extra,
 										newextra);
-						conf->gen.reset_source = source;
-						conf->gen.reset_scontext = context;
-						conf->gen.reset_srole = srole;
+						record->reset_source = source;
+						record->reset_scontext = context;
+						record->reset_srole = srole;
 					}
-					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					for (GucStack *stack = record->stack; stack; stack = stack->prev)
 					{
 						if (stack->source <= source)
 						{
-							set_string_field(conf, &stack->prior.val.stringval,
+							set_string_field(record, &stack->prior.val.stringval,
 											 newval);
-							set_extra_field(&conf->gen, &stack->prior.extra,
+							set_extra_field(record, &stack->prior.extra,
 											newextra);
 							stack->source = source;
 							stack->scontext = context;
@@ -4156,10 +4022,10 @@ set_config_with_handle(const char *name, config_handle *handle,
 				}

 				/* Perhaps we didn't install newval anywhere */
-				if (newval && !string_field_used(conf, newval))
+				if (newval && !string_field_used(record, newval))
 					guc_free(newval);
 				/* Perhaps we didn't install newextra anywhere */
-				if (newextra && !extra_field_used(&conf->gen, newextra))
+				if (newextra && !extra_field_used(record, newextra))
 					guc_free(newextra);
 				break;

@@ -4168,7 +4034,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 		case PGC_ENUM:
 			{
-				struct config_enum *conf = (struct config_enum *) record;
+				struct config_enum *conf = &record->_enum;

#define newval (newval_union.enumval)

@@ -4182,23 +4048,23 @@ set_config_with_handle(const char *name, config_handle *handle,
 				else if (source == PGC_S_DEFAULT)
 				{
 					newval = conf->boot_val;
-					if (!call_enum_check_hook(conf, &newval, &newextra,
+					if (!call_enum_check_hook(record, &newval, &newextra,
 											  source, elevel))
 						return 0;
 				}
 				else
 				{
 					newval = conf->reset_val;
-					newextra = conf->reset_extra;
-					source = conf->gen.reset_source;
-					context = conf->gen.reset_scontext;
-					srole = conf->gen.reset_srole;
+					newextra = record->reset_extra;
+					source = record->reset_source;
+					context = record->reset_scontext;
+					srole = record->reset_srole;
 				}

 				if (prohibitValueChange)
 				{
 					/* Release newextra, unless it's reset_extra */
-					if (newextra && !extra_field_used(&conf->gen, newextra))
+					if (newextra && !extra_field_used(record, newextra))
 						guc_free(newextra);

 					if (*conf->variable != newval)
@@ -4207,7 +4073,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 						ereport(elevel,
 								(errcode(ERRCODE_CANT_CHANGE_RUNTIME_PARAM),
 								 errmsg("parameter \"%s\" cannot be changed without restarting the server",
-										conf->gen.name)));
+										record->name)));
 						return 0;
 					}
 					record->status &= ~GUC_PENDING_RESTART;
@@ -4218,36 +4084,34 @@ set_config_with_handle(const char *name, config_handle *handle,
 				{
 					/* Save old value to support transaction abort */
 					if (!makeDefault)
-						push_old_value(&conf->gen, action);
+						push_old_value(record, action);

 					if (conf->assign_hook)
 						conf->assign_hook(newval, newextra);
 					*conf->variable = newval;
-					set_extra_field(&conf->gen, &conf->gen.extra,
+					set_extra_field(record, &record->extra,
 									newextra);
-					set_guc_source(&conf->gen, source);
-					conf->gen.scontext = context;
-					conf->gen.srole = srole;
+					set_guc_source(record, source);
+					record->scontext = context;
+					record->srole = srole;
 				}

 				if (makeDefault)
 				{
-					GucStack   *stack;
-
-					if (conf->gen.reset_source <= source)
+					if (record->reset_source <= source)
 					{
 						conf->reset_val = newval;
-						set_extra_field(&conf->gen, &conf->reset_extra,
+						set_extra_field(record, &record->reset_extra,
 										newextra);
-						conf->gen.reset_source = source;
-						conf->gen.reset_scontext = context;
-						conf->gen.reset_srole = srole;
+						record->reset_source = source;
+						record->reset_scontext = context;
+						record->reset_srole = srole;
 					}
-					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					for (GucStack *stack = record->stack; stack; stack = stack->prev)
 					{
 						if (stack->source <= source)
 						{
 							stack->prior.val.enumval = newval;
-							set_extra_field(&conf->gen, &stack->prior.extra,
+							set_extra_field(record, &stack->prior.extra,
 											newextra);
 							stack->source = source;
 							stack->scontext = context;
@@ -4257,7 +4121,7 @@ set_config_with_handle(const char *name, config_handle *handle,
 				}

 				/* Perhaps we didn't install newextra anywhere */
-				if (newextra && !extra_field_used(&conf->gen, newextra))
+				if (newextra && !extra_field_used(record, newextra))
 					guc_free(newextra);
 				break;

@@ -4371,25 +4235,25 @@ GetConfigOption(const char *name, bool missing_ok, bool restrict_privileged)
 	switch (record->vartype)
 	{
 		case PGC_BOOL:
-			return *((struct config_bool *) record)->variable ? "on" : "off";
+			return *record->_bool.variable ? "on" : "off";

 		case PGC_INT:
 			snprintf(buffer, sizeof(buffer), "%d",
-					 *((struct config_int *) record)->variable);
+					 *record->_int.variable);
 			return buffer;

 		case PGC_REAL:
 			snprintf(buffer, sizeof(buffer), "%g",
-					 *((struct config_real *) record)->variable);
+					 *record->_real.variable);
 			return buffer;

 		case PGC_STRING:
-			return *((struct config_string *) record)->variable ?
-				*((struct config_string *) record)->variable : "";
+			return *record->_string.variable ?
+				*record->_string.variable : "";

 		case PGC_ENUM:
-			return config_enum_lookup_by_value((struct config_enum *) record,
-											   *((struct config_enum *) record)->variable);
+			return config_enum_lookup_by_value(record,
+											   *record->_enum.variable);
 	}
 	return NULL;
 }

@@ -4419,25 +4283,25 @@ GetConfigOptionResetString(const char *name)
 	switch (record->vartype)
 	{
 		case PGC_BOOL:
-			return ((struct config_bool *) record)->reset_val ? "on" : "off";
+			return record->_bool.reset_val ? "on" : "off";

 		case PGC_INT:
 			snprintf(buffer, sizeof(buffer), "%d",
-					 ((struct config_int *) record)->reset_val);
+					 record->_int.reset_val);
 			return buffer;

 		case PGC_REAL:
 			snprintf(buffer, sizeof(buffer), "%g",
-					 ((struct config_real *) record)->reset_val);
+					 record->_real.reset_val);
 			return buffer;

 		case PGC_STRING:
-			return ((struct config_string *) record)->reset_val ?
-				((struct config_string *) record)->reset_val : "";
+			return record->_string.reset_val ?
+				record->_string.reset_val : "";

 		case PGC_ENUM:
-			return config_enum_lookup_by_value((struct config_enum *) record,
-											   ((struct config_enum *) record)->reset_val);
+			return config_enum_lookup_by_value(record,
+											   record->_enum.reset_val);
 	}
 	return NULL;
 }
+ *record->_string.variable : ""; case PGC_ENUM: - return config_enum_lookup_by_value((struct config_enum *) record, - *((struct config_enum *) record)->variable); + return config_enum_lookup_by_value(record, + *record->_enum.variable); } return NULL; } @@ -4419,25 +4283,25 @@ GetConfigOptionResetString(const char *name) switch (record->vartype) { case PGC_BOOL: - return ((struct config_bool *) record)->reset_val ? "on" : "off"; + return record->_bool.reset_val ? "on" : "off"; case PGC_INT: snprintf(buffer, sizeof(buffer), "%d", - ((struct config_int *) record)->reset_val); + record->_int.reset_val); return buffer; case PGC_REAL: snprintf(buffer, sizeof(buffer), "%g", - ((struct config_real *) record)->reset_val); + record->_real.reset_val); return buffer; case PGC_STRING: - return ((struct config_string *) record)->reset_val ? - ((struct config_string *) record)->reset_val : ""; + return record->_string.reset_val ? + record->_string.reset_val : ""; case PGC_ENUM: - return config_enum_lookup_by_value((struct config_enum *) record, - ((struct config_enum *) record)->reset_val); + return config_enum_lookup_by_value(record, + record->_enum.reset_val); } return NULL; } @@ -4469,7 +4333,6 @@ static void write_auto_conf_file(int fd, const char *filename, ConfigVariable *head) { StringInfoData buf; - ConfigVariable *item; initStringInfo(&buf); @@ -4489,7 +4352,7 @@ write_auto_conf_file(int fd, const char *filename, ConfigVariable *head) } /* Emit each parameter, properly quoting the value */ - for (item = head; item != NULL; item = item->next) + for (ConfigVariable *item = head; item != NULL; item = item->next) { char *escaped; @@ -4537,7 +4400,7 @@ static void replace_auto_config_value(ConfigVariable **head_p, ConfigVariable **tail_p, const char *name, const char *value) { - ConfigVariable *item, + ConfigVariable *newitem, *next, *prev = NULL; @@ -4546,7 +4409,7 @@ replace_auto_config_value(ConfigVariable **head_p, ConfigVariable **tail_p, * one, but if external tools have modified the config file, there could * be more. */ - for (item = *head_p; item != NULL; item = next) + for (ConfigVariable *item = *head_p; item != NULL; item = next) { next = item->next; if (guc_name_compare(item->name, name) == 0) @@ -4573,21 +4436,21 @@ replace_auto_config_value(ConfigVariable **head_p, ConfigVariable **tail_p, return; /* OK, append a new entry */ - item = palloc(sizeof *item); - item->name = pstrdup(name); - item->value = pstrdup(value); - item->errmsg = NULL; - item->filename = pstrdup(""); /* new item has no location */ - item->sourceline = 0; - item->ignore = false; - item->applied = false; - item->next = NULL; + newitem = palloc_object(ConfigVariable); + newitem->name = pstrdup(name); + newitem->value = pstrdup(value); + newitem->errmsg = NULL; + newitem->filename = pstrdup(""); /* new item has no location */ + newitem->sourceline = 0; + newitem->ignore = false; + newitem->applied = false; + newitem->next = NULL; if (*head_p == NULL) - *head_p = item; + *head_p = newitem; else - (*tail_p)->next = item; - *tail_p = item; + (*tail_p)->next = newitem; + *tail_p = newitem; } @@ -4722,8 +4585,13 @@ AlterSystemSetConfigFile(AlterSystemStmt *altersysstmt) * the config file cannot cause postmaster start to fail, so we * don't have to be too tense about possibly installing a bad * value.) + * + * As an exception, we skip this check if this is a RESET command + * for an unknown custom GUC, else there'd be no way for users to + * remove such settings with reserved prefixes. 
@@ -4722,8 +4585,13 @@ AlterSystemSetConfigFile(AlterSystemStmt *altersysstmt)
 			 * the config file cannot cause postmaster start to fail, so we
 			 * don't have to be too tense about possibly installing a bad
 			 * value.)
+			 *
+			 * As an exception, we skip this check if this is a RESET command
+			 * for an unknown custom GUC, else there'd be no way for users to
+			 * remove such settings with reserved prefixes.
 			 */
-			(void) assignable_custom_variable_name(name, false, ERROR);
+			if (value || !valid_custom_variable_name(name))
+				(void) assignable_custom_variable_name(name, false, ERROR);
 		}

 	/*
@@ -4873,8 +4741,7 @@ init_custom_variable(const char *name,
 					 const char *long_desc,
 					 GucContext context,
 					 int flags,
-					 enum config_type type,
-					 size_t sz)
+					 enum config_type type)
 {
 	struct config_generic *gen;

@@ -4910,8 +4777,8 @@ init_custom_variable(const char *name,
 		context = PGC_SUSET;

 	/* As above, an OOM here is FATAL */
-	gen = (struct config_generic *) guc_malloc(FATAL, sz);
-	memset(gen, 0, sz);
+	gen = (struct config_generic *) guc_malloc(FATAL, sizeof(struct config_generic));
+	memset(gen, 0, sizeof(struct config_generic));

 	gen->name = guc_strdup(FATAL, name);
 	gen->context = context;
@@ -4933,7 +4800,7 @@ define_custom_variable(struct config_generic *variable)
 {
 	const char *name = variable->name;
 	GUCHashEntry *hentry;
-	struct config_string *pHolder;
+	struct config_generic *pHolder;

 	/* Check mapping between initial and default value */
 	Assert(check_GUC_init(variable));
@@ -4965,7 +4832,7 @@ define_custom_variable(struct config_generic *variable)
 				 errmsg("attempt to redefine parameter \"%s\"", name)));
 	Assert(hentry->gucvar->vartype == PGC_STRING);
-	pHolder = (struct config_string *) hentry->gucvar;
+	pHolder = hentry->gucvar;

 	/*
 	 * First, set the variable to its default value.  We must do this even
@@ -4984,7 +4851,7 @@ define_custom_variable(struct config_generic *variable)
 	/*
 	 * Remove the placeholder from any lists it's in, too.
 	 */
-	RemoveGUCFromLists(&pHolder->gen);
+	RemoveGUCFromLists(pHolder);

 	/*
 	 * Assign the string value(s) stored in the placeholder to the real
@@ -4998,36 +4865,28 @@ define_custom_variable(struct config_generic *variable)
 	 */

 	/* First, apply the reset value if any */
-	if (pHolder->reset_val)
-		(void) set_config_option_ext(name, pHolder->reset_val,
-									 pHolder->gen.reset_scontext,
-									 pHolder->gen.reset_source,
-									 pHolder->gen.reset_srole,
+	if (pHolder->_string.reset_val)
+		(void) set_config_option_ext(name, pHolder->_string.reset_val,
+									 pHolder->reset_scontext,
+									 pHolder->reset_source,
+									 pHolder->reset_srole,
 									 GUC_ACTION_SET, true, WARNING, false);
 	/* That should not have resulted in stacking anything */
 	Assert(variable->stack == NULL);

 	/* Now, apply current and stacked values, in the order they were stacked */
-	reapply_stacked_values(variable, pHolder, pHolder->gen.stack,
-						   *(pHolder->variable),
-						   pHolder->gen.scontext, pHolder->gen.source,
-						   pHolder->gen.srole);
+	reapply_stacked_values(variable, pHolder, pHolder->stack,
+						   *(pHolder->_string.variable),
+						   pHolder->scontext, pHolder->source,
+						   pHolder->srole);

 	/* Also copy over any saved source-location information */
-	if (pHolder->gen.sourcefile)
-		set_config_sourcefile(name, pHolder->gen.sourcefile,
-							  pHolder->gen.sourceline);
-
-	/*
-	 * Free up as much as we conveniently can of the placeholder structure.
-	 * (This neglects any stack items, so it's possible for some memory to be
-	 * leaked.  Since this can only happen once per session per variable, it
-	 * doesn't seem worth spending much code on.)
-	 */
-	set_string_field(pHolder, pHolder->variable, NULL);
-	set_string_field(pHolder, &pHolder->reset_val, NULL);
+	if (pHolder->sourcefile)
+		set_config_sourcefile(name, pHolder->sourcefile,
+							  pHolder->sourceline);

-	guc_free(pHolder);
+	/* Now we can free the no-longer-referenced placeholder variable */
+	free_placeholder(pHolder);
 }

 /*
@@ -5039,7 +4898,7 @@ define_custom_variable(struct config_generic *variable)
 */
 static void
 reapply_stacked_values(struct config_generic *variable,
-					   struct config_string *pHolder,
+					   struct config_generic *pHolder,
 					   GucStack *stack,
 					   const char *curvalue,
 					   GucContext curscontext, GucSource cursource,
@@ -5109,10 +4968,10 @@ reapply_stacked_values(struct config_generic *variable,
 		 * this is to be just a transactional assignment.  (We leak the stack
 		 * entry.)
 		 */
-		if (curvalue != pHolder->reset_val ||
-			curscontext != pHolder->gen.reset_scontext ||
-			cursource != pHolder->gen.reset_source ||
-			cursrole != pHolder->gen.reset_srole)
+		if (curvalue != pHolder->_string.reset_val ||
+			curscontext != pHolder->reset_scontext ||
+			cursource != pHolder->reset_source ||
+			cursrole != pHolder->reset_srole)
 		{
 			(void) set_config_option_ext(name, curvalue,
 										 curscontext, cursource, cursrole,
@@ -5126,6 +4985,25 @@ reapply_stacked_values(struct config_generic *variable,
 	}
 }

+/*
+ * Free up a no-longer-referenced placeholder GUC variable.
+ *
+ * This neglects any stack items, so it's possible for some memory to be
+ * leaked.  Since this can only happen once per session per variable, it
+ * doesn't seem worth spending much code on.
+ */
+static void
+free_placeholder(struct config_generic *pHolder)
+{
+	/* Placeholders are always STRING type, so free their values */
+	Assert(pHolder->vartype == PGC_STRING);
+	set_string_field(pHolder, pHolder->_string.variable, NULL);
+	set_string_field(pHolder, &pHolder->_string.reset_val, NULL);
+
+	guc_free(unconstify(char *, pHolder->name));
+	guc_free(pHolder);
+}
+
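The new free_placeholder() can assert PGC_STRING because placeholders have a single producer, add_placeholder_variable(), which allocates the struct and one trailing char* slot in a single chunk (see that hunk earlier in this patch):

size_t		sz = sizeof(struct config_generic) + sizeof(char *);
...
var->_string.variable = (char **) (var + 1);	/* value storage lives just past the struct */

So one guc_free(pHolder) releases struct and value slot together; only the strings hung off it need set_string_field(). Factoring this out also lets MarkGUCPrefixReserved() below actually free the placeholders it removes, closing the small leak the old comment shrugged off.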
 /*
 * Functions for extensions to call to define their custom GUC variables.
 */
@@ -5141,18 +5019,16 @@ DefineCustomBoolVariable(const char *name,
 						 GucBoolAssignHook assign_hook,
 						 GucShowHook show_hook)
 {
-	struct config_bool *var;
-
-	var = (struct config_bool *)
-		init_custom_variable(name, short_desc, long_desc, context, flags,
-							 PGC_BOOL, sizeof(struct config_bool));
-	var->variable = valueAddr;
-	var->boot_val = bootValue;
-	var->reset_val = bootValue;
-	var->check_hook = check_hook;
-	var->assign_hook = assign_hook;
-	var->show_hook = show_hook;
-	define_custom_variable(&var->gen);
+	struct config_generic *var;
+
+	var = init_custom_variable(name, short_desc, long_desc, context, flags, PGC_BOOL);
+	var->_bool.variable = valueAddr;
+	var->_bool.boot_val = bootValue;
+	var->_bool.reset_val = bootValue;
+	var->_bool.check_hook = check_hook;
+	var->_bool.assign_hook = assign_hook;
+	var->_bool.show_hook = show_hook;
+	define_custom_variable(var);
 }

 void
@@ -5169,20 +5045,18 @@ DefineCustomIntVariable(const char *name,
 						GucIntAssignHook assign_hook,
 						GucShowHook show_hook)
 {
-	struct config_int *var;
-
-	var = (struct config_int *)
-		init_custom_variable(name, short_desc, long_desc, context, flags,
-							 PGC_INT, sizeof(struct config_int));
-	var->variable = valueAddr;
-	var->boot_val = bootValue;
-	var->reset_val = bootValue;
-	var->min = minValue;
-	var->max = maxValue;
-	var->check_hook = check_hook;
-	var->assign_hook = assign_hook;
-	var->show_hook = show_hook;
-	define_custom_variable(&var->gen);
+	struct config_generic *var;
+
+	var = init_custom_variable(name, short_desc, long_desc, context, flags, PGC_INT);
+	var->_int.variable = valueAddr;
+	var->_int.boot_val = bootValue;
+	var->_int.reset_val = bootValue;
+	var->_int.min = minValue;
+	var->_int.max = maxValue;
+	var->_int.check_hook = check_hook;
+	var->_int.assign_hook = assign_hook;
+	var->_int.show_hook = show_hook;
+	define_custom_variable(var);
 }

 void
@@ -5199,20 +5073,18 @@ DefineCustomRealVariable(const char *name,
 						 GucRealAssignHook assign_hook,
 						 GucShowHook show_hook)
 {
-	struct config_real *var;
-
-	var = (struct config_real *)
-		init_custom_variable(name, short_desc, long_desc, context, flags,
-							 PGC_REAL, sizeof(struct config_real));
-	var->variable = valueAddr;
-	var->boot_val = bootValue;
-	var->reset_val = bootValue;
-	var->min = minValue;
-	var->max = maxValue;
-	var->check_hook = check_hook;
-	var->assign_hook = assign_hook;
-	var->show_hook = show_hook;
-	define_custom_variable(&var->gen);
+	struct config_generic *var;
+
+	var = init_custom_variable(name, short_desc, long_desc, context, flags, PGC_REAL);
+	var->_real.variable = valueAddr;
+	var->_real.boot_val = bootValue;
+	var->_real.reset_val = bootValue;
+	var->_real.min = minValue;
+	var->_real.max = maxValue;
+	var->_real.check_hook = check_hook;
+	var->_real.assign_hook = assign_hook;
+	var->_real.show_hook = show_hook;
+	define_custom_variable(var);
 }

 void
@@ -5227,17 +5099,15 @@ DefineCustomStringVariable(const char *name,
 						   GucStringAssignHook assign_hook,
 						   GucShowHook show_hook)
 {
-	struct config_string *var;
-
-	var = (struct config_string *)
-		init_custom_variable(name, short_desc, long_desc, context, flags,
-							 PGC_STRING, sizeof(struct config_string));
-	var->variable = valueAddr;
-	var->boot_val = bootValue;
-	var->check_hook = check_hook;
-	var->assign_hook = assign_hook;
-	var->show_hook = show_hook;
-	define_custom_variable(&var->gen);
+	struct config_generic *var;
+
+	var = init_custom_variable(name, short_desc, long_desc, context, flags, PGC_STRING);
+	var->_string.variable = valueAddr;
+	var->_string.boot_val = bootValue;
+	var->_string.check_hook = check_hook;
+	var->_string.assign_hook = assign_hook;
+	var->_string.show_hook = show_hook;
+	define_custom_variable(var);
 }

 void
@@ -5253,19 +5123,17 @@ DefineCustomEnumVariable(const char *name,
 						 GucEnumAssignHook assign_hook,
 						 GucShowHook show_hook)
 {
-	struct config_enum *var;
-
-	var = (struct config_enum *)
-		init_custom_variable(name, short_desc, long_desc, context, flags,
-							 PGC_ENUM, sizeof(struct config_enum));
-	var->variable = valueAddr;
-	var->boot_val = bootValue;
-	var->reset_val = bootValue;
-	var->options = options;
-	var->check_hook = check_hook;
-	var->assign_hook = assign_hook;
-	var->show_hook = show_hook;
-	define_custom_variable(&var->gen);
+	struct config_generic *var;
+
+	var = init_custom_variable(name, short_desc, long_desc, context, flags, PGC_ENUM);
+	var->_enum.variable = valueAddr;
+	var->_enum.boot_val = bootValue;
+	var->_enum.reset_val = bootValue;
+	var->_enum.options = options;
+	var->_enum.check_hook = check_hook;
+	var->_enum.assign_hook = assign_hook;
+	var->_enum.show_hook = show_hook;
+	define_custom_variable(var);
 }

 /*
@@ -5286,9 +5154,7 @@ MarkGUCPrefixReserved(const char *className)

 	/*
 	 * Check for existing placeholders.  We must actually remove invalid
-	 * placeholders, else future parallel worker startups will fail.  (We
-	 * don't bother trying to free associated memory, since this shouldn't
-	 * happen often.)
+	 * placeholders, else future parallel worker startups will fail.
 	 */
 	hash_seq_init(&status, guc_hashtab);
 	while ((hentry = (GUCHashEntry *) hash_seq_search(&status)) != NULL)
@@ -5312,6 +5178,8 @@ MarkGUCPrefixReserved(const char *className)
 						NULL);
 			/* Remove it from any lists it's in, too */
 			RemoveGUCFromLists(var);
+			/* And free it */
+			free_placeholder(var);
 		}
 	}

@@ -5340,7 +5208,7 @@ get_explain_guc_options(int *num)
 	 * While only a fraction of all the GUC variables are marked GUC_EXPLAIN,
 	 * it doesn't seem worth dynamically resizing this array.
 	 */
-	result = palloc(sizeof(struct config_generic *) * hash_get_num_entries(guc_hashtab));
+	result = palloc_array(struct config_generic *, hash_get_num_entries(guc_hashtab));

 	/* We need only consider GUCs with source not PGC_S_DEFAULT */
 	dlist_foreach(iter, &guc_nondef_list)
@@ -5364,7 +5232,7 @@ get_explain_guc_options(int *num)
 		{
 			case PGC_BOOL:
 				{
-					struct config_bool *lconf = (struct config_bool *) conf;
+					struct config_bool *lconf = &conf->_bool;

 					modified = (lconf->boot_val != *(lconf->variable));
 				}
@@ -5372,7 +5240,7 @@ get_explain_guc_options(int *num)

 			case PGC_INT:
 				{
-					struct config_int *lconf = (struct config_int *) conf;
+					struct config_int *lconf = &conf->_int;

 					modified = (lconf->boot_val != *(lconf->variable));
 				}
@@ -5380,7 +5248,7 @@ get_explain_guc_options(int *num)

 			case PGC_REAL:
 				{
-					struct config_real *lconf = (struct config_real *) conf;
+					struct config_real *lconf = &conf->_real;

 					modified = (lconf->boot_val != *(lconf->variable));
 				}
@@ -5388,7 +5256,7 @@ get_explain_guc_options(int *num)

 			case PGC_STRING:
 				{
-					struct config_string *lconf = (struct config_string *) conf;
+					struct config_string *lconf = &conf->_string;

 					if (lconf->boot_val == NULL &&
 						*lconf->variable == NULL)
@@ -5403,7 +5271,7 @@ get_explain_guc_options(int *num)

 			case PGC_ENUM:
 				{
-					struct config_enum *lconf = (struct config_enum *) conf;
+					struct config_enum *lconf = &conf->_enum;

 					modified = (lconf->boot_val != *(lconf->variable));
 				}
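As the DefineCustom*Variable() hunks above show, the extension-facing API is unchanged; only the internals now write into the union members. A hypothetical extension's _PG_init(), for reference:

#include "postgres.h"
#include "fmgr.h"
#include "utils/guc.h"

PG_MODULE_MAGIC;

static bool my_feature_enabled = false;

void
_PG_init(void)
{
	DefineCustomBoolVariable("my_ext.feature_enabled",	/* hypothetical GUC name */
							 "Enables my_ext's extra behavior.",
							 NULL,						/* no long description */
							 &my_feature_enabled,
							 false,						/* boot value, also the reset value */
							 PGC_SUSET,
							 0,							/* flags */
							 NULL, NULL, NULL);			/* check/assign/show hooks */
	MarkGUCPrefixReserved("my_ext");
}

With this patch, any stale placeholder that MarkGUCPrefixReserved() evicts is now freed via free_placeholder() rather than leaked.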
@@ -5463,7 +5331,7 @@ GetConfigOptionByName(const char *name, const char **varname, bool missing_ok)
 * The result string is palloc'd.
 */
 char *
-ShowGUCOption(struct config_generic *record, bool use_units)
+ShowGUCOption(const struct config_generic *record, bool use_units)
 {
 	char		buffer[256];
 	const char *val;

@@ -5472,7 +5340,7 @@ ShowGUCOption(struct config_generic *record, bool use_units)
 	{
 		case PGC_BOOL:
 			{
-				struct config_bool *conf = (struct config_bool *) record;
+				const struct config_bool *conf = &record->_bool;

 				if (conf->show_hook)
 					val = conf->show_hook();
@@ -5483,7 +5351,7 @@ ShowGUCOption(struct config_generic *record, bool use_units)

 		case PGC_INT:
 			{
-				struct config_int *conf = (struct config_int *) record;
+				const struct config_int *conf = &record->_int;

 				if (conf->show_hook)
 					val = conf->show_hook();
@@ -5512,7 +5380,7 @@ ShowGUCOption(struct config_generic *record, bool use_units)

 		case PGC_REAL:
 			{
-				struct config_real *conf = (struct config_real *) record;
+				const struct config_real *conf = &record->_real;

 				if (conf->show_hook)
 					val = conf->show_hook();
@@ -5537,7 +5405,7 @@ ShowGUCOption(struct config_generic *record, bool use_units)

 		case PGC_STRING:
 			{
-				struct config_string *conf = (struct config_string *) record;
+				const struct config_string *conf = &record->_string;

 				if (conf->show_hook)
 					val = conf->show_hook();
@@ -5550,12 +5418,12 @@ ShowGUCOption(struct config_generic *record, bool use_units)

 		case PGC_ENUM:
 			{
-				struct config_enum *conf = (struct config_enum *) record;
+				const struct config_enum *conf = &record->_enum;

 				if (conf->show_hook)
 					val = conf->show_hook();
 				else
-					val = config_enum_lookup_by_value(conf, *conf->variable);
+					val = config_enum_lookup_by_value(record, *conf->variable);
 			}
 			break;

@@ -5581,7 +5449,7 @@ ShowGUCOption(struct config_generic *record, bool use_units)
 * variable sourceline, integer
 * variable source, integer
 * variable scontext, integer
-* variable srole, OID
+ * variable srole, OID
 */
 static void
 write_one_nondefault_variable(FILE *fp, struct config_generic *gconf)
@@ -5595,7 +5463,7 @@ write_one_nondefault_variable(FILE *fp, struct config_generic *gconf)
 	{
 		case PGC_BOOL:
 			{
-				struct config_bool *conf = (struct config_bool *) gconf;
+				struct config_bool *conf = &gconf->_bool;

 				if (*conf->variable)
 					fprintf(fp, "true");
@@ -5606,7 +5474,7 @@ write_one_nondefault_variable(FILE *fp, struct config_generic *gconf)

 		case PGC_INT:
 			{
-				struct config_int *conf = (struct config_int *) gconf;
+				struct config_int *conf = &gconf->_int;

 				fprintf(fp, "%d", *conf->variable);
 			}
@@ -5614,7 +5482,7 @@ write_one_nondefault_variable(FILE *fp, struct config_generic *gconf)

 		case PGC_REAL:
 			{
-				struct config_real *conf = (struct config_real *) gconf;
+				struct config_real *conf = &gconf->_real;

 				fprintf(fp, "%.17g", *conf->variable);
 			}
@@ -5622,7 +5490,7 @@ write_one_nondefault_variable(FILE *fp, struct config_generic *gconf)

 		case PGC_STRING:
 			{
-				struct config_string *conf = (struct config_string *) gconf;
+				struct config_string *conf = &gconf->_string;

 				if (*conf->variable)
 					fprintf(fp, "%s", *conf->variable);
@@ -5631,10 +5499,10 @@ write_one_nondefault_variable(FILE *fp, struct config_generic *gconf)

 		case PGC_ENUM:
 			{
-				struct config_enum *conf = (struct config_enum *) gconf;
+				struct config_enum *conf = &gconf->_enum;

 				fprintf(fp, "%s",
-						config_enum_lookup_by_value(conf, *conf->variable));
+						config_enum_lookup_by_value(gconf, *conf->variable));
 			}
 			break;
 	}
@@ -5869,7 +5737,7 @@ estimate_variable_size(struct config_generic *gconf)

 		case PGC_INT:
 			{
-				struct config_int *conf = (struct config_int *) gconf;
+				struct config_int *conf = &gconf->_int;

 				/*
 				 * Instead of getting the exact display length, use max
@@ -5898,7 +5766,7 @@ estimate_variable_size(struct config_generic *gconf)

 		case PGC_STRING:
 			{
-				struct config_string *conf = (struct config_string *) gconf;
+				struct config_string *conf = &gconf->_string;

 				/*
 				 * If the value is NULL, we transmit it as an empty string.
@@ -5914,9 +5782,9 @@ estimate_variable_size(struct config_generic *gconf)

 		case PGC_ENUM:
 			{
-				struct config_enum *conf = (struct config_enum *) gconf;
+				struct config_enum *conf = &gconf->_enum;

-				valsize = strlen(config_enum_lookup_by_value(conf, *conf->variable));
+				valsize = strlen(config_enum_lookup_by_value(gconf, *conf->variable));
 			}
 			break;
 	}
@@ -6035,7 +5903,7 @@ serialize_variable(char **destptr, Size *maxbytes,
 	{
 		case PGC_BOOL:
 			{
-				struct config_bool *conf = (struct config_bool *) gconf;
+				struct config_bool *conf = &gconf->_bool;

 				do_serialize(destptr, maxbytes,
 							 (*conf->variable ? "true" : "false"));
@@ -6044,7 +5912,7 @@ serialize_variable(char **destptr, Size *maxbytes,

 		case PGC_INT:
 			{
-				struct config_int *conf = (struct config_int *) gconf;
+				struct config_int *conf = &gconf->_int;

 				do_serialize(destptr, maxbytes, "%d", *conf->variable);
 			}
@@ -6052,7 +5920,7 @@ serialize_variable(char **destptr, Size *maxbytes,

 		case PGC_REAL:
 			{
-				struct config_real *conf = (struct config_real *) gconf;
+				struct config_real *conf = &gconf->_real;

 				do_serialize(destptr, maxbytes, "%.*e",
 							 REALTYPE_PRECISION, *conf->variable);
@@ -6061,7 +5929,7 @@ serialize_variable(char **destptr, Size *maxbytes,

 		case PGC_STRING:
 			{
-				struct config_string *conf = (struct config_string *) gconf;
+				struct config_string *conf = &gconf->_string;

 				/* NULL becomes empty string, see estimate_variable_size() */
 				do_serialize(destptr, maxbytes, "%s",
@@ -6071,10 +5939,10 @@ serialize_variable(char **destptr, Size *maxbytes,

 		case PGC_ENUM:
 			{
-				struct config_enum *conf = (struct config_enum *) gconf;
+				struct config_enum *conf = &gconf->_enum;

 				do_serialize(destptr, maxbytes, "%s",
-							 config_enum_lookup_by_value(conf, *conf->variable));
+							 config_enum_lookup_by_value(gconf, *conf->variable));
 			}
 			break;
 	}
@@ -6252,49 +6120,23 @@ RestoreGUCState(void *gucstate)
 		switch (gconf->vartype)
 		{
 			case PGC_BOOL:
-				{
-					struct config_bool *conf = (struct config_bool *) gconf;
-
-					if (conf->reset_extra && conf->reset_extra != gconf->extra)
-						guc_free(conf->reset_extra);
-					break;
-				}
 			case PGC_INT:
-				{
-					struct config_int *conf = (struct config_int *) gconf;
-
-					if (conf->reset_extra && conf->reset_extra != gconf->extra)
-						guc_free(conf->reset_extra);
-					break;
-				}
 			case PGC_REAL:
-				{
-					struct config_real *conf = (struct config_real *) gconf;
-
-					if (conf->reset_extra && conf->reset_extra != gconf->extra)
-						guc_free(conf->reset_extra);
-					break;
-				}
+			case PGC_ENUM:
+				/* no need to do anything */
+				break;
 			case PGC_STRING:
 				{
-					struct config_string *conf = (struct config_string *) gconf;
+					struct config_string *conf = &gconf->_string;

 					guc_free(*conf->variable);
 					if (conf->reset_val && conf->reset_val != *conf->variable)
 						guc_free(conf->reset_val);
-					if (conf->reset_extra && conf->reset_extra != gconf->extra)
-						guc_free(conf->reset_extra);
-					break;
-				}
-			case PGC_ENUM:
-				{
-					struct config_enum *conf = (struct config_enum *) gconf;
-
-					if (conf->reset_extra && conf->reset_extra != gconf->extra)
-						guc_free(conf->reset_extra);
 					break;
 				}
 		}
+		if (gconf->reset_extra && gconf->reset_extra != gconf->extra)
+			guc_free(gconf->reset_extra);
 		/* Remove it from any lists it's in. */
 		RemoveGUCFromLists(gconf);
 		/* Now we can reset the struct to PGS_S_DEFAULT state. */
*/ @@ -6363,7 +6205,6 @@ void ParseLongOption(const char *string, char **name, char **value) { size_t equal_pos; - char *cp; Assert(string); Assert(name); @@ -6385,7 +6226,7 @@ ParseLongOption(const char *string, char **name, char **value) *value = NULL; } - for (cp = *name; *cp; cp++) + for (char *cp = *name; *cp; cp++) if (*cp == '-') *cp = '_'; } @@ -6399,8 +6240,6 @@ ParseLongOption(const char *string, char **name, char **value) void TransformGUCArray(ArrayType *array, List **names, List **values) { - int i; - Assert(array != NULL); Assert(ARR_ELEMTYPE(array) == TEXTOID); Assert(ARR_NDIM(array) == 1); @@ -6408,7 +6247,7 @@ TransformGUCArray(ArrayType *array, List **names, List **values) *names = NIL; *values = NIL; - for (i = 1; i <= ARR_DIMS(array)[0]; i++) + for (int i = 1; i <= ARR_DIMS(array)[0]; i++) { Datum d; bool isnull; @@ -6512,7 +6351,6 @@ GUCArrayAdd(ArrayType *array, const char *name, const char *value) { int index; bool isnull; - int i; Assert(ARR_ELEMTYPE(array) == TEXTOID); Assert(ARR_NDIM(array) == 1); @@ -6520,7 +6358,7 @@ GUCArrayAdd(ArrayType *array, const char *name, const char *value) index = ARR_DIMS(array)[0] + 1; /* add after end */ - for (i = 1; i <= ARR_DIMS(array)[0]; i++) + for (int i = 1; i <= ARR_DIMS(array)[0]; i++) { Datum d; char *current; @@ -6568,7 +6406,6 @@ GUCArrayDelete(ArrayType *array, const char *name) { struct config_generic *record; ArrayType *newarray; - int i; int index; Assert(name); @@ -6588,7 +6425,7 @@ GUCArrayDelete(ArrayType *array, const char *name) newarray = NULL; index = 1; - for (i = 1; i <= ARR_DIMS(array)[0]; i++) + for (int i = 1; i <= ARR_DIMS(array)[0]; i++) { Datum d; char *val; @@ -6637,7 +6474,6 @@ ArrayType * GUCArrayReset(ArrayType *array) { ArrayType *newarray; - int i; int index; /* if array is currently null, nothing to do */ @@ -6651,7 +6487,7 @@ GUCArrayReset(ArrayType *array) newarray = NULL; index = 1; - for (i = 1; i <= ARR_DIMS(array)[0]; i++) + for (int i = 1; i <= ARR_DIMS(array)[0]; i++) { Datum d; char *val; @@ -6711,6 +6547,7 @@ validate_option_array_item(const char *name, const char *value, { struct config_generic *gconf; + bool reset_custom; /* * There are three cases to consider: @@ -6729,16 +6566,21 @@ validate_option_array_item(const char *name, const char *value, * it's assumed to be fully validated.) * * name is not known and can't be created as a placeholder. Throw error, - * unless skipIfNoPermissions is true, in which case return false. + * unless skipIfNoPermissions or reset_custom is true. If reset_custom is + * true, this is a RESET or RESET ALL operation for an unknown custom GUC + * with a reserved prefix, in which case we want to fall through to the + * placeholder case described in the preceding paragraph (else there'd be + * no way for users to remove them). Otherwise, return false. 
*/ - gconf = find_option(name, true, skipIfNoPermissions, ERROR); - if (!gconf) + reset_custom = (!value && valid_custom_variable_name(name)); + gconf = find_option(name, true, skipIfNoPermissions || reset_custom, ERROR); + if (!gconf && !reset_custom) { /* not known, failed to make a placeholder */ return false; } - if (gconf->flags & GUC_CUSTOM_PLACEHOLDER) + if (!gconf || gconf->flags & GUC_CUSTOM_PLACEHOLDER) { /* * We cannot do any meaningful check on the value, so only permissions @@ -6796,11 +6638,11 @@ GUC_check_errcode(int sqlerrcode) */ static bool -call_bool_check_hook(struct config_bool *conf, bool *newval, void **extra, +call_bool_check_hook(const struct config_generic *conf, bool *newval, void **extra, GucSource source, int elevel) { /* Quick success if no hook */ - if (!conf->check_hook) + if (!conf->_bool.check_hook) return true; /* Reset variables that might be set by hook */ @@ -6809,19 +6651,19 @@ call_bool_check_hook(struct config_bool *conf, bool *newval, void **extra, GUC_check_errdetail_string = NULL; GUC_check_errhint_string = NULL; - if (!conf->check_hook(newval, extra, source)) + if (!conf->_bool.check_hook(newval, extra, source)) { ereport(elevel, (errcode(GUC_check_errcode_value), GUC_check_errmsg_string ? errmsg_internal("%s", GUC_check_errmsg_string) : errmsg("invalid value for parameter \"%s\": %d", - conf->gen.name, (int) *newval), + conf->name, (int) *newval), GUC_check_errdetail_string ? errdetail_internal("%s", GUC_check_errdetail_string) : 0, GUC_check_errhint_string ? errhint("%s", GUC_check_errhint_string) : 0)); - /* Flush any strings created in ErrorContext */ + /* Flush strings created in ErrorContext (ereport might not have) */ FlushErrorState(); return false; } @@ -6830,11 +6672,11 @@ call_bool_check_hook(struct config_bool *conf, bool *newval, void **extra, } static bool -call_int_check_hook(struct config_int *conf, int *newval, void **extra, +call_int_check_hook(const struct config_generic *conf, int *newval, void **extra, GucSource source, int elevel) { /* Quick success if no hook */ - if (!conf->check_hook) + if (!conf->_int.check_hook) return true; /* Reset variables that might be set by hook */ @@ -6843,19 +6685,19 @@ call_int_check_hook(struct config_int *conf, int *newval, void **extra, GUC_check_errdetail_string = NULL; GUC_check_errhint_string = NULL; - if (!conf->check_hook(newval, extra, source)) + if (!conf->_int.check_hook(newval, extra, source)) { ereport(elevel, (errcode(GUC_check_errcode_value), GUC_check_errmsg_string ? errmsg_internal("%s", GUC_check_errmsg_string) : errmsg("invalid value for parameter \"%s\": %d", - conf->gen.name, *newval), + conf->name, *newval), GUC_check_errdetail_string ? errdetail_internal("%s", GUC_check_errdetail_string) : 0, GUC_check_errhint_string ? 
errhint("%s", GUC_check_errhint_string) : 0)); - /* Flush any strings created in ErrorContext */ + /* Flush strings created in ErrorContext (ereport might not have) */ FlushErrorState(); return false; } @@ -6864,11 +6706,11 @@ call_int_check_hook(struct config_int *conf, int *newval, void **extra, } static bool -call_real_check_hook(struct config_real *conf, double *newval, void **extra, +call_real_check_hook(const struct config_generic *conf, double *newval, void **extra, GucSource source, int elevel) { /* Quick success if no hook */ - if (!conf->check_hook) + if (!conf->_real.check_hook) return true; /* Reset variables that might be set by hook */ @@ -6877,19 +6719,19 @@ call_real_check_hook(struct config_real *conf, double *newval, void **extra, GUC_check_errdetail_string = NULL; GUC_check_errhint_string = NULL; - if (!conf->check_hook(newval, extra, source)) + if (!conf->_real.check_hook(newval, extra, source)) { ereport(elevel, (errcode(GUC_check_errcode_value), GUC_check_errmsg_string ? errmsg_internal("%s", GUC_check_errmsg_string) : errmsg("invalid value for parameter \"%s\": %g", - conf->gen.name, *newval), + conf->name, *newval), GUC_check_errdetail_string ? errdetail_internal("%s", GUC_check_errdetail_string) : 0, GUC_check_errhint_string ? errhint("%s", GUC_check_errhint_string) : 0)); - /* Flush any strings created in ErrorContext */ + /* Flush strings created in ErrorContext (ereport might not have) */ FlushErrorState(); return false; } @@ -6898,13 +6740,13 @@ call_real_check_hook(struct config_real *conf, double *newval, void **extra, } static bool -call_string_check_hook(struct config_string *conf, char **newval, void **extra, +call_string_check_hook(const struct config_generic *conf, char **newval, void **extra, GucSource source, int elevel) { volatile bool result = true; /* Quick success if no hook */ - if (!conf->check_hook) + if (!conf->_string.check_hook) return true; /* @@ -6920,19 +6762,19 @@ call_string_check_hook(struct config_string *conf, char **newval, void **extra, GUC_check_errdetail_string = NULL; GUC_check_errhint_string = NULL; - if (!conf->check_hook(newval, extra, source)) + if (!conf->_string.check_hook(newval, extra, source)) { ereport(elevel, (errcode(GUC_check_errcode_value), GUC_check_errmsg_string ? errmsg_internal("%s", GUC_check_errmsg_string) : errmsg("invalid value for parameter \"%s\": \"%s\"", - conf->gen.name, *newval ? *newval : ""), + conf->name, *newval ? *newval : ""), GUC_check_errdetail_string ? errdetail_internal("%s", GUC_check_errdetail_string) : 0, GUC_check_errhint_string ? 
errhint("%s", GUC_check_errhint_string) : 0)); - /* Flush any strings created in ErrorContext */ + /* Flush strings created in ErrorContext (ereport might not have) */ FlushErrorState(); result = false; } @@ -6948,11 +6790,11 @@ call_string_check_hook(struct config_string *conf, char **newval, void **extra, } static bool -call_enum_check_hook(struct config_enum *conf, int *newval, void **extra, +call_enum_check_hook(const struct config_generic *conf, int *newval, void **extra, GucSource source, int elevel) { /* Quick success if no hook */ - if (!conf->check_hook) + if (!conf->_enum.check_hook) return true; /* Reset variables that might be set by hook */ @@ -6961,20 +6803,20 @@ call_enum_check_hook(struct config_enum *conf, int *newval, void **extra, GUC_check_errdetail_string = NULL; GUC_check_errhint_string = NULL; - if (!conf->check_hook(newval, extra, source)) + if (!conf->_enum.check_hook(newval, extra, source)) { ereport(elevel, (errcode(GUC_check_errcode_value), GUC_check_errmsg_string ? errmsg_internal("%s", GUC_check_errmsg_string) : errmsg("invalid value for parameter \"%s\": \"%s\"", - conf->gen.name, + conf->name, config_enum_lookup_by_value(conf, *newval)), GUC_check_errdetail_string ? errdetail_internal("%s", GUC_check_errdetail_string) : 0, GUC_check_errhint_string ? errhint("%s", GUC_check_errhint_string) : 0)); - /* Flush any strings created in ErrorContext */ + /* Flush strings created in ErrorContext (ereport might not have) */ FlushErrorState(); return false; } diff --git a/src/backend/utils/misc/guc_funcs.c b/src/backend/utils/misc/guc_funcs.c index b9e26982abd90..9dbc5d3aeb9c5 100644 --- a/src/backend/utils/misc/guc_funcs.c +++ b/src/backend/utils/misc/guc_funcs.c @@ -210,12 +210,29 @@ flatten_set_variable_args(const char *name, List *args) else flags = 0; - /* Complain if list input and non-list variable */ - if ((flags & GUC_LIST_INPUT) == 0 && - list_length(args) != 1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("SET %s takes only one argument", name))); + /* + * Handle special cases for list input. + */ + if (flags & GUC_LIST_INPUT) + { + /* NULL represents an empty list. */ + if (list_length(args) == 1) + { + Node *arg = (Node *) linitial(args); + + if (IsA(arg, A_Const) && + ((A_Const *) arg)->isnull) + return pstrdup(""); + } + } + else + { + /* Complain if list input and non-list variable. */ + if (list_length(args) != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("SET %s takes only one argument", name))); + } initStringInfo(&buf); @@ -246,6 +263,12 @@ flatten_set_variable_args(const char *name, List *args) elog(ERROR, "unrecognized node type: %d", (int) nodeTag(arg)); con = (A_Const *) arg; + /* Complain if NULL is used with a non-list variable. */ + if (con->isnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("NULL is an invalid value for %s", name))); + switch (nodeTag(&con->val)) { case T_Integer: @@ -269,6 +292,9 @@ flatten_set_variable_args(const char *name, List *args) Datum interval; char *intervalout; + /* gram.y ensures this is only reachable for TIME ZONE */ + Assert(!(flags & GUC_LIST_QUOTE)); + typenameTypeIdAndMod(NULL, typeName, &typoid, &typmod); Assert(typoid == INTERVALOID); @@ -578,7 +604,7 @@ pg_settings_get_flags(PG_FUNCTION_ARGS) * Return whether or not the GUC variable is visible to the current user. 
*/ bool -ConfigOptionIsVisible(struct config_generic *conf) +ConfigOptionIsVisible(const struct config_generic *conf) { if ((conf->flags & GUC_SUPERUSER_ONLY) && !has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_SETTINGS)) @@ -591,7 +617,7 @@ ConfigOptionIsVisible(struct config_generic *conf) * Extract fields to show in pg_settings for given variable. */ static void -GetConfigOptionValues(struct config_generic *conf, const char **values) +GetConfigOptionValues(const struct config_generic *conf, const char **values) { char buffer[256]; @@ -629,7 +655,7 @@ GetConfigOptionValues(struct config_generic *conf, const char **values) { case PGC_BOOL: { - struct config_bool *lconf = (struct config_bool *) conf; + const struct config_bool *lconf = &conf->_bool; /* min_val */ values[9] = NULL; @@ -650,7 +676,7 @@ GetConfigOptionValues(struct config_generic *conf, const char **values) case PGC_INT: { - struct config_int *lconf = (struct config_int *) conf; + const struct config_int *lconf = &conf->_int; /* min_val */ snprintf(buffer, sizeof(buffer), "%d", lconf->min); @@ -675,7 +701,7 @@ GetConfigOptionValues(struct config_generic *conf, const char **values) case PGC_REAL: { - struct config_real *lconf = (struct config_real *) conf; + const struct config_real *lconf = &conf->_real; /* min_val */ snprintf(buffer, sizeof(buffer), "%g", lconf->min); @@ -700,7 +726,7 @@ GetConfigOptionValues(struct config_generic *conf, const char **values) case PGC_STRING: { - struct config_string *lconf = (struct config_string *) conf; + const struct config_string *lconf = &conf->_string; /* min_val */ values[9] = NULL; @@ -727,7 +753,7 @@ GetConfigOptionValues(struct config_generic *conf, const char **values) case PGC_ENUM: { - struct config_enum *lconf = (struct config_enum *) conf; + const struct config_enum *lconf = &conf->_enum; /* min_val */ values[9] = NULL; @@ -741,15 +767,15 @@ GetConfigOptionValues(struct config_generic *conf, const char **values) * NOTE! enumvals with double quotes in them are not * supported! */ - values[11] = config_enum_get_options((struct config_enum *) conf, + values[11] = config_enum_get_options(lconf, "{\"", "\"}", "\",\""); /* boot_val */ - values[12] = pstrdup(config_enum_lookup_by_value(lconf, + values[12] = pstrdup(config_enum_lookup_by_value(conf, lconf->boot_val)); /* reset_val */ - values[13] = pstrdup(config_enum_lookup_by_value(lconf, + values[13] = pstrdup(config_enum_lookup_by_value(conf, lconf->reset_val)); } break; @@ -986,7 +1012,6 @@ show_all_file_settings(PG_FUNCTION_ARGS) #define NUM_PG_FILE_SETTINGS_ATTS 7 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; ConfigVariable *conf; - int seqno; /* Scan the config files using current context as workspace */ conf = ProcessConfigFileInternal(PGC_SIGHUP, false, DEBUG3); @@ -995,7 +1020,7 @@ show_all_file_settings(PG_FUNCTION_ARGS) InitMaterializedSRF(fcinfo, 0); /* Process the results and create a tuplestore */ - for (seqno = 1; conf != NULL; conf = conf->next, seqno++) + for (int seqno = 1; conf != NULL; conf = conf->next, seqno++) { Datum values[NUM_PG_FILE_SETTINGS_ATTS]; bool nulls[NUM_PG_FILE_SETTINGS_ATTS]; diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat new file mode 100644 index 0000000000000..f9e37f8b7c2c3 --- /dev/null +++ b/src/backend/utils/misc/guc_parameters.dat @@ -0,0 +1,3536 @@ +#---------------------------------------------------------------------- +# +# Contents of GUC tables. +# +# See src/backend/utils/misc/README for design notes. 
+# +# Portions Copyright (c) 2000-2025, PostgreSQL Global Development Group +# +# src/backend/utils/misc/guc_parameters.dat +# +#---------------------------------------------------------------------- + +[ + +# TO ADD AN OPTION: +# +# 1. Declare a global variable of type bool, int, double, or char* and +# make use of it. +# +# 2. Decide at what times it's safe to set the option. See guc.h for +# details. +# +# 3. Decide on a name, a default value, upper and lower bounds (if +# applicable), etc. +# +# 4. Add a record below (in alphabetical order). +# +# 5. Add it to src/backend/utils/misc/postgresql.conf.sample, if +# appropriate. +# +# 6. Don't forget to document the option (at least in config.sgml). +# +# 7. If it's a new GUC_LIST_QUOTE option, you must add it to +# variable_is_guc_list_quote() in src/bin/pg_dump/dumputils.c. + +# This setting itself cannot be set by ALTER SYSTEM to avoid an +# operator turning this setting off by using ALTER SYSTEM, without a +# way to turn it back on. +{ name => 'allow_alter_system', type => 'bool', context => 'PGC_SIGHUP', group => 'COMPAT_OPTIONS_OTHER', + short_desc => 'Allows running the ALTER SYSTEM command.', + long_desc => 'Can be set to off for environments where global configuration changes should be made using a different method.', + flags => 'GUC_DISALLOW_IN_AUTO_FILE', + variable => 'AllowAlterSystem', + boot_val => 'true', +}, + +{ name => 'allow_in_place_tablespaces', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Allows tablespaces directly inside pg_tblspc, for testing.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'allow_in_place_tablespaces', + boot_val => 'false', +}, + +{ name => 'allow_system_table_mods', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Allows modifications of the structure of system tables.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'allowSystemTableMods', + boot_val => 'false', +}, + +{ name => 'application_name', type => 'string', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Sets the application name to be reported in statistics and logs.', + flags => 'GUC_IS_NAME | GUC_REPORT | GUC_NOT_IN_SAMPLE', + variable => 'application_name', + boot_val => '""', + check_hook => 'check_application_name', + assign_hook => 'assign_application_name', +}, + +{ name => 'archive_cleanup_command', type => 'string', context => 'PGC_SIGHUP', group => 'WAL_ARCHIVE_RECOVERY', + short_desc => 'Sets the shell command that will be executed at every restart point.', + variable => 'archiveCleanupCommand', + boot_val => '""', +}, + + +{ name => 'archive_command', type => 'string', context => 'PGC_SIGHUP', group => 'WAL_ARCHIVING', + short_desc => 'Sets the shell command that will be called to archive a WAL file.', + long_desc => 'An empty string means use "archive_library".', + variable => 'XLogArchiveCommand', + boot_val => '""', + show_hook => 'show_archive_command', +}, + +{ name => 'archive_library', type => 'string', context => 'PGC_SIGHUP', group => 'WAL_ARCHIVING', + short_desc => 'Sets the library that will be called to archive a WAL file.', + long_desc => 'An empty string means use "archive_command".', + variable => 'XLogArchiveLibrary', + boot_val => '""', +}, + +{ name => 'archive_mode', type => 'enum', context => 'PGC_POSTMASTER', group => 'WAL_ARCHIVING', + short_desc => 'Allows archiving of WAL files using "archive_command".', + variable => 'XLogArchiveMode', + boot_val => 'ARCHIVE_MODE_OFF', + options => 
'archive_mode_options', +}, + +{ name => 'archive_timeout', type => 'int', context => 'PGC_SIGHUP', group => 'WAL_ARCHIVING', + short_desc => 'Sets the amount of time to wait before forcing a switch to the next WAL file.', + long_desc => '0 disables the timeout.', + flags => 'GUC_UNIT_S', + variable => 'XLogArchiveTimeout', + boot_val => '0', + min => '0', + max => 'INT_MAX / 2', +}, + +{ name => 'array_nulls', type => 'bool', context => 'PGC_USERSET', group => 'COMPAT_OPTIONS_PREVIOUS', + short_desc => 'Enables input of NULL elements in arrays.', + long_desc => 'When turned on, unquoted NULL in an array input value means a null value; otherwise it is taken literally.', + variable => 'Array_nulls', + boot_val => 'true', +}, + +{ name => 'authentication_timeout', type => 'int', context => 'PGC_SIGHUP', group => 'CONN_AUTH_AUTH', + short_desc => 'Sets the maximum allowed time to complete client authentication.', + flags => 'GUC_UNIT_S', + variable => 'AuthenticationTimeout', + boot_val => '60', + min => '1', + max => '600', +}, + +{ name => 'autovacuum', type => 'bool', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Starts the autovacuum subprocess.', + variable => 'autovacuum_start_daemon', + boot_val => 'true', +}, + +{ name => 'autovacuum_analyze_scale_factor', type => 'real', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Number of tuple inserts, updates, or deletes prior to analyze as a fraction of reltuples.', + variable => 'autovacuum_anl_scale', + boot_val => '0.1', + min => '0.0', + max => '100.0', +}, + +{ name => 'autovacuum_analyze_threshold', type => 'int', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Minimum number of tuple inserts, updates, or deletes prior to analyze.', + variable => 'autovacuum_anl_thresh', + boot_val => '50', + min => '0', + max => 'INT_MAX', +}, + +# see varsup.c for why this is PGC_POSTMASTER not PGC_SIGHUP +# see vacuum_failsafe_age if you change the upper-limit value. 
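
(For comparison with the records above: each of these replaces a hand-maintained initializer in guc_tables.c. As a sketch only, assuming the historical guc_tables.c layout for a struct config_int entry of generic part, variable pointer, boot/min/max values, then hooks, the archive_timeout record above was previously spelled roughly as:

    {
        {"archive_timeout", PGC_SIGHUP, WAL_ARCHIVING,
            gettext_noop("Sets the amount of time to wait before forcing a "
                         "switch to the next WAL file."),
            gettext_noop("0 disables the timeout."),
            GUC_UNIT_S
        },
        &XLogArchiveTimeout,
        0,
        0, INT_MAX / 2,
        NULL, NULL, NULL
    },

The .dat keys map one-to-one onto those positional fields, which is what makes the records mechanically generatable and easier to grep.)
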
+{ name => 'autovacuum_freeze_max_age', type => 'int', context => 'PGC_POSTMASTER', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Age at which to autovacuum a table to prevent transaction ID wraparound.', + variable => 'autovacuum_freeze_max_age', + boot_val => '200000000', + min => '100000', + max => '2000000000', +}, + +{ name => 'autovacuum_max_workers', type => 'int', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Sets the maximum number of simultaneously running autovacuum worker processes.', + variable => 'autovacuum_max_workers', + boot_val => '3', + min => '1', + max => 'MAX_BACKENDS', +}, + +# see multixact.c for why this is PGC_POSTMASTER not PGC_SIGHUP +{ name => 'autovacuum_multixact_freeze_max_age', type => 'int', context => 'PGC_POSTMASTER', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Multixact age at which to autovacuum a table to prevent multixact wraparound.', + variable => 'autovacuum_multixact_freeze_max_age', + boot_val => '400000000', + min => '10000', + max => '2000000000', +}, + +{ name => 'autovacuum_naptime', type => 'int', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Time to sleep between autovacuum runs.', + flags => 'GUC_UNIT_S', + variable => 'autovacuum_naptime', + boot_val => '60', + min => '1', + max => 'INT_MAX / 1000', +}, + +{ name => 'autovacuum_vacuum_cost_delay', type => 'real', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Vacuum cost delay in milliseconds, for autovacuum.', + long_desc => '-1 means use "vacuum_cost_delay".', + flags => 'GUC_UNIT_MS', + variable => 'autovacuum_vac_cost_delay', + boot_val => '2', + min => '-1', + max => '100', +}, + +{ name => 'autovacuum_vacuum_cost_limit', type => 'int', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Vacuum cost amount available before napping, for autovacuum.', + long_desc => '-1 means use "vacuum_cost_limit".', + variable => 'autovacuum_vac_cost_limit', + boot_val => '-1', + min => '-1', + max => '10000', +}, + +{ name => 'autovacuum_vacuum_insert_scale_factor', type => 'real', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Number of tuple inserts prior to vacuum as a fraction of reltuples.', + variable => 'autovacuum_vac_ins_scale', + boot_val => '0.2', + min => '0.0', + max => '100.0', +}, + +{ name => 'autovacuum_vacuum_insert_threshold', type => 'int', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Minimum number of tuple inserts prior to vacuum.', + long_desc => '-1 disables insert vacuums.', + variable => 'autovacuum_vac_ins_thresh', + boot_val => '1000', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'autovacuum_vacuum_max_threshold', type => 'int', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Maximum number of tuple updates or deletes prior to vacuum.', + long_desc => '-1 disables the maximum threshold.', + variable => 'autovacuum_vac_max_thresh', + boot_val => '100000000', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'autovacuum_vacuum_scale_factor', type => 'real', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Number of tuple updates or deletes prior to vacuum as a fraction of reltuples.', + variable => 'autovacuum_vac_scale', + boot_val => '0.2', + min => '0.0', + max => '100.0', +}, + +{ name => 'autovacuum_vacuum_threshold', type => 'int', context => 'PGC_SIGHUP', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Minimum number of tuple updates or deletes prior to 
vacuum.', + variable => 'autovacuum_vac_thresh', + boot_val => '50', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'autovacuum_work_mem', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_MEM', + short_desc => 'Sets the maximum memory to be used by each autovacuum worker process.', + long_desc => '-1 means use "maintenance_work_mem".', + flags => 'GUC_UNIT_KB', + variable => 'autovacuum_work_mem', + boot_val => '-1', + min => '-1', + max => 'MAX_KILOBYTES', + check_hook => 'check_autovacuum_work_mem', +}, + +# see max_connections +{ name => 'autovacuum_worker_slots', type => 'int', context => 'PGC_POSTMASTER', group => 'VACUUM_AUTOVACUUM', + short_desc => 'Sets the number of backend slots to allocate for autovacuum workers.', + variable => 'autovacuum_worker_slots', + boot_val => '16', + min => '1', + max => 'MAX_BACKENDS', +}, + +{ name => 'backend_flush_after', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_IO', + short_desc => 'Number of pages after which previously performed writes are flushed to disk.', + long_desc => '0 disables forced writeback.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'backend_flush_after', + boot_val => 'DEFAULT_BACKEND_FLUSH_AFTER', + min => '0', + max => 'WRITEBACK_MAX_PENDING_FLUSHES', +}, + +{ name => 'backslash_quote', type => 'enum', context => 'PGC_USERSET', group => 'COMPAT_OPTIONS_PREVIOUS', + short_desc => 'Sets whether "\\\\\'" is allowed in string literals.', + variable => 'backslash_quote', + boot_val => 'BACKSLASH_QUOTE_SAFE_ENCODING', + options => 'backslash_quote_options', +}, + +{ name => 'backtrace_functions', type => 'string', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Log backtrace for errors in these functions.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'backtrace_functions', + boot_val => '""', + check_hook => 'check_backtrace_functions', + assign_hook => 'assign_backtrace_functions', +}, + +{ name => 'bgwriter_delay', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER', + short_desc => 'Background writer sleep time between rounds.', + flags => 'GUC_UNIT_MS', + variable => 'BgWriterDelay', + boot_val => '200', + min => '10', + max => '10000', +}, + +{ name => 'bgwriter_flush_after', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER', + short_desc => 'Number of pages after which previously performed writes are flushed to disk.', + long_desc => '0 disables forced writeback.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'bgwriter_flush_after', + boot_val => 'DEFAULT_BGWRITER_FLUSH_AFTER', + min => '0', + max => 'WRITEBACK_MAX_PENDING_FLUSHES', +}, + +# Same upper limit as shared_buffers +{ name => 'bgwriter_lru_maxpages', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER', + short_desc => 'Background writer maximum number of LRU pages to flush per round.', + long_desc => '0 disables background writing.', + variable => 'bgwriter_lru_maxpages', + boot_val => '100', + min => '0', + max => 'INT_MAX / 2', +}, + +{ name => 'bgwriter_lru_multiplier', type => 'real', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER', + short_desc => 'Multiple of the average buffer usage to free per round.', + variable => 'bgwriter_lru_multiplier', + boot_val => '2.0', + min => '0.0', + max => '10.0', +}, + +{ name => 'block_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the size of a disk block.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'block_size', + boot_val => 
'BLCKSZ', + min => 'BLCKSZ', + max => 'BLCKSZ', +}, + +{ name => 'bonjour', type => 'bool', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Enables advertising the server via Bonjour.', + variable => 'enable_bonjour', + boot_val => 'false', + check_hook => 'check_bonjour', +}, + +{ name => 'bonjour_name', type => 'string', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Sets the Bonjour service name.', + long_desc => 'An empty string means use the computer name.', + variable => 'bonjour_name', + boot_val => '""', +}, + +{ name => 'bytea_output', type => 'enum', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the output format for bytea.', + variable => 'bytea_output', + boot_val => 'BYTEA_OUTPUT_HEX', + options => 'bytea_output_options', +}, + +{ name => 'check_function_bodies', type => 'bool', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Check routine bodies during CREATE FUNCTION and CREATE PROCEDURE.', + variable => 'check_function_bodies', + boot_val => 'true', +}, + +{ name => 'checkpoint_completion_target', type => 'real', context => 'PGC_SIGHUP', group => 'WAL_CHECKPOINTS', + short_desc => 'Time spent flushing dirty buffers during checkpoint, as fraction of checkpoint interval.', + variable => 'CheckPointCompletionTarget', + boot_val => '0.9', + min => '0.0', + max => '1.0', + assign_hook => 'assign_checkpoint_completion_target', +}, + +{ name => 'checkpoint_flush_after', type => 'int', context => 'PGC_SIGHUP', group => 'WAL_CHECKPOINTS', + short_desc => 'Number of pages after which previously performed writes are flushed to disk.', + long_desc => '0 disables forced writeback.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'checkpoint_flush_after', + boot_val => 'DEFAULT_CHECKPOINT_FLUSH_AFTER', + min => '0', + max => 'WRITEBACK_MAX_PENDING_FLUSHES', +}, + +{ name => 'checkpoint_timeout', type => 'int', context => 'PGC_SIGHUP', group => 'WAL_CHECKPOINTS', + short_desc => 'Sets the maximum time between automatic WAL checkpoints.', + flags => 'GUC_UNIT_S', + variable => 'CheckPointTimeout', + boot_val => '300', + min => '30', + max => '86400', +}, + +{ name => 'checkpoint_warning', type => 'int', context => 'PGC_SIGHUP', group => 'WAL_CHECKPOINTS', + short_desc => 'Sets the maximum time before warning if checkpoints triggered by WAL volume happen too frequently.', + long_desc => 'Write a message to the server log if checkpoints caused by the filling of WAL segment files happen more frequently than this amount of time. 
0 disables the warning.', + flags => 'GUC_UNIT_S', + variable => 'CheckPointWarning', + boot_val => '30', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'client_connection_check_interval', type => 'int', context => 'PGC_USERSET', group => 'CONN_AUTH_TCP', + short_desc => 'Sets the time interval between checks for disconnection while running queries.', + long_desc => '0 disables connection checks.', + flags => 'GUC_UNIT_MS', + variable => 'client_connection_check_interval', + boot_val => '0', + min => '0', + max => 'INT_MAX', + check_hook => 'check_client_connection_check_interval', +}, + +{ name => 'client_encoding', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets the client\'s character set encoding.', + flags => 'GUC_IS_NAME | GUC_REPORT', + variable => 'client_encoding_string', + boot_val => '"SQL_ASCII"', + check_hook => 'check_client_encoding', + assign_hook => 'assign_client_encoding', +}, + +{ name => 'client_min_messages', type => 'enum', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the message levels that are sent to the client.', + long_desc => 'Each level includes all the levels that follow it. The later the level, the fewer messages are sent.', + variable => 'client_min_messages', + boot_val => 'NOTICE', + options => 'client_message_level_options', +}, + +{ name => 'cluster_name', type => 'string', context => 'PGC_POSTMASTER', group => 'PROCESS_TITLE', + short_desc => 'Sets the name of the cluster, which is included in the process title.', + flags => 'GUC_IS_NAME', + variable => 'cluster_name', + boot_val => '""', + check_hook => 'check_cluster_name', +}, + +# we have no microseconds designation, so can't supply units here +{ name => 'commit_delay', type => 'int', context => 'PGC_SUSET', group => 'WAL_SETTINGS', + short_desc => 'Sets the delay in microseconds between transaction commit and flushing WAL to disk.', + variable => 'CommitDelay', + boot_val => '0', + min => '0', + max => '100000', +}, + +{ name => 'commit_siblings', type => 'int', context => 'PGC_USERSET', group => 'WAL_SETTINGS', + short_desc => 'Sets the minimum number of concurrent open transactions required before performing "commit_delay".', + variable => 'CommitSiblings', + boot_val => '5', + min => '0', + max => '1000', +}, + +{ name => 'commit_timestamp_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the size of the dedicated buffer pool used for the commit timestamp cache.', + long_desc => '0 means use a fraction of "shared_buffers".', + flags => 'GUC_UNIT_BLOCKS', + variable => 'commit_timestamp_buffers', + boot_val => '0', + min => '0', + max => 'SLRU_MAX_ALLOWED_BUFFERS', + check_hook => 'check_commit_ts_buffers', +}, + +{ name => 'compute_query_id', type => 'enum', context => 'PGC_SUSET', group => 'STATS_MONITORING', + short_desc => 'Enables in-core computation of query identifiers.', + variable => 'compute_query_id', + boot_val => 'COMPUTE_QUERY_ID_AUTO', + options => 'compute_query_id_options', +}, + +{ name => 'config_file', type => 'string', context => 'PGC_POSTMASTER', group => 'FILE_LOCATIONS', + short_desc => 'Sets the server\'s main configuration file.', + flags => 'GUC_DISALLOW_IN_FILE | GUC_SUPERUSER_ONLY', + variable => 'ConfigFileName', + boot_val => 'NULL', +}, + +{ name => 'constraint_exclusion', type => 'enum', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', + short_desc => 'Enables the planner to use constraints to optimize queries.', + 
long_desc => 'Table scans will be skipped if their constraints guarantee that no rows match the query.', + flags => 'GUC_EXPLAIN', + variable => 'constraint_exclusion', + boot_val => 'CONSTRAINT_EXCLUSION_PARTITION', + options => 'constraint_exclusion_options', +}, + +{ name => 'cpu_index_tuple_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the planner\'s estimate of the cost of processing each index entry during an index scan.', + flags => 'GUC_EXPLAIN', + variable => 'cpu_index_tuple_cost', + boot_val => 'DEFAULT_CPU_INDEX_TUPLE_COST', + min => '0', + max => 'DBL_MAX', +}, + +{ name => 'cpu_operator_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the planner\'s estimate of the cost of processing each operator or function call.', + flags => 'GUC_EXPLAIN', + variable => 'cpu_operator_cost', + boot_val => 'DEFAULT_CPU_OPERATOR_COST', + min => '0', + max => 'DBL_MAX', +}, + +{ name => 'cpu_tuple_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the planner\'s estimate of the cost of processing each tuple (row).', + flags => 'GUC_EXPLAIN', + variable => 'cpu_tuple_cost', + boot_val => 'DEFAULT_CPU_TUPLE_COST', + min => '0', + max => 'DBL_MAX', +}, + +{ name => 'createrole_self_grant', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets whether a CREATEROLE user automatically grants the role to themselves, and with which options.', + long_desc => 'An empty string disables automatic self grants.', + flags => 'GUC_LIST_INPUT', + variable => 'createrole_self_grant', + boot_val => '""', + check_hook => 'check_createrole_self_grant', + assign_hook => 'assign_createrole_self_grant', +}, + +{ name => 'cursor_tuple_fraction', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', + short_desc => 'Sets the planner\'s estimate of the fraction of a cursor\'s rows that will be retrieved.', + flags => 'GUC_EXPLAIN', + variable => 'cursor_tuple_fraction', + boot_val => 'DEFAULT_CURSOR_TUPLE_FRACTION', + min => '0.0', + max => '1.0', +}, + +{ name => 'data_checksums', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows whether data checksums are turned on for this cluster.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED', + variable => 'data_checksums', + boot_val => 'false', +}, + +# Can't be set by ALTER SYSTEM as it can lead to recursive definition +# of data_directory. +{ name => 'data_directory', type => 'string', context => 'PGC_POSTMASTER', group => 'FILE_LOCATIONS', + short_desc => 'Sets the server\'s data directory.', + flags => 'GUC_SUPERUSER_ONLY | GUC_DISALLOW_IN_AUTO_FILE', + variable => 'data_directory', + boot_val => 'NULL', +}, + +{ name => 'data_directory_mode', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the mode of the data directory.', + long_desc => 'The parameter value is a numeric mode specification in the form accepted by the chmod and umask system calls. 
(To use the customary octal format the number must start with a 0 (zero).)', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED', + variable => 'data_directory_mode', + boot_val => '0700', + min => '0000', + max => '0777', + show_hook => 'show_data_directory_mode', +}, + +{ name => 'data_sync_retry', type => 'bool', context => 'PGC_POSTMASTER', group => 'ERROR_HANDLING_OPTIONS', + short_desc => 'Whether to continue running after a failure to sync data files.', + variable => 'data_sync_retry', + boot_val => 'false', +}, + +{ name => 'DateStyle', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets the display format for date and time values.', + long_desc => 'Also controls interpretation of ambiguous date inputs.', + flags => 'GUC_LIST_INPUT | GUC_REPORT', + variable => 'datestyle_string', + boot_val => '"ISO, MDY"', + check_hook => 'check_datestyle', + assign_hook => 'assign_datestyle', +}, + +# This is PGC_SUSET to prevent hiding from log_lock_waits. +{ name => 'deadlock_timeout', type => 'int', context => 'PGC_SUSET', group => 'LOCK_MANAGEMENT', + short_desc => 'Sets the time to wait on a lock before checking for deadlock.', + flags => 'GUC_UNIT_MS', + variable => 'DeadlockTimeout', + boot_val => '1000', + min => '1', + max => 'INT_MAX', +}, + +{ name => 'debug_assertions', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows whether the running server has assertion checks enabled.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'assert_enabled', + boot_val => 'DEFAULT_ASSERT_ENABLED', +}, + +{ name => 'debug_copy_parse_plan_trees', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Set this to force all parse and plan trees to be passed through copyObject(), to facilitate catching errors and omissions in copyObject().', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Debug_copy_parse_plan_trees', + boot_val => 'DEFAULT_DEBUG_COPY_PARSE_PLAN_TREES', + ifdef => 'DEBUG_NODE_TESTS_ENABLED', +}, + +{ name => 'debug_deadlocks', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Dumps information about all current locks when a deadlock timeout occurs.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Debug_deadlocks', + boot_val => 'false', + ifdef => 'LOCK_DEBUG', +}, + +{ name => 'debug_discard_caches', type => 'int', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Aggressively flush system caches for debugging purposes.', + long_desc => '0 means use normal caching behavior.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'debug_discard_caches', + boot_val => 'DEFAULT_DEBUG_DISCARD_CACHES', + min => 'MIN_DEBUG_DISCARD_CACHES', + max => 'MAX_DEBUG_DISCARD_CACHES', +}, + +{ name => 'debug_exec_backend', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows whether the running server is built with EXEC_BACKEND enabled.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'exec_backend_enabled', + boot_val => 'EXEC_BACKEND_ENABLED', +}, + +{ name => 'debug_io_direct', type => 'string', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', + short_desc => 'Use direct I/O for file access.', + long_desc => 'An empty string disables direct I/O.', + flags => 'GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE', + variable => 'debug_io_direct_string', + boot_val => '""', + check_hook => 'check_debug_io_direct', + assign_hook => 
'assign_debug_io_direct', +}, + +{ name => 'debug_logical_replication_streaming', type => 'enum', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Forces immediate streaming or serialization of changes in large transactions.', + long_desc => 'On the publisher, it allows streaming or serializing each change in logical decoding. On the subscriber, it allows serialization of all changes to files and notifies the parallel apply workers to read and apply them at the end of the transaction.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'debug_logical_replication_streaming', + boot_val => 'DEBUG_LOGICAL_REP_STREAMING_BUFFERED', + options => 'debug_logical_replication_streaming_options', +}, + +{ name => 'debug_parallel_query', type => 'enum', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Forces the planner\'s use of parallel query nodes.', + long_desc => 'This can be useful for testing the parallel query infrastructure by forcing the planner to generate plans that contain nodes that perform tuple communication between workers and the main process.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_EXPLAIN', + variable => 'debug_parallel_query', + boot_val => 'DEBUG_PARALLEL_OFF', + options => 'debug_parallel_query_options', +}, + +{ name => 'debug_pretty_print', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Indents parse and plan tree displays.', + variable => 'Debug_pretty_print', + boot_val => 'true', +}, + +{ name => 'debug_print_parse', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Logs each query\'s parse tree.', + variable => 'Debug_print_parse', + boot_val => 'false', +}, + +{ name => 'debug_print_plan', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Logs each query\'s execution plan.', + variable => 'Debug_print_plan', + boot_val => 'false', +}, + +{ name => 'debug_print_raw_parse', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Logs each query\'s raw parse tree.', + variable => 'Debug_print_raw_parse', + boot_val => 'false', +}, + +{ name => 'debug_print_rewritten', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Logs each query\'s rewritten parse tree.', + variable => 'Debug_print_rewritten', + boot_val => 'false', +}, + +{ name => 'debug_raw_expression_coverage_test', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Set this to force all raw parse trees for DML statements to be scanned by raw_expression_tree_walker(), to facilitate catching errors and omissions in that function.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Debug_raw_expression_coverage_test', + boot_val => 'DEFAULT_DEBUG_RAW_EXPRESSION_COVERAGE_TEST', + ifdef => 'DEBUG_NODE_TESTS_ENABLED', +}, + +{ name => 'debug_write_read_parse_plan_trees', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Set this to force all parse and plan trees to be passed through outfuncs.c/readfuncs.c, to facilitate catching errors and omissions in those modules.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Debug_write_read_parse_plan_trees', + boot_val => 'DEFAULT_DEBUG_READ_WRITE_PARSE_PLAN_TREES', + ifdef => 'DEBUG_NODE_TESTS_ENABLED', +}, + +{ name => 'default_statistics_target', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', + short_desc => 'Sets the default statistics target.', + long_desc => 'This applies to
table columns that have not had a column-specific target set via ALTER TABLE SET STATISTICS.', + variable => 'default_statistics_target', + boot_val => '100', + min => '1', + max => 'MAX_STATISTICS_TARGET', +}, + +{ name => 'default_table_access_method', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the default table access method for new tables.', + flags => 'GUC_IS_NAME', + variable => 'default_table_access_method', + boot_val => 'DEFAULT_TABLE_ACCESS_METHOD', + check_hook => 'check_default_table_access_method', +}, + +{ name => 'default_tablespace', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the default tablespace to create tables and indexes in.', + long_desc => 'An empty string means use the database\'s default tablespace.', + flags => 'GUC_IS_NAME', + variable => 'default_tablespace', + boot_val => '""', + check_hook => 'check_default_tablespace', +}, + +{ name => 'default_text_search_config', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets default text search configuration.', + variable => 'TSCurrentConfig', + boot_val => '"pg_catalog.simple"', + check_hook => 'check_default_text_search_config', + assign_hook => 'assign_default_text_search_config', +}, + +{ name => 'default_toast_compression', type => 'enum', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the default compression method for compressible values.', + variable => 'default_toast_compression', + boot_val => 'TOAST_PGLZ_COMPRESSION', + options => 'default_toast_compression_options', +}, + +{ name => 'default_transaction_deferrable', type => 'bool', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the default deferrable status of new transactions.', + variable => 'DefaultXactDeferrable', + boot_val => 'false', +}, + +{ name => 'default_transaction_isolation', type => 'enum', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the transaction isolation level of each new transaction.', + variable => 'DefaultXactIsoLevel', + boot_val => 'XACT_READ_COMMITTED', + options => 'isolation_level_options', +}, + +{ name => 'default_transaction_read_only', type => 'bool', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the default read-only status of new transactions.', + flags => 'GUC_REPORT', + variable => 'DefaultXactReadOnly', + boot_val => 'false', +}, + +# WITH OIDS support, and consequently default_with_oids, was removed +# in PostgreSQL 12, but we tolerate the parameter being set to false +# to avoid unnecessarily breaking older dump files. 
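
(Stepping back to the guc.c hunks earlier in this patch: they replace downcasts like "(struct config_bool *) gconf" with "&gconf->_bool", read conf->name instead of conf->gen.name, and pass the generic struct to config_enum_lookup_by_value(). All of that presupposes that the per-type payloads are now embedded in struct config_generic itself; the real definition lives in guc_tables.h, which is not shown in this excerpt. A minimal, self-contained toy of that pattern follows, where only the _bool/_int member names mirror the diff and everything else is invented for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-ins; the real structs live in guc_tables.h. */
    enum config_type { PGC_BOOL, PGC_INT };

    struct config_bool { bool *variable; };
    struct config_int  { int  *variable; };

    /*
     * Common fields live in the generic part; per-type payloads sit in an
     * anonymous union, so "&rec->_int" replaces the old downcast
     * "(struct config_int *) rec".
     */
    struct config_generic
    {
        const char      *name;
        enum config_type vartype;
        union
        {
            struct config_bool _bool;
            struct config_int  _int;
        };
    };

    static void
    show_option(const struct config_generic *rec)
    {
        switch (rec->vartype)
        {
            case PGC_BOOL:
                printf("%s = %s\n", rec->name,
                       *rec->_bool.variable ? "on" : "off");
                break;
            case PGC_INT:
                printf("%s = %d\n", rec->name, *rec->_int.variable);
                break;
        }
    }

    int
    main(void)
    {
        int archive_timeout = 60;
        struct config_generic g = {"archive_timeout", PGC_INT};

        g._int.variable = &archive_timeout;
        show_option(&g);        /* prints: archive_timeout = 60 */
        return 0;
    }

The payoff is visible in the RestoreGUCState() hunk above: shared fields such as reset_extra can be freed once in the generic part instead of once per case arm.)
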
+{ name => 'default_with_oids', type => 'bool', context => 'PGC_USERSET', group => 'COMPAT_OPTIONS_PREVIOUS', + short_desc => 'WITH OIDS is no longer supported; this can only be false.', + flags => 'GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE', + variable => 'default_with_oids', + boot_val => 'false', + check_hook => 'check_default_with_oids', +}, + +{ name => 'dynamic_library_path', type => 'string', context => 'PGC_SUSET', group => 'CLIENT_CONN_OTHER', + short_desc => 'Sets the path for dynamically loadable modules.', + long_desc => 'If a dynamically loadable module needs to be opened and the specified name does not have a directory component (i.e., the name does not contain a slash), the system will search this path for the specified file.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'Dynamic_library_path', + boot_val => '"$libdir"', +}, + +{ name => 'dynamic_shared_memory_type', type => 'enum', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Selects the dynamic shared memory implementation used.', + variable => 'dynamic_shared_memory_type', + boot_val => 'DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE', + options => 'dynamic_shared_memory_options', +}, + +{ name => 'effective_cache_size', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the planner\'s assumption about the total size of the data caches.', + long_desc => 'That is, the total size of the caches (kernel cache and shared buffers) used for PostgreSQL data files. This is measured in disk pages, which are normally 8 kB each.', + flags => 'GUC_UNIT_BLOCKS | GUC_EXPLAIN', + variable => 'effective_cache_size', + boot_val => 'DEFAULT_EFFECTIVE_CACHE_SIZE', + min => '1', + max => 'INT_MAX', +}, + +{ name => 'effective_io_concurrency', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_IO', + short_desc => 'Number of simultaneous requests that can be handled efficiently by the disk subsystem.', + long_desc => '0 disables simultaneous requests.', + flags => 'GUC_EXPLAIN', + variable => 'effective_io_concurrency', + boot_val => 'DEFAULT_EFFECTIVE_IO_CONCURRENCY', + min => '0', + max => 'MAX_IO_CONCURRENCY', +}, + +{ name => 'effective_wal_level', type => 'enum', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the effective WAL level.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'effective_wal_level', + boot_val => 'WAL_LEVEL_REPLICA', + options => 'wal_level_options', + show_hook => 'show_effective_wal_level', +}, + +{ name => 'enable_async_append', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of async append plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_async_append', + boot_val => 'true', +}, + +{ name => 'enable_bitmapscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of bitmap-scan plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_bitmapscan', + boot_val => 'true', +}, + +{ name => 'enable_distinct_reordering', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables reordering of DISTINCT keys.', + flags => 'GUC_EXPLAIN', + variable => 'enable_distinct_reordering', + boot_val => 'true', +}, + +{ name => 'enable_eager_aggregate', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables eager aggregation.', + flags => 'GUC_EXPLAIN', + variable => 'enable_eager_aggregate', +
boot_val => 'true', +}, + +{ name => 'enable_gathermerge', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of gather merge plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_gathermerge', + boot_val => 'true', +}, + +{ name => 'enable_group_by_reordering', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables reordering of GROUP BY keys.', + flags => 'GUC_EXPLAIN', + variable => 'enable_group_by_reordering', + boot_val => 'true', +}, + +{ name => 'enable_hashagg', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of hashed aggregation plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_hashagg', + boot_val => 'true', +}, + +{ name => 'enable_hashjoin', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of hash join plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_hashjoin', + boot_val => 'true', +}, + +{ name => 'enable_incremental_sort', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of incremental sort steps.', + flags => 'GUC_EXPLAIN', + variable => 'enable_incremental_sort', + boot_val => 'true', +}, + +{ name => 'enable_indexonlyscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of index-only-scan plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_indexonlyscan', + boot_val => 'true', +}, + +{ name => 'enable_indexscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of index-scan plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_indexscan', + boot_val => 'true', +}, + +{ name => 'enable_material', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of materialization.', + flags => 'GUC_EXPLAIN', + variable => 'enable_material', + boot_val => 'true', +}, + +{ name => 'enable_memoize', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of memoization.', + flags => 'GUC_EXPLAIN', + variable => 'enable_memoize', + boot_val => 'true', +}, + +{ name => 'enable_mergejoin', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of merge join plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_mergejoin', + boot_val => 'true', +}, + +{ name => 'enable_nestloop', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of nested-loop join plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_nestloop', + boot_val => 'true', +}, + +{ name => 'enable_parallel_append', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of parallel append plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_parallel_append', + boot_val => 'true', +}, + +{ name => 'enable_parallel_hash', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of parallel hash plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_parallel_hash', + boot_val => 'true', +}, + +{ name => 'enable_partition_pruning', type => 'bool', context => 
'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables plan-time and execution-time partition pruning.', + long_desc => 'Allows the query planner and executor to compare partition bounds to conditions in the query to determine which partitions must be scanned.', + flags => 'GUC_EXPLAIN', + variable => 'enable_partition_pruning', + boot_val => 'true', +}, + +{ name => 'enable_partitionwise_aggregate', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables partitionwise aggregation and grouping.', + flags => 'GUC_EXPLAIN', + variable => 'enable_partitionwise_aggregate', + boot_val => 'false', +}, + +{ name => 'enable_partitionwise_join', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables partitionwise join.', + flags => 'GUC_EXPLAIN', + variable => 'enable_partitionwise_join', + boot_val => 'false', +}, + +{ name => 'enable_presorted_aggregate', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s ability to produce plans that provide presorted input for ORDER BY / DISTINCT aggregate functions.', + long_desc => 'Allows the query planner to build plans that provide presorted input for aggregate functions with an ORDER BY / DISTINCT clause. When disabled, implicit sorts are always performed during execution.', + flags => 'GUC_EXPLAIN', + variable => 'enable_presorted_aggregate', + boot_val => 'true', +}, + +{ name => 'enable_self_join_elimination', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables removal of unique self-joins.', + flags => 'GUC_EXPLAIN', + variable => 'enable_self_join_elimination', + boot_val => 'true', +}, + +{ name => 'enable_seqscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of sequential-scan plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_seqscan', + boot_val => 'true', +}, + +{ name => 'enable_sort', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of explicit sort steps.', + flags => 'GUC_EXPLAIN', + variable => 'enable_sort', + boot_val => 'true', +}, + +{ name => 'enable_tidscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of TID scan plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_tidscan', + boot_val => 'true', +}, + +{ name => 'escape_string_warning', type => 'bool', context => 'PGC_USERSET', group => 'COMPAT_OPTIONS_PREVIOUS', + short_desc => 'Warn about backslash escapes in ordinary string literals.', + variable => 'escape_string_warning', + boot_val => 'true', +}, + +{ name => 'event_source', type => 'string', context => 'PGC_POSTMASTER', group => 'LOGGING_WHERE', + short_desc => 'Sets the application name used to identify PostgreSQL messages in the event log.', + variable => 'event_source', + boot_val => 'DEFAULT_EVENT_SOURCE', +}, + +{ name => 'event_triggers', type => 'bool', context => 'PGC_SUSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Enables event triggers.', + long_desc => 'When enabled, event triggers will fire for all applicable statements.', + variable => 'event_triggers', + boot_val => 'true', +}, + +{ name => 'exit_on_error', type => 'bool', context => 'PGC_USERSET', group => 'ERROR_HANDLING_OPTIONS', + short_desc => 'Terminate session on any error.', + variable => 'ExitOnAnyError', + 
boot_val => 'false', +}, + +{ name => 'extension_control_path', type => 'string', context => 'PGC_SUSET', group => 'CLIENT_CONN_OTHER', + short_desc => 'Sets the path for extension control files.', + long_desc => 'The remaining extension script and secondary control files are then loaded from the same directory where the primary control file was found.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'Extension_control_path', + boot_val => '"$system"', +}, + +{ name => 'extended_parallel_processing', type => 'bool', context => 'PGC_BACKEND', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables extra features of parallel processing.', + flags => 'GUC_EXPLAIN', + variable => 'extended_parallel_processing', + boot_val => 'true', +}, + +{ name => 'external_pid_file', type => 'string', context => 'PGC_POSTMASTER', group => 'FILE_LOCATIONS', + short_desc => 'Writes the postmaster PID to the specified file.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'external_pid_file', + boot_val => 'NULL', + check_hook => 'check_canonical_path', +}, + +{ name => 'extra_float_digits', type => 'int', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets the number of digits displayed for floating-point values.', + long_desc => 'This affects real, double precision, and geometric data types. A zero or negative parameter value is added to the standard number of digits (FLT_DIG or DBL_DIG as appropriate). Any value greater than zero selects precise output mode.', + variable => 'extra_float_digits', + boot_val => '1', + min => '-15', + max => '3', +}, + +{ name => 'file_copy_method', type => 'enum', context => 'PGC_USERSET', group => 'RESOURCES_DISK', + short_desc => 'Selects the file copy method.', + variable => 'file_copy_method', + boot_val => 'FILE_COPY_METHOD_COPY', + options => 'file_copy_method_options', +}, + +{ name => 'from_collapse_limit', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', + short_desc => 'Sets the FROM-list size beyond which subqueries are not collapsed.', + long_desc => 'The planner will merge subqueries into upper queries if the resulting FROM list would have no more than this many items.', + flags => 'GUC_EXPLAIN', + variable => 'from_collapse_limit', + boot_val => '8', + min => '1', + max => 'INT_MAX', +}, + +{ name => 'fsync', type => 'bool', context => 'PGC_SIGHUP', group => 'WAL_SETTINGS', + short_desc => 'Forces synchronization of updates to disk.', + long_desc => 'The server will use the fsync() system call in several places to make sure that updates are physically written to disk. This ensures that a database cluster will recover to a consistent state after an operating system or hardware crash.', + variable => 'enableFsync', + boot_val => 'true', +}, + +{ name => 'full_page_writes', type => 'bool', context => 'PGC_SIGHUP', group => 'WAL_SETTINGS', + short_desc => 'Writes full pages to WAL when first modified after a checkpoint.', + long_desc => 'A page write in process during an operating system crash might be only partially written to disk. During recovery, the row changes stored in WAL are not enough to recover. 
This option writes pages when first modified after a checkpoint to WAL so full recovery is possible.', + variable => 'fullPageWrites', + boot_val => 'true', +}, + +{ name => 'geqo', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_GEQO', + short_desc => 'Enables genetic query optimization.', + long_desc => 'This algorithm attempts to do planning without exhaustive searching.', + flags => 'GUC_EXPLAIN', + variable => 'enable_geqo', + boot_val => 'true', +}, + +{ name => 'geqo_effort', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_GEQO', + short_desc => 'GEQO: effort is used to set the default for other GEQO parameters.', + flags => 'GUC_EXPLAIN', + variable => 'Geqo_effort', + boot_val => 'DEFAULT_GEQO_EFFORT', + min => 'MIN_GEQO_EFFORT', + max => 'MAX_GEQO_EFFORT', +}, + +{ name => 'geqo_generations', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_GEQO', + short_desc => 'GEQO: number of iterations of the algorithm.', + long_desc => '0 means use a suitable default value.', + flags => 'GUC_EXPLAIN', + variable => 'Geqo_generations', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'geqo_pool_size', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_GEQO', + short_desc => 'GEQO: number of individuals in the population.', + long_desc => '0 means use a suitable default value.', + flags => 'GUC_EXPLAIN', + variable => 'Geqo_pool_size', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'geqo_seed', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_GEQO', + short_desc => 'GEQO: seed for random path selection.', + flags => 'GUC_EXPLAIN', + variable => 'Geqo_seed', + boot_val => '0.0', + min => '0.0', + max => '1.0', +}, + +{ name => 'geqo_selection_bias', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_GEQO', + short_desc => 'GEQO: selective pressure within the population.', + flags => 'GUC_EXPLAIN', + variable => 'Geqo_selection_bias', + boot_val => 'DEFAULT_GEQO_SELECTION_BIAS', + min => 'MIN_GEQO_SELECTION_BIAS', + max => 'MAX_GEQO_SELECTION_BIAS', +}, + +{ name => 'geqo_threshold', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_GEQO', + short_desc => 'Sets the threshold of FROM items beyond which GEQO is used.', + flags => 'GUC_EXPLAIN', + variable => 'geqo_threshold', + boot_val => '12', + min => '2', + max => 'INT_MAX', +}, + +{ name => 'gin_fuzzy_search_limit', type => 'int', context => 'PGC_USERSET', group => 'CLIENT_CONN_OTHER', + short_desc => 'Sets the maximum allowed result for exact search by GIN.', + long_desc => '0 means no limit.', + variable => 'GinFuzzySearchLimit', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'gin_pending_list_limit', type => 'int', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the maximum size of the pending list for GIN index.', + flags => 'GUC_UNIT_KB', + variable => 'gin_pending_list_limit', + boot_val => '4096', + min => '64', + max => 'MAX_KILOBYTES', +}, + +{ name => 'gss_accept_delegation', type => 'bool', context => 'PGC_SIGHUP', group => 'CONN_AUTH_AUTH', + short_desc => 'Sets whether GSSAPI delegation should be accepted from the client.', + variable => 'pg_gss_accept_delegation', + boot_val => 'false', +}, + +{ name => 'hash_mem_multiplier', type => 'real', context => 'PGC_USERSET', group => 'RESOURCES_MEM', + short_desc => 'Multiple of "work_mem" to use for hash tables.', + flags => 'GUC_EXPLAIN', + variable => 'hash_mem_multiplier', + 
boot_val => '2.0', + min => '1.0', + max => '1000.0', +}, + +{ name => 'hba_file', type => 'string', context => 'PGC_POSTMASTER', group => 'FILE_LOCATIONS', + short_desc => 'Sets the server\'s "hba" configuration file.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'HbaFileName', + boot_val => 'NULL', +}, + +{ name => 'hot_standby', type => 'bool', context => 'PGC_POSTMASTER', group => 'REPLICATION_STANDBY', + short_desc => 'Allows connections and queries during recovery.', + variable => 'EnableHotStandby', + boot_val => 'true', +}, + +{ name => 'hot_standby_feedback', type => 'bool', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Allows feedback from a hot standby to the primary that will avoid query conflicts.', + variable => 'hot_standby_feedback', + boot_val => 'false', +}, + +{ name => 'huge_page_size', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'The size of huge page that should be requested.', + long_desc => '0 means use the system default.', + flags => 'GUC_UNIT_KB', + variable => 'huge_page_size', + boot_val => '0', + min => '0', + max => 'INT_MAX', + check_hook => 'check_huge_page_size', +}, + +{ name => 'huge_pages', type => 'enum', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Use of huge pages on Linux or Windows.', + variable => 'huge_pages', + boot_val => 'HUGE_PAGES_TRY', + options => 'huge_pages_options', +}, + +{ name => 'huge_pages_status', type => 'enum', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Indicates the status of huge pages.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'huge_pages_status', + boot_val => 'HUGE_PAGES_UNKNOWN', + options => 'huge_pages_status_options', +}, + +{ name => 'icu_validation_level', type => 'enum', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Log level for reporting invalid ICU locale strings.', + variable => 'icu_validation_level', + boot_val => 'WARNING', + options => 'icu_validation_level_options', +}, + +{ name => 'ident_file', type => 'string', context => 'PGC_POSTMASTER', group => 'FILE_LOCATIONS', + short_desc => 'Sets the server\'s "ident" configuration file.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'IdentFileName', + boot_val => 'NULL', +}, + +{ name => 'idle_in_transaction_session_timeout', type => 'int', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the maximum allowed idle time between queries, when in a transaction.', + long_desc => '0 disables the timeout.', + flags => 'GUC_UNIT_MS', + variable => 'IdleInTransactionSessionTimeout', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'idle_replication_slot_timeout', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_SENDING', + short_desc => 'Sets the duration a replication slot can remain idle before it is invalidated.', + flags => 'GUC_UNIT_S', + variable => 'idle_replication_slot_timeout_secs', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'idle_session_timeout', type => 'int', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the maximum allowed idle time between queries, when not in a transaction.', + long_desc => '0 disables the timeout.', + flags => 'GUC_UNIT_MS', + variable => 'IdleSessionTimeout', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'ignore_checksum_failure', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + 
short_desc => 'Continues processing after a checksum failure.', + long_desc => 'Detection of a checksum failure normally causes PostgreSQL to report an error, aborting the current transaction. Setting ignore_checksum_failure to true causes the system to ignore the failure (but still report a warning), and continue processing. This behavior could cause crashes or other serious problems. Only has an effect if checksums are enabled.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'ignore_checksum_failure', + boot_val => 'false', +}, + +{ name => 'ignore_invalid_pages', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', + short_desc => 'Continues recovery after an invalid pages failure.', + long_desc => 'Detection of WAL records having references to invalid pages during recovery causes PostgreSQL to raise a PANIC-level error, aborting the recovery. Setting "ignore_invalid_pages" to true causes the system to ignore invalid page references in WAL records (but still report a warning), and continue recovery. This behavior may cause crashes, data loss, propagate or hide corruption, or other serious problems. Only has an effect during recovery or in standby mode.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'ignore_invalid_pages', + boot_val => 'false', +}, + +{ name => 'ignore_system_indexes', type => 'bool', context => 'PGC_BACKEND', group => 'DEVELOPER_OPTIONS', + short_desc => 'Disables reading from system indexes.', + long_desc => 'It does not prevent updating the indexes, so it is safe to use. The worst consequence is slowness.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'IgnoreSystemIndexes', + boot_val => 'false', +}, + +{ name => 'in_hot_standby', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows whether hot standby is currently active.', + flags => 'GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'in_hot_standby_guc', + boot_val => 'false', + show_hook => 'show_in_hot_standby', +}, + +{ name => 'integer_datetimes', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows whether datetimes are integer based.', + flags => 'GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'integer_datetimes', + boot_val => 'true', +}, + +{ name => 'IntervalStyle', type => 'enum', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets the display format for interval values.', + flags => 'GUC_REPORT', + variable => 'IntervalStyle', + boot_val => 'INTSTYLE_POSTGRES', + options => 'intervalstyle_options', +}, + +{ name => 'io_combine_limit', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_IO', + short_desc => 'Limit on the size of data reads and writes.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'io_combine_limit_guc', + boot_val => 'DEFAULT_IO_COMBINE_LIMIT', + min => '1', + max => 'MAX_IO_COMBINE_LIMIT', + assign_hook => 'assign_io_combine_limit', +}, + +{ name => 'io_max_combine_limit', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_IO', + short_desc => 'Server-wide limit that clamps io_combine_limit.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'io_max_combine_limit', + boot_val => 'DEFAULT_IO_COMBINE_LIMIT', + min => '1', + max => 'MAX_IO_COMBINE_LIMIT', + assign_hook => 'assign_io_max_combine_limit', +}, + +{ name => 'io_max_concurrency', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_IO', + short_desc => 'Max number of IOs that one process can execute simultaneously.', + variable => 
'io_max_concurrency', + boot_val => '-1', + min => '-1', + max => '1024', + check_hook => 'check_io_max_concurrency', +}, + +{ name => 'io_method', type => 'enum', context => 'PGC_POSTMASTER', group => 'RESOURCES_IO', + short_desc => 'Selects the method for executing asynchronous I/O.', + variable => 'io_method', + boot_val => 'DEFAULT_IO_METHOD', + options => 'io_method_options', + assign_hook => 'assign_io_method', +}, + +{ name => 'io_workers', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_IO', + short_desc => 'Number of IO worker processes, for io_method=worker.', + variable => 'io_workers', + boot_val => '3', + min => '1', + max => 'MAX_IO_WORKERS', +}, + +# Not for general use --- used by SET SESSION AUTHORIZATION and SET +# ROLE +{ name => 'is_superuser', type => 'bool', context => 'PGC_INTERNAL', group => 'UNGROUPED', + short_desc => 'Shows whether the current user is a superuser.', + flags => 'GUC_REPORT | GUC_NO_SHOW_ALL | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_ALLOW_IN_PARALLEL', + variable => 'current_role_is_superuser', + boot_val => 'false', +}, + +{ name => 'jit', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', + short_desc => 'Allow JIT compilation.', + flags => 'GUC_EXPLAIN', + variable => 'jit_enabled', + boot_val => 'true', +}, + +{ name => 'jit_above_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Perform JIT compilation if query is more expensive.', + long_desc => '-1 disables JIT compilation.', + flags => 'GUC_EXPLAIN', + variable => 'jit_above_cost', + boot_val => '100000', + min => '-1', + max => 'DBL_MAX', +}, + +# This is not guaranteed to be available, but given it's a developer +# oriented option, it doesn't seem worth adding code checking +# availability. +{ name => 'jit_debugging_support', type => 'bool', context => 'PGC_SU_BACKEND', group => 'DEVELOPER_OPTIONS', + short_desc => 'Register JIT-compiled functions with debugger.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'jit_debugging_support', + boot_val => 'false', +}, + +{ name => 'jit_dump_bitcode', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Write out LLVM bitcode to facilitate JIT debugging.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'jit_dump_bitcode', + boot_val => 'false', +}, + +{ name => 'jit_expressions', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Allow JIT compilation of expressions.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'jit_expressions', + boot_val => 'true', +}, + +{ name => 'jit_inline_above_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Perform JIT inlining if query is more expensive.', + long_desc => '-1 disables inlining.', + flags => 'GUC_EXPLAIN', + variable => 'jit_inline_above_cost', + boot_val => '500000', + min => '-1', + max => 'DBL_MAX', +}, + +{ name => 'jit_optimize_above_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Optimize JIT-compiled functions if query is more expensive.', + long_desc => '-1 disables optimization.', + flags => 'GUC_EXPLAIN', + variable => 'jit_optimize_above_cost', + boot_val => '500000', + min => '-1', + max => 'DBL_MAX', +}, + +# This is not guaranteed to be available, but given it's a developer +# oriented option, it doesn't seem worth adding code checking +# availability. 
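+# (When the support is present, profiles taken with perf can then attribute +# samples to JIT-compiled functions; this note is an editorial gloss on the +# short_desc below, not a statement about any particular LLVM build.)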
+{ name => 'jit_profiling_support', type => 'bool', context => 'PGC_SU_BACKEND', group => 'DEVELOPER_OPTIONS', + short_desc => 'Register JIT-compiled functions with perf profiler.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'jit_profiling_support', + boot_val => 'false', +}, + +{ name => 'jit_provider', type => 'string', context => 'PGC_POSTMASTER', group => 'CLIENT_CONN_PRELOAD', + short_desc => 'JIT provider to use.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'jit_provider', + boot_val => '"llvmjit"', +}, + +{ name => 'jit_tuple_deforming', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Allow JIT compilation of tuple deforming.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'jit_tuple_deforming', + boot_val => 'true', +}, + +{ name => 'join_collapse_limit', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', + short_desc => 'Sets the FROM-list size beyond which JOIN constructs are not flattened.', + long_desc => 'The planner will flatten explicit JOIN constructs into lists of FROM items whenever a list of no more than this many items would result.', + flags => 'GUC_EXPLAIN', + variable => 'join_collapse_limit', + boot_val => '8', + min => '1', + max => 'INT_MAX', +}, + +{ name => 'krb_caseins_users', type => 'bool', context => 'PGC_SIGHUP', group => 'CONN_AUTH_AUTH', + short_desc => 'Sets whether Kerberos and GSSAPI user names should be treated as case-insensitive.', + variable => 'pg_krb_caseins_users', + boot_val => 'false', +}, + +{ name => 'krb_server_keyfile', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_AUTH', + short_desc => 'Sets the location of the Kerberos server key file.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'pg_krb_server_keyfile', + boot_val => 'PG_KRB_SRVTAB', +}, + +{ name => 'lc_messages', type => 'string', context => 'PGC_SUSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets the language in which messages are displayed.', + long_desc => 'An empty string means use the operating system setting.', + variable => 'locale_messages', + boot_val => '""', + check_hook => 'check_locale_messages', + assign_hook => 'assign_locale_messages', +}, + +{ name => 'lc_monetary', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets the locale for formatting monetary amounts.', + long_desc => 'An empty string means use the operating system setting.', + variable => 'locale_monetary', + boot_val => '"C"', + check_hook => 'check_locale_monetary', + assign_hook => 'assign_locale_monetary', +}, + +{ name => 'lc_numeric', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets the locale for formatting numbers.', + long_desc => 'An empty string means use the operating system setting.', + variable => 'locale_numeric', + boot_val => '"C"', + check_hook => 'check_locale_numeric', + assign_hook => 'assign_locale_numeric', +}, + +{ name => 'lc_time', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets the locale for formatting date and time values.', + long_desc => 'An empty string means use the operating system setting.', + variable => 'locale_time', + boot_val => '"C"', + check_hook => 'check_locale_time', + assign_hook => 'assign_locale_time', +}, + +{ name => 'listen_addresses', type => 'string', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Sets the host name or IP address(es) to listen to.', + flags => 'GUC_LIST_INPUT', + variable 
=> 'ListenAddresses', + boot_val => '"localhost"', +}, + +{ name => 'lo_compat_privileges', type => 'bool', context => 'PGC_SUSET', group => 'COMPAT_OPTIONS_PREVIOUS', + short_desc => 'Enables backward compatibility mode for privilege checks on large objects.', + long_desc => 'Skips privilege checks when reading or modifying large objects, for compatibility with PostgreSQL releases prior to 9.0.', + variable => 'lo_compat_privileges', + boot_val => 'false', +}, + +{ name => 'local_preload_libraries', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_PRELOAD', + short_desc => 'Lists unprivileged shared libraries to preload into each backend.', + flags => 'GUC_LIST_INPUT | GUC_LIST_QUOTE', + variable => 'local_preload_libraries_string', + boot_val => '""', +}, + +{ name => 'lock_timeout', type => 'int', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the maximum allowed duration of any wait for a lock.', + long_desc => '0 disables the timeout.', + flags => 'GUC_UNIT_MS', + variable => 'LockTimeout', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'log_autoanalyze_min_duration', type => 'int', context => 'PGC_SIGHUP', group => 'LOGGING_WHAT', + short_desc => 'Sets the minimum execution time above which analyze actions by autovacuum will be logged.', + long_desc => '-1 disables logging analyze actions by autovacuum. 0 means log all analyze actions by autovacuum.', + flags => 'GUC_UNIT_MS', + variable => 'Log_autoanalyze_min_duration', + boot_val => '600000', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'log_autovacuum_min_duration', type => 'int', context => 'PGC_SIGHUP', group => 'LOGGING_WHAT', + short_desc => 'Sets the minimum execution time above which vacuum actions by autovacuum will be logged.', + long_desc => '-1 disables logging vacuum actions by autovacuum. 
0 means log all vacuum actions by autovacuum.', + flags => 'GUC_UNIT_MS', + variable => 'Log_autovacuum_min_duration', + boot_val => '600000', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'log_btree_build_stats', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Logs system resource usage statistics (memory and CPU) on various B-tree operations.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'log_btree_build_stats', + boot_val => 'false', + ifdef => 'BTREE_BUILD_STATS', +}, + +{ name => 'log_checkpoints', type => 'bool', context => 'PGC_SIGHUP', group => 'LOGGING_WHAT', + short_desc => 'Logs each checkpoint.', + variable => 'log_checkpoints', + boot_val => 'true', +}, + +{ name => 'log_connections', type => 'string', context => 'PGC_SU_BACKEND', group => 'LOGGING_WHAT', + short_desc => 'Logs specified aspects of connection establishment and setup.', + flags => 'GUC_LIST_INPUT', + variable => 'log_connections_string', + boot_val => '""', + check_hook => 'check_log_connections', + assign_hook => 'assign_log_connections', +}, + +{ name => 'log_destination', type => 'string', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Sets the destination for server log output.', + long_desc => 'Valid values are combinations of "stderr", "syslog", "csvlog", "jsonlog", and "eventlog", depending on the platform.', + flags => 'GUC_LIST_INPUT', + variable => 'Log_destination_string', + boot_val => '"stderr"', + check_hook => 'check_log_destination', + assign_hook => 'assign_log_destination', +}, + +{ name => 'log_directory', type => 'string', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Sets the destination directory for log files.', + long_desc => 'Can be specified as relative to the data directory or as absolute path.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'Log_directory', + boot_val => '"log"', + check_hook => 'check_canonical_path', +}, + +{ name => 'log_disconnections', type => 'bool', context => 'PGC_SU_BACKEND', group => 'LOGGING_WHAT', + short_desc => 'Logs end of a session, including duration.', + variable => 'Log_disconnections', + boot_val => 'false', +}, + +{ name => 'log_duration', type => 'bool', context => 'PGC_SUSET', group => 'LOGGING_WHAT', + short_desc => 'Logs the duration of each completed SQL statement.', + variable => 'log_duration', + boot_val => 'false', +}, + +{ name => 'log_error_verbosity', type => 'enum', context => 'PGC_SUSET', group => 'LOGGING_WHAT', + short_desc => 'Sets the verbosity of logged messages.', + variable => 'Log_error_verbosity', + boot_val => 'PGERROR_DEFAULT', + options => 'log_error_verbosity_options', +}, + +{ name => 'log_executor_stats', type => 'bool', context => 'PGC_SUSET', group => 'STATS_MONITORING', + short_desc => 'Writes executor performance statistics to the server log.', + variable => 'log_executor_stats', + boot_val => 'false', + check_hook => 'check_stage_log_stats', +}, + +{ name => 'log_file_mode', type => 'int', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Sets the file permissions for log files.', + long_desc => 'The parameter value is expected to be a numeric mode specification in the form accepted by the chmod and umask system calls. 
(To use the customary octal format the number must start with a 0 (zero).)', + variable => 'Log_file_mode', + boot_val => '0600', + min => '0000', + max => '0777', + show_hook => 'show_log_file_mode', +}, + +{ name => 'log_filename', type => 'string', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Sets the file name pattern for log files.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'Log_filename', + boot_val => '"postgresql-%Y-%m-%d_%H%M%S.log"', +}, + +{ name => 'log_hostname', type => 'bool', context => 'PGC_SIGHUP', group => 'LOGGING_WHAT', + short_desc => 'Logs the host name in the connection logs.', + long_desc => 'By default, connection logs only show the IP address of the connecting host. If you want them to show the host name you can turn this on, but depending on your host name resolution setup it might impose a non-negligible performance penalty.', + variable => 'log_hostname', + boot_val => 'false', +}, + +{ name => 'log_line_prefix', type => 'string', context => 'PGC_SIGHUP', group => 'LOGGING_WHAT', + short_desc => 'Controls information prefixed to each log line.', + long_desc => 'An empty string means no prefix.', + variable => 'Log_line_prefix', + boot_val => '"%m [%p] "', +}, + +{ name => 'log_lock_failures', type => 'bool', context => 'PGC_SUSET', group => 'LOGGING_WHAT', + short_desc => 'Logs lock failures.', + variable => 'log_lock_failures', + boot_val => 'false', +}, + +{ name => 'log_lock_waits', type => 'bool', context => 'PGC_SUSET', group => 'LOGGING_WHAT', + short_desc => 'Logs long lock waits.', + variable => 'log_lock_waits', + boot_val => 'true', +}, + +{ name => 'log_min_duration_sample', type => 'int', context => 'PGC_SUSET', group => 'LOGGING_WHEN', + short_desc => 'Sets the minimum execution time above which a sample of statements will be logged. Sampling is determined by "log_statement_sample_rate".', + long_desc => '-1 disables sampling. 0 means sample all statements.', + flags => 'GUC_UNIT_MS', + variable => 'log_min_duration_sample', + boot_val => '-1', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'log_min_duration_statement', type => 'int', context => 'PGC_SUSET', group => 'LOGGING_WHEN', + short_desc => 'Sets the minimum execution time above which all statements will be logged.', + long_desc => '-1 disables logging statement durations. 0 means log all statement durations.', + flags => 'GUC_UNIT_MS', + variable => 'log_min_duration_statement', + boot_val => '-1', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'log_min_error_statement', type => 'enum', context => 'PGC_SUSET', group => 'LOGGING_WHEN', + short_desc => 'Causes all statements generating error at or above this level to be logged.', + long_desc => 'Each level includes all the levels that follow it. The later the level, the fewer messages are sent.', + variable => 'log_min_error_statement', + boot_val => 'ERROR', + options => 'server_message_level_options', +}, + +{ name => 'log_min_messages', type => 'enum', context => 'PGC_SUSET', group => 'LOGGING_WHEN', + short_desc => 'Sets the message levels that are logged.', + long_desc => 'Each level includes all the levels that follow it. 
The later the level, the fewer messages are sent.', + variable => 'log_min_messages', + boot_val => 'WARNING', + options => 'server_message_level_options', +}, + +{ name => 'log_parameter_max_length', type => 'int', context => 'PGC_SUSET', group => 'LOGGING_WHAT', + short_desc => 'Sets the maximum length in bytes of data logged for bind parameter values when logging statements.', + long_desc => '-1 means log values in full.', + flags => 'GUC_UNIT_BYTE', + variable => 'log_parameter_max_length', + boot_val => '-1', + min => '-1', + max => 'INT_MAX / 2', +}, + +{ name => 'log_parameter_max_length_on_error', type => 'int', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Sets the maximum length in bytes of data logged for bind parameter values when logging statements, on error.', + long_desc => '-1 means log values in full.', + flags => 'GUC_UNIT_BYTE', + variable => 'log_parameter_max_length_on_error', + boot_val => '0', + min => '-1', + max => 'INT_MAX / 2', +}, + +{ name => 'log_parser_stats', type => 'bool', context => 'PGC_SUSET', group => 'STATS_MONITORING', + short_desc => 'Writes parser performance statistics to the server log.', + variable => 'log_parser_stats', + boot_val => 'false', + check_hook => 'check_stage_log_stats', +}, + +{ name => 'log_planner_stats', type => 'bool', context => 'PGC_SUSET', group => 'STATS_MONITORING', + short_desc => 'Writes planner performance statistics to the server log.', + variable => 'log_planner_stats', + boot_val => 'false', + check_hook => 'check_stage_log_stats', +}, + +{ name => 'log_recovery_conflict_waits', type => 'bool', context => 'PGC_SIGHUP', group => 'LOGGING_WHAT', + short_desc => 'Logs standby recovery conflict waits.', + variable => 'log_recovery_conflict_waits', + boot_val => 'false', +}, + +{ name => 'log_replication_commands', type => 'bool', context => 'PGC_SUSET', group => 'LOGGING_WHAT', + short_desc => 'Logs each replication command.', + variable => 'log_replication_commands', + boot_val => 'false', +}, + +{ name => 'log_rotation_age', type => 'int', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Sets the amount of time to wait before forcing log file rotation.', + long_desc => '0 disables time-based creation of new log files.', + flags => 'GUC_UNIT_MIN', + variable => 'Log_RotationAge', + boot_val => 'HOURS_PER_DAY * MINS_PER_HOUR', + min => '0', + max => 'INT_MAX / SECS_PER_MINUTE', +}, + +{ name => 'log_rotation_size', type => 'int', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Sets the maximum size a log file can reach before being rotated.', + long_desc => '0 disables size-based creation of new log files.', + flags => 'GUC_UNIT_KB', + variable => 'Log_RotationSize', + boot_val => '10 * 1024', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'log_startup_progress_interval', type => 'int', context => 'PGC_SIGHUP', group => 'LOGGING_WHEN', + short_desc => 'Time between progress updates for long-running startup operations.', + long_desc => '0 disables progress updates.', + flags => 'GUC_UNIT_MS', + variable => 'log_startup_progress_interval', + boot_val => '10000', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'log_statement', type => 'enum', context => 'PGC_SUSET', group => 'LOGGING_WHAT', + short_desc => 'Sets the type of statements logged.', + variable => 'log_statement', + boot_val => 'LOGSTMT_NONE', + options => 'log_statement_options', +}, + +{ name => 'log_statement_sample_rate', type => 'real', context => 'PGC_SUSET', group => 'LOGGING_WHEN', + 
short_desc => 'Fraction of statements exceeding "log_min_duration_sample" to be logged.', + long_desc => 'Use a value between 0.0 (never log) and 1.0 (always log).', + variable => 'log_statement_sample_rate', + boot_val => '1.0', + min => '0.0', + max => '1.0', +}, + +{ name => 'log_statement_stats', type => 'bool', context => 'PGC_SUSET', group => 'STATS_MONITORING', + short_desc => 'Writes cumulative performance statistics to the server log.', + variable => 'log_statement_stats', + boot_val => 'false', + check_hook => 'check_log_stats', +}, + +{ name => 'log_temp_files', type => 'int', context => 'PGC_SUSET', group => 'LOGGING_WHAT', + short_desc => 'Log the use of temporary files larger than this number of kilobytes.', + long_desc => '-1 disables logging temporary files. 0 means log all temporary files.', + flags => 'GUC_UNIT_KB', + variable => 'log_temp_files', + boot_val => '-1', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'log_timezone', type => 'string', context => 'PGC_SIGHUP', group => 'LOGGING_WHAT', + short_desc => 'Sets the time zone to use in log messages.', + variable => 'log_timezone_string', + boot_val => '"GMT"', + check_hook => 'check_log_timezone', + assign_hook => 'assign_log_timezone', + show_hook => 'show_log_timezone', +}, + +{ name => 'log_transaction_sample_rate', type => 'real', context => 'PGC_SUSET', group => 'LOGGING_WHEN', + short_desc => 'Sets the fraction of transactions from which to log all statements.', + long_desc => 'Use a value between 0.0 (never log) and 1.0 (log all statements for all transactions).', + variable => 'log_xact_sample_rate', + boot_val => '0.0', + min => '0.0', + max => '1.0', +}, + +{ name => 'log_truncate_on_rotation', type => 'bool', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Truncate existing log files of same name during log rotation.', + variable => 'Log_truncate_on_rotation', + boot_val => 'false', +}, + +{ name => 'logging_collector', type => 'bool', context => 'PGC_POSTMASTER', group => 'LOGGING_WHERE', + short_desc => 'Start a subprocess to capture stderr, csvlog and/or jsonlog into log files.', + variable => 'Logging_collector', + boot_val => 'false', +}, + +{ name => 'logical_decoding_work_mem', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_MEM', + short_desc => 'Sets the maximum memory to be used for logical decoding.', + long_desc => 'This much memory can be used by each internal reorder buffer before spilling to disk.', + flags => 'GUC_UNIT_KB', + variable => 'logical_decoding_work_mem', + boot_val => '65536', + min => '64', + max => 'MAX_KILOBYTES', +}, + +{ name => 'maintenance_io_concurrency', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_IO', + short_desc => 'A variant of "effective_io_concurrency" that is used for maintenance work.', + long_desc => '0 disables simultaneous requests.', + flags => 'GUC_EXPLAIN', + variable => 'maintenance_io_concurrency', + boot_val => 'DEFAULT_MAINTENANCE_IO_CONCURRENCY', + min => '0', + max => 'MAX_IO_CONCURRENCY', + assign_hook => 'assign_maintenance_io_concurrency', +}, + +# Dynamic shared memory has a higher overhead than local memory +# contexts, so when testing low-memory scenarios that could use shared +# memory, the recommended minimum is 1MB. 
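+# (Note the enforced minimum on maintenance_work_mem below is 64 kB; the 1MB +# figure above is a recommended practical floor, not an enforced limit.)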
+{ name => 'maintenance_work_mem', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_MEM', + short_desc => 'Sets the maximum memory to be used for maintenance operations.', + long_desc => 'This includes operations such as VACUUM and CREATE INDEX.', + flags => 'GUC_UNIT_KB', + variable => 'maintenance_work_mem', + boot_val => '65536', + min => '64', + max => 'MAX_KILOBYTES', +}, + +{ name => 'max_active_replication_origins', type => 'int', context => 'PGC_POSTMASTER', group => 'REPLICATION_SUBSCRIBERS', + short_desc => 'Sets the maximum number of active replication origins.', + variable => 'max_active_replication_origins', + boot_val => '10', + min => '0', + max => 'MAX_BACKENDS', +}, + +{ name => 'max_connections', type => 'int', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Sets the maximum number of concurrent connections.', + variable => 'MaxConnections', + boot_val => '100', + min => '1', + max => 'MAX_BACKENDS', +}, + +{ name => 'max_files_per_process', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_KERNEL', + short_desc => 'Sets the maximum number of files each server process is allowed to open simultaneously.', + variable => 'max_files_per_process', + boot_val => '1000', + min => '64', + max => 'INT_MAX', +}, + +{ name => 'max_function_args', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the maximum number of function arguments.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'max_function_args', + boot_val => 'FUNC_MAX_ARGS', + min => 'FUNC_MAX_ARGS', + max => 'FUNC_MAX_ARGS', +}, + +{ name => 'max_identifier_length', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the maximum identifier length.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'max_identifier_length', + boot_val => 'NAMEDATALEN - 1', + min => 'NAMEDATALEN - 1', + max => 'NAMEDATALEN - 1', +}, + +{ name => 'max_index_keys', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the maximum number of index keys.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'max_index_keys', + boot_val => 'INDEX_MAX_KEYS', + min => 'INDEX_MAX_KEYS', + max => 'INDEX_MAX_KEYS', +}, + +# See also CheckRequiredParameterValues() if this parameter changes +{ name => 'max_locks_per_transaction', type => 'int', context => 'PGC_POSTMASTER', group => 'LOCK_MANAGEMENT', + short_desc => 'Sets the maximum number of locks per transaction.', + long_desc => 'The shared lock table is sized on the assumption that at most "max_locks_per_transaction" objects per server process or prepared transaction will need to be locked at any one time.', + variable => 'max_locks_per_xact', + boot_val => '64', + min => '10', + max => 'INT_MAX', +}, + +{ name => 'max_logical_replication_workers', type => 'int', context => 'PGC_POSTMASTER', group => 'REPLICATION_SUBSCRIBERS', + short_desc => 'Maximum number of logical replication worker processes.', + variable => 'max_logical_replication_workers', + boot_val => '4', + min => '0', + max => 'MAX_BACKENDS', +}, + +{ name => 'max_notify_queue_pages', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_DISK', + short_desc => 'Sets the maximum number of allocated pages for NOTIFY / LISTEN queue.', + variable => 'max_notify_queue_pages', + boot_val => '1048576', + min => '64', + max => 'INT_MAX', +}, + +{ name => 'max_parallel_apply_workers_per_subscription', 
type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_SUBSCRIBERS', + short_desc => 'Maximum number of parallel apply workers per subscription.', + variable => 'max_parallel_apply_workers_per_subscription', + boot_val => '2', + min => '0', + max => 'MAX_PARALLEL_WORKER_LIMIT', +}, + +{ name => 'max_parallel_maintenance_workers', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_WORKER_PROCESSES', + short_desc => 'Sets the maximum number of parallel processes per maintenance operation.', + variable => 'max_parallel_maintenance_workers', + boot_val => '2', + min => '0', + max => 'MAX_PARALLEL_WORKER_LIMIT', +}, + +{ name => 'max_parallel_workers', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_WORKER_PROCESSES', + short_desc => 'Sets the maximum number of parallel workers that can be active at one time.', + flags => 'GUC_EXPLAIN', + variable => 'max_parallel_workers', + boot_val => '8', + min => '0', + max => 'MAX_PARALLEL_WORKER_LIMIT', +}, + +{ name => 'max_parallel_workers_per_gather', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_WORKER_PROCESSES', + short_desc => 'Sets the maximum number of parallel processes per executor node.', + flags => 'GUC_EXPLAIN', + variable => 'max_parallel_workers_per_gather', + boot_val => '2', + min => '0', + max => 'MAX_PARALLEL_WORKER_LIMIT', +}, + +{ name => 'max_pred_locks_per_page', type => 'int', context => 'PGC_SIGHUP', group => 'LOCK_MANAGEMENT', + short_desc => 'Sets the maximum number of predicate-locked tuples per page.', + long_desc => 'If more than this number of tuples on the same page are locked by a connection, those locks are replaced by a page-level lock.', + variable => 'max_predicate_locks_per_page', + boot_val => '2', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'max_pred_locks_per_relation', type => 'int', context => 'PGC_SIGHUP', group => 'LOCK_MANAGEMENT', + short_desc => 'Sets the maximum number of predicate-locked pages and tuples per relation.', + long_desc => 'If more than this total of pages and tuples in the same relation are locked by a connection, those locks are replaced by a relation-level lock.', + variable => 'max_predicate_locks_per_relation', + boot_val => '-2', + min => 'INT_MIN', + max => 'INT_MAX', +}, + +{ name => 'max_pred_locks_per_transaction', type => 'int', context => 'PGC_POSTMASTER', group => 'LOCK_MANAGEMENT', + short_desc => 'Sets the maximum number of predicate locks per transaction.', + long_desc => 'The shared predicate lock table is sized on the assumption that at most "max_pred_locks_per_transaction" objects per server process or prepared transaction will need to be locked at any one time.', + variable => 'max_predicate_locks_per_xact', + boot_val => '64', + min => '10', + max => 'INT_MAX', +}, + +# See also CheckRequiredParameterValues() if this parameter changes +{ name => 'max_prepared_transactions', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the maximum number of simultaneously prepared transactions.', + variable => 'max_prepared_xacts', + boot_val => '0', + min => '0', + max => 'MAX_BACKENDS', +}, + +# see max_wal_senders +{ name => 'max_replication_slots', type => 'int', context => 'PGC_POSTMASTER', group => 'REPLICATION_SENDING', + short_desc => 'Sets the maximum number of simultaneously defined replication slots.', + variable => 'max_replication_slots', + boot_val => '10', + min => '0', + max => 'MAX_BACKENDS /* XXX? 
*/', +}, + +{ name => 'max_slot_wal_keep_size', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_SENDING', + short_desc => 'Sets the maximum WAL size that can be reserved by replication slots.', + long_desc => 'Replication slots will be marked as failed, and segments released for deletion or recycling, if this much space is occupied by WAL on disk. -1 means no maximum.', + flags => 'GUC_UNIT_MB', + variable => 'max_slot_wal_keep_size_mb', + boot_val => '-1', + min => '-1', + max => 'MAX_KILOBYTES', +}, + +# We use the hopefully-safely-small value of 100kB as the compiled-in +# default for max_stack_depth. InitializeGUCOptions will increase it +# if possible, depending on the actual platform-specific stack limit. +{ name => 'max_stack_depth', type => 'int', context => 'PGC_SUSET', group => 'RESOURCES_MEM', + short_desc => 'Sets the maximum stack depth, in kilobytes.', + flags => 'GUC_UNIT_KB', + variable => 'max_stack_depth', + boot_val => '100', + min => '100', + max => 'MAX_KILOBYTES', + check_hook => 'check_max_stack_depth', + assign_hook => 'assign_max_stack_depth', +}, + +{ name => 'max_standby_archive_delay', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Sets the maximum delay before canceling queries when a hot standby server is processing archived WAL data.', + long_desc => '-1 means wait forever.', + flags => 'GUC_UNIT_MS', + variable => 'max_standby_archive_delay', + boot_val => '30 * 1000', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'max_standby_streaming_delay', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Sets the maximum delay before canceling queries when a hot standby server is processing streamed WAL data.', + long_desc => '-1 means wait forever.', + flags => 'GUC_UNIT_MS', + variable => 'max_standby_streaming_delay', + boot_val => '30 * 1000', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'max_sync_workers_per_subscription', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_SUBSCRIBERS', + short_desc => 'Maximum number of workers per subscription for synchronizing tables and sequences.', + variable => 'max_sync_workers_per_subscription', + boot_val => '2', + min => '0', + max => 'MAX_BACKENDS', +}, + +{ name => 'max_wal_senders', type => 'int', context => 'PGC_POSTMASTER', group => 'REPLICATION_SENDING', + short_desc => 'Sets the maximum number of simultaneously running WAL sender processes.', + variable => 'max_wal_senders', + boot_val => '10', + min => '0', + max => 'MAX_BACKENDS', +}, + +{ name => 'max_wal_size', type => 'int', context => 'PGC_SIGHUP', group => 'WAL_CHECKPOINTS', + short_desc => 'Sets the WAL size that triggers a checkpoint.', + flags => 'GUC_UNIT_MB', + variable => 'max_wal_size_mb', + boot_val => 'DEFAULT_MAX_WAL_SEGS * (DEFAULT_XLOG_SEG_SIZE / (1024 * 1024))', + min => '2', + max => 'MAX_KILOBYTES', + assign_hook => 'assign_max_wal_size', +}, + +{ name => 'max_worker_processes', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_WORKER_PROCESSES', + short_desc => 'Maximum number of concurrent worker processes.', + variable => 'max_worker_processes', + boot_val => '8', + min => '0', + max => 'MAX_BACKENDS', +}, + +{ name => 'md5_password_warnings', type => 'bool', context => 'PGC_USERSET', group => 'CONN_AUTH_AUTH', + short_desc => 'Enables deprecation warnings for MD5 passwords.', + variable => 'md5_password_warnings', + boot_val => 'true', +}, + +{ name => 'min_dynamic_shared_memory', type => 'int', context => 
'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Amount of dynamic shared memory reserved at startup.', + flags => 'GUC_UNIT_MB', + variable => 'min_dynamic_shared_memory', + boot_val => '0', + min => '0', + max => '(int) Min((size_t) INT_MAX, SIZE_MAX / (1024 * 1024))', +}, + +{ name => 'min_eager_agg_group_size', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the minimum average group size required to consider applying eager aggregation.', + flags => 'GUC_EXPLAIN', + variable => 'min_eager_agg_group_size', + boot_val => '8.0', + min => '0.0', + max => 'DBL_MAX', +}, + +{ name => 'min_parallel_index_scan_size', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the minimum amount of index data for a parallel scan.', + long_desc => 'If the planner estimates that it will read a number of index pages too small to reach this limit, a parallel scan will not be considered.', + flags => 'GUC_UNIT_BLOCKS | GUC_EXPLAIN', + variable => 'min_parallel_index_scan_size', + boot_val => '(512 * 1024) / BLCKSZ', + min => '0', + max => 'INT_MAX / 3', +}, + +{ name => 'min_parallel_table_scan_size', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the minimum amount of table data for a parallel scan.', + long_desc => 'If the planner estimates that it will read a number of table pages too small to reach this limit, a parallel scan will not be considered.', + flags => 'GUC_UNIT_BLOCKS | GUC_EXPLAIN', + variable => 'min_parallel_table_scan_size', + boot_val => '(8 * 1024 * 1024) / BLCKSZ', + min => '0', + max => 'INT_MAX / 3', +}, + +{ name => 'min_wal_size', type => 'int', context => 'PGC_SIGHUP', group => 'WAL_CHECKPOINTS', + short_desc => 'Sets the minimum size to shrink the WAL to.', + flags => 'GUC_UNIT_MB', + variable => 'min_wal_size_mb', + boot_val => 'DEFAULT_MIN_WAL_SEGS * (DEFAULT_XLOG_SEG_SIZE / (1024 * 1024))', + min => '2', + max => 'MAX_KILOBYTES', +}, + +{ name => 'multixact_member_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the size of the dedicated buffer pool used for the MultiXact member cache.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'multixact_member_buffers', + boot_val => '32', + min => '16', + max => 'SLRU_MAX_ALLOWED_BUFFERS', + check_hook => 'check_multixact_member_buffers', +}, + +{ name => 'multixact_offset_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the size of the dedicated buffer pool used for the MultiXact offset cache.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'multixact_offset_buffers', + boot_val => '16', + min => '16', + max => 'SLRU_MAX_ALLOWED_BUFFERS', + check_hook => 'check_multixact_offset_buffers', +}, + +{ name => 'notify_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the size of the dedicated buffer pool used for the LISTEN/NOTIFY message cache.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'notify_buffers', + boot_val => '16', + min => '16', + max => 'SLRU_MAX_ALLOWED_BUFFERS', + check_hook => 'check_notify_buffers', +}, + +{ name => 'num_os_semaphores', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the number of semaphores required for the server.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED', + variable => 'num_os_semaphores', + boot_val => '0', + min => '0', + max 
=> 'INT_MAX', +}, + +{ name => 'oauth_validator_libraries', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_AUTH', + short_desc => 'Lists libraries that may be called to validate OAuth v2 bearer tokens.', + flags => 'GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY', + variable => 'oauth_validator_libraries_string', + boot_val => '""', +}, + +# this is undocumented because not exposed in a standard build +{ name => 'optimize_bounded_sort', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables bounded sorting using heap sort.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_EXPLAIN', + variable => 'optimize_bounded_sort', + boot_val => 'true', + ifdef => 'DEBUG_BOUNDED_SORT', +}, + +{ name => 'parallel_leader_participation', type => 'bool', context => 'PGC_USERSET', group => 'RESOURCES_WORKER_PROCESSES', + short_desc => 'Controls whether Gather and Gather Merge also run subplans.', + long_desc => 'Should gather nodes also run subplans or just gather tuples?', + flags => 'GUC_EXPLAIN', + variable => 'parallel_leader_participation', + boot_val => 'true', +}, + +{ name => 'parallel_setup_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the planner\'s estimate of the cost of starting up worker processes for parallel query.', + flags => 'GUC_EXPLAIN', + variable => 'parallel_setup_cost', + boot_val => 'DEFAULT_PARALLEL_SETUP_COST', + min => '0', + max => 'DBL_MAX', +}, + +{ name => 'parallel_tuple_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the planner\'s estimate of the cost of passing each tuple (row) from worker to leader backend.', + flags => 'GUC_EXPLAIN', + variable => 'parallel_tuple_cost', + boot_val => 'DEFAULT_PARALLEL_TUPLE_COST', + min => '0', + max => 'DBL_MAX', +}, + +{ name => 'password_encryption', type => 'enum', context => 'PGC_USERSET', group => 'CONN_AUTH_AUTH', + short_desc => 'Chooses the algorithm for encrypting passwords.', + variable => 'Password_encryption', + boot_val => 'PASSWORD_TYPE_SCRAM_SHA_256', + options => 'password_encryption_options', +}, + +{ name => 'plan_cache_mode', type => 'enum', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', + short_desc => 'Controls the planner\'s selection of custom or generic plan.', + long_desc => 'Prepared statements can have custom and generic plans, and the planner will attempt to choose which is better. 
This can be set to override the default behavior.', + flags => 'GUC_EXPLAIN', + variable => 'plan_cache_mode', + boot_val => 'PLAN_CACHE_MODE_AUTO', + options => 'plan_cache_mode_options', +}, + +{ name => 'port', type => 'int', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Sets the TCP port the server listens on.', + variable => 'PostPortNumber', + boot_val => 'DEF_PGPORT', + min => '1', + max => '65535', +}, + +{ name => 'post_auth_delay', type => 'int', context => 'PGC_BACKEND', group => 'DEVELOPER_OPTIONS', + short_desc => 'Sets the amount of time to wait after authentication on connection startup.', + long_desc => 'This allows attaching a debugger to the process.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_UNIT_S', + variable => 'PostAuthDelay', + boot_val => '0', + min => '0', + max => 'INT_MAX / 1000000', +}, + +# Not for general use +{ name => 'pre_auth_delay', type => 'int', context => 'PGC_SIGHUP', group => 'DEVELOPER_OPTIONS', + short_desc => 'Sets the amount of time to wait before authentication on connection startup.', + long_desc => 'This allows attaching a debugger to the process.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_UNIT_S', + variable => 'PreAuthDelay', + boot_val => '0', + min => '0', + max => '60', +}, + +{ name => 'primary_conninfo', type => 'string', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Sets the connection string to be used to connect to the sending server.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'PrimaryConnInfo', + boot_val => '""', +}, + +{ name => 'primary_slot_name', type => 'string', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Sets the name of the replication slot to use on the sending server.', + variable => 'PrimarySlotName', + boot_val => '""', + check_hook => 'check_primary_slot_name', +}, + +{ name => 'quote_all_identifiers', type => 'bool', context => 'PGC_USERSET', group => 'COMPAT_OPTIONS_PREVIOUS', + short_desc => 'When generating SQL fragments, quote all identifiers.', + variable => 'quote_all_identifiers', + boot_val => 'false', +}, + +{ name => 'random_page_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the planner\'s estimate of the cost of a nonsequentially fetched disk page.', + flags => 'GUC_EXPLAIN', + variable => 'random_page_cost', + boot_val => 'DEFAULT_RANDOM_PAGE_COST', + min => '0', + max => 'DBL_MAX', +}, + +{ name => 'recovery_end_command', type => 'string', context => 'PGC_SIGHUP', group => 'WAL_ARCHIVE_RECOVERY', + short_desc => 'Sets the shell command that will be executed once at the end of recovery.', + variable => 'recoveryEndCommand', + boot_val => '""', +}, + +{ name => 'recovery_init_sync_method', type => 'enum', context => 'PGC_SIGHUP', group => 'ERROR_HANDLING_OPTIONS', + short_desc => 'Sets the method for synchronizing the data directory before crash recovery.', + variable => 'recovery_init_sync_method', + boot_val => 'DATA_DIR_SYNC_METHOD_FSYNC', + options => 'recovery_init_sync_method_options', +}, + +{ name => 'recovery_min_apply_delay', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Sets the minimum delay for applying changes during recovery.', + flags => 'GUC_UNIT_MS', + variable => 'recovery_min_apply_delay', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'recovery_prefetch', type => 'enum', context => 'PGC_SIGHUP', group => 'WAL_RECOVERY', + short_desc => 'Prefetch referenced blocks during recovery.', + 
long_desc => 'Look ahead in the WAL to find references to uncached data.', + variable => 'recovery_prefetch', + boot_val => 'RECOVERY_PREFETCH_TRY', + options => 'recovery_prefetch_options', + check_hook => 'check_recovery_prefetch', + assign_hook => 'assign_recovery_prefetch', +}, + +{ name => 'recovery_target', type => 'string', context => 'PGC_POSTMASTER', group => 'WAL_RECOVERY_TARGET', + short_desc => 'Set to "immediate" to end recovery as soon as a consistent state is reached.', + variable => 'recovery_target_string', + boot_val => '""', + check_hook => 'check_recovery_target', + assign_hook => 'assign_recovery_target', +}, + +{ name => 'recovery_target_action', type => 'enum', context => 'PGC_POSTMASTER', group => 'WAL_RECOVERY_TARGET', + short_desc => 'Sets the action to perform upon reaching the recovery target.', + variable => 'recoveryTargetAction', + boot_val => 'RECOVERY_TARGET_ACTION_PAUSE', + options => 'recovery_target_action_options', +}, + +{ name => 'recovery_target_inclusive', type => 'bool', context => 'PGC_POSTMASTER', group => 'WAL_RECOVERY_TARGET', + short_desc => 'Sets whether to include or exclude transaction with recovery target.', + variable => 'recoveryTargetInclusive', + boot_val => 'true', +}, + +{ name => 'recovery_target_lsn', type => 'string', context => 'PGC_POSTMASTER', group => 'WAL_RECOVERY_TARGET', + short_desc => 'Sets the LSN of the write-ahead log location up to which recovery will proceed.', + variable => 'recovery_target_lsn_string', + boot_val => '""', + check_hook => 'check_recovery_target_lsn', + assign_hook => 'assign_recovery_target_lsn', +}, + +{ name => 'recovery_target_name', type => 'string', context => 'PGC_POSTMASTER', group => 'WAL_RECOVERY_TARGET', + short_desc => 'Sets the named restore point up to which recovery will proceed.', + variable => 'recovery_target_name_string', + boot_val => '""', + check_hook => 'check_recovery_target_name', + assign_hook => 'assign_recovery_target_name', +}, + +{ name => 'recovery_target_time', type => 'string', context => 'PGC_POSTMASTER', group => 'WAL_RECOVERY_TARGET', + short_desc => 'Sets the time stamp up to which recovery will proceed.', + variable => 'recovery_target_time_string', + boot_val => '""', + check_hook => 'check_recovery_target_time', + assign_hook => 'assign_recovery_target_time', +}, + +{ name => 'recovery_target_timeline', type => 'string', context => 'PGC_POSTMASTER', group => 'WAL_RECOVERY_TARGET', + short_desc => 'Specifies the timeline to recover into.', + variable => 'recovery_target_timeline_string', + boot_val => '"latest"', + check_hook => 'check_recovery_target_timeline', + assign_hook => 'assign_recovery_target_timeline', +}, + +{ name => 'recovery_target_xid', type => 'string', context => 'PGC_POSTMASTER', group => 'WAL_RECOVERY_TARGET', + short_desc => 'Sets the transaction ID up to which recovery will proceed.', + variable => 'recovery_target_xid_string', + boot_val => '""', + check_hook => 'check_recovery_target_xid', + assign_hook => 'assign_recovery_target_xid', +}, + +{ name => 'recursive_worktable_factor', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', + short_desc => 'Sets the planner\'s estimate of the average size of a recursive query\'s working table.', + flags => 'GUC_EXPLAIN', + variable => 'recursive_worktable_factor', + boot_val => 'DEFAULT_RECURSIVE_WORKTABLE_FACTOR', + min => '0.001', + max => '1000000.0', +}, + +{ name => 'remove_temp_files_after_crash', type => 'bool', context => 'PGC_SIGHUP', group => 'DEVELOPER_OPTIONS', + 
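The recovery_target_* entries above consistently pair a check_hook with an assign_hook, which suggests a validate-then-commit split: the check step parses and rejects bad input without side effects, and only then does the assign step install the derived state in the live variable. A schematic stand-alone version of that pattern (all names and signatures here are invented for illustration, not the real hook API):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy illustration of the check/assign split; names are invented. */
    static unsigned long long target_lsn;   /* the "live" variable */

    /* check phase: validate only, never touch the live state */
    static bool
    check_target_lsn(const char *newval, unsigned long long *parsed)
    {
        unsigned int hi, lo;

        if (sscanf(newval, "%X/%X", &hi, &lo) != 2)
            return false;       /* reject; live value stays unchanged */
        *parsed = ((unsigned long long) hi << 32) | lo;
        return true;
    }

    /* assign phase: commit the already-validated value */
    static void
    assign_target_lsn(unsigned long long parsed)
    {
        target_lsn = parsed;
    }

    int
    main(void)
    {
        unsigned long long parsed;

        if (check_target_lsn("0/15D690", &parsed))
            assign_target_lsn(parsed);
        printf("target_lsn = %llX\n", target_lsn);
        return 0;
    }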
short_desc => 'Remove temporary files after backend crash.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'remove_temp_files_after_crash', + boot_val => 'true', +}, + +{ name => 'reserved_connections', type => 'int', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Sets the number of connection slots reserved for roles with privileges of pg_use_reserved_connections.', + variable => 'ReservedConnections', + boot_val => '0', + min => '0', + max => 'MAX_BACKENDS', +}, + +{ name => 'restart_after_crash', type => 'bool', context => 'PGC_SIGHUP', group => 'ERROR_HANDLING_OPTIONS', + short_desc => 'Reinitialize server after backend crash.', + variable => 'restart_after_crash', + boot_val => 'true', +}, + +{ name => 'restore_command', type => 'string', context => 'PGC_SIGHUP', group => 'WAL_ARCHIVE_RECOVERY', + short_desc => 'Sets the shell command that will be called to retrieve an archived WAL file.', + variable => 'recoveryRestoreCommand', + boot_val => '""', +}, + +{ name => 'restrict_nonsystem_relation_kind', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Prohibits access to non-system relations of specified kinds.', + flags => 'GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE', + variable => 'restrict_nonsystem_relation_kind_string', + boot_val => '""', + check_hook => 'check_restrict_nonsystem_relation_kind', + assign_hook => 'assign_restrict_nonsystem_relation_kind', +}, + +# Not for general use --- used by SET ROLE +{ name => 'role', type => 'string', context => 'PGC_USERSET', group => 'UNGROUPED', + short_desc => 'Sets the current role.', + flags => 'GUC_IS_NAME | GUC_NO_SHOW_ALL | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_NOT_WHILE_SEC_REST', + variable => 'role_string', + boot_val => '"none"', + check_hook => 'check_role', + assign_hook => 'assign_role', + show_hook => 'show_role', +}, + +{ name => 'row_security', type => 'bool', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Enables row security.', + long_desc => 'When enabled, row security will be applied to all users.', + variable => 'row_security', + boot_val => 'true', +}, + +{ name => 'scram_iterations', type => 'int', context => 'PGC_USERSET', group => 'CONN_AUTH_AUTH', + short_desc => 'Sets the iteration count for SCRAM secret generation.', + flags => 'GUC_REPORT', + variable => 'scram_sha_256_iterations', + boot_val => 'SCRAM_SHA_256_DEFAULT_ITERATIONS', + min => '1', + max => 'INT_MAX', +}, + +{ name => 'search_path', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the schema search order for names that are not schema-qualified.', + flags => 'GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_EXPLAIN | GUC_REPORT', + variable => 'namespace_search_path', + boot_val => '"\"$user\", public"', + check_hook => 'check_search_path', + assign_hook => 'assign_search_path', +}, + +{ name => 'seed', type => 'real', context => 'PGC_USERSET', group => 'UNGROUPED', + short_desc => 'Sets the seed for random-number generation.', + flags => 'GUC_NO_SHOW_ALL | GUC_NO_RESET | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'phony_random_seed', + boot_val => '0.0', + min => '-1.0', + max => '1.0', + check_hook => 'check_random_seed', + assign_hook => 'assign_random_seed', + show_hook => 'show_random_seed', +}, + +{ name => 'segment_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the number of pages per disk file.', 
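search_path above carries GUC_LIST_INPUT | GUC_LIST_QUOTE, meaning its value is a comma-separated list whose elements may themselves need quoting; its boot value, "$user", public, shows both cases. A toy splitter for the unquoted part of that job, assuming comma separators only (real parsing must also honor the quoting):

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        /* GUC_LIST_INPUT means a comma-separated list; this naive split
         * ignores the quote handling that GUC_LIST_QUOTE implies. */
        char value[] = "\"$user\", public";
        char *item = strtok(value, ",");

        while (item != NULL)
        {
            while (*item == ' ')
                item++;             /* trim leading spaces */
            printf("element: %s\n", item);
            item = strtok(NULL, ",");
        }
        return 0;
    }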
+ flags => 'GUC_UNIT_BLOCKS | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'segment_size', + boot_val => 'RELSEG_SIZE', + min => 'RELSEG_SIZE', + max => 'RELSEG_SIZE', +}, + +{ name => 'send_abort_for_crash', type => 'bool', context => 'PGC_SIGHUP', group => 'DEVELOPER_OPTIONS', + short_desc => 'Send SIGABRT not SIGQUIT to child processes after backend crash.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'send_abort_for_crash', + boot_val => 'false', +}, + +{ name => 'send_abort_for_kill', type => 'bool', context => 'PGC_SIGHUP', group => 'DEVELOPER_OPTIONS', + short_desc => 'Send SIGABRT not SIGKILL to stuck child processes.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'send_abort_for_kill', + boot_val => 'false', +}, + + +{ name => 'seq_page_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the planner\'s estimate of the cost of a sequentially fetched disk page.', + flags => 'GUC_EXPLAIN', + variable => 'seq_page_cost', + boot_val => 'DEFAULT_SEQ_PAGE_COST', + min => '0', + max => 'DBL_MAX', +}, + +{ name => 'serializable_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the size of the dedicated buffer pool used for the serializable transaction cache.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'serializable_buffers', + boot_val => '32', + min => '16', + max => 'SLRU_MAX_ALLOWED_BUFFERS', + check_hook => 'check_serial_buffers', +}, + +# Can't be set in postgresql.conf +{ name => 'server_encoding', type => 'string', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the server (database) character set encoding.', + flags => 'GUC_IS_NAME | GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'server_encoding_string', + boot_val => '"SQL_ASCII"', +}, + +# Can't be set in postgresql.conf +{ name => 'server_version', type => 'string', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the server version.', + flags => 'GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'server_version_string', + boot_val => 'PG_VERSION', +}, + +# Can't be set in postgresql.conf +{ name => 'server_version_num', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the server version as an integer.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'server_version_num', + boot_val => 'PG_VERSION_NUM', + min => 'PG_VERSION_NUM', + max => 'PG_VERSION_NUM', +}, + +# Not for general use --- used by SET SESSION AUTHORIZATION +{ name => 'session_authorization', type => 'string', context => 'PGC_USERSET', group => 'UNGROUPED', + short_desc => 'Sets the session user name.', + flags => 'GUC_IS_NAME | GUC_REPORT | GUC_NO_SHOW_ALL | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_NOT_WHILE_SEC_REST', + variable => 'session_authorization_string', + boot_val => 'NULL', + check_hook => 'check_session_authorization', + assign_hook => 'assign_session_authorization', +}, + +{ name => 'session_preload_libraries', type => 'string', context => 'PGC_SUSET', group => 'CLIENT_CONN_PRELOAD', + short_desc => 'Lists shared libraries to preload into each backend.', + flags => 'GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY', + variable => 'session_preload_libraries_string', + boot_val => '""', +}, + +{ name => 'session_replication_role', type => 'enum', context => 'PGC_SUSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the session\'s 
behavior for triggers and rewrite rules.', + variable => 'SessionReplicationRole', + boot_val => 'SESSION_REPLICATION_ROLE_ORIGIN', + options => 'session_replication_role_options', + assign_hook => 'assign_session_replication_role', +}, + +# We sometimes multiply the number of shared buffers by two without +# checking for overflow, so we mustn't allow more than INT_MAX / 2. +{ name => 'shared_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the number of shared memory buffers used by the server.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'NBuffers', + boot_val => '16384', + min => '16', + max => 'INT_MAX / 2', +}, + +{ name => 'shared_memory_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the size of the server\'s main shared memory area (rounded up to the nearest MB).', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_UNIT_MB | GUC_RUNTIME_COMPUTED', + variable => 'shared_memory_size_mb', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'shared_memory_size_in_huge_pages', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the number of huge pages needed for the main shared memory area.', + long_desc => '-1 means huge pages are not supported.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED', + variable => 'shared_memory_size_in_huge_pages', + boot_val => '-1', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'shared_memory_type', type => 'enum', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Selects the shared memory implementation used for the main shared memory region.', + variable => 'shared_memory_type', + boot_val => 'DEFAULT_SHARED_MEMORY_TYPE', + options => 'shared_memory_options', +}, + +{ name => 'shared_preload_libraries', type => 'string', context => 'PGC_POSTMASTER', group => 'CLIENT_CONN_PRELOAD', + short_desc => 'Lists shared libraries to preload into server.', + flags => 'GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY', + variable => 'shared_preload_libraries_string', + boot_val => '""', +}, + +{ name => 'ssl', type => 'bool', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Enables SSL connections.', + variable => 'EnableSSL', + boot_val => 'false', + check_hook => 'check_ssl', +}, + +{ name => 'ssl_ca_file', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Location of the SSL certificate authority file.', + variable => 'ssl_ca_file', + boot_val => '""', +}, + +{ name => 'ssl_cert_file', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Location of the SSL server certificate file.', + variable => 'ssl_cert_file', + boot_val => '"server.crt"', +}, + +{ name => 'ssl_ciphers', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Sets the list of allowed TLSv1.2 (and lower) ciphers.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'SSLCipherList', + boot_val => 'DEFAULT_SSL_CIPHERS', +}, + +{ name => 'ssl_crl_dir', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Location of the SSL certificate revocation list directory.', + variable => 'ssl_crl_dir', + boot_val => '""', +}, + +{ name => 'ssl_crl_file', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Location of the SSL certificate revocation list file.', + variable => 'ssl_crl_file', + 
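The comment ahead of the shared_buffers entry above justifies its max of INT_MAX / 2: signed int overflow is undefined behavior in C, so any code path that doubles NBuffers without checking is only correct because the GUC bound has already been enforced. A small demonstration:

    #include <limits.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Doubling an int is only safe when the value is at most
         * INT_MAX / 2, which is exactly the max shared_buffers allows. */
        int max_nbuffers = INT_MAX / 2;     /* 1073741823 */

        printf("2 * max: %d (still representable)\n", 2 * max_nbuffers);
        /* with max_nbuffers + 1, the doubling would exceed INT_MAX */
        return 0;
    }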
boot_val => '""', +}, + +{ name => 'ssl_dh_params_file', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Location of the SSL DH parameters file.', + long_desc => 'An empty string means use compiled-in default parameters.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'ssl_dh_params_file', + boot_val => '""', +}, + +{ name => 'ssl_groups', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Sets the group(s) to use for Diffie-Hellman key exchange.', + long_desc => 'Multiple groups can be specified using a colon-separated list.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'SSLECDHCurve', + boot_val => 'DEFAULT_SSL_GROUPS', +}, + +{ name => 'ssl_key_file', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Location of the SSL server private key file.', + variable => 'ssl_key_file', + boot_val => '"server.key"', +}, + +{ name => 'ssl_library', type => 'string', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the name of the SSL library.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'ssl_library', + boot_val => 'SSL_LIBRARY', +}, + +{ name => 'ssl_max_protocol_version', type => 'enum', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Sets the maximum SSL/TLS protocol version to use.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'ssl_max_protocol_version', + boot_val => 'PG_TLS_ANY', + options => 'ssl_protocol_versions_info', +}, + +{ name => 'ssl_min_protocol_version', type => 'enum', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Sets the minimum SSL/TLS protocol version to use.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'ssl_min_protocol_version', + boot_val => 'PG_TLS1_2_VERSION', + options => 'ssl_protocol_versions_info + 1', # don't allow PG_TLS_ANY +}, + +{ name => 'ssl_passphrase_command', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Command to obtain passphrases for SSL.', + long_desc => 'An empty string means use the built-in prompting mechanism.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'ssl_passphrase_command', + boot_val => '""', +}, + +{ name => 'ssl_passphrase_command_supports_reload', type => 'bool', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Controls whether "ssl_passphrase_command" is called during server reload.', + variable => 'ssl_passphrase_command_supports_reload', + boot_val => 'false', +}, + +{ name => 'ssl_prefer_server_ciphers', type => 'bool', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Give priority to server ciphersuite order.', + variable => 'SSLPreferServerCiphers', + boot_val => 'true', +}, + +{ name => 'ssl_renegotiation_limit', type => 'int', context => 'PGC_USERSET', group => 'COMPAT_OPTIONS_PREVIOUS', + short_desc => 'SSL renegotiation is no longer supported; this can only be 0.', + flags => 'GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'ssl_renegotiation_limit', + boot_val => '0', + min => '0', + max => '0', +}, + +{ name => 'ssl_tls13_ciphers', type => 'string', context => 'PGC_SIGHUP', group => 'CONN_AUTH_SSL', + short_desc => 'Sets the list of allowed TLSv1.3 cipher suites.', + long_desc => 'An empty string means use the default cipher suites.', + flags => 'GUC_SUPERUSER_ONLY', + variable => 'SSLCipherSuites', + boot_val => '""', +}, + +{ name => 'standard_conforming_strings', type => 'bool', context => 'PGC_USERSET', 
group => 'COMPAT_OPTIONS_PREVIOUS', + short_desc => 'Causes \'...\' strings to treat backslashes literally.', + flags => 'GUC_REPORT', + variable => 'standard_conforming_strings', + boot_val => 'true', +}, + +{ name => 'statement_timeout', type => 'int', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the maximum allowed duration of any statement.', + long_desc => '0 disables the timeout.', + flags => 'GUC_UNIT_MS', + variable => 'StatementTimeout', + boot_val => '0', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'stats_fetch_consistency', type => 'enum', context => 'PGC_USERSET', group => 'STATS_CUMULATIVE', + short_desc => 'Sets the consistency of accesses to statistics data.', + variable => 'pgstat_fetch_consistency', + boot_val => 'PGSTAT_FETCH_CONSISTENCY_CACHE', + options => 'stats_fetch_consistency', + assign_hook => 'assign_stats_fetch_consistency', +}, + +{ name => 'subtransaction_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the size of the dedicated buffer pool used for the subtransaction cache.', + long_desc => '0 means use a fraction of "shared_buffers".', + flags => 'GUC_UNIT_BLOCKS', + variable => 'subtransaction_buffers', + boot_val => '0', + min => '0', + max => 'SLRU_MAX_ALLOWED_BUFFERS', + check_hook => 'check_subtrans_buffers', +}, + +{ name => 'summarize_wal', type => 'bool', context => 'PGC_SIGHUP', group => 'WAL_SUMMARIZATION', + short_desc => 'Starts the WAL summarizer process to enable incremental backup.', + variable => 'summarize_wal', + boot_val => 'false', +}, + +# see max_connections +{ name => 'superuser_reserved_connections', type => 'int', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Sets the number of connection slots reserved for superusers.', + variable => 'SuperuserReservedConnections', + boot_val => '3', + min => '0', + max => 'MAX_BACKENDS', +}, + +{ name => 'sync_replication_slots', type => 'bool', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Enables a physical standby to synchronize logical failover replication slots from the primary server.', + variable => 'sync_replication_slots', + boot_val => 'false', +}, + +{ name => 'synchronize_seqscans', type => 'bool', context => 'PGC_USERSET', group => 'COMPAT_OPTIONS_PREVIOUS', + short_desc => 'Enables synchronized sequential scans.', + variable => 'synchronize_seqscans', + boot_val => 'true', +}, + +{ name => 'synchronized_standby_slots', type => 'string', context => 'PGC_SIGHUP', group => 'REPLICATION_PRIMARY', + short_desc => 'Lists streaming replication standby server replication slot names that logical WAL sender processes will wait for.', + long_desc => 'Logical WAL sender processes will send decoded changes to output plugins only after the specified replication slots have confirmed receiving WAL.', + flags => 'GUC_LIST_INPUT', + variable => 'synchronized_standby_slots', + boot_val => '""', + check_hook => 'check_synchronized_standby_slots', + assign_hook => 'assign_synchronized_standby_slots', +}, + +{ name => 'synchronous_commit', type => 'enum', context => 'PGC_USERSET', group => 'WAL_SETTINGS', + short_desc => 'Sets the current transaction\'s synchronization level.', + variable => 'synchronous_commit', + boot_val => 'SYNCHRONOUS_COMMIT_ON', + options => 'synchronous_commit_options', + assign_hook => 'assign_synchronous_commit', +}, + +{ name => 'synchronous_standby_names', type => 'string', context => 'PGC_SIGHUP', group => 
'REPLICATION_PRIMARY', + short_desc => 'Number of synchronous standbys and list of names of potential synchronous ones.', + flags => 'GUC_LIST_INPUT', + variable => 'SyncRepStandbyNames', + boot_val => '""', + check_hook => 'check_synchronous_standby_names', + assign_hook => 'assign_synchronous_standby_names', +}, + +{ name => 'syslog_facility', type => 'enum', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Sets the syslog "facility" to be used when syslog enabled.', + variable => 'syslog_facility', + boot_val => 'DEFAULT_SYSLOG_FACILITY', + options => 'syslog_facility_options', + assign_hook => 'assign_syslog_facility', +}, + +{ name => 'syslog_ident', type => 'string', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Sets the program name used to identify PostgreSQL messages in syslog.', + variable => 'syslog_ident_str', + boot_val => '"postgres"', + assign_hook => 'assign_syslog_ident', +}, + +{ name => 'syslog_sequence_numbers', type => 'bool', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Add sequence number to syslog messages to avoid duplicate suppression.', + variable => 'syslog_sequence_numbers', + boot_val => 'true', +}, + +{ name => 'syslog_split_messages', type => 'bool', context => 'PGC_SIGHUP', group => 'LOGGING_WHERE', + short_desc => 'Split messages sent to syslog by lines and to fit into 1024 bytes.', + variable => 'syslog_split_messages', + boot_val => 'true', +}, + +{ name => 'tcp_keepalives_count', type => 'int', context => 'PGC_USERSET', group => 'CONN_AUTH_TCP', + short_desc => 'Maximum number of TCP keepalive retransmits.', + long_desc => 'Number of consecutive keepalive retransmits that can be lost before a connection is considered dead. 0 means use the system default.', + variable => 'tcp_keepalives_count', + boot_val => '0', + min => '0', + max => 'INT_MAX', + assign_hook => 'assign_tcp_keepalives_count', + show_hook => 'show_tcp_keepalives_count', +}, + +{ name => 'tcp_keepalives_idle', type => 'int', context => 'PGC_USERSET', group => 'CONN_AUTH_TCP', + short_desc => 'Time between issuing TCP keepalives.', + long_desc => '0 means use the system default.', + flags => 'GUC_UNIT_S', + variable => 'tcp_keepalives_idle', + boot_val => '0', + min => '0', + max => 'INT_MAX', + assign_hook => 'assign_tcp_keepalives_idle', + show_hook => 'show_tcp_keepalives_idle', +}, + +{ name => 'tcp_keepalives_interval', type => 'int', context => 'PGC_USERSET', group => 'CONN_AUTH_TCP', + short_desc => 'Time between TCP keepalive retransmits.', + long_desc => '0 means use the system default.', + flags => 'GUC_UNIT_S', + variable => 'tcp_keepalives_interval', + boot_val => '0', + min => '0', + max => 'INT_MAX', + assign_hook => 'assign_tcp_keepalives_interval', + show_hook => 'show_tcp_keepalives_interval', +}, + +{ name => 'tcp_user_timeout', type => 'int', context => 'PGC_USERSET', group => 'CONN_AUTH_TCP', + short_desc => 'TCP user timeout.', + long_desc => '0 means use the system default.', + flags => 'GUC_UNIT_MS', + variable => 'tcp_user_timeout', + boot_val => '0', + min => '0', + max => 'INT_MAX', + assign_hook => 'assign_tcp_user_timeout', + show_hook => 'show_tcp_user_timeout', +}, + +{ name => 'temp_buffers', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_MEM', + short_desc => 'Sets the maximum number of temporary buffers used by each session.', + flags => 'GUC_UNIT_BLOCKS | GUC_EXPLAIN', + variable => 'num_temp_buffers', + boot_val => '1024', + min => '100', + max => 'INT_MAX / 2', + check_hook => 
'check_temp_buffers', +}, + +{ name => 'temp_file_limit', type => 'int', context => 'PGC_SUSET', group => 'RESOURCES_DISK', + short_desc => 'Limits the total size of all temporary files used by each process.', + long_desc => '-1 means no limit.', + flags => 'GUC_UNIT_KB', + variable => 'temp_file_limit', + boot_val => '-1', + min => '-1', + max => 'INT_MAX', +}, + +{ name => 'temp_tablespaces', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the tablespace(s) to use for temporary tables and sort files.', + long_desc => 'An empty string means use the database\'s default tablespace.', + flags => 'GUC_LIST_INPUT | GUC_LIST_QUOTE', + variable => 'temp_tablespaces', + boot_val => '""', + check_hook => 'check_temp_tablespaces', + assign_hook => 'assign_temp_tablespaces', +}, + +{ name => 'TimeZone', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Sets the time zone for displaying and interpreting time stamps.', + flags => 'GUC_REPORT', + variable => 'timezone_string', + boot_val => '"GMT"', + check_hook => 'check_timezone', + assign_hook => 'assign_timezone', + show_hook => 'show_timezone', +}, + +{ name => 'timezone_abbreviations', type => 'string', context => 'PGC_USERSET', group => 'CLIENT_CONN_LOCALE', + short_desc => 'Selects a file of time zone abbreviations.', + variable => 'timezone_abbreviations_string', + boot_val => 'NULL', + check_hook => 'check_timezone_abbreviations', + assign_hook => 'assign_timezone_abbreviations', +}, + +{ name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', + short_desc => 'Logs details of pre-authentication connection handshake.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Trace_connection_negotiation', + boot_val => 'false', +}, + +{ name => 'trace_lock_oidmin', type => 'int', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Sets the minimum OID of tables for tracking locks.', + long_desc => 'Is used to avoid output on system tables.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Trace_lock_oidmin', + boot_val => 'FirstNormalObjectId', + min => '0', + max => 'INT_MAX', + ifdef => 'LOCK_DEBUG', +}, + +{ name => 'trace_lock_table', type => 'int', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Sets the OID of the table with unconditionally lock tracing.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Trace_lock_table', + boot_val => '0', + min => '0', + max => 'INT_MAX', + ifdef => 'LOCK_DEBUG', +}, + +{ name => 'trace_locks', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Emits information about lock usage.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Trace_locks', + boot_val => 'false', + ifdef => 'LOCK_DEBUG', +}, + +{ name => 'trace_lwlocks', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Emits information about lightweight lock usage.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Trace_lwlocks', + boot_val => 'false', + ifdef => 'LOCK_DEBUG', +}, + +{ name => 'trace_notify', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Generates debugging output for LISTEN and NOTIFY.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Trace_notify', + boot_val => 'false', +}, + +{ name => 'trace_sort', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Emit information about resource usage in sorting.', 
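Several of the trace_* entries above declare ifdef => 'LOCK_DEBUG', which presumably instructs the table generator to wrap the emitted entry in a preprocessor guard so the setting only exists in builds compiled with that macro, matching how the hand-written tables handled build-time-only GUCs. The C shape of such a guard:

    #include <stdio.h>

    /* Pass -DLOCK_DEBUG at compile time to include the debug-only state. */
    #ifdef LOCK_DEBUG
    static int Trace_lock_oidmin = 16384;   /* exists only in LOCK_DEBUG builds */
    #endif

    int
    main(void)
    {
    #ifdef LOCK_DEBUG
        printf("trace_lock_oidmin = %d\n", Trace_lock_oidmin);
    #else
        printf("built without LOCK_DEBUG; setting not compiled in\n");
    #endif
        return 0;
    }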
+ flags => 'GUC_NOT_IN_SAMPLE', + variable => 'trace_sort', + boot_val => 'false', +}, + +# this is undocumented because not exposed in a standard build +{ name => 'trace_syncscan', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Generate debugging output for synchronized scanning.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'trace_syncscan', + boot_val => 'false', + ifdef => 'TRACE_SYNCSCAN', +}, + +{ name => 'trace_userlocks', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Emits information about user lock usage.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'Trace_userlocks', + boot_val => 'false', + ifdef => 'LOCK_DEBUG', +}, + +{ name => 'track_activities', type => 'bool', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', + short_desc => 'Collects information about executing commands.', + long_desc => 'Enables the collection of information on the currently executing command of each session, along with the time at which that command began execution.', + variable => 'pgstat_track_activities', + boot_val => 'true', +}, + +{ name => 'track_activity_query_size', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE', + short_desc => 'Sets the size reserved for pg_stat_activity.query, in bytes.', + flags => 'GUC_UNIT_BYTE', + variable => 'pgstat_track_activity_query_size', + boot_val => '1024', + min => '100', + max => '1048576', +}, + +{ name => 'track_commit_timestamp', type => 'bool', context => 'PGC_POSTMASTER', group => 'REPLICATION_SENDING', + short_desc => 'Collects transaction commit time.', + variable => 'track_commit_timestamp', + boot_val => 'false', +}, + +{ name => 'track_cost_delay_timing', type => 'bool', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', + short_desc => 'Collects timing statistics for cost-based vacuum delay.', + variable => 'track_cost_delay_timing', + boot_val => 'false', +}, + +{ name => 'track_counts', type => 'bool', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', + short_desc => 'Collects statistics on database activity.', + variable => 'pgstat_track_counts', + boot_val => 'true', +}, + +{ name => 'track_functions', type => 'enum', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', + short_desc => 'Collects function-level statistics on database activity.', + variable => 'pgstat_track_functions', + boot_val => 'TRACK_FUNC_OFF', + options => 'track_function_options', +}, + +{ name => 'track_io_timing', type => 'bool', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', + short_desc => 'Collects timing statistics for database I/O activity.', + variable => 'track_io_timing', + boot_val => 'false', +}, + +{ name => 'track_wal_io_timing', type => 'bool', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', + short_desc => 'Collects timing statistics for WAL I/O activity.', + variable => 'track_wal_io_timing', + boot_val => 'false', +}, + +{ name => 'transaction_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the size of the dedicated buffer pool used for the transaction status cache.', + long_desc => '0 means use a fraction of "shared_buffers".', + flags => 'GUC_UNIT_BLOCKS', + variable => 'transaction_buffers', + boot_val => '0', + min => '0', + max => 'SLRU_MAX_ALLOWED_BUFFERS', + check_hook => 'check_transaction_buffers', +}, + +{ name => 'transaction_deferrable', type => 'bool', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Whether to defer a read-only 
serializable transaction until it can be executed with no possible serialization failures.', + flags => 'GUC_NO_RESET | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'XactDeferrable', + boot_val => 'false', + check_hook => 'check_transaction_deferrable', +}, + +{ name => 'transaction_isolation', type => 'enum', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the current transaction\'s isolation level.', + flags => 'GUC_NO_RESET | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'XactIsoLevel', + boot_val => 'XACT_READ_COMMITTED', + options => 'isolation_level_options', + check_hook => 'check_transaction_isolation', +}, + +{ name => 'transaction_read_only', type => 'bool', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the current transaction\'s read-only status.', + flags => 'GUC_NO_RESET | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'XactReadOnly', + boot_val => 'false', + check_hook => 'check_transaction_read_only', +}, + +{ name => 'transaction_timeout', type => 'int', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets the maximum allowed duration of any transaction within a session (not a prepared transaction).', + long_desc => '0 disables the timeout.', + flags => 'GUC_UNIT_MS', + variable => 'TransactionTimeout', + boot_val => '0', + min => '0', + max => 'INT_MAX', + assign_hook => 'assign_transaction_timeout', +}, + +{ name => 'transform_null_equals', type => 'bool', context => 'PGC_USERSET', group => 'COMPAT_OPTIONS_OTHER', + short_desc => 'Treats "expr=NULL" as "expr IS NULL".', + long_desc => 'When turned on, expressions of the form expr = NULL (or NULL = expr) are treated as expr IS NULL, that is, they return true if expr evaluates to the null value, and false otherwise. The correct behavior of expr = NULL is to always return null (unknown).', + variable => 'Transform_null_equals', + boot_val => 'false', +}, + +{ name => 'unix_socket_directories', type => 'string', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Sets the directories where Unix-domain sockets will be created.', + flags => 'GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY', + variable => 'Unix_socket_directories', + boot_val => 'DEFAULT_PGSOCKET_DIR', +}, + +{ name => 'unix_socket_group', type => 'string', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Sets the owning group of the Unix-domain socket.', + long_desc => 'The owning user of the socket is always the user that starts the server. An empty string means use the user\'s default group.', + variable => 'Unix_socket_group', + boot_val => '""', +}, + +{ name => 'unix_socket_permissions', type => 'int', context => 'PGC_POSTMASTER', group => 'CONN_AUTH_SETTINGS', + short_desc => 'Sets the access permissions of the Unix-domain socket.', + long_desc => 'Unix-domain sockets use the usual Unix file system permission set. The parameter value is expected to be a numeric mode specification in the form accepted by the chmod and umask system calls. 
(To use the customary octal format the number must start with a 0 (zero).)', + variable => 'Unix_socket_permissions', + boot_val => '0777', + min => '0000', + max => '0777', + show_hook => 'show_unix_socket_permissions', +}, + +{ name => 'update_process_title', type => 'bool', context => 'PGC_SUSET', group => 'PROCESS_TITLE', + short_desc => 'Updates the process title to show the active SQL command.', + long_desc => 'Enables updating of the process title every time a new SQL command is received by the server.', + variable => 'update_process_title', + boot_val => 'DEFAULT_UPDATE_PROCESS_TITLE', +}, + +{ name => 'vacuum_buffer_usage_limit', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_MEM', + short_desc => 'Sets the buffer pool size for VACUUM, ANALYZE, and autovacuum.', + flags => 'GUC_UNIT_KB', + variable => 'VacuumBufferUsageLimit', + boot_val => '2048', + min => '0', + max => 'MAX_BAS_VAC_RING_SIZE_KB', + check_hook => 'check_vacuum_buffer_usage_limit', +}, + +{ name => 'vacuum_cost_delay', type => 'real', context => 'PGC_USERSET', group => 'VACUUM_COST_DELAY', + short_desc => 'Vacuum cost delay in milliseconds.', + flags => 'GUC_UNIT_MS', + variable => 'VacuumCostDelay', + boot_val => '0', + min => '0', + max => '100', +}, + +{ name => 'vacuum_cost_limit', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_COST_DELAY', + short_desc => 'Vacuum cost amount available before napping.', + variable => 'VacuumCostLimit', + boot_val => '200', + min => '1', + max => '10000', +}, + +{ name => 'vacuum_cost_page_dirty', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_COST_DELAY', + short_desc => 'Vacuum cost for a page dirtied by vacuum.', + variable => 'VacuumCostPageDirty', + boot_val => '20', + min => '0', + max => '10000', +}, + +{ name => 'vacuum_cost_page_hit', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_COST_DELAY', + short_desc => 'Vacuum cost for a page found in the buffer cache.', + variable => 'VacuumCostPageHit', + boot_val => '1', + min => '0', + max => '10000', +}, + +{ name => 'vacuum_cost_page_miss', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_COST_DELAY', + short_desc => 'Vacuum cost for a page not found in the buffer cache.', + variable => 'VacuumCostPageMiss', + boot_val => '2', + min => '0', + max => '10000', +}, + +{ name => 'vacuum_failsafe_age', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_FREEZING', + short_desc => 'Age at which VACUUM should trigger failsafe to avoid a wraparound outage.', + variable => 'vacuum_failsafe_age', + boot_val => '1600000000', + min => '0', + max => '2100000000', +}, + +{ name => 'vacuum_freeze_min_age', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_FREEZING', + short_desc => 'Minimum age at which VACUUM should freeze a table row.', + variable => 'vacuum_freeze_min_age', + boot_val => '50000000', + min => '0', + max => '1000000000', +}, + +{ name => 'vacuum_freeze_table_age', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_FREEZING', + short_desc => 'Age at which VACUUM should scan whole table to freeze tuples.', + variable => 'vacuum_freeze_table_age', + boot_val => '150000000', + min => '0', + max => '2000000000', +}, + +{ name => 'vacuum_max_eager_freeze_failure_rate', type => 'real', context => 'PGC_USERSET', group => 'VACUUM_FREEZING', + short_desc => 'Fraction of pages in a relation vacuum can scan and fail to freeze before disabling eager scanning.', + long_desc => 'A value of 0.0 disables eager scanning and a value of 1.0 will eagerly scan up to 
100 percent of the all-visible pages in the relation. If vacuum successfully freezes these pages, the cap is lower than 100 percent, because the goal is to amortize page freezing across multiple vacuums.', + variable => 'vacuum_max_eager_freeze_failure_rate', + boot_val => '0.03', + min => '0.0', + max => '1.0', +}, + +{ name => 'vacuum_multixact_failsafe_age', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_FREEZING', + short_desc => 'Multixact age at which VACUUM should trigger failsafe to avoid a wraparound outage.', + variable => 'vacuum_multixact_failsafe_age', + boot_val => '1600000000', + min => '0', + max => '2100000000', +}, + +{ name => 'vacuum_multixact_freeze_min_age', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_FREEZING', + short_desc => 'Minimum age at which VACUUM should freeze a MultiXactId in a table row.', + variable => 'vacuum_multixact_freeze_min_age', + boot_val => '5000000', + min => '0', + max => '1000000000', +}, + +{ name => 'vacuum_multixact_freeze_table_age', type => 'int', context => 'PGC_USERSET', group => 'VACUUM_FREEZING', + short_desc => 'Multixact age at which VACUUM should scan whole table to freeze tuples.', + variable => 'vacuum_multixact_freeze_table_age', + boot_val => '150000000', + min => '0', + max => '2000000000', +}, + +{ name => 'vacuum_truncate', type => 'bool', context => 'PGC_USERSET', group => 'VACUUM_DEFAULT', + short_desc => 'Enables vacuum to truncate empty pages at the end of the table.', + variable => 'vacuum_truncate', + boot_val => 'true', +}, + +{ name => 'wal_block_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the block size in the write ahead log.', + flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', + variable => 'wal_block_size', + boot_val => 'XLOG_BLCKSZ', + min => 'XLOG_BLCKSZ', + max => 'XLOG_BLCKSZ', +}, + +{ name => 'wal_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'WAL_SETTINGS', + short_desc => 'Sets the number of disk-page buffers in shared memory for WAL.', + long_desc => '-1 means use a fraction of "shared_buffers".', + flags => 'GUC_UNIT_XBLOCKS', + variable => 'XLOGbuffers', + boot_val => '-1', + min => '-1', + max => '(INT_MAX / XLOG_BLCKSZ)', + check_hook => 'check_wal_buffers', +}, + +{ name => 'wal_compression', type => 'enum', context => 'PGC_SUSET', group => 'WAL_SETTINGS', + short_desc => 'Compresses full-page writes written in WAL file with specified method.', + variable => 'wal_compression', + boot_val => 'WAL_COMPRESSION_NONE', + options => 'wal_compression_options', +}, + +{ name => 'wal_consistency_checking', type => 'string', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Sets the WAL resource managers for which WAL consistency checks are done.', + long_desc => 'Full-page images will be logged for all data blocks and cross-checked against the results of WAL replay.', + flags => 'GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE', + variable => 'wal_consistency_checking_string', + boot_val => '""', + check_hook => 'check_wal_consistency_checking', + assign_hook => 'assign_wal_consistency_checking', +}, + +{ name => 'wal_debug', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Emit WAL-related debugging output.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'XLOG_DEBUG', + boot_val => 'false', + ifdef => 'WAL_DEBUG', +}, + +{ name => 'wal_decode_buffer_size', type => 'int', context => 'PGC_POSTMASTER', group => 'WAL_RECOVERY', + short_desc => 'Buffer size for 
reading ahead in the WAL during recovery.', + long_desc => 'Maximum distance to read ahead in the WAL to prefetch referenced data blocks.', + flags => 'GUC_UNIT_BYTE', + variable => 'wal_decode_buffer_size', + boot_val => '512 * 1024', + min => '64 * 1024', + max => 'MaxAllocSize', +}, + +{ name => 'wal_init_zero', type => 'bool', context => 'PGC_SUSET', group => 'WAL_SETTINGS', + short_desc => 'Writes zeroes to new WAL files before first use.', + variable => 'wal_init_zero', + boot_val => 'true', +}, + +{ name => 'wal_keep_size', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_SENDING', + short_desc => 'Sets the size of WAL files held for standby servers.', + flags => 'GUC_UNIT_MB', + variable => 'wal_keep_size_mb', + boot_val => '0', + min => '0', + max => 'MAX_KILOBYTES', +}, + +{ name => 'wal_level', type => 'enum', context => 'PGC_POSTMASTER', group => 'WAL_SETTINGS', + short_desc => 'Sets the level of information written to the WAL.', + variable => 'wal_level', + boot_val => 'WAL_LEVEL_REPLICA', + options => 'wal_level_options', +}, + +{ name => 'wal_log_hints', type => 'bool', context => 'PGC_POSTMASTER', group => 'WAL_SETTINGS', + short_desc => 'Writes full pages to WAL when first modified after a checkpoint, even for a non-critical modification.', + variable => 'wal_log_hints', + boot_val => 'false', +}, + +{ name => 'wal_receiver_create_temp_slot', type => 'bool', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Sets whether a WAL receiver should create a temporary replication slot if no permanent slot is configured.', + variable => 'wal_receiver_create_temp_slot', + boot_val => 'false', +}, + +{ name => 'wal_receiver_status_interval', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Sets the maximum interval between WAL receiver status reports to the sending server.', + flags => 'GUC_UNIT_S', + variable => 'wal_receiver_status_interval', + boot_val => '10', + min => '0', + max => 'INT_MAX / 1000', +}, + +{ name => 'wal_receiver_timeout', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Sets the maximum wait time to receive data from the sending server.', + long_desc => '0 disables the timeout.', + flags => 'GUC_UNIT_MS', + variable => 'wal_receiver_timeout', + boot_val => '60 * 1000', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'wal_recycle', type => 'bool', context => 'PGC_SUSET', group => 'WAL_SETTINGS', + short_desc => 'Recycles WAL files by renaming them.', + variable => 'wal_recycle', + boot_val => 'true', +}, + +{ name => 'wal_retrieve_retry_interval', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', + short_desc => 'Sets the time to wait before retrying to retrieve WAL after a failed attempt.', + flags => 'GUC_UNIT_MS', + variable => 'wal_retrieve_retry_interval', + boot_val => '5000', + min => '1', + max => 'INT_MAX', +}, + +{ name => 'wal_segment_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', + short_desc => 'Shows the size of write ahead log segments.', + flags => 'GUC_UNIT_BYTE | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED', + variable => 'wal_segment_size', + boot_val => 'DEFAULT_XLOG_SEG_SIZE', + min => 'WalSegMinSize', + max => 'WalSegMaxSize', + check_hook => 'check_wal_segment_size', +}, + +{ name => 'wal_sender_timeout', type => 'int', context => 'PGC_USERSET', group => 'REPLICATION_SENDING', + short_desc => 'Sets the maximum time to wait for WAL replication.', + 
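wal_segment_size above is GUC_RUNTIME_COMPUTED with a check_hook; PostgreSQL restricts WAL segment sizes to powers of two within a fixed range, and check_wal_segment_size presumably enforces that on the value read from the cluster's control data. A stand-alone version of such a test (the min/max constants here are assumptions, not WalSegMinSize/WalSegMaxSize):

    #include <stdbool.h>
    #include <stdio.h>

    #define WAL_SEG_MIN (1024L * 1024)          /* assumed 1 MB floor */
    #define WAL_SEG_MAX (1024L * 1024 * 1024)   /* assumed 1 GB ceiling */

    /* A power of two has exactly one bit set: x & (x - 1) clears it. */
    static bool
    valid_wal_seg_size(long size)
    {
        return size >= WAL_SEG_MIN && size <= WAL_SEG_MAX &&
               (size & (size - 1)) == 0;
    }

    int
    main(void)
    {
        printf("16MB ok? %d\n", valid_wal_seg_size(16L * 1024 * 1024));
        printf("24MB ok? %d\n", valid_wal_seg_size(24L * 1024 * 1024));
        return 0;
    }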
flags => 'GUC_UNIT_MS', + variable => 'wal_sender_timeout', + boot_val => '60 * 1000', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'wal_skip_threshold', type => 'int', context => 'PGC_USERSET', group => 'WAL_SETTINGS', + short_desc => 'Minimum size of new file to fsync instead of writing WAL.', + flags => 'GUC_UNIT_KB', + variable => 'wal_skip_threshold', + boot_val => '2048', + min => '0', + max => 'MAX_KILOBYTES', +}, + +{ name => 'wal_summary_keep_time', type => 'int', context => 'PGC_SIGHUP', group => 'WAL_SUMMARIZATION', + short_desc => 'Time for which WAL summary files should be kept.', + long_desc => '0 disables automatic summary file deletion.', + flags => 'GUC_UNIT_MIN', + variable => 'wal_summary_keep_time', + boot_val => '10 * HOURS_PER_DAY * MINS_PER_HOUR /* 10 days */', + min => '0', + max => 'INT_MAX / SECS_PER_MINUTE', +}, + +{ name => 'wal_sync_method', type => 'enum', context => 'PGC_SIGHUP', group => 'WAL_SETTINGS', + short_desc => 'Selects the method used for forcing WAL updates to disk.', + variable => 'wal_sync_method', + boot_val => 'DEFAULT_WAL_SYNC_METHOD', + options => 'wal_sync_method_options', + assign_hook => 'assign_wal_sync_method', +}, + +{ name => 'wal_writer_delay', type => 'int', context => 'PGC_SIGHUP', group => 'WAL_SETTINGS', + short_desc => 'Time between WAL flushes performed in the WAL writer.', + flags => 'GUC_UNIT_MS', + variable => 'WalWriterDelay', + boot_val => '200', + min => '1', + max => '10000', +}, + +{ name => 'wal_writer_flush_after', type => 'int', context => 'PGC_SIGHUP', group => 'WAL_SETTINGS', + short_desc => 'Amount of WAL written out by WAL writer that triggers a flush.', + flags => 'GUC_UNIT_XBLOCKS', + variable => 'WalWriterFlushAfter', + boot_val => 'DEFAULT_WAL_WRITER_FLUSH_AFTER', + min => '0', + max => 'INT_MAX', +}, + +{ name => 'work_mem', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_MEM', + short_desc => 'Sets the maximum memory to be used for query workspaces.', + long_desc => 'This much memory can be used by each internal sort operation and hash table before switching to temporary disk files.', + flags => 'GUC_UNIT_KB | GUC_EXPLAIN', + variable => 'work_mem', + boot_val => '4096', + min => '64', + max => 'MAX_KILOBYTES', +}, + +{ name => 'write_page_cost', type => 'real', context => 'PGC_USERSET', group => 'QUERY_TUNING_COST', + short_desc => 'Sets the planner\'s estimate of the cost of flushing a disk page.', + flags => 'GUC_EXPLAIN', + variable => 'write_page_cost', + boot_val => 'DEFAULT_WRITE_PAGE_COST', + min => '0', + max => 'DBL_MAX', +}, + +{ name => 'xmlbinary', type => 'enum', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets how binary values are to be encoded in XML.', + variable => 'xmlbinary', + boot_val => 'XMLBINARY_BASE64', + options => 'xmlbinary_options', +}, + +{ name => 'xmloption', type => 'enum', context => 'PGC_USERSET', group => 'CLIENT_CONN_STATEMENT', + short_desc => 'Sets whether XML data in implicit parsing and serialization operations is to be considered as documents or content fragments.', + variable => 'xmloption', + boot_val => 'XMLOPTION_CONTENT', + options => 'xmloption_options', +}, + +{ name => 'zero_damaged_pages', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Continues processing past damaged page headers.', + long_desc => 'Detection of a damaged page header normally causes PostgreSQL to report an error, aborting the current transaction. 
Setting "zero_damaged_pages" to true causes the system to instead report a warning, zero out the damaged page, and continue processing. This behavior will destroy data, namely all the rows on the damaged page.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'zero_damaged_pages', + boot_val => 'false', +}, + +] diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 2f8cbd8675998..04ab0a266088a 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -4,10 +4,10 @@ * * Static tables for the Grand Unified Configuration scheme. * - * Many of these tables are const. However, ConfigureNamesBool[] - * and so on are not, because the structs in those arrays are actually - * the live per-variable state data that guc.c manipulates. While many of - * their fields are intended to be constant, some fields change at runtime. + * Many of these tables are const. However, ConfigureNames[] is not, because + * the structs in it are actually the live per-variable state data that guc.c + * manipulates. While many of their fields are intended to be constant, some + * fields change at runtime. * * * Copyright (c) 2000-2025, PostgreSQL Global Development Group @@ -507,6 +507,7 @@ bool AllowAlterSystem = true; bool log_duration = false; bool Debug_print_plan = false; bool Debug_print_parse = false; +bool Debug_print_raw_parse = false; bool Debug_print_rewritten = false; bool Debug_pretty_print = true; @@ -616,6 +617,7 @@ static int shared_memory_size_mb; static int shared_memory_size_in_huge_pages; static int wal_block_size; static int num_os_semaphores; +static int effective_wal_level = WAL_LEVEL_REPLICA; static bool data_checksums; static bool integer_datetimes; @@ -626,6 +628,13 @@ static bool integer_datetimes; #endif static bool assert_enabled = DEFAULT_ASSERT_ENABLED; +#ifdef EXEC_BACKEND +#define EXEC_BACKEND_ENABLED true +#else +#define EXEC_BACKEND_ENABLED false +#endif +static bool exec_backend_enabled = EXEC_BACKEND_ENABLED; + static char *recovery_target_timeline_string; static char *recovery_target_string; static char *recovery_target_xid_string; @@ -760,4666 +769,4 @@ StaticAssertDecl(lengthof(config_type_names) == (PGC_ENUM + 1), "array length mismatch"); -/* - * Contents of GUC tables - * - * See src/backend/utils/misc/README for design notes. - * - * TO ADD AN OPTION: - * - * 1. Declare a global variable of type bool, int, double, or char* - * and make use of it. - * - * 2. Decide at what times it's safe to set the option. See guc.h for - * details. - * - * 3. Decide on a name, a default value, upper and lower bounds (if - * applicable), etc. - * - * 4. Add a record below. - * - * 5. Add it to src/backend/utils/misc/postgresql.conf.sample, if - * appropriate. - * - * 6. Don't forget to document the option (at least in config.sgml). - * - * 7. If it's a new GUC_LIST_QUOTE option, you must add it to - * variable_is_guc_list_quote() in src/bin/pg_dump/dumputils.c. 
- */ - -struct config_bool ConfigureNamesBool[] = -{ - { - {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of sequential-scan plans."), - NULL, - GUC_EXPLAIN - }, - &enable_seqscan, - true, - NULL, NULL, NULL - }, - { - {"enable_indexscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of index-scan plans."), - NULL, - GUC_EXPLAIN - }, - &enable_indexscan, - true, - NULL, NULL, NULL - }, - { - {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of index-only-scan plans."), - NULL, - GUC_EXPLAIN - }, - &enable_indexonlyscan, - true, - NULL, NULL, NULL - }, - { - {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of bitmap-scan plans."), - NULL, - GUC_EXPLAIN - }, - &enable_bitmapscan, - true, - NULL, NULL, NULL - }, - { - {"enable_tidscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of TID scan plans."), - NULL, - GUC_EXPLAIN - }, - &enable_tidscan, - true, - NULL, NULL, NULL - }, - { - {"enable_sort", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of explicit sort steps."), - NULL, - GUC_EXPLAIN - }, - &enable_sort, - true, - NULL, NULL, NULL - }, - { - {"enable_incremental_sort", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of incremental sort steps."), - NULL, - GUC_EXPLAIN - }, - &enable_incremental_sort, - true, - NULL, NULL, NULL - }, - { - {"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of hashed aggregation plans."), - NULL, - GUC_EXPLAIN - }, - &enable_hashagg, - true, - NULL, NULL, NULL - }, - { - {"enable_material", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of materialization."), - NULL, - GUC_EXPLAIN - }, - &enable_material, - true, - NULL, NULL, NULL - }, - { - {"enable_memoize", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of memoization."), - NULL, - GUC_EXPLAIN - }, - &enable_memoize, - true, - NULL, NULL, NULL - }, - { - {"enable_nestloop", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of nested-loop join plans."), - NULL, - GUC_EXPLAIN - }, - &enable_nestloop, - true, - NULL, NULL, NULL - }, - { - {"enable_mergejoin", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of merge join plans."), - NULL, - GUC_EXPLAIN - }, - &enable_mergejoin, - true, - NULL, NULL, NULL - }, - { - {"enable_hashjoin", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of hash join plans."), - NULL, - GUC_EXPLAIN - }, - &enable_hashjoin, - true, - NULL, NULL, NULL - }, - { - {"enable_gathermerge", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of gather merge plans."), - NULL, - GUC_EXPLAIN - }, - &enable_gathermerge, - true, - NULL, NULL, NULL - }, - { - {"enable_partitionwise_join", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables partitionwise join."), - NULL, - GUC_EXPLAIN - }, - &enable_partitionwise_join, - false, - NULL, NULL, NULL - }, - { - {"enable_partitionwise_aggregate", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables partitionwise aggregation and grouping."), - NULL, - GUC_EXPLAIN - }, - &enable_partitionwise_aggregate, - false, - NULL, NULL, NULL - }, - { - {"enable_parallel_append", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of parallel append 
plans."), - NULL, - GUC_EXPLAIN - }, - &enable_parallel_append, - true, - NULL, NULL, NULL - }, - { - {"enable_parallel_hash", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of parallel hash plans."), - NULL, - GUC_EXPLAIN - }, - &enable_parallel_hash, - true, - NULL, NULL, NULL - }, - { - {"enable_partition_pruning", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables plan-time and execution-time partition pruning."), - gettext_noop("Allows the query planner and executor to compare partition " - "bounds to conditions in the query to determine which " - "partitions must be scanned."), - GUC_EXPLAIN - }, - &enable_partition_pruning, - true, - NULL, NULL, NULL - }, - { - {"enable_presorted_aggregate", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's ability to produce plans that " - "provide presorted input for ORDER BY / DISTINCT aggregate " - "functions."), - gettext_noop("Allows the query planner to build plans that provide " - "presorted input for aggregate functions with an ORDER BY / " - "DISTINCT clause. When disabled, implicit sorts are always " - "performed during execution."), - GUC_EXPLAIN - }, - &enable_presorted_aggregate, - true, - NULL, NULL, NULL - }, - { - {"enable_async_append", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of async append plans."), - NULL, - GUC_EXPLAIN - }, - &enable_async_append, - true, - NULL, NULL, NULL - }, - { - {"enable_self_join_elimination", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables removal of unique self-joins."), - NULL, - GUC_EXPLAIN - }, - &enable_self_join_elimination, - true, - NULL, NULL, NULL - }, - { - {"enable_group_by_reordering", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables reordering of GROUP BY keys."), - NULL, - GUC_EXPLAIN - }, - &enable_group_by_reordering, - true, - NULL, NULL, NULL - }, - { - {"enable_distinct_reordering", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables reordering of DISTINCT pathkeys."), - NULL, - GUC_EXPLAIN - }, - &enable_distinct_reordering, - true, - NULL, NULL, NULL - }, - { - {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, - gettext_noop("Enables genetic query optimization."), - gettext_noop("This algorithm attempts to do planning without " - "exhaustive searching."), - GUC_EXPLAIN - }, - &enable_geqo, - true, - NULL, NULL, NULL - }, - { - /* - * Not for general use --- used by SET SESSION AUTHORIZATION and SET - * ROLE - */ - {"is_superuser", PGC_INTERNAL, UNGROUPED, - gettext_noop("Shows whether the current user is a superuser."), - NULL, - GUC_REPORT | GUC_NO_SHOW_ALL | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_ALLOW_IN_PARALLEL - }, - ¤t_role_is_superuser, - false, - NULL, NULL, NULL - }, - { - /* - * This setting itself cannot be set by ALTER SYSTEM to avoid an - * operator turning this setting off by using ALTER SYSTEM, without a - * way to turn it back on. 
- */ - {"allow_alter_system", PGC_SIGHUP, COMPAT_OPTIONS_OTHER, - gettext_noop("Allows running the ALTER SYSTEM command."), - gettext_noop("Can be set to off for environments where global configuration " - "changes should be made using a different method."), - GUC_DISALLOW_IN_AUTO_FILE - }, - &AllowAlterSystem, - true, - NULL, NULL, NULL - }, - { - {"bonjour", PGC_POSTMASTER, CONN_AUTH_SETTINGS, - gettext_noop("Enables advertising the server via Bonjour."), - NULL - }, - &enable_bonjour, - false, - check_bonjour, NULL, NULL - }, - { - {"track_commit_timestamp", PGC_POSTMASTER, REPLICATION_SENDING, - gettext_noop("Collects transaction commit time."), - NULL - }, - &track_commit_timestamp, - false, - NULL, NULL, NULL - }, - { - {"ssl", PGC_SIGHUP, CONN_AUTH_SSL, - gettext_noop("Enables SSL connections."), - NULL - }, - &EnableSSL, - false, - check_ssl, NULL, NULL - }, - { - {"ssl_passphrase_command_supports_reload", PGC_SIGHUP, CONN_AUTH_SSL, - gettext_noop("Controls whether \"ssl_passphrase_command\" is called during server reload."), - NULL - }, - &ssl_passphrase_command_supports_reload, - false, - NULL, NULL, NULL - }, - { - {"ssl_prefer_server_ciphers", PGC_SIGHUP, CONN_AUTH_SSL, - gettext_noop("Give priority to server ciphersuite order."), - NULL - }, - &SSLPreferServerCiphers, - true, - NULL, NULL, NULL - }, - { - {"fsync", PGC_SIGHUP, WAL_SETTINGS, - gettext_noop("Forces synchronization of updates to disk."), - gettext_noop("The server will use the fsync() system call in several places to make " - "sure that updates are physically written to disk. This ensures " - "that a database cluster will recover to a consistent state after " - "an operating system or hardware crash.") - }, - &enableFsync, - true, - NULL, NULL, NULL - }, - { - {"ignore_checksum_failure", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Continues processing after a checksum failure."), - gettext_noop("Detection of a checksum failure normally causes PostgreSQL to " - "report an error, aborting the current transaction. Setting " - "ignore_checksum_failure to true causes the system to ignore the failure " - "(but still report a warning), and continue processing. This " - "behavior could cause crashes or other serious problems. Only " - "has an effect if checksums are enabled."), - GUC_NOT_IN_SAMPLE - }, - &ignore_checksum_failure, - false, - NULL, NULL, NULL - }, - { - {"zero_damaged_pages", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Continues processing past damaged page headers."), - gettext_noop("Detection of a damaged page header normally causes PostgreSQL to " - "report an error, aborting the current transaction. Setting " - "\"zero_damaged_pages\" to true causes the system to instead report a " - "warning, zero out the damaged page, and continue processing. This " - "behavior will destroy data, namely all the rows on the damaged page."), - GUC_NOT_IN_SAMPLE - }, - &zero_damaged_pages, - false, - NULL, NULL, NULL - }, - { - {"ignore_invalid_pages", PGC_POSTMASTER, DEVELOPER_OPTIONS, - gettext_noop("Continues recovery after an invalid pages failure."), - gettext_noop("Detection of WAL records having references to " - "invalid pages during recovery causes PostgreSQL to " - "raise a PANIC-level error, aborting the recovery. " - "Setting \"ignore_invalid_pages\" to true causes " - "the system to ignore invalid page references " - "in WAL records (but still report a warning), " - "and continue recovery. This behavior may cause " - "crashes, data loss, propagate or hide corruption, " - "or other serious problems. 
Only has an effect " - "during recovery or in standby mode."), - GUC_NOT_IN_SAMPLE - }, - &ignore_invalid_pages, - false, - NULL, NULL, NULL - }, - { - {"full_page_writes", PGC_SIGHUP, WAL_SETTINGS, - gettext_noop("Writes full pages to WAL when first modified after a checkpoint."), - gettext_noop("A page write in process during an operating system crash might be " - "only partially written to disk. During recovery, the row changes " - "stored in WAL are not enough to recover. This option writes " - "pages when first modified after a checkpoint to WAL so full recovery " - "is possible.") - }, - &fullPageWrites, - true, - NULL, NULL, NULL - }, - - { - {"wal_log_hints", PGC_POSTMASTER, WAL_SETTINGS, - gettext_noop("Writes full pages to WAL when first modified after a checkpoint, even for a non-critical modification."), - NULL - }, - &wal_log_hints, - false, - NULL, NULL, NULL - }, - - { - {"wal_init_zero", PGC_SUSET, WAL_SETTINGS, - gettext_noop("Writes zeroes to new WAL files before first use."), - NULL - }, - &wal_init_zero, - true, - NULL, NULL, NULL - }, - - { - {"wal_recycle", PGC_SUSET, WAL_SETTINGS, - gettext_noop("Recycles WAL files by renaming them."), - NULL - }, - &wal_recycle, - true, - NULL, NULL, NULL - }, - - { - {"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT, - gettext_noop("Logs each checkpoint."), - NULL - }, - &log_checkpoints, - true, - NULL, NULL, NULL - }, - { - {"trace_connection_negotiation", PGC_POSTMASTER, DEVELOPER_OPTIONS, - gettext_noop("Logs details of pre-authentication connection handshake."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &Trace_connection_negotiation, - false, - NULL, NULL, NULL - }, - { - {"log_disconnections", PGC_SU_BACKEND, LOGGING_WHAT, - gettext_noop("Logs end of a session, including duration."), - NULL - }, - &Log_disconnections, - false, - NULL, NULL, NULL - }, - { - {"log_replication_commands", PGC_SUSET, LOGGING_WHAT, - gettext_noop("Logs each replication command."), - NULL - }, - &log_replication_commands, - false, - NULL, NULL, NULL - }, - { - {"debug_assertions", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows whether the running server has assertion checks enabled."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &assert_enabled, - DEFAULT_ASSERT_ENABLED, - NULL, NULL, NULL - }, - - { - {"exit_on_error", PGC_USERSET, ERROR_HANDLING_OPTIONS, - gettext_noop("Terminate session on any error."), - NULL - }, - &ExitOnAnyError, - false, - NULL, NULL, NULL - }, - { - {"restart_after_crash", PGC_SIGHUP, ERROR_HANDLING_OPTIONS, - gettext_noop("Reinitialize server after backend crash."), - NULL - }, - &restart_after_crash, - true, - NULL, NULL, NULL - }, - { - {"remove_temp_files_after_crash", PGC_SIGHUP, DEVELOPER_OPTIONS, - gettext_noop("Remove temporary files after backend crash."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &remove_temp_files_after_crash, - true, - NULL, NULL, NULL - }, - { - {"send_abort_for_crash", PGC_SIGHUP, DEVELOPER_OPTIONS, - gettext_noop("Send SIGABRT not SIGQUIT to child processes after backend crash."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &send_abort_for_crash, - false, - NULL, NULL, NULL - }, - { - {"send_abort_for_kill", PGC_SIGHUP, DEVELOPER_OPTIONS, - gettext_noop("Send SIGABRT not SIGKILL to stuck child processes."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &send_abort_for_kill, - false, - NULL, NULL, NULL - }, - - { - {"log_duration", PGC_SUSET, LOGGING_WHAT, - gettext_noop("Logs the duration of each completed SQL statement."), - NULL - }, - &log_duration, - false, - NULL, NULL, NULL - }, -#ifdef 
DEBUG_NODE_TESTS_ENABLED - { - {"debug_copy_parse_plan_trees", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Set this to force all parse and plan trees to be passed through " - "copyObject(), to facilitate catching errors and omissions in " - "copyObject()."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &Debug_copy_parse_plan_trees, -/* support for legacy compile-time setting */ -#ifdef COPY_PARSE_PLAN_TREES - true, -#else - false, -#endif - NULL, NULL, NULL - }, - { - {"debug_write_read_parse_plan_trees", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Set this to force all parse and plan trees to be passed through " - "outfuncs.c/readfuncs.c, to facilitate catching errors and omissions in " - "those modules."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &Debug_write_read_parse_plan_trees, -/* support for legacy compile-time setting */ -#ifdef WRITE_READ_PARSE_PLAN_TREES - true, -#else - false, -#endif - NULL, NULL, NULL - }, - { - {"debug_raw_expression_coverage_test", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Set this to force all raw parse trees for DML statements to be scanned " - "by raw_expression_tree_walker(), to facilitate catching errors and " - "omissions in that function."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &Debug_raw_expression_coverage_test, -/* support for legacy compile-time setting */ -#ifdef RAW_EXPRESSION_COVERAGE_TEST - true, -#else - false, -#endif - NULL, NULL, NULL - }, -#endif /* DEBUG_NODE_TESTS_ENABLED */ - { - {"debug_print_parse", PGC_USERSET, LOGGING_WHAT, - gettext_noop("Logs each query's parse tree."), - NULL - }, - &Debug_print_parse, - false, - NULL, NULL, NULL - }, - { - {"debug_print_rewritten", PGC_USERSET, LOGGING_WHAT, - gettext_noop("Logs each query's rewritten parse tree."), - NULL - }, - &Debug_print_rewritten, - false, - NULL, NULL, NULL - }, - { - {"debug_print_plan", PGC_USERSET, LOGGING_WHAT, - gettext_noop("Logs each query's execution plan."), - NULL - }, - &Debug_print_plan, - false, - NULL, NULL, NULL - }, - { - {"debug_pretty_print", PGC_USERSET, LOGGING_WHAT, - gettext_noop("Indents parse and plan tree displays."), - NULL - }, - &Debug_pretty_print, - true, - NULL, NULL, NULL - }, - { - {"log_parser_stats", PGC_SUSET, STATS_MONITORING, - gettext_noop("Writes parser performance statistics to the server log."), - NULL - }, - &log_parser_stats, - false, - check_stage_log_stats, NULL, NULL - }, - { - {"log_planner_stats", PGC_SUSET, STATS_MONITORING, - gettext_noop("Writes planner performance statistics to the server log."), - NULL - }, - &log_planner_stats, - false, - check_stage_log_stats, NULL, NULL - }, - { - {"log_executor_stats", PGC_SUSET, STATS_MONITORING, - gettext_noop("Writes executor performance statistics to the server log."), - NULL - }, - &log_executor_stats, - false, - check_stage_log_stats, NULL, NULL - }, - { - {"log_statement_stats", PGC_SUSET, STATS_MONITORING, - gettext_noop("Writes cumulative performance statistics to the server log."), - NULL - }, - &log_statement_stats, - false, - check_log_stats, NULL, NULL - }, -#ifdef BTREE_BUILD_STATS - { - {"log_btree_build_stats", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Logs system resource usage statistics (memory and CPU) on various B-tree operations."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &log_btree_build_stats, - false, - NULL, NULL, NULL - }, -#endif - - { - {"track_activities", PGC_SUSET, STATS_CUMULATIVE, - gettext_noop("Collects information about executing commands."), - gettext_noop("Enables the collection of information on the currently " - "executing command of each session, 
along with " - "the time at which that command began execution.") - }, - &pgstat_track_activities, - true, - NULL, NULL, NULL - }, - { - {"track_counts", PGC_SUSET, STATS_CUMULATIVE, - gettext_noop("Collects statistics on database activity."), - NULL - }, - &pgstat_track_counts, - true, - NULL, NULL, NULL - }, - { - {"track_cost_delay_timing", PGC_SUSET, STATS_CUMULATIVE, - gettext_noop("Collects timing statistics for cost-based vacuum delay."), - NULL - }, - &track_cost_delay_timing, - false, - NULL, NULL, NULL - }, - { - {"track_io_timing", PGC_SUSET, STATS_CUMULATIVE, - gettext_noop("Collects timing statistics for database I/O activity."), - NULL - }, - &track_io_timing, - false, - NULL, NULL, NULL - }, - { - {"track_wal_io_timing", PGC_SUSET, STATS_CUMULATIVE, - gettext_noop("Collects timing statistics for WAL I/O activity."), - NULL - }, - &track_wal_io_timing, - false, - NULL, NULL, NULL - }, - - { - {"update_process_title", PGC_SUSET, PROCESS_TITLE, - gettext_noop("Updates the process title to show the active SQL command."), - gettext_noop("Enables updating of the process title every time a new SQL command is received by the server.") - }, - &update_process_title, - DEFAULT_UPDATE_PROCESS_TITLE, - NULL, NULL, NULL - }, - - { - {"autovacuum", PGC_SIGHUP, VACUUM_AUTOVACUUM, - gettext_noop("Starts the autovacuum subprocess."), - NULL - }, - &autovacuum_start_daemon, - true, - NULL, NULL, NULL - }, - - { - {"trace_notify", PGC_USERSET, DEVELOPER_OPTIONS, - gettext_noop("Generates debugging output for LISTEN and NOTIFY."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &Trace_notify, - false, - NULL, NULL, NULL - }, - -#ifdef LOCK_DEBUG - { - {"trace_locks", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Emits information about lock usage."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &Trace_locks, - false, - NULL, NULL, NULL - }, - { - {"trace_userlocks", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Emits information about user lock usage."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &Trace_userlocks, - false, - NULL, NULL, NULL - }, - { - {"trace_lwlocks", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Emits information about lightweight lock usage."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &Trace_lwlocks, - false, - NULL, NULL, NULL - }, - { - {"debug_deadlocks", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Dumps information about all current locks when a deadlock timeout occurs."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &Debug_deadlocks, - false, - NULL, NULL, NULL - }, -#endif - - { - {"log_lock_waits", PGC_SUSET, LOGGING_WHAT, - gettext_noop("Logs long lock waits."), - NULL - }, - &log_lock_waits, - false, - NULL, NULL, NULL - }, - { - {"log_lock_failure", PGC_SUSET, LOGGING_WHAT, - gettext_noop("Logs lock failures."), - NULL - }, - &log_lock_failure, - false, - NULL, NULL, NULL - }, - { - {"log_recovery_conflict_waits", PGC_SIGHUP, LOGGING_WHAT, - gettext_noop("Logs standby recovery conflict waits."), - NULL - }, - &log_recovery_conflict_waits, - false, - NULL, NULL, NULL - }, - { - {"log_hostname", PGC_SIGHUP, LOGGING_WHAT, - gettext_noop("Logs the host name in the connection logs."), - gettext_noop("By default, connection logs only show the IP address " - "of the connecting host. 
If you want them to show the host name you " - "can turn this on, but depending on your host name resolution " - "setup it might impose a non-negligible performance penalty.") - }, - &log_hostname, - false, - NULL, NULL, NULL - }, - { - {"transform_null_equals", PGC_USERSET, COMPAT_OPTIONS_OTHER, - gettext_noop("Treats \"expr=NULL\" as \"expr IS NULL\"."), - gettext_noop("When turned on, expressions of the form expr = NULL " - "(or NULL = expr) are treated as expr IS NULL, that is, they " - "return true if expr evaluates to the null value, and false " - "otherwise. The correct behavior of expr = NULL is to always " - "return null (unknown).") - }, - &Transform_null_equals, - false, - NULL, NULL, NULL - }, - { - {"default_transaction_read_only", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the default read-only status of new transactions."), - NULL, - GUC_REPORT - }, - &DefaultXactReadOnly, - false, - NULL, NULL, NULL - }, - { - {"transaction_read_only", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the current transaction's read-only status."), - NULL, - GUC_NO_RESET | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &XactReadOnly, - false, - check_transaction_read_only, NULL, NULL - }, - { - {"default_transaction_deferrable", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the default deferrable status of new transactions."), - NULL - }, - &DefaultXactDeferrable, - false, - NULL, NULL, NULL - }, - { - {"transaction_deferrable", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Whether to defer a read-only serializable transaction until it can be executed with no possible serialization failures."), - NULL, - GUC_NO_RESET | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &XactDeferrable, - false, - check_transaction_deferrable, NULL, NULL - }, - { - {"row_security", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Enables row security."), - gettext_noop("When enabled, row security will be applied to all users.") - }, - &row_security, - true, - NULL, NULL, NULL - }, - { - {"check_function_bodies", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Check routine bodies during CREATE FUNCTION and CREATE PROCEDURE."), - NULL - }, - &check_function_bodies, - true, - NULL, NULL, NULL - }, - { - {"array_nulls", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, - gettext_noop("Enables input of NULL elements in arrays."), - gettext_noop("When turned on, unquoted NULL in an array input " - "value means a null value; " - "otherwise it is taken literally.") - }, - &Array_nulls, - true, - NULL, NULL, NULL - }, - - /* - * WITH OIDS support, and consequently default_with_oids, was removed in - * PostgreSQL 12, but we tolerate the parameter being set to false to - * avoid unnecessarily breaking older dump files. 
- */ - { - {"default_with_oids", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, - gettext_noop("WITH OIDS is no longer supported; this can only be false."), - NULL, - GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE - }, - &default_with_oids, - false, - check_default_with_oids, NULL, NULL - }, - { - {"logging_collector", PGC_POSTMASTER, LOGGING_WHERE, - gettext_noop("Start a subprocess to capture stderr, csvlog and/or jsonlog into log files."), - NULL - }, - &Logging_collector, - false, - NULL, NULL, NULL - }, - { - {"log_truncate_on_rotation", PGC_SIGHUP, LOGGING_WHERE, - gettext_noop("Truncate existing log files of same name during log rotation."), - NULL - }, - &Log_truncate_on_rotation, - false, - NULL, NULL, NULL - }, - - { - {"trace_sort", PGC_USERSET, DEVELOPER_OPTIONS, - gettext_noop("Emit information about resource usage in sorting."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &trace_sort, - false, - NULL, NULL, NULL - }, - -#ifdef TRACE_SYNCSCAN - /* this is undocumented because not exposed in a standard build */ - { - {"trace_syncscan", PGC_USERSET, DEVELOPER_OPTIONS, - gettext_noop("Generate debugging output for synchronized scanning."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &trace_syncscan, - false, - NULL, NULL, NULL - }, -#endif - -#ifdef DEBUG_BOUNDED_SORT - /* this is undocumented because not exposed in a standard build */ - { - { - "optimize_bounded_sort", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables bounded sorting using heap sort."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_EXPLAIN - }, - &optimize_bounded_sort, - true, - NULL, NULL, NULL - }, -#endif - -#ifdef WAL_DEBUG - { - {"wal_debug", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Emit WAL-related debugging output."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &XLOG_DEBUG, - false, - NULL, NULL, NULL - }, -#endif - - { - {"integer_datetimes", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows whether datetimes are integer based."), - NULL, - GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &integer_datetimes, - true, - NULL, NULL, NULL - }, - - { - {"krb_caseins_users", PGC_SIGHUP, CONN_AUTH_AUTH, - gettext_noop("Sets whether Kerberos and GSSAPI user names should be treated as case-insensitive."), - NULL - }, - &pg_krb_caseins_users, - false, - NULL, NULL, NULL - }, - - { - {"gss_accept_delegation", PGC_SIGHUP, CONN_AUTH_AUTH, - gettext_noop("Sets whether GSSAPI delegation should be accepted from the client."), - NULL - }, - &pg_gss_accept_delegation, - false, - NULL, NULL, NULL - }, - - { - {"escape_string_warning", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, - gettext_noop("Warn about backslash escapes in ordinary string literals."), - NULL - }, - &escape_string_warning, - true, - NULL, NULL, NULL - }, - - { - {"standard_conforming_strings", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, - gettext_noop("Causes '...' 
strings to treat backslashes literally."), - NULL, - GUC_REPORT - }, - &standard_conforming_strings, - true, - NULL, NULL, NULL - }, - - { - {"synchronize_seqscans", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, - gettext_noop("Enables synchronized sequential scans."), - NULL - }, - &synchronize_seqscans, - true, - NULL, NULL, NULL - }, - - { - {"recovery_target_inclusive", PGC_POSTMASTER, WAL_RECOVERY_TARGET, - gettext_noop("Sets whether to include or exclude transaction with recovery target."), - NULL - }, - &recoveryTargetInclusive, - true, - NULL, NULL, NULL - }, - - { - {"summarize_wal", PGC_SIGHUP, WAL_SUMMARIZATION, - gettext_noop("Starts the WAL summarizer process to enable incremental backup."), - NULL - }, - &summarize_wal, - false, - NULL, NULL, NULL - }, - - { - {"hot_standby", PGC_POSTMASTER, REPLICATION_STANDBY, - gettext_noop("Allows connections and queries during recovery."), - NULL - }, - &EnableHotStandby, - true, - NULL, NULL, NULL - }, - - { - {"hot_standby_feedback", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Allows feedback from a hot standby to the primary that will avoid query conflicts."), - NULL - }, - &hot_standby_feedback, - false, - NULL, NULL, NULL - }, - - { - {"in_hot_standby", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows whether hot standby is currently active."), - NULL, - GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &in_hot_standby_guc, - false, - NULL, NULL, show_in_hot_standby - }, - - { - {"allow_system_table_mods", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Allows modifications of the structure of system tables."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &allowSystemTableMods, - false, - NULL, NULL, NULL - }, - - { - {"ignore_system_indexes", PGC_BACKEND, DEVELOPER_OPTIONS, - gettext_noop("Disables reading from system indexes."), - gettext_noop("It does not prevent updating the indexes, so it is safe " - "to use. 
The worst consequence is slowness."), - GUC_NOT_IN_SAMPLE - }, - &IgnoreSystemIndexes, - false, - NULL, NULL, NULL - }, - - { - {"allow_in_place_tablespaces", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Allows tablespaces directly inside pg_tblspc, for testing."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &allow_in_place_tablespaces, - false, - NULL, NULL, NULL - }, - - { - {"lo_compat_privileges", PGC_SUSET, COMPAT_OPTIONS_PREVIOUS, - gettext_noop("Enables backward compatibility mode for privilege checks on large objects."), - gettext_noop("Skips privilege checks when reading or modifying large objects, " - "for compatibility with PostgreSQL releases prior to 9.0.") - }, - &lo_compat_privileges, - false, - NULL, NULL, NULL - }, - - { - {"quote_all_identifiers", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, - gettext_noop("When generating SQL fragments, quote all identifiers."), - NULL, - }, - &quote_all_identifiers, - false, - NULL, NULL, NULL - }, - - { - {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows whether data checksums are turned on for this cluster."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED - }, - &data_checksums, - false, - NULL, NULL, NULL - }, - - { - {"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE, - gettext_noop("Add sequence number to syslog messages to avoid duplicate suppression."), - NULL - }, - &syslog_sequence_numbers, - true, - NULL, NULL, NULL - }, - - { - {"syslog_split_messages", PGC_SIGHUP, LOGGING_WHERE, - gettext_noop("Split messages sent to syslog by lines and to fit into 1024 bytes."), - NULL - }, - &syslog_split_messages, - true, - NULL, NULL, NULL - }, - - { - {"parallel_leader_participation", PGC_USERSET, RESOURCES_WORKER_PROCESSES, - gettext_noop("Controls whether Gather and Gather Merge also run subplans."), - gettext_noop("Should gather nodes also run subplans or just gather tuples?"), - GUC_EXPLAIN - }, - &parallel_leader_participation, - true, - NULL, NULL, NULL - }, - - { - {"jit", PGC_USERSET, QUERY_TUNING_OTHER, - gettext_noop("Allow JIT compilation."), - NULL, - GUC_EXPLAIN - }, - &jit_enabled, - true, - NULL, NULL, NULL - }, - - { - {"jit_debugging_support", PGC_SU_BACKEND, DEVELOPER_OPTIONS, - gettext_noop("Register JIT-compiled functions with debugger."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &jit_debugging_support, - false, - - /* - * This is not guaranteed to be available, but given it's a developer - * oriented option, it doesn't seem worth adding code checking - * availability. - */ - NULL, NULL, NULL - }, - - { - {"jit_dump_bitcode", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Write out LLVM bitcode to facilitate JIT debugging."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &jit_dump_bitcode, - false, - NULL, NULL, NULL - }, - - { - {"jit_expressions", PGC_USERSET, DEVELOPER_OPTIONS, - gettext_noop("Allow JIT compilation of expressions."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &jit_expressions, - true, - NULL, NULL, NULL - }, - - { - {"jit_profiling_support", PGC_SU_BACKEND, DEVELOPER_OPTIONS, - gettext_noop("Register JIT-compiled functions with perf profiler."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &jit_profiling_support, - false, - - /* - * This is not guaranteed to be available, but given it's a developer - * oriented option, it doesn't seem worth adding code checking - * availability.
- */ - NULL, NULL, NULL - }, - - { - {"jit_tuple_deforming", PGC_USERSET, DEVELOPER_OPTIONS, - gettext_noop("Allow JIT compilation of tuple deforming."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &jit_tuple_deforming, - true, - NULL, NULL, NULL - }, - - { - {"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS, - gettext_noop("Whether to continue running after a failure to sync data files."), - }, - &data_sync_retry, - false, - NULL, NULL, NULL - }, - - { - {"wal_receiver_create_temp_slot", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Sets whether a WAL receiver should create a temporary replication slot if no permanent slot is configured."), - }, - &wal_receiver_create_temp_slot, - false, - NULL, NULL, NULL - }, - - { - {"event_triggers", PGC_SUSET, CLIENT_CONN_STATEMENT, - gettext_noop("Enables event triggers."), - gettext_noop("When enabled, event triggers will fire for all applicable statements."), - }, - &event_triggers, - true, - NULL, NULL, NULL - }, - - { - {"sync_replication_slots", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Enables a physical standby to synchronize logical failover replication slots from the primary server."), - }, - &sync_replication_slots, - false, - NULL, NULL, NULL - }, - - { - {"md5_password_warnings", PGC_USERSET, CONN_AUTH_AUTH, - gettext_noop("Enables deprecation warnings for MD5 passwords."), - }, - &md5_password_warnings, - true, - NULL, NULL, NULL - }, - - { - {"vacuum_truncate", PGC_USERSET, VACUUM_DEFAULT, - gettext_noop("Enables vacuum to truncate empty pages at the end of the table."), - }, - &vacuum_truncate, - true, - NULL, NULL, NULL - }, - - /* End-of-list marker */ - { - {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL - } -}; - - -struct config_int ConfigureNamesInt[] = -{ - { - {"archive_timeout", PGC_SIGHUP, WAL_ARCHIVING, - gettext_noop("Sets the amount of time to wait before forcing a " - "switch to the next WAL file."), - gettext_noop("0 disables the timeout."), - GUC_UNIT_S - }, - &XLogArchiveTimeout, - 0, 0, INT_MAX / 2, - NULL, NULL, NULL - }, - { - {"post_auth_delay", PGC_BACKEND, DEVELOPER_OPTIONS, - gettext_noop("Sets the amount of time to wait after " - "authentication on connection startup."), - gettext_noop("This allows attaching a debugger to the process."), - GUC_NOT_IN_SAMPLE | GUC_UNIT_S - }, - &PostAuthDelay, - 0, 0, INT_MAX / 1000000, - NULL, NULL, NULL - }, - { - {"default_statistics_target", PGC_USERSET, QUERY_TUNING_OTHER, - gettext_noop("Sets the default statistics target."), - gettext_noop("This applies to table columns that have not had a " - "column-specific target set via ALTER TABLE SET STATISTICS.") - }, - &default_statistics_target, - 100, 1, MAX_STATISTICS_TARGET, - NULL, NULL, NULL - }, - { - {"from_collapse_limit", PGC_USERSET, QUERY_TUNING_OTHER, - gettext_noop("Sets the FROM-list size beyond which subqueries " - "are not collapsed."), - gettext_noop("The planner will merge subqueries into upper " - "queries if the resulting FROM list would have no more than " - "this many items."), - GUC_EXPLAIN - }, - &from_collapse_limit, - 8, 1, INT_MAX, - NULL, NULL, NULL - }, - { - {"join_collapse_limit", PGC_USERSET, QUERY_TUNING_OTHER, - gettext_noop("Sets the FROM-list size beyond which JOIN " - "constructs are not flattened."), - gettext_noop("The planner will flatten explicit JOIN " - "constructs into lists of FROM items whenever a " - "list of no more than this many items would result."), - GUC_EXPLAIN - }, - &join_collapse_limit, - 8, 1, INT_MAX, - NULL, NULL, NULL - }, - { - {"geqo_threshold", 
PGC_USERSET, QUERY_TUNING_GEQO, - gettext_noop("Sets the threshold of FROM items beyond which GEQO is used."), - NULL, - GUC_EXPLAIN - }, - &geqo_threshold, - 12, 2, INT_MAX, - NULL, NULL, NULL - }, - { - {"geqo_effort", PGC_USERSET, QUERY_TUNING_GEQO, - gettext_noop("GEQO: effort is used to set the default for other GEQO parameters."), - NULL, - GUC_EXPLAIN - }, - &Geqo_effort, - DEFAULT_GEQO_EFFORT, MIN_GEQO_EFFORT, MAX_GEQO_EFFORT, - NULL, NULL, NULL - }, - { - {"geqo_pool_size", PGC_USERSET, QUERY_TUNING_GEQO, - gettext_noop("GEQO: number of individuals in the population."), - gettext_noop("0 means use a suitable default value."), - GUC_EXPLAIN - }, - &Geqo_pool_size, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - { - {"geqo_generations", PGC_USERSET, QUERY_TUNING_GEQO, - gettext_noop("GEQO: number of iterations of the algorithm."), - gettext_noop("0 means use a suitable default value."), - GUC_EXPLAIN - }, - &Geqo_generations, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - /* This is PGC_SUSET to prevent hiding from log_lock_waits. */ - {"deadlock_timeout", PGC_SUSET, LOCK_MANAGEMENT, - gettext_noop("Sets the time to wait on a lock before checking for deadlock."), - NULL, - GUC_UNIT_MS - }, - &DeadlockTimeout, - 1000, 1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"max_standby_archive_delay", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Sets the maximum delay before canceling queries when a hot standby server is processing archived WAL data."), - gettext_noop("-1 means wait forever."), - GUC_UNIT_MS - }, - &max_standby_archive_delay, - 30 * 1000, -1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"max_standby_streaming_delay", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Sets the maximum delay before canceling queries when a hot standby server is processing streamed WAL data."), - gettext_noop("-1 means wait forever."), - GUC_UNIT_MS - }, - &max_standby_streaming_delay, - 30 * 1000, -1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"recovery_min_apply_delay", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Sets the minimum delay for applying changes during recovery."), - NULL, - GUC_UNIT_MS - }, - &recovery_min_apply_delay, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"wal_receiver_status_interval", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Sets the maximum interval between WAL receiver status reports to the sending server."), - NULL, - GUC_UNIT_S - }, - &wal_receiver_status_interval, - 10, 0, INT_MAX / 1000, - NULL, NULL, NULL - }, - - { - {"wal_receiver_timeout", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Sets the maximum wait time to receive data from the sending server."), - gettext_noop("0 disables the timeout."), - GUC_UNIT_MS - }, - &wal_receiver_timeout, - 60 * 1000, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, - gettext_noop("Sets the maximum number of concurrent connections."), - NULL - }, - &MaxConnections, - 100, 1, MAX_BACKENDS, - NULL, NULL, NULL - }, - - { - /* see max_connections */ - {"superuser_reserved_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, - gettext_noop("Sets the number of connection slots reserved for superusers."), - NULL - }, - &SuperuserReservedConnections, - 3, 0, MAX_BACKENDS, - NULL, NULL, NULL - }, - - { - {"reserved_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, - gettext_noop("Sets the number of connection slots reserved for roles " - "with privileges of pg_use_reserved_connections."), - NULL - }, - &ReservedConnections, - 0, 0, MAX_BACKENDS, - NULL, NULL, NULL - }, - - { 
- {"min_dynamic_shared_memory", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Amount of dynamic shared memory reserved at startup."), - NULL, - GUC_UNIT_MB - }, - &min_dynamic_shared_memory, - 0, 0, (int) Min((size_t) INT_MAX, SIZE_MAX / (1024 * 1024)), - NULL, NULL, NULL - }, - - /* - * We sometimes multiply the number of shared buffers by two without - * checking for overflow, so we mustn't allow more than INT_MAX / 2. - */ - { - {"shared_buffers", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the number of shared memory buffers used by the server."), - NULL, - GUC_UNIT_BLOCKS - }, - &NBuffers, - 16384, 16, INT_MAX / 2, - NULL, NULL, NULL - }, - - { - {"vacuum_buffer_usage_limit", PGC_USERSET, RESOURCES_MEM, - gettext_noop("Sets the buffer pool size for VACUUM, ANALYZE, and autovacuum."), - NULL, - GUC_UNIT_KB - }, - &VacuumBufferUsageLimit, - 2048, 0, MAX_BAS_VAC_RING_SIZE_KB, - check_vacuum_buffer_usage_limit, NULL, NULL - }, - - { - {"shared_memory_size", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows the size of the server's main shared memory area (rounded up to the nearest MB)."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_UNIT_MB | GUC_RUNTIME_COMPUTED - }, - &shared_memory_size_mb, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"shared_memory_size_in_huge_pages", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows the number of huge pages needed for the main shared memory area."), - gettext_noop("-1 means huge pages are not supported."), - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED - }, - &shared_memory_size_in_huge_pages, - -1, -1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"num_os_semaphores", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows the number of semaphores required for the server."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED - }, - &num_os_semaphores, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"commit_timestamp_buffers", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the size of the dedicated buffer pool used for the commit timestamp cache."), - gettext_noop("0 means use a fraction of \"shared_buffers\"."), - GUC_UNIT_BLOCKS - }, - &commit_timestamp_buffers, - 0, 0, SLRU_MAX_ALLOWED_BUFFERS, - check_commit_ts_buffers, NULL, NULL - }, - - { - {"multixact_member_buffers", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the size of the dedicated buffer pool used for the MultiXact member cache."), - NULL, - GUC_UNIT_BLOCKS - }, - &multixact_member_buffers, - 32, 16, SLRU_MAX_ALLOWED_BUFFERS, - check_multixact_member_buffers, NULL, NULL - }, - - { - {"multixact_offset_buffers", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the size of the dedicated buffer pool used for the MultiXact offset cache."), - NULL, - GUC_UNIT_BLOCKS - }, - &multixact_offset_buffers, - 16, 16, SLRU_MAX_ALLOWED_BUFFERS, - check_multixact_offset_buffers, NULL, NULL - }, - - { - {"notify_buffers", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the size of the dedicated buffer pool used for the LISTEN/NOTIFY message cache."), - NULL, - GUC_UNIT_BLOCKS - }, - ¬ify_buffers, - 16, 16, SLRU_MAX_ALLOWED_BUFFERS, - check_notify_buffers, NULL, NULL - }, - - { - {"serializable_buffers", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the size of the dedicated buffer pool used for the serializable transaction cache."), - NULL, - GUC_UNIT_BLOCKS - }, - &serializable_buffers, - 32, 16, SLRU_MAX_ALLOWED_BUFFERS, - check_serial_buffers, NULL, NULL - }, - - { - {"subtransaction_buffers", PGC_POSTMASTER, 
RESOURCES_MEM, - gettext_noop("Sets the size of the dedicated buffer pool used for the subtransaction cache."), - gettext_noop("0 means use a fraction of \"shared_buffers\"."), - GUC_UNIT_BLOCKS - }, - &subtransaction_buffers, - 0, 0, SLRU_MAX_ALLOWED_BUFFERS, - check_subtrans_buffers, NULL, NULL - }, - - { - {"transaction_buffers", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the size of the dedicated buffer pool used for the transaction status cache."), - gettext_noop("0 means use a fraction of \"shared_buffers\"."), - GUC_UNIT_BLOCKS - }, - &transaction_buffers, - 0, 0, SLRU_MAX_ALLOWED_BUFFERS, - check_transaction_buffers, NULL, NULL - }, - - { - {"temp_buffers", PGC_USERSET, RESOURCES_MEM, - gettext_noop("Sets the maximum number of temporary buffers used by each session."), - NULL, - GUC_UNIT_BLOCKS | GUC_EXPLAIN - }, - &num_temp_buffers, - 1024, 100, INT_MAX / 2, - check_temp_buffers, NULL, NULL - }, - - { - {"port", PGC_POSTMASTER, CONN_AUTH_SETTINGS, - gettext_noop("Sets the TCP port the server listens on."), - NULL - }, - &PostPortNumber, - DEF_PGPORT, 1, 65535, - NULL, NULL, NULL - }, - - { - {"unix_socket_permissions", PGC_POSTMASTER, CONN_AUTH_SETTINGS, - gettext_noop("Sets the access permissions of the Unix-domain socket."), - gettext_noop("Unix-domain sockets use the usual Unix file system " - "permission set. The parameter value is expected " - "to be a numeric mode specification in the form " - "accepted by the chmod and umask system calls. " - "(To use the customary octal format the number must " - "start with a 0 (zero).)") - }, - &Unix_socket_permissions, - 0777, 0000, 0777, - NULL, NULL, show_unix_socket_permissions - }, - - { - {"log_file_mode", PGC_SIGHUP, LOGGING_WHERE, - gettext_noop("Sets the file permissions for log files."), - gettext_noop("The parameter value is expected " - "to be a numeric mode specification in the form " - "accepted by the chmod and umask system calls. " - "(To use the customary octal format the number must " - "start with a 0 (zero).)") - }, - &Log_file_mode, - 0600, 0000, 0777, - NULL, NULL, show_log_file_mode - }, - - - { - {"data_directory_mode", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows the mode of the data directory."), - gettext_noop("The parameter value is a numeric mode specification " - "in the form accepted by the chmod and umask system " - "calls. (To use the customary octal format the number " - "must start with a 0 (zero).)"), - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED - }, - &data_directory_mode, - 0700, 0000, 0777, - NULL, NULL, show_data_directory_mode - }, - - { - {"work_mem", PGC_USERSET, RESOURCES_MEM, - gettext_noop("Sets the maximum memory to be used for query workspaces."), - gettext_noop("This much memory can be used by each internal " - "sort operation and hash table before switching to " - "temporary disk files."), - GUC_UNIT_KB | GUC_EXPLAIN - }, - &work_mem, - 4096, 64, MAX_KILOBYTES, - NULL, NULL, NULL - }, - - /* - * Dynamic shared memory has a higher overhead than local memory contexts, - * so when testing low-memory scenarios that could use shared memory, the - * recommended minimum is 1MB. 
- */ - { - {"maintenance_work_mem", PGC_USERSET, RESOURCES_MEM, - gettext_noop("Sets the maximum memory to be used for maintenance operations."), - gettext_noop("This includes operations such as VACUUM and CREATE INDEX."), - GUC_UNIT_KB - }, - &maintenance_work_mem, - 65536, 64, MAX_KILOBYTES, - NULL, NULL, NULL - }, - - { - {"logical_decoding_work_mem", PGC_USERSET, RESOURCES_MEM, - gettext_noop("Sets the maximum memory to be used for logical decoding."), - gettext_noop("This much memory can be used by each internal " - "reorder buffer before spilling to disk."), - GUC_UNIT_KB - }, - &logical_decoding_work_mem, - 65536, 64, MAX_KILOBYTES, - NULL, NULL, NULL - }, - - /* - * We use the hopefully-safely-small value of 100kB as the compiled-in - * default for max_stack_depth. InitializeGUCOptions will increase it if - * possible, depending on the actual platform-specific stack limit. - */ - { - {"max_stack_depth", PGC_SUSET, RESOURCES_MEM, - gettext_noop("Sets the maximum stack depth, in kilobytes."), - NULL, - GUC_UNIT_KB - }, - &max_stack_depth, - 100, 100, MAX_KILOBYTES, - check_max_stack_depth, assign_max_stack_depth, NULL - }, - - { - {"temp_file_limit", PGC_SUSET, RESOURCES_DISK, - gettext_noop("Limits the total size of all temporary files used by each process."), - gettext_noop("-1 means no limit."), - GUC_UNIT_KB - }, - &temp_file_limit, - -1, -1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"vacuum_cost_page_hit", PGC_USERSET, VACUUM_COST_DELAY, - gettext_noop("Vacuum cost for a page found in the buffer cache."), - NULL - }, - &VacuumCostPageHit, - 1, 0, 10000, - NULL, NULL, NULL - }, - - { - {"vacuum_cost_page_miss", PGC_USERSET, VACUUM_COST_DELAY, - gettext_noop("Vacuum cost for a page not found in the buffer cache."), - NULL - }, - &VacuumCostPageMiss, - 2, 0, 10000, - NULL, NULL, NULL - }, - - { - {"vacuum_cost_page_dirty", PGC_USERSET, VACUUM_COST_DELAY, - gettext_noop("Vacuum cost for a page dirtied by vacuum."), - NULL - }, - &VacuumCostPageDirty, - 20, 0, 10000, - NULL, NULL, NULL - }, - - { - {"vacuum_cost_limit", PGC_USERSET, VACUUM_COST_DELAY, - gettext_noop("Vacuum cost amount available before napping."), - NULL - }, - &VacuumCostLimit, - 200, 1, 10000, - NULL, NULL, NULL - }, - - { - {"autovacuum_vacuum_cost_limit", PGC_SIGHUP, VACUUM_AUTOVACUUM, - gettext_noop("Vacuum cost amount available before napping, for autovacuum."), - gettext_noop("-1 means use \"vacuum_cost_limit\".") - }, - &autovacuum_vac_cost_limit, - -1, -1, 10000, - NULL, NULL, NULL - }, - - { - {"max_files_per_process", PGC_POSTMASTER, RESOURCES_KERNEL, - gettext_noop("Sets the maximum number of files each server process is allowed to open simultaneously."), - NULL - }, - &max_files_per_process, - 1000, 64, INT_MAX, - NULL, NULL, NULL - }, - - /* - * See also CheckRequiredParameterValues() if this parameter changes - */ - { - {"max_prepared_transactions", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the maximum number of simultaneously prepared transactions."), - NULL - }, - &max_prepared_xacts, - 0, 0, MAX_BACKENDS, - NULL, NULL, NULL - }, - -#ifdef LOCK_DEBUG - { - {"trace_lock_oidmin", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Sets the minimum OID of tables for tracking locks."), - gettext_noop("Is used to avoid output on system tables."), - GUC_NOT_IN_SAMPLE - }, - &Trace_lock_oidmin, - FirstNormalObjectId, 0, INT_MAX, - NULL, NULL, NULL - }, - { - {"trace_lock_table", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Sets the OID of the table with unconditionally lock tracing."), - NULL, - 
GUC_NOT_IN_SAMPLE - }, - &Trace_lock_table, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, -#endif - - { - {"statement_timeout", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the maximum allowed duration of any statement."), - gettext_noop("0 disables the timeout."), - GUC_UNIT_MS - }, - &StatementTimeout, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"lock_timeout", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the maximum allowed duration of any wait for a lock."), - gettext_noop("0 disables the timeout."), - GUC_UNIT_MS - }, - &LockTimeout, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"idle_in_transaction_session_timeout", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the maximum allowed idle time between queries, when in a transaction."), - gettext_noop("0 disables the timeout."), - GUC_UNIT_MS - }, - &IdleInTransactionSessionTimeout, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"transaction_timeout", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the maximum allowed duration of any transaction within a session (not a prepared transaction)."), - gettext_noop("0 disables the timeout."), - GUC_UNIT_MS - }, - &TransactionTimeout, - 0, 0, INT_MAX, - NULL, assign_transaction_timeout, NULL - }, - - { - {"idle_session_timeout", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the maximum allowed idle time between queries, when not in a transaction."), - gettext_noop("0 disables the timeout."), - GUC_UNIT_MS - }, - &IdleSessionTimeout, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"vacuum_freeze_min_age", PGC_USERSET, VACUUM_FREEZING, - gettext_noop("Minimum age at which VACUUM should freeze a table row."), - NULL - }, - &vacuum_freeze_min_age, - 50000000, 0, 1000000000, - NULL, NULL, NULL - }, - - { - {"vacuum_freeze_table_age", PGC_USERSET, VACUUM_FREEZING, - gettext_noop("Age at which VACUUM should scan whole table to freeze tuples."), - NULL - }, - &vacuum_freeze_table_age, - 150000000, 0, 2000000000, - NULL, NULL, NULL - }, - - { - {"vacuum_multixact_freeze_min_age", PGC_USERSET, VACUUM_FREEZING, - gettext_noop("Minimum age at which VACUUM should freeze a MultiXactId in a table row."), - NULL - }, - &vacuum_multixact_freeze_min_age, - 5000000, 0, 1000000000, - NULL, NULL, NULL - }, - - { - {"vacuum_multixact_freeze_table_age", PGC_USERSET, VACUUM_FREEZING, - gettext_noop("Multixact age at which VACUUM should scan whole table to freeze tuples."), - NULL - }, - &vacuum_multixact_freeze_table_age, - 150000000, 0, 2000000000, - NULL, NULL, NULL - }, - - { - {"vacuum_failsafe_age", PGC_USERSET, VACUUM_FREEZING, - gettext_noop("Age at which VACUUM should trigger failsafe to avoid a wraparound outage."), - NULL - }, - &vacuum_failsafe_age, - 1600000000, 0, 2100000000, - NULL, NULL, NULL - }, - { - {"vacuum_multixact_failsafe_age", PGC_USERSET, VACUUM_FREEZING, - gettext_noop("Multixact age at which VACUUM should trigger failsafe to avoid a wraparound outage."), - NULL - }, - &vacuum_multixact_failsafe_age, - 1600000000, 0, 2100000000, - NULL, NULL, NULL - }, - - /* - * See also CheckRequiredParameterValues() if this parameter changes - */ - { - {"max_locks_per_transaction", PGC_POSTMASTER, LOCK_MANAGEMENT, - gettext_noop("Sets the maximum number of locks per transaction."), - gettext_noop("The shared lock table is sized on the assumption that at most " - "\"max_locks_per_transaction\" objects per server process or prepared " - "transaction will need to be locked at any one time.") - }, - &max_locks_per_xact, - 64, 10, INT_MAX, - 
NULL, NULL, NULL - }, - - { - {"max_pred_locks_per_transaction", PGC_POSTMASTER, LOCK_MANAGEMENT, - gettext_noop("Sets the maximum number of predicate locks per transaction."), - gettext_noop("The shared predicate lock table is sized on the assumption that " - "at most \"max_pred_locks_per_transaction\" objects per server process " - "or prepared transaction will need to be locked at any one time.") - }, - &max_predicate_locks_per_xact, - 64, 10, INT_MAX, - NULL, NULL, NULL - }, - - { - {"max_pred_locks_per_relation", PGC_SIGHUP, LOCK_MANAGEMENT, - gettext_noop("Sets the maximum number of predicate-locked pages and tuples per relation."), - gettext_noop("If more than this total of pages and tuples in the same relation are locked " - "by a connection, those locks are replaced by a relation-level lock.") - }, - &max_predicate_locks_per_relation, - -2, INT_MIN, INT_MAX, - NULL, NULL, NULL - }, - - { - {"max_pred_locks_per_page", PGC_SIGHUP, LOCK_MANAGEMENT, - gettext_noop("Sets the maximum number of predicate-locked tuples per page."), - gettext_noop("If more than this number of tuples on the same page are locked " - "by a connection, those locks are replaced by a page-level lock.") - }, - &max_predicate_locks_per_page, - 2, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"authentication_timeout", PGC_SIGHUP, CONN_AUTH_AUTH, - gettext_noop("Sets the maximum allowed time to complete client authentication."), - NULL, - GUC_UNIT_S - }, - &AuthenticationTimeout, - 60, 1, 600, - NULL, NULL, NULL - }, - - { - /* Not for general use */ - {"pre_auth_delay", PGC_SIGHUP, DEVELOPER_OPTIONS, - gettext_noop("Sets the amount of time to wait before " - "authentication on connection startup."), - gettext_noop("This allows attaching a debugger to the process."), - GUC_NOT_IN_SAMPLE | GUC_UNIT_S - }, - &PreAuthDelay, - 0, 0, 60, - NULL, NULL, NULL - }, - - { - {"max_notify_queue_pages", PGC_POSTMASTER, RESOURCES_DISK, - gettext_noop("Sets the maximum number of allocated pages for NOTIFY / LISTEN queue."), - NULL, - }, - &max_notify_queue_pages, - 1048576, 64, INT_MAX, - NULL, NULL, NULL - }, - - { - {"wal_decode_buffer_size", PGC_POSTMASTER, WAL_RECOVERY, - gettext_noop("Buffer size for reading ahead in the WAL during recovery."), - gettext_noop("Maximum distance to read ahead in the WAL to prefetch referenced data blocks."), - GUC_UNIT_BYTE - }, - &wal_decode_buffer_size, - 512 * 1024, 64 * 1024, MaxAllocSize, - NULL, NULL, NULL - }, - - { - {"wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, - gettext_noop("Sets the size of WAL files held for standby servers."), - NULL, - GUC_UNIT_MB - }, - &wal_keep_size_mb, - 0, 0, MAX_KILOBYTES, - NULL, NULL, NULL - }, - - { - {"min_wal_size", PGC_SIGHUP, WAL_CHECKPOINTS, - gettext_noop("Sets the minimum size to shrink the WAL to."), - NULL, - GUC_UNIT_MB - }, - &min_wal_size_mb, - DEFAULT_MIN_WAL_SEGS * (DEFAULT_XLOG_SEG_SIZE / (1024 * 1024)), - 2, MAX_KILOBYTES, - NULL, NULL, NULL - }, - - { - {"max_wal_size", PGC_SIGHUP, WAL_CHECKPOINTS, - gettext_noop("Sets the WAL size that triggers a checkpoint."), - NULL, - GUC_UNIT_MB - }, - &max_wal_size_mb, - DEFAULT_MAX_WAL_SEGS * (DEFAULT_XLOG_SEG_SIZE / (1024 * 1024)), - 2, MAX_KILOBYTES, - NULL, assign_max_wal_size, NULL - }, - - { - {"checkpoint_timeout", PGC_SIGHUP, WAL_CHECKPOINTS, - gettext_noop("Sets the maximum time between automatic WAL checkpoints."), - NULL, - GUC_UNIT_S - }, - &CheckPointTimeout, - 300, 30, 86400, - NULL, NULL, NULL - }, - - { - {"checkpoint_warning", PGC_SIGHUP, WAL_CHECKPOINTS, - gettext_noop("Sets 
the maximum time before warning if checkpoints " - "triggered by WAL volume happen too frequently."), - gettext_noop("Write a message to the server log if checkpoints " - "caused by the filling of WAL segment files happen more " - "frequently than this amount of time. " - "0 disables the warning."), - GUC_UNIT_S - }, - &CheckPointWarning, - 30, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"checkpoint_flush_after", PGC_SIGHUP, WAL_CHECKPOINTS, - gettext_noop("Number of pages after which previously performed writes are flushed to disk."), - gettext_noop("0 disables forced writeback."), - GUC_UNIT_BLOCKS - }, - &checkpoint_flush_after, - DEFAULT_CHECKPOINT_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES, - NULL, NULL, NULL - }, - - { - {"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS, - gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."), - gettext_noop("-1 means use a fraction of \"shared_buffers\"."), - GUC_UNIT_XBLOCKS - }, - &XLOGbuffers, - -1, -1, (INT_MAX / XLOG_BLCKSZ), - check_wal_buffers, NULL, NULL - }, - - { - {"wal_writer_delay", PGC_SIGHUP, WAL_SETTINGS, - gettext_noop("Time between WAL flushes performed in the WAL writer."), - NULL, - GUC_UNIT_MS - }, - &WalWriterDelay, - 200, 1, 10000, - NULL, NULL, NULL - }, - - { - {"wal_writer_flush_after", PGC_SIGHUP, WAL_SETTINGS, - gettext_noop("Amount of WAL written out by WAL writer that triggers a flush."), - NULL, - GUC_UNIT_XBLOCKS - }, - &WalWriterFlushAfter, - DEFAULT_WAL_WRITER_FLUSH_AFTER, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"wal_skip_threshold", PGC_USERSET, WAL_SETTINGS, - gettext_noop("Minimum size of new file to fsync instead of writing WAL."), - NULL, - GUC_UNIT_KB - }, - &wal_skip_threshold, - 2048, 0, MAX_KILOBYTES, - NULL, NULL, NULL - }, - - { - {"max_wal_senders", PGC_POSTMASTER, REPLICATION_SENDING, - gettext_noop("Sets the maximum number of simultaneously running WAL sender processes."), - NULL - }, - &max_wal_senders, - 10, 0, MAX_BACKENDS, - NULL, NULL, NULL - }, - - { - /* see max_wal_senders */ - {"max_replication_slots", PGC_POSTMASTER, REPLICATION_SENDING, - gettext_noop("Sets the maximum number of simultaneously defined replication slots."), - NULL - }, - &max_replication_slots, - 10, 0, MAX_BACKENDS /* XXX? */ , - NULL, NULL, NULL - }, - - { - {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, - gettext_noop("Sets the maximum WAL size that can be reserved by replication slots."), - gettext_noop("Replication slots will be marked as failed, and segments released " - "for deletion or recycling, if this much space is occupied by WAL on disk. 
" - "-1 means no maximum."), - GUC_UNIT_MB - }, - &max_slot_wal_keep_size_mb, - -1, -1, MAX_KILOBYTES, - check_max_slot_wal_keep_size, NULL, NULL - }, - - { - {"wal_sender_timeout", PGC_USERSET, REPLICATION_SENDING, - gettext_noop("Sets the maximum time to wait for WAL replication."), - NULL, - GUC_UNIT_MS - }, - &wal_sender_timeout, - 60 * 1000, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"idle_replication_slot_timeout", PGC_SIGHUP, REPLICATION_SENDING, - gettext_noop("Sets the duration a replication slot can remain idle before " - "it is invalidated."), - NULL, - GUC_UNIT_MIN - }, - &idle_replication_slot_timeout_mins, - 0, 0, INT_MAX / SECS_PER_MINUTE, - check_idle_replication_slot_timeout, NULL, NULL - }, - - { - {"commit_delay", PGC_SUSET, WAL_SETTINGS, - gettext_noop("Sets the delay in microseconds between transaction commit and " - "flushing WAL to disk."), - NULL - /* we have no microseconds designation, so can't supply units here */ - }, - &CommitDelay, - 0, 0, 100000, - NULL, NULL, NULL - }, - - { - {"commit_siblings", PGC_USERSET, WAL_SETTINGS, - gettext_noop("Sets the minimum number of concurrent open transactions " - "required before performing \"commit_delay\"."), - NULL - }, - &CommitSiblings, - 5, 0, 1000, - NULL, NULL, NULL - }, - - { - {"extra_float_digits", PGC_USERSET, CLIENT_CONN_LOCALE, - gettext_noop("Sets the number of digits displayed for floating-point values."), - gettext_noop("This affects real, double precision, and geometric data types. " - "A zero or negative parameter value is added to the standard " - "number of digits (FLT_DIG or DBL_DIG as appropriate). " - "Any value greater than zero selects precise output mode.") - }, - &extra_float_digits, - 1, -15, 3, - NULL, NULL, NULL - }, - - { - {"log_min_duration_sample", PGC_SUSET, LOGGING_WHEN, - gettext_noop("Sets the minimum execution time above which " - "a sample of statements will be logged." - " Sampling is determined by \"log_statement_sample_rate\"."), - gettext_noop("-1 disables sampling. 0 means sample all statements."), - GUC_UNIT_MS - }, - &log_min_duration_sample, - -1, -1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"log_min_duration_statement", PGC_SUSET, LOGGING_WHEN, - gettext_noop("Sets the minimum execution time above which " - "all statements will be logged."), - gettext_noop("-1 disables logging statement durations. 0 means log all statement durations."), - GUC_UNIT_MS - }, - &log_min_duration_statement, - -1, -1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"log_autovacuum_min_duration", PGC_SIGHUP, LOGGING_WHAT, - gettext_noop("Sets the minimum execution time above which " - "autovacuum actions will be logged."), - gettext_noop("-1 disables logging autovacuum actions. 
0 means log all autovacuum actions."), - GUC_UNIT_MS - }, - &Log_autovacuum_min_duration, - 600000, -1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"log_parameter_max_length", PGC_SUSET, LOGGING_WHAT, - gettext_noop("Sets the maximum length in bytes of data logged for bind " - "parameter values when logging statements."), - gettext_noop("-1 means log values in full."), - GUC_UNIT_BYTE - }, - &log_parameter_max_length, - -1, -1, INT_MAX / 2, - NULL, NULL, NULL - }, - - { - {"log_parameter_max_length_on_error", PGC_USERSET, LOGGING_WHAT, - gettext_noop("Sets the maximum length in bytes of data logged for bind " - "parameter values when logging statements, on error."), - gettext_noop("-1 means log values in full."), - GUC_UNIT_BYTE - }, - &log_parameter_max_length_on_error, - 0, -1, INT_MAX / 2, - NULL, NULL, NULL - }, - - { - {"bgwriter_delay", PGC_SIGHUP, RESOURCES_BGWRITER, - gettext_noop("Background writer sleep time between rounds."), - NULL, - GUC_UNIT_MS - }, - &BgWriterDelay, - 200, 10, 10000, - NULL, NULL, NULL - }, - - { - {"bgwriter_lru_maxpages", PGC_SIGHUP, RESOURCES_BGWRITER, - gettext_noop("Background writer maximum number of LRU pages to flush per round."), - gettext_noop("0 disables background writing.") - }, - &bgwriter_lru_maxpages, - 100, 0, INT_MAX / 2, /* Same upper limit as shared_buffers */ - NULL, NULL, NULL - }, - - { - {"bgwriter_flush_after", PGC_SIGHUP, RESOURCES_BGWRITER, - gettext_noop("Number of pages after which previously performed writes are flushed to disk."), - gettext_noop("0 disables forced writeback."), - GUC_UNIT_BLOCKS - }, - &bgwriter_flush_after, - DEFAULT_BGWRITER_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES, - NULL, NULL, NULL - }, - - { - {"effective_io_concurrency", - PGC_USERSET, - RESOURCES_IO, - gettext_noop("Number of simultaneous requests that can be handled efficiently by the disk subsystem."), - gettext_noop("0 disables simultaneous requests."), - GUC_EXPLAIN - }, - &effective_io_concurrency, - DEFAULT_EFFECTIVE_IO_CONCURRENCY, - 0, MAX_IO_CONCURRENCY, - NULL, NULL, NULL - }, - - { - {"maintenance_io_concurrency", - PGC_USERSET, - RESOURCES_IO, - gettext_noop("A variant of \"effective_io_concurrency\" that is used for maintenance work."), - gettext_noop("0 disables simultaneous requests."), - GUC_EXPLAIN - }, - &maintenance_io_concurrency, - DEFAULT_MAINTENANCE_IO_CONCURRENCY, - 0, MAX_IO_CONCURRENCY, - NULL, assign_maintenance_io_concurrency, - NULL - }, - - { - {"io_max_combine_limit", - PGC_POSTMASTER, - RESOURCES_IO, - gettext_noop("Server-wide limit that clamps io_combine_limit."), - NULL, - GUC_UNIT_BLOCKS - }, - &io_max_combine_limit, - DEFAULT_IO_COMBINE_LIMIT, - 1, MAX_IO_COMBINE_LIMIT, - NULL, assign_io_max_combine_limit, NULL - }, - - { - {"io_combine_limit", - PGC_USERSET, - RESOURCES_IO, - gettext_noop("Limit on the size of data reads and writes."), - NULL, - GUC_UNIT_BLOCKS - }, - &io_combine_limit_guc, - DEFAULT_IO_COMBINE_LIMIT, - 1, MAX_IO_COMBINE_LIMIT, - NULL, assign_io_combine_limit, NULL - }, - - { - {"io_max_concurrency", - PGC_POSTMASTER, - RESOURCES_IO, - gettext_noop("Max number of IOs that one process can execute simultaneously."), - NULL, - }, - &io_max_concurrency, - -1, -1, 1024, - check_io_max_concurrency, NULL, NULL - }, - - { - {"io_workers", - PGC_SIGHUP, - RESOURCES_IO, - gettext_noop("Number of IO worker processes, for io_method=worker."), - NULL, - }, - &io_workers, - 3, 1, MAX_IO_WORKERS, - NULL, NULL, NULL - }, - - { - {"backend_flush_after", PGC_USERSET, RESOURCES_IO, - gettext_noop("Number of pages 
-	{
-		{"backend_flush_after", PGC_USERSET, RESOURCES_IO,
-			gettext_noop("Number of pages after which previously performed writes are flushed to disk."),
-			gettext_noop("0 disables forced writeback."),
-			GUC_UNIT_BLOCKS
-		},
-		&backend_flush_after,
-		DEFAULT_BACKEND_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"max_worker_processes",
-			PGC_POSTMASTER,
-			RESOURCES_WORKER_PROCESSES,
-			gettext_noop("Maximum number of concurrent worker processes."),
-			NULL,
-		},
-		&max_worker_processes,
-		8, 0, MAX_BACKENDS,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"max_logical_replication_workers",
-			PGC_POSTMASTER,
-			REPLICATION_SUBSCRIBERS,
-			gettext_noop("Maximum number of logical replication worker processes."),
-			NULL,
-		},
-		&max_logical_replication_workers,
-		4, 0, MAX_BACKENDS,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"max_sync_workers_per_subscription",
-			PGC_SIGHUP,
-			REPLICATION_SUBSCRIBERS,
-			gettext_noop("Maximum number of table synchronization workers per subscription."),
-			NULL,
-		},
-		&max_sync_workers_per_subscription,
-		2, 0, MAX_BACKENDS,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"max_parallel_apply_workers_per_subscription",
-			PGC_SIGHUP,
-			REPLICATION_SUBSCRIBERS,
-			gettext_noop("Maximum number of parallel apply workers per subscription."),
-			NULL,
-		},
-		&max_parallel_apply_workers_per_subscription,
-		2, 0, MAX_PARALLEL_WORKER_LIMIT,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"max_active_replication_origins",
-			PGC_POSTMASTER,
-			REPLICATION_SUBSCRIBERS,
-			gettext_noop("Sets the maximum number of active replication origins."),
-			NULL
-		},
-		&max_active_replication_origins,
-		10, 0, MAX_BACKENDS,
-		NULL, NULL, NULL
-	},
-
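The worker-process settings above are interdependent: per the PostgreSQL documentation, logical replication workers and parallel workers are all drawn from the pool sized by max_worker_processes, and parallel workers are further capped by max_parallel_workers. An illustrative sketch of the documented clamping only, not the actual planner source:

#include "postgres.h"

/* real GUC globals, declared in PostgreSQL headers */
extern int	max_parallel_workers_per_gather;
extern int	max_parallel_workers;

/*
 * Illustrative only: a Gather node cannot use more workers than either
 * limit allows, and every worker ultimately comes out of the
 * max_worker_processes pool.
 */
static int
effective_gather_workers(int planned)
{
	int			n = planned;

	n = Min(n, max_parallel_workers_per_gather);
	n = Min(n, max_parallel_workers);	/* itself bounded by max_worker_processes */
	return n;
}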
-	{
-		{"log_rotation_age", PGC_SIGHUP, LOGGING_WHERE,
-			gettext_noop("Sets the amount of time to wait before forcing "
-						 "log file rotation."),
-			gettext_noop("0 disables time-based creation of new log files."),
-			GUC_UNIT_MIN
-		},
-		&Log_RotationAge,
-		HOURS_PER_DAY * MINS_PER_HOUR, 0, INT_MAX / SECS_PER_MINUTE,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"log_rotation_size", PGC_SIGHUP, LOGGING_WHERE,
-			gettext_noop("Sets the maximum size a log file can reach before "
-						 "being rotated."),
-			gettext_noop("0 disables size-based creation of new log files."),
-			GUC_UNIT_KB
-		},
-		&Log_RotationSize,
-		10 * 1024, 0, INT_MAX,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"max_function_args", PGC_INTERNAL, PRESET_OPTIONS,
-			gettext_noop("Shows the maximum number of function arguments."),
-			NULL,
-			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-		},
-		&max_function_args,
-		FUNC_MAX_ARGS, FUNC_MAX_ARGS, FUNC_MAX_ARGS,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"max_index_keys", PGC_INTERNAL, PRESET_OPTIONS,
-			gettext_noop("Shows the maximum number of index keys."),
-			NULL,
-			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-		},
-		&max_index_keys,
-		INDEX_MAX_KEYS, INDEX_MAX_KEYS, INDEX_MAX_KEYS,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"max_identifier_length", PGC_INTERNAL, PRESET_OPTIONS,
-			gettext_noop("Shows the maximum identifier length."),
-			NULL,
-			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-		},
-		&max_identifier_length,
-		NAMEDATALEN - 1, NAMEDATALEN - 1, NAMEDATALEN - 1,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"block_size", PGC_INTERNAL, PRESET_OPTIONS,
-			gettext_noop("Shows the size of a disk block."),
-			NULL,
-			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-		},
-		&block_size,
-		BLCKSZ, BLCKSZ, BLCKSZ,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"segment_size", PGC_INTERNAL, PRESET_OPTIONS,
-			gettext_noop("Shows the number of pages per disk file."),
-			NULL,
-			GUC_UNIT_BLOCKS | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-		},
-		&segment_size,
-		RELSEG_SIZE, RELSEG_SIZE, RELSEG_SIZE,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"wal_block_size", PGC_INTERNAL, PRESET_OPTIONS,
-			gettext_noop("Shows the block size in the write ahead log."),
-			NULL,
-			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-		},
-		&wal_block_size,
-		XLOG_BLCKSZ, XLOG_BLCKSZ, XLOG_BLCKSZ,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"wal_retrieve_retry_interval", PGC_SIGHUP, REPLICATION_STANDBY,
-			gettext_noop("Sets the time to wait before retrying to retrieve WAL "
-						 "after a failed attempt."),
-			NULL,
-			GUC_UNIT_MS
-		},
-		&wal_retrieve_retry_interval,
-		5000, 1, INT_MAX,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS,
-			gettext_noop("Shows the size of write ahead log segments."),
-			NULL,
-			GUC_UNIT_BYTE | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED
-		},
-		&wal_segment_size,
-		DEFAULT_XLOG_SEG_SIZE,
-		WalSegMinSize,
-		WalSegMaxSize,
-		check_wal_segment_size, NULL, NULL
-	},
-
-	{
-		{"wal_summary_keep_time", PGC_SIGHUP, WAL_SUMMARIZATION,
-			gettext_noop("Time for which WAL summary files should be kept."),
-			gettext_noop("0 disables automatic summary file deletion."),
-			GUC_UNIT_MIN,
-		},
-		&wal_summary_keep_time,
-		10 * HOURS_PER_DAY * MINS_PER_HOUR, /* 10 days */
-		0,
-		INT_MAX / SECS_PER_MINUTE,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"autovacuum_naptime", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Time to sleep between autovacuum runs."),
-			NULL,
-			GUC_UNIT_S
-		},
-		&autovacuum_naptime,
-		60, 1, INT_MAX / 1000,
-		NULL, NULL, NULL
-	},
-	{
-		{"autovacuum_vacuum_threshold", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Minimum number of tuple updates or deletes prior to vacuum."),
-			NULL
-		},
-		&autovacuum_vac_thresh,
-		50, 0, INT_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		{"autovacuum_vacuum_max_threshold", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Maximum number of tuple updates or deletes prior to vacuum."),
-			gettext_noop("-1 disables the maximum threshold.")
-		},
-		&autovacuum_vac_max_thresh,
-		100000000, -1, INT_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		{"autovacuum_vacuum_insert_threshold", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Minimum number of tuple inserts prior to vacuum."),
-			gettext_noop("-1 disables insert vacuums.")
-		},
-		&autovacuum_vac_ins_thresh,
-		1000, -1, INT_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		{"autovacuum_analyze_threshold", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Minimum number of tuple inserts, updates, or deletes prior to analyze."),
-			NULL
-		},
-		&autovacuum_anl_thresh,
-		50, 0, INT_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		/* see varsup.c for why this is PGC_POSTMASTER not PGC_SIGHUP */
-		{"autovacuum_freeze_max_age", PGC_POSTMASTER, VACUUM_AUTOVACUUM,
-			gettext_noop("Age at which to autovacuum a table to prevent transaction ID wraparound."),
-			NULL
-		},
-		&autovacuum_freeze_max_age,
-
-		/* see vacuum_failsafe_age if you change the upper-limit value. */
-		200000000, 100000, 2000000000,
-		NULL, NULL, NULL
-	},
-	{
-		/* see multixact.c for why this is PGC_POSTMASTER not PGC_SIGHUP */
-		{"autovacuum_multixact_freeze_max_age", PGC_POSTMASTER, VACUUM_AUTOVACUUM,
-			gettext_noop("Multixact age at which to autovacuum a table to prevent multixact wraparound."),
-			NULL
-		},
-		&autovacuum_multixact_freeze_max_age,
-		400000000, 10000, 2000000000,
-		NULL, NULL, NULL
-	},
-	{
-		/* see max_connections */
-		{"autovacuum_worker_slots", PGC_POSTMASTER, VACUUM_AUTOVACUUM,
-			gettext_noop("Sets the number of backend slots to allocate for autovacuum workers."),
-			NULL
-		},
-		&autovacuum_worker_slots,
-		16, 1, MAX_BACKENDS,
-		NULL, NULL, NULL
-	},
-	{
-		{"autovacuum_max_workers", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Sets the maximum number of simultaneously running autovacuum worker processes."),
-			NULL
-		},
-		&autovacuum_max_workers,
-		3, 1, MAX_BACKENDS,
-		NULL, NULL, NULL
-	},
-
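The threshold GUCs above combine with the scale factors defined later in the float table: per the documented autovacuum behavior, a table is vacuumed once its dead tuples exceed threshold + scale_factor * reltuples, optionally capped by autovacuum_vacuum_max_threshold. A sketch of that arithmetic under those documented rules, using the variable names from these tables (the real logic lives in autovacuum.c):

#include "postgres.h"

/* real GUC globals named in the tables in this file */
extern int	autovacuum_vac_thresh;
extern int	autovacuum_vac_max_thresh;
extern double autovacuum_vac_scale;

/* Sketch of the documented trigger condition, not the actual source. */
static bool
needs_autovacuum(double dead_tuples, double reltuples)
{
	double		vacthresh = autovacuum_vac_thresh +
		autovacuum_vac_scale * reltuples;

	if (autovacuum_vac_max_thresh >= 0)
		vacthresh = Min(vacthresh, (double) autovacuum_vac_max_thresh);

	return dead_tuples > vacthresh;
}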
" - "0 means use the system default."), - }, - &tcp_keepalives_count, - 0, 0, INT_MAX, - NULL, assign_tcp_keepalives_count, show_tcp_keepalives_count - }, - - { - {"gin_fuzzy_search_limit", PGC_USERSET, CLIENT_CONN_OTHER, - gettext_noop("Sets the maximum allowed result for exact search by GIN."), - gettext_noop("0 means no limit."), - }, - &GinFuzzySearchLimit, - 0, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"effective_cache_size", PGC_USERSET, QUERY_TUNING_COST, - gettext_noop("Sets the planner's assumption about the total size of the data caches."), - gettext_noop("That is, the total size of the caches (kernel cache and shared buffers) used for PostgreSQL data files. " - "This is measured in disk pages, which are normally 8 kB each."), - GUC_UNIT_BLOCKS | GUC_EXPLAIN, - }, - &effective_cache_size, - DEFAULT_EFFECTIVE_CACHE_SIZE, 1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"min_parallel_table_scan_size", PGC_USERSET, QUERY_TUNING_COST, - gettext_noop("Sets the minimum amount of table data for a parallel scan."), - gettext_noop("If the planner estimates that it will read a number of table pages too small to reach this limit, a parallel scan will not be considered."), - GUC_UNIT_BLOCKS | GUC_EXPLAIN, - }, - &min_parallel_table_scan_size, - (8 * 1024 * 1024) / BLCKSZ, 0, INT_MAX / 3, - NULL, NULL, NULL - }, - - { - {"min_parallel_index_scan_size", PGC_USERSET, QUERY_TUNING_COST, - gettext_noop("Sets the minimum amount of index data for a parallel scan."), - gettext_noop("If the planner estimates that it will read a number of index pages too small to reach this limit, a parallel scan will not be considered."), - GUC_UNIT_BLOCKS | GUC_EXPLAIN, - }, - &min_parallel_index_scan_size, - (512 * 1024) / BLCKSZ, 0, INT_MAX / 3, - NULL, NULL, NULL - }, - - { - /* Can't be set in postgresql.conf */ - {"server_version_num", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows the server version as an integer."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &server_version_num, - PG_VERSION_NUM, PG_VERSION_NUM, PG_VERSION_NUM, - NULL, NULL, NULL - }, - - { - {"log_temp_files", PGC_SUSET, LOGGING_WHAT, - gettext_noop("Log the use of temporary files larger than this number of kilobytes."), - gettext_noop("-1 disables logging temporary files. 
0 means log all temporary files."), - GUC_UNIT_KB - }, - &log_temp_files, - -1, -1, INT_MAX, - NULL, NULL, NULL - }, - - { - {"track_activity_query_size", PGC_POSTMASTER, STATS_CUMULATIVE, - gettext_noop("Sets the size reserved for pg_stat_activity.query, in bytes."), - NULL, - GUC_UNIT_BYTE - }, - &pgstat_track_activity_query_size, - 1024, 100, 1048576, - NULL, NULL, NULL - }, - - { - {"gin_pending_list_limit", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the maximum size of the pending list for GIN index."), - NULL, - GUC_UNIT_KB - }, - &gin_pending_list_limit, - 4096, 64, MAX_KILOBYTES, - NULL, NULL, NULL - }, - - { - {"tcp_user_timeout", PGC_USERSET, CONN_AUTH_TCP, - gettext_noop("TCP user timeout."), - gettext_noop("0 means use the system default."), - GUC_UNIT_MS - }, - &tcp_user_timeout, - 0, 0, INT_MAX, - NULL, assign_tcp_user_timeout, show_tcp_user_timeout - }, - - { - {"huge_page_size", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("The size of huge page that should be requested."), - gettext_noop("0 means use the system default."), - GUC_UNIT_KB - }, - &huge_page_size, - 0, 0, INT_MAX, - check_huge_page_size, NULL, NULL - }, - - { - {"debug_discard_caches", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Aggressively flush system caches for debugging purposes."), - gettext_noop("0 means use normal caching behavior."), - GUC_NOT_IN_SAMPLE - }, - &debug_discard_caches, -#ifdef DISCARD_CACHES_ENABLED - /* Set default based on older compile-time-only cache clobber macros */ -#if defined(CLOBBER_CACHE_RECURSIVELY) - 3, -#elif defined(CLOBBER_CACHE_ALWAYS) - 1, -#else - 0, -#endif - 0, 5, -#else /* not DISCARD_CACHES_ENABLED */ - 0, 0, 0, -#endif /* not DISCARD_CACHES_ENABLED */ - NULL, NULL, NULL - }, - - { - {"client_connection_check_interval", PGC_USERSET, CONN_AUTH_TCP, - gettext_noop("Sets the time interval between checks for disconnection while running queries."), - gettext_noop("0 disables connection checks."), - GUC_UNIT_MS - }, - &client_connection_check_interval, - 0, 0, INT_MAX, - check_client_connection_check_interval, NULL, NULL - }, - - { - {"log_startup_progress_interval", PGC_SIGHUP, LOGGING_WHEN, - gettext_noop("Time between progress updates for " - "long-running startup operations."), - gettext_noop("0 disables progress updates."), - GUC_UNIT_MS, - }, - &log_startup_progress_interval, - 10000, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"scram_iterations", PGC_USERSET, CONN_AUTH_AUTH, - gettext_noop("Sets the iteration count for SCRAM secret generation."), - NULL, - GUC_REPORT - }, - &scram_sha_256_iterations, - SCRAM_SHA_256_DEFAULT_ITERATIONS, 1, INT_MAX, - NULL, NULL, NULL - }, - - /* End-of-list marker */ - { - {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL - } -}; - - -struct config_real ConfigureNamesReal[] = -{ - { - {"seq_page_cost", PGC_USERSET, QUERY_TUNING_COST, - gettext_noop("Sets the planner's estimate of the cost of a " - "sequentially fetched disk page."), - NULL, - GUC_EXPLAIN - }, - &seq_page_cost, - DEFAULT_SEQ_PAGE_COST, 0, DBL_MAX, - NULL, NULL, NULL - }, - { - {"random_page_cost", PGC_USERSET, QUERY_TUNING_COST, - gettext_noop("Sets the planner's estimate of the cost of a " - "nonsequentially fetched disk page."), - NULL, - GUC_EXPLAIN - }, - &random_page_cost, - DEFAULT_RANDOM_PAGE_COST, 0, DBL_MAX, - NULL, NULL, NULL - }, - { - {"cpu_tuple_cost", PGC_USERSET, QUERY_TUNING_COST, - gettext_noop("Sets the planner's estimate of the cost of " - "processing each tuple (row)."), - NULL, - GUC_EXPLAIN - }, - &cpu_tuple_cost, - 
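Each of these tables carries no explicit length; it is terminated by an all-NULL sentinel row, so callers walk entries until they hit a NULL name. A sketch of the idiom, assuming the array and the conventional gen member of the per-type structs (struct config_int embeds a struct config_generic named gen in guc_tables.h) are visible in the translation unit:

#include "postgres.h"
#include "utils/guc_tables.h"

extern struct config_int ConfigureNamesInt[];

/* Count the built-in integer GUCs by walking to the NULL-name sentinel. */
static int
count_int_gucs(void)
{
	int			n = 0;

	for (int i = 0; ConfigureNamesInt[i].gen.name != NULL; i++)
		n++;
	return n;
}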
-struct config_real ConfigureNamesReal[] =
-{
-	{
-		{"seq_page_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Sets the planner's estimate of the cost of a "
-						 "sequentially fetched disk page."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&seq_page_cost,
-		DEFAULT_SEQ_PAGE_COST, 0, DBL_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		{"random_page_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Sets the planner's estimate of the cost of a "
-						 "nonsequentially fetched disk page."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&random_page_cost,
-		DEFAULT_RANDOM_PAGE_COST, 0, DBL_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		{"cpu_tuple_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Sets the planner's estimate of the cost of "
-						 "processing each tuple (row)."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&cpu_tuple_cost,
-		DEFAULT_CPU_TUPLE_COST, 0, DBL_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		{"cpu_index_tuple_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Sets the planner's estimate of the cost of "
-						 "processing each index entry during an index scan."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&cpu_index_tuple_cost,
-		DEFAULT_CPU_INDEX_TUPLE_COST, 0, DBL_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		{"cpu_operator_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Sets the planner's estimate of the cost of "
-						 "processing each operator or function call."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&cpu_operator_cost,
-		DEFAULT_CPU_OPERATOR_COST, 0, DBL_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		{"parallel_tuple_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Sets the planner's estimate of the cost of "
-						 "passing each tuple (row) from worker to leader backend."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&parallel_tuple_cost,
-		DEFAULT_PARALLEL_TUPLE_COST, 0, DBL_MAX,
-		NULL, NULL, NULL
-	},
-	{
-		{"parallel_setup_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Sets the planner's estimate of the cost of "
-						 "starting up worker processes for parallel query."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&parallel_setup_cost,
-		DEFAULT_PARALLEL_SETUP_COST, 0, DBL_MAX,
-		NULL, NULL, NULL
-	},
-
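These cost parameters combine linearly in the planner's estimates; the documentation describes a plain sequential scan, for example, as roughly pages read times seq_page_cost plus rows emitted times cpu_tuple_cost. A back-of-envelope sketch of that documented model only (the real computation lives in src/backend/optimizer/path/costsize.c and also charges quals via cpu_operator_cost):

#include "postgres.h"

/* real GUC globals registered in the table above */
extern double seq_page_cost;
extern double cpu_tuple_cost;

/* Rough sequential-scan cost per the documented model; sketch only. */
static double
naive_seqscan_cost(double rel_pages, double rel_tuples)
{
	return seq_page_cost * rel_pages + cpu_tuple_cost * rel_tuples;
}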
-	{
-		{"jit_above_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Perform JIT compilation if query is more expensive."),
-			gettext_noop("-1 disables JIT compilation."),
-			GUC_EXPLAIN
-		},
-		&jit_above_cost,
-		100000, -1, DBL_MAX,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"jit_optimize_above_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Optimize JIT-compiled functions if query is more expensive."),
-			gettext_noop("-1 disables optimization."),
-			GUC_EXPLAIN
-		},
-		&jit_optimize_above_cost,
-		500000, -1, DBL_MAX,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"jit_inline_above_cost", PGC_USERSET, QUERY_TUNING_COST,
-			gettext_noop("Perform JIT inlining if query is more expensive."),
-			gettext_noop("-1 disables inlining."),
-			GUC_EXPLAIN
-		},
-		&jit_inline_above_cost,
-		500000, -1, DBL_MAX,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"cursor_tuple_fraction", PGC_USERSET, QUERY_TUNING_OTHER,
-			gettext_noop("Sets the planner's estimate of the fraction of "
-						 "a cursor's rows that will be retrieved."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&cursor_tuple_fraction,
-		DEFAULT_CURSOR_TUPLE_FRACTION, 0.0, 1.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"recursive_worktable_factor", PGC_USERSET, QUERY_TUNING_OTHER,
-			gettext_noop("Sets the planner's estimate of the average size "
-						 "of a recursive query's working table."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&recursive_worktable_factor,
-		DEFAULT_RECURSIVE_WORKTABLE_FACTOR, 0.001, 1000000.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"geqo_selection_bias", PGC_USERSET, QUERY_TUNING_GEQO,
-			gettext_noop("GEQO: selective pressure within the population."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&Geqo_selection_bias,
-		DEFAULT_GEQO_SELECTION_BIAS,
-		MIN_GEQO_SELECTION_BIAS, MAX_GEQO_SELECTION_BIAS,
-		NULL, NULL, NULL
-	},
-	{
-		{"geqo_seed", PGC_USERSET, QUERY_TUNING_GEQO,
-			gettext_noop("GEQO: seed for random path selection."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&Geqo_seed,
-		0.0, 0.0, 1.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"hash_mem_multiplier", PGC_USERSET, RESOURCES_MEM,
-			gettext_noop("Multiple of \"work_mem\" to use for hash tables."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&hash_mem_multiplier,
-		2.0, 1.0, 1000.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"bgwriter_lru_multiplier", PGC_SIGHUP, RESOURCES_BGWRITER,
-			gettext_noop("Multiple of the average buffer usage to free per round."),
-			NULL
-		},
-		&bgwriter_lru_multiplier,
-		2.0, 0.0, 10.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"seed", PGC_USERSET, UNGROUPED,
-			gettext_noop("Sets the seed for random-number generation."),
-			NULL,
-			GUC_NO_SHOW_ALL | GUC_NO_RESET | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-		},
-		&phony_random_seed,
-		0.0, -1.0, 1.0,
-		check_random_seed, assign_random_seed, show_random_seed
-	},
-
-	{
-		{"vacuum_cost_delay", PGC_USERSET, VACUUM_COST_DELAY,
-			gettext_noop("Vacuum cost delay in milliseconds."),
-			NULL,
-			GUC_UNIT_MS
-		},
-		&VacuumCostDelay,
-		0, 0, 100,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"autovacuum_vacuum_cost_delay", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Vacuum cost delay in milliseconds, for autovacuum."),
-			gettext_noop("-1 means use \"vacuum_cost_delay\"."),
-			GUC_UNIT_MS
-		},
-		&autovacuum_vac_cost_delay,
-		2, -1, 100,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"autovacuum_vacuum_scale_factor", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Number of tuple updates or deletes prior to vacuum as a fraction of reltuples."),
-			NULL
-		},
-		&autovacuum_vac_scale,
-		0.2, 0.0, 100.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"autovacuum_vacuum_insert_scale_factor", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Number of tuple inserts prior to vacuum as a fraction of reltuples."),
-			NULL
-		},
-		&autovacuum_vac_ins_scale,
-		0.2, 0.0, 100.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"autovacuum_analyze_scale_factor", PGC_SIGHUP, VACUUM_AUTOVACUUM,
-			gettext_noop("Number of tuple inserts, updates, or deletes prior to analyze as a fraction of reltuples."),
-			NULL
-		},
-		&autovacuum_anl_scale,
-		0.1, 0.0, 100.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"checkpoint_completion_target", PGC_SIGHUP, WAL_CHECKPOINTS,
-			gettext_noop("Time spent flushing dirty buffers during checkpoint, as fraction of checkpoint interval."),
-			NULL
-		},
-		&CheckPointCompletionTarget,
-		0.9, 0.0, 1.0,
-		NULL, assign_checkpoint_completion_target, NULL
-	},
-
-	{
-		{"log_statement_sample_rate", PGC_SUSET, LOGGING_WHEN,
-			gettext_noop("Fraction of statements exceeding \"log_min_duration_sample\" to be logged."),
-			gettext_noop("Use a value between 0.0 (never log) and 1.0 (always log).")
-		},
-		&log_statement_sample_rate,
-		1.0, 0.0, 1.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"log_transaction_sample_rate", PGC_SUSET, LOGGING_WHEN,
-			gettext_noop("Sets the fraction of transactions from which to log all statements."),
-			gettext_noop("Use a value between 0.0 (never log) and 1.0 (log all "
-						 "statements for all transactions).")
-		},
-		&log_xact_sample_rate,
-		0.0, 0.0, 1.0,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"vacuum_max_eager_freeze_failure_rate", PGC_USERSET, VACUUM_FREEZING,
-			gettext_noop("Fraction of pages in a relation vacuum can scan and fail to freeze before disabling eager scanning."),
-			gettext_noop("A value of 0.0 disables eager scanning and a value of 1.0 will eagerly scan up to 100 percent of the all-visible pages in the relation. If vacuum successfully freezes these pages, the cap is lower than 100 percent, because the goal is to amortize page freezing across multiple vacuums.")
-		},
-		&vacuum_max_eager_freeze_failure_rate,
-		0.03, 0.0, 1.0,
-		NULL, NULL, NULL
-	},
-
-	/* End-of-list marker */
-	{
-		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
-	}
-};
-
-
-struct config_string ConfigureNamesString[] =
-{
-	{
-		{"archive_command", PGC_SIGHUP, WAL_ARCHIVING,
-			gettext_noop("Sets the shell command that will be called to archive a WAL file."),
-			gettext_noop("An empty string means use \"archive_library\".")
-		},
-		&XLogArchiveCommand,
-		"",
-		NULL, NULL, show_archive_command
-	},
-
-	{
-		{"archive_library", PGC_SIGHUP, WAL_ARCHIVING,
-			gettext_noop("Sets the library that will be called to archive a WAL file."),
-			gettext_noop("An empty string means use \"archive_command\".")
-		},
-		&XLogArchiveLibrary,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"restore_command", PGC_SIGHUP, WAL_ARCHIVE_RECOVERY,
-			gettext_noop("Sets the shell command that will be called to retrieve an archived WAL file."),
-			NULL
-		},
-		&recoveryRestoreCommand,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"archive_cleanup_command", PGC_SIGHUP, WAL_ARCHIVE_RECOVERY,
-			gettext_noop("Sets the shell command that will be executed at every restart point."),
-			NULL
-		},
-		&archiveCleanupCommand,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"recovery_end_command", PGC_SIGHUP, WAL_ARCHIVE_RECOVERY,
-			gettext_noop("Sets the shell command that will be executed once at the end of recovery."),
-			NULL
-		},
-		&recoveryEndCommand,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"recovery_target_timeline", PGC_POSTMASTER, WAL_RECOVERY_TARGET,
-			gettext_noop("Specifies the timeline to recover into."),
-			NULL
-		},
-		&recovery_target_timeline_string,
-		"latest",
-		check_recovery_target_timeline, assign_recovery_target_timeline, NULL
-	},
-
-	{
-		{"recovery_target", PGC_POSTMASTER, WAL_RECOVERY_TARGET,
-			gettext_noop("Set to \"immediate\" to end recovery as soon as a consistent state is reached."),
-			NULL
-		},
-		&recovery_target_string,
-		"",
-		check_recovery_target, assign_recovery_target, NULL
-	},
-	{
-		{"recovery_target_xid", PGC_POSTMASTER, WAL_RECOVERY_TARGET,
-			gettext_noop("Sets the transaction ID up to which recovery will proceed."),
-			NULL
-		},
-		&recovery_target_xid_string,
-		"",
-		check_recovery_target_xid, assign_recovery_target_xid, NULL
-	},
-	{
-		{"recovery_target_time", PGC_POSTMASTER, WAL_RECOVERY_TARGET,
-			gettext_noop("Sets the time stamp up to which recovery will proceed."),
-			NULL
-		},
-		&recovery_target_time_string,
-		"",
-		check_recovery_target_time, assign_recovery_target_time, NULL
-	},
-	{
-		{"recovery_target_name", PGC_POSTMASTER, WAL_RECOVERY_TARGET,
-			gettext_noop("Sets the named restore point up to which recovery will proceed."),
-			NULL
-		},
-		&recovery_target_name_string,
-		"",
-		check_recovery_target_name, assign_recovery_target_name, NULL
-	},
-	{
-		{"recovery_target_lsn", PGC_POSTMASTER, WAL_RECOVERY_TARGET,
-			gettext_noop("Sets the LSN of the write-ahead log location up to which recovery will proceed."),
-			NULL
-		},
-		&recovery_target_lsn_string,
-		"",
-		check_recovery_target_lsn, assign_recovery_target_lsn, NULL
-	},
-
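The last three slots of every row are the optional check, assign, and show hooks; for string GUCs like the recovery-target settings above, a check hook can reject or canonicalize a value before it is applied. A sketch of the hook shape from guc.h (GucStringCheckHook); check_my_setting is a hypothetical name, not from this patch:

#include "postgres.h"
#include "utils/guc.h"

/*
 * Shape of a string-GUC check hook.  Returning false rejects the new
 * value; *newval may be replaced with a canonicalized copy.
 */
static bool
check_my_setting(char **newval, void **extra, GucSource source)
{
	if (*newval == NULL || (*newval)[0] == '\0')
	{
		GUC_check_errdetail("Value must not be empty.");
		return false;
	}
	return true;
}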
-	{
-		{"primary_conninfo", PGC_SIGHUP, REPLICATION_STANDBY,
-			gettext_noop("Sets the connection string to be used to connect to the sending server."),
-			NULL,
-			GUC_SUPERUSER_ONLY
-		},
-		&PrimaryConnInfo,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"primary_slot_name", PGC_SIGHUP, REPLICATION_STANDBY,
-			gettext_noop("Sets the name of the replication slot to use on the sending server."),
-			NULL
-		},
-		&PrimarySlotName,
-		"",
-		check_primary_slot_name, NULL, NULL
-	},
-
-	{
-		{"client_encoding", PGC_USERSET, CLIENT_CONN_LOCALE,
-			gettext_noop("Sets the client's character set encoding."),
-			NULL,
-			GUC_IS_NAME | GUC_REPORT
-		},
-		&client_encoding_string,
-		"SQL_ASCII",
-		check_client_encoding, assign_client_encoding, NULL
-	},
-
-	{
-		{"log_line_prefix", PGC_SIGHUP, LOGGING_WHAT,
-			gettext_noop("Controls information prefixed to each log line."),
-			gettext_noop("An empty string means no prefix.")
-		},
-		&Log_line_prefix,
-		"%m [%p] ",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"log_timezone", PGC_SIGHUP, LOGGING_WHAT,
-			gettext_noop("Sets the time zone to use in log messages."),
-			NULL
-		},
-		&log_timezone_string,
-		"GMT",
-		check_log_timezone, assign_log_timezone, show_log_timezone
-	},
-
-	{
-		{"DateStyle", PGC_USERSET, CLIENT_CONN_LOCALE,
-			gettext_noop("Sets the display format for date and time values."),
-			gettext_noop("Also controls interpretation of ambiguous "
-						 "date inputs."),
-			GUC_LIST_INPUT | GUC_REPORT
-		},
-		&datestyle_string,
-		"ISO, MDY",
-		check_datestyle, assign_datestyle, NULL
-	},
-
-	{
-		{"default_table_access_method", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the default table access method for new tables."),
-			NULL,
-			GUC_IS_NAME
-		},
-		&default_table_access_method,
-		DEFAULT_TABLE_ACCESS_METHOD,
-		check_default_table_access_method, NULL, NULL
-	},
-
-	{
-		{"default_tablespace", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the default tablespace to create tables and indexes in."),
-			gettext_noop("An empty string means use the database's default tablespace."),
-			GUC_IS_NAME
-		},
-		&default_tablespace,
-		"",
-		check_default_tablespace, NULL, NULL
-	},
-
-	{
-		{"temp_tablespaces", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the tablespace(s) to use for temporary tables and sort files."),
-			gettext_noop("An empty string means use the database's default tablespace."),
-			GUC_LIST_INPUT | GUC_LIST_QUOTE
-		},
-		&temp_tablespaces,
-		"",
-		check_temp_tablespaces, assign_temp_tablespaces, NULL
-	},
-
-	{
-		{"createrole_self_grant", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets whether a CREATEROLE user automatically grants "
-						 "the role to themselves, and with which options."),
-			gettext_noop("An empty string disables automatic self grants."),
-			GUC_LIST_INPUT
-		},
-		&createrole_self_grant,
-		"",
-		check_createrole_self_grant, assign_createrole_self_grant, NULL
-	},
-
-	{
-		{"dynamic_library_path", PGC_SUSET, CLIENT_CONN_OTHER,
-			gettext_noop("Sets the path for dynamically loadable modules."),
-			gettext_noop("If a dynamically loadable module needs to be opened and "
-						 "the specified name does not have a directory component (i.e., the "
-						 "name does not contain a slash), the system will search this path for "
-						 "the specified file."),
-			GUC_SUPERUSER_ONLY
-		},
-		&Dynamic_library_path,
-		"$libdir",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"extension_control_path", PGC_SUSET, CLIENT_CONN_OTHER,
-			gettext_noop("Sets the path for extension control files."),
-			gettext_noop("The remaining extension script and secondary control files are then loaded "
-						 "from the same directory where the primary control file was found."),
-			GUC_SUPERUSER_ONLY
-		},
-		&Extension_control_path,
-		"$system",
-		NULL, NULL, NULL
-	},
-
&pg_krb_server_keyfile, - PG_KRB_SRVTAB, - NULL, NULL, NULL - }, - - { - {"bonjour_name", PGC_POSTMASTER, CONN_AUTH_SETTINGS, - gettext_noop("Sets the Bonjour service name."), - gettext_noop("An empty string means use the computer name.") - }, - &bonjour_name, - "", - NULL, NULL, NULL - }, - - { - {"lc_messages", PGC_SUSET, CLIENT_CONN_LOCALE, - gettext_noop("Sets the language in which messages are displayed."), - gettext_noop("An empty string means use the operating system setting.") - }, - &locale_messages, - "", - check_locale_messages, assign_locale_messages, NULL - }, - - { - {"lc_monetary", PGC_USERSET, CLIENT_CONN_LOCALE, - gettext_noop("Sets the locale for formatting monetary amounts."), - gettext_noop("An empty string means use the operating system setting.") - }, - &locale_monetary, - "C", - check_locale_monetary, assign_locale_monetary, NULL - }, - - { - {"lc_numeric", PGC_USERSET, CLIENT_CONN_LOCALE, - gettext_noop("Sets the locale for formatting numbers."), - gettext_noop("An empty string means use the operating system setting.") - }, - &locale_numeric, - "C", - check_locale_numeric, assign_locale_numeric, NULL - }, - - { - {"lc_time", PGC_USERSET, CLIENT_CONN_LOCALE, - gettext_noop("Sets the locale for formatting date and time values."), - gettext_noop("An empty string means use the operating system setting.") - }, - &locale_time, - "C", - check_locale_time, assign_locale_time, NULL - }, - - { - {"session_preload_libraries", PGC_SUSET, CLIENT_CONN_PRELOAD, - gettext_noop("Lists shared libraries to preload into each backend."), - NULL, - GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY - }, - &session_preload_libraries_string, - "", - NULL, NULL, NULL - }, - - { - {"shared_preload_libraries", PGC_POSTMASTER, CLIENT_CONN_PRELOAD, - gettext_noop("Lists shared libraries to preload into server."), - NULL, - GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY - }, - &shared_preload_libraries_string, - "", - NULL, NULL, NULL - }, - - { - {"local_preload_libraries", PGC_USERSET, CLIENT_CONN_PRELOAD, - gettext_noop("Lists unprivileged shared libraries to preload into each backend."), - NULL, - GUC_LIST_INPUT | GUC_LIST_QUOTE - }, - &local_preload_libraries_string, - "", - NULL, NULL, NULL - }, - - { - {"search_path", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the schema search order for names that are not schema-qualified."), - NULL, - GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_EXPLAIN | GUC_REPORT - }, - &namespace_search_path, - "\"$user\", public", - check_search_path, assign_search_path, NULL - }, - - { - /* Can't be set in postgresql.conf */ - {"server_encoding", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows the server (database) character set encoding."), - NULL, - GUC_IS_NAME | GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &server_encoding_string, - "SQL_ASCII", - NULL, NULL, NULL - }, - - { - /* Can't be set in postgresql.conf */ - {"server_version", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows the server version."), - NULL, - GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &server_version_string, - PG_VERSION, - NULL, NULL, NULL - }, - - { - /* Not for general use --- used by SET ROLE */ - {"role", PGC_USERSET, UNGROUPED, - gettext_noop("Sets the current role."), - NULL, - GUC_IS_NAME | GUC_NO_SHOW_ALL | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_NOT_WHILE_SEC_REST - }, - &role_string, - "none", - check_role, assign_role, show_role - }, - - { - /* Not for general use --- used by SET SESSION 
-	{
-		/* Not for general use --- used by SET SESSION AUTHORIZATION */
-		{"session_authorization", PGC_USERSET, UNGROUPED,
-			gettext_noop("Sets the session user name."),
-			NULL,
-			GUC_IS_NAME | GUC_REPORT | GUC_NO_SHOW_ALL | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_NOT_WHILE_SEC_REST
-		},
-		&session_authorization_string,
-		NULL,
-		check_session_authorization, assign_session_authorization, NULL
-	},
-
-	{
-		{"log_destination", PGC_SIGHUP, LOGGING_WHERE,
-			gettext_noop("Sets the destination for server log output."),
-			gettext_noop("Valid values are combinations of \"stderr\", "
-						 "\"syslog\", \"csvlog\", \"jsonlog\", and \"eventlog\", "
-						 "depending on the platform."),
-			GUC_LIST_INPUT
-		},
-		&Log_destination_string,
-		"stderr",
-		check_log_destination, assign_log_destination, NULL
-	},
-	{
-		{"log_directory", PGC_SIGHUP, LOGGING_WHERE,
-			gettext_noop("Sets the destination directory for log files."),
-			gettext_noop("Can be specified as relative to the data directory "
-						 "or as absolute path."),
-			GUC_SUPERUSER_ONLY
-		},
-		&Log_directory,
-		"log",
-		check_canonical_path, NULL, NULL
-	},
-	{
-		{"log_filename", PGC_SIGHUP, LOGGING_WHERE,
-			gettext_noop("Sets the file name pattern for log files."),
-			NULL,
-			GUC_SUPERUSER_ONLY
-		},
-		&Log_filename,
-		"postgresql-%Y-%m-%d_%H%M%S.log",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"syslog_ident", PGC_SIGHUP, LOGGING_WHERE,
-			gettext_noop("Sets the program name used to identify PostgreSQL "
-						 "messages in syslog."),
-			NULL
-		},
-		&syslog_ident_str,
-		"postgres",
-		NULL, assign_syslog_ident, NULL
-	},
-
-	{
-		{"event_source", PGC_POSTMASTER, LOGGING_WHERE,
-			gettext_noop("Sets the application name used to identify "
-						 "PostgreSQL messages in the event log."),
-			NULL
-		},
-		&event_source,
-		DEFAULT_EVENT_SOURCE,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"TimeZone", PGC_USERSET, CLIENT_CONN_LOCALE,
-			gettext_noop("Sets the time zone for displaying and interpreting time stamps."),
-			NULL,
-			GUC_REPORT
-		},
-		&timezone_string,
-		"GMT",
-		check_timezone, assign_timezone, show_timezone
-	},
-	{
-		{"timezone_abbreviations", PGC_USERSET, CLIENT_CONN_LOCALE,
-			gettext_noop("Selects a file of time zone abbreviations."),
-			NULL
-		},
-		&timezone_abbreviations_string,
-		NULL,
-		check_timezone_abbreviations, assign_timezone_abbreviations, NULL
-	},
-
-	{
-		{"unix_socket_group", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
-			gettext_noop("Sets the owning group of the Unix-domain socket."),
-			gettext_noop("The owning user of the socket is always the user that starts the server. "
-						 "An empty string means use the user's default group.")
-		},
-		&Unix_socket_group,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"unix_socket_directories", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
-			gettext_noop("Sets the directories where Unix-domain sockets will be created."),
-			NULL,
-			GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY
-		},
-		&Unix_socket_directories,
-		DEFAULT_PGSOCKET_DIR,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"listen_addresses", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
-			gettext_noop("Sets the host name or IP address(es) to listen to."),
-			NULL,
-			GUC_LIST_INPUT
-		},
-		&ListenAddresses,
-		"localhost",
-		NULL, NULL, NULL
-	},
-
-	{
-		/*
-		 * Can't be set by ALTER SYSTEM as it can lead to recursive definition
-		 * of data_directory.
-		 */
-		{"data_directory", PGC_POSTMASTER, FILE_LOCATIONS,
-			gettext_noop("Sets the server's data directory."),
-			NULL,
-			GUC_SUPERUSER_ONLY | GUC_DISALLOW_IN_AUTO_FILE
-		},
-		&data_directory,
-		NULL,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"config_file", PGC_POSTMASTER, FILE_LOCATIONS,
-			gettext_noop("Sets the server's main configuration file."),
-			NULL,
-			GUC_DISALLOW_IN_FILE | GUC_SUPERUSER_ONLY
-		},
-		&ConfigFileName,
-		NULL,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"hba_file", PGC_POSTMASTER, FILE_LOCATIONS,
-			gettext_noop("Sets the server's \"hba\" configuration file."),
-			NULL,
-			GUC_SUPERUSER_ONLY
-		},
-		&HbaFileName,
-		NULL,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"ident_file", PGC_POSTMASTER, FILE_LOCATIONS,
-			gettext_noop("Sets the server's \"ident\" configuration file."),
-			NULL,
-			GUC_SUPERUSER_ONLY
-		},
-		&IdentFileName,
-		NULL,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"external_pid_file", PGC_POSTMASTER, FILE_LOCATIONS,
-			gettext_noop("Writes the postmaster PID to the specified file."),
-			NULL,
-			GUC_SUPERUSER_ONLY
-		},
-		&external_pid_file,
-		NULL,
-		check_canonical_path, NULL, NULL
-	},
-
-	{
-		{"ssl_library", PGC_INTERNAL, PRESET_OPTIONS,
-			gettext_noop("Shows the name of the SSL library."),
-			NULL,
-			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-		},
-		&ssl_library,
-#ifdef USE_SSL
-		"OpenSSL",
-#else
-		"",
-#endif
-		NULL, NULL, NULL
-	},
-
-	{
-		{"ssl_cert_file", PGC_SIGHUP, CONN_AUTH_SSL,
-			gettext_noop("Location of the SSL server certificate file."),
-			NULL
-		},
-		&ssl_cert_file,
-		"server.crt",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"ssl_key_file", PGC_SIGHUP, CONN_AUTH_SSL,
-			gettext_noop("Location of the SSL server private key file."),
-			NULL
-		},
-		&ssl_key_file,
-		"server.key",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"ssl_ca_file", PGC_SIGHUP, CONN_AUTH_SSL,
-			gettext_noop("Location of the SSL certificate authority file."),
-			NULL
-		},
-		&ssl_ca_file,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"ssl_crl_file", PGC_SIGHUP, CONN_AUTH_SSL,
-			gettext_noop("Location of the SSL certificate revocation list file."),
-			NULL
-		},
-		&ssl_crl_file,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"ssl_crl_dir", PGC_SIGHUP, CONN_AUTH_SSL,
-			gettext_noop("Location of the SSL certificate revocation list directory."),
-			NULL
-		},
-		&ssl_crl_dir,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"synchronous_standby_names", PGC_SIGHUP, REPLICATION_PRIMARY,
-			gettext_noop("Number of synchronous standbys and list of names of potential synchronous ones."),
-			NULL,
-			GUC_LIST_INPUT
-		},
-		&SyncRepStandbyNames,
-		"",
-		check_synchronous_standby_names, assign_synchronous_standby_names, NULL
-	},
-
-	{
-		{"default_text_search_config", PGC_USERSET, CLIENT_CONN_LOCALE,
-			gettext_noop("Sets default text search configuration."),
-			NULL
-		},
-		&TSCurrentConfig,
-		"pg_catalog.simple",
-		check_default_text_search_config, assign_default_text_search_config, NULL
-	},
-
-	{
-		{"ssl_tls13_ciphers", PGC_SIGHUP, CONN_AUTH_SSL,
-			gettext_noop("Sets the list of allowed TLSv1.3 cipher suites."),
-			gettext_noop("An empty string means use the default cipher suites."),
-			GUC_SUPERUSER_ONLY
-		},
-		&SSLCipherSuites,
-		"",
-		NULL, NULL, NULL
-	},
-
-	{
-		{"ssl_ciphers", PGC_SIGHUP, CONN_AUTH_SSL,
-			gettext_noop("Sets the list of allowed TLSv1.2 (and lower) ciphers."),
-			NULL,
-			GUC_SUPERUSER_ONLY
-		},
-		&SSLCipherList,
-#ifdef USE_OPENSSL
-		"HIGH:MEDIUM:+3DES:!aNULL",
-#else
-		"none",
-#endif
-		NULL, NULL, NULL
-	},
-
specified using colon-separated list."), - GUC_SUPERUSER_ONLY - }, - &SSLECDHCurve, -#ifdef USE_SSL - "X25519:prime256v1", -#else - "none", -#endif - NULL, NULL, NULL - }, - - { - {"ssl_dh_params_file", PGC_SIGHUP, CONN_AUTH_SSL, - gettext_noop("Location of the SSL DH parameters file."), - gettext_noop("An empty string means use compiled-in default parameters."), - GUC_SUPERUSER_ONLY - }, - &ssl_dh_params_file, - "", - NULL, NULL, NULL - }, - - { - {"ssl_passphrase_command", PGC_SIGHUP, CONN_AUTH_SSL, - gettext_noop("Command to obtain passphrases for SSL."), - gettext_noop("An empty string means use the built-in prompting mechanism."), - GUC_SUPERUSER_ONLY - }, - &ssl_passphrase_command, - "", - NULL, NULL, NULL - }, - - { - {"application_name", PGC_USERSET, LOGGING_WHAT, - gettext_noop("Sets the application name to be reported in statistics and logs."), - NULL, - GUC_IS_NAME | GUC_REPORT | GUC_NOT_IN_SAMPLE - }, - &application_name, - "", - check_application_name, assign_application_name, NULL - }, - - { - {"cluster_name", PGC_POSTMASTER, PROCESS_TITLE, - gettext_noop("Sets the name of the cluster, which is included in the process title."), - NULL, - GUC_IS_NAME - }, - &cluster_name, - "", - check_cluster_name, NULL, NULL - }, - - { - {"wal_consistency_checking", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Sets the WAL resource managers for which WAL consistency checks are done."), - gettext_noop("Full-page images will be logged for all data blocks and cross-checked against the results of WAL replay."), - GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE - }, - &wal_consistency_checking_string, - "", - check_wal_consistency_checking, assign_wal_consistency_checking, NULL - }, - - { - {"jit_provider", PGC_POSTMASTER, CLIENT_CONN_PRELOAD, - gettext_noop("JIT provider to use."), - NULL, - GUC_SUPERUSER_ONLY - }, - &jit_provider, - "llvmjit", - NULL, NULL, NULL - }, - - { - {"backtrace_functions", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Log backtrace for errors in these functions."), - NULL, - GUC_NOT_IN_SAMPLE - }, - &backtrace_functions, - "", - check_backtrace_functions, assign_backtrace_functions, NULL - }, - - { - {"debug_io_direct", PGC_POSTMASTER, DEVELOPER_OPTIONS, - gettext_noop("Use direct I/O for file access."), - gettext_noop("An empty string disables direct I/O."), - GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE - }, - &debug_io_direct_string, - "", - check_debug_io_direct, assign_debug_io_direct, NULL - }, - - { - {"synchronized_standby_slots", PGC_SIGHUP, REPLICATION_PRIMARY, - gettext_noop("Lists streaming replication standby server replication slot " - "names that logical WAL sender processes will wait for."), - gettext_noop("Logical WAL sender processes will send decoded " - "changes to output plugins only after the specified " - "replication slots have confirmed receiving WAL."), - GUC_LIST_INPUT - }, - &synchronized_standby_slots, - "", - check_synchronized_standby_slots, assign_synchronized_standby_slots, NULL - }, - - { - {"restrict_nonsystem_relation_kind", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Prohibits access to non-system relations of specified kinds."), - NULL, - GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE - }, - &restrict_nonsystem_relation_kind_string, - "", - check_restrict_nonsystem_relation_kind, assign_restrict_nonsystem_relation_kind, NULL - }, - - { - {"oauth_validator_libraries", PGC_SIGHUP, CONN_AUTH_AUTH, - gettext_noop("Lists libraries that may be called to validate OAuth v2 bearer tokens."), - NULL, - GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY - }, - 
&oauth_validator_libraries_string, - "", - NULL, NULL, NULL - }, - - { - {"log_connections", PGC_SU_BACKEND, LOGGING_WHAT, - gettext_noop("Logs specified aspects of connection establishment and setup."), - NULL, - GUC_LIST_INPUT - }, - &log_connections_string, - "", - check_log_connections, assign_log_connections, NULL - }, - - - /* End-of-list marker */ - { - {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL - } -}; - - -struct config_enum ConfigureNamesEnum[] = -{ - { - {"backslash_quote", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, - gettext_noop("Sets whether \"\\'\" is allowed in string literals."), - NULL - }, - &backslash_quote, - BACKSLASH_QUOTE_SAFE_ENCODING, backslash_quote_options, - NULL, NULL, NULL - }, - - { - {"bytea_output", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the output format for bytea."), - NULL - }, - &bytea_output, - BYTEA_OUTPUT_HEX, bytea_output_options, - NULL, NULL, NULL - }, - - { - {"client_min_messages", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the message levels that are sent to the client."), - gettext_noop("Each level includes all the levels that follow it. The later" - " the level, the fewer messages are sent.") - }, - &client_min_messages, - NOTICE, client_message_level_options, - NULL, NULL, NULL - }, - - { - {"compute_query_id", PGC_SUSET, STATS_MONITORING, - gettext_noop("Enables in-core computation of query identifiers."), - NULL - }, - &compute_query_id, - COMPUTE_QUERY_ID_AUTO, compute_query_id_options, - NULL, NULL, NULL - }, - - { - {"constraint_exclusion", PGC_USERSET, QUERY_TUNING_OTHER, - gettext_noop("Enables the planner to use constraints to optimize queries."), - gettext_noop("Table scans will be skipped if their constraints" - " guarantee that no rows match the query."), - GUC_EXPLAIN - }, - &constraint_exclusion, - CONSTRAINT_EXCLUSION_PARTITION, constraint_exclusion_options, - NULL, NULL, NULL - }, - - { - {"default_toast_compression", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the default compression method for compressible values."), - NULL - }, - &default_toast_compression, - TOAST_PGLZ_COMPRESSION, - default_toast_compression_options, - NULL, NULL, NULL - }, - - { - {"default_transaction_isolation", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the transaction isolation level of each new transaction."), - NULL - }, - &DefaultXactIsoLevel, - XACT_READ_COMMITTED, isolation_level_options, - NULL, NULL, NULL - }, - - { - {"transaction_isolation", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets the current transaction's isolation level."), - NULL, - GUC_NO_RESET | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &XactIsoLevel, - XACT_READ_COMMITTED, isolation_level_options, - check_transaction_isolation, NULL, NULL - }, - - { - {"IntervalStyle", PGC_USERSET, CLIENT_CONN_LOCALE, - gettext_noop("Sets the display format for interval values."), - NULL, - GUC_REPORT - }, - &IntervalStyle, - INTSTYLE_POSTGRES, intervalstyle_options, - NULL, NULL, NULL - }, - - { - {"icu_validation_level", PGC_USERSET, CLIENT_CONN_LOCALE, - gettext_noop("Log level for reporting invalid ICU locale strings."), - NULL - }, - &icu_validation_level, - WARNING, icu_validation_level_options, - NULL, NULL, NULL - }, - - { - {"log_error_verbosity", PGC_SUSET, LOGGING_WHAT, - gettext_noop("Sets the verbosity of logged messages."), - NULL - }, - &Log_error_verbosity, - PGERROR_DEFAULT, log_error_verbosity_options, - NULL, NULL, NULL - }, - - { - {"log_min_messages", PGC_SUSET, 
-struct config_enum ConfigureNamesEnum[] =
-{
-	{
-		{"backslash_quote", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS,
-			gettext_noop("Sets whether \"\\'\" is allowed in string literals."),
-			NULL
-		},
-		&backslash_quote,
-		BACKSLASH_QUOTE_SAFE_ENCODING, backslash_quote_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"bytea_output", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the output format for bytea."),
-			NULL
-		},
-		&bytea_output,
-		BYTEA_OUTPUT_HEX, bytea_output_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"client_min_messages", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the message levels that are sent to the client."),
-			gettext_noop("Each level includes all the levels that follow it. The later"
-						 " the level, the fewer messages are sent.")
-		},
-		&client_min_messages,
-		NOTICE, client_message_level_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"compute_query_id", PGC_SUSET, STATS_MONITORING,
-			gettext_noop("Enables in-core computation of query identifiers."),
-			NULL
-		},
-		&compute_query_id,
-		COMPUTE_QUERY_ID_AUTO, compute_query_id_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"constraint_exclusion", PGC_USERSET, QUERY_TUNING_OTHER,
-			gettext_noop("Enables the planner to use constraints to optimize queries."),
-			gettext_noop("Table scans will be skipped if their constraints"
-						 " guarantee that no rows match the query."),
-			GUC_EXPLAIN
-		},
-		&constraint_exclusion,
-		CONSTRAINT_EXCLUSION_PARTITION, constraint_exclusion_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"default_toast_compression", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the default compression method for compressible values."),
-			NULL
-		},
-		&default_toast_compression,
-		TOAST_PGLZ_COMPRESSION,
-		default_toast_compression_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"default_transaction_isolation", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the transaction isolation level of each new transaction."),
-			NULL
-		},
-		&DefaultXactIsoLevel,
-		XACT_READ_COMMITTED, isolation_level_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"transaction_isolation", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the current transaction's isolation level."),
-			NULL,
-			GUC_NO_RESET | GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-		},
-		&XactIsoLevel,
-		XACT_READ_COMMITTED, isolation_level_options,
-		check_transaction_isolation, NULL, NULL
-	},
-
-	{
-		{"IntervalStyle", PGC_USERSET, CLIENT_CONN_LOCALE,
-			gettext_noop("Sets the display format for interval values."),
-			NULL,
-			GUC_REPORT
-		},
-		&IntervalStyle,
-		INTSTYLE_POSTGRES, intervalstyle_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"icu_validation_level", PGC_USERSET, CLIENT_CONN_LOCALE,
-			gettext_noop("Log level for reporting invalid ICU locale strings."),
-			NULL
-		},
-		&icu_validation_level,
-		WARNING, icu_validation_level_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"log_error_verbosity", PGC_SUSET, LOGGING_WHAT,
-			gettext_noop("Sets the verbosity of logged messages."),
-			NULL
-		},
-		&Log_error_verbosity,
-		PGERROR_DEFAULT, log_error_verbosity_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"log_min_messages", PGC_SUSET, LOGGING_WHEN,
-			gettext_noop("Sets the message levels that are logged."),
-			gettext_noop("Each level includes all the levels that follow it. The later"
-						 " the level, the fewer messages are sent.")
-		},
-		&log_min_messages,
-		WARNING, server_message_level_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"log_min_error_statement", PGC_SUSET, LOGGING_WHEN,
-			gettext_noop("Causes all statements generating error at or above this level to be logged."),
-			gettext_noop("Each level includes all the levels that follow it. The later"
-						 " the level, the fewer messages are sent.")
-		},
-		&log_min_error_statement,
-		ERROR, server_message_level_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"log_statement", PGC_SUSET, LOGGING_WHAT,
-			gettext_noop("Sets the type of statements logged."),
-			NULL
-		},
-		&log_statement,
-		LOGSTMT_NONE, log_statement_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"syslog_facility", PGC_SIGHUP, LOGGING_WHERE,
-			gettext_noop("Sets the syslog \"facility\" to be used when syslog enabled."),
-			NULL
-		},
-		&syslog_facility,
-		DEFAULT_SYSLOG_FACILITY,
-		syslog_facility_options,
-		NULL, assign_syslog_facility, NULL
-	},
-
gettext_noop("Selects the shared memory implementation used for the main shared memory region."), - NULL - }, - &shared_memory_type, - DEFAULT_SHARED_MEMORY_TYPE, shared_memory_options, - NULL, NULL, NULL - }, - - { - {"file_copy_method", PGC_USERSET, RESOURCES_DISK, - gettext_noop("Selects the file copy method."), - NULL - }, - &file_copy_method, - FILE_COPY_METHOD_COPY, file_copy_method_options, - NULL, NULL, NULL - }, - - { - {"wal_sync_method", PGC_SIGHUP, WAL_SETTINGS, - gettext_noop("Selects the method used for forcing WAL updates to disk."), - NULL - }, - &wal_sync_method, - DEFAULT_WAL_SYNC_METHOD, wal_sync_method_options, - NULL, assign_wal_sync_method, NULL - }, - - { - {"xmlbinary", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets how binary values are to be encoded in XML."), - NULL - }, - &xmlbinary, - XMLBINARY_BASE64, xmlbinary_options, - NULL, NULL, NULL - }, - - { - {"xmloption", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Sets whether XML data in implicit parsing and serialization " - "operations is to be considered as documents or content fragments."), - NULL - }, - &xmloption, - XMLOPTION_CONTENT, xmloption_options, - NULL, NULL, NULL - }, - - { - {"huge_pages", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Use of huge pages on Linux or Windows."), - NULL - }, - &huge_pages, - HUGE_PAGES_TRY, huge_pages_options, - NULL, NULL, NULL - }, - - { - {"huge_pages_status", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Indicates the status of huge pages."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &huge_pages_status, - HUGE_PAGES_UNKNOWN, huge_pages_status_options, - NULL, NULL, NULL - }, - - { - {"recovery_prefetch", PGC_SIGHUP, WAL_RECOVERY, - gettext_noop("Prefetch referenced blocks during recovery."), - gettext_noop("Look ahead in the WAL to find references to uncached data.") - }, - &recovery_prefetch, - RECOVERY_PREFETCH_TRY, recovery_prefetch_options, - check_recovery_prefetch, assign_recovery_prefetch, NULL - }, - - { - {"debug_parallel_query", PGC_USERSET, DEVELOPER_OPTIONS, - gettext_noop("Forces the planner's use parallel query nodes."), - gettext_noop("This can be useful for testing the parallel query infrastructure " - "by forcing the planner to generate plans that contain nodes " - "that perform tuple communication between workers and the main process."), - GUC_NOT_IN_SAMPLE | GUC_EXPLAIN - }, - &debug_parallel_query, - DEBUG_PARALLEL_OFF, debug_parallel_query_options, - NULL, NULL, NULL - }, - - { - {"password_encryption", PGC_USERSET, CONN_AUTH_AUTH, - gettext_noop("Chooses the algorithm for encrypting passwords."), - NULL - }, - &Password_encryption, - PASSWORD_TYPE_SCRAM_SHA_256, password_encryption_options, - NULL, NULL, NULL - }, - - { - {"plan_cache_mode", PGC_USERSET, QUERY_TUNING_OTHER, - gettext_noop("Controls the planner's selection of custom or generic plan."), - gettext_noop("Prepared statements can have custom and generic plans, and the planner " - "will attempt to choose which is better. 
-	{
-		{"plan_cache_mode", PGC_USERSET, QUERY_TUNING_OTHER,
-			gettext_noop("Controls the planner's selection of custom or generic plan."),
-			gettext_noop("Prepared statements can have custom and generic plans, and the planner "
-						 "will attempt to choose which is better. This can be set to override "
-						 "the default behavior."),
-			GUC_EXPLAIN
-		},
-		&plan_cache_mode,
-		PLAN_CACHE_MODE_AUTO, plan_cache_mode_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"ssl_min_protocol_version", PGC_SIGHUP, CONN_AUTH_SSL,
-			gettext_noop("Sets the minimum SSL/TLS protocol version to use."),
-			NULL,
-			GUC_SUPERUSER_ONLY
-		},
-		&ssl_min_protocol_version,
-		PG_TLS1_2_VERSION,
-		ssl_protocol_versions_info + 1, /* don't allow PG_TLS_ANY */
-		NULL, NULL, NULL
-	},
-
-	{
-		{"ssl_max_protocol_version", PGC_SIGHUP, CONN_AUTH_SSL,
-			gettext_noop("Sets the maximum SSL/TLS protocol version to use."),
-			NULL,
-			GUC_SUPERUSER_ONLY
-		},
-		&ssl_max_protocol_version,
-		PG_TLS_ANY,
-		ssl_protocol_versions_info,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"recovery_init_sync_method", PGC_SIGHUP, ERROR_HANDLING_OPTIONS,
-			gettext_noop("Sets the method for synchronizing the data directory before crash recovery."),
-		},
-		&recovery_init_sync_method,
-		DATA_DIR_SYNC_METHOD_FSYNC, recovery_init_sync_method_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"debug_logical_replication_streaming", PGC_USERSET, DEVELOPER_OPTIONS,
-			gettext_noop("Forces immediate streaming or serialization of changes in large transactions."),
-			gettext_noop("On the publisher, it allows streaming or serializing each change in logical decoding. "
-						 "On the subscriber, it allows serialization of all changes to files and notifies the "
-						 "parallel apply workers to read and apply them at the end of the transaction."),
-			GUC_NOT_IN_SAMPLE
-		},
-		&debug_logical_replication_streaming,
-		DEBUG_LOGICAL_REP_STREAMING_BUFFERED, debug_logical_replication_streaming_options,
-		NULL, NULL, NULL
-	},
-
-	{
-		{"io_method", PGC_POSTMASTER, RESOURCES_IO,
-			gettext_noop("Selects the method for executing asynchronous I/O."),
-			NULL
-		},
-		&io_method,
-		DEFAULT_IO_METHOD, io_method_options,
-		NULL, assign_io_method, NULL
-	},
-
-	/* End-of-list marker */
-	{
-		{NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL
-	}
-};
+#include "utils/guc_tables.inc.c"
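The help_config.c hunks that follow track a related refactor: the file previously overlaid each table row with a local mixedStruct union and picked a branch by hand, whereas the new code reads the per-type payload through members embedded in struct config_generic itself (_int, _real, _string, _enum, as the new printf calls show). A sketch of the access-pattern change under that assumption; the union definition itself lives in guc_tables.h:

#include "postgres.h"
#include "utils/guc_tables.h"

/* Sketch only: how the reset value of an integer GUC is now reached. */
static int
int_reset_val(const struct config_generic *gvar)
{
	Assert(gvar->vartype == PGC_INT);
	/* formerly: ((mixedStruct *) gvar)->integer.reset_val */
	return gvar->_int.reset_val;
}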
diff --git a/src/backend/utils/misc/help_config.c b/src/backend/utils/misc/help_config.c
index 55c36ddf051d5..2810715693c58 100644
--- a/src/backend/utils/misc/help_config.c
+++ b/src/backend/utils/misc/help_config.c
@@ -23,40 +23,24 @@
 #include "utils/help_config.h"
 
 
-/*
- * This union allows us to mix the numerous different types of structs
- * that we are organizing.
- */
-typedef union
-{
-	struct config_generic generic;
-	struct config_bool _bool;
-	struct config_real real;
-	struct config_int integer;
-	struct config_string string;
-	struct config_enum _enum;
-} mixedStruct;
-
-
-static void printMixedStruct(mixedStruct *structToPrint);
-static bool displayStruct(mixedStruct *structToDisplay);
+static void printMixedStruct(const struct config_generic *structToPrint);
+static bool displayStruct(const struct config_generic *structToDisplay);
 
 
 void
 GucInfoMain(void)
 {
 	struct config_generic **guc_vars;
-	int			numOpts,
-				i;
+	int			numOpts;
 
 	/* Initialize the GUC hash table */
 	build_guc_variables();
 	guc_vars = get_guc_variables(&numOpts);
 
-	for (i = 0; i < numOpts; i++)
+	for (int i = 0; i < numOpts; i++)
 	{
-		mixedStruct *var = (mixedStruct *) guc_vars[i];
+		const struct config_generic *var = guc_vars[i];
 
 		if (displayStruct(var))
 			printMixedStruct(var);
@@ -71,11 +55,11 @@ GucInfoMain(void)
  * should be displayed to the user.
  */
 static bool
-displayStruct(mixedStruct *structToDisplay)
+displayStruct(const struct config_generic *structToDisplay)
 {
-	return !(structToDisplay->generic.flags & (GUC_NO_SHOW_ALL |
-											   GUC_NOT_IN_SAMPLE |
-											   GUC_DISALLOW_IN_FILE));
+	return !(structToDisplay->flags & (GUC_NO_SHOW_ALL |
+									   GUC_NOT_IN_SAMPLE |
+									   GUC_DISALLOW_IN_FILE));
 }
"" : _(structToPrint->long_desc)); } diff --git a/src/backend/utils/misc/injection_point.c b/src/backend/utils/misc/injection_point.c index f58ebc8ee522d..4945da458b18a 100644 --- a/src/backend/utils/misc/injection_point.c +++ b/src/backend/utils/misc/injection_point.c @@ -186,7 +186,7 @@ injection_point_cache_load(InjectionPointEntry *entry, int slot_idx, uint64 gene elog(ERROR, "could not find library \"%s\" for injection point \"%s\"", path, entry->name); - injection_callback_local = (void *) + injection_callback_local = load_external_function(path, entry->function, false, NULL); if (injection_callback_local == NULL) @@ -283,16 +283,16 @@ InjectionPointAttach(const char *name, int free_idx; if (strlen(name) >= INJ_NAME_MAXLEN) - elog(ERROR, "injection point name %s too long (maximum of %u)", - name, INJ_NAME_MAXLEN); + elog(ERROR, "injection point name %s too long (maximum of %u characters)", + name, INJ_NAME_MAXLEN - 1); if (strlen(library) >= INJ_LIB_MAXLEN) - elog(ERROR, "injection point library %s too long (maximum of %u)", - library, INJ_LIB_MAXLEN); + elog(ERROR, "injection point library %s too long (maximum of %u characters)", + library, INJ_LIB_MAXLEN - 1); if (strlen(function) >= INJ_FUNC_MAXLEN) - elog(ERROR, "injection point function %s too long (maximum of %u)", - function, INJ_FUNC_MAXLEN); - if (private_data_size >= INJ_PRIVATE_MAXLEN) - elog(ERROR, "injection point data too long (maximum of %u)", + elog(ERROR, "injection point function %s too long (maximum of %u characters)", + function, INJ_FUNC_MAXLEN - 1); + if (private_data_size > INJ_PRIVATE_MAXLEN) + elog(ERROR, "injection point data too long (maximum of %u bytes)", INJ_PRIVATE_MAXLEN); /* @@ -331,11 +331,8 @@ InjectionPointAttach(const char *name, /* Save the entry */ strlcpy(entry->name, name, sizeof(entry->name)); - entry->name[INJ_NAME_MAXLEN - 1] = '\0'; strlcpy(entry->library, library, sizeof(entry->library)); - entry->library[INJ_LIB_MAXLEN - 1] = '\0'; strlcpy(entry->function, function, sizeof(entry->function)); - entry->function[INJ_FUNC_MAXLEN - 1] = '\0'; if (private_data != NULL) memcpy(entry->private_data, private_data, private_data_size); @@ -584,3 +581,49 @@ IsInjectionPointAttached(const char *name) return false; /* silence compiler */ #endif } + +/* + * Retrieve a list of all the injection points currently attached. + * + * This list is palloc'd in the current memory context. 
+ */ +List * +InjectionPointList(void) +{ +#ifdef USE_INJECTION_POINTS + List *inj_points = NIL; + uint32 max_inuse; + + LWLockAcquire(InjectionPointLock, LW_SHARED); + + max_inuse = pg_atomic_read_u32(&ActiveInjectionPoints->max_inuse); + + for (uint32 idx = 0; idx < max_inuse; idx++) + { + InjectionPointEntry *entry; + InjectionPointData *inj_point; + uint64 generation; + + entry = &ActiveInjectionPoints->entries[idx]; + generation = pg_atomic_read_u64(&entry->generation); + + /* skip free slots */ + if (generation % 2 == 0) + continue; + + inj_point = palloc0_object(InjectionPointData); + inj_point->name = pstrdup(entry->name); + inj_point->library = pstrdup(entry->library); + inj_point->function = pstrdup(entry->function); + inj_points = lappend(inj_points, inj_point); + } + + LWLockRelease(InjectionPointLock); + + return inj_points; + +#else + elog(ERROR, "Injection points are not supported by this build"); + return NIL; /* keep compiler quiet */ +#endif +} diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 63f991c4f9305..a3cd0d03eb3b0 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -39,16 +39,16 @@ # The default values of these variables are driven from the -D command-line # option or PGDATA environment variable, represented here as ConfigDir. -#data_directory = 'ConfigDir' # use data in another directory - # (change requires restart) -#hba_file = 'ConfigDir/pg_hba.conf' # host-based authentication file - # (change requires restart) -#ident_file = 'ConfigDir/pg_ident.conf' # ident configuration file - # (change requires restart) +#data_directory = 'ConfigDir' # use data in another directory + # (change requires restart) +#hba_file = 'ConfigDir/pg_hba.conf' # host-based authentication file + # (change requires restart) +#ident_file = 'ConfigDir/pg_ident.conf' # ident configuration file + # (change requires restart) # If external_pid_file is not explicitly set, no extra PID file is written. 
-#external_pid_file = '' # write an extra PID file - # (change requires restart) +#external_pid_file = '' # write an extra PID file + # (change requires restart) #------------------------------------------------------------------------------ @@ -57,47 +57,47 @@ # - Connection Settings - -#listen_addresses = 'localhost' # what IP address(es) to listen on; - # comma-separated list of addresses; - # defaults to 'localhost'; use '*' for all - # (change requires restart) -#port = 5432 # (change requires restart) -#max_connections = 100 # (change requires restart) -#reserved_connections = 0 # (change requires restart) -#superuser_reserved_connections = 3 # (change requires restart) -#unix_socket_directories = '/tmp' # comma-separated list of directories - # (change requires restart) -#unix_socket_group = '' # (change requires restart) -#unix_socket_permissions = 0777 # begin with 0 to use octal notation - # (change requires restart) -#bonjour = off # advertise server via Bonjour - # (change requires restart) -#bonjour_name = '' # defaults to the computer name - # (change requires restart) +#listen_addresses = 'localhost' # what IP address(es) to listen on; + # comma-separated list of addresses; + # defaults to 'localhost'; use '*' for all + # (change requires restart) +#port = 5432 # (change requires restart) +#max_connections = 100 # (change requires restart) +#reserved_connections = 0 # (change requires restart) +#superuser_reserved_connections = 3 # (change requires restart) +#unix_socket_directories = '/tmp' # comma-separated list of directories + # (change requires restart) +#unix_socket_group = '' # (change requires restart) +#unix_socket_permissions = 0777 # begin with 0 to use octal notation + # (change requires restart) +#bonjour = off # advertise server via Bonjour + # (change requires restart) +#bonjour_name = '' # defaults to the computer name + # (change requires restart) # - TCP settings - # see "man tcp" for details -#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in seconds; - # 0 selects the system default -#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; - # 0 selects the system default -#tcp_keepalives_count = 0 # TCP_KEEPCNT; - # 0 selects the system default -#tcp_user_timeout = 0 # TCP_USER_TIMEOUT, in milliseconds; - # 0 selects the system default +#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in seconds; + # 0 selects the system default +#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; + # 0 selects the system default +#tcp_keepalives_count = 0 # TCP_KEEPCNT; + # 0 selects the system default +#tcp_user_timeout = 0 # TCP_USER_TIMEOUT, in milliseconds; + # 0 selects the system default -#client_connection_check_interval = 0 # time between checks for client - # disconnection while running queries; - # 0 for never +#client_connection_check_interval = 0 # time between checks for client + # disconnection while running queries; + # 0 for never # - Authentication - -#authentication_timeout = 1min # 1s-600s -#password_encryption = scram-sha-256 # scram-sha-256 or md5 +#authentication_timeout = 1min # 1s-600s +#password_encryption = scram-sha-256 # scram-sha-256 or (deprecated) md5 #scram_iterations = 4096 -#md5_password_warnings = on -#oauth_validator_libraries = '' # comma-separated list of trusted validator modules +#md5_password_warnings = on # display md5 deprecation warnings? 
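(For illustration only: a stricter authentication setup built from the
parameters above might look like the following; the values are arbitrary
assumptions, not recommendations from this patch.)

password_encryption = scram-sha-256
scram_iterations = 8192        # above the 4096 default shown above
authentication_timeout = 30s   # within the documented 1s-600s range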
+#oauth_validator_libraries = '' # comma-separated list of trusted validator modules # GSSAPI using Kerberos #krb_server_keyfile = 'FILE:${sysconfdir}/krb5.keytab' @@ -112,8 +112,8 @@ #ssl_crl_file = '' #ssl_crl_dir = '' #ssl_key_file = 'server.key' -#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed TLSv1.2 ciphers -#ssl_tls13_ciphers = '' # allowed TLSv1.3 cipher suites, blank for default +#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed TLSv1.2 ciphers +#ssl_tls13_ciphers = '' # allowed TLSv1.3 cipher suites, blank for default #ssl_prefer_server_ciphers = on #ssl_groups = 'X25519:prime256v1' #ssl_min_protocol_version = 'TLSv1.2' @@ -129,98 +129,96 @@ # - Memory - -#shared_buffers = 128MB # min 128kB - # (change requires restart) -#huge_pages = try # on, off, or try - # (change requires restart) -#huge_page_size = 0 # zero for system default - # (change requires restart) -#temp_buffers = 8MB # min 800kB -#max_prepared_transactions = 0 # zero disables the feature - # (change requires restart) +#shared_buffers = 128MB # min 128kB + # (change requires restart) +#huge_pages = try # on, off, or try + # (change requires restart) +#huge_page_size = 0 # zero for system default + # (change requires restart) +#temp_buffers = 8MB # min 800kB +#max_prepared_transactions = 0 # zero disables the feature + # (change requires restart) # Caution: it is not advisable to set max_prepared_transactions nonzero unless # you actively intend to use prepared transactions. -#work_mem = 4MB # min 64kB -#hash_mem_multiplier = 2.0 # 1-1000.0 multiplier on hash table work_mem -#maintenance_work_mem = 64MB # min 64kB -#autovacuum_work_mem = -1 # min 64kB, or -1 to use maintenance_work_mem -#logical_decoding_work_mem = 64MB # min 64kB -#max_stack_depth = 2MB # min 100kB -#shared_memory_type = mmap # the default is the first option - # supported by the operating system: - # mmap - # sysv - # windows - # (change requires restart) -#dynamic_shared_memory_type = posix # the default is usually the first option - # supported by the operating system: - # posix - # sysv - # windows - # mmap - # (change requires restart) -#min_dynamic_shared_memory = 0MB # (change requires restart) -#vacuum_buffer_usage_limit = 2MB # size of vacuum and analyze buffer access strategy ring; - # 0 to disable vacuum buffer access strategy; - # range 128kB to 16GB +#work_mem = 4MB # min 64kB +#hash_mem_multiplier = 2.0 # 1-1000.0 multiplier on hash table work_mem +#maintenance_work_mem = 64MB # min 64kB +#autovacuum_work_mem = -1 # min 64kB, or -1 to use maintenance_work_mem +#logical_decoding_work_mem = 64MB # min 64kB +#max_stack_depth = 2MB # min 100kB +#shared_memory_type = mmap # the default is the first option + # supported by the operating system: + # mmap + # sysv + # windows + # (change requires restart) +#dynamic_shared_memory_type = posix # the default is usually the first option + # supported by the operating system: + # posix + # sysv + # windows + # mmap + # (change requires restart) +#min_dynamic_shared_memory = 0MB # (change requires restart) +#vacuum_buffer_usage_limit = 2MB # size of vacuum and analyze buffer access strategy ring; + # 0 to disable vacuum buffer access strategy; + # range 128kB to 16GB # SLRU buffers (change requires restart) -#commit_timestamp_buffers = 0 # memory for pg_commit_ts (0 = auto) -#multixact_offset_buffers = 16 # memory for pg_multixact/offsets -#multixact_member_buffers = 32 # memory for pg_multixact/members -#notify_buffers = 16 # memory for pg_notify -#serializable_buffers = 32 # memory for 
pg_serial -#subtransaction_buffers = 0 # memory for pg_subtrans (0 = auto) -#transaction_buffers = 0 # memory for pg_xact (0 = auto) +#commit_timestamp_buffers = 0 # memory for pg_commit_ts (0 = auto) +#multixact_offset_buffers = 16 # memory for pg_multixact/offsets +#multixact_member_buffers = 32 # memory for pg_multixact/members +#notify_buffers = 16 # memory for pg_notify +#serializable_buffers = 32 # memory for pg_serial +#subtransaction_buffers = 0 # memory for pg_subtrans (0 = auto) +#transaction_buffers = 0 # memory for pg_xact (0 = auto) # - Disk - -#temp_file_limit = -1 # limits per-process temp file space - # in kilobytes, or -1 for no limit +#temp_file_limit = -1 # limits per-process temp file space + # in kilobytes, or -1 for no limit -#max_notify_queue_pages = 1048576 # limits the number of SLRU pages allocated - # for NOTIFY / LISTEN queue +#file_copy_method = copy # copy, clone (if supported by OS) -#file_copy_method = copy # the default is the first option - # copy - # clone (if system support is available) +#max_notify_queue_pages = 1048576 # limits the number of SLRU pages allocated + # for NOTIFY / LISTEN queue # - Kernel Resources - -#max_files_per_process = 1000 # min 64 - # (change requires restart) +#max_files_per_process = 1000 # min 64 + # (change requires restart) # - Background Writer - -#bgwriter_delay = 200ms # 10-10000ms between rounds -#bgwriter_lru_maxpages = 100 # max buffers written/round, 0 disables -#bgwriter_lru_multiplier = 2.0 # 0-10.0 multiplier on buffers scanned/round -#bgwriter_flush_after = 0 # measured in pages, 0 disables +#bgwriter_delay = 200ms # 10-10000ms between rounds +#bgwriter_lru_maxpages = 100 # max buffers written/round, 0 disables +#bgwriter_lru_multiplier = 2.0 # 0-10.0 multiplier on buffers scanned/round +#bgwriter_flush_after = 0 # measured in pages, 0 disables # - I/O - -#backend_flush_after = 0 # measured in pages, 0 disables -#effective_io_concurrency = 16 # 1-1000; 0 disables issuing multiple simultaneous IO requests -#maintenance_io_concurrency = 16 # 1-1000; same as effective_io_concurrency -#io_max_combine_limit = 128kB # usually 1-128 blocks (depends on OS) - # (change requires restart) -#io_combine_limit = 128kB # usually 1-128 blocks (depends on OS) - -#io_method = worker # worker, io_uring, sync - # (change requires restart) -#io_max_concurrency = -1 # Max number of IOs that one process - # can execute simultaneously - # -1 sets based on shared_buffers - # (change requires restart) -#io_workers = 3 # 1-32; +#backend_flush_after = 0 # measured in pages, 0 disables +#effective_io_concurrency = 16 # 1-1000; 0 disables issuing multiple simultaneous IO requests +#maintenance_io_concurrency = 16 # 1-1000; same as effective_io_concurrency +#io_max_combine_limit = 128kB # usually 1-128 blocks (depends on OS) + # (change requires restart) +#io_combine_limit = 128kB # usually 1-128 blocks (depends on OS) + +#io_method = worker # worker, io_uring, sync + # (change requires restart) +#io_max_concurrency = -1 # Max number of IOs that one process + # can execute simultaneously + # -1 sets based on shared_buffers + # (change requires restart) +#io_workers = 3 # 1-32; # - Worker Processes - -#max_worker_processes = 8 # (change requires restart) -#max_parallel_workers_per_gather = 2 # limited by max_parallel_workers -#max_parallel_maintenance_workers = 2 # limited by max_parallel_workers -#max_parallel_workers = 8 # number of max_worker_processes that - # can be used in parallel operations +#max_worker_processes = 8 # (change requires 
restart) +#max_parallel_workers_per_gather = 2 # limited by max_parallel_workers +#max_parallel_maintenance_workers = 2 # limited by max_parallel_workers +#max_parallel_workers = 8 # number of max_worker_processes that + # can be used in parallel operations #parallel_leader_participation = on @@ -230,104 +228,104 @@ # - Settings - -#wal_level = replica # minimal, replica, or logical - # (change requires restart) -#fsync = on # flush data to disk for crash safety - # (turning this off can cause - # unrecoverable data corruption) -#synchronous_commit = on # synchronization level; - # off, local, remote_write, remote_apply, or on -#wal_sync_method = fsync # the default is the first option - # supported by the operating system: - # open_datasync - # fdatasync (default on Linux and FreeBSD) - # fsync - # fsync_writethrough - # open_sync -#full_page_writes = on # recover from partial page writes -#wal_log_hints = off # also do full page writes of non-critical updates - # (change requires restart) -#wal_compression = off # enables compression of full-page writes; - # off, pglz, lz4, zstd, or on -#wal_init_zero = on # zero-fill new WAL files -#wal_recycle = on # recycle WAL files -#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers - # (change requires restart) -#wal_writer_delay = 200ms # 1-10000 milliseconds -#wal_writer_flush_after = 1MB # measured in pages, 0 disables +#wal_level = replica # minimal, replica, or logical + # (change requires restart) +#fsync = on # flush data to disk for crash safety + # (turning this off can cause + # unrecoverable data corruption) +#synchronous_commit = on # synchronization level; + # off, local, remote_write, remote_apply, or on +#wal_sync_method = fsync # the default is the first option + # supported by the operating system: + # open_datasync + # fdatasync (default on Linux and FreeBSD) + # fsync + # fsync_writethrough + # open_sync +#full_page_writes = on # recover from partial page writes +#wal_log_hints = off # also do full page writes of non-critical updates + # (change requires restart) +#wal_compression = off # enables compression of full-page writes; + # off, pglz, lz4, zstd, or on +#wal_init_zero = on # zero-fill new WAL files +#wal_recycle = on # recycle WAL files +#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers + # (change requires restart) +#wal_writer_delay = 200ms # 1-10000 milliseconds +#wal_writer_flush_after = 1MB # measured in pages, 0 disables #wal_skip_threshold = 2MB -#commit_delay = 0 # range 0-100000, in microseconds -#commit_siblings = 5 # range 1-1000 +#commit_delay = 0 # range 0-100000, in microseconds +#commit_siblings = 5 # range 0-1000 # - Checkpoints - -#checkpoint_timeout = 5min # range 30s-1d -#checkpoint_completion_target = 0.9 # checkpoint target duration, 0.0 - 1.0 -#checkpoint_flush_after = 0 # measured in pages, 0 disables -#checkpoint_warning = 30s # 0 disables +#checkpoint_timeout = 5min # range 30s-1d +#checkpoint_completion_target = 0.9 # checkpoint target duration, 0.0 - 1.0 +#checkpoint_flush_after = 0 # measured in pages, 0 disables +#checkpoint_warning = 30s # 0 disables #max_wal_size = 1GB #min_wal_size = 80MB # - Prefetching during recovery - -#recovery_prefetch = try # prefetch pages referenced in the WAL? -#wal_decode_buffer_size = 512kB # lookahead window used for prefetching - # (change requires restart) +#recovery_prefetch = try # prefetch pages referenced in the WAL? 
+#wal_decode_buffer_size = 512kB # lookahead window used for prefetching + # (change requires restart) # - Archiving - -#archive_mode = off # enables archiving; off, on, or always - # (change requires restart) -#archive_library = '' # library to use to archive a WAL file - # (empty string indicates archive_command should - # be used) -#archive_command = '' # command to use to archive a WAL file - # placeholders: %p = path of file to archive - # %f = file name only - # e.g. 'test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' -#archive_timeout = 0 # force a WAL file switch after this - # number of seconds; 0 disables +#archive_mode = off # enables archiving; off, on, or always + # (change requires restart) +#archive_library = '' # library to use to archive a WAL file + # (empty string indicates archive_command should + # be used) +#archive_command = '' # command to use to archive a WAL file + # placeholders: %p = path of file to archive + # %f = file name only + # e.g. 'test ! -f "/mnt/server/archivedir/%f" && cp "%p" "/mnt/server/archivedir/%f"' +#archive_timeout = 0 # force a WAL file switch after this + # number of seconds; 0 disables # - Archive Recovery - # These are only used in recovery mode. -#restore_command = '' # command to use to restore an archived WAL file - # placeholders: %p = path of file to restore - # %f = file name only - # e.g. 'cp /mnt/server/archivedir/%f %p' -#archive_cleanup_command = '' # command to execute at every restartpoint -#recovery_end_command = '' # command to execute at completion of recovery +#restore_command = '' # command to use to restore an archived WAL file + # placeholders: %p = path of file to restore + # %f = file name only + # e.g. 'cp "/mnt/server/archivedir/%f" "%p"' +#archive_cleanup_command = '' # command to execute at every restartpoint +#recovery_end_command = '' # command to execute at completion of recovery # - Recovery Target - # Set these only when performing a targeted recovery. 
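(For illustration only: a point-in-time recovery target built from the
parameters listed below; the timestamp is a placeholder assumption.)

recovery_target_time = '2025-01-01 00:00:00+00'
recovery_target_action = 'promote'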
-#recovery_target = '' # 'immediate' to end recovery as soon as a - # consistent state is reached - # (change requires restart) -#recovery_target_name = '' # the named restore point to which recovery will proceed - # (change requires restart) -#recovery_target_time = '' # the time stamp up to which recovery will proceed - # (change requires restart) -#recovery_target_xid = '' # the transaction ID up to which recovery will proceed - # (change requires restart) -#recovery_target_lsn = '' # the WAL LSN up to which recovery will proceed - # (change requires restart) -#recovery_target_inclusive = on # Specifies whether to stop: - # just after the specified recovery target (on) - # just before the recovery target (off) - # (change requires restart) -#recovery_target_timeline = 'latest' # 'current', 'latest', or timeline ID - # (change requires restart) -#recovery_target_action = 'pause' # 'pause', 'promote', 'shutdown' - # (change requires restart) +#recovery_target = '' # 'immediate' to end recovery as soon as a + # consistent state is reached + # (change requires restart) +#recovery_target_name = '' # the named restore point to which recovery will proceed + # (change requires restart) +#recovery_target_time = '' # the time stamp up to which recovery will proceed + # (change requires restart) +#recovery_target_xid = '' # the transaction ID up to which recovery will proceed + # (change requires restart) +#recovery_target_lsn = '' # the WAL LSN up to which recovery will proceed + # (change requires restart) +#recovery_target_inclusive = on # Specifies whether to stop: + # just after the specified recovery target (on) + # just before the recovery target (off) + # (change requires restart) +#recovery_target_timeline = 'latest' # 'current', 'latest', or timeline ID + # (change requires restart) +#recovery_target_action = 'pause' # 'pause', 'promote', 'shutdown' + # (change requires restart) # - WAL Summarization - -#summarize_wal = off # run WAL summarizer process? -#wal_summary_keep_time = '10d' # when to remove old summary files, 0 = never +#summarize_wal = off # run WAL summarizer process? +#wal_summary_keep_time = '10d' # when to remove old summary files, 0 = never #------------------------------------------------------------------------------ @@ -338,66 +336,66 @@ # Set these on the primary and on any standby that will send replication data. -#max_wal_senders = 10 # max number of walsender processes - # (change requires restart) -#max_replication_slots = 10 # max number of replication slots - # (change requires restart) -#wal_keep_size = 0 # in megabytes; 0 disables -#max_slot_wal_keep_size = -1 # in megabytes; -1 disables -#idle_replication_slot_timeout = 0 # in minutes; 0 disables -#wal_sender_timeout = 60s # in milliseconds; 0 disables -#track_commit_timestamp = off # collect timestamp of transaction commit - # (change requires restart) +#max_wal_senders = 10 # max number of walsender processes + # (change requires restart) +#max_replication_slots = 10 # max number of replication slots + # (change requires restart) +#wal_keep_size = 0 # in megabytes; 0 disables +#max_slot_wal_keep_size = -1 # in megabytes; -1 disables +#idle_replication_slot_timeout = 0 # in seconds; 0 disables +#wal_sender_timeout = 60s # in milliseconds; 0 disables +#track_commit_timestamp = off # collect timestamp of transaction commit + # (change requires restart) # - Primary Server - # These settings are ignored on a standby server. 
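(For illustration only: one way to use the synchronous_standby_names
syntax described below; the standby names are assumptions. This form makes
a commit wait for an acknowledgment from one of the two listed standbys,
chosen in priority order.)

synchronous_standby_names = 'FIRST 1 (standby_a, standby_b)'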
-#synchronous_standby_names = '' # standby servers that provide sync rep - # method to choose sync standbys, number of sync standbys, - # and comma-separated list of application_name - # from standby(s); '*' = all -#synchronized_standby_slots = '' # streaming replication standby server slot - # names that logical walsender processes will wait for +#synchronous_standby_names = '' # standby servers that provide sync rep + # method to choose sync standbys, number of sync standbys, + # and comma-separated list of application_name + # from standby(s); '*' = all +#synchronized_standby_slots = '' # streaming replication standby server slot + # names that logical walsender processes will wait for # - Standby Servers - # These settings are ignored on a primary server. -#primary_conninfo = '' # connection string to sending server -#primary_slot_name = '' # replication slot on sending server -#hot_standby = on # "off" disallows queries during recovery - # (change requires restart) -#max_standby_archive_delay = 30s # max delay before canceling queries - # when reading WAL from archive; - # -1 allows indefinite delay -#max_standby_streaming_delay = 30s # max delay before canceling queries - # when reading streaming WAL; - # -1 allows indefinite delay -#wal_receiver_create_temp_slot = off # create temp slot if primary_slot_name - # is not set -#wal_receiver_status_interval = 10s # send replies at least this often - # 0 disables -#hot_standby_feedback = off # send info from standby to prevent - # query conflicts -#wal_receiver_timeout = 60s # time that receiver waits for - # communication from primary - # in milliseconds; 0 disables -#wal_retrieve_retry_interval = 5s # time to wait before retrying to - # retrieve WAL after a failed attempt -#recovery_min_apply_delay = 0 # minimum delay for applying changes during recovery -#sync_replication_slots = off # enables slot synchronization on the physical standby from the primary +#primary_conninfo = '' # connection string to sending server +#primary_slot_name = '' # replication slot on sending server +#hot_standby = on # "off" disallows queries during recovery + # (change requires restart) +#max_standby_archive_delay = 30s # max delay before canceling queries + # when reading WAL from archive; + # -1 allows indefinite delay +#max_standby_streaming_delay = 30s # max delay before canceling queries + # when reading streaming WAL; + # -1 allows indefinite delay +#wal_receiver_create_temp_slot = off # create temp slot if primary_slot_name + # is not set +#wal_receiver_status_interval = 10s # send replies at least this often + # 0 disables +#hot_standby_feedback = off # send info from standby to prevent + # query conflicts +#wal_receiver_timeout = 60s # time that receiver waits for + # communication from primary + # in milliseconds; 0 disables +#wal_retrieve_retry_interval = 5s # time to wait before retrying to + # retrieve WAL after a failed attempt +#recovery_min_apply_delay = 0 # minimum delay for applying changes during recovery +#sync_replication_slots = off # enables slot synchronization on the physical standby from the primary # - Subscribers - # These settings are ignored on a publisher. 
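(For illustration only: a subscriber that synchronizes many tables might
enlarge the worker pools; the values are assumptions.)

max_logical_replication_workers = 8    # drawn from max_worker_processes
max_sync_workers_per_subscription = 4  # drawn from max_logical_replication_workers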
-#max_active_replication_origins = 10 # max number of active replication origins - # (change requires restart) -#max_logical_replication_workers = 4 # taken from max_worker_processes - # (change requires restart) -#max_sync_workers_per_subscription = 2 # taken from max_logical_replication_workers -#max_parallel_apply_workers_per_subscription = 2 # taken from max_logical_replication_workers +#max_active_replication_origins = 10 # max number of active replication origins + # (change requires restart) +#max_logical_replication_workers = 4 # taken from max_worker_processes + # (change requires restart) +#max_sync_workers_per_subscription = 2 # taken from max_logical_replication_workers +#max_parallel_apply_workers_per_subscription = 2 # taken from max_logical_replication_workers #------------------------------------------------------------------------------ @@ -430,51 +428,55 @@ #enable_group_by_reordering = on #enable_distinct_reordering = on #enable_self_join_elimination = on +#enable_eager_aggregate = on +#extended_parallel_processing = on # - Planner Cost Constants - -#seq_page_cost = 1.0 # measured on an arbitrary scale -#random_page_cost = 4.0 # same scale as above -#cpu_tuple_cost = 0.01 # same scale as above -#cpu_index_tuple_cost = 0.005 # same scale as above -#cpu_operator_cost = 0.0025 # same scale as above -#parallel_setup_cost = 1000.0 # same scale as above -#parallel_tuple_cost = 0.1 # same scale as above +#seq_page_cost = 1.0 # measured on an arbitrary scale +#random_page_cost = 4.0 # same scale as above +#write_page_cost = 5.0 # same scale as above +#cpu_tuple_cost = 0.01 # same scale as above +#cpu_index_tuple_cost = 0.005 # same scale as above +#cpu_operator_cost = 0.0025 # same scale as above +#parallel_setup_cost = 1000.0 # same scale as above +#parallel_tuple_cost = 0.1 # same scale as above #min_parallel_table_scan_size = 8MB #min_parallel_index_scan_size = 512kB #effective_cache_size = 4GB +#min_eager_agg_group_size = 8.0 -#jit_above_cost = 100000 # perform JIT compilation if available - # and query more expensive than this; - # -1 disables -#jit_inline_above_cost = 500000 # inline small functions if query is - # more expensive than this; -1 disables -#jit_optimize_above_cost = 500000 # use expensive JIT optimizations if - # query is more expensive than this; - # -1 disables +#jit_above_cost = 100000 # perform JIT compilation if available + # and query more expensive than this; + # -1 disables +#jit_inline_above_cost = 500000 # inline small functions if query is + # more expensive than this; -1 disables +#jit_optimize_above_cost = 500000 # use expensive JIT optimizations if + # query is more expensive than this; + # -1 disables # - Genetic Query Optimizer - #geqo = on #geqo_threshold = 12 -#geqo_effort = 5 # range 1-10 -#geqo_pool_size = 0 # selects default based on effort -#geqo_generations = 0 # selects default based on effort -#geqo_selection_bias = 2.0 # range 1.5-2.0 -#geqo_seed = 0.0 # range 0.0-1.0 +#geqo_effort = 5 # range 1-10 +#geqo_pool_size = 0 # selects default based on effort +#geqo_generations = 0 # selects default based on effort +#geqo_selection_bias = 2.0 # range 1.5-2.0 +#geqo_seed = 0.0 # range 0.0-1.0 # - Other Planner Options - -#default_statistics_target = 100 # range 1-10000 -#constraint_exclusion = partition # on, off, or partition -#cursor_tuple_fraction = 0.1 # range 0.0-1.0 +#default_statistics_target = 100 # range 1-10000 +#constraint_exclusion = partition # on, off, or partition +#cursor_tuple_fraction = 0.1 # range 0.0-1.0 
#from_collapse_limit = 8 -#jit = on # allow JIT compilation -#join_collapse_limit = 8 # 1 disables collapsing of explicit - # JOIN clauses -#plan_cache_mode = auto # auto, force_generic_plan or - # force_custom_plan -#recursive_worktable_factor = 10.0 # range 0.001-1000000 +#jit = on # allow JIT compilation +#join_collapse_limit = 8 # 1 disables collapsing of explicit + # JOIN clauses +#plan_cache_mode = auto # auto, force_generic_plan or + # force_custom_plan +#recursive_worktable_factor = 10.0 # range 0.001-1000000 #------------------------------------------------------------------------------ @@ -483,38 +485,38 @@ # - Where to Log - -#log_destination = 'stderr' # Valid values are combinations of - # stderr, csvlog, jsonlog, syslog, and - # eventlog, depending on platform. - # csvlog and jsonlog require - # logging_collector to be on. +#log_destination = 'stderr' # Valid values are combinations of + # stderr, csvlog, jsonlog, syslog, and + # eventlog, depending on platform. + # csvlog and jsonlog require + # logging_collector to be on. # This is used when logging to stderr: -#logging_collector = off # Enable capturing of stderr, jsonlog, - # and csvlog into log files. Required - # to be on for csvlogs and jsonlogs. - # (change requires restart) +#logging_collector = off # Enable capturing of stderr, jsonlog, + # and csvlog into log files. Required + # to be on for csvlogs and jsonlogs. + # (change requires restart) # These are only used if logging_collector is on: -#log_directory = 'log' # directory where log files are written, - # can be absolute or relative to PGDATA -#log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' # log file name pattern, - # can include strftime() escapes -#log_file_mode = 0600 # creation mode for log files, - # begin with 0 to use octal notation -#log_rotation_age = 1d # Automatic rotation of logfiles will - # happen after that time. 0 disables. -#log_rotation_size = 10MB # Automatic rotation of logfiles will - # happen after that much log output. - # 0 disables. -#log_truncate_on_rotation = off # If on, an existing log file with the - # same name as the new log file will be - # truncated rather than appended to. - # But such truncation only occurs on - # time-driven rotation, not on restarts - # or size-driven rotation. Default is - # off, meaning append to existing files - # in all cases. +#log_directory = 'log' # directory where log files are written, + # can be absolute or relative to PGDATA +#log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' # log file name pattern, + # can include strftime() escapes +#log_file_mode = 0600 # creation mode for log files, + # begin with 0 to use octal notation +#log_rotation_age = 1d # Automatic rotation of logfiles will + # happen after that time. 0 disables. +#log_rotation_size = 10MB # Automatic rotation of logfiles will + # happen after that much log output. + # 0 disables. +#log_truncate_on_rotation = off # If on, an existing log file with the + # same name as the new log file will be + # truncated rather than appended to. + # But such truncation only occurs on + # time-driven rotation, not on restarts + # or size-driven rotation. Default is + # off, meaning append to existing files + # in all cases. 
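(For example, the combination described in the PostgreSQL documentation
for keeping seven days of logs, one file per weekday, with last week's
file truncated at each rotation:)

log_filename = 'postgresql-%a.log'   # %a = abbreviated weekday name
log_rotation_age = 1d
log_truncate_on_rotation = on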
# These are relevant when logging to syslog: #syslog_facility = 'LOCAL0' @@ -528,124 +530,130 @@ # - When to Log - -#log_min_messages = warning # values in order of decreasing detail: - # debug5 - # debug4 - # debug3 - # debug2 - # debug1 - # info - # notice - # warning - # error - # log - # fatal - # panic - -#log_min_error_statement = error # values in order of decreasing detail: - # debug5 - # debug4 - # debug3 - # debug2 - # debug1 - # info - # notice - # warning - # error - # log - # fatal - # panic (effectively off) - -#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements - # and their durations, > 0 logs only - # statements running at least this number - # of milliseconds - -#log_min_duration_sample = -1 # -1 is disabled, 0 logs a sample of statements - # and their durations, > 0 logs only a sample of - # statements running at least this number - # of milliseconds; - # sample fraction is determined by log_statement_sample_rate - -#log_statement_sample_rate = 1.0 # fraction of logged statements exceeding - # log_min_duration_sample to be logged; - # 1.0 logs all such statements, 0.0 never logs - - -#log_transaction_sample_rate = 0.0 # fraction of transactions whose statements - # are logged regardless of their duration; 1.0 logs all - # statements from all transactions, 0.0 never logs - -#log_startup_progress_interval = 10s # Time between progress updates for - # long-running startup operations. - # 0 disables the feature, > 0 indicates - # the interval in milliseconds. +#log_min_messages = warning # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # info + # notice + # warning + # error + # log + # fatal + # panic + +#log_min_error_statement = error # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # info + # notice + # warning + # error + # log + # fatal + # panic (effectively off) + +#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements + # and their durations, > 0 logs only + # statements running at least this number + # of milliseconds + +#log_min_duration_sample = -1 # -1 is disabled, 0 logs a sample of statements + # and their durations, > 0 logs only a sample of + # statements running at least this number + # of milliseconds; + # sample fraction is determined by log_statement_sample_rate + +#log_statement_sample_rate = 1.0 # fraction of logged statements exceeding + # log_min_duration_sample to be logged; + # 1.0 logs all such statements, 0.0 never logs + + +#log_transaction_sample_rate = 0.0 # fraction of transactions whose statements + # are logged regardless of their duration; 1.0 logs all + # statements from all transactions, 0.0 never logs + +#log_startup_progress_interval = 10s # Time between progress updates for + # long-running startup operations. + # 0 disables the feature, > 0 indicates + # the interval in milliseconds. # - What to Log - +#debug_print_raw_parse = off #debug_print_parse = off #debug_print_rewritten = off #debug_print_plan = off #debug_pretty_print = on -#log_autovacuum_min_duration = 10min # log autovacuum activity; - # -1 disables, 0 logs all actions and - # their durations, > 0 logs only - # actions running at least this number - # of milliseconds. +#log_autovacuum_min_duration = 10min # log vacuum activity by autovacuum; + # -1 disables, 0 logs all actions and + # their durations, > 0 logs only + # actions running at least this number + # of milliseconds. 
+#log_autoanalyze_min_duration = 10min # log analyze activity by autovacuum; + # -1 disables, 0 logs all actions and + # their durations, > 0 logs only + # actions running at least this number + # of milliseconds. #log_checkpoints = on #log_connections = '' # log aspects of connection setup # options include receipt, authentication, authorization, # setup_durations, and all to log all of these aspects #log_disconnections = off #log_duration = off # log statement duration -#log_error_verbosity = default # terse, default, or verbose messages +#log_error_verbosity = default # terse, default, or verbose messages #log_hostname = off -#log_line_prefix = '%m [%p] ' # special values: - # %a = application name - # %u = user name - # %d = database name - # %r = remote host and port - # %h = remote host - # %L = local address - # %b = backend type - # %p = process ID - # %P = process ID of parallel group leader - # %t = timestamp without milliseconds - # %m = timestamp with milliseconds - # %n = timestamp with milliseconds (as a Unix epoch) - # %Q = query ID (0 if none or not computed) - # %i = command tag - # %e = SQL state - # %c = session ID - # %l = session line number - # %s = session start timestamp - # %v = virtual transaction ID - # %x = transaction ID (0 if none) - # %q = stop here in non-session - # processes - # %% = '%' - # e.g. '<%u%%%d> ' -#log_lock_waits = off # log lock waits >= deadlock_timeout -#log_lock_failure = off # log lock failures -#log_recovery_conflict_waits = off # log standby recovery conflict waits - # >= deadlock_timeout -#log_parameter_max_length = -1 # when logging statements, limit logged - # bind-parameter values to N bytes; - # -1 means print in full, 0 disables -#log_parameter_max_length_on_error = 0 # when logging an error, limit logged - # bind-parameter values to N bytes; - # -1 means print in full, 0 disables -#log_statement = 'none' # none, ddl, mod, all +#log_line_prefix = '%m [%p] ' # special values: + # %a = application name + # %u = user name + # %d = database name + # %r = remote host and port + # %h = remote host + # %L = local address + # %b = backend type + # %p = process ID + # %P = process ID of parallel group leader + # %t = timestamp without milliseconds + # %m = timestamp with milliseconds + # %n = timestamp with milliseconds (as a Unix epoch) + # %Q = query ID (0 if none or not computed) + # %i = command tag + # %e = SQL state + # %c = session ID + # %l = session line number + # %s = session start timestamp + # %v = virtual transaction ID + # %x = transaction ID (0 if none) + # %q = stop here in non-session + # processes + # %% = '%' + # e.g. 
'<%u%%%d> ' +#log_lock_waits = on # log lock waits >= deadlock_timeout +#log_lock_failures = off # log lock failures +#log_recovery_conflict_waits = off # log standby recovery conflict waits + # >= deadlock_timeout +#log_parameter_max_length = -1 # when logging statements, limit logged + # bind-parameter values to N bytes; + # -1 means print in full, 0 disables +#log_parameter_max_length_on_error = 0 # when logging an error, limit logged + # bind-parameter values to N bytes; + # -1 means print in full, 0 disables +#log_statement = 'none' # none, ddl, mod, all #log_replication_commands = off -#log_temp_files = -1 # log temporary files equal or larger - # than the specified size in kilobytes; - # -1 disables, 0 logs all temp files +#log_temp_files = -1 # log temporary files equal or larger + # than the specified size in kilobytes; + # -1 disables, 0 logs all temp files #log_timezone = 'GMT' # - Process Title - -#cluster_name = '' # added to process titles if nonempty - # (change requires restart) +#cluster_name = '' # added to process titles if nonempty + # (change requires restart) #update_process_title = on @@ -656,13 +664,13 @@ # - Cumulative Query and Index Statistics - #track_activities = on -#track_activity_query_size = 1024 # (change requires restart) +#track_activity_query_size = 1024 # (change requires restart) #track_counts = on #track_cost_delay_timing = off #track_io_timing = off #track_wal_io_timing = off -#track_functions = none # none, pl, all -#stats_fetch_consistency = cache # cache, none, snapshot +#track_functions = none # none, pl, all +#stats_fetch_consistency = cache # cache, none, snapshot # - Monitoring - @@ -680,49 +688,49 @@ # - Automatic Vacuuming - -#autovacuum = on # Enable autovacuum subprocess? 'on' - # requires track_counts to also be on. -autovacuum_worker_slots = 16 # autovacuum worker slots to allocate - # (change requires restart) -#autovacuum_max_workers = 3 # max number of autovacuum subprocesses -#autovacuum_naptime = 1min # time between autovacuum runs -#autovacuum_vacuum_threshold = 50 # min number of row updates before - # vacuum -#autovacuum_vacuum_insert_threshold = 1000 # min number of row inserts - # before vacuum; -1 disables insert - # vacuums -#autovacuum_analyze_threshold = 50 # min number of row updates before - # analyze -#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum -#autovacuum_vacuum_insert_scale_factor = 0.2 # fraction of unfrozen pages +#autovacuum = on # Enable autovacuum subprocess? 'on' + # requires track_counts to also be on. 
+#autovacuum_worker_slots = 16 # autovacuum worker slots to allocate + # (change requires restart) +#autovacuum_max_workers = 3 # max number of autovacuum subprocesses +#autovacuum_naptime = 1min # time between autovacuum runs +#autovacuum_vacuum_threshold = 50 # min number of row updates before + # vacuum +#autovacuum_vacuum_insert_threshold = 1000 # min number of row inserts + # before vacuum; -1 disables insert + # vacuums +#autovacuum_analyze_threshold = 50 # min number of row updates before + # analyze +#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum +#autovacuum_vacuum_insert_scale_factor = 0.2 # fraction of unfrozen pages # before insert vacuum -#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze +#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze #autovacuum_vacuum_max_threshold = 100000000 # max number of row updates - # before vacuum; -1 disables max - # threshold -#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum - # (change requires restart) -#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age - # before forced vacuum - # (change requires restart) -#autovacuum_vacuum_cost_delay = 2ms # default vacuum cost delay for - # autovacuum, in milliseconds; - # -1 means use vacuum_cost_delay -#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for - # autovacuum, -1 means use - # vacuum_cost_limit + # before vacuum; -1 disables max + # threshold +#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum + # (change requires restart) +#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age + # before forced vacuum + # (change requires restart) +#autovacuum_vacuum_cost_delay = 2ms # default vacuum cost delay for + # autovacuum, in milliseconds; + # -1 means use vacuum_cost_delay +#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for + # autovacuum, -1 means use + # vacuum_cost_limit # - Cost-Based Vacuum Delay - -#vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables) -#vacuum_cost_page_hit = 1 # 0-10000 credits -#vacuum_cost_page_miss = 2 # 0-10000 credits -#vacuum_cost_page_dirty = 20 # 0-10000 credits -#vacuum_cost_limit = 200 # 1-10000 credits +#vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables) +#vacuum_cost_page_hit = 1 # 0-10000 credits +#vacuum_cost_page_miss = 2 # 0-10000 credits +#vacuum_cost_page_dirty = 20 # 0-10000 credits +#vacuum_cost_limit = 200 # 1-10000 credits # - Default Behavior - -#vacuum_truncate = on # enable truncation after vacuum +#vacuum_truncate = on # enable truncation after vacuum # - Freezing - @@ -740,38 +748,38 @@ autovacuum_worker_slots = 16 # autovacuum worker slots to allocate # - Statement Behavior - -#client_min_messages = notice # values in order of decreasing detail: - # debug5 - # debug4 - # debug3 - # debug2 - # debug1 - # log - # notice - # warning - # error -#search_path = '"$user", public' # schema names +#client_min_messages = notice # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # log + # notice + # warning + # error +#search_path = '"$user", public' # schema names #row_security = on #default_table_access_method = 'heap' -#default_tablespace = '' # a tablespace name, '' uses the default -#default_toast_compression = 'pglz' # 'pglz' or 'lz4' -#temp_tablespaces = '' # a list of tablespace names, '' uses - # only default tablespace +#default_tablespace = '' # a tablespace name, '' uses the default 
+#default_toast_compression = 'pglz' # 'pglz' or 'lz4' +#temp_tablespaces = '' # a list of tablespace names, '' uses + # only default tablespace #check_function_bodies = on #default_transaction_isolation = 'read committed' #default_transaction_read_only = off #default_transaction_deferrable = off #session_replication_role = 'origin' -#statement_timeout = 0 # in milliseconds, 0 is disabled -#transaction_timeout = 0 # in milliseconds, 0 is disabled -#lock_timeout = 0 # in milliseconds, 0 is disabled -#idle_in_transaction_session_timeout = 0 # in milliseconds, 0 is disabled -#idle_session_timeout = 0 # in milliseconds, 0 is disabled -#bytea_output = 'hex' # hex, escape +#statement_timeout = 0 # in milliseconds, 0 is disabled +#transaction_timeout = 0 # in milliseconds, 0 is disabled +#lock_timeout = 0 # in milliseconds, 0 is disabled +#idle_in_transaction_session_timeout = 0 # in milliseconds, 0 is disabled +#idle_session_timeout = 0 # in milliseconds, 0 is disabled +#bytea_output = 'hex' # hex, escape #xmlbinary = 'base64' #xmloption = 'content' #gin_pending_list_limit = 4MB -#createrole_self_grant = '' # set and/or inherit +#createrole_self_grant = '' # set and/or inherit #event_triggers = on # - Locale and Formatting - @@ -779,27 +787,27 @@ autovacuum_worker_slots = 16 # autovacuum worker slots to allocate #datestyle = 'iso, mdy' #intervalstyle = 'postgres' #timezone = 'GMT' -#timezone_abbreviations = 'Default' # Select the set of available time zone - # abbreviations. Currently, there are - # Default - # Australia (historical usage) - # India - # You can create your own file in - # share/timezonesets/. -#extra_float_digits = 1 # min -15, max 3; any value >0 actually - # selects precise output mode -#client_encoding = sql_ascii # actually, defaults to database - # encoding +#timezone_abbreviations = 'Default' # Select the set of available time zone + # abbreviations. Currently, there are + # Default + # Australia (historical usage) + # India + # You can create your own file in + # share/timezonesets/. +#extra_float_digits = 1 # min -15, max 3; any value >0 actually + # selects precise output mode +#client_encoding = sql_ascii # actually, defaults to database + # encoding # These settings are initialized by initdb, but they can be changed. 
-#lc_messages = '' # locale for system error message - # strings -#lc_monetary = 'C' # locale for monetary formatting -#lc_numeric = 'C' # locale for number formatting -#lc_time = 'C' # locale for time formatting +#lc_messages = '' # locale for system error message + # strings +#lc_monetary = 'C' # locale for monetary formatting +#lc_numeric = 'C' # locale for number formatting +#lc_time = 'C' # locale for time formatting -#icu_validation_level = warning # report ICU locale validation - # errors at the given level +#icu_validation_level = warning # report ICU locale validation + # errors at the given level # default configuration for text search #default_text_search_config = 'pg_catalog.simple' @@ -808,8 +816,8 @@ autovacuum_worker_slots = 16 # autovacuum worker slots to allocate #local_preload_libraries = '' #session_preload_libraries = '' -#shared_preload_libraries = '' # (change requires restart) -#jit_provider = 'llvmjit' # JIT library to use +#shared_preload_libraries = '' # (change requires restart) +#jit_provider = 'llvmjit' # JIT library to use # - Other Defaults - @@ -823,14 +831,14 @@ autovacuum_worker_slots = 16 # autovacuum worker slots to allocate #------------------------------------------------------------------------------ #deadlock_timeout = 1s -#max_locks_per_transaction = 64 # min 10 - # (change requires restart) -#max_pred_locks_per_transaction = 64 # min 10 - # (change requires restart) -#max_pred_locks_per_relation = -2 # negative values mean - # (max_pred_locks_per_transaction - # / -max_pred_locks_per_relation) - 1 -#max_pred_locks_per_page = 2 # min 0 +#max_locks_per_transaction = 64 # min 10 + # (change requires restart) +#max_pred_locks_per_transaction = 64 # min 10 + # (change requires restart) +#max_pred_locks_per_relation = -2 # negative values mean + # (max_pred_locks_per_transaction + # / -max_pred_locks_per_relation) - 1 +#max_pred_locks_per_page = 2 # min 0 #------------------------------------------------------------------------------ @@ -840,7 +848,7 @@ autovacuum_worker_slots = 16 # autovacuum worker slots to allocate # - Previous PostgreSQL Versions - #array_nulls = on -#backslash_quote = safe_encoding # on, off, or safe_encoding +#backslash_quote = safe_encoding # on, off, or safe_encoding #escape_string_warning = on #lo_compat_privileges = off #quote_all_identifiers = off @@ -857,12 +865,12 @@ autovacuum_worker_slots = 16 # autovacuum worker slots to allocate # ERROR HANDLING #------------------------------------------------------------------------------ -#exit_on_error = off # terminate session on any error? -#restart_after_crash = on # reinitialize after backend crash? -#data_sync_retry = off # retry or panic on failure to fsync - # data? - # (change requires restart) -#recovery_init_sync_method = fsync # fsync, syncfs (Linux 5.8+) +#exit_on_error = off # terminate session on any error? +#restart_after_crash = on # reinitialize after backend crash? +#data_sync_retry = off # retry or panic on failure to fsync + # data? + # (change requires restart) +#recovery_init_sync_method = fsync # fsync, syncfs (Linux 5.8+) #------------------------------------------------------------------------------ @@ -873,10 +881,10 @@ autovacuum_worker_slots = 16 # autovacuum worker slots to allocate # default postgresql.conf. Note that these are directives, not variable # assignments, so they can usefully be given more than once. -#include_dir = '...' # include files ending in '.conf' from - # a directory, e.g., 'conf.d' -#include_if_exists = '...' 
# include file only if it exists -#include = '...' # include file +#include_dir = '...' # include files ending in '.conf' from + # a directory, e.g., 'conf.d' +#include_if_exists = '...' # include file only if it exists +#include = '...' # include file #------------------------------------------------------------------------------ diff --git a/src/backend/utils/misc/ps_status.c b/src/backend/utils/misc/ps_status.c index e08b26e8c14f2..5d8de92a57be9 100644 --- a/src/backend/utils/misc/ps_status.c +++ b/src/backend/utils/misc/ps_status.c @@ -23,7 +23,7 @@ #include "utils/guc.h" #include "utils/ps_status.h" -#if !defined(WIN32) || defined(_MSC_VER) +#if !defined(WIN32) extern char **environ; #endif @@ -52,7 +52,7 @@ bool update_process_title = DEFAULT_UPDATE_PROCESS_TITLE; #define PS_USE_SETPROCTITLE_FAST #elif defined(HAVE_SETPROCTITLE) #define PS_USE_SETPROCTITLE -#elif defined(__linux__) || defined(__sun) || defined(__darwin__) +#elif defined(__linux__) || defined(__sun) || defined(__darwin__) || defined(__GNU__) #define PS_USE_CLOBBER_ARGV #elif defined(WIN32) #define PS_USE_WIN32 @@ -62,7 +62,7 @@ bool update_process_title = DEFAULT_UPDATE_PROCESS_TITLE; /* Different systems want the buffer padded differently */ -#if defined(__linux__) || defined(__darwin__) +#if defined(__linux__) || defined(__darwin__) || defined(__GNU__) #define PS_PADDING '\0' #else #define PS_PADDING ' ' @@ -100,6 +100,17 @@ static void flush_ps_display(void); static int save_argc; static char **save_argv; +/* + * Valgrind seems not to consider the global "environ" variable as a valid + * root pointer; so when we allocate a new environment array, it claims that + * data is leaked. To fix that, keep our own statically-allocated copy of the + * pointer. (Oddly, this doesn't seem to be a problem for "argv".) + */ +#if defined(PS_USE_CLOBBER_ARGV) && defined(USE_VALGRIND) +extern char **ps_status_new_environ; +char **ps_status_new_environ; +#endif + /* * Call this early in startup to save the original argc/argv values. @@ -206,6 +217,11 @@ save_ps_display_args(int argc, char **argv) } new_environ[i] = NULL; environ = new_environ; + + /* See notes about Valgrind above. */ +#ifdef USE_VALGRIND + ps_status_new_environ = new_environ; +#endif } /* diff --git a/src/backend/utils/misc/queryenvironment.c b/src/backend/utils/misc/queryenvironment.c index 7bc72dabe6797..06983090039e8 100644 --- a/src/backend/utils/misc/queryenvironment.c +++ b/src/backend/utils/misc/queryenvironment.c @@ -38,7 +38,7 @@ struct QueryEnvironment QueryEnvironment * create_queryEnv(void) { - return (QueryEnvironment *) palloc0(sizeof(QueryEnvironment)); + return palloc0_object(QueryEnvironment); } EphemeralNamedRelationMetadata diff --git a/src/backend/utils/misc/tzparser.c b/src/backend/utils/misc/tzparser.c index 6aaf7395ba852..d7e84bab981aa 100644 --- a/src/backend/utils/misc/tzparser.c +++ b/src/backend/utils/misc/tzparser.c @@ -466,7 +466,7 @@ load_tzoffsets(const char *filename) /* Initialize array at a reasonable size */ arraysize = 128; - array = (tzEntry *) palloc(arraysize * sizeof(tzEntry)); + array = palloc_array(tzEntry, arraysize); /* Parse the file(s) */ n = ParseTzFile(filename, 0, &array, &arraysize, 0); diff --git a/src/backend/utils/mmgr/alignedalloc.c b/src/backend/utils/mmgr/alignedalloc.c index 7eea695de62c5..daee3fc80a1c3 100644 --- a/src/backend/utils/mmgr/alignedalloc.c +++ b/src/backend/utils/mmgr/alignedalloc.c @@ -23,8 +23,8 @@ /* * AlignedAllocFree -* Frees allocated memory; memory is removed from its owning context. 
-*/ + * Frees allocated memory; memory is removed from its owning context. + */ void AlignedAllocFree(void *pointer) { @@ -45,6 +45,15 @@ AlignedAllocFree(void *pointer) GetMemoryChunkContext(unaligned)->name, chunk); #endif + /* + * Create a dummy vchunk covering the start of the unaligned chunk, but + * not overlapping the aligned chunk. This will be freed while pfree'ing + * the unaligned chunk, keeping Valgrind happy. Then when we return to + * the outer pfree, that will clean up the vchunk for the aligned chunk. + */ + VALGRIND_MEMPOOL_ALLOC(GetMemoryChunkContext(unaligned), unaligned, + (char *) pointer - (char *) unaligned); + /* Recursively pfree the unaligned chunk */ pfree(unaligned); } @@ -123,6 +132,15 @@ AlignedAllocRealloc(void *pointer, Size size, int flags) VALGRIND_MAKE_MEM_DEFINED(pointer, old_size); memcpy(newptr, pointer, Min(size, old_size)); + /* + * Create a dummy vchunk covering the start of the old unaligned chunk, + * but not overlapping the aligned chunk. This will be freed while + * pfree'ing the old unaligned chunk, keeping Valgrind happy. Then when + * we return to repalloc, it will move the vchunk for the aligned chunk. + */ + VALGRIND_MEMPOOL_ALLOC(ctx, unaligned, + (char *) pointer - (char *) unaligned); + pfree(unaligned); return newptr; diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index 666ecd8f78d0e..bcd09c07533a8 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -103,6 +103,8 @@ #define ALLOC_BLOCKHDRSZ MAXALIGN(sizeof(AllocBlockData)) #define ALLOC_CHUNKHDRSZ sizeof(MemoryChunk) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(AllocSetContext)) + \ + ALLOC_BLOCKHDRSZ) typedef struct AllocBlockData *AllocBlock; /* forward reference */ @@ -187,25 +189,19 @@ typedef struct AllocBlockData char *endptr; /* end of space in this block */ } AllocBlockData; -/* - * AllocPointerIsValid - * True iff pointer is valid allocation pointer. - */ -#define AllocPointerIsValid(pointer) PointerIsValid(pointer) - /* * AllocSetIsValid * True iff set is valid allocation set. */ #define AllocSetIsValid(set) \ - (PointerIsValid(set) && IsA(set, AllocSetContext)) + ((set) && IsA(set, AllocSetContext)) /* * AllocBlockIsValid * True iff block is valid block of allocation set. */ #define AllocBlockIsValid(block) \ - (PointerIsValid(block) && AllocSetIsValid((block)->aset)) + ((block) && AllocSetIsValid((block)->aset)) /* * We always store external chunks on a dedicated block. This makes fetching @@ -458,6 +454,21 @@ AllocSetContextCreateInternal(MemoryContext parent, * we'd leak the header/initial block if we ereport in this stretch. */ + /* Create a vpool associated with the context */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + + /* + * Create a vchunk covering both the AllocSetContext struct and the keeper + * block's header. (Perhaps it would be more sensible for these to be two + * separate vchunks, but doing that seems to tickle bugs in some versions + * of Valgrind.) We must have these vchunks, and also a vchunk for each + * subsequently-added block header, so that Valgrind considers the + * pointers within them while checking for leaked memory. Note that + * Valgrind doesn't distinguish between these vchunks and those created by + * mcxt.c for the user-accessible-data chunks we allocate. 
+ */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + /* Fill in the initial block's block header */ block = KeeperBlock(set); block->aset = set; @@ -585,6 +596,14 @@ AllocSetReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif + + /* + * We need to free the block header's vchunk explicitly, although + * the user-data vchunks within will go away in the TRIM below. + * Otherwise Valgrind complains about leaked allocations. + */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } block = next; @@ -592,6 +611,14 @@ AllocSetReset(MemoryContext context) Assert(context->mem_allocated == keepersize); + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the AllocSetContext and + * keeper-block header. This gets rid of the vchunks for whatever user + * data is getting discarded by the context reset. + */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* Reset block size allocation sequence, too */ set->nextBlockSize = set->initBlockSize; } @@ -648,6 +675,9 @@ AllocSetDelete(MemoryContext context) freelist->first_free = (AllocSetContext *) oldset->header.nextchild; freelist->num_free--; + /* Destroy the context's vpool --- see notes below */ + VALGRIND_DESTROY_MEMPOOL(oldset); + /* All that remains is to free the header/initial block */ free(oldset); } @@ -675,13 +705,24 @@ AllocSetDelete(MemoryContext context) #endif if (!IsKeeperBlock(set, block)) + { + /* As in AllocSetReset, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); free(block); + } block = next; } Assert(context->mem_allocated == keepersize); + /* + * Destroy the vpool. We don't seem to need to explicitly free the + * initial block's header vchunk, nor any user-data vchunks that Valgrind + * still knows about; they'll all go away automatically. + */ + VALGRIND_DESTROY_MEMPOOL(set); + /* Finally, free the context header, including the keeper block */ free(set); } @@ -716,6 +757,9 @@ AllocSetAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, ALLOC_BLOCKHDRSZ); + context->mem_allocated += blksize; block->aset = set; @@ -922,6 +966,9 @@ AllocSetAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, ALLOC_BLOCKHDRSZ); + context->mem_allocated += blksize; block->aset = set; @@ -1104,6 +1151,10 @@ AllocSetFree(void *pointer) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif + + /* As in AllocSetReset, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } else @@ -1184,6 +1235,7 @@ AllocSetRealloc(void *pointer, Size size, int flags) * realloc() to make the containing block bigger, or smaller, with * minimum space wastage. 
*/ + AllocBlock newblock; Size chksize; Size blksize; Size oldblksize; @@ -1223,14 +1275,21 @@ AllocSetRealloc(void *pointer, Size size, int flags) blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; oldblksize = block->endptr - ((char *) block); - block = (AllocBlock) realloc(block, blksize); - if (block == NULL) + newblock = (AllocBlock) realloc(block, blksize); + if (newblock == NULL) { /* Disallow access to the chunk header. */ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ); return MemoryContextAllocationFailure(&set->header, size, flags); } + /* + * Move the block-header vchunk explicitly. (mcxt.c will take care of + * moving the vchunk for the user data.) + */ + VALGRIND_MEMPOOL_CHANGE(set, block, newblock, ALLOC_BLOCKHDRSZ); + block = newblock; + /* updated separately, not to underflow when (oldblksize > blksize) */ set->header.mem_allocated -= oldblksize; set->header.mem_allocated += blksize; @@ -1294,7 +1353,7 @@ AllocSetRealloc(void *pointer, Size size, int flags) /* Ensure any padding bytes are marked NOACCESS. */ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size, chksize - size); - /* Disallow access to the chunk header . */ + /* Disallow access to the chunk header. */ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ); return pointer; @@ -1609,9 +1668,9 @@ AllocSetCheck(MemoryContext context) prevblock = block, block = block->next) { char *bpoz = ((char *) block) + ALLOC_BLOCKHDRSZ; - long blk_used = block->freeptr - bpoz; - long blk_data = 0; - long nchunks = 0; + Size blk_used = block->freeptr - bpoz; + Size blk_data = 0; + Size nchunks = 0; bool has_external_chunk = false; if (IsKeeperBlock(set, block)) diff --git a/src/backend/utils/mmgr/bump.c b/src/backend/utils/mmgr/bump.c index f7a37d1b3e86c..e60ec94e1394c 100644 --- a/src/backend/utils/mmgr/bump.c +++ b/src/backend/utils/mmgr/bump.c @@ -45,7 +45,9 @@ #include "utils/memutils_memorychunk.h" #include "utils/memutils_internal.h" -#define Bump_BLOCKHDRSZ MAXALIGN(sizeof(BumpBlock)) +#define Bump_BLOCKHDRSZ MAXALIGN(sizeof(BumpBlock)) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(BumpContext)) + \ + Bump_BLOCKHDRSZ) /* No chunk header unless built with MEMORY_CONTEXT_CHECKING */ #ifdef MEMORY_CONTEXT_CHECKING @@ -98,7 +100,7 @@ struct BumpBlock * True iff set is valid bump context. */ #define BumpIsValid(set) \ - (PointerIsValid(set) && IsA(set, BumpContext)) + ((set) && IsA(set, BumpContext)) /* * We always store external chunks on a dedicated block. This makes fetching @@ -189,6 +191,12 @@ BumpContextCreate(MemoryContext parent, const char *name, Size minContextSize, * Avoid writing code that can fail between here and MemoryContextCreate; * we'd leak the header and initial block if we ereport in this stretch. */ + + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + /* This vchunk covers the BumpContext and the keeper block header */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + dlist_init(&set->blocks); /* Fill in the initial block's block header */ @@ -262,6 +270,14 @@ BumpReset(MemoryContext context) BumpBlockFree(set, block); } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the BumpContext and keeper-block + * header. This gets rid of the vchunks for whatever user data is getting + * discarded by the context reset. 
+ */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* Reset block size allocation sequence, too */ set->nextBlockSize = set->initBlockSize; @@ -279,6 +295,10 @@ BumpDelete(MemoryContext context) { /* Reset to release all releasable BumpBlocks */ BumpReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header and keeper block */ free(context); } @@ -318,6 +338,9 @@ BumpAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Bump_BLOCKHDRSZ); + context->mem_allocated += blksize; /* the block is completely full */ @@ -384,7 +407,7 @@ BumpAllocChunkFromBlock(MemoryContext context, BumpBlock *block, Size size, #ifdef MEMORY_CONTEXT_CHECKING chunk = (MemoryChunk *) block->freeptr; #else - ptr = (void *) block->freeptr; + ptr = block->freeptr; #endif /* point the freeptr beyond this chunk */ @@ -455,6 +478,9 @@ BumpAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Bump_BLOCKHDRSZ); + context->mem_allocated += blksize; /* initialize the new block */ @@ -606,6 +632,9 @@ BumpBlockFree(BumpContext *set, BumpBlock *block) wipe_mem(block, ((char *) block->endptr - (char *) block)); #endif + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 17d4f7a7a06e1..6b37839e92592 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -531,6 +531,21 @@ dsa_attach(dsa_handle handle) return area; } +/* + * Returns whether the area with the given handle was already attached by the + * current process. The area must have been created with dsa_create (not + * dsa_create_in_place). + */ +bool +dsa_is_attached(dsa_handle handle) +{ + /* + * An area handle is really a DSM segment handle for the first segment, so + * we can just search for that. + */ + return dsm_find_mapping(handle) != NULL; +} + /* * Attach to an area that was created with dsa_create_in_place. The caller * must somehow know the location in memory that was used when the area was @@ -1028,13 +1043,48 @@ dsa_get_total_size(dsa_area *area) { size_t size; - LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + LWLockAcquire(DSA_AREA_LOCK(area), LW_SHARED); size = area->control->total_segment_size; LWLockRelease(DSA_AREA_LOCK(area)); return size; } +/* + * Same as dsa_get_total_size(), but accepts a DSA handle. The area must have + * been created with dsa_create (not dsa_create_in_place). 
+ */ +size_t +dsa_get_total_size_from_handle(dsa_handle handle) +{ + size_t size; + bool already_attached; + dsm_segment *segment; + dsa_area_control *control; + + already_attached = dsa_is_attached(handle); + if (already_attached) + segment = dsm_find_mapping(handle); + else + segment = dsm_attach(handle); + + if (segment == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not attach to dynamic shared area"))); + + control = (dsa_area_control *) dsm_segment_address(segment); + + LWLockAcquire(&control->lock, LW_SHARED); + size = control->total_segment_size; + LWLockRelease(&control->lock); + + if (!already_attached) + dsm_detach(segment); + + return size; +} + /* * Aggressively free all spare memory in the hope of returning DSM segments to * the operating system. @@ -1280,7 +1330,7 @@ create_internal(void *place, size_t size, * area. Other backends will need to obtain their own dsa_area object by * attaching. */ - area = palloc(sizeof(dsa_area)); + area = palloc_object(dsa_area); area->control = control; area->resowner = CurrentResourceOwner; memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); @@ -1336,7 +1386,7 @@ attach_internal(void *place, dsm_segment *segment, dsa_handle handle) (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0)); /* Build the backend-local area object. */ - area = palloc(sizeof(dsa_area)); + area = palloc_object(dsa_area); area->control = control; area->resowner = CurrentResourceOwner; memset(&area->segment_maps[0], 0, diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c index 52fa78dc58612..27d3e6e100c10 100644 --- a/src/backend/utils/mmgr/freepage.c +++ b/src/backend/utils/mmgr/freepage.c @@ -894,14 +894,14 @@ FreePageBtreeGetRecycled(FreePageManager *fpm) } /* - * Insert an item into an internal page. + * Insert an item into an internal page (there must be room). */ static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index, Size first_page, FreePageBtree *child) { Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); - Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); + Assert(btp->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE); Assert(index <= btp->hdr.nused); memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index], sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index)); @@ -911,14 +911,14 @@ FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index, } /* - * Insert an item into a leaf page. + * Insert an item into a leaf page (there must be room). */ static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page, Size npages) { Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); - Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); + Assert(btp->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE); Assert(index <= btp->hdr.nused); memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index], sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c index 18679ad4f1e41..f6203501956d7 100644 --- a/src/backend/utils/mmgr/generation.c +++ b/src/backend/utils/mmgr/generation.c @@ -45,6 +45,8 @@ #define Generation_BLOCKHDRSZ MAXALIGN(sizeof(GenerationBlock)) #define Generation_CHUNKHDRSZ sizeof(MemoryChunk) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(GenerationContext)) + \ + Generation_BLOCKHDRSZ) #define Generation_CHUNK_FRACTION 8 @@ -100,14 +102,14 @@ struct GenerationBlock * True iff set is valid generation set. 
*/ #define GenerationIsValid(set) \ - (PointerIsValid(set) && IsA(set, GenerationContext)) + ((set) && IsA(set, GenerationContext)) /* * GenerationBlockIsValid * True iff block is valid block of generation set. */ #define GenerationBlockIsValid(block) \ - (PointerIsValid(block) && GenerationIsValid((block)->context)) + ((block) && GenerationIsValid((block)->context)) /* * GenerationBlockIsEmpty @@ -221,6 +223,12 @@ GenerationContextCreate(MemoryContext parent, * Avoid writing code that can fail between here and MemoryContextCreate; * we'd leak the header if we ereport in this stretch. */ + + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + /* This vchunk covers the GenerationContext and the keeper block header */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + dlist_init(&set->blocks); /* Fill in the initial block's block header */ @@ -309,6 +317,14 @@ GenerationReset(MemoryContext context) GenerationBlockFree(set, block); } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the GenerationContext and + * keeper-block header. This gets rid of the vchunks for whatever user + * data is getting discarded by the context reset. + */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* set it so new allocations to make use of the keeper block */ set->block = KeeperBlock(set); @@ -329,6 +345,10 @@ GenerationDelete(MemoryContext context) { /* Reset to release all releasable GenerationBlocks */ GenerationReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header and keeper block */ free(context); } @@ -365,6 +385,9 @@ GenerationAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Generation_BLOCKHDRSZ); + context->mem_allocated += blksize; /* block with a single (used) chunk */ @@ -487,6 +510,9 @@ GenerationAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Generation_BLOCKHDRSZ); + context->mem_allocated += blksize; /* initialize the new block */ @@ -677,6 +703,9 @@ GenerationBlockFree(GenerationContext *set, GenerationBlock *block) wipe_mem(block, block->blksize); #endif + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 15fa4d0a55eeb..5c1a06d86fd82 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -8,6 +8,23 @@ * context-type-specific operations via the function pointers in a * context's MemoryContextMethods struct. * + * A note about Valgrind support: when USE_VALGRIND is defined, we provide + * support for memory leak tracking at the allocation-unit level. Valgrind + * does leak detection by tracking allocated "chunks", which can be grouped + * into "pools". The "chunk" terminology is overloaded, since we use that + * word for our allocation units, and it's sometimes important to distinguish + * those from the Valgrind objects that describe them. To reduce confusion, + * let's use the terms "vchunk" and "vpool" for the Valgrind objects. 
+ * + * We use a separate vpool for each memory context. The context-type-specific + * code is responsible for creating and deleting the vpools, and also for + * creating vchunks to cover its management data structures such as block + * headers. (There must be a vchunk that includes every pointer we want + * Valgrind to consider for leak-tracking purposes.) This module creates + * and deletes the vchunks that cover the caller-visible allocated chunks. + * However, the context-type-specific code must handle cleaning up those + * vchunks too during memory context reset operations. + * * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -157,6 +174,9 @@ MemoryContext CurTransactionContext = NULL; /* This is a transient link to the active portal's memory context: */ MemoryContext PortalContext = NULL; +/* Is memory context logging currently in progress? */ +static bool LogMemoryContextInProgress = false; + static void MemoryContextDeleteOnly(MemoryContext context); static void MemoryContextCallResetCallbacks(MemoryContext context); static void MemoryContextStatsInternal(MemoryContext context, int level, @@ -418,8 +438,6 @@ MemoryContextResetOnly(MemoryContext context) context->methods->reset(context); context->isReset = true; - VALGRIND_DESTROY_MEMPOOL(context); - VALGRIND_CREATE_MEMPOOL(context, 0, false); } } @@ -526,8 +544,6 @@ MemoryContextDeleteOnly(MemoryContext context) context->ident = NULL; context->methods->delete_context(context); - - VALGRIND_DESTROY_MEMPOOL(context); } /* @@ -560,9 +576,7 @@ MemoryContextDeleteChildren(MemoryContext context) * the specified context, since that means it will automatically be freed * when no longer needed. * - * There is no API for deregistering a callback once registered. If you - * want it to not do anything anymore, adjust the state pointed to by its - * "arg" to indicate that. + * Note that callers can assume this cannot fail. */ void MemoryContextRegisterResetCallback(MemoryContext context, @@ -577,6 +591,41 @@ MemoryContextRegisterResetCallback(MemoryContext context, context->isReset = false; } +/* + * MemoryContextUnregisterResetCallback + * Undo the effects of MemoryContextRegisterResetCallback. + * + * This can be used if a callback's effects are no longer required + * at some point before the context has been reset/deleted. It is the + * caller's responsibility to pfree the callback struct (if needed). + * + * An assertion failure occurs if the callback was not registered. + * We could alternatively define that case as a no-op, but that seems too + * likely to mask programming errors such as passing the wrong context. + */ +void +MemoryContextUnregisterResetCallback(MemoryContext context, + MemoryContextCallback *cb) +{ + MemoryContextCallback *prev, + *cur; + + Assert(MemoryContextIsValid(context)); + + for (prev = NULL, cur = context->reset_cbs; cur != NULL; + prev = cur, cur = cur->next) + { + if (cur != cb) + continue; + if (prev) + prev->next = cur->next; + else + context->reset_cbs = cur->next; + return; + } + Assert(false); +} + /* * MemoryContextCallResetCallbacks * Internal function to call all registered callbacks for context. 
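
For readers who have not used the reset-callback API before, here is a minimal usage sketch of the register/unregister pair added above. It is illustrative only and not part of the patch; the names forget_external_handle, example_with_unregister, and handle are invented.

```c
#include "postgres.h"
#include "utils/memutils.h"

/* Invented cleanup hook: releases whatever resource "arg" points to. */
static void
forget_external_handle(void *arg)
{
	/* ... release the external resource referenced by arg ... */
}

static void
example_with_unregister(MemoryContext cxt, void *handle)
{
	MemoryContextCallback *cb;

	/*
	 * Allocate the callback struct in the target context, so it is
	 * reclaimed automatically if the context is reset or deleted while
	 * the callback is still registered.
	 */
	cb = (MemoryContextCallback *)
		MemoryContextAlloc(cxt, sizeof(MemoryContextCallback));
	cb->func = forget_external_handle;
	cb->arg = handle;
	MemoryContextRegisterResetCallback(cxt, cb);

	/* ... error-prone work protected by the callback goes here ... */

	/*
	 * Success path: the cleanup is no longer wanted.  Before this patch,
	 * the only recourse was to mutate the state behind "arg" so that the
	 * callback became a no-op; now it can simply be deregistered, after
	 * which freeing the struct is the caller's responsibility.
	 */
	MemoryContextUnregisterResetCallback(cxt, cb);
	pfree(cb);
}
```

Allocating the callback struct inside the context it watches keeps the error path airtight: if the context goes away first, the struct goes with it.
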
@@ -1137,8 +1186,6 @@ MemoryContextCreate(MemoryContext node, node->nextchild = NULL; node->allowInCritSection = false; } - - VALGRIND_CREATE_MEMPOOL(node, 0, false); } /* @@ -1295,26 +1342,45 @@ ProcessLogMemoryContextInterrupt(void) LogMemoryContextPending = false; /* - * Use LOG_SERVER_ONLY to prevent this message from being sent to the - * connected client. + * Exit immediately if memory context logging is already in progress. This + * prevents recursive calls, which could occur if logging is requested + * repeatedly and rapidly, potentially leading to infinite recursion and a + * crash. */ - ereport(LOG_SERVER_ONLY, - (errhidestmt(true), - errhidecontext(true), - errmsg("logging memory contexts of PID %d", MyProcPid))); + if (LogMemoryContextInProgress) + return; + LogMemoryContextInProgress = true; - /* - * When a backend process is consuming huge memory, logging all its memory - * contexts might overrun available disk space. To prevent this, we limit - * the depth of the hierarchy, as well as the number of child contexts to - * log per parent to 100. - * - * As with MemoryContextStats(), we suppose that practical cases where the - * dump gets long will typically be huge numbers of siblings under the - * same parent context; while the additional debugging value from seeing - * details about individual siblings beyond 100 will not be large. - */ - MemoryContextStatsDetail(TopMemoryContext, 100, 100, false); + PG_TRY(); + { + /* + * Use LOG_SERVER_ONLY to prevent this message from being sent to the + * connected client. + */ + ereport(LOG_SERVER_ONLY, + (errhidestmt(true), + errhidecontext(true), + errmsg("logging memory contexts of PID %d", MyProcPid))); + + /* + * When a backend process is consuming huge memory, logging all its + * memory contexts might overrun available disk space. To prevent + * this, we limit the depth of the hierarchy, as well as the number of + * child contexts to log per parent to 100. + * + * As with MemoryContextStats(), we suppose that practical cases where + * the dump gets long will typically be huge numbers of siblings under + * the same parent context; while the additional debugging value from + * seeing details about individual siblings beyond 100 will not be + * large. + */ + MemoryContextStatsDetail(TopMemoryContext, 100, 100, false); + } + PG_FINALLY(); + { + LogMemoryContextInProgress = false; + } + PG_END_TRY(); } void * @@ -1421,7 +1487,13 @@ MemoryContextAllocAligned(MemoryContext context, void *unaligned; void *aligned; - /* wouldn't make much sense to waste that much space */ + /* + * Restrict alignto to ensure that it can fit into the "value" field of + * the redirection MemoryChunk, and that the distance back to the start of + * the unaligned chunk will fit into the space available for that. This + * isn't a limitation in practice, since it wouldn't make much sense to + * waste that much space. + */ Assert(alignto < (128 * 1024 * 1024)); /* ensure alignto is a power of 2 */ @@ -1458,10 +1530,15 @@ MemoryContextAllocAligned(MemoryContext context, alloc_size += 1; #endif - /* perform the actual allocation */ - unaligned = MemoryContextAllocExtended(context, alloc_size, flags); + /* + * Perform the actual allocation, but do not pass down MCXT_ALLOC_ZERO. + * This ensures that wasted bytes beyond the aligned chunk do not become + * DEFINED. 
+ */ + unaligned = MemoryContextAllocExtended(context, alloc_size, + flags & ~MCXT_ALLOC_ZERO); - /* set the aligned pointer */ + /* compute the aligned pointer */ aligned = (void *) TYPEALIGN(alignto, (char *) unaligned + sizeof(MemoryChunk)); @@ -1489,12 +1566,23 @@ MemoryContextAllocAligned(MemoryContext context, set_sentinel(aligned, size); #endif - /* Mark the bytes before the redirection header as noaccess */ - VALGRIND_MAKE_MEM_NOACCESS(unaligned, - (char *) alignedchunk - (char *) unaligned); + /* + * MemoryContextAllocExtended marked the whole unaligned chunk as a + * vchunk. Undo that, instead making just the aligned chunk be a vchunk. + * This prevents Valgrind from complaining that the vchunk is possibly + * leaked, since only pointers to the aligned chunk will exist. + * + * After these calls, the aligned chunk will be marked UNDEFINED, and all + * the rest of the unaligned chunk (the redirection chunk header, the + * padding bytes before it, and any wasted trailing bytes) will be marked + * NOACCESS, which is what we want. + */ + VALGRIND_MEMPOOL_FREE(context, unaligned); + VALGRIND_MEMPOOL_ALLOC(context, aligned, size); - /* Disallow access to the redirection chunk header. */ - VALGRIND_MAKE_MEM_NOACCESS(alignedchunk, sizeof(MemoryChunk)); + /* Now zero (and make DEFINED) just the aligned chunk, if requested */ + if ((flags & MCXT_ALLOC_ZERO) != 0) + MemSetAligned(aligned, 0, size); return aligned; } @@ -1528,16 +1616,12 @@ void pfree(void *pointer) { #ifdef USE_VALGRIND - MemoryContextMethodID method = GetMemoryChunkMethodID(pointer); MemoryContext context = GetMemoryChunkContext(pointer); #endif MCXT_METHOD(pointer, free_p) (pointer); -#ifdef USE_VALGRIND - if (method != MCTX_ALIGNED_REDIRECT_ID) - VALGRIND_MEMPOOL_FREE(context, pointer); -#endif + VALGRIND_MEMPOOL_FREE(context, pointer); } /* @@ -1547,9 +1631,6 @@ pfree(void *pointer) void * repalloc(void *pointer, Size size) { -#ifdef USE_VALGRIND - MemoryContextMethodID method = GetMemoryChunkMethodID(pointer); -#endif #if defined(USE_ASSERT_CHECKING) || defined(USE_VALGRIND) MemoryContext context = GetMemoryChunkContext(pointer); #endif @@ -1572,10 +1653,7 @@ repalloc(void *pointer, Size size) */ ret = MCXT_METHOD(pointer, realloc) (pointer, size, 0); -#ifdef USE_VALGRIND - if (method != MCTX_ALIGNED_REDIRECT_ID) - VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); -#endif + VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); return ret; } diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index 0be1c2b0fff85..1f2a423f38a67 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -131,7 +131,7 @@ GetPortalByName(const char *name) { Portal portal; - if (PointerIsValid(name)) + if (name) PortalHashTableLookup(name, portal); else portal = NULL; @@ -176,7 +176,7 @@ CreatePortal(const char *name, bool allowDup, bool dupSilent) { Portal portal; - Assert(PointerIsValid(name)); + Assert(name); portal = GetPortalByName(name); if (PortalIsValid(portal)) @@ -425,7 +425,7 @@ MarkPortalDone(Portal portal) * aborted transaction, this is necessary, or we'd reach AtCleanup_Portals * with the cleanup hook still unexecuted. */ - if (PointerIsValid(portal->cleanup)) + if (portal->cleanup) { portal->cleanup(portal); portal->cleanup = NULL; @@ -453,7 +453,7 @@ MarkPortalFailed(Portal portal) * is necessary, or we'd reach AtCleanup_Portals with the cleanup hook * still unexecuted. 
*/ - if (PointerIsValid(portal->cleanup)) + if (portal->cleanup) { portal->cleanup(portal); portal->cleanup = NULL; @@ -497,7 +497,7 @@ PortalDrop(Portal portal, bool isTopCommit) * Note: in most paths of control, this will have been done already in * MarkPortalDone or MarkPortalFailed. We're just making sure. */ - if (PointerIsValid(portal->cleanup)) + if (portal->cleanup) { portal->cleanup(portal); portal->cleanup = NULL; @@ -823,7 +823,7 @@ AtAbort_Portals(void) * Allow portalcmds.c to clean up the state it knows about, if we * haven't already. */ - if (PointerIsValid(portal->cleanup)) + if (portal->cleanup) { portal->cleanup(portal); portal->cleanup = NULL; @@ -853,7 +853,8 @@ AtAbort_Portals(void) /* * Post-abort cleanup for portals. * - * Delete all portals not held over from prior transactions. */ + * Delete all portals not held over from prior transactions. + */ void AtCleanup_Portals(void) { @@ -896,7 +897,7 @@ AtCleanup_Portals(void) * We had better not call any user-defined code during cleanup, so if * the cleanup hook hasn't been run yet, too bad; we'll just skip it. */ - if (PointerIsValid(portal->cleanup)) + if (portal->cleanup) { elog(WARNING, "skipping cleanup for portal \"%s\"", portal->name); portal->cleanup = NULL; @@ -1056,7 +1057,7 @@ AtSubAbort_Portals(SubTransactionId mySubid, * Allow portalcmds.c to clean up the state it knows about, if we * haven't already. */ - if (PointerIsValid(portal->cleanup)) + if (portal->cleanup) { portal->cleanup(portal); portal->cleanup = NULL; @@ -1115,7 +1116,7 @@ AtSubCleanup_Portals(SubTransactionId mySubid) * We had better not call any user-defined code during cleanup, so if * the cleanup hook hasn't been run yet, too bad; we'll just skip it. */ - if (PointerIsValid(portal->cleanup)) + if (portal->cleanup) { elog(WARNING, "skipping cleanup for portal \"%s\"", portal->name); portal->cleanup = NULL; diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c index d32c0d318fbf4..0bb96187603fc 100644 --- a/src/backend/utils/mmgr/slab.c +++ b/src/backend/utils/mmgr/slab.c @@ -193,14 +193,14 @@ typedef struct SlabBlock * SlabIsValid * True iff set is a valid slab allocation set. */ -#define SlabIsValid(set) (PointerIsValid(set) && IsA(set, SlabContext)) +#define SlabIsValid(set) ((set) && IsA(set, SlabContext)) /* * SlabBlockIsValid * True iff block is a valid block of slab allocation set. */ #define SlabBlockIsValid(block) \ - (PointerIsValid(block) && SlabIsValid((block)->slab)) + ((block) && SlabIsValid((block)->slab)) /* * SlabBlocklistIndex @@ -377,6 +377,11 @@ SlabContextCreate(MemoryContext parent, * we'd leak the header if we ereport in this stretch. 
*/ + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(slab, 0, false); + /* This vchunk covers the SlabContext only */ + VALGRIND_MEMPOOL_ALLOC(slab, slab, sizeof(SlabContext)); + /* Fill in SlabContext-specific header fields */ slab->chunkSize = (uint32) chunkSize; slab->fullChunkSize = (uint32) fullChunkSize; @@ -451,6 +456,10 @@ SlabReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); context->mem_allocated -= slab->blockSize; } @@ -467,11 +476,23 @@ SlabReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); context->mem_allocated -= slab->blockSize; } } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the SlabContext. This gets rid of + * the vchunks for whatever user data is getting discarded by the context + * reset. + */ + VALGRIND_MEMPOOL_TRIM(slab, slab, sizeof(SlabContext)); + slab->curBlocklistIndex = 0; Assert(context->mem_allocated == 0); @@ -486,6 +507,10 @@ SlabDelete(MemoryContext context) { /* Reset to release all the SlabBlocks */ SlabReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header */ free(context); } @@ -567,6 +592,9 @@ SlabAllocFromNewBlock(MemoryContext context, Size size, int flags) if (unlikely(block == NULL)) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(slab, block, Slab_BLOCKHDRSZ); + block->slab = slab; context->mem_allocated += slab->blockSize; @@ -795,6 +823,10 @@ SlabFree(void *pointer) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); slab->header.mem_allocated -= slab->blockSize; } diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index d39f3e1b655cd..fca84ded6ddc3 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -231,11 +231,8 @@ hash_resource_elem(Datum value, const ResourceOwnerDesc *kind) * 'kind' into the hash. Just add it with hash_combine(), it perturbs the * result enough for our purposes. */ -#if SIZEOF_DATUM == 8 - return hash_combine64(murmurhash64((uint64) value), (uint64) kind); -#else - return hash_combine(murmurhash32((uint32) value), (uint32) kind); -#endif + return hash_combine64(murmurhash64((uint64) value), + (uint64) (uintptr_t) kind); } /* diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index e529ceb8260bb..42bf50221b8c7 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -437,7 +437,7 @@ ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt) if (lt->prealloc == NULL) { lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN; - lt->prealloc = (int64 *) palloc(sizeof(int64) * lt->prealloc_size); + lt->prealloc = palloc_array(int64, lt->prealloc_size); } else if (lt->prealloc_size < TAPE_WRITE_PREALLOC_MAX) { @@ -560,7 +560,7 @@ LogicalTapeSetCreate(bool preallocate, SharedFileSet *fileset, int worker) /* * Create top-level struct including per-tape LogicalTape structs. 
*/ - lts = (LogicalTapeSet *) palloc(sizeof(LogicalTapeSet)); + lts = palloc_object(LogicalTapeSet); lts->nBlocksAllocated = 0L; lts->nBlocksWritten = 0L; lts->nHoleBlocks = 0L; @@ -681,7 +681,7 @@ LogicalTapeCreate(LogicalTapeSet *lts) { /* * The only thing that currently prevents creating new tapes in leader is - * the fact that BufFiles opened using BufFileOpenShared() are read-only + * the fact that BufFiles opened using BufFileOpenFileSet() are read-only * by definition, but that could be changed if it seemed worthwhile. For * now, writing to the leader tape will raise a "Bad file descriptor" * error, so tuplesort must avoid writing to the leader tape altogether. @@ -700,7 +700,7 @@ ltsCreateTape(LogicalTapeSet *lts) /* * Create per-tape struct. Note we allocate the I/O buffer lazily. */ - lt = palloc(sizeof(LogicalTape)); + lt = palloc_object(LogicalTape); lt->tapeSet = lts; lt->writing = true; lt->frozen = false; diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c index 2f031c329094a..e77d857ff3f51 100644 --- a/src/backend/utils/sort/sharedtuplestore.c +++ b/src/backend/utils/sort/sharedtuplestore.c @@ -88,7 +88,6 @@ struct SharedTuplestoreAccessor /* State for writing. */ SharedTuplestoreChunk *write_chunk; /* Buffer for writing. */ BufFile *write_file; /* The current file to write to. */ - BlockNumber write_page; /* The next page to write to. */ char *write_pointer; /* Current write pointer within chunk. */ char *write_end; /* One past the end of the current chunk. */ }; @@ -161,7 +160,7 @@ sts_initialize(SharedTuplestore *sts, int participants, sts->participants[i].writing = false; } - accessor = palloc0(sizeof(SharedTuplestoreAccessor)); + accessor = palloc0_object(SharedTuplestoreAccessor); accessor->participant = my_participant_number; accessor->sts = sts; accessor->fileset = fileset; @@ -183,7 +182,7 @@ sts_attach(SharedTuplestore *sts, Assert(my_participant_number < sts->nparticipants); - accessor = palloc0(sizeof(SharedTuplestoreAccessor)); + accessor = palloc0_object(SharedTuplestoreAccessor); accessor->participant = my_participant_number; accessor->sts = sts; accessor->fileset = fileset; diff --git a/src/backend/utils/sort/sortsupport.c b/src/backend/utils/sort/sortsupport.c index e0f500b9aa29c..f582c6624f11a 100644 --- a/src/backend/utils/sort/sortsupport.c +++ b/src/backend/utils/sort/sortsupport.c @@ -57,7 +57,7 @@ comparison_shim(Datum x, Datum y, SortSupport ssup) if (extra->fcinfo.isnull) elog(ERROR, "function %u returned NULL", extra->flinfo.fn_oid); - return result; + return DatumGetInt32(result); } /* diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 65ab83fff8b26..88ae529e84313 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -111,11 +111,9 @@ #include "utils/tuplesort.h" /* - * Initial size of memtuples array. We're trying to select this size so that - * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of - * allocation might possibly be lowered. However, we don't consider array sizes - * less than 1024. - * + * Initial size of memtuples array. This must be more than + * ALLOCSET_SEPARATE_THRESHOLD; see comments in grow_memtuples(). Clamp at + * 1024 elements to avoid excessive reallocs. 
*/ #define INITIAL_MEMTUPSIZE Max(1024, \ ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) @@ -512,7 +510,6 @@ qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) return state->base.comparetup_tiebreak(a, b, state); } -#if SIZEOF_DATUM >= 8 /* Used if first key's comparator is ssup_datum_signed_cmp */ static pg_attribute_always_inline int qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) @@ -535,7 +532,6 @@ qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) return state->base.comparetup_tiebreak(a, b, state); } -#endif /* Used if first key's comparator is ssup_datum_int32_cmp */ static pg_attribute_always_inline int @@ -578,7 +574,6 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) #define ST_DEFINE #include "lib/sort_template.h" -#if SIZEOF_DATUM >= 8 #define ST_SORT qsort_tuple_signed #define ST_ELEMENT_TYPE SortTuple #define ST_COMPARE(a, b, state) qsort_tuple_signed_compare(a, b, state) @@ -587,7 +582,6 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) #define ST_SCOPE static #define ST_DEFINE #include "lib/sort_template.h" -#endif #define ST_SORT qsort_tuple_int32 #define ST_ELEMENT_TYPE SortTuple @@ -677,7 +671,7 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt) */ oldcontext = MemoryContextSwitchTo(maincontext); - state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + state = palloc0_object(Tuplesortstate); if (trace_sort) pg_rusage_init(&state->ru_start); @@ -696,10 +690,6 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt) state->base.sortcontext = sortcontext; state->base.maincontext = maincontext; - /* - * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; - * see comments in grow_memtuples(). - */ state->memtupsize = INITIAL_MEMTUPSIZE; state->memtuples = NULL; @@ -788,10 +778,6 @@ tuplesort_begin_batch(Tuplesortstate *state) state->memtupcount = 0; - /* - * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; - * see comments in grow_memtuples(). 
- */ state->growmemtuples = true; state->slabAllocatorUsed = false; if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) @@ -2692,7 +2678,6 @@ tuplesort_sort_memtuples(Tuplesortstate *state) state); return; } -#if SIZEOF_DATUM >= 8 else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp) { qsort_tuple_signed(state->memtuples, @@ -2700,7 +2685,6 @@ tuplesort_sort_memtuples(Tuplesortstate *state) state); return; } -#endif else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp) { qsort_tuple_int32(state->memtuples, @@ -3146,7 +3130,6 @@ ssup_datum_unsigned_cmp(Datum x, Datum y, SortSupport ssup) return 0; } -#if SIZEOF_DATUM >= 8 int ssup_datum_signed_cmp(Datum x, Datum y, SortSupport ssup) { @@ -3160,7 +3143,6 @@ ssup_datum_signed_cmp(Datum x, Datum y, SortSupport ssup) else return 0; } -#endif int ssup_datum_int32_cmp(Datum x, Datum y, SortSupport ssup) diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 5f70e8dddac57..a1f5c19ee9760 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -31,6 +31,7 @@ #include "utils/datum.h" #include "utils/guc.h" #include "utils/lsyscache.h" +#include "utils/rel.h" #include "utils/tuplesort.h" @@ -264,7 +265,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, Assert(indexRel->rd_rel->relam == BTREE_AM_OID); oldcontext = MemoryContextSwitchTo(base->maincontext); - arg = (TuplesortClusterArg *) palloc0(sizeof(TuplesortClusterArg)); + arg = palloc0_object(TuplesortClusterArg); if (trace_sort) elog(LOG, @@ -371,7 +372,7 @@ tuplesort_begin_index_btree(Relation heapRel, int i; oldcontext = MemoryContextSwitchTo(base->maincontext); - arg = (TuplesortIndexBTreeArg *) palloc(sizeof(TuplesortIndexBTreeArg)); + arg = palloc_object(TuplesortIndexBTreeArg); if (trace_sort) elog(LOG, @@ -452,7 +453,7 @@ tuplesort_begin_index_hash(Relation heapRel, TuplesortIndexHashArg *arg; oldcontext = MemoryContextSwitchTo(base->maincontext); - arg = (TuplesortIndexHashArg *) palloc(sizeof(TuplesortIndexHashArg)); + arg = palloc_object(TuplesortIndexHashArg); if (trace_sort) elog(LOG, @@ -501,7 +502,7 @@ tuplesort_begin_index_gist(Relation heapRel, int i; oldcontext = MemoryContextSwitchTo(base->maincontext); - arg = (TuplesortIndexBTreeArg *) palloc(sizeof(TuplesortIndexBTreeArg)); + arg = palloc_object(TuplesortIndexBTreeArg); if (trace_sort) elog(LOG, @@ -661,7 +662,7 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool typbyval; oldcontext = MemoryContextSwitchTo(base->maincontext); - arg = (TuplesortDatumArg *) palloc(sizeof(TuplesortDatumArg)); + arg = palloc_object(TuplesortDatumArg); if (trace_sort) elog(LOG, @@ -693,7 +694,7 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, base->tuples = !typbyval; /* Prepare SortSupport data */ - base->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + base->sortKeys = palloc0_object(SortSupportData); base->sortKeys->ssup_cxt = CurrentMemoryContext; base->sortKeys->ssup_collation = sortCollation; @@ -815,7 +816,7 @@ tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) */ void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, - ItemPointer self, const Datum *values, + const ItemPointerData *self, const Datum *values, const bool *isnull) { SortTuple stup; @@ -865,7 +866,7 @@ tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size) memcpy(&bstup->tuple, tuple, size); stup.tuple = bstup; 
- stup.datum1 = tuple->bt_blkno; + stup.datum1 = UInt32GetDatum(tuple->bt_blkno); stup.isnull1 = false; /* GetMemoryChunkSpace is not supported for bump contexts */ @@ -1131,7 +1132,6 @@ tuplesort_getgintuple(Tuplesortstate *state, Size *len, bool forward) * efficient, but only safe for callers that are prepared to have any * subsequent manipulation of the tuplesort's state invalidate slot contents. * For byval Datums, the value of the 'copy' parameter has no effect. - */ bool tuplesort_getdatum(Tuplesortstate *state, bool forward, bool copy, @@ -1836,7 +1836,7 @@ removeabbrev_index_brin(Tuplesortstate *state, SortTuple *stups, int count) BrinSortTuple *tuple; tuple = stups[i].tuple; - stups[i].datum1 = tuple->tuple.bt_blkno; + stups[i].datum1 = UInt32GetDatum(tuple->tuple.bt_blkno); } } @@ -1893,7 +1893,7 @@ readtup_index_brin(Tuplesortstate *state, SortTuple *stup, stup->tuple = tuple; /* set up first-column key value, which is block number */ - stup->datum1 = tuple->tuple.bt_blkno; + stup->datum1 = UInt32GetDatum(tuple->tuple.bt_blkno); } /* @@ -1953,7 +1953,7 @@ readtup_index_gin(Tuplesortstate *state, SortTuple *stup, LogicalTapeReadExact(tape, tuple, tuplen); if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */ LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); - stup->tuple = (void *) tuple; + stup->tuple = tuple; /* no abbreviations (FIXME maybe use attrnum for this?) */ stup->datum1 = (Datum) 0; diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index c9aecab8d66cb..9701b8133602b 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -94,7 +94,7 @@ typedef struct bool eof_reached; /* read has reached EOF */ int current; /* next array index to read */ int file; /* temp file# */ - off_t offset; /* byte offset in file */ + pgoff_t offset; /* byte offset in file */ } TSReadPointer; /* @@ -179,7 +179,7 @@ struct Tuplestorestate int readptrsize; /* allocated length of readptrs array */ int writepos_file; /* file# (valid if READFILE state) */ - off_t writepos_offset; /* offset (valid if READFILE state) */ + pgoff_t writepos_offset; /* offset (valid if READFILE state) */ }; #define COPYTUP(state,tup) ((*(state)->copytup) (state, tup)) @@ -257,7 +257,7 @@ tuplestore_begin_common(int eflags, bool interXact, int maxKBytes) { Tuplestorestate *state; - state = (Tuplestorestate *) palloc0(sizeof(Tuplestorestate)); + state = palloc0_object(Tuplestorestate); state->status = TSS_INMEM; state->eflags = eflags; @@ -1051,7 +1051,7 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, * Back up to fetch previously-returned tuple's ending length * word. If seek fails, assume we are at start of file. */ - if (BufFileSeek(state->myfile, 0, -(long) sizeof(unsigned int), + if (BufFileSeek(state->myfile, 0, -(pgoff_t) sizeof(unsigned int), SEEK_CUR) != 0) { /* even a failed backwards fetch gets you out of eof state */ @@ -1072,7 +1072,7 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, * Back up to get ending length word of tuple before it. */ if (BufFileSeek(state->myfile, 0, - -(long) (tuplen + 2 * sizeof(unsigned int)), + -(pgoff_t) (tuplen + 2 * sizeof(unsigned int)), SEEK_CUR) != 0) { /* @@ -1082,7 +1082,7 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, * what in-memory case does). 
*/ if (BufFileSeek(state->myfile, 0, - -(long) (tuplen + sizeof(unsigned int)), + -(pgoff_t) (tuplen + sizeof(unsigned int)), SEEK_CUR) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -1099,7 +1099,7 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, * length word of the tuple, so back up to that point. */ if (BufFileSeek(state->myfile, 0, - -(long) tuplen, + -(pgoff_t) tuplen, SEEK_CUR) != 0) ereport(ERROR, (errcode_for_file_access(), diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index ea35f30f49457..5af8326d5e8be 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -119,6 +119,7 @@ #include "storage/proc.h" #include "storage/procarray.h" #include "utils/builtins.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/resowner.h" #include "utils/snapmgr.h" @@ -271,12 +272,23 @@ Snapshot GetTransactionSnapshot(void) { /* - * This should not be called while doing logical decoding. Historic - * snapshots are only usable for catalog access, not for general-purpose - * queries. + * Return historic snapshot if doing logical decoding. + * + * Historic snapshots are only usable for catalog access, not for + * general-purpose queries. The caller is responsible for ensuring that + * the snapshot is used correctly! (PostgreSQL code never calls this + * during logical decoding, but extensions can do it.) */ if (HistoricSnapshotActive()) - elog(ERROR, "cannot take query snapshot during logical decoding"); + { + /* + * We'll never need a non-historic transaction snapshot in this + * (sub-)transaction, so there's no need to be careful to set one up + * for later calls to GetTransactionSnapshot(). + */ + Assert(!FirstSnapshotSet); + return HistoricSnapshot; + } /* First call in transaction? */ if (!FirstSnapshotSet) @@ -447,6 +459,7 @@ InvalidateCatalogSnapshot(void) pairingheap_remove(&RegisteredSnapshots, &CatalogSnapshot->ph_node); CatalogSnapshot = NULL; SnapshotResetXmin(); + INJECTION_POINT("invalidate-catalog-snapshot-end", NULL); } } @@ -1166,7 +1179,7 @@ ExportSnapshot(Snapshot snapshot) snapshot = CopySnapshot(snapshot); oldcxt = MemoryContextSwitchTo(TopTransactionContext); - esnap = (ExportedSnapshot *) palloc(sizeof(ExportedSnapshot)); + esnap = palloc_object(ExportedSnapshot); esnap->snapfile = pstrdup(path); esnap->snapshot = snapshot; exportedSnapshots = lappend(exportedSnapshots, esnap); @@ -1835,12 +1848,9 @@ RestoreSnapshot(char *start_address) /* * Install a restored snapshot as the transaction snapshot. - * - * The second argument is of type void * so that snapmgr.h need not include - * the declaration for PGPROC. */ void -RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc) +RestoreTransactionSnapshot(Snapshot snapshot, PGPROC *source_pgproc) { SetTransactionSnapshot(snapshot, NULL, InvalidPid, source_pgproc); } diff --git a/src/bin/initdb/Makefile b/src/bin/initdb/Makefile index 997e0a013e956..c0470efda92a3 100644 --- a/src/bin/initdb/Makefile +++ b/src/bin/initdb/Makefile @@ -20,7 +20,7 @@ include $(top_builddir)/src/Makefile.global # from libpq, else we have risks of version skew if we run with a libpq # shared library from a different PG version. Define # USE_PRIVATE_ENCODING_FUNCS to ensure that that happens. 
-override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(ICU_CFLAGS) $(CPPFLAGS) +override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(CPPFLAGS) $(ICU_CFLAGS) # We need libpq only because fe_utils does. LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) $(ICU_LIBS) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 62bbd08d9f658..92fe2f531f7a8 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -1580,9 +1580,6 @@ bootstrap_template1(void) bki_lines = replace_token(bki_lines, "ALIGNOF_POINTER", (sizeof(Pointer) == 4) ? "i" : "d"); - bki_lines = replace_token(bki_lines, "FLOAT8PASSBYVAL", - FLOAT8PASSBYVAL ? "true" : "false"); - bki_lines = replace_token(bki_lines, "POSTGRES", escape_quotes_bki(username)); diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 15dd10ce40a31..1e9543c2585db 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -76,7 +76,8 @@ 'checksums are enabled in control file'); command_ok([ 'initdb', '--sync-only', $datadir ], 'sync only'); -command_ok([ 'initdb', '--sync-only', '--no-sync-data-files', $datadir ], '--no-sync-data-files'); +command_ok([ 'initdb', '--sync-only', '--no-sync-data-files', $datadir ], + '--no-sync-data-files'); command_fails([ 'initdb', $datadir ], 'existing data directory'); if ($supports_syncfs) @@ -307,9 +308,9 @@ 'multiple --set options with different case'); my $conf = slurp_file("$tempdir/dataY/postgresql.conf"); -ok($conf !~ qr/^WORK_MEM = /m, "WORK_MEM should not be configured"); -ok($conf !~ qr/^Work_Mem = /m, "Work_Mem should not be configured"); -ok($conf =~ qr/^work_mem = 512/m, "work_mem should be in config"); +unlike($conf, qr/^WORK_MEM = /m, "WORK_MEM should not be configured"); +unlike($conf, qr/^Work_Mem = /m, "Work_Mem should not be configured"); +like($conf, qr/^work_mem = 512/m, "work_mem should be in config"); # Test the no-data-checksums flag my $datadir_nochecksums = "$tempdir/data_no_checksums"; diff --git a/src/bin/pg_amcheck/t/004_verify_heapam.pl b/src/bin/pg_amcheck/t/004_verify_heapam.pl index 2a3af2666f52a..72693660fb64b 100644 --- a/src/bin/pg_amcheck/t/004_verify_heapam.pl +++ b/src/bin/pg_amcheck/t/004_verify_heapam.pl @@ -529,7 +529,7 @@ sub header $tup->{t_infomask2} |= HEAP_NATTS_MASK; push @expected, - qr/${$header}number of attributes 2047 exceeds maximum expected for table 3/; + qr/${$header}number of attributes 2047 exceeds maximum 3 expected for table/; } elsif ($offnum == 10) { @@ -552,7 +552,7 @@ sub header $tup->{t_hoff} = 32; push @expected, - qr/${$header}number of attributes 67 exceeds maximum expected for table 3/; + qr/${$header}number of attributes 67 exceeds maximum 3 expected for table/; } elsif ($offnum == 12) { diff --git a/src/bin/pg_archivecleanup/pg_archivecleanup.c b/src/bin/pg_archivecleanup/pg_archivecleanup.c index c25348bcb85dd..ab686b4748ca4 100644 --- a/src/bin/pg_archivecleanup/pg_archivecleanup.c +++ b/src/bin/pg_archivecleanup/pg_archivecleanup.c @@ -375,6 +375,10 @@ main(int argc, char **argv) exit(2); } + if (dryrun) + pg_log_info("Executing in dry-run mode.\n" + "No files will be removed."); + /* * Check archive exists and other initialization if required. 
*/ diff --git a/src/bin/pg_basebackup/astreamer_inject.c b/src/bin/pg_basebackup/astreamer_inject.c index 15334e458ad1e..e77de72f7ac14 100644 --- a/src/bin/pg_basebackup/astreamer_inject.c +++ b/src/bin/pg_basebackup/astreamer_inject.c @@ -68,7 +68,7 @@ astreamer_recovery_injector_new(astreamer *next, { astreamer_recovery_injector *streamer; - streamer = palloc0(sizeof(astreamer_recovery_injector)); + streamer = palloc0_object(astreamer_recovery_injector); *((const astreamer_ops **) &streamer->base.bbs_ops) = &astreamer_recovery_injector_ops; streamer->base.bbs_next = next; diff --git a/src/bin/pg_basebackup/meson.build b/src/bin/pg_basebackup/meson.build index 8a1c96b4f5c84..3a7fc10eab02f 100644 --- a/src/bin/pg_basebackup/meson.build +++ b/src/bin/pg_basebackup/meson.build @@ -93,9 +93,9 @@ tests += { 'sd': meson.current_source_dir(), 'bd': meson.current_build_dir(), 'tap': { - 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.path() : '', - 'TAR': tar.found() ? tar.path() : '', - 'LZ4': program_lz4.found() ? program_lz4.path() : '', + 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.full_path() : '', + 'TAR': tar.found() ? tar.full_path() : '', + 'LZ4': program_lz4.found() ? program_lz4.full_path() : '', }, 'tests': [ 't/010_pg_basebackup.pl', diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index eb7354200bcee..0a3ca4315de1e 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -35,6 +35,7 @@ #include "fe_utils/option_utils.h" #include "fe_utils/recovery_gen.h" #include "getopt_long.h" +#include "libpq/protocol.h" #include "receivelog.h" #include "streamutil.h" @@ -487,7 +488,7 @@ reached_end_position(XLogRecPtr segendpos, uint32 timeline, if (r < 0) pg_fatal("could not read from ready pipe: %m"); - if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) + if (sscanf(xlogend, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse write-ahead log location \"%s\"", xlogend); xlogendptr = ((uint64) hi) << 32 | lo; @@ -629,7 +630,7 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier, param->wal_compress_level = wal_compress_level; /* Convert the starting position */ - if (sscanf(startpos, "%X/%X", &hi, &lo) != 2) + if (sscanf(startpos, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse write-ahead log location \"%s\"", startpos); param->startptr = ((uint64) hi) << 32 | lo; @@ -1338,7 +1339,7 @@ ReceiveArchiveStreamChunk(size_t r, char *copybuf, void *callback_data) /* Each CopyData message begins with a type byte. */ switch (GetCopyDataByte(r, copybuf, &cursor)) { - case 'n': + case PqBackupMsg_NewArchive: { /* New archive. */ char *archive_name; @@ -1410,7 +1411,7 @@ ReceiveArchiveStreamChunk(size_t r, char *copybuf, void *callback_data) break; } - case 'd': + case PqMsg_CopyData: { /* Archive or manifest data. */ if (state->manifest_buffer != NULL) @@ -1446,7 +1447,7 @@ ReceiveArchiveStreamChunk(size_t r, char *copybuf, void *callback_data) break; } - case 'p': + case PqBackupMsg_ProgressReport: { /* * Progress report. @@ -1465,7 +1466,7 @@ ReceiveArchiveStreamChunk(size_t r, char *copybuf, void *callback_data) break; } - case 'm': + case PqBackupMsg_Manifest: { /* * Manifest data will be sent next. This message is not @@ -2255,7 +2256,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, * value directly in the variable, and then set the flag that says * it's there. 
*/ - if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) + if (sscanf(xlogend, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse write-ahead log location \"%s\"", xlogend); xlogendptr = ((uint64) hi) << 32 | lo; diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c index f65acc7cb1141..dab4dfb3a52d6 100644 --- a/src/bin/pg_basebackup/pg_createsubscriber.c +++ b/src/bin/pg_basebackup/pg_createsubscriber.c @@ -23,9 +23,11 @@ #include "common/logging.h" #include "common/pg_prng.h" #include "common/restricted_token.h" +#include "datatype/timestamp.h" #include "fe_utils/recovery_gen.h" #include "fe_utils/simple_list.h" #include "fe_utils/string_utils.h" +#include "fe_utils/version.h" #include "getopt_long.h" #define DEFAULT_SUB_PORT "50432" @@ -46,7 +48,7 @@ struct CreateSubscriberOptions SimpleStringList replslot_names; /* list of replication slot names */ int recovery_timeout; /* stop recovery after this time */ bool all_dbs; /* all option */ - SimpleStringList objecttypes_to_remove; /* list of object types to remove */ + SimpleStringList objecttypes_to_clean; /* list of object types to cleanup */ }; /* per-database publication/subscription info */ @@ -71,12 +73,12 @@ struct LogicalRepInfos { struct LogicalRepInfo *dbinfo; bool two_phase; /* enable-two-phase option */ - bits32 objecttypes_to_remove; /* flags indicating which object types - * to remove on subscriber */ + bits32 objecttypes_to_clean; /* flags indicating which object types + * to clean up on subscriber */ }; static void cleanup_objects_atexit(void); -static void usage(); +static void usage(void); static char *get_base_conninfo(const char *conninfo, char **dbname); static char *get_sub_conninfo(const struct CreateSubscriberOptions *opt); static char *get_exec_path(const char *argv0, const char *progname); @@ -114,6 +116,7 @@ static void stop_standby_server(const char *datadir); static void wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions *opt); static void create_publication(PGconn *conn, struct LogicalRepInfo *dbinfo); +static bool find_publication(PGconn *conn, const char *pubname, const char *dbname); static void drop_publication(PGconn *conn, const char *pubname, const char *dbname, bool *made_publication); static void check_and_drop_publications(PGconn *conn, struct LogicalRepInfo *dbinfo); @@ -123,12 +126,11 @@ static void set_replication_progress(PGconn *conn, const struct LogicalRepInfo * static void enable_subscription(PGconn *conn, const struct LogicalRepInfo *dbinfo); static void check_and_drop_existing_subscriptions(PGconn *conn, const struct LogicalRepInfo *dbinfo); -static void drop_existing_subscriptions(PGconn *conn, const char *subname, - const char *dbname); +static void drop_existing_subscription(PGconn *conn, const char *subname, + const char *dbname); static void get_publisher_databases(struct CreateSubscriberOptions *opt, bool dbnamespecified); -#define USEC_PER_SEC 1000000 #define WAIT_INTERVAL 1 /* 1 second */ static const char *progname; @@ -155,12 +157,6 @@ static char *subscriber_dir = NULL; static bool recovery_ended = false; static bool standby_running = false; -enum WaitPMResult -{ - POSTMASTER_READY, - POSTMASTER_STILL_STARTING -}; - /* * Cleanup objects that were created by pg_createsubscriber if there is an @@ -247,19 +243,19 @@ usage(void) printf(_(" %s [OPTION]...\n"), progname); printf(_("\nOptions:\n")); printf(_(" -a, --all create subscriptions for all databases except template\n" - " databases or databases that 
don't allow connections\n")); + " databases and databases that don't allow connections\n")); printf(_(" -d, --database=DBNAME database in which to create a subscription\n")); printf(_(" -D, --pgdata=DATADIR location for the subscriber data directory\n")); printf(_(" -n, --dry-run dry run, just show what would be done\n")); printf(_(" -p, --subscriber-port=PORT subscriber port number (default %s)\n"), DEFAULT_SUB_PORT); printf(_(" -P, --publisher-server=CONNSTR publisher connection string\n")); - printf(_(" -R, --remove=OBJECTTYPE remove all objects of the specified type from specified\n" - " databases on the subscriber; accepts: publications\n")); printf(_(" -s, --socketdir=DIR socket directory to use (default current dir.)\n")); printf(_(" -t, --recovery-timeout=SECS seconds to wait for recovery to end\n")); printf(_(" -T, --enable-two-phase enable two-phase commit for all subscriptions\n")); printf(_(" -U, --subscriber-username=NAME user name for subscriber connection\n")); printf(_(" -v, --verbose output verbose messages\n")); + printf(_(" --clean=OBJECTTYPE drop all objects of the specified type from specified\n" + " databases on the subscriber; accepts: \"%s\"\n"), "publications"); printf(_(" --config-file=FILENAME use specified main server configuration\n" " file when running target cluster\n")); printf(_(" --publication=NAME publication name\n")); @@ -407,7 +403,8 @@ static void check_data_directory(const char *datadir) { struct stat statbuf; - char versionfile[MAXPGPATH]; + uint32 major_version; + char *version_str; pg_log_info("checking if directory \"%s\" is a cluster data directory", datadir); @@ -420,11 +417,18 @@ check_data_directory(const char *datadir) pg_fatal("could not access directory \"%s\": %m", datadir); } - snprintf(versionfile, MAXPGPATH, "%s/PG_VERSION", datadir); - if (stat(versionfile, &statbuf) != 0 && errno == ENOENT) + /* + * Retrieve the contents of this cluster's PG_VERSION. We require + * compatibility with the same major version as the one this tool is + * compiled with. 
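+ * (get_pg_version() is the new fe_utils/version.h helper; judging by its callers it returns the full version number read from PG_VERSION, e.g. 180000, and GET_PG_MAJORVERSION_NUM() reduces that to the major version, e.g. 18.)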
+ */ + major_version = GET_PG_MAJORVERSION_NUM(get_pg_version(datadir, &version_str)); + if (major_version != PG_MAJORVERSION_NUM) { - pg_fatal("directory \"%s\" is not a database cluster directory", - datadir); + pg_log_error("data directory is of wrong version"); + pg_log_error_detail("File \"%s\" contains \"%s\", which is not compatible with this program's version \"%s\".", + "PG_VERSION", version_str, PG_MAJORVERSION); + exit(1); } } @@ -679,13 +683,20 @@ modify_subscriber_sysid(const struct CreateSubscriberOptions *opt) cf->system_identifier |= ((uint64) tv.tv_usec) << 12; cf->system_identifier |= getpid() & 0xFFF; - if (!dry_run) + if (dry_run) + pg_log_info("dry-run: would set system identifier to %" PRIu64 " on subscriber", + cf->system_identifier); + else + { update_controlfile(subscriber_dir, cf, true); + pg_log_info("system identifier is %" PRIu64 " on subscriber", + cf->system_identifier); + } - pg_log_info("system identifier is %" PRIu64 " on subscriber", - cf->system_identifier); - - pg_log_info("running pg_resetwal on the subscriber"); + if (dry_run) + pg_log_info("dry-run: would run pg_resetwal on the subscriber"); + else + pg_log_info("running pg_resetwal on the subscriber"); cmd_str = psprintf("\"%s\" -D \"%s\" > \"%s\"", pg_resetwal_path, subscriber_dir, DEVNULL); @@ -697,9 +708,9 @@ modify_subscriber_sysid(const struct CreateSubscriberOptions *opt) int rc = system(cmd_str); if (rc == 0) - pg_log_info("subscriber successfully changed the system identifier"); + pg_log_info("successfully reset WAL on the subscriber"); else - pg_fatal("could not change system identifier of subscriber: %s", wait_result_to_str(rc)); + pg_fatal("could not reset WAL on the subscriber: %s", wait_result_to_str(rc)); } pg_free(cf); @@ -753,6 +764,39 @@ generate_object_name(PGconn *conn) return objname; } +/* + * Does the publication exist in the specified database? + */ +static bool +find_publication(PGconn *conn, const char *pubname, const char *dbname) +{ + PQExpBuffer str = createPQExpBuffer(); + PGresult *res; + bool found = false; + char *pubname_esc = PQescapeLiteral(conn, pubname, strlen(pubname)); + + appendPQExpBuffer(str, + "SELECT 1 FROM pg_catalog.pg_publication " + "WHERE pubname = %s", + pubname_esc); + res = PQexec(conn, str->data); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + pg_log_error("could not find publication \"%s\" in database \"%s\": %s", + pubname, dbname, PQerrorMessage(conn)); + disconnect_database(conn, true); + } + + if (PQntuples(res) == 1) + found = true; + + PQclear(res); + PQfreemem(pubname_esc); + destroyPQExpBuffer(str); + + return found; +} + /* * Create the publications and replication slots in preparation for logical * replication. Returns the LSN from latest replication slot. It will be the @@ -789,22 +833,31 @@ setup_publisher(struct LogicalRepInfo *dbinfo) if (num_replslots == 0) dbinfo[i].replslotname = pg_strdup(dbinfo[i].subname); - /* - * Create publication on publisher. This step should be executed - * *before* promoting the subscriber to avoid any transactions between - * consistent LSN and the new publication rows (such transactions - * wouldn't see the new publication rows resulting in an error). - */ - create_publication(conn, &dbinfo[i]); + if (find_publication(conn, dbinfo[i].pubname, dbinfo[i].dbname)) + { + /* Reuse existing publication on publisher. */ + pg_log_info("using existing publication \"%s\" in database \"%s\"", + dbinfo[i].pubname, dbinfo[i].dbname); + /* Don't remove pre-existing publication if an error occurs. 
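+ * With made_publication left false, the atexit cleanup path (which only drops objects this tool itself created) will leave the user's publication in place.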
*/ + dbinfo[i].made_publication = false; + } + else + { + /* + * Create publication on publisher. This step should be executed + * *before* promoting the subscriber to avoid any transactions + * between consistent LSN and the new publication rows (such + * transactions wouldn't see the new publication rows resulting in + * an error). + */ + create_publication(conn, &dbinfo[i]); + } /* Create replication slot on publisher */ if (lsn) pg_free(lsn); lsn = create_logical_replication_slot(conn, &dbinfo[i]); - if (lsn != NULL || dry_run) - pg_log_info("create replication slot \"%s\" on publisher", - dbinfo[i].replslotname); - else + if (lsn == NULL && !dry_run) exit(1); /* @@ -900,7 +953,7 @@ check_publisher(const struct LogicalRepInfo *dbinfo) * Since these parameters are not a requirement for physical replication, * we should check it to make sure it won't fail. * - * - wal_level = logical + * - wal_level >= replica * - max_replication_slots >= current + number of dbs to be converted * - max_wal_senders >= current + number of dbs to be converted * - max_slot_wal_keep_size = -1 (to prevent deletion of required WAL files) @@ -944,9 +997,9 @@ check_publisher(const struct LogicalRepInfo *dbinfo) disconnect_database(conn, false); - if (strcmp(wal_level, "logical") != 0) + if (strcmp(wal_level, "minimal") == 0) { - pg_log_error("publisher requires \"wal_level\" >= \"logical\""); + pg_log_error("publisher requires \"wal_level\" >= \"replica\""); failed = true; } @@ -973,13 +1026,13 @@ check_publisher(const struct LogicalRepInfo *dbinfo) pg_log_warning("two_phase option will not be enabled for replication slots"); pg_log_warning_detail("Subscriptions will be created with the two_phase option disabled. " "Prepared transactions will be replicated at COMMIT PREPARED."); - pg_log_warning_hint("You can use --enable-two-phase switch to enable two_phase."); + pg_log_warning_hint("You can use the command-line option --enable-two-phase to enable two_phase."); } /* - * Validate 'max_slot_wal_keep_size'. If this parameter is set to a - * non-default value, it may cause replication failures due to required - * WAL files being prematurely removed. + * In dry-run mode, validate 'max_slot_wal_keep_size'. If this parameter + * is set to a non-default value, it may cause replication failures due to + * required WAL files being prematurely removed. */ if (dry_run && (strcmp(max_slot_wal_keep_size, "-1") != 0)) { @@ -1107,7 +1160,7 @@ check_subscriber(const struct LogicalRepInfo *dbinfo) * node. 
*/ static void -drop_existing_subscriptions(PGconn *conn, const char *subname, const char *dbname) +drop_existing_subscription(PGconn *conn, const char *subname, const char *dbname) { PQExpBuffer query = createPQExpBuffer(); PGresult *res; @@ -1124,11 +1177,14 @@ drop_existing_subscriptions(PGconn *conn, const char *subname, const char *dbnam subname); appendPQExpBuffer(query, " DROP SUBSCRIPTION %s;", subname); - pg_log_info("dropping subscription \"%s\" in database \"%s\"", - subname, dbname); - - if (!dry_run) + if (dry_run) + pg_log_info("dry-run: would drop subscription \"%s\" in database \"%s\"", + subname, dbname); + else { + pg_log_info("dropping subscription \"%s\" in database \"%s\"", + subname, dbname); + res = PQexec(conn, query->data); if (PQresultStatus(res) != PGRES_COMMAND_OK) @@ -1174,8 +1230,8 @@ check_and_drop_existing_subscriptions(PGconn *conn, } for (int i = 0; i < PQntuples(res); i++) - drop_existing_subscriptions(conn, PQgetvalue(res, i, 0), - dbinfo->dbname); + drop_existing_subscription(conn, PQgetvalue(res, i, 0), + dbinfo->dbname); PQclear(res); destroyPQExpBuffer(query); @@ -1250,8 +1306,17 @@ setup_recovery(const struct LogicalRepInfo *dbinfo, const char *datadir, const c appendPQExpBufferStr(recoveryconfcontents, "recovery_target = ''\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_timeline = 'latest'\n"); + + /* + * Set recovery_target_inclusive = false to avoid reapplying the + * transaction committed at 'lsn' after subscription is enabled. This is + * because the provided 'lsn' is also used as the replication start point + * for the subscription. So, the server can send the transaction committed + * at that 'lsn' after replication is started which can lead to applying + * the same transaction twice if we keep recovery_target_inclusive = true. 
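+ * For example, suppose the consistent point is 0/3000150 and a transaction commits exactly at that LSN: with recovery_target_inclusive = true, recovery would replay that commit and the subscription, starting from the same LSN, could apply it again; with false, recovery stops just before it and only the subscription applies it. (The LSN value here is purely illustrative.)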
+ */ appendPQExpBufferStr(recoveryconfcontents, - "recovery_target_inclusive = true\n"); + "recovery_target_inclusive = false\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_action = promote\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_name = ''\n"); @@ -1260,9 +1325,9 @@ setup_recovery(const struct LogicalRepInfo *dbinfo, const char *datadir, const c if (dry_run) { - appendPQExpBufferStr(recoveryconfcontents, "# dry run mode"); + appendPQExpBufferStr(recoveryconfcontents, "# dry run mode\n"); appendPQExpBuffer(recoveryconfcontents, - "recovery_target_lsn = '%X/%X'\n", + "recovery_target_lsn = '%X/%08X'\n", LSN_FORMAT_ARGS((XLogRecPtr) InvalidXLogRecPtr)); } else @@ -1366,8 +1431,12 @@ create_logical_replication_slot(PGconn *conn, struct LogicalRepInfo *dbinfo) Assert(conn != NULL); - pg_log_info("creating the replication slot \"%s\" in database \"%s\"", - slot_name, dbinfo->dbname); + if (dry_run) + pg_log_info("dry-run: would create the replication slot \"%s\" in database \"%s\" on publisher", + slot_name, dbinfo->dbname); + else + pg_log_info("creating the replication slot \"%s\" in database \"%s\" on publisher", + slot_name, dbinfo->dbname); slot_name_esc = PQescapeLiteral(conn, slot_name, strlen(slot_name)); @@ -1415,8 +1484,12 @@ drop_replication_slot(PGconn *conn, struct LogicalRepInfo *dbinfo, Assert(conn != NULL); - pg_log_info("dropping the replication slot \"%s\" in database \"%s\"", - slot_name, dbinfo->dbname); + if (dry_run) + pg_log_info("dry-run: would drop the replication slot \"%s\" in database \"%s\"", + slot_name, dbinfo->dbname); + else + pg_log_info("dropping the replication slot \"%s\" in database \"%s\"", + slot_name, dbinfo->dbname); slot_name_esc = PQescapeLiteral(conn, slot_name, strlen(slot_name)); @@ -1551,7 +1624,7 @@ static void wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions *opt) { PGconn *conn; - int status = POSTMASTER_STILL_STARTING; + bool ready = false; int timer = 0; pg_log_info("waiting for the target server to reach the consistent state"); @@ -1560,15 +1633,10 @@ wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions for (;;) { - bool in_recovery = server_is_in_recovery(conn); - - /* - * Does the recovery process finish? In dry run mode, there is no - * recovery mode. Bail out as the recovery process has ended. - */ - if (!in_recovery || dry_run) + /* Did the recovery process finish? We're done if so. 
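+ * (In dry-run mode the target server was never put into recovery, so it is treated as done immediately.)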
*/ + if (dry_run || !server_is_in_recovery(conn)) { - status = POSTMASTER_READY; + ready = true; recovery_ended = true; break; } @@ -1582,14 +1650,13 @@ wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions } /* Keep waiting */ - pg_usleep(WAIT_INTERVAL * USEC_PER_SEC); - + pg_usleep(WAIT_INTERVAL * USECS_PER_SEC); timer += WAIT_INTERVAL; } disconnect_database(conn, false); - if (status == POSTMASTER_STILL_STARTING) + if (!ready) pg_fatal("server did not end recovery"); pg_log_info("target server reached the consistent state"); @@ -1642,8 +1709,12 @@ create_publication(PGconn *conn, struct LogicalRepInfo *dbinfo) PQclear(res); resetPQExpBuffer(str); - pg_log_info("creating publication \"%s\" in database \"%s\"", - dbinfo->pubname, dbinfo->dbname); + if (dry_run) + pg_log_info("dry-run: would create publication \"%s\" in database \"%s\"", + dbinfo->pubname, dbinfo->dbname); + else + pg_log_info("creating publication \"%s\" in database \"%s\"", + dbinfo->pubname, dbinfo->dbname); appendPQExpBuffer(str, "CREATE PUBLICATION %s FOR ALL TABLES", ipubname_esc); @@ -1685,8 +1756,12 @@ drop_publication(PGconn *conn, const char *pubname, const char *dbname, pubname_esc = PQescapeIdentifier(conn, pubname, strlen(pubname)); - pg_log_info("dropping publication \"%s\" in database \"%s\"", - pubname, dbname); + if (dry_run) + pg_log_info("dry-run: would drop publication \"%s\" in database \"%s\"", + pubname, dbname); + else + pg_log_info("dropping publication \"%s\" in database \"%s\"", + pubname, dbname); appendPQExpBuffer(str, "DROP PUBLICATION %s", pubname_esc); @@ -1720,17 +1795,16 @@ drop_publication(PGconn *conn, const char *pubname, const char *dbname, /* * Retrieve and drop the publications. * - * Since the publications were created before the consistent LSN, they - * remain on the subscriber even after the physical replica is - * promoted. Remove these publications from the subscriber because - * they have no use. Additionally, if requested, drop all pre-existing - * publications. + * Publications copied during physical replication remain on the subscriber + * after promotion. If --clean=publications is specified, drop all existing + * publications in the subscriber database. Otherwise, only drop publications + * that were created by pg_createsubscriber during this operation. */ static void check_and_drop_publications(PGconn *conn, struct LogicalRepInfo *dbinfo) { PGresult *res; - bool drop_all_pubs = dbinfos.objecttypes_to_remove & OBJECTTYPE_PUBLICATIONS; + bool drop_all_pubs = dbinfos.objecttypes_to_clean & OBJECTTYPE_PUBLICATIONS; Assert(conn != NULL); @@ -1756,14 +1830,24 @@ check_and_drop_publications(PGconn *conn, struct LogicalRepInfo *dbinfo) PQclear(res); } - - /* - * In dry-run mode, we don't create publications, but we still try to drop - * those to provide necessary information to the user. 
- */ - if (!drop_all_pubs || dry_run) - drop_publication(conn, dbinfo->pubname, dbinfo->dbname, - &dbinfo->made_publication); + else + { + /* Drop publication only if it was created by this tool */ + if (dbinfo->made_publication) + { + drop_publication(conn, dbinfo->pubname, dbinfo->dbname, + &dbinfo->made_publication); + } + else + { + if (dry_run) + pg_log_info("dry-run: would preserve existing publication \"%s\" in database \"%s\"", + dbinfo->pubname, dbinfo->dbname); + else + pg_log_info("preserving existing publication \"%s\" in database \"%s\"", + dbinfo->pubname, dbinfo->dbname); + } + } } /* @@ -1794,8 +1878,12 @@ create_subscription(PGconn *conn, const struct LogicalRepInfo *dbinfo) pubconninfo_esc = PQescapeLiteral(conn, dbinfo->pubconninfo, strlen(dbinfo->pubconninfo)); replslotname_esc = PQescapeLiteral(conn, dbinfo->replslotname, strlen(dbinfo->replslotname)); - pg_log_info("creating subscription \"%s\" in database \"%s\"", - dbinfo->subname, dbinfo->dbname); + if (dry_run) + pg_log_info("dry-run: would create subscription \"%s\" in database \"%s\"", + dbinfo->subname, dbinfo->dbname); + else + pg_log_info("creating subscription \"%s\" in database \"%s\"", + dbinfo->subname, dbinfo->dbname); appendPQExpBuffer(str, "CREATE SUBSCRIPTION %s CONNECTION %s PUBLICATION %s " @@ -1876,7 +1964,7 @@ set_replication_progress(PGconn *conn, const struct LogicalRepInfo *dbinfo, cons if (dry_run) { suboid = InvalidOid; - lsnstr = psprintf("%X/%X", LSN_FORMAT_ARGS((XLogRecPtr) InvalidXLogRecPtr)); + lsnstr = psprintf("%X/%08X", LSN_FORMAT_ARGS((XLogRecPtr) InvalidXLogRecPtr)); } else { @@ -1892,8 +1980,12 @@ set_replication_progress(PGconn *conn, const struct LogicalRepInfo *dbinfo, cons */ originname = psprintf("pg_%u", suboid); - pg_log_info("setting the replication progress (node name \"%s\", LSN %s) in database \"%s\"", - originname, lsnstr, dbinfo->dbname); + if (dry_run) + pg_log_info("dry-run: would set the replication progress (node name \"%s\", LSN %s) in database \"%s\"", + originname, lsnstr, dbinfo->dbname); + else + pg_log_info("setting the replication progress (node name \"%s\", LSN %s) in database \"%s\"", + originname, lsnstr, dbinfo->dbname); resetPQExpBuffer(str); appendPQExpBuffer(str, @@ -1938,8 +2030,12 @@ enable_subscription(PGconn *conn, const struct LogicalRepInfo *dbinfo) subname = PQescapeIdentifier(conn, dbinfo->subname, strlen(dbinfo->subname)); - pg_log_info("enabling subscription \"%s\" in database \"%s\"", - dbinfo->subname, dbinfo->dbname); + if (dry_run) + pg_log_info("dry-run: would enable subscription \"%s\" in database \"%s\"", + dbinfo->subname, dbinfo->dbname); + else + pg_log_info("enabling subscription \"%s\" in database \"%s\"", + dbinfo->subname, dbinfo->dbname); appendPQExpBuffer(str, "ALTER SUBSCRIPTION %s ENABLE", subname); @@ -2026,7 +2122,6 @@ main(int argc, char **argv) {"dry-run", no_argument, NULL, 'n'}, {"subscriber-port", required_argument, NULL, 'p'}, {"publisher-server", required_argument, NULL, 'P'}, - {"remove", required_argument, NULL, 'R'}, {"socketdir", required_argument, NULL, 's'}, {"recovery-timeout", required_argument, NULL, 't'}, {"enable-two-phase", no_argument, NULL, 'T'}, @@ -2038,6 +2133,7 @@ main(int argc, char **argv) {"publication", required_argument, NULL, 2}, {"replication-slot", required_argument, NULL, 3}, {"subscription", required_argument, NULL, 4}, + {"clean", required_argument, NULL, 5}, {NULL, 0, NULL, 0} }; @@ -2109,7 +2205,7 @@ main(int argc, char **argv) get_restricted_token(); - while ((c = getopt_long(argc, 
argv, "ad:D:np:P:R:s:t:TU:v", + while ((c = getopt_long(argc, argv, "ad:D:np:P:s:t:TU:v", long_options, &option_index)) != -1) { switch (c) @@ -2139,12 +2235,6 @@ main(int argc, char **argv) case 'P': opt.pub_conninfo_str = pg_strdup(optarg); break; - case 'R': - if (!simple_string_list_member(&opt.objecttypes_to_remove, optarg)) - simple_string_list_append(&opt.objecttypes_to_remove, optarg); - else - pg_fatal("object type \"%s\" is specified more than once for -R/--remove", optarg); - break; case 's': opt.socket_dir = pg_strdup(optarg); canonicalize_path(opt.socket_dir); @@ -2191,6 +2281,12 @@ main(int argc, char **argv) else pg_fatal("subscription \"%s\" specified more than once for --subscription", optarg); break; + case 5: + if (!simple_string_list_member(&opt.objecttypes_to_clean, optarg)) + simple_string_list_append(&opt.objecttypes_to_clean, optarg); + else + pg_fatal("object type \"%s\" specified more than once for --clean", optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -2214,7 +2310,8 @@ main(int argc, char **argv) if (bad_switch) { - pg_log_error("%s cannot be used with -a/--all", bad_switch); + pg_log_error("options %s and %s cannot be used together", + bad_switch, "-a/--all"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } @@ -2264,6 +2361,11 @@ main(int argc, char **argv) pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } + + if (dry_run) + pg_log_info("Executing in dry-run mode.\n" + "The target directory will not be modified."); + pg_log_info("validating publisher connection string"); pub_base_conninfo = get_base_conninfo(opt.pub_conninfo_str, &dbname_conninfo); @@ -2334,14 +2436,15 @@ main(int argc, char **argv) } /* Verify the object types specified for removal from the subscriber */ - for (SimpleStringListCell *cell = opt.objecttypes_to_remove.head; cell; cell = cell->next) + for (SimpleStringListCell *cell = opt.objecttypes_to_clean.head; cell; cell = cell->next) { if (pg_strcasecmp(cell->val, "publications") == 0) - dbinfos.objecttypes_to_remove |= OBJECTTYPE_PUBLICATIONS; + dbinfos.objecttypes_to_clean |= OBJECTTYPE_PUBLICATIONS; else { - pg_log_error("invalid object type \"%s\" specified for -R/--remove", cell->val); - pg_log_error_hint("The valid option is: \"publications\""); + pg_log_error("invalid object type \"%s\" specified for %s", + cell->val, "--clean"); + pg_log_error_hint("The valid value is: \"%s\"", "publications"); exit(1); } } diff --git a/src/bin/pg_basebackup/pg_receivewal.c b/src/bin/pg_basebackup/pg_receivewal.c index e816cf58101fb..46e553dce4b1d 100644 --- a/src/bin/pg_basebackup/pg_receivewal.c +++ b/src/bin/pg_basebackup/pg_receivewal.c @@ -188,14 +188,14 @@ stop_streaming(XLogRecPtr xlogpos, uint32 timeline, bool segment_finished) /* we assume that we get called once at the end of each segment */ if (verbose && segment_finished) - pg_log_info("finished segment at %X/%X (timeline %u)", + pg_log_info("finished segment at %X/%08X (timeline %u)", LSN_FORMAT_ARGS(xlogpos), timeline); - if (!XLogRecPtrIsInvalid(endpos) && endpos < xlogpos) + if (XLogRecPtrIsValid(endpos) && endpos < xlogpos) { if (verbose) - pg_log_info("stopped log streaming at %X/%X (timeline %u)", + pg_log_info("stopped log streaming at %X/%08X (timeline %u)", LSN_FORMAT_ARGS(xlogpos), timeline); time_to_stop = true; @@ -211,7 +211,7 @@ stop_streaming(XLogRecPtr xlogpos, uint32 timeline, bool segment_finished) * 
timeline, but it's close enough for reporting purposes. */ if (verbose && prevtimeline != 0 && prevtimeline != timeline) - pg_log_info("switched to timeline %u at %X/%X", + pg_log_info("switched to timeline %u at %X/%08X", timeline, LSN_FORMAT_ARGS(prevpos)); @@ -535,7 +535,7 @@ StreamLog(void) * Figure out where to start streaming. First scan the local directory. */ stream.startpos = FindStreamingStart(&stream.timeline); - if (stream.startpos == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(stream.startpos)) { /* * Try to get the starting point from the slot if any. This is @@ -556,14 +556,14 @@ StreamLog(void) * If the starting point is still not known, use the current WAL * flush value as a last resort. */ - if (stream.startpos == InvalidXLogRecPtr) + if (!XLogRecPtrIsValid(stream.startpos)) { stream.startpos = serverpos; stream.timeline = servertli; } } - Assert(stream.startpos != InvalidXLogRecPtr && + Assert(XLogRecPtrIsValid(stream.startpos) && stream.timeline != 0); /* @@ -575,7 +575,7 @@ StreamLog(void) * Start the replication */ if (verbose) - pg_log_info("starting log streaming at %X/%X (timeline %u)", + pg_log_info("starting log streaming at %X/%08X (timeline %u)", LSN_FORMAT_ARGS(stream.startpos), stream.timeline); @@ -689,7 +689,7 @@ main(int argc, char **argv) basedir = pg_strdup(optarg); break; case 'E': - if (sscanf(optarg, "%X/%X", &hi, &lo) != 2) + if (sscanf(optarg, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse end position \"%s\"", optarg); endpos = ((uint64) hi) << 32 | lo; break; diff --git a/src/bin/pg_basebackup/pg_recvlogical.c b/src/bin/pg_basebackup/pg_recvlogical.c index e6810efe5f0d7..14ad15046782e 100644 --- a/src/bin/pg_basebackup/pg_recvlogical.c +++ b/src/bin/pg_basebackup/pg_recvlogical.c @@ -24,6 +24,7 @@ #include "getopt_long.h" #include "libpq-fe.h" #include "libpq/pqsignal.h" +#include "libpq/protocol.h" #include "pqexpbuffer.h" #include "streamutil.h" @@ -41,8 +42,8 @@ typedef enum /* Global Options */ static char *outfile = NULL; static int verbose = 0; -static bool two_phase = false; -static bool failover = false; +static bool two_phase = false; /* enable-two-phase option */ +static bool failover = false; /* enable-failover option */ static int noloop = 0; static int standby_message_timeout = 10 * 1000; /* 10 sec = default */ static int fsync_interval = 10 * 1000; /* 10 sec = default */ @@ -89,9 +90,9 @@ usage(void) printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n")); printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n")); printf(_("\nOptions:\n")); + printf(_(" --enable-failover enable replication slot synchronization to standby servers when\n" + " creating a replication slot\n")); printf(_(" -E, --endpos=LSN exit after receiving the specified LSN\n")); - printf(_(" --failover enable replication slot synchronization to standby servers when\n" - " creating a slot\n")); printf(_(" -f, --file=FILE receive log into this file, - for stdout\n")); printf(_(" -F, --fsync-interval=SECS\n" " time between fsyncs to the output file (default: %d)\n"), (fsync_interval / 1000)); @@ -105,7 +106,8 @@ usage(void) printf(_(" -s, --status-interval=SECS\n" " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000)); printf(_(" -S, --slot=SLOTNAME name of the logical replication slot\n")); - printf(_(" -t, --two-phase enable decoding of prepared transactions when creating a slot\n")); + printf(_(" -t, --enable-two-phase enable decoding of prepared 
transactions when creating a slot\n")); + printf(_(" --two-phase (same as --enable-two-phase, deprecated)\n")); printf(_(" -v, --verbose output verbose messages\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); @@ -143,12 +145,12 @@ sendFeedback(PGconn *conn, TimestampTz now, bool force, bool replyRequested) return true; if (verbose) - pg_log_info("confirming write up to %X/%X, flush to %X/%X (slot %s)", + pg_log_info("confirming write up to %X/%08X, flush to %X/%08X (slot %s)", LSN_FORMAT_ARGS(output_written_lsn), LSN_FORMAT_ARGS(output_fsync_lsn), replication_slot); - replybuf[len] = 'r'; + replybuf[len] = PqReplMsg_StandbyStatusUpdate; len += 1; fe_sendint64(output_written_lsn, &replybuf[len]); /* write */ len += 8; @@ -237,13 +239,13 @@ StreamLogicalLog(void) * Start the replication */ if (verbose) - pg_log_info("starting log streaming at %X/%X (slot %s)", + pg_log_info("starting log streaming at %X/%08X (slot %s)", LSN_FORMAT_ARGS(startpos), replication_slot); /* Initiate the replication stream at specified location */ query = createPQExpBuffer(); - appendPQExpBuffer(query, "START_REPLICATION SLOT \"%s\" LOGICAL %X/%X", + appendPQExpBuffer(query, "START_REPLICATION SLOT \"%s\" LOGICAL %X/%08X", replication_slot, LSN_FORMAT_ARGS(startpos)); /* print options if there are any */ @@ -453,7 +455,7 @@ StreamLogicalLog(void) } /* Check the message type. */ - if (copybuf[0] == 'k') + if (copybuf[0] == PqReplMsg_Keepalive) { int pos; bool replyRequested; @@ -465,7 +467,7 @@ StreamLogicalLog(void) * We just check if the server requested a reply, and ignore the * rest. */ - pos = 1; /* skip msgtype 'k' */ + pos = 1; /* skip msgtype PqReplMsg_Keepalive */ walEnd = fe_recvint64(©buf[pos]); output_written_lsn = Max(walEnd, output_written_lsn); @@ -480,7 +482,7 @@ StreamLogicalLog(void) } replyRequested = copybuf[pos]; - if (endpos != InvalidXLogRecPtr && walEnd >= endpos) + if (XLogRecPtrIsValid(endpos) && walEnd >= endpos) { /* * If there's nothing to read on the socket until a keepalive @@ -508,7 +510,7 @@ StreamLogicalLog(void) continue; } - else if (copybuf[0] != 'w') + else if (copybuf[0] != PqReplMsg_WALData) { pg_log_error("unrecognized streaming header: \"%c\"", copybuf[0]); @@ -516,11 +518,11 @@ StreamLogicalLog(void) } /* - * Read the header of the XLogData message, enclosed in the CopyData + * Read the header of the WALData message, enclosed in the CopyData * message. We only need the WAL location field (dataStart), the rest * of the header is ignored. */ - hdr_len = 1; /* msgtype 'w' */ + hdr_len = 1; /* msgtype PqReplMsg_WALData */ hdr_len += 8; /* dataStart */ hdr_len += 8; /* walEnd */ hdr_len += 8; /* sendTime */ @@ -533,7 +535,7 @@ StreamLogicalLog(void) /* Extract WAL location for this block */ cur_record_lsn = fe_recvint64(©buf[1]); - if (endpos != InvalidXLogRecPtr && cur_record_lsn > endpos) + if (XLogRecPtrIsValid(endpos) && cur_record_lsn > endpos) { /* * We've read past our endpoint, so prepare to go away being @@ -581,7 +583,7 @@ StreamLogicalLog(void) goto error; } - if (endpos != InvalidXLogRecPtr && cur_record_lsn == endpos) + if (XLogRecPtrIsValid(endpos) && cur_record_lsn == endpos) { /* endpos was exactly the record we just processed, we're done */ if (!flushAndSendFeedback(conn, &now)) @@ -604,7 +606,7 @@ StreamLogicalLog(void) /* * We're doing a client-initiated clean exit and have sent CopyDone to * the server. Drain any messages, so we don't miss a last-minute - * ErrorResponse. 
The walsender stops generating XLogData records once + * ErrorResponse. The walsender stops generating WALData records once * it sees CopyDone, so expect this to finish quickly. After CopyDone, * it's too late for sendFeedback(), even if this were to take a long * time. Hence, use synchronous-mode PQgetCopyData(). @@ -698,9 +700,10 @@ main(int argc, char **argv) {"file", required_argument, NULL, 'f'}, {"fsync-interval", required_argument, NULL, 'F'}, {"no-loop", no_argument, NULL, 'n'}, - {"failover", no_argument, NULL, 5}, + {"enable-failover", no_argument, NULL, 5}, + {"enable-two-phase", no_argument, NULL, 't'}, + {"two-phase", no_argument, NULL, 't'}, /* deprecated */ {"verbose", no_argument, NULL, 'v'}, - {"two-phase", no_argument, NULL, 't'}, {"version", no_argument, NULL, 'V'}, {"help", no_argument, NULL, '?'}, /* connection options */ @@ -798,12 +801,12 @@ main(int argc, char **argv) break; /* replication options */ case 'I': - if (sscanf(optarg, "%X/%X", &hi, &lo) != 2) + if (sscanf(optarg, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse start position \"%s\"", optarg); startpos = ((uint64) hi) << 32 | lo; break; case 'E': - if (sscanf(optarg, "%X/%X", &hi, &lo) != 2) + if (sscanf(optarg, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse end position \"%s\"", optarg); endpos = ((uint64) hi) << 32 | lo; break; @@ -910,14 +913,14 @@ main(int argc, char **argv) exit(1); } - if (startpos != InvalidXLogRecPtr && (do_create_slot || do_drop_slot)) + if (XLogRecPtrIsValid(startpos) && (do_create_slot || do_drop_slot)) { pg_log_error("cannot use --create-slot or --drop-slot together with --startpos"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } - if (endpos != InvalidXLogRecPtr && !do_start_slot) + if (XLogRecPtrIsValid(endpos) && !do_start_slot) { pg_log_error("--endpos may only be specified with --start"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -928,14 +931,14 @@ main(int argc, char **argv) { if (two_phase) { - pg_log_error("--two-phase may only be specified with --create-slot"); + pg_log_error("%s may only be specified with --create-slot", "--enable-two-phase"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } if (failover) { - pg_log_error("--failover may only be specified with --create-slot"); + pg_log_error("%s may only be specified with --create-slot", "--enable-failover"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } @@ -1073,12 +1076,12 @@ prepareToTerminate(PGconn *conn, XLogRecPtr endpos, StreamStopReason reason, pg_log_info("received interrupt signal, exiting"); break; case STREAM_STOP_KEEPALIVE: - pg_log_info("end position %X/%X reached by keepalive", + pg_log_info("end position %X/%08X reached by keepalive", LSN_FORMAT_ARGS(endpos)); break; case STREAM_STOP_END_OF_WAL: - Assert(!XLogRecPtrIsInvalid(lsn)); - pg_log_info("end position %X/%X reached by WAL record at %X/%X", + Assert(XLogRecPtrIsValid(lsn)); + pg_log_info("end position %X/%08X reached by WAL record at %X/%08X", LSN_FORMAT_ARGS(endpos), LSN_FORMAT_ARGS(lsn)); break; case STREAM_STOP_NONE: diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c index 6b6e32dfbdf56..25b13c7f55cd1 100644 --- a/src/bin/pg_basebackup/receivelog.c +++ b/src/bin/pg_basebackup/receivelog.c @@ -21,6 +21,7 @@ #include "access/xlog_internal.h" #include "common/logging.h" #include "libpq-fe.h" +#include "libpq/protocol.h" #include "receivelog.h" #include 
"streamutil.h" @@ -38,8 +39,8 @@ static int CopyStreamReceive(PGconn *conn, long timeout, pgsocket stop_socket, char **buffer); static bool ProcessKeepaliveMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, XLogRecPtr blockpos, TimestampTz *last_status); -static bool ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, - XLogRecPtr *blockpos); +static bool ProcessWALDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, + XLogRecPtr *blockpos); static PGresult *HandleEndOfCopyStream(PGconn *conn, StreamCtl *stream, char *copybuf, XLogRecPtr blockpos, XLogRecPtr *stoppos); static bool CheckCopyStreamStop(PGconn *conn, StreamCtl *stream, XLogRecPtr blockpos); @@ -338,7 +339,7 @@ sendFeedback(PGconn *conn, XLogRecPtr blockpos, TimestampTz now, bool replyReque char replybuf[1 + 8 + 8 + 8 + 8 + 1]; int len = 0; - replybuf[len] = 'r'; + replybuf[len] = PqReplMsg_StandbyStatusUpdate; len += 1; fe_sendint64(blockpos, &replybuf[len]); /* write */ len += 8; @@ -571,7 +572,7 @@ ReceiveXlogStream(PGconn *conn, StreamCtl *stream) return true; /* Initiate the replication stream at specified location */ - snprintf(query, sizeof(query), "START_REPLICATION %s%X/%X TIMELINE %u", + snprintf(query, sizeof(query), "START_REPLICATION %s%X/%08X TIMELINE %u", slotcmd, LSN_FORMAT_ARGS(stream->startpos), stream->timeline); @@ -628,7 +629,7 @@ ReceiveXlogStream(PGconn *conn, StreamCtl *stream) } if (stream->startpos > stoppos) { - pg_log_error("server stopped streaming timeline %u at %X/%X, but reported next timeline %u to begin at %X/%X", + pg_log_error("server stopped streaming timeline %u at %X/%08X, but reported next timeline %u to begin at %X/%08X", stream->timeline, LSN_FORMAT_ARGS(stoppos), newtimeline, LSN_FORMAT_ARGS(stream->startpos)); goto error; @@ -720,7 +721,7 @@ ReadEndOfStreamingResult(PGresult *res, XLogRecPtr *startpos, uint32 *timeline) } *timeline = atoi(PQgetvalue(res, 0, 0)); - if (sscanf(PQgetvalue(res, 0, 1), "%X/%X", &startpos_xlogid, + if (sscanf(PQgetvalue(res, 0, 1), "%X/%08X", &startpos_xlogid, &startpos_xrecoff) != 2) { pg_log_error("could not parse next timeline's starting point \"%s\"", @@ -823,15 +824,15 @@ HandleCopyStream(PGconn *conn, StreamCtl *stream, } /* Check the message type. */ - if (copybuf[0] == 'k') + if (copybuf[0] == PqReplMsg_Keepalive) { if (!ProcessKeepaliveMsg(conn, stream, copybuf, r, blockpos, &last_status)) goto error; } - else if (copybuf[0] == 'w') + else if (copybuf[0] == PqReplMsg_WALData) { - if (!ProcessXLogDataMsg(conn, stream, copybuf, r, &blockpos)) + if (!ProcessWALDataMsg(conn, stream, copybuf, r, &blockpos)) goto error; /* @@ -1001,7 +1002,7 @@ ProcessKeepaliveMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, * Parse the keepalive message, enclosed in the CopyData message. We just * check if the server requested a reply, and ignore the rest. */ - pos = 1; /* skip msgtype 'k' */ + pos = 1; /* skip msgtype PqReplMsg_Keepalive */ pos += 8; /* skip walEnd */ pos += 8; /* skip sendTime */ @@ -1041,11 +1042,11 @@ ProcessKeepaliveMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, } /* - * Process XLogData message. + * Process WALData message. 
*/ static bool -ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, - XLogRecPtr *blockpos) +ProcessWALDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, + XLogRecPtr *blockpos) { int xlogoff; int bytes_left; @@ -1054,17 +1055,17 @@ ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, /* * Once we've decided we don't want to receive any more, just ignore any - * subsequent XLogData messages. + * subsequent WALData messages. */ if (!(still_sending)) return true; /* - * Read the header of the XLogData message, enclosed in the CopyData + * Read the header of the WALData message, enclosed in the CopyData * message. We only need the WAL location field (dataStart), the rest of * the header is ignored. */ - hdr_len = 1; /* msgtype 'w' */ + hdr_len = 1; /* msgtype PqReplMsg_WALData */ hdr_len += 8; /* dataStart */ hdr_len += 8; /* walEnd */ hdr_len += 8; /* sendTime */ @@ -1162,7 +1163,7 @@ ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, return false; } still_sending = false; - return true; /* ignore the rest of this XLogData packet */ + return true; /* ignore the rest of this WALData packet */ } } } diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c index c7b8a4c3a4b6a..e5a7cb6e5b14e 100644 --- a/src/bin/pg_basebackup/streamutil.c +++ b/src/bin/pg_basebackup/streamutil.c @@ -445,7 +445,7 @@ RunIdentifySystem(PGconn *conn, char **sysid, TimeLineID *starttli, /* Get LSN start position if necessary */ if (startpos != NULL) { - if (sscanf(PQgetvalue(res, 0, 2), "%X/%X", &hi, &lo) != 2) + if (sscanf(PQgetvalue(res, 0, 2), "%X/%08X", &hi, &lo) != 2) { pg_log_error("could not parse write-ahead log location \"%s\"", PQgetvalue(res, 0, 2)); @@ -551,7 +551,7 @@ GetSlotInformation(PGconn *conn, const char *slot_name, uint32 hi, lo; - if (sscanf(PQgetvalue(res, 0, 1), "%X/%X", &hi, &lo) != 2) + if (sscanf(PQgetvalue(res, 0, 1), "%X/%08X", &hi, &lo) != 2) { pg_log_error("could not parse restart_lsn \"%s\" for replication slot \"%s\"", PQgetvalue(res, 0, 1), slot_name); diff --git a/src/bin/pg_basebackup/t/030_pg_recvlogical.pl b/src/bin/pg_basebackup/t/030_pg_recvlogical.pl index c82e78847b382..1b7a6f6f43fdd 100644 --- a/src/bin/pg_basebackup/t/030_pg_recvlogical.pl +++ b/src/bin/pg_basebackup/t/030_pg_recvlogical.pl @@ -110,7 +110,7 @@ '--dbname' => $node->connstr('postgres'), '--start', '--endpos' => $nextlsn, - '--two-phase', '--no-loop', + '--enable-two-phase', '--no-loop', '--file' => '-', ], 'incorrect usage'); @@ -142,12 +142,13 @@ '--slot' => 'test', '--dbname' => $node->connstr('postgres'), '--create-slot', - '--failover', + '--enable-failover', ], 'slot with failover created'); my $result = $node->safe_psql('postgres', - "SELECT failover FROM pg_catalog.pg_replication_slots WHERE slot_name = 'test'"); + "SELECT failover FROM pg_catalog.pg_replication_slots WHERE slot_name = 'test'" +); is($result, 't', "failover is enabled for the new slot"); done_testing(); diff --git a/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl b/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl index 2d532fee567dd..4657172c9ac79 100644 --- a/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl +++ b/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl @@ -240,7 +240,6 @@ sub generate_db # Check some unmet conditions on node P $node_p->append_conf( 'postgresql.conf', q{ -wal_level = replica max_replication_slots = 1 max_wal_senders = 1 max_worker_processes = 2 @@ -265,7 +264,6 @@ sub generate_db # standby 
settings should not be lower than on the primary. $node_p->append_conf( 'postgresql.conf', q{ -wal_level = logical max_replication_slots = 10 max_wal_senders = 10 max_worker_processes = 8 @@ -331,7 +329,7 @@ sub generate_db $node_p->wait_for_replay_catchup($node_s); # Create user-defined publications, wait for streaming replication to sync them -# to the standby, then verify that '--remove' +# to the standby, then verify that '--clean' # removes them. $node_p->safe_psql( $db1, qq( @@ -341,8 +339,8 @@ sub generate_db $node_p->wait_for_replay_catchup($node_s); -ok($node_s->safe_psql($db1, "SELECT COUNT(*) = 2 FROM pg_publication"), - 'two pre-existing publications on subscriber'); +is($node_s->safe_psql($db1, "SELECT COUNT(*) FROM pg_publication"), + '2', 'two pre-existing publications on subscriber'); $node_s->stop; @@ -399,7 +397,7 @@ sub generate_db '--database' => $db1, '--all', ], - qr/--database cannot be used with -a\/--all/, + qr/options --database and -a\/--all cannot be used together/, 'fail if --database is used with --all'); # run pg_createsubscriber with '--publication' and '--all' and verify @@ -416,7 +414,7 @@ sub generate_db '--all', '--publication' => 'pub1', ], - qr/--publication cannot be used with -a\/--all/, + qr/options --publication and -a\/--all cannot be used together/, 'fail if --publication is used with --all'); # run pg_createsubscriber with '--all' option @@ -436,17 +434,24 @@ sub generate_db # Verify that the required logical replication objects are output. # The expected count 3 refers to postgres, $db1 and $db2 databases. -is(scalar(() = $stderr =~ /creating publication/g), +is(scalar(() = $stderr =~ /would create publication/g), 3, "verify publications are created for all databases"); -is(scalar(() = $stderr =~ /creating the replication slot/g), +is(scalar(() = $stderr =~ /would create the replication slot/g), 3, "verify replication slots are created for all databases"); -is(scalar(() = $stderr =~ /creating subscription/g), +is(scalar(() = $stderr =~ /would create subscription/g), 3, "verify subscriptions are created for all databases"); +# Create a user-defined publication, and a table that is not a member of that +# publication. +$node_p->safe_psql($db1, qq( + CREATE PUBLICATION test_pub3 FOR TABLE tbl1; + CREATE TABLE not_replicated (a int); +)); + # Run pg_createsubscriber on node S. --verbose is used twice # to show more information. -# In passing, also test the --enable-two-phase option and -# --remove option +# +# Test the two-phase and clean options. Use a pre-existing publication. 
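+# test_pub3 already exists in $db1, so pg_createsubscriber should reuse it
+# rather than create it, while pub2 does not exist yet and should be created
+# in $db2. '--clean' => 'publications' should then drop every publication
+# left on the subscriber; both outcomes are checked below.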
command_ok( [ 'pg_createsubscriber', @@ -456,14 +461,14 @@ sub generate_db '--publisher-server' => $node_p->connstr($db1), '--socketdir' => $node_s->host, '--subscriber-port' => $node_s->port, - '--publication' => 'pub1', + '--publication' => 'test_pub3', '--publication' => 'pub2', '--replication-slot' => 'replslot1', '--replication-slot' => 'replslot2', '--database' => $db1, '--database' => $db2, '--enable-two-phase', - '--remove' => 'publications', + '--clean' => 'publications', ], 'run pg_createsubscriber on node S'); @@ -478,13 +483,16 @@ sub generate_db # Insert rows on P $node_p->safe_psql($db1, "INSERT INTO tbl1 VALUES('third row')"); $node_p->safe_psql($db2, "INSERT INTO tbl2 VALUES('row 1')"); +$node_p->safe_psql($db1, "INSERT INTO not_replicated VALUES(0)"); # Start subscriber $node_s->start; # Confirm publications are removed from the subscriber node -is($node_s->safe_psql($db1, "SELECT COUNT(*) FROM pg_publication;"), - '0', 'all publications on subscriber have been removed'); +is($node_s->safe_psql($db1, 'SELECT COUNT(*) FROM pg_publication'), + '0', 'all publications were removed from db1'); +is($node_s->safe_psql($db2, 'SELECT COUNT(*) FROM pg_publication'), + '0', 'all publications were removed from db2'); # Verify that all subtwophase states are pending or enabled, # e.g. there are no subscriptions where subtwophase is disabled ('d') @@ -525,6 +533,9 @@ sub generate_db second row third row), "logical replication works in database $db1"); +$result = $node_s->safe_psql($db1, 'SELECT * FROM not_replicated'); +is($result, qq(), + "table is not replicated in database $db1"); # Check result in database $db2 $result = $node_s->safe_psql($db2, 'SELECT * FROM tbl2'); @@ -535,7 +546,38 @@ sub generate_db 'SELECT system_identifier FROM pg_control_system()'); my $sysid_s = $node_s->safe_psql('postgres', 'SELECT system_identifier FROM pg_control_system()'); -ok($sysid_p != $sysid_s, 'system identifier was changed'); +isnt($sysid_p, $sysid_s, 'system identifier was changed'); + +# Verify that pub2 was created in $db2 +is($node_p->safe_psql($db2, "SELECT COUNT(*) FROM pg_publication WHERE pubname = 'pub2'"), + '1', "publication pub2 was created in $db2"); + +# Get subscription and publication names +$result = $node_s->safe_psql( + 'postgres', qq( + SELECT subname, subpublications FROM pg_subscription WHERE subname ~ '^pg_createsubscriber_' + ORDER BY subpublications; +)); +like( + $result, + qr/^pg_createsubscriber_\d+_[0-9a-f]+ \|\{pub2\}\n + pg_createsubscriber_\d+_[0-9a-f]+ \|\{test_pub3\}$/x, + 'subscription and publication names are ok'); + +# Verify that the correct publications are being used +$result = $node_s->safe_psql( + 'postgres', qq( + SELECT d.datname, s.subpublications + FROM pg_subscription s + JOIN pg_database d ON d.oid = s.subdbid + WHERE subname ~ '^pg_createsubscriber_' + ORDER BY s.subdbid + ) +); + +is($result, qq($db1|{test_pub3} +$db2|{pub2}), + "subscriptions use the correct publications"); # clean up $node_p->teardown_node; diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index f20be82862a2b..46cb2f36efaa5 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -25,6 +25,7 @@ #include "common/logging.h" #include "common/relpath.h" #include "fe_utils/option_utils.h" +#include "fe_utils/version.h" #include "getopt_long.h" #include "pg_getopt.h" #include "storage/bufpage.h" @@ -448,6 +449,8 @@ main(int argc, char *argv[]) int c; int option_index; bool crc_ok; + uint32 major_version; + 
char *version_str; pg_logging_init(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_checksums")); @@ -543,6 +546,20 @@ main(int argc, char *argv[]) exit(1); } + /* + * Retrieve the contents of this cluster's PG_VERSION. We require + * compatibility with the same major version as the one this tool is + * compiled with. + */ + major_version = GET_PG_MAJORVERSION_NUM(get_pg_version(DataDir, &version_str)); + if (major_version != PG_MAJORVERSION_NUM) + { + pg_log_error("data directory is of wrong version"); + pg_log_error_detail("File \"%s\" contains \"%s\", which is not compatible with this program's version \"%s\".", + "PG_VERSION", version_str, PG_MAJORVERSION); + exit(1); + } + /* Read the control file and check compatibility */ ControlFile = get_controlfile(DataDir, &crc_ok); if (!crc_ok) diff --git a/src/bin/pg_combinebackup/backup_label.c b/src/bin/pg_combinebackup/backup_label.c index e89d4603f09dc..e774bc78a6264 100644 --- a/src/bin/pg_combinebackup/backup_label.c +++ b/src/bin/pg_combinebackup/backup_label.c @@ -247,7 +247,7 @@ parse_lsn(char *s, char *e, XLogRecPtr *lsn, char **c) unsigned lo; *e = '\0'; - success = (sscanf(s, "%X/%X%n", &hi, &lo, &nchars) == 2); + success = (sscanf(s, "%X/%08X%n", &hi, &lo, &nchars) == 2); *e = save; if (success) diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c index db6c86223bbda..8f791c802cdcb 100644 --- a/src/bin/pg_combinebackup/copy_file.c +++ b/src/bin/pg_combinebackup/copy_file.c @@ -4,7 +4,7 @@ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * src/bin/pg_combinebackup/copy_file.h + * src/bin/pg_combinebackup/copy_file.c * *------------------------------------------------------------------------- */ diff --git a/src/bin/pg_combinebackup/load_manifest.c b/src/bin/pg_combinebackup/load_manifest.c index 8e0d04a26a6a7..44db0d2b16457 100644 --- a/src/bin/pg_combinebackup/load_manifest.c +++ b/src/bin/pg_combinebackup/load_manifest.c @@ -298,7 +298,7 @@ combinebackup_per_wal_range_cb(JsonManifestParseContext *context, manifest_wal_range *range; /* Allocate and initialize a struct describing this WAL range. 
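* (palloc_object(type) is shorthand for (type *) palloc(sizeof(type)); the palloc0_object() variant used elsewhere in this patch also zero-fills the allocation.)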
*/ - range = palloc(sizeof(manifest_wal_range)); + range = palloc_object(manifest_wal_range); range->tli = tli; range->start_lsn = start_lsn; range->end_lsn = end_lsn; diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index 28e58cd8ef458..c9bf0a9e1055c 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -34,6 +34,7 @@ #include "common/relpath.h" #include "copy_file.h" #include "fe_utils/option_utils.h" +#include "fe_utils/version.h" #include "getopt_long.h" #include "lib/stringinfo.h" #include "load_manifest.h" @@ -117,7 +118,6 @@ static void process_directory_recursively(Oid tsoid, manifest_data **manifests, manifest_writer *mwriter, cb_options *opt); -static int read_pg_version_file(char *directory); static void remember_to_cleanup_directory(char *target_path, bool rmtopdir); static void reset_directory_cleanup_list(void); static cb_tablespace *scan_for_existing_tablespaces(char *pathname, @@ -153,7 +153,7 @@ main(int argc, char *argv[]) int c; int n_backups; int n_prior_backups; - int version; + uint32 version; uint64 system_identifier; char **prior_backup_dirs; cb_options opt; @@ -162,6 +162,7 @@ main(int argc, char *argv[]) StringInfo last_backup_label; manifest_data **manifests; manifest_writer *mwriter; + char *pgdata; pg_logging_init(argv[0]); progname = get_progname(argv[0]); @@ -241,6 +242,10 @@ main(int argc, char *argv[]) if (opt.no_manifest) opt.manifest_checksums = CHECKSUM_TYPE_NONE; + if (opt.dry_run) + pg_log_info("Executing in dry-run mode.\n" + "The target directory will not be modified."); + /* Check that the platform supports the requested copy method. */ if (opt.copy_method == COPY_METHOD_CLONE) { @@ -271,7 +276,12 @@ main(int argc, char *argv[]) } /* Read the server version from the final backup. */ - version = read_pg_version_file(argv[argc - 1]); + pgdata = argv[argc - 1]; + version = get_pg_version(pgdata, NULL); + if (GET_PG_MAJORVERSION_NUM(version) < 10) + pg_fatal("server version too old"); + pg_log_debug("read server version %u from file \"%s/%s\"", + GET_PG_MAJORVERSION_NUM(version), pgdata, "PG_VERSION"); /* Sanity-check control files. */ n_backups = argc - optind; @@ -425,7 +435,7 @@ main(int argc, char *argv[]) else { pg_log_debug("recursively fsyncing \"%s\"", opt.output); - sync_pgdata(opt.output, version * 10000, opt.sync_method, true); + sync_pgdata(opt.output, version, opt.sync_method, true); } } @@ -569,7 +579,7 @@ check_backup_label_files(int n_backups, char **backup_dirs) pg_fatal("backup at \"%s\" starts on timeline %u, but expected %u", backup_dirs[i], start_tli, check_tli); if (i < n_backups - 1 && start_lsn != check_lsn) - pg_fatal("backup at \"%s\" starts at LSN %X/%X, but expected %X/%X", + pg_fatal("backup at \"%s\" starts at LSN %X/%08X, but expected %X/%08X", backup_dirs[i], LSN_FORMAT_ARGS(start_lsn), LSN_FORMAT_ARGS(check_lsn)); @@ -1155,59 +1165,6 @@ process_directory_recursively(Oid tsoid, closedir(dir); } -/* - * Read the version number from PG_VERSION and convert it to the usual server - * version number format. (e.g. If PG_VERSION contains "14\n" this function - * will return 140000) - */ -static int -read_pg_version_file(char *directory) -{ - char filename[MAXPGPATH]; - StringInfoData buf; - int fd; - int version; - char *ep; - - /* Construct pathname. */ - snprintf(filename, MAXPGPATH, "%s/PG_VERSION", directory); - - /* Open file. 
*/ - if ((fd = open(filename, O_RDONLY, 0)) < 0) - pg_fatal("could not open file \"%s\": %m", filename); - - /* Read into memory. Length limit of 128 should be more than generous. */ - initStringInfo(&buf); - slurp_file(fd, filename, &buf, 128); - - /* Close the file. */ - if (close(fd) != 0) - pg_fatal("could not close file \"%s\": %m", filename); - - /* Convert to integer. */ - errno = 0; - version = strtoul(buf.data, &ep, 10); - if (errno != 0 || *ep != '\n') - { - /* - * Incremental backup is not relevant to very old server versions that - * used multi-part version number (e.g. 9.6, or 8.4). So if we see - * what looks like the beginning of such a version number, just bail - * out. - */ - if (version < 10 && *ep == '.') - pg_fatal("%s: server version too old", filename); - pg_fatal("%s: could not parse version number", filename); - } - - /* Debugging output. */ - pg_log_debug("read server version %d from file \"%s\"", version, filename); - - /* Release memory and return result. */ - pfree(buf.data); - return version * 10000; -} - /* * Add a directory to the list of output directories to clean up. */ diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c index 8acaa54ff38b4..38d8e8a2dc9af 100644 --- a/src/bin/pg_combinebackup/reconstruct.c +++ b/src/bin/pg_combinebackup/reconstruct.c @@ -370,6 +370,7 @@ reconstruct_from_incremental_file(char *input_filename, if (s->relative_block_numbers != NULL) pfree(s->relative_block_numbers); pg_free(s->filename); + pg_free(s); } pfree(sourcemap); pfree(offsetmap); @@ -517,6 +518,7 @@ make_rfile(char *filename, bool missing_ok) { if (missing_ok && errno == ENOENT) { + pg_free(rf->filename); pg_free(rf); return NULL; } diff --git a/src/bin/pg_combinebackup/t/002_compare_backups.pl b/src/bin/pg_combinebackup/t/002_compare_backups.pl index 2c7ca89b92f7f..a3e29c055091e 100644 --- a/src/bin/pg_combinebackup/t/002_compare_backups.pl +++ b/src/bin/pg_combinebackup/t/002_compare_backups.pl @@ -174,6 +174,7 @@ $pitr1->command_ok( [ 'pg_dumpall', + '--restrict-key' => 'test', '--no-sync', '--no-unlogged-table-data', '--file' => $dump1, @@ -183,6 +184,7 @@ $pitr2->command_ok( [ 'pg_dumpall', + '--restrict-key' => 'test', '--no-sync', '--no-unlogged-table-data', '--file' => $dump2, diff --git a/src/bin/pg_combinebackup/t/010_hardlink.pl b/src/bin/pg_combinebackup/t/010_hardlink.pl index a0ee419090cf6..23acf72d25f77 100644 --- a/src/bin/pg_combinebackup/t/010_hardlink.pl +++ b/src/bin/pg_combinebackup/t/010_hardlink.pl @@ -56,7 +56,7 @@ '--pgdata' => $backup1path, '--no-sync', '--checkpoint' => 'fast', - '--wal-method' => 'none' + '--wal-method' => 'none' ], "full backup"); @@ -74,7 +74,7 @@ '--pgdata' => $backup2path, '--no-sync', '--checkpoint' => 'fast', - '--wal-method' => 'none', + '--wal-method' => 'none', '--incremental' => $backup1path . '/backup_manifest' ], "incremental backup"); @@ -112,45 +112,45 @@ # of the given data file. sub check_data_file { - my ($data_file, $last_segment_nlinks) = @_; - - my @data_file_segments = ($data_file); - - # Start checking for additional segments - my $segment_number = 1; - - while (1) - { - my $next_segment = $data_file . '.' . 
$segment_number; - - # If the file exists and is a regular file, add it to the list - if (-f $next_segment) - { - push @data_file_segments, $next_segment; - $segment_number++; - } - # Stop the loop if the file doesn't exist - else - { - last; - } - } - - # All segments of the given data file should contain 2 hard links, except - # for the last one, which should match the given number of links. - my $last_segment = pop @data_file_segments; - - for my $segment (@data_file_segments) - { - # Get the file's stat information of each segment - my $nlink_count = get_hard_link_count($segment); - ok($nlink_count == 2, "File '$segment' has 2 hard links"); - } - - # Get the file's stat information of the last segment - my $nlink_count = get_hard_link_count($last_segment); - ok($nlink_count == $last_segment_nlinks, - "File '$last_segment' has $last_segment_nlinks hard link(s)"); + my ($data_file, $last_segment_nlinks) = @_; + + my @data_file_segments = ($data_file); + + # Start checking for additional segments + my $segment_number = 1; + + while (1) + { + my $next_segment = $data_file . '.' . $segment_number; + + # If the file exists and is a regular file, add it to the list + if (-f $next_segment) + { + push @data_file_segments, $next_segment; + $segment_number++; + } + # Stop the loop if the file doesn't exist + else + { + last; + } + } + + # All segments of the given data file should contain 2 hard links, except + # for the last one, which should match the given number of links. + my $last_segment = pop @data_file_segments; + + for my $segment (@data_file_segments) + { + # Get the file's stat information of each segment + my $nlink_count = get_hard_link_count($segment); + is($nlink_count, 2, "File '$segment' has 2 hard links"); + } + + # Get the file's stat information of the last segment + my $nlink_count = get_hard_link_count($last_segment); + is($nlink_count, $last_segment_nlinks, + "File '$last_segment' has $last_segment_nlinks hard link(s)"); } @@ -159,11 +159,11 @@ sub check_data_file # that file. sub get_hard_link_count { - my ($file) = @_; + my ($file) = @_; - # Get file stats - my @stats = stat($file); - my $nlink = $stats[3]; # Number of hard links + # Get file stats + my @stats = stat($file); + my $nlink = $stats[3]; # Number of hard links - return $nlink; + return $nlink; } diff --git a/src/bin/pg_combinebackup/write_manifest.c b/src/bin/pg_combinebackup/write_manifest.c index 313f8929df509..c845175a82c71 100644 --- a/src/bin/pg_combinebackup/write_manifest.c +++ b/src/bin/pg_combinebackup/write_manifest.c @@ -155,7 +155,7 @@ finalize_manifest(manifest_writer *mwriter, for (wal_range = first_wal_range; wal_range != NULL; wal_range = wal_range->next) appendStringInfo(&mwriter->buf, - "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }", + "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%08X\", \"End-LSN\": \"%X/%08X\" }", wal_range == first_wal_range ? 
"" : ",\n", wal_range->tli, LSN_FORMAT_ARGS(wal_range->start_lsn), @@ -259,8 +259,8 @@ flush_manifest(manifest_writer *mwriter) if (wb < 0) pg_fatal("could not write file \"%s\": %m", mwriter->pathname); else - pg_fatal("could not write file \"%s\": wrote %d of %d", - mwriter->pathname, (int) wb, mwriter->buf.len); + pg_fatal("could not write file \"%s\": wrote %zd of %d", + mwriter->pathname, wb, mwriter->buf.len); } if (mwriter->still_checksumming && diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 7bb801bb88612..a4060309ae0e4 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -167,7 +167,14 @@ main(int argc, char *argv[]) /* get a copy of the control file */ ControlFile = get_controlfile(DataDir, &crc_ok); - if (!crc_ok) + if (ControlFile->pg_control_version != PG_CONTROL_VERSION) + { + pg_log_warning("control file version (%u) does not match the version understood by this program (%u)", + ControlFile->pg_control_version, PG_CONTROL_VERSION); + pg_log_warning_detail("Either the control file has been created with a different version of PostgreSQL, " + "or it is corrupt. The results below are untrustworthy."); + } + else if (!crc_ok) { pg_log_warning("calculated CRC checksum does not match value stored in control file"); pg_log_warning_detail("Either the control file is corrupt, or it has a different layout than this program " @@ -245,9 +252,9 @@ main(int argc, char *argv[]) dbState(ControlFile->state)); printf(_("pg_control last modified: %s\n"), pgctime_str); - printf(_("Latest checkpoint location: %X/%X\n"), + printf(_("Latest checkpoint location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->checkPoint)); - printf(_("Latest checkpoint's REDO location: %X/%X\n"), + printf(_("Latest checkpoint's REDO location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)); printf(_("Latest checkpoint's REDO WAL file: %s\n"), xlogfilename); @@ -264,7 +271,7 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile->checkPointCopy.nextMulti); - printf(_("Latest checkpoint's NextMultiOffset: %u\n"), + printf(_("Latest checkpoint's NextMultiOffset: %" PRIu64 "\n"), ControlFile->checkPointCopy.nextMultiOffset); printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile->checkPointCopy.oldestXid); @@ -282,15 +289,15 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.newestCommitTsXid); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); - printf(_("Fake LSN counter for unlogged rels: %X/%X\n"), + printf(_("Fake LSN counter for unlogged rels: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->unloggedLSN)); - printf(_("Minimum recovery ending location: %X/%X\n"), + printf(_("Minimum recovery ending location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint)); printf(_("Min recovery ending loc's timeline: %u\n"), ControlFile->minRecoveryPointTLI); - printf(_("Backup start location: %X/%X\n"), + printf(_("Backup start location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->backupStartPoint)); - printf(_("Backup end location: %X/%X\n"), + printf(_("Backup end location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->backupEndPoint)); printf(_("End-of-backup record required: %s\n"), ControlFile->backupEndRequired ? 
_("yes") : _("no")); @@ -317,6 +324,8 @@ main(int argc, char *argv[]) ControlFile->blcksz); printf(_("Blocks per segment of large relation: %u\n"), ControlFile->relseg_size); + printf(_("Pages per SLRU segment: %u\n"), + ControlFile->slru_pages_per_segment); printf(_("WAL block size: %u\n"), ControlFile->xlog_blcksz); printf(_("Bytes per WAL segment: %u\n"), diff --git a/src/bin/pg_controldata/t/001_pg_controldata.pl b/src/bin/pg_controldata/t/001_pg_controldata.pl index 4aea00d6d5af4..6ace61f93142f 100644 --- a/src/bin/pg_controldata/t/001_pg_controldata.pl +++ b/src/bin/pg_controldata/t/001_pg_controldata.pl @@ -21,16 +21,22 @@ qr/checkpoint/, 'pg_controldata produces output'); -# check with a corrupted pg_control +# Check with a corrupted pg_control +# +# To corrupt it, overwrite most of it with zeros. We leave the +# beginning portion that contains the pg_control version number (first +# 16 bytes) unmodified because otherwise you get an error about the +# version number, instead of checksum mismatch. my $pg_control = $node->data_dir . '/global/pg_control'; my $size = -s $pg_control; -open my $fh, '>', $pg_control or BAIL_OUT($!); +open my $fh, '+<', $pg_control or BAIL_OUT($!); binmode $fh; -# fill file with zeros -print $fh pack("x[$size]"); +my ($overwrite_off, $overwrite_len) = (16, $size - 16); +seek $fh, $overwrite_off, 0 or BAIL_OUT($!); +print $fh pack("x[$overwrite_len]"); close $fh; command_checks_all( diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 8a405ff122c71..dfe7b4d2f9e0a 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -26,6 +26,7 @@ #include "common/file_perm.h" #include "common/logging.h" #include "common/string.h" +#include "datatype/timestamp.h" #include "getopt_long.h" #include "utils/pidfile.h" @@ -68,9 +69,9 @@ typedef enum #define DEFAULT_WAIT 60 -#define USEC_PER_SEC 1000000 - -#define WAITS_PER_SEC 10 /* should divide USEC_PER_SEC evenly */ +#define WAITS_PER_SEC 10 +StaticAssertDecl(USECS_PER_SEC % WAITS_PER_SEC == 0, + "WAITS_PER_SEC must divide USECS_PER_SEC evenly"); static bool do_wait = true; static int wait_seconds = DEFAULT_WAIT; @@ -563,7 +564,7 @@ start_postmaster(void) if (!CreateRestrictedProcess(cmd, &pi, false)) { write_stderr(_("%s: could not start server: error code %lu\n"), - progname, (unsigned long) GetLastError()); + progname, GetLastError()); exit(1); } /* Don't close command process handle here; caller must do so */ @@ -699,7 +700,7 @@ wait_for_postmaster_start(pid_t pm_pid, bool do_checkpoint) print_msg("."); } - pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); + pg_usleep(USECS_PER_SEC / WAITS_PER_SEC); } /* out of patience; report that postmaster is still starting up */ @@ -738,7 +739,7 @@ wait_for_postmaster_stop(void) if (cnt % WAITS_PER_SEC == 0) print_msg("."); - pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); + pg_usleep(USECS_PER_SEC / WAITS_PER_SEC); } return false; /* timeout reached */ } @@ -771,7 +772,7 @@ wait_for_postmaster_promote(void) if (cnt % WAITS_PER_SEC == 0) print_msg("."); - pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); + pg_usleep(USECS_PER_SEC / WAITS_PER_SEC); } return false; /* timeout reached */ } @@ -1536,7 +1537,7 @@ pgwin32_doRegister(void) CloseServiceHandle(hSCM); write_stderr(_("%s: could not register service \"%s\": error code %lu\n"), progname, register_servicename, - (unsigned long) GetLastError()); + GetLastError()); exit(1); } CloseServiceHandle(hService); @@ -1566,7 +1567,7 @@ pgwin32_doUnregister(void) CloseServiceHandle(hSCM); write_stderr(_("%s: could not open service 
\"%s\": error code %lu\n"), progname, register_servicename, - (unsigned long) GetLastError()); + GetLastError()); exit(1); } if (!DeleteService(hService)) @@ -1575,7 +1576,7 @@ pgwin32_doUnregister(void) CloseServiceHandle(hSCM); write_stderr(_("%s: could not unregister service \"%s\": error code %lu\n"), progname, register_servicename, - (unsigned long) GetLastError()); + GetLastError()); exit(1); } CloseServiceHandle(hService); @@ -1724,7 +1725,7 @@ pgwin32_doRunAsService(void) { write_stderr(_("%s: could not start service \"%s\": error code %lu\n"), progname, register_servicename, - (unsigned long) GetLastError()); + GetLastError()); exit(1); } } @@ -1796,7 +1797,7 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser * it doesn't cast DWORD before printing. */ write_stderr(_("%s: could not open process token: error code %lu\n"), - progname, (unsigned long) GetLastError()); + progname, GetLastError()); return 0; } @@ -1810,7 +1811,7 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser 0, &dropSids[1].Sid)) { write_stderr(_("%s: could not allocate SIDs: error code %lu\n"), - progname, (unsigned long) GetLastError()); + progname, GetLastError()); return 0; } @@ -1836,7 +1837,7 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser if (!b) { write_stderr(_("%s: could not create restricted token: error code %lu\n"), - progname, (unsigned long) GetLastError()); + progname, GetLastError()); return 0; } @@ -1855,8 +1856,7 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser HANDLE job; char jobname[128]; - sprintf(jobname, "PostgreSQL_%lu", - (unsigned long) processInfo->dwProcessId); + sprintf(jobname, "PostgreSQL_%lu", processInfo->dwProcessId); job = CreateJobObject(NULL, jobname); if (job) @@ -1918,7 +1918,7 @@ GetPrivilegesToDelete(HANDLE hToken) !LookupPrivilegeValue(NULL, SE_CHANGE_NOTIFY_NAME, &luidChangeNotify)) { write_stderr(_("%s: could not get LUIDs for privileges: error code %lu\n"), - progname, (unsigned long) GetLastError()); + progname, GetLastError()); return NULL; } @@ -1926,7 +1926,7 @@ GetPrivilegesToDelete(HANDLE hToken) GetLastError() != ERROR_INSUFFICIENT_BUFFER) { write_stderr(_("%s: could not get token information: error code %lu\n"), - progname, (unsigned long) GetLastError()); + progname, GetLastError()); return NULL; } @@ -1941,7 +1941,7 @@ GetPrivilegesToDelete(HANDLE hToken) if (!GetTokenInformation(hToken, TokenPrivileges, tokenPrivs, length, &length)) { write_stderr(_("%s: could not get token information: error code %lu\n"), - progname, (unsigned long) GetLastError()); + progname, GetLastError()); free(tokenPrivs); return NULL; } diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index aa1589e3331d2..0007e78667f26 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -17,6 +17,7 @@ #include +#include "catalog/pg_am_d.h" #include "catalog/pg_class_d.h" #include "catalog/pg_collation_d.h" #include "catalog/pg_extension_d.h" @@ -243,8 +244,8 @@ getSchemaData(Archive *fout, int *numTablesPtr) pg_log_info("reading subscriptions"); getSubscriptions(fout); - pg_log_info("reading subscription membership of tables"); - getSubscriptionTables(fout); + pg_log_info("reading subscription membership of relations"); + getSubscriptionRelations(fout); free(inhinfo); /* not needed any longer */ @@ -352,7 +353,7 @@ flagInhTables(Archive *fout, TableInfo *tblinfo, int numTables, tblinfo[i].numParents, tblinfo[i].dobj.name); - 
attachinfo = (TableAttachInfo *) palloc(sizeof(TableAttachInfo)); + attachinfo = palloc_object(TableAttachInfo); attachinfo->dobj.objType = DO_TABLE_ATTACH; attachinfo->dobj.catId.tableoid = 0; attachinfo->dobj.catId.oid = 0; @@ -944,6 +945,24 @@ findOprByOid(Oid oid) return (OprInfo *) dobj; } +/* + * findAccessMethodByOid + * finds the DumpableObject for the access method with the given oid + * returns NULL if not found + */ +AccessMethodInfo * +findAccessMethodByOid(Oid oid) +{ + CatalogId catId; + DumpableObject *dobj; + + catId.tableoid = AccessMethodRelationId; + catId.oid = oid; + dobj = findObjectByCatalogId(catId); + Assert(dobj == NULL || dobj->objType == DO_ACCESS_METHOD); + return (AccessMethodInfo *) dobj; +} + /* * findCollationByOid * finds the DumpableObject for the collation with the given oid diff --git a/src/bin/pg_dump/compress_gzip.c b/src/bin/pg_dump/compress_gzip.c index 5a30ebf9bf5b5..be31209f8116a 100644 --- a/src/bin/pg_dump/compress_gzip.c +++ b/src/bin/pg_dump/compress_gzip.c @@ -20,6 +20,15 @@ #ifdef HAVE_LIBZ #include +/* + * We don't use the gzgetc() macro, because zlib's configuration logic is not + * robust enough to guarantee that the macro will have the same ideas about + * struct field layout as the library itself does; see for example + * https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=59711 + * Instead, #undef the macro and fall back to the underlying function. + */ +#undef gzgetc + /*---------------------- * Compressor API *---------------------- @@ -251,34 +260,53 @@ InitCompressorGzip(CompressorState *cs, *---------------------- */ -static bool -Gzip_read(void *ptr, size_t size, size_t *rsize, CompressFileHandle *CFH) +static size_t +Gzip_read(void *ptr, size_t size, CompressFileHandle *CFH) { gzFile gzfp = (gzFile) CFH->private_data; int gzret; + /* Reading zero bytes must be a no-op */ + if (size == 0) + return 0; + gzret = gzread(gzfp, ptr, size); - if (gzret <= 0 && !gzeof(gzfp)) + + /* + * gzread returns zero on EOF as well as some error conditions, and less + * than zero on other error conditions, so we need to inspect for EOF on + * zero. + */ + if (gzret <= 0) { int errnum; - const char *errmsg = gzerror(gzfp, &errnum); + const char *errmsg; + + if (gzret == 0 && gzeof(gzfp)) + return 0; + + errmsg = gzerror(gzfp, &errnum); pg_fatal("could not read from input file: %s", errnum == Z_ERRNO ? strerror(errno) : errmsg); } - if (rsize) - *rsize = (size_t) gzret; - - return true; + return (size_t) gzret; } -static bool +static void Gzip_write(const void *ptr, size_t size, CompressFileHandle *CFH) { gzFile gzfp = (gzFile) CFH->private_data; + int errnum; + const char *errmsg; - return gzwrite(gzfp, ptr, size) > 0; + if (gzwrite(gzfp, ptr, size) != size) + { + errmsg = gzerror(gzfp, &errnum); + pg_fatal("could not write to file: %s", + errnum == Z_ERRNO ? 
strerror(errno) : errmsg); + } } static int diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c index 8c3d9c911c47b..9cadc6f2a3f34 100644 --- a/src/bin/pg_dump/compress_io.c +++ b/src/bin/pg_dump/compress_io.c @@ -269,6 +269,7 @@ InitDiscoverCompressFileHandle(const char *path, const char *mode) } CFH = InitCompressFileHandle(compression_spec); + errno = 0; if (!CFH->open_func(fname, -1, mode, CFH)) { free_keep_errno(CFH); @@ -289,6 +290,7 @@ EndCompressFileHandle(CompressFileHandle *CFH) { bool ret = false; + errno = 0; if (CFH->private_data) ret = CFH->close_func(CFH); diff --git a/src/bin/pg_dump/compress_io.h b/src/bin/pg_dump/compress_io.h index db9b38744c8e2..ae008585c899b 100644 --- a/src/bin/pg_dump/compress_io.h +++ b/src/bin/pg_dump/compress_io.h @@ -22,9 +22,9 @@ * * When changing this value, it's necessary to check the relevant test cases * still exercise all the branches. This applies especially if the value is - * increased, in which case the overflow buffer may not be needed. + * increased, in which case some loops may not get iterated. */ -#define DEFAULT_IO_BUFFER_SIZE 4096 +#define DEFAULT_IO_BUFFER_SIZE (128 * 1024) extern char *supports_compression(const pg_compress_specification compression_spec); @@ -123,21 +123,22 @@ struct CompressFileHandle CompressFileHandle *CFH); /* - * Read 'size' bytes of data from the file and store them into 'ptr'. - * Optionally it will store the number of bytes read in 'rsize'. + * Read up to 'size' bytes of data from the file and store them into + * 'ptr'. * - * Returns true on success and throws an internal error otherwise. + * Returns number of bytes read (this might be less than 'size' if EOF was + * reached). Exits via pg_fatal for all error conditions. */ - bool (*read_func) (void *ptr, size_t size, size_t *rsize, + size_t (*read_func) (void *ptr, size_t size, CompressFileHandle *CFH); /* * Write 'size' bytes of data into the file from 'ptr'. * - * Returns true on success and false on error. + * Returns nothing, exits via pg_fatal for all error conditions. */ - bool (*write_func) (const void *ptr, size_t size, - struct CompressFileHandle *CFH); + void (*write_func) (const void *ptr, size_t size, + CompressFileHandle *CFH); /* * Read at most size - 1 characters from the compress file handle into diff --git a/src/bin/pg_dump/compress_lz4.c b/src/bin/pg_dump/compress_lz4.c index e99f0cad71fcb..450afd4e2be44 100644 --- a/src/bin/pg_dump/compress_lz4.c +++ b/src/bin/pg_dump/compress_lz4.c @@ -12,6 +12,7 @@ *------------------------------------------------------------------------- */ #include "postgres_fe.h" +#include #include "compress_lz4.h" #include "pg_backup_utils.h" @@ -59,27 +60,17 @@ typedef struct LZ4State bool compressing; /* - * Used by the Compressor API to mark if the compression headers have been - * written after initialization. + * I/O buffer area. */ - bool needs_header_flush; - - size_t buflen; - char *buffer; - - /* - * Used by the Stream API to store already uncompressed data that the - * caller has not consumed. - */ - size_t overflowalloclen; - size_t overflowlen; - char *overflowbuf; - - /* - * Used by both APIs to keep track of the compressed data length stored in - * the buffer. 
- */ - size_t compressedlen; + char *buffer; /* buffer for compressed data */ + size_t buflen; /* allocated size of buffer */ + size_t bufdata; /* amount of valid data currently in buffer */ + /* These fields are used only while decompressing: */ + size_t bufnext; /* next buffer position to decompress */ + char *outbuf; /* buffer for decompressed data */ + size_t outbuflen; /* allocated size of outbuf */ + size_t outbufdata; /* amount of valid data currently in outbuf */ + size_t outbufnext; /* next outbuf position to return */ /* * Used by both APIs to keep track of error codes. @@ -102,8 +93,22 @@ LZ4State_compression_init(LZ4State *state) { size_t status; + /* + * Compute size needed for buffer, assuming we will present at most + * DEFAULT_IO_BUFFER_SIZE input bytes at a time. + */ state->buflen = LZ4F_compressBound(DEFAULT_IO_BUFFER_SIZE, &state->prefs); + /* + * Add some slop to ensure we're not forced to flush every time. + * + * The present slop factor of 50% is chosen so that the typical output + * block size is about 128K when DEFAULT_IO_BUFFER_SIZE = 128K. We might + * need a different slop factor to maintain that equivalence if + * DEFAULT_IO_BUFFER_SIZE is changed dramatically. + */ + state->buflen += state->buflen / 2; + /* * LZ4F_compressBegin requires a buffer that is greater or equal to * LZ4F_HEADER_SIZE_MAX. Verify that the requirement is met. @@ -119,6 +124,10 @@ LZ4State_compression_init(LZ4State *state) } state->buffer = pg_malloc(state->buflen); + + /* + * Insert LZ4 header into buffer. + */ status = LZ4F_compressBegin(state->ctx, state->buffer, state->buflen, &state->prefs); @@ -128,7 +137,7 @@ LZ4State_compression_init(LZ4State *state) return false; } - state->compressedlen = status; + state->bufdata = status; return true; } @@ -157,8 +166,8 @@ ReadDataFromArchiveLZ4(ArchiveHandle *AH, CompressorState *cs) pg_fatal("could not create LZ4 decompression context: %s", LZ4F_getErrorName(status)); - outbuf = pg_malloc0(DEFAULT_IO_BUFFER_SIZE); - readbuf = pg_malloc0(DEFAULT_IO_BUFFER_SIZE); + outbuf = pg_malloc(DEFAULT_IO_BUFFER_SIZE); + readbuf = pg_malloc(DEFAULT_IO_BUFFER_SIZE); readbuflen = DEFAULT_IO_BUFFER_SIZE; while ((r = cs->readF(AH, &readbuf, &readbuflen)) > 0) { @@ -173,7 +182,6 @@ ReadDataFromArchiveLZ4(ArchiveHandle *AH, CompressorState *cs) size_t out_size = DEFAULT_IO_BUFFER_SIZE; size_t read_size = readend - readp; - memset(outbuf, 0, DEFAULT_IO_BUFFER_SIZE); status = LZ4F_decompress(ctx, outbuf, &out_size, readp, &read_size, &dec_opt); if (LZ4F_isError(status)) @@ -200,36 +208,37 @@ WriteDataToArchiveLZ4(ArchiveHandle *AH, CompressorState *cs, { LZ4State *state = (LZ4State *) cs->private_data; size_t remaining = dLen; - size_t status; - size_t chunk; - - /* Write the header if not yet written. 
*/ - if (state->needs_header_flush) - { - cs->writeF(AH, state->buffer, state->compressedlen); - state->needs_header_flush = false; - } while (remaining > 0) { + size_t chunk; + size_t required; + size_t status; - if (remaining > DEFAULT_IO_BUFFER_SIZE) - chunk = DEFAULT_IO_BUFFER_SIZE; - else - chunk = remaining; + /* We don't try to present more than DEFAULT_IO_BUFFER_SIZE bytes */ + chunk = Min(remaining, (size_t) DEFAULT_IO_BUFFER_SIZE); + + /* If not enough space, must flush buffer */ + required = LZ4F_compressBound(chunk, &state->prefs); + if (required > state->buflen - state->bufdata) + { + cs->writeF(AH, state->buffer, state->bufdata); + state->bufdata = 0; + } - remaining -= chunk; status = LZ4F_compressUpdate(state->ctx, - state->buffer, state->buflen, + state->buffer + state->bufdata, + state->buflen - state->bufdata, data, chunk, NULL); if (LZ4F_isError(status)) pg_fatal("could not compress data: %s", LZ4F_getErrorName(status)); - cs->writeF(AH, state->buffer, status); + state->bufdata += status; - data = ((char *) data) + chunk; + data = ((const char *) data) + chunk; + remaining -= chunk; } } @@ -237,29 +246,32 @@ static void EndCompressorLZ4(ArchiveHandle *AH, CompressorState *cs) { LZ4State *state = (LZ4State *) cs->private_data; + size_t required; size_t status; /* Nothing needs to be done */ if (!state) return; - /* - * Write the header if not yet written. The caller is not required to call - * writeData if the relation does not contain any data. Thus it is - * possible to reach here without having flushed the header. Do it before - * ending the compression. - */ - if (state->needs_header_flush) - cs->writeF(AH, state->buffer, state->compressedlen); + /* We might need to flush the buffer to make room for LZ4F_compressEnd */ + required = LZ4F_compressBound(0, &state->prefs); + if (required > state->buflen - state->bufdata) + { + cs->writeF(AH, state->buffer, state->bufdata); + state->bufdata = 0; + } status = LZ4F_compressEnd(state->ctx, - state->buffer, state->buflen, + state->buffer + state->bufdata, + state->buflen - state->bufdata, NULL); if (LZ4F_isError(status)) pg_fatal("could not end compression: %s", LZ4F_getErrorName(status)); + state->bufdata += status; - cs->writeF(AH, state->buffer, status); + /* Write the final bufferload */ + cs->writeF(AH, state->buffer, state->bufdata); status = LZ4F_freeCompressionContext(state->ctx); if (LZ4F_isError(status)) @@ -301,8 +313,6 @@ InitCompressorLZ4(CompressorState *cs, const pg_compress_specification compressi pg_fatal("could not initialize LZ4 compression: %s", LZ4F_getErrorName(state->errcode)); - /* Remember that the header has not been written. */ - state->needs_header_flush = true; cs->private_data = state; } @@ -314,15 +324,16 @@ InitCompressorLZ4(CompressorState *cs, const pg_compress_specification compressi /* * LZ4 equivalent to feof() or gzeof(). Return true iff there is no - * decompressed output in the overflow buffer and the end of the backing file - * is reached. + * more buffered data and the end of the input file has been reached. */ static bool LZ4Stream_eof(CompressFileHandle *CFH) { LZ4State *state = (LZ4State *) CFH->private_data; - return state->overflowlen == 0 && feof(state->fp); + return state->outbufnext >= state->outbufdata && + state->bufnext >= state->bufdata && + feof(state->fp); } static const char * @@ -344,13 +355,15 @@ LZ4Stream_get_error(CompressFileHandle *CFH) * * Creates the necessary contexts for either compression or decompression. 
When * compressing data (indicated by compressing=true), it additionally writes the - * LZ4 header in the output stream. + * LZ4 header in the output buffer. + * + * It's expected that a not-yet-initialized LZ4State will be zero-filled. * * Returns true on success. In case of a failure returns false, and stores the * error code in state->errcode. */ static bool -LZ4Stream_init(LZ4State *state, int size, bool compressing) +LZ4Stream_init(LZ4State *state, bool compressing) { size_t status; @@ -358,20 +371,11 @@ LZ4Stream_init(LZ4State *state, int size, bool compressing) return true; state->compressing = compressing; - state->inited = true; - /* When compressing, write LZ4 header to the output stream. */ if (state->compressing) { - if (!LZ4State_compression_init(state)) return false; - - if (fwrite(state->buffer, 1, state->compressedlen, state->fp) != state->compressedlen) - { - errno = (errno) ? errno : ENOSPC; - return false; - } } else { @@ -382,65 +386,22 @@ LZ4Stream_init(LZ4State *state, int size, bool compressing) return false; } - state->buflen = Max(size, DEFAULT_IO_BUFFER_SIZE); + state->buflen = DEFAULT_IO_BUFFER_SIZE; state->buffer = pg_malloc(state->buflen); - - state->overflowalloclen = state->buflen; - state->overflowbuf = pg_malloc(state->overflowalloclen); - state->overflowlen = 0; + state->outbuflen = DEFAULT_IO_BUFFER_SIZE; + state->outbuf = pg_malloc(state->outbuflen); } + state->inited = true; return true; } -/* - * Read already decompressed content from the overflow buffer into 'ptr' up to - * 'size' bytes, if available. If the eol_flag is set, then stop at the first - * occurrence of the newline char prior to 'size' bytes. - * - * Any unread content in the overflow buffer is moved to the beginning. - * - * Returns the number of bytes read from the overflow buffer (and copied into - * the 'ptr' buffer), or 0 if the overflow buffer is empty. - */ -static int -LZ4Stream_read_overflow(LZ4State *state, void *ptr, int size, bool eol_flag) -{ - char *p; - int readlen = 0; - - if (state->overflowlen == 0) - return 0; - - if (state->overflowlen >= size) - readlen = size; - else - readlen = state->overflowlen; - - if (eol_flag && (p = memchr(state->overflowbuf, '\n', readlen))) - /* Include the line terminating char */ - readlen = p - state->overflowbuf + 1; - - memcpy(ptr, state->overflowbuf, readlen); - state->overflowlen -= readlen; - - if (state->overflowlen > 0) - memmove(state->overflowbuf, state->overflowbuf + readlen, state->overflowlen); - - return readlen; -} - /* * The workhorse for reading decompressed content out of an LZ4 compressed * stream. * * It will read up to 'ptrsize' decompressed content, or up to the new line - * char if found first when the eol_flag is set. It is possible that the - * decompressed output generated by reading any compressed input via the - * LZ4F API, exceeds 'ptrsize'. Any exceeding decompressed content is stored - * at an overflow buffer within LZ4State. Of course, when the function is - * called, it will first try to consume any decompressed content already - * present in the overflow buffer, before decompressing new content. + * char if one is found first when the eol_flag is set. * * Returns the number of bytes of decompressed data copied into the ptr * buffer, or -1 in case of error. 
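A minimal caller-side sketch of the revised stream API that this function feeds. The read_func/write_func signatures match the new compress_io.h declarations above; copy_stream itself is hypothetical and not part of the patch:

static void
copy_stream(CompressFileHandle *in, CompressFileHandle *out)
{
	char	   *buf = pg_malloc(DEFAULT_IO_BUFFER_SIZE);
	size_t		nread;

	/* a short count can only mean EOF; all error paths exit via pg_fatal */
	while ((nread = in->read_func(buf, DEFAULT_IO_BUFFER_SIZE, in)) != 0)
		out->write_func(buf, nread, out);
	pg_free(buf);
}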
@@ -449,108 +410,97 @@ static int LZ4Stream_read_internal(LZ4State *state, void *ptr, int ptrsize, bool eol_flag) { int dsize = 0; - int rsize; - int size = ptrsize; - bool eol_found = false; - - void *readbuf; + int remaining = ptrsize; /* Lazy init */ - if (!LZ4Stream_init(state, size, false /* decompressing */ )) - return -1; - - /* No work needs to be done for a zero-sized output buffer */ - if (size <= 0) - return 0; - - /* Verify that there is enough space in the outbuf */ - if (size > state->buflen) + if (!LZ4Stream_init(state, false /* decompressing */ )) { - state->buflen = size; - state->buffer = pg_realloc(state->buffer, size); + pg_log_error("unable to initialize LZ4 library: %s", + LZ4F_getErrorName(state->errcode)); + return -1; } - /* use already decompressed content if available */ - dsize = LZ4Stream_read_overflow(state, ptr, size, eol_flag); - if (dsize == size || (eol_flag && memchr(ptr, '\n', dsize))) - return dsize; - - readbuf = pg_malloc(size); - - do + /* Loop until postcondition is satisfied */ + while (remaining > 0) { - char *rp; - char *rend; - - rsize = fread(readbuf, 1, size, state->fp); - if (rsize < size && !feof(state->fp)) - return -1; - - rp = (char *) readbuf; - rend = (char *) readbuf + rsize; - - while (rp < rend) + /* + * If we already have some decompressed data, return that. + */ + if (state->outbufnext < state->outbufdata) { - size_t status; - size_t outlen = state->buflen; - size_t read_remain = rend - rp; - - memset(state->buffer, 0, outlen); - status = LZ4F_decompress(state->dtx, state->buffer, &outlen, - rp, &read_remain, NULL); - if (LZ4F_isError(status)) + char *outptr = state->outbuf + state->outbufnext; + size_t readlen = state->outbufdata - state->outbufnext; + bool eol_found = false; + + if (readlen > remaining) + readlen = remaining; + /* If eol_flag is set, don't read beyond a newline */ + if (eol_flag) { - state->errcode = status; - return -1; - } - - rp += read_remain; + char *eolptr = memchr(outptr, '\n', readlen); - /* - * fill in what space is available in ptr if the eol flag is set, - * either skip if one already found or fill up to EOL if present - * in the outbuf - */ - if (outlen > 0 && dsize < size && eol_found == false) - { - char *p; - size_t lib = (!eol_flag) ? size - dsize : size - 1 - dsize; - size_t len = outlen < lib ? outlen : lib; - - if (eol_flag && - (p = memchr(state->buffer, '\n', outlen)) && - (size_t) (p - state->buffer + 1) <= len) + if (eolptr) { - len = p - state->buffer + 1; + readlen = eolptr - outptr + 1; eol_found = true; } + } + memcpy(ptr, outptr, readlen); + ptr = ((char *) ptr) + readlen; + state->outbufnext += readlen; + dsize += readlen; + remaining -= readlen; + if (eol_found || remaining == 0) + break; + /* We must have emptied outbuf */ + Assert(state->outbufnext >= state->outbufdata); + } - memcpy((char *) ptr + dsize, state->buffer, len); - dsize += len; + /* + * If we don't have any pending compressed data, load more into + * state->buffer. 
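+ * A short fread() here with feof() unset is reported as an I/O error,
+ * while reading zero bytes at EOF simply terminates the loop.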
+ */ + if (state->bufnext >= state->bufdata) + { + size_t rsize; - /* move what did not fit, if any, at the beginning of the buf */ - if (len < outlen) - memmove(state->buffer, state->buffer + len, outlen - len); - outlen -= len; + rsize = fread(state->buffer, 1, state->buflen, state->fp); + if (rsize < state->buflen && !feof(state->fp)) + { + pg_log_error("could not read from input file: %m"); + return -1; } + if (rsize == 0) + break; /* must be EOF */ + state->bufdata = rsize; + state->bufnext = 0; + } - /* if there is available output, save it */ - if (outlen > 0) + /* + * Decompress some data into state->outbuf. + */ + { + size_t status; + size_t outlen = state->outbuflen; + size_t inlen = state->bufdata - state->bufnext; + + status = LZ4F_decompress(state->dtx, + state->outbuf, &outlen, + state->buffer + state->bufnext, + &inlen, + NULL); + if (LZ4F_isError(status)) { - while (state->overflowlen + outlen > state->overflowalloclen) - { - state->overflowalloclen *= 2; - state->overflowbuf = pg_realloc(state->overflowbuf, - state->overflowalloclen); - } - - memcpy(state->overflowbuf + state->overflowlen, state->buffer, outlen); - state->overflowlen += outlen; + state->errcode = status; + pg_log_error("could not read from input file: %s", + LZ4F_getErrorName(state->errcode)); + return -1; } + state->bufnext += inlen; + state->outbufdata = outlen; + state->outbufnext = 0; } - } while (rsize == size && dsize < size && eol_found == false); - - pg_free(readbuf); + } return dsize; } @@ -558,48 +508,57 @@ LZ4Stream_read_internal(LZ4State *state, void *ptr, int ptrsize, bool eol_flag) /* * Compress size bytes from ptr and write them to the stream. */ -static bool +static void LZ4Stream_write(const void *ptr, size_t size, CompressFileHandle *CFH) { LZ4State *state = (LZ4State *) CFH->private_data; - size_t status; - int remaining = size; + size_t remaining = size; /* Lazy init */ - if (!LZ4Stream_init(state, size, true)) - return false; + if (!LZ4Stream_init(state, true)) + pg_fatal("unable to initialize LZ4 library: %s", + LZ4F_getErrorName(state->errcode)); while (remaining > 0) { - int chunk = Min(remaining, DEFAULT_IO_BUFFER_SIZE); + size_t chunk; + size_t required; + size_t status; - remaining -= chunk; + /* We don't try to present more than DEFAULT_IO_BUFFER_SIZE bytes */ + chunk = Min(remaining, (size_t) DEFAULT_IO_BUFFER_SIZE); - status = LZ4F_compressUpdate(state->ctx, state->buffer, state->buflen, - ptr, chunk, NULL); - if (LZ4F_isError(status)) + /* If not enough space, must flush buffer */ + required = LZ4F_compressBound(chunk, &state->prefs); + if (required > state->buflen - state->bufdata) { - state->errcode = status; - return false; + errno = 0; + if (fwrite(state->buffer, 1, state->bufdata, state->fp) != state->bufdata) + { + errno = (errno) ? errno : ENOSPC; + pg_fatal("error during writing: %m"); + } + state->bufdata = 0; } - if (fwrite(state->buffer, 1, status, state->fp) != status) - { - errno = (errno) ? errno : ENOSPC; - return false; - } + status = LZ4F_compressUpdate(state->ctx, + state->buffer + state->bufdata, + state->buflen - state->bufdata, + ptr, chunk, NULL); + if (LZ4F_isError(status)) + pg_fatal("error during writing: %s", LZ4F_getErrorName(status)); + state->bufdata += status; ptr = ((const char *) ptr) + chunk; + remaining -= chunk; } - - return true; } /* * fread() equivalent implementation for LZ4 compressed files. 
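 * As with fread(), a short return count can only signify EOF, since every
 * error path inside LZ4Stream_read_internal() ends in pg_fatal() here.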
*/ -static bool -LZ4Stream_read(void *ptr, size_t size, size_t *rsize, CompressFileHandle *CFH) +static size_t +LZ4Stream_read(void *ptr, size_t size, CompressFileHandle *CFH) { LZ4State *state = (LZ4State *) CFH->private_data; int ret; @@ -607,10 +566,7 @@ LZ4Stream_read(void *ptr, size_t size, size_t *rsize, CompressFileHandle *CFH) if ((ret = LZ4Stream_read_internal(state, ptr, size, false)) < 0) pg_fatal("could not read from input file: %s", LZ4Stream_get_error(CFH)); - if (rsize) - *rsize = (size_t) ret; - - return true; + return (size_t) ret; } /* @@ -643,11 +599,13 @@ LZ4Stream_gets(char *ptr, int size, CompressFileHandle *CFH) int ret; ret = LZ4Stream_read_internal(state, ptr, size - 1, true); - if (ret < 0 || (ret == 0 && !LZ4Stream_eof(CFH))) - pg_fatal("could not read from input file: %s", LZ4Stream_get_error(CFH)); - /* Done reading */ - if (ret == 0) + /* + * LZ4Stream_read_internal returning 0 or -1 means that it was either an + * EOF or an error, but gets_func is defined to return NULL in either case + * so we can treat both the same here. + */ + if (ret <= 0) return NULL; /* @@ -668,64 +626,94 @@ LZ4Stream_close(CompressFileHandle *CFH) { FILE *fp; LZ4State *state = (LZ4State *) CFH->private_data; + size_t required; size_t status; + int ret; fp = state->fp; if (state->inited) { if (state->compressing) { - status = LZ4F_compressEnd(state->ctx, state->buffer, state->buflen, NULL); + /* We might need to flush the buffer to make room */ + required = LZ4F_compressBound(0, &state->prefs); + if (required > state->buflen - state->bufdata) + { + errno = 0; + if (fwrite(state->buffer, 1, state->bufdata, state->fp) != state->bufdata) + { + errno = (errno) ? errno : ENOSPC; + pg_log_error("could not write to output file: %m"); + } + state->bufdata = 0; + } + + status = LZ4F_compressEnd(state->ctx, + state->buffer + state->bufdata, + state->buflen - state->bufdata, + NULL); if (LZ4F_isError(status)) - pg_fatal("could not end compression: %s", - LZ4F_getErrorName(status)); - else if (fwrite(state->buffer, 1, status, state->fp) != status) + { + pg_log_error("could not end compression: %s", + LZ4F_getErrorName(status)); + } + else + state->bufdata += status; + + errno = 0; + if (fwrite(state->buffer, 1, state->bufdata, state->fp) != state->bufdata) { errno = (errno) ? 
errno : ENOSPC; - WRITE_ERROR_EXIT; + pg_log_error("could not write to output file: %m"); } status = LZ4F_freeCompressionContext(state->ctx); if (LZ4F_isError(status)) - pg_fatal("could not end compression: %s", - LZ4F_getErrorName(status)); + pg_log_error("could not end compression: %s", + LZ4F_getErrorName(status)); } else { status = LZ4F_freeDecompressionContext(state->dtx); if (LZ4F_isError(status)) - pg_fatal("could not end decompression: %s", - LZ4F_getErrorName(status)); - pg_free(state->overflowbuf); + pg_log_error("could not end decompression: %s", + LZ4F_getErrorName(status)); + pg_free(state->outbuf); } pg_free(state->buffer); } pg_free(state); + CFH->private_data = NULL; + + errno = 0; + ret = fclose(fp); + if (ret != 0) + { + pg_log_error("could not close file: %m"); + return false; + } - return fclose(fp) == 0; + return true; } static bool LZ4Stream_open(const char *path, int fd, const char *mode, CompressFileHandle *CFH) { - FILE *fp; LZ4State *state = (LZ4State *) CFH->private_data; if (fd >= 0) - fp = fdopen(fd, mode); + state->fp = fdopen(dup(fd), mode); else - fp = fopen(path, mode); - if (fp == NULL) + state->fp = fopen(path, mode); + if (state->fp == NULL) { state->errcode = errno; return false; } - state->fp = fp; - return true; } diff --git a/src/bin/pg_dump/compress_none.c b/src/bin/pg_dump/compress_none.c index 3fc89c9985461..94c155a572df0 100644 --- a/src/bin/pg_dump/compress_none.c +++ b/src/bin/pg_dump/compress_none.c @@ -22,6 +22,18 @@ *---------------------- */ +/* + * We buffer outgoing data, just to ensure that data blocks written to the + * archive file are of reasonable size. The read side could use this struct, + * but there's no need because it does not retain data across calls. + */ +typedef struct NoneCompressorState +{ + char *buffer; /* buffer for unwritten data */ + size_t buflen; /* allocated size of buffer */ + size_t bufdata; /* amount of valid data currently in buffer */ +} NoneCompressorState; + /* * Private routines */ @@ -49,13 +61,45 @@ static void WriteDataToArchiveNone(ArchiveHandle *AH, CompressorState *cs, const void *data, size_t dLen) { - cs->writeF(AH, data, dLen); + NoneCompressorState *nonecs = (NoneCompressorState *) cs->private_data; + size_t remaining = dLen; + + while (remaining > 0) + { + size_t chunk; + + /* Dump buffer if full */ + if (nonecs->bufdata >= nonecs->buflen) + { + cs->writeF(AH, nonecs->buffer, nonecs->bufdata); + nonecs->bufdata = 0; + } + /* And fill it */ + chunk = nonecs->buflen - nonecs->bufdata; + if (chunk > remaining) + chunk = remaining; + memcpy(nonecs->buffer + nonecs->bufdata, data, chunk); + nonecs->bufdata += chunk; + data = ((const char *) data) + chunk; + remaining -= chunk; + } } static void EndCompressorNone(ArchiveHandle *AH, CompressorState *cs) { - /* no op */ + NoneCompressorState *nonecs = (NoneCompressorState *) cs->private_data; + + if (nonecs) + { + /* Dump buffer if nonempty */ + if (nonecs->bufdata > 0) + cs->writeF(AH, nonecs->buffer, nonecs->bufdata); + /* Free working state */ + pg_free(nonecs->buffer); + pg_free(nonecs); + cs->private_data = NULL; + } } /* @@ -71,6 +115,22 @@ InitCompressorNone(CompressorState *cs, cs->end = EndCompressorNone; cs->compression_spec = compression_spec; + + /* + * If the caller has defined a write function, prepare the necessary + * buffer. 
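+ * The buffer only defers output: WriteDataToArchiveNone() accumulates
+ * into it, and EndCompressorNone() flushes whatever remains at close.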
+ */ + if (cs->writeF) + { + NoneCompressorState *nonecs; + + nonecs = (NoneCompressorState *) pg_malloc(sizeof(NoneCompressorState)); + nonecs->buflen = DEFAULT_IO_BUFFER_SIZE; + nonecs->buffer = pg_malloc(nonecs->buflen); + nonecs->bufdata = 0; + + cs->private_data = nonecs; + } } @@ -83,35 +143,31 @@ InitCompressorNone, * Private routines */ -static bool -read_none(void *ptr, size_t size, size_t *rsize, CompressFileHandle *CFH) +static size_t +read_none(void *ptr, size_t size, CompressFileHandle *CFH) { FILE *fp = (FILE *) CFH->private_data; size_t ret; - if (size == 0) - return true; - ret = fread(ptr, 1, size, fp); - if (ret != size && !feof(fp)) + if (ferror(fp)) pg_fatal("could not read from input file: %m"); - if (rsize) - *rsize = ret; - - return true; + return ret; } -static bool +static void write_none(const void *ptr, size_t size, CompressFileHandle *CFH) { size_t ret; + errno = 0; ret = fwrite(ptr, 1, size, (FILE *) CFH->private_data); if (ret != size) - return false; - - return true; + { + errno = (errno) ? errno : ENOSPC; + pg_fatal("could not write to file: %m"); + } } static const char * @@ -153,7 +209,12 @@ close_none(CompressFileHandle *CFH) CFH->private_data = NULL; if (fp) + { + errno = 0; ret = fclose(fp); + if (ret != 0) + pg_log_error("could not close file: %m"); + } return ret == 0; } diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c index cb595b10c2d32..36c1fd264ee14 100644 --- a/src/bin/pg_dump/compress_zstd.c +++ b/src/bin/pg_dump/compress_zstd.c @@ -13,6 +13,7 @@ */ #include "postgres_fe.h" +#include #include "compress_zstd.h" #include "pg_backup_utils.h" @@ -97,24 +98,22 @@ _ZstdWriteCommon(ArchiveHandle *AH, CompressorState *cs, bool flush) ZSTD_outBuffer *output = &zstdcs->output; /* Loop while there's any input or until flushed */ - while (input->pos != input->size || flush) + while (input->pos < input->size || flush) { size_t res; - output->pos = 0; res = ZSTD_compressStream2(zstdcs->cstream, output, input, flush ? ZSTD_e_end : ZSTD_e_continue); if (ZSTD_isError(res)) pg_fatal("could not compress data: %s", ZSTD_getErrorName(res)); - /* - * Extra paranoia: avoid zero-length chunks, since a zero length chunk - * is the EOF marker in the custom format. This should never happen - * but... - */ - if (output->pos > 0) + /* Dump output buffer if full, or if we're told to flush */ + if (output->pos >= output->size || flush) + { cs->writeF(AH, output->dst, output->pos); + output->pos = 0; + } if (res == 0) break; /* End of frame or all input consumed */ @@ -258,8 +257,8 @@ InitCompressorZstd(CompressorState *cs, * Compressed stream API */ -static bool -Zstd_read(void *ptr, size_t size, size_t *rdsize, CompressFileHandle *CFH) +static size_t +Zstd_read_internal(void *ptr, size_t size, CompressFileHandle *CFH, bool exit_on_error) { ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data; ZSTD_inBuffer *input = &zstdcs->input; @@ -268,11 +267,27 @@ Zstd_read(void *ptr, size_t size, size_t *rdsize, CompressFileHandle *CFH) size_t res, cnt; + /* + * If this is the first call to the reading function, initialize the + * required data structures.
+ */ + if (zstdcs->dstream == NULL) + { + zstdcs->input.src = pg_malloc0(input_allocated_size); + zstdcs->dstream = ZSTD_createDStream(); + if (zstdcs->dstream == NULL) + { + if (exit_on_error) + pg_fatal("could not initialize compression library"); + return -1; + } + } + output->size = size; output->dst = ptr; output->pos = 0; - for (;;) + while (output->pos < output->size) { Assert(input->pos <= input->size); Assert(input->size <= input_allocated_size); @@ -292,6 +307,13 @@ Zstd_read(void *ptr, size_t size, size_t *rdsize, CompressFileHandle *CFH) if (input->pos == input->size) { cnt = fread(unconstify(void *, input->src), 1, input_allocated_size, zstdcs->fp); + if (ferror(zstdcs->fp)) + { + if (exit_on_error) + pg_fatal("could not read from input file: %m"); + return -1; + } + input->size = cnt; Assert(cnt <= input_allocated_size); @@ -307,7 +329,11 @@ Zstd_read(void *ptr, size_t size, size_t *rdsize, CompressFileHandle *CFH) res = ZSTD_decompressStream(zstdcs->dstream, output, input); if (ZSTD_isError(res)) - pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res)); + { + if (exit_on_error) + pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res)); + return -1; + } if (output->pos == output->size) break; /* No more room for output */ @@ -315,18 +341,12 @@ Zstd_read(void *ptr, size_t size, size_t *rdsize, CompressFileHandle *CFH) if (res == 0) break; /* End of frame */ } - - if (output->pos == output->size) - break; /* We read all the data that fits */ } - if (rdsize != NULL) - *rdsize = output->pos; - - return true; + return output->pos; } -static bool +static void Zstd_write(const void *ptr, size_t size, CompressFileHandle *CFH) { ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data; @@ -339,41 +359,45 @@ Zstd_write(const void *ptr, size_t size, CompressFileHandle *CFH) input->size = size; input->pos = 0; + if (zstdcs->cstream == NULL) + { + zstdcs->output.size = ZSTD_CStreamOutSize(); + zstdcs->output.dst = pg_malloc(zstdcs->output.size); + zstdcs->output.pos = 0; + zstdcs->cstream = _ZstdCStreamParams(CFH->compression_spec); + if (zstdcs->cstream == NULL) + pg_fatal("could not initialize compression library"); + } + /* Consume all input, to be flushed later */ - while (input->pos != input->size) + while (input->pos < input->size) { - output->pos = 0; res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_continue); if (ZSTD_isError(res)) - { - zstdcs->zstderror = ZSTD_getErrorName(res); - return false; - } + pg_fatal("could not write to file: %s", ZSTD_getErrorName(res)); - cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp); - if (cnt != output->pos) + /* Dump output buffer if full */ + if (output->pos >= output->size) { - zstdcs->zstderror = strerror(errno); - return false; + errno = 0; + cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp); + if (cnt != output->pos) + { + errno = (errno) ? 
errno : ENOSPC; + pg_fatal("could not write to file: %m"); + } + output->pos = 0; } } - - return size; } static int Zstd_getc(CompressFileHandle *CFH) { - ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data; - int ret; + unsigned char ret; - if (CFH->read_func(&ret, 1, NULL, CFH) != 1) - { - if (feof(zstdcs->fp)) - pg_fatal("could not read from input file: end of file"); - else - pg_fatal("could not read from input file: %m"); - } + if (CFH->read_func(&ret, 1, CFH) != 1) + pg_fatal("could not read from input file: end of file"); return ret; } @@ -390,11 +414,7 @@ Zstd_gets(char *buf, int len, CompressFileHandle *CFH) */ for (i = 0; i < len - 1; ++i) { - size_t readsz; - - if (!CFH->read_func(&buf[i], 1, &readsz, CFH)) - break; - if (readsz != 1) + if (Zstd_read_internal(&buf[i], 1, CFH, false) != 1) break; if (buf[i] == '\n') { @@ -406,10 +426,17 @@ Zstd_gets(char *buf, int len, CompressFileHandle *CFH) return i > 0 ? buf : NULL; } +static size_t +Zstd_read(void *ptr, size_t size, CompressFileHandle *CFH) +{ + return Zstd_read_internal(ptr, size, CFH, true); +} + static bool Zstd_close(CompressFileHandle *CFH) { ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data; + bool success = true; if (zstdcs->cstream) { @@ -421,20 +448,24 @@ Zstd_close(CompressFileHandle *CFH) /* Loop until the compression buffers are fully consumed */ for (;;) { - output->pos = 0; res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_end); if (ZSTD_isError(res)) { zstdcs->zstderror = ZSTD_getErrorName(res); - return false; + success = false; + break; } + errno = 0; cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp); if (cnt != output->pos) { + errno = (errno) ? errno : ENOSPC; zstdcs->zstderror = strerror(errno); - return false; + success = false; + break; } + output->pos = 0; if (res == 0) break; /* End of frame */ @@ -450,11 +481,16 @@ Zstd_close(CompressFileHandle *CFH) pg_free(unconstify(void *, zstdcs->input.src)); } + errno = 0; if (fclose(zstdcs->fp) != 0) - return false; + { + zstdcs->zstderror = strerror(errno); + success = false; + } pg_free(zstdcs); - return true; + CFH->private_data = NULL; + return success; } static bool @@ -472,35 +508,33 @@ Zstd_open(const char *path, int fd, const char *mode, FILE *fp; ZstdCompressorState *zstdcs; + /* + * Clear state storage to avoid having the fd point to non-NULL memory on + * error return. 
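+ * (EndCompressFileHandle() only invokes close_func when private_data is
+ * set, so a stale pointer left here could lead to a double free.)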
+ */ + CFH->private_data = NULL; + + zstdcs = (ZstdCompressorState *) pg_malloc_extended(sizeof(*zstdcs), + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (!zstdcs) + { + errno = ENOMEM; + return false; + } + if (fd >= 0) - fp = fdopen(fd, mode); + fp = fdopen(dup(fd), mode); else fp = fopen(path, mode); if (fp == NULL) + { + pg_free(zstdcs); return false; + } - zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs)); - CFH->private_data = zstdcs; zstdcs->fp = fp; - - if (mode[0] == 'r') - { - zstdcs->input.src = pg_malloc0(ZSTD_DStreamInSize()); - zstdcs->dstream = ZSTD_createDStream(); - if (zstdcs->dstream == NULL) - pg_fatal("could not initialize compression library"); - } - else if (mode[0] == 'w' || mode[0] == 'a') - { - zstdcs->output.size = ZSTD_CStreamOutSize(); - zstdcs->output.dst = pg_malloc0(zstdcs->output.size); - zstdcs->cstream = _ZstdCStreamParams(CFH->compression_spec); - if (zstdcs->cstream == NULL) - pg_fatal("could not initialize compression library"); - } - else - pg_fatal("unhandled mode \"%s\"", mode); + CFH->private_data = zstdcs; return true; } diff --git a/src/bin/pg_dump/dumputils.c b/src/bin/pg_dump/dumputils.c index 73ce34346b278..2d22723aa9112 100644 --- a/src/bin/pg_dump/dumputils.c +++ b/src/bin/pg_dump/dumputils.c @@ -21,6 +21,7 @@ #include "dumputils.h" #include "fe_utils/string_utils.h" +static const char restrict_chars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; static bool parseAclItem(const char *item, const char *type, const char *name, const char *subname, int remoteVersion, @@ -31,6 +32,43 @@ static void AddAcl(PQExpBuffer aclbuf, const char *keyword, const char *subname); +/* + * Sanitize a string to be included in an SQL comment or TOC listing, by + * replacing any newlines with spaces. This ensures each logical output line + * is in fact one physical output line, to prevent corruption of the dump + * (which could, in the worst case, present an SQL injection vulnerability + * if someone were to incautiously load a dump containing objects with + * maliciously crafted names). + * + * The result is a freshly malloc'd string. If the input string is NULL, + * return a malloc'ed empty string, unless want_hyphen, in which case return a + * malloc'ed hyphen. + * + * Note that we currently don't bother to quote names, meaning that the name + * fields aren't automatically parseable. "pg_restore -L" doesn't care because + * it only examines the dumpId field, but someday we might want to try harder. + */ +char * +sanitize_line(const char *str, bool want_hyphen) +{ + char *result; + char *s; + + if (!str) + return pg_strdup(want_hyphen ? "-" : ""); + + result = pg_strdup(str); + + for (s = result; *s != '\0'; s++) + { + if (*s == '\n' || *s == '\r') + *s = ' '; + } + + return result; +} + + /* * Build GRANT/REVOKE command(s) for an object. * @@ -686,7 +724,7 @@ emitShSecLabels(PGconn *conn, PGresult *res, PQExpBuffer buffer, * currently known to guc.c, so that it'd be unsafe for extensions to declare * GUC_LIST_QUOTE variables anyway. Lacking a solution for that, it doesn't * seem worth the work to do more than have this list, which must be kept in - * sync with the variables actually marked GUC_LIST_QUOTE in guc_tables.c. + * sync with the variables actually marked GUC_LIST_QUOTE in guc_parameters.dat. 
*/ bool variable_is_guc_list_quote(const char *name) @@ -743,7 +781,7 @@ SplitGUCList(char *rawstring, char separator, nextp++; /* skip leading whitespace */ if (*nextp == '\0') - return true; /* allow empty string */ + return true; /* empty string represents empty list */ /* At the top of the loop, we are at start of a new identifier. */ do @@ -855,6 +893,7 @@ makeAlterConfigCommand(PGconn *conn, const char *configitem, * elements as string literals. (The elements may be double-quoted as-is, * but we can't just feed them to the SQL parser; it would do the wrong * thing with elements that are zero-length or longer than NAMEDATALEN.) + * Also, we need a special case for empty lists. * * Variables that are not so marked should just be emitted as simple * string literals. If the variable is not known to @@ -870,6 +909,9 @@ makeAlterConfigCommand(PGconn *conn, const char *configitem, /* this shouldn't fail really */ if (SplitGUCList(pos, ',', &namelist)) { + /* Special case: represent an empty list as NULL */ + if (*namelist == NULL) + appendPQExpBufferStr(buf, "NULL"); for (nameptr = namelist; *nameptr; nameptr++) { if (nameptr != namelist) @@ -920,3 +962,40 @@ create_or_open_dir(const char *dirname) pg_fatal("directory \"%s\" is not empty", dirname); } } + +/* + * Generates a valid restrict key (i.e., an alphanumeric string) for use with + * psql's \restrict and \unrestrict meta-commands. For safety, the value is + * chosen at random. + */ +char * +generate_restrict_key(void) +{ + uint8 buf[64]; + char *ret = palloc(sizeof(buf)); + + if (!pg_strong_random(buf, sizeof(buf))) + return NULL; + + for (int i = 0; i < sizeof(buf) - 1; i++) + { + uint8 idx = buf[i] % strlen(restrict_chars); + + ret[i] = restrict_chars[idx]; + } + ret[sizeof(buf) - 1] = '\0'; + + return ret; +} + +/* + * Checks that a given restrict key (intended for use with psql's \restrict and + * \unrestrict meta-commands) contains only alphanumeric characters. + */ +bool +valid_restrict_key(const char *restrict_key) +{ + return restrict_key != NULL && + restrict_key[0] != '\0' && + strspn(restrict_key, restrict_chars) == strlen(restrict_key); +} diff --git a/src/bin/pg_dump/dumputils.h b/src/bin/pg_dump/dumputils.h index 91c6e612e282e..0fa47c9e3cf79 100644 --- a/src/bin/pg_dump/dumputils.h +++ b/src/bin/pg_dump/dumputils.h @@ -25,9 +25,10 @@ * We don't print the timezone on Windows, because the names are long and * localized, which means they may contain characters in various random * encodings; this has been seen to cause encoding errors when reading the - * dump script. Think not to get around that by using %z, because - * (1) %z is not portable to pre-C99 systems, and - * (2) %z doesn't actually act differently from %Z on Windows anyway. + * dump script. One could now possibly get around that by using %z, but %z + * was previously not portable to pre-C99 systems, and also previously %z + * didn't actually act differently from %Z on Windows. But both of these problems + * might be obsolete now.
*/ #ifndef WIN32 #define PGDUMP_STRFTIME_FMT "%Y-%m-%d %H:%M:%S %Z" @@ -36,6 +37,7 @@ #endif +extern char *sanitize_line(const char *str, bool want_hyphen); extern bool buildACLCommands(const char *name, const char *subname, const char *nspname, const char *type, const char *acls, const char *baseacls, const char *owner, const char *prefix, int remoteVersion, @@ -64,4 +66,7 @@ extern void makeAlterConfigCommand(PGconn *conn, const char *configitem, PQExpBuffer buf); extern void create_or_open_dir(const char *dirname); +extern char *generate_restrict_key(void); +extern bool valid_restrict_key(const char *restrict_key); + #endif /* DUMPUTILS_H */ diff --git a/src/bin/pg_dump/filter.c b/src/bin/pg_dump/filter.c index 7214d51413771..e3cdcf4097563 100644 --- a/src/bin/pg_dump/filter.c +++ b/src/bin/pg_dump/filter.c @@ -171,9 +171,8 @@ pg_log_filter_error(FilterStateData *fstate, const char *fmt,...) /* * filter_get_keyword - read the next filter keyword from buffer * - * Search for keywords (limited to ascii alphabetic characters) in - * the passed in line buffer. Returns NULL when the buffer is empty or the first - * char is not alpha. The char '_' is allowed, except as the first character. + * Search for keywords (strings of non-whitespace characters) in the passed + * in line buffer. Returns NULL when the buffer is empty or no keyword exists. * The length of the found keyword is returned in the size parameter. */ static const char * @@ -182,6 +181,9 @@ filter_get_keyword(const char **line, int *size) const char *ptr = *line; const char *result = NULL; + /* The passed buffer must not be NULL */ + Assert(*line != NULL); + /* Set returned length preemptively in case no keyword is found */ *size = 0; @@ -189,11 +191,12 @@ filter_get_keyword(const char **line, int *size) while (isspace((unsigned char) *ptr)) ptr++; - if (isalpha((unsigned char) *ptr)) + /* Grab one keyword that's the string of non-whitespace characters */ + if (*ptr != '\0' && !isspace((unsigned char) *ptr)) { result = ptr++; - while (isalpha((unsigned char) *ptr) || *ptr == '_') + while (*ptr != '\0' && !isspace((unsigned char) *ptr)) ptr++; *size = ptr - result; diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build index d8e9e101254b1..f3c669f484ed7 100644 --- a/src/bin/pg_dump/meson.build +++ b/src/bin/pg_dump/meson.build @@ -91,9 +91,9 @@ tests += { 'bd': meson.current_build_dir(), 'tap': { 'env': { - 'GZIP_PROGRAM': gzip.found() ? gzip.path() : '', - 'LZ4': program_lz4.found() ? program_lz4.path() : '', - 'ZSTD': program_zstd.found() ? program_zstd.path() : '', + 'GZIP_PROGRAM': gzip.found() ? gzip.full_path() : '', + 'LZ4': program_lz4.found() ? program_lz4.full_path() : '', + 'ZSTD': program_zstd.found() ? program_zstd.full_path() : '', 'with_icu': icu.found() ? 'yes' : 'no', }, 'tests': [ @@ -102,7 +102,7 @@ tests += { 't/003_pg_dump_with_server.pl', 't/004_pg_dump_parallel.pl', 't/005_pg_dump_filterfile.pl', - 't/006_pg_dumpall.pl', + 't/006_pg_dump_compress.pl', 't/010_dump_connstr.pl', ], }, diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c index 5974d6706fd57..086adcdc50295 100644 --- a/src/bin/pg_dump/parallel.c +++ b/src/bin/pg_dump/parallel.c @@ -333,16 +333,6 @@ on_exit_close_archive(Archive *AHX) on_exit_nicely(archive_close_connection, &shutdown_info); } -/* - * When pg_restore restores multiple databases, then update already added entry - * into array for cleanup. 
- */ -void -replace_on_exit_close_archive(Archive *AHX) -{ - shutdown_info.AHX = AHX; -} - /* * on_exit_nicely handler for shutting down database connections and * worker processes cleanly. diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index af0007fb6d2f1..d9041dad72068 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -163,6 +163,8 @@ typedef struct _restoreOptions bool dumpSchema; bool dumpData; bool dumpStatistics; + + char *restrict_key; } RestoreOptions; typedef struct _dumpOptions @@ -213,6 +215,8 @@ typedef struct _dumpOptions bool dumpSchema; bool dumpData; bool dumpStatistics; + + char *restrict_key; } DumpOptions; /* @@ -308,7 +312,7 @@ extern void SetArchiveOptions(Archive *AH, DumpOptions *dopt, RestoreOptions *ro extern void ProcessArchiveRestoreOptions(Archive *AHX); -extern void RestoreArchive(Archive *AHX, bool append_data); +extern void RestoreArchive(Archive *AHX); /* Open an existing archive */ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index afa42337b110f..4a63f7392ae89 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -31,6 +31,8 @@ #endif #include "catalog/pg_class_d.h" +#include "catalog/pg_largeobject_metadata_d.h" +#include "catalog/pg_shdepend_d.h" #include "common/string.h" #include "compress_io.h" #include "dumputils.h" @@ -57,7 +59,6 @@ static ArchiveHandle *_allocAH(const char *FileSpec, const ArchiveFormat fmt, DataDirSyncMethod sync_method); static void _getObjectDescription(PQExpBuffer buf, const TocEntry *te); static void _printTocEntry(ArchiveHandle *AH, TocEntry *te, const char *pfx); -static char *sanitize_line(const char *str, bool want_hyphen); static void _doSetFixedOutputState(ArchiveHandle *AH); static void _doSetSessionAuth(ArchiveHandle *AH, const char *user); static void _reconnectToDB(ArchiveHandle *AH, const char *dbname); @@ -85,7 +86,7 @@ static int RestoringToDB(ArchiveHandle *AH); static void dump_lo_buf(ArchiveHandle *AH); static void dumpTimestamp(ArchiveHandle *AH, const char *msg, time_t tim); static void SetOutput(ArchiveHandle *AH, const char *filename, - const pg_compress_specification compression_spec, bool append_data); + const pg_compress_specification compression_spec); static CompressFileHandle *SaveOutput(ArchiveHandle *AH); static void RestoreOutput(ArchiveHandle *AH, CompressFileHandle *savedOutput); @@ -152,7 +153,7 @@ InitDumpOptions(DumpOptions *opts) opts->dumpSections = DUMP_UNSECTIONED; opts->dumpSchema = true; opts->dumpData = true; - opts->dumpStatistics = true; + opts->dumpStatistics = false; } /* @@ -196,6 +197,7 @@ dumpOptionsFromRestoreOptions(RestoreOptions *ropt) dopt->include_everything = ropt->include_everything; dopt->enable_row_security = ropt->enable_row_security; dopt->sequence_data = ropt->sequence_data; + dopt->restrict_key = ropt->restrict_key ? pg_strdup(ropt->restrict_key) : NULL; return dopt; } @@ -337,14 +339,9 @@ ProcessArchiveRestoreOptions(Archive *AHX) StrictNamesCheck(ropt); } -/* - * RestoreArchive - * - * If append_data is set, then append data into file as we are restoring dump - * of multiple databases which was taken by pg_dumpall. 
- */ +/* Public */ void -RestoreArchive(Archive *AHX, bool append_data) +RestoreArchive(Archive *AHX) { ArchiveHandle *AH = (ArchiveHandle *) AHX; RestoreOptions *ropt = AH->public.ropt; @@ -461,10 +458,21 @@ RestoreArchive(Archive *AHX, bool append_data) */ sav = SaveOutput(AH); if (ropt->filename || ropt->compression_spec.algorithm != PG_COMPRESSION_NONE) - SetOutput(AH, ropt->filename, ropt->compression_spec, append_data); + SetOutput(AH, ropt->filename, ropt->compression_spec); ahprintf(AH, "--\n-- PostgreSQL database dump\n--\n\n"); + /* + * If generating plain-text output, enter restricted mode to block any + * unexpected psql meta-commands. A malicious source might try to inject + * a variety of things via bogus responses to queries. While we cannot + * prevent such sources from affecting the destination at restore time, we + * can block psql meta-commands so that the client machine that runs psql + * with the dump output remains unaffected. + */ + if (ropt->restrict_key) + ahprintf(AH, "\\restrict %s\n\n", ropt->restrict_key); + if (AH->archiveRemoteVersion) ahprintf(AH, "-- Dumped from database version %s\n", AH->archiveRemoteVersion); @@ -805,6 +813,14 @@ RestoreArchive(Archive *AHX, bool append_data) ahprintf(AH, "--\n-- PostgreSQL database dump complete\n--\n\n"); + /* + * If generating plain-text output, exit restricted mode at the very end + * of the script. This is not pro forma; in particular, pg_dumpall + * requires this when transitioning from one database to another. + */ + if (ropt->restrict_key) + ahprintf(AH, "\\unrestrict %s\n\n", ropt->restrict_key); + /* * Clean up & we're done. */ @@ -1300,7 +1316,7 @@ PrintTOCSummary(Archive *AHX) sav = SaveOutput(AH); if (ropt->filename) - SetOutput(AH, ropt->filename, out_compression_spec, false); + SetOutput(AH, ropt->filename, out_compression_spec); if (strftime(stamp_str, sizeof(stamp_str), PGDUMP_STRFTIME_FMT, localtime(&AH->createDate)) == 0) @@ -1330,8 +1346,8 @@ PrintTOCSummary(Archive *AHX) ahprintf(AH, "; Dump Version: %d.%d-%d\n", ARCHIVE_MAJOR(AH->version), ARCHIVE_MINOR(AH->version), ARCHIVE_REV(AH->version)); ahprintf(AH, "; Format: %s\n", fmtName); - ahprintf(AH, "; Integer: %d bytes\n", (int) AH->intSize); - ahprintf(AH, "; Offset: %d bytes\n", (int) AH->offSize); + ahprintf(AH, "; Integer: %zu bytes\n", AH->intSize); + ahprintf(AH, "; Offset: %zu bytes\n", AH->offSize); if (AH->archiveRemoteVersion) ahprintf(AH, "; Dumped from database version: %s\n", AH->archiveRemoteVersion); @@ -1679,8 +1695,7 @@ archprintf(Archive *AH, const char *fmt,...) 
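To visualize the hunks above: a plain-text dump now brackets everything between the opening and closing banners with a \restrict/\unrestrict pair (and, as _reconnectToDB shows further down, briefly exits restricted mode around \connect when switching databases). A minimal sketch of the emission order, with a hypothetical stand-in options struct:

#include <stdio.h>

typedef struct
{
    const char *restrict_key;   /* NULL for non-plain-text output */
} SketchOpts;

static void
emit_script(FILE *out, const SketchOpts *opt)
{
    fprintf(out, "--\n-- PostgreSQL database dump\n--\n\n");

    /* Enter restricted mode before any content a source could influence. */
    if (opt->restrict_key)
        fprintf(out, "\\restrict %s\n\n", opt->restrict_key);

    fprintf(out, "-- ... SET commands, DDL, and COPY data ...\n\n");

    fprintf(out, "--\n-- PostgreSQL database dump complete\n--\n\n");

    /*
     * Exit restricted mode at the very end; pg_dumpall relies on this when
     * moving on to the next database.
     */
    if (opt->restrict_key)
        fprintf(out, "\\unrestrict %s\n\n", opt->restrict_key);
}

int
main(void)
{
    SketchOpts  opt = {"a1b2c3"};   /* made-up key */

    emit_script(stdout, &opt);
    return 0;
}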
static void SetOutput(ArchiveHandle *AH, const char *filename, - const pg_compress_specification compression_spec, - bool append_data) + const pg_compress_specification compression_spec) { CompressFileHandle *CFH; const char *mode; @@ -1700,7 +1715,7 @@ SetOutput(ArchiveHandle *AH, const char *filename, else fn = fileno(stdout); - if (append_data || AH->mode == archModeAppend) + if (AH->mode == archModeAppend) mode = PG_BINARY_A; else mode = PG_BINARY_W; @@ -1868,8 +1883,8 @@ ahwrite(const void *ptr, size_t size, size_t nmemb, ArchiveHandle *AH) { CompressFileHandle *CFH = (CompressFileHandle *) AH->OF; - if (CFH->write_func(ptr, size * nmemb, CFH)) - bytes_written = size * nmemb; + CFH->write_func(ptr, size * nmemb, CFH); + bytes_written = size * nmemb; } if (bytes_written != size * nmemb) @@ -2052,7 +2067,7 @@ WriteOffset(ArchiveHandle *AH, pgoff_t o, int wasSet) } int -ReadOffset(ArchiveHandle *AH, pgoff_t * o) +ReadOffset(ArchiveHandle *AH, pgoff_t *o) { int i; int off; @@ -2292,8 +2307,7 @@ _discoverArchiveFormat(ArchiveHandle *AH) if (ferror(fh)) pg_fatal("could not read input file: %m"); else - pg_fatal("input file is too short (read %lu, expected 5)", - (unsigned long) cnt); + pg_fatal("input file is too short (read %zu, expected 5)", cnt); } /* Save it, just in case we need it later */ @@ -2655,7 +2669,7 @@ WriteToc(ArchiveHandle *AH) pg_fatal("unexpected TOC entry in WriteToc(): %d %s %s", te->dumpId, te->desc, te->tag); - if (fseeko(AH->FH, te->defnLen, SEEK_CUR != 0)) + if (fseeko(AH->FH, te->defnLen, SEEK_CUR) != 0) pg_fatal("error during file seek: %m"); } else if (te->defnDumper) @@ -2974,6 +2988,19 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH) int res = REQ_SCHEMA | REQ_DATA; RestoreOptions *ropt = AH->public.ropt; + /* + * For binary upgrade mode, dump pg_largeobject_metadata and the + * associated pg_shdepend rows. This is faster to restore than the + * equivalent set of large object commands. We can only do this for + * upgrades from v12 and newer; in older versions, pg_largeobject_metadata + * was created WITH OIDS, so the OID column is hidden and won't be dumped. + */ + if (ropt->binary_upgrade && AH->public.remoteVersion >= 120000 && + strcmp(te->desc, "TABLE DATA") == 0 && + (te->catalogId.oid == LargeObjectMetadataRelationId || + te->catalogId.oid == SharedDependRelationId)) + return REQ_DATA; + /* These items are treated specially */ if (strcmp(te->desc, "ENCODING") == 0 || strcmp(te->desc, "STDSTRINGS") == 0 || @@ -3020,6 +3047,25 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH) strcmp(te->desc, "ROW SECURITY") == 0)) return 0; + /* + * If it's a comment on a policy, a publication, or a subscription, maybe + * ignore it. + */ + if (strcmp(te->desc, "COMMENT") == 0) + { + if (ropt->no_policies && + strncmp(te->tag, "POLICY", strlen("POLICY")) == 0) + return 0; + + if (ropt->no_publications && + strncmp(te->tag, "PUBLICATION", strlen("PUBLICATION")) == 0) + return 0; + + if (ropt->no_subscriptions && + strncmp(te->tag, "SUBSCRIPTION", strlen("SUBSCRIPTION")) == 0) + return 0; + } + /* * If it's a publication or a table part of a publication, maybe ignore * it. @@ -3034,6 +3080,21 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH) if (ropt->no_security_labels && strcmp(te->desc, "SECURITY LABEL") == 0) return 0; + /* + * If it's a security label on a publication or a subscription, maybe + * ignore it. 
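The dependent-object tests above all follow one shape: a TOC entry's tag begins with its parent object's type name, so a strncmp() prefix check is enough to route --no-policies, --no-publications, and --no-subscriptions. A self-contained sketch of that routing (the options struct and helper name are stand-ins):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct
{
    bool        no_policies;
    bool        no_publications;
    bool        no_subscriptions;
} SketchOpts;

static bool
comment_suppressed(const SketchOpts *opt, const char *tag)
{
    if (opt->no_policies &&
        strncmp(tag, "POLICY", strlen("POLICY")) == 0)
        return true;
    if (opt->no_publications &&
        strncmp(tag, "PUBLICATION", strlen("PUBLICATION")) == 0)
        return true;
    if (opt->no_subscriptions &&
        strncmp(tag, "SUBSCRIPTION", strlen("SUBSCRIPTION")) == 0)
        return true;
    return false;
}

int
main(void)
{
    SketchOpts  opt = {true, false, false};

    printf("%s\n", comment_suppressed(&opt, "POLICY p1 ON t1") ?
           "suppressed" : "kept");
    return 0;
}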
+ */ + if (strcmp(te->desc, "SECURITY LABEL") == 0) + { + if (ropt->no_publications && + strncmp(te->tag, "PUBLICATION", strlen("PUBLICATION")) == 0) + return 0; + + if (ropt->no_subscriptions && + strncmp(te->tag, "SUBSCRIPTION", strlen("SUBSCRIPTION")) == 0) + return 0; + } + /* If it's a subscription, maybe ignore it */ if (ropt->no_subscriptions && strcmp(te->desc, "SUBSCRIPTION") == 0) return 0; @@ -3278,12 +3339,14 @@ _tocEntryRestorePass(TocEntry *te) return RESTORE_PASS_POST_ACL; /* - * Comments need to be emitted in the same pass as their parent objects. - * ACLs haven't got comments, and neither do matview data objects, but - * event triggers do. (Fortunately, event triggers haven't got ACLs, or - * we'd need yet another weird special case.) + * Comments and security labels need to be emitted in the same pass as + * their parent objects. ACLs haven't got comments and security labels, + * and neither do matview data objects, but event triggers do. + * (Fortunately, event triggers haven't got ACLs, or we'd need yet another + * weird special case.) */ - if (strcmp(te->desc, "COMMENT") == 0 && + if ((strcmp(te->desc, "COMMENT") == 0 || + strcmp(te->desc, "SECURITY LABEL") == 0) && strncmp(te->tag, "EVENT TRIGGER ", 14) == 0) return RESTORE_PASS_POST_ACL; @@ -3444,11 +3507,21 @@ _reconnectToDB(ArchiveHandle *AH, const char *dbname) else { PQExpBufferData connectbuf; + RestoreOptions *ropt = AH->public.ropt; + + /* + * We must temporarily exit restricted mode for \connect, etc. + * Anything added between this line and the following \restrict must + * be careful to avoid any possible meta-command injection vectors. + */ + ahprintf(AH, "\\unrestrict %s\n", ropt->restrict_key); initPQExpBuffer(&connectbuf); appendPsqlMetaConnect(&connectbuf, dbname); - ahprintf(AH, "%s\n", connectbuf.data); + ahprintf(AH, "%s", connectbuf.data); termPQExpBuffer(&connectbuf); + + ahprintf(AH, "\\restrict %s\n\n", ropt->restrict_key); } /* @@ -4041,42 +4114,6 @@ _printTocEntry(ArchiveHandle *AH, TocEntry *te, const char *pfx) } } -/* - * Sanitize a string to be included in an SQL comment or TOC listing, by - * replacing any newlines with spaces. This ensures each logical output line - * is in fact one physical output line, to prevent corruption of the dump - * (which could, in the worst case, present an SQL injection vulnerability - * if someone were to incautiously load a dump containing objects with - * maliciously crafted names). - * - * The result is a freshly malloc'd string. If the input string is NULL, - * return a malloc'ed empty string, unless want_hyphen, in which case return a - * malloc'ed hyphen. - * - * Note that we currently don't bother to quote names, meaning that the name - * fields aren't automatically parseable. "pg_restore -L" doesn't care because - * it only examines the dumpId field, but someday we might want to try harder. - */ -static char * -sanitize_line(const char *str, bool want_hyphen) -{ - char *result; - char *s; - - if (!str) - return pg_strdup(want_hyphen ? 
"-" : ""); - - result = pg_strdup(str); - - for (s = result; *s != '\0'; s++) - { - if (*s == '\n' || *s == '\r') - *s = ' '; - } - - return result; -} - /* * Write the file header for a custom-format archive */ @@ -4147,8 +4184,7 @@ ReadHead(ArchiveHandle *AH) AH->intSize = AH->ReadBytePtr(AH); if (AH->intSize > 32) - pg_fatal("sanity check on integer size (%lu) failed", - (unsigned long) AH->intSize); + pg_fatal("sanity check on integer size (%zu) failed", AH->intSize); if (AH->intSize > sizeof(int)) pg_log_warning("archive was made on a machine with larger integers, some operations might fail"); diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index 365073b3eae45..325b53fc9bd4b 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -394,7 +394,6 @@ struct _tocEntry extern int parallel_restore(ArchiveHandle *AH, TocEntry *te); extern void on_exit_close_archive(Archive *AHX); -extern void replace_on_exit_close_archive(Archive *AHX); extern void warn_or_exit_horribly(ArchiveHandle *AH, const char *fmt,...) pg_attribute_printf(2, 3); diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index f7c3af56304ce..2226520dffca9 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -624,12 +624,19 @@ _skipData(ArchiveHandle *AH) lclContext *ctx = (lclContext *) AH->formatData; size_t blkLen; char *buf = NULL; - int buflen = 0; + size_t buflen = 0; blkLen = ReadInt(AH); while (blkLen != 0) { - if (ctx->hasSeek) + /* + * Seeks of less than stdio's buffer size are less efficient than just + * reading the data, at least on common platforms. We don't know the + * buffer size for sure, but 4kB is the usual value. (While pg_dump + * currently tries to avoid producing such short data blocks, older + * dump files often contain them.) 
+ */ + if (ctx->hasSeek && blkLen >= 4 * 1024) { if (fseeko(AH->FH, blkLen, SEEK_CUR) != 0) pg_fatal("error during file seek: %m"); @@ -639,8 +646,8 @@ _skipData(ArchiveHandle *AH) if (blkLen > buflen) { free(buf); - buf = (char *) pg_malloc(blkLen); - buflen = blkLen; + buflen = Max(blkLen, 4 * 1024); + buf = (char *) pg_malloc(buflen); } if (fread(buf, 1, blkLen, AH->FH) != blkLen) { diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c index 21b00792a8a48..94d401d8a4e5a 100644 --- a/src/bin/pg_dump/pg_backup_directory.c +++ b/src/bin/pg_dump/pg_backup_directory.c @@ -316,15 +316,9 @@ _WriteData(ArchiveHandle *AH, const void *data, size_t dLen) lclContext *ctx = (lclContext *) AH->formatData; CompressFileHandle *CFH = ctx->dataFH; - errno = 0; - if (dLen > 0 && !CFH->write_func(data, dLen, CFH)) - { - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - pg_fatal("could not write to output file: %s", - CFH->get_error_func(CFH)); - } + if (dLen <= 0) + return; + CFH->write_func(data, dLen, CFH); } /* @@ -351,7 +345,7 @@ _EndData(ArchiveHandle *AH, TocEntry *te) static void _PrintFileData(ArchiveHandle *AH, char *filename) { - size_t cnt = 0; + size_t cnt; char *buf; size_t buflen; CompressFileHandle *CFH; @@ -366,7 +360,7 @@ _PrintFileData(ArchiveHandle *AH, char *filename) buflen = DEFAULT_IO_BUFFER_SIZE; buf = pg_malloc(buflen); - while (CFH->read_func(buf, buflen, &cnt, CFH) && cnt > 0) + while ((cnt = CFH->read_func(buf, buflen, CFH)) > 0) { ahwrite(buf, 1, cnt, AH); } @@ -412,10 +406,15 @@ _LoadLOs(ArchiveHandle *AH, TocEntry *te) /* * Note: before archive v16, there was always only one BLOBS TOC entry, - * now there can be multiple. We don't need to worry what version we are - * reading though, because tctx->filename should be correct either way. + * now there can be multiple. Furthermore, although the actual filename + * was always "blobs.toc" before v16, the value of tctx->filename did not + * match that before commit 548e50976 fixed it. For simplicity we assume + * it must be "blobs.toc" in all archives before v16. */ - setFilePath(AH, tocfname, tctx->filename); + if (AH->version < K_VERS_1_16) + setFilePath(AH, tocfname, "blobs.toc"); + else + setFilePath(AH, tocfname, tctx->filename); CFH = ctx->LOsTocFH = InitDiscoverCompressFileHandle(tocfname, PG_BINARY_R); @@ -465,16 +464,7 @@ _WriteByte(ArchiveHandle *AH, const int i) lclContext *ctx = (lclContext *) AH->formatData; CompressFileHandle *CFH = ctx->dataFH; - errno = 0; - if (!CFH->write_func(&c, 1, CFH)) - { - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - pg_fatal("could not write to output file: %s", - CFH->get_error_func(CFH)); - } - + CFH->write_func(&c, 1, CFH); return 1; } @@ -503,15 +493,7 @@ _WriteBuf(ArchiveHandle *AH, const void *buf, size_t len) lclContext *ctx = (lclContext *) AH->formatData; CompressFileHandle *CFH = ctx->dataFH; - errno = 0; - if (!CFH->write_func(buf, len, CFH)) - { - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - pg_fatal("could not write to output file: %s", - CFH->get_error_func(CFH)); - } + CFH->write_func(buf, len, CFH); } /* @@ -526,10 +508,10 @@ _ReadBuf(ArchiveHandle *AH, void *buf, size_t len) CompressFileHandle *CFH = ctx->dataFH; /* - * If there was an I/O error, we already exited in readF(), so here we - * exit on short reads. + * We do not expect a short read, so fail if we get one. 
The read_func + * already dealt with any outright I/O error. */ - if (!CFH->read_func(buf, len, NULL, CFH)) + if (CFH->read_func(buf, len, CFH) != len) pg_fatal("could not read from input file: end of file"); } @@ -672,14 +654,7 @@ _EndLO(ArchiveHandle *AH, TocEntry *te, Oid oid) /* register the LO in blobs_NNN.toc */ len = snprintf(buf, sizeof(buf), "%u blob_%u.dat\n", oid, oid); - if (!CFH->write_func(buf, len, CFH)) - { - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - pg_fatal("could not write to LOs TOC file: %s", - CFH->get_error_func(CFH)); - } + CFH->write_func(buf, len, CFH); } /* diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index d94d0de2a5d17..b5ba3b46dd999 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -826,7 +826,7 @@ _CloseArchive(ArchiveHandle *AH) savVerbose = AH->public.verbose; AH->public.verbose = 0; - RestoreArchive((Archive *) AH, false); + RestoreArchive((Archive *) AH); SetArchiveOptions((Archive *) AH, savDopt, savRopt); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 37432e66efd7c..27f6be3f0f82b 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -47,10 +47,13 @@ #include "catalog/pg_authid_d.h" #include "catalog/pg_cast_d.h" #include "catalog/pg_class_d.h" +#include "catalog/pg_constraint_d.h" #include "catalog/pg_default_acl_d.h" #include "catalog/pg_largeobject_d.h" +#include "catalog/pg_largeobject_metadata_d.h" #include "catalog/pg_proc_d.h" #include "catalog/pg_publication_d.h" +#include "catalog/pg_shdepend_d.h" #include "catalog/pg_subscription_d.h" #include "catalog/pg_type_d.h" #include "common/connect.h" @@ -209,6 +212,12 @@ static int nbinaryUpgradeClassOids = 0; static SequenceItem *sequences = NULL; static int nsequences = 0; +/* + * For binary upgrade, the dump ID of pg_largeobject_metadata is saved for use + * as a dependency for pg_shdepend and any large object comments/seclabels. + */ +static DumpId lo_metadata_dumpId; + /* Maximum number of relations to fetch in a fetchAttributeStats() call. 
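Two conventions from the custom- and directory-format hunks above are worth restating: write_func now raises a fatal error itself (so callers shed their ENOSPC boilerplate), and read_func returns a byte count that callers simply compare against the request. The _skipData() change applies the same spirit to seeking: only skip blocks of at least 4 kB, and read shorter ones through a grow-once scratch buffer. A sketch of just that heuristic, assuming a seekable POSIX stream; the helper name and threshold constant are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

#define SKIP_THRESHOLD  (4 * 1024)  /* assumed stdio buffer size */

/* Skip blkLen bytes of fh: seek past big blocks, read small ones. */
static int
skip_block(FILE *fh, size_t blkLen, char **buf, size_t *buflen)
{
    if (blkLen >= SKIP_THRESHOLD)
        return fseeko(fh, (off_t) blkLen, SEEK_CUR) == 0;

    /*
     * Grow the scratch buffer at most once, and never below the threshold
     * (mirroring the Max() in the real code, which also serves the
     * non-seekable path).
     */
    if (blkLen > *buflen)
    {
        free(*buf);
        *buflen = blkLen > SKIP_THRESHOLD ? blkLen : SKIP_THRESHOLD;
        *buf = malloc(*buflen);
        if (*buf == NULL)
            return 0;
    }
    return fread(*buf, 1, blkLen, fh) == blkLen;
}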
*/ #define MAX_ATTR_STATS_RELS 64 @@ -350,7 +359,9 @@ static void buildMatViewRefreshDependencies(Archive *fout); static void getTableDataFKConstraints(void); static void determineNotNullFlags(Archive *fout, PGresult *res, int r, TableInfo *tbinfo, int j, - int i_notnull_name, int i_notnull_invalidoid, + int i_notnull_name, + int i_notnull_comment, + int i_notnull_invalidoid, int i_notnull_noinherit, int i_notnull_islocal, PQExpBuffer *invalidnotnulloids); @@ -438,8 +449,6 @@ main(int argc, char **argv) bool data_only = false; bool schema_only = false; bool statistics_only = false; - bool with_data = false; - bool with_schema = false; bool with_statistics = false; bool no_data = false; bool no_schema = false; @@ -503,6 +512,7 @@ main(int argc, char **argv) {"section", required_argument, NULL, 5}, {"serializable-deferrable", no_argument, &dopt.serializable_deferrable, 1}, {"snapshot", required_argument, NULL, 6}, + {"statistics", no_argument, NULL, 22}, {"statistics-only", no_argument, NULL, 18}, {"strict-names", no_argument, &strict_names, 1}, {"use-set-session-authorization", no_argument, &dopt.use_setsessauth, 1}, @@ -517,9 +527,6 @@ main(int argc, char **argv) {"no-toast-compression", no_argument, &dopt.no_toast_compression, 1}, {"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1}, {"no-sync", no_argument, NULL, 7}, - {"with-data", no_argument, NULL, 22}, - {"with-schema", no_argument, NULL, 23}, - {"with-statistics", no_argument, NULL, 24}, {"on-conflict-do-nothing", no_argument, &dopt.do_nothing, 1}, {"rows-per-insert", required_argument, NULL, 10}, {"include-foreign-data", required_argument, NULL, 11}, @@ -530,6 +537,7 @@ main(int argc, char **argv) {"filter", required_argument, NULL, 16}, {"exclude-extension", required_argument, NULL, 17}, {"sequence-data", no_argument, &dopt.sequence_data, 1}, + {"restrict-key", required_argument, NULL, 25}, {NULL, 0, NULL, 0} }; @@ -787,15 +795,11 @@ main(int argc, char **argv) break; case 22: - with_data = true; - break; - - case 23: - with_schema = true; + with_statistics = true; break; - case 24: - with_statistics = true; + case 25: + dopt.restrict_key = pg_strdup(optarg); break; default: @@ -827,51 +831,64 @@ main(int argc, char **argv) /* reject conflicting "-only" options */ if (data_only && schema_only) - pg_fatal("options -s/--schema-only and -a/--data-only cannot be used together"); + pg_fatal("options %s and %s cannot be used together", + "-s/--schema-only", "-a/--data-only"); if (schema_only && statistics_only) - pg_fatal("options -s/--schema-only and --statistics-only cannot be used together"); + pg_fatal("options %s and %s cannot be used together", + "-s/--schema-only", "--statistics-only"); if (data_only && statistics_only) - pg_fatal("options -a/--data-only and --statistics-only cannot be used together"); + pg_fatal("options %s and %s cannot be used together", + "-a/--data-only", "--statistics-only"); /* reject conflicting "-only" and "no-" options */ if (data_only && no_data) - pg_fatal("options -a/--data-only and --no-data cannot be used together"); + pg_fatal("options %s and %s cannot be used together", + "-a/--data-only", "--no-data"); if (schema_only && no_schema) - pg_fatal("options -s/--schema-only and --no-schema cannot be used together"); + pg_fatal("options %s and %s cannot be used together", + "-s/--schema-only", "--no-schema"); if (statistics_only && no_statistics) - pg_fatal("options --statistics-only and --no-statistics cannot be used together"); + pg_fatal("options %s and %s cannot be used 
together", + "--statistics-only", "--no-statistics"); - /* reject conflicting "with-" and "no-" options */ - if (with_data && no_data) - pg_fatal("options --with-data and --no-data cannot be used together"); - if (with_schema && no_schema) - pg_fatal("options --with-schema and --no-schema cannot be used together"); + /* reject conflicting "no-" options */ if (with_statistics && no_statistics) - pg_fatal("options --with-statistics and --no-statistics cannot be used together"); + pg_fatal("options %s and %s cannot be used together", + "--statistics", "--no-statistics"); + + /* reject conflicting "-only" options */ + if (data_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-a/--data-only", "--statistics"); + if (schema_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-s/--schema-only", "--statistics"); if (schema_only && foreign_servers_include_patterns.head != NULL) - pg_fatal("options -s/--schema-only and --include-foreign-data cannot be used together"); + pg_fatal("options %s and %s cannot be used together", + "-s/--schema-only", "--include-foreign-data"); if (numWorkers > 1 && foreign_servers_include_patterns.head != NULL) - pg_fatal("option --include-foreign-data is not supported with parallel backup"); + pg_fatal("option %s is not supported with parallel backup", + "--include-foreign-data"); if (data_only && dopt.outputClean) - pg_fatal("options -c/--clean and -a/--data-only cannot be used together"); + pg_fatal("options %s and %s cannot be used together", + "-c/--clean", "-a/--data-only"); if (dopt.if_exists && !dopt.outputClean) - pg_fatal("option --if-exists requires option -c/--clean"); + pg_fatal("option %s requires option %s", + "--if-exists", "-c/--clean"); /* - * Set derivative flags. An "-only" option may be overridden by an - * explicit "with-" option; e.g. "--schema-only --with-statistics" will - * include schema and statistics. Other ambiguous or nonsensical - * combinations, e.g. "--schema-only --no-schema", will have already - * caused an error in one of the checks above. + * Set derivative flags. Ambiguous or nonsensical combinations, e.g. + * "--schema-only --no-schema", will have already caused an error in one + * of the checks above. */ dopt.dumpData = ((dopt.dumpData && !schema_only && !statistics_only) || - (data_only || with_data)) && !no_data; + data_only) && !no_data; dopt.dumpSchema = ((dopt.dumpSchema && !data_only && !statistics_only) || - (schema_only || with_schema)) && !no_schema; + schema_only) && !no_schema; dopt.dumpStatistics = ((dopt.dumpStatistics && !schema_only && !data_only) || (statistics_only || with_statistics)) && !no_statistics; @@ -881,15 +898,32 @@ main(int argc, char **argv) * --rows-per-insert were specified. */ if (dopt.do_nothing && dopt.dump_inserts == 0) - pg_fatal("option --on-conflict-do-nothing requires option --inserts, --rows-per-insert, or --column-inserts"); + pg_fatal("option %s requires option %s, %s, or %s", + "--on-conflict-do-nothing", + "--inserts", "--rows-per-insert", "--column-inserts"); /* Identify archive format to emit */ archiveFormat = parseArchiveFormat(format, &archiveMode); /* archiveFormat specific setup */ if (archiveFormat == archNull) + { plainText = 1; + /* + * If you don't provide a restrict key, one will be appointed for you. 
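Since generate_restrict_key() and valid_restrict_key() are only declared in dumputils.h within this patch, here is a loose sketch of plausible implementations; these are assumptions for illustration, not the shipped dumputils.c code, and the real generator would draw from a cryptographically strong random source rather than rand():

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static char *
sketch_generate_restrict_key(void)
{
    static const char alphabet[] =
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    char       *key = malloc(17);

    if (key == NULL)
        return NULL;
    srand((unsigned int) time(NULL));   /* real code: a strong PRNG */
    for (int i = 0; i < 16; i++)
        key[i] = alphabet[rand() % (int) (sizeof(alphabet) - 1)];
    key[16] = '\0';
    return key;
}

static bool
sketch_valid_restrict_key(const char *key)
{
    /* non-empty and strictly alphanumeric, so psql accepts it unquoted */
    if (key[0] == '\0')
        return false;
    for (const char *p = key; *p != '\0'; p++)
        if (!isalnum((unsigned char) *p))
            return false;
    return true;
}

int
main(void)
{
    char       *key = sketch_generate_restrict_key();

    if (key != NULL && sketch_valid_restrict_key(key))
        printf("\\restrict %s\n", key);
    free(key);
    return 0;
}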
+ */ + if (!dopt.restrict_key) + dopt.restrict_key = generate_restrict_key(); + if (!dopt.restrict_key) + pg_fatal("could not generate restrict key"); + if (!valid_restrict_key(dopt.restrict_key)) + pg_fatal("invalid restrict key"); + } + else if (dopt.restrict_key) + pg_fatal("option %s can only be used with %s", + "--restrict-key", "--format=plain"); + /* * Custom and directory formats are compressed by default with gzip when * available, not the others. If gzip is not available, no compression is @@ -1083,6 +1117,53 @@ main(int argc, char **argv) if (!dopt.dumpData && dopt.sequence_data) getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE); + /* + * For binary upgrade mode, dump pg_largeobject_metadata and the + * associated pg_shdepend rows. This is faster to restore than the + * equivalent set of large object commands. We can only do this for + * upgrades from v12 and newer; in older versions, pg_largeobject_metadata + * was created WITH OIDS, so the OID column is hidden and won't be dumped. + */ + if (dopt.binary_upgrade && fout->remoteVersion >= 120000) + { + TableInfo *lo_metadata = findTableByOid(LargeObjectMetadataRelationId); + TableInfo *shdepend = findTableByOid(SharedDependRelationId); + + makeTableDataInfo(&dopt, lo_metadata); + makeTableDataInfo(&dopt, shdepend); + + /* + * Save pg_largeobject_metadata's dump ID for use as a dependency for + * pg_shdepend and any large object comments/seclabels. + */ + lo_metadata_dumpId = lo_metadata->dataObj->dobj.dumpId; + addObjectDependency(&shdepend->dataObj->dobj, lo_metadata_dumpId); + + /* + * Only dump large object shdepend rows for this database. + */ + shdepend->dataObj->filtercond = "WHERE classid = 'pg_largeobject'::regclass " + "AND dbid = (SELECT oid FROM pg_database " + " WHERE datname = current_database())"; + + /* + * If upgrading from v16 or newer, only dump large objects with + * comments/seclabels. For these upgrades, pg_upgrade can copy/link + * pg_largeobject_metadata's files (which is usually faster) but we + * still need to dump LOs with comments/seclabels here so that the + * subsequent COMMENT and SECURITY LABEL commands work. pg_upgrade + * can't copy/link the files from older versions because aclitem + * (needed by pg_largeobject_metadata.lomacl) changed its storage + * format in v16. + */ + if (fout->remoteVersion >= 160000) + lo_metadata->dataObj->filtercond = "WHERE oid IN " + "(SELECT objoid FROM pg_description " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) " " + "UNION SELECT objoid FROM pg_seclabel " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) ")"; + } + /* * In binary-upgrade mode, we do not have to worry about the actual LO * data or the associated metadata that resides in the pg_largeobject and @@ -1198,6 +1279,7 @@ main(int argc, char **argv) ropt->enable_row_security = dopt.enable_row_security; ropt->sequence_data = dopt.sequence_data; ropt->binary_upgrade = dopt.binary_upgrade; + ropt->restrict_key = dopt.restrict_key ? pg_strdup(dopt.restrict_key) : NULL; ropt->compression_spec = compression_spec; @@ -1224,7 +1306,7 @@ main(int argc, char **argv) * right now. 
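For context on the filtercond strings above: a filter condition rides along with the table-data object and is appended to the query that fetches the rows. The snippet below shows the idea only approximately; the COPY shape is a simplification, and 2613 (pg_largeobject's OID) is spelled CppAsString2(LargeObjectRelationId) in the real code:

#include <stdio.h>

int
main(void)
{
    const char *table = "pg_largeobject_metadata";
    const char *filtercond =
    "WHERE oid IN (SELECT objoid FROM pg_description "
    "WHERE classoid = 2613 "
    "UNION SELECT objoid FROM pg_seclabel WHERE classoid = 2613)";

    /* the archiver effectively issues something like: */
    printf("COPY (SELECT * FROM ONLY %s %s) TO stdout;\n",
           table, filtercond);
    return 0;
}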
*/ if (plainText) - RestoreArchive(fout, false); + RestoreArchive(fout); CloseArchive(fout); @@ -1235,7 +1317,7 @@ main(int argc, char **argv) static void help(const char *progname) { - printf(_("%s dumps a database as a text file or to other formats.\n\n"), progname); + printf(_("%s exports a PostgreSQL database as an SQL script or to other formats.\n\n"), progname); printf(_("Usage:\n")); printf(_(" %s [OPTION]... [DBNAME]\n"), progname); @@ -1309,11 +1391,13 @@ help(const char *progname) printf(_(" --no-unlogged-table-data do not dump unlogged table data\n")); printf(_(" --on-conflict-do-nothing add ON CONFLICT DO NOTHING to INSERT commands\n")); printf(_(" --quote-all-identifiers quote all identifiers, even if not key words\n")); + printf(_(" --restrict-key=RESTRICT_KEY use provided string as psql \\restrict key\n")); printf(_(" --rows-per-insert=NROWS number of rows per INSERT; implies --inserts\n")); printf(_(" --section=SECTION dump named section (pre-data, data, or post-data)\n")); printf(_(" --sequence-data include sequence data in dump\n")); printf(_(" --serializable-deferrable wait until the dump can run without anomalies\n")); printf(_(" --snapshot=SNAPSHOT use given snapshot for the dump\n")); + printf(_(" --statistics dump the statistics\n")); printf(_(" --statistics-only dump only the statistics, not schema or data\n")); printf(_(" --strict-names require table and/or schema include patterns to\n" " match at least one entity each\n")); @@ -1322,9 +1406,6 @@ help(const char *progname) printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); - printf(_(" --with-data dump the data\n")); - printf(_(" --with-schema dump the schema\n")); - printf(_(" --with-statistics dump the statistics\n")); printf(_("\nConnection options:\n")); printf(_(" -d, --dbname=DBNAME database to dump\n")); @@ -2115,7 +2196,7 @@ selectDumpableCast(CastInfo *cast, Archive *fout) * This would be DUMP_COMPONENT_ACL for from-initdb casts, but they do not * support ACLs currently. */ - if (cast->dobj.catId.oid <= (Oid) g_last_builtin_oid) + if (cast->dobj.catId.oid <= g_last_builtin_oid) cast->dobj.dump = DUMP_COMPONENT_NONE; else cast->dobj.dump = fout->dopt->include_everything ? @@ -2147,7 +2228,7 @@ selectDumpableProcLang(ProcLangInfo *plang, Archive *fout) plang->dobj.dump = DUMP_COMPONENT_NONE; else { - if (plang->dobj.catId.oid <= (Oid) g_last_builtin_oid) + if (plang->dobj.catId.oid <= g_last_builtin_oid) plang->dobj.dump = fout->remoteVersion < 90600 ? DUMP_COMPONENT_NONE : DUMP_COMPONENT_ACL; else @@ -2166,6 +2247,13 @@ selectDumpableProcLang(ProcLangInfo *plang, Archive *fout) static void selectDumpableAccessMethod(AccessMethodInfo *method, Archive *fout) { + /* see getAccessMethods() comment about v9.6. */ + if (fout->remoteVersion < 90600) + { + method->dobj.dump = DUMP_COMPONENT_NONE; + return; + } + if (checkExtensionMembership(&method->dobj, fout)) return; /* extension membership overrides all else */ @@ -2173,7 +2261,7 @@ selectDumpableAccessMethod(AccessMethodInfo *method, Archive *fout) * This would be DUMP_COMPONENT_ACL for from-initdb access methods, but * they do not support ACLs currently. */ - if (method->dobj.catId.oid <= (Oid) g_last_builtin_oid) + if (method->dobj.catId.oid <= g_last_builtin_oid) method->dobj.dump = DUMP_COMPONENT_NONE; else method->dobj.dump = fout->dopt->include_everything ? 
@@ -2199,7 +2287,7 @@ selectDumpableExtension(ExtensionInfo *extinfo, DumpOptions *dopt) * change permissions on their member objects, if they wish to, and have * those changes preserved. */ - if (extinfo->dobj.catId.oid <= (Oid) g_last_builtin_oid) + if (extinfo->dobj.catId.oid <= g_last_builtin_oid) extinfo->dobj.dump = extinfo->dobj.dump_contains = DUMP_COMPONENT_ACL; else { @@ -2804,11 +2892,14 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo) forcePartitionRootLoad(tbinfo))) { TableInfo *parentTbinfo; + char *sanitized; parentTbinfo = getRootTableInfo(tbinfo); copyFrom = fmtQualifiedDumpable(parentTbinfo); + sanitized = sanitize_line(copyFrom, true); printfPQExpBuffer(copyBuf, "-- load via partition root %s", - copyFrom); + sanitized); + free(sanitized); tdDefn = pg_strdup(copyBuf->data); } else @@ -3569,26 +3660,32 @@ dumpDatabase(Archive *fout) /* * pg_largeobject comes from the old system intact, so set its * relfrozenxids, relminmxids and relfilenode. + * + * pg_largeobject_metadata also comes from the old system intact for + * upgrades from v16 and newer, so set its relfrozenxids, relminmxids, and + * relfilenode, too. pg_upgrade can't copy/link the files from older + * versions because aclitem (needed by pg_largeobject_metadata.lomacl) + * changed its storage format in v16. */ if (dopt->binary_upgrade) { PGresult *lo_res; PQExpBuffer loFrozenQry = createPQExpBuffer(); PQExpBuffer loOutQry = createPQExpBuffer(); + PQExpBuffer lomOutQry = createPQExpBuffer(); PQExpBuffer loHorizonQry = createPQExpBuffer(); + PQExpBuffer lomHorizonQry = createPQExpBuffer(); int ii_relfrozenxid, ii_relfilenode, ii_oid, ii_relminmxid; - /* - * pg_largeobject - */ if (fout->remoteVersion >= 90300) appendPQExpBuffer(loFrozenQry, "SELECT relfrozenxid, relminmxid, relfilenode, oid\n" "FROM pg_catalog.pg_class\n" - "WHERE oid IN (%u, %u);\n", - LargeObjectRelationId, LargeObjectLOidPNIndexId); + "WHERE oid IN (%u, %u, %u, %u);\n", + LargeObjectRelationId, LargeObjectLOidPNIndexId, + LargeObjectMetadataRelationId, LargeObjectMetadataOidIndexId); else appendPQExpBuffer(loFrozenQry, "SELECT relfrozenxid, 0 AS relminmxid, relfilenode, oid\n" "FROM pg_catalog.pg_class\n" @@ -3603,35 +3700,57 @@ dumpDatabase(Archive *fout) ii_oid = PQfnumber(lo_res, "oid"); appendPQExpBufferStr(loHorizonQry, "\n-- For binary upgrade, set pg_largeobject relfrozenxid and relminmxid\n"); + appendPQExpBufferStr(lomHorizonQry, "\n-- For binary upgrade, set pg_largeobject_metadata relfrozenxid and relminmxid\n"); appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, preserve pg_largeobject and index relfilenodes\n"); + appendPQExpBufferStr(lomOutQry, "\n-- For binary upgrade, preserve pg_largeobject_metadata and index relfilenodes\n"); for (int i = 0; i < PQntuples(lo_res); ++i) { Oid oid; RelFileNumber relfilenumber; + PQExpBuffer horizonQry; + PQExpBuffer outQry; + + oid = atooid(PQgetvalue(lo_res, i, ii_oid)); + relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode)); + + if (oid == LargeObjectRelationId || + oid == LargeObjectLOidPNIndexId) + { + horizonQry = loHorizonQry; + outQry = loOutQry; + } + else + { + horizonQry = lomHorizonQry; + outQry = lomOutQry; + } - appendPQExpBuffer(loHorizonQry, "UPDATE pg_catalog.pg_class\n" + appendPQExpBuffer(horizonQry, "UPDATE pg_catalog.pg_class\n" "SET relfrozenxid = '%u', relminmxid = '%u'\n" "WHERE oid = %u;\n", atooid(PQgetvalue(lo_res, i, ii_relfrozenxid)), atooid(PQgetvalue(lo_res, i, ii_relminmxid)), atooid(PQgetvalue(lo_res, i, ii_oid))); - oid = 
atooid(PQgetvalue(lo_res, i, ii_oid)); - relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode)); - - if (oid == LargeObjectRelationId) - appendPQExpBuffer(loOutQry, + if (oid == LargeObjectRelationId || + oid == LargeObjectMetadataRelationId) + appendPQExpBuffer(outQry, "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n", relfilenumber); - else if (oid == LargeObjectLOidPNIndexId) - appendPQExpBuffer(loOutQry, + else if (oid == LargeObjectLOidPNIndexId || + oid == LargeObjectMetadataOidIndexId) + appendPQExpBuffer(outQry, "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", relfilenumber); } appendPQExpBufferStr(loOutQry, "TRUNCATE pg_catalog.pg_largeobject;\n"); + appendPQExpBufferStr(lomOutQry, + "TRUNCATE pg_catalog.pg_largeobject_metadata;\n"); + appendPQExpBufferStr(loOutQry, loHorizonQry->data); + appendPQExpBufferStr(lomOutQry, lomHorizonQry->data); ArchiveEntry(fout, nilCatalogId, createDumpId(), ARCHIVE_OPTS(.tag = "pg_largeobject", @@ -3639,11 +3758,20 @@ dumpDatabase(Archive *fout) .section = SECTION_PRE_DATA, .createStmt = loOutQry->data)); + if (fout->remoteVersion >= 160000) + ArchiveEntry(fout, nilCatalogId, createDumpId(), + ARCHIVE_OPTS(.tag = "pg_largeobject_metadata", + .description = "pg_largeobject_metadata", + .section = SECTION_PRE_DATA, + .createStmt = lomOutQry->data)); + PQclear(lo_res); destroyPQExpBuffer(loFrozenQry); destroyPQExpBuffer(loHorizonQry); + destroyPQExpBuffer(lomHorizonQry); destroyPQExpBuffer(loOutQry); + destroyPQExpBuffer(lomOutQry); } PQclear(res); @@ -3922,10 +4050,37 @@ getLOs(Archive *fout) * as it will be copied by pg_upgrade, which simply copies the * pg_largeobject table. We *do* however dump out anything but the * data, as pg_upgrade copies just pg_largeobject, but not - * pg_largeobject_metadata, after the dump is restored. + * pg_largeobject_metadata, after the dump is restored. In versions + * before v12, this is done via proper large object commands. In + * newer versions, we dump the content of pg_largeobject_metadata and + * any associated pg_shdepend rows, which is faster to restore. (On + *
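Condensing the binary-upgrade version gates scattered through the hunks above into one place; the function below is an illustrative stand-in, not pg_dump code, and the thresholds come straight from the comments in this patch:

#include <stdbool.h>
#include <stdio.h>

static void
plan_lo_metadata_dump(int remoteVersion, bool binary_upgrade)
{
    if (!binary_upgrade || remoteVersion < 120000)
        return;     /* pre-v12: pg_largeobject_metadata was WITH OIDS,
                     * so its OID column is hidden and can't be dumped */

    printf("dump pg_largeobject_metadata plus this database's "
           "pg_shdepend rows\n");

    if (remoteVersion >= 160000)
    {
        /*
         * From v16 on, aclitem's storage format is compatible, so
         * pg_upgrade copies/links the files directly and the dump only
         * needs LOs that carry comments or security labels.
         */
        printf("restrict dump to LOs with comments/seclabels\n");
    }
}

int
main(void)
{
    plan_lo_metadata_dump(150000, true);    /* v15 source */
    plan_lo_metadata_dump(170000, true);    /* v17 source */
    return 0;
}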