/*-------------------------------------------------------------------------
 *
 * pruneheap.c
 *	  heap page pruning and HOT-chain management code
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/pruneheap.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "commands/vacuum.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/* Working data for heap_page_prune_and_freeze() and subroutines */
typedef struct
{
	/*-------------------------------------------------------
	 * Arguments passed to heap_page_prune_and_freeze()
	 *-------------------------------------------------------
	 */

	/* tuple visibility test, initialized for the relation */
	GlobalVisState *vistest;
	/* whether or not dead items can be set LP_UNUSED during pruning */
	bool		mark_unused_now;
	/* whether to attempt freezing tuples */
	bool		attempt_freeze;
	struct VacuumCutoffs *cutoffs;

	/*-------------------------------------------------------
	 * Fields describing what to do to the page
	 *-------------------------------------------------------
	 */
	TransactionId new_prune_xid;	/* new prune hint value */
	TransactionId latest_xid_removed;
	int			nredirected;	/* numbers of entries in arrays below */
	int			ndead;
	int			nunused;
	int			nfrozen;
	/* arrays that accumulate indexes of items to be changed */
	OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
	OffsetNumber nowdead[MaxHeapTuplesPerPage];
	OffsetNumber nowunused[MaxHeapTuplesPerPage];
	HeapTupleFreeze frozen[MaxHeapTuplesPerPage];

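	/*
	 * For illustration: redirected[] holds (from, to) offset pairs, two
	 * entries per redirection.  For example, nredirected == 1 with
	 * redirected[] == {2, 5} means that line pointer 2 will be set to
	 * LP_REDIRECT and point at offset 5.  See heap_prune_record_redirect()
	 * and heap_page_prune_execute().
	 */
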
	/*-------------------------------------------------------
	 * Working state for HOT chain processing
	 *-------------------------------------------------------
	 */

	/*
	 * 'root_items' contains offsets of all LP_REDIRECT line pointers and
	 * normal non-HOT tuples.  They can be stand-alone items or the first
	 * item in a HOT chain.  'heaponly_items' contains heap-only tuples which
	 * can only be removed as part of a HOT chain.
	 */
	int			nroot_items;
	OffsetNumber root_items[MaxHeapTuplesPerPage];
	int			nheaponly_items;
	OffsetNumber heaponly_items[MaxHeapTuplesPerPage];

	/*
	 * processed[offnum] is true if item at offnum has been processed.
	 *
	 * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
	 * 1.  Otherwise every access would need to subtract 1.
	 */
	bool		processed[MaxHeapTuplesPerPage + 1];

	/*
	 * Tuple visibility is only computed once for each tuple, for correctness
	 * and efficiency reasons; see comment in heap_page_prune_and_freeze()
	 * for details.  This is of type int8[], instead of HTSV_Result[], so we
	 * can use -1 to indicate no visibility has been computed, e.g. for
	 * LP_DEAD items.
	 *
	 * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
	 * 1.  Otherwise every access would need to subtract 1.
	 */
	int8		htsv[MaxHeapTuplesPerPage + 1];

	/*
	 * Freezing-related state.
	 */
	HeapPageFreeze pagefrz;

	/*-------------------------------------------------------
	 * Information about what was done
	 *
	 * These fields are not used by pruning itself for the most part, but are
	 * used to collect information about what was pruned and what state the
	 * page is in after pruning, for the benefit of the caller.  They are
	 * copied to the caller's PruneFreezeResult at the end.
	 * -------------------------------------------------------
	 */

	int			ndeleted;		/* Number of tuples deleted from the page */

	/* Number of live and recently dead tuples, after pruning */
	int			live_tuples;
	int			recently_dead_tuples;

	/* Whether or not the page makes rel truncation unsafe */
	bool		hastup;

	/*
	 * LP_DEAD items on the page after pruning.  Includes existing LP_DEAD
	 * items.
	 */
	int			lpdead_items;	/* number of items in the array */
	OffsetNumber *deadoffsets;	/* points directly to presult->deadoffsets */

	/*
	 * The snapshot conflict horizon used when freezing tuples.  The final
	 * snapshot conflict horizon for the record may be newer if pruning
	 * removes newer transaction IDs.
	 */
	TransactionId frz_conflict_horizon;

	/*
	 * all_visible and all_frozen indicate if the all-visible and all-frozen
	 * bits in the visibility map can be set for this page after pruning.
	 *
	 * visibility_cutoff_xid is the newest xmin of live tuples on the page.
	 * The caller can use it as the conflict horizon, when setting the VM
	 * bits.  It is only valid if we froze some tuples, and all_frozen is
	 * true.
	 *
	 * NOTE: all_visible and all_frozen initially don't include LP_DEAD
	 * items.  That's convenient for heap_page_prune_and_freeze() to use them
	 * to decide whether to freeze the page or not.  The all_visible and
	 * all_frozen values returned to the caller are adjusted to include
	 * LP_DEAD items after we determine whether to opportunistically freeze.
	 */
	bool		all_visible;
	bool		all_frozen;
	TransactionId visibility_cutoff_xid;
} PruneState;

/* Local functions */
static void prune_freeze_setup(PruneFreezeParams *params,
							   TransactionId new_relfrozen_xid,
							   MultiXactId new_relmin_mxid,
							   const PruneFreezeResult *presult,
							   PruneState *prstate);
static void prune_freeze_plan(Oid reloid, Buffer buffer,
							  PruneState *prstate,
							  OffsetNumber *off_loc);
static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
											   HeapTuple tup,
											   Buffer buffer);
static inline HTSV_Result htsv_get_valid_status(int status);
static void heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff,
							 OffsetNumber rootoffnum, PruneState *prstate);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
static void heap_prune_record_redirect(PruneState *prstate,
									   OffsetNumber offnum, OffsetNumber rdoffnum,
									   bool was_normal);
static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
								   bool was_normal);
static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
											 bool was_normal);
static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal);

static void heap_prune_record_unchanged_lp_unused(Page page, PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum);

static void page_verify_redirects(Page page);

static bool heap_page_will_freeze(Relation relation, Buffer buffer,
								  bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune,
								  PruneState *prstate);

/*
 * Optionally prune and repair fragmentation in the specified page.
 *
 * This is an opportunistic function.  It will perform housekeeping
 * only if the page heuristically looks like a candidate for pruning and we
 * can acquire buffer cleanup lock without blocking.
 *
 * Note: this is called quite often.  It's important that it fall out quickly
 * if there's not any use in pruning.
 *
 * Caller must have pin on the buffer, and must *not* have a lock on it.
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer)
{
	Page		page = BufferGetPage(buffer);
	TransactionId prune_xid;
	GlobalVisState *vistest;
	Size		minfree;

	/*
	 * We can't write WAL in recovery mode, so there's no point trying to
	 * clean the page.  The primary will likely issue a cleaning WAL record
	 * soon anyway, so this is no particular loss.
	 */
	if (RecoveryInProgress())
		return;

	/*
	 * First check whether there's any chance there's something to prune;
	 * determining the appropriate horizon is a waste if there's no prune_xid
	 * (i.e. no updates/deletes left potentially dead tuples around).
	 */
	prune_xid = ((PageHeader) page)->pd_prune_xid;
	if (!TransactionIdIsValid(prune_xid))
		return;

	/*
	 * Check whether prune_xid indicates that there may be dead rows that can
	 * be cleaned up.
	 */
	vistest = GlobalVisTestFor(relation);

	if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
		return;

	/*
	 * We prune when a previous UPDATE failed to find enough space on the
	 * page for a new tuple version, or when free space falls below the
	 * relation's fill-factor target (but not less than 10%).
	 *
	 * Checking free space here is questionable since we aren't holding any
	 * lock on the buffer; in the worst case we could get a bogus answer.
	 * It's unlikely to be *seriously* wrong, though, since reading either
	 * pd_lower or pd_upper is probably atomic.  Avoiding taking a lock seems
	 * more important than sometimes getting a wrong answer in what is after
	 * all just a heuristic estimate.
	 */
	minfree = RelationGetTargetPageFreeSpace(relation,
											 HEAP_DEFAULT_FILLFACTOR);
	minfree = Max(minfree, BLCKSZ / 10);

	if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
	{
		/* OK, try to get exclusive buffer lock */
		if (!ConditionalLockBufferForCleanup(buffer))
			return;

		/*
		 * Now that we have buffer lock, get accurate information about the
		 * page's free space, and recheck the heuristic about whether to
		 * prune.
		 */
		if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
		{
			OffsetNumber dummy_off_loc;
			PruneFreezeResult presult;

			/*
			 * We don't pass the HEAP_PAGE_PRUNE_MARK_UNUSED_NOW option
			 * regardless of whether or not the relation has indexes, since
			 * we cannot safely determine that during on-access pruning with
			 * the current implementation.
			 */
			PruneFreezeParams params = {
				.relation = relation,
				.buffer = buffer,
				.reason = PRUNE_ON_ACCESS,
				.options = 0,
				.vistest = vistest,
				.cutoffs = NULL,
			};

			heap_page_prune_and_freeze(&params, &presult, &dummy_off_loc,
									   NULL, NULL);

			/*
			 * Report the number of tuples reclaimed to pgstats.  This is
			 * presult.ndeleted minus the number of newly-LP_DEAD-set items.
			 *
			 * We derive the number of dead tuples like this to avoid totally
			 * forgetting about items that were set to LP_DEAD, since they
			 * still need to be cleaned up by VACUUM.  We only want to count
			 * heap-only tuples that just became LP_UNUSED in our report,
			 * which don't need to be cleaned up by VACUUM.
			 *
			 * VACUUM doesn't have to compensate in the same way when it
			 * tracks ndeleted, since it will set the same LP_DEAD items to
			 * LP_UNUSED separately.
			 */
			if (presult.ndeleted > presult.nnewlpdead)
				pgstat_update_heap_dead_tuples(relation,
											   presult.ndeleted - presult.nnewlpdead);
		}

		/* And release buffer lock */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

		/*
		 * We avoid reuse of any free space created on the page by unrelated
		 * UPDATEs/INSERTs by opting to not update the FSM at this point.
		 * The free space should be reused by UPDATEs to *this* page.
		 */
	}
}
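
/*
 * For illustration: on-access pruning is invoked from scan code paths that
 * already hold a pin (but no lock) on the target page, e.g.
 *
 *		heap_page_prune_opt(relation, buffer);
 *
 * relying on the quick-exit checks above when there is nothing to do.
 */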

/*
 * Helper for heap_page_prune_and_freeze() to initialize the PruneState using
 * the provided parameters.
 */
static void
prune_freeze_setup(PruneFreezeParams *params,
				   TransactionId new_relfrozen_xid,
				   MultiXactId new_relmin_mxid,
				   const PruneFreezeResult *presult,
				   PruneState *prstate)
{
	/* Copy parameters to prstate */
	prstate->vistest = params->vistest;
	prstate->mark_unused_now =
		(params->options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0;

	/* cutoffs must be provided if we will attempt freezing */
	Assert(!(params->options & HEAP_PAGE_PRUNE_FREEZE) || params->cutoffs);
	prstate->attempt_freeze = (params->options & HEAP_PAGE_PRUNE_FREEZE) != 0;
	prstate->cutoffs = params->cutoffs;

	/*
	 * Our strategy is to scan the page and make lists of items to change,
	 * then apply the changes within a critical section.  This keeps as much
	 * logic as possible out of the critical section, and also ensures that
	 * WAL replay will work the same as the normal case.
	 *
	 * First, initialize the new pd_prune_xid value to zero (indicating no
	 * prunable tuples).  If we find any tuples which may soon become
	 * prunable, we will save the lowest relevant XID in new_prune_xid.  Also
	 * initialize the rest of our working state.
	 */
	prstate->new_prune_xid = InvalidTransactionId;
	prstate->latest_xid_removed = InvalidTransactionId;
	prstate->nredirected = prstate->ndead = prstate->nunused = 0;
	prstate->nfrozen = 0;
	prstate->nroot_items = 0;
	prstate->nheaponly_items = 0;

	/* initialize page freezing working state */
	prstate->pagefrz.freeze_required = false;
	if (prstate->attempt_freeze)
	{
		prstate->pagefrz.FreezePageRelfrozenXid = new_relfrozen_xid;
		prstate->pagefrz.NoFreezePageRelfrozenXid = new_relfrozen_xid;
		prstate->pagefrz.FreezePageRelminMxid = new_relmin_mxid;
		prstate->pagefrz.NoFreezePageRelminMxid = new_relmin_mxid;
	}
	else
	{
		Assert(new_relfrozen_xid == InvalidTransactionId &&
			   new_relmin_mxid == InvalidMultiXactId);
		prstate->pagefrz.FreezePageRelfrozenXid = InvalidTransactionId;
		prstate->pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId;
		prstate->pagefrz.FreezePageRelminMxid = InvalidMultiXactId;
		prstate->pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId;
	}

	prstate->ndeleted = 0;
	prstate->live_tuples = 0;
	prstate->recently_dead_tuples = 0;
	prstate->hastup = false;
	prstate->lpdead_items = 0;
	prstate->deadoffsets = (OffsetNumber *) presult->deadoffsets;
	prstate->frz_conflict_horizon = InvalidTransactionId;

	/*
	 * Vacuum may update the VM after we're done.  We can keep track of
	 * whether the page will be all-visible and all-frozen after pruning and
	 * freezing to help the caller to do that.
	 *
	 * Currently, only VACUUM sets the VM bits.  To save the effort, only do
	 * the bookkeeping if the caller needs it.  Currently, that's tied to
	 * HEAP_PAGE_PRUNE_FREEZE, but it could be a separate flag if you wanted
	 * to update the VM bits without also freezing or freeze without also
	 * setting the VM bits.
	 *
	 * In addition to telling the caller whether it can set the VM bit, we
	 * also use 'all_visible' and 'all_frozen' for our own decision-making.
	 * If the whole page would become frozen, we consider opportunistically
	 * freezing tuples.  We will not be able to freeze the whole page if
	 * there are tuples present that are not visible to everyone or if there
	 * are dead tuples which are not yet removable.  However, dead tuples
	 * which will be removed by the end of vacuuming should not preclude us
	 * from opportunistically freezing.  Because of that, we do not
	 * immediately clear all_visible and all_frozen when we see LP_DEAD
	 * items.  We fix that after scanning the line pointers.  We must correct
	 * all_visible and all_frozen before we return them to the caller, so
	 * that the caller doesn't set the VM bits incorrectly.
	 */
	if (prstate->attempt_freeze)
	{
		prstate->all_visible = true;
		prstate->all_frozen = true;
	}
	else
	{
		/*
		 * Initializing to false allows skipping the work to update them in
		 * heap_prune_record_unchanged_lp_normal().
		 */
		prstate->all_visible = false;
		prstate->all_frozen = false;
	}

	/*
	 * The visibility cutoff xid is the newest xmin of live tuples on the
	 * page.  In the common case, this will be set as the conflict horizon
	 * the caller can use for updating the VM.  If, at the end of freezing
	 * and pruning, the page is all-frozen, there is no possibility that any
	 * running transaction on the standby does not see tuples on the page as
	 * all-visible, so the conflict horizon remains InvalidTransactionId.
	 */
	prstate->visibility_cutoff_xid = InvalidTransactionId;
}

/*
 * Helper for heap_page_prune_and_freeze().  Iterates over every tuple on the
 * page, examines its visibility information, and determines the appropriate
 * action for each tuple.  All tuples are processed and classified during
 * this phase, but no modifications are made to the page until the later
 * execution stage.
 *
 * *off_loc is used for error callback and cleared before returning.
 */
static void
prune_freeze_plan(Oid reloid, Buffer buffer, PruneState *prstate,
				  OffsetNumber *off_loc)
{
	Page		page = BufferGetPage(buffer);
	BlockNumber blockno = BufferGetBlockNumber(buffer);
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	OffsetNumber offnum;
	HeapTupleData tup;

	tup.t_tableOid = reloid;

	/*
	 * Determine HTSV for all tuples, and queue them up for processing as HOT
	 * chain roots or as heap-only items.
	 *
	 * Determining HTSV only once for each tuple is required for correctness,
	 * to deal with cases where running HTSV twice could result in different
	 * results.  For example, RECENTLY_DEAD can turn to DEAD if another
	 * checked item causes GlobalVisTestIsRemovableFullXid() to update the
	 * horizon, or INSERT_IN_PROGRESS can change to DEAD if the inserting
	 * transaction aborts.
	 *
	 * It's also good for performance.  Most commonly tuples within a page
	 * are stored at decreasing offsets (while the items are stored at
	 * increasing offsets).  When processing all tuples on a page this leads
	 * to reading memory at decreasing offsets within a page, with a variable
	 * stride.  That's hard for CPU prefetchers to deal with.  Processing the
	 * items in reverse order (and thus the tuples in increasing order)
	 * increases prefetching efficiency significantly / decreases the number
	 * of cache misses.
	 */
	for (offnum = maxoff;
		 offnum >= FirstOffsetNumber;
		 offnum = OffsetNumberPrev(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		HeapTupleHeader htup;

		/*
		 * Set the offset number so that we can display it along with any
		 * error that occurred while processing this tuple.
		 */
		*off_loc = offnum;

		prstate->processed[offnum] = false;
		prstate->htsv[offnum] = -1;

		/* Nothing to do if slot doesn't contain a tuple */
		if (!ItemIdIsUsed(itemid))
		{
			heap_prune_record_unchanged_lp_unused(page, prstate, offnum);
			continue;
		}

		if (ItemIdIsDead(itemid))
		{
			/*
			 * If the caller set mark_unused_now true, we can set dead line
			 * pointers LP_UNUSED now.
			 */
			if (unlikely(prstate->mark_unused_now))
				heap_prune_record_unused(prstate, offnum, false);
			else
				heap_prune_record_unchanged_lp_dead(page, prstate, offnum);
			continue;
		}

		if (ItemIdIsRedirected(itemid))
		{
			/* This is the start of a HOT chain */
			prstate->root_items[prstate->nroot_items++] = offnum;
			continue;
		}

		Assert(ItemIdIsNormal(itemid));

		/*
		 * Get the tuple's visibility status and queue it up for processing.
		 */
		htup = (HeapTupleHeader) PageGetItem(page, itemid);
		tup.t_data = htup;
		tup.t_len = ItemIdGetLength(itemid);
		ItemPointerSet(&tup.t_self, blockno, offnum);

		prstate->htsv[offnum] = heap_prune_satisfies_vacuum(prstate, &tup,
															buffer);

		if (!HeapTupleHeaderIsHeapOnly(htup))
			prstate->root_items[prstate->nroot_items++] = offnum;
		else
			prstate->heaponly_items[prstate->nheaponly_items++] = offnum;
	}

	/*
	 * Process HOT chains.
	 *
	 * We added the items to the array starting from 'maxoff', so by
	 * processing the array in reverse order, we process the items in
	 * ascending offset number order.  The order doesn't matter for
	 * correctness, but some quick micro-benchmarking suggests that this is
	 * faster.  (Earlier PostgreSQL versions, which scanned all the items on
	 * the page instead of using the root_items array, also did it in
	 * ascending offset number order.)
	 */
	for (int i = prstate->nroot_items - 1; i >= 0; i--)
	{
		offnum = prstate->root_items[i];

		/* Ignore items already processed as part of an earlier chain */
		if (prstate->processed[offnum])
			continue;

		/* see preceding loop */
		*off_loc = offnum;

		/* Process this item or chain of items */
		heap_prune_chain(page, blockno, maxoff, offnum, prstate);
	}

	/*
	 * Process any heap-only tuples that were not already processed as part
	 * of a HOT chain.
	 */
	for (int i = prstate->nheaponly_items - 1; i >= 0; i--)
	{
		offnum = prstate->heaponly_items[i];

		if (prstate->processed[offnum])
			continue;

		/* see preceding loop */
		*off_loc = offnum;

		/*
		 * If the tuple is DEAD and doesn't chain to anything else, mark it
		 * unused.  (If it does chain, we can only remove it as part of
		 * pruning its chain.)
		 *
		 * We need this primarily to handle aborted HOT updates, that is,
		 * XMIN_INVALID heap-only tuples.  Those might not be linked to by
		 * any chain, since the parent tuple might be re-updated before any
		 * pruning occurs.  So we have to be able to reap them separately
		 * from chain-pruning.  (Note that HeapTupleHeaderIsHotUpdated will
		 * never return true for an XMIN_INVALID tuple, so this code will
		 * work even when there were sequential updates within the aborted
		 * transaction.)
		 */
		if (prstate->htsv[offnum] == HEAPTUPLE_DEAD)
		{
			ItemId		itemid = PageGetItemId(page, offnum);
			HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);

			if (likely(!HeapTupleHeaderIsHotUpdated(htup)))
			{
				HeapTupleHeaderAdvanceConflictHorizon(htup,
													  &prstate->latest_xid_removed);
				heap_prune_record_unused(prstate, offnum, true);
			}
			else
			{
				/*
				 * This tuple should've been processed and removed as part of
				 * a HOT chain, so something's wrong.  To preserve evidence,
				 * we don't dare to remove it.  We cannot leave behind a DEAD
				 * tuple either, because that will cause VACUUM to error out.
				 * Throwing an error with a distinct error message seems like
				 * the least bad option.
				 */
				elog(ERROR, "dead heap-only tuple (%u, %d) is not linked to from any HOT chain",
					 blockno, offnum);
			}
		}
		else
			heap_prune_record_unchanged_lp_normal(page, prstate, offnum);
	}

	/* We should now have processed every tuple exactly once */
#ifdef USE_ASSERT_CHECKING
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		*off_loc = offnum;

		Assert(prstate->processed[offnum]);
	}
#endif

	/* Clear the offset information once we have processed the given page. */
	*off_loc = InvalidOffsetNumber;
}

/*
 * Decide whether to proceed with freezing according to the freeze plans
 * prepared for the given heap buffer.  If freezing is chosen, this function
 * performs several pre-freeze checks.
 *
 * The values of do_prune, do_hint_prune, and did_tuple_hint_fpi must be
 * determined before calling this function.
 *
 * prstate is both an input and output parameter.
 *
 * Returns true if we should apply the freeze plans and freeze tuples on the
 * page, and false otherwise.
 */
static bool
heap_page_will_freeze(Relation relation, Buffer buffer,
					  bool did_tuple_hint_fpi,
					  bool do_prune,
					  bool do_hint_prune,
					  PruneState *prstate)
{
	bool		do_freeze = false;

	/*
	 * If the caller specified we should not attempt to freeze any tuples,
	 * validate that everything is in the right state and return.
	 */
	if (!prstate->attempt_freeze)
	{
		Assert(!prstate->all_frozen && prstate->nfrozen == 0);
		Assert(prstate->lpdead_items == 0 || !prstate->all_visible);
		return false;
	}

	if (prstate->pagefrz.freeze_required)
	{
		/*
		 * heap_prepare_freeze_tuple indicated that at least one XID/MXID
		 * from before FreezeLimit/MultiXactCutoff is present.  Must freeze
		 * to advance relfrozenxid/relminmxid.
		 */
		do_freeze = true;
	}
	else
	{
		/*
		 * Opportunistically freeze the page if we are generating an FPI
		 * anyway and if doing so means that we can set the page all-frozen
		 * afterwards (might not happen until VACUUM's final heap pass).
		 *
		 * XXX: Previously, we knew if pruning emitted an FPI by checking
		 * pgWalUsage.wal_fpi before and after pruning.  Once the freeze and
		 * prune records were combined, this heuristic couldn't be used
		 * anymore.  The opportunistic freeze heuristic must be improved;
		 * however, for now, try to approximate the old logic.
		 */
		if (prstate->all_frozen && prstate->nfrozen > 0)
		{
			Assert(prstate->all_visible);

			/*
			 * Freezing would make the page all-frozen.  Have we already
			 * emitted an FPI, or will we do so anyway?
			 */
			if (RelationNeedsWAL(relation))
			{
				if (did_tuple_hint_fpi)
					do_freeze = true;
				else if (do_prune)
				{
					if (XLogCheckBufferNeedsBackup(buffer))
						do_freeze = true;
				}
				else if (do_hint_prune)
				{
					if (XLogHintBitIsNeeded())
						do_freeze = true;
				}
			}
		}
	}

	if (do_freeze)
	{
		/*
		 * Validate the tuples we will be freezing before entering the
		 * critical section.
		 */
		heap_pre_freeze_checks(buffer, prstate->frozen, prstate->nfrozen);

		/*
		 * Calculate what the snapshot conflict horizon should be for a
		 * record freezing tuples.  We can use the visibility_cutoff_xid as
		 * our cutoff for conflicts when the whole page is eligible to become
		 * all-frozen in the VM once we're done with it.  Otherwise, we
		 * generate a conservative cutoff by stepping back from OldestXmin.
		 */
		if (prstate->all_frozen)
			prstate->frz_conflict_horizon = prstate->visibility_cutoff_xid;
		else
		{
			/* Avoids false conflicts when hot_standby_feedback in use */
			prstate->frz_conflict_horizon = prstate->cutoffs->OldestXmin;
			TransactionIdRetreat(prstate->frz_conflict_horizon);
		}
	}
	else if (prstate->nfrozen > 0)
	{
		/*
		 * The page contained some tuples that were not already frozen, and
		 * we chose not to freeze them now.  The page won't be all-frozen
		 * then.
		 */
		Assert(!prstate->pagefrz.freeze_required);

		prstate->all_frozen = false;
		prstate->nfrozen = 0;	/* avoid miscounts in instrumentation */
	}
	else
	{
		/*
		 * We have no freeze plans to execute.  The page might already be
		 * all-frozen (perhaps only following pruning), though.  Such pages
		 * can be marked all-frozen in the VM by our caller, even though none
		 * of its tuples were newly frozen here.
		 */
	}

	return do_freeze;
}
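
/*
 * For illustration, the opportunistic branch above reduces to: freeze only
 * when the page would end up all-frozen with some tuples newly frozen, the
 * relation is WAL-logged, and a full-page image is already implied by one
 * of: a tuple hint-bit FPI emitted during the visibility checks
 * (did_tuple_hint_fpi), the upcoming prune record itself
 * (XLogCheckBufferNeedsBackup), or a WAL-logged hint-only change
 * (XLogHintBitIsNeeded(), e.g. with wal_log_hints or data checksums
 * enabled).
 */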


/*
 * Prune and repair fragmentation and potentially freeze tuples on the
 * specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.  Note that we
 * don't update the FSM information for page on caller's behalf.  Caller
 * might also need to account for a reduction in the length of the line
 * pointer array following array truncation by us.
 *
 * params contains the input parameters used to control freezing and pruning
 * behavior.  See the definition of PruneFreezeParams for more on what each
 * parameter does.
 *
 * If the HEAP_PAGE_PRUNE_FREEZE option is set in params, we will freeze
 * tuples if it's required in order to advance relfrozenxid / relminmxid, or
 * if it's considered advantageous for overall system performance to do so
 * now.  The 'params.cutoffs', 'presult', 'new_relfrozen_xid' and
 * 'new_relmin_mxid' arguments are required when freezing.  When the
 * HEAP_PAGE_PRUNE_FREEZE option is passed, we also set presult->all_visible
 * and presult->all_frozen after determining whether or not to
 * opportunistically freeze, to indicate if the VM bits can be set.  They
 * are always set to false when the HEAP_PAGE_PRUNE_FREEZE option is not
 * passed, because at the moment only callers that also freeze need that
 * information.
 *
 * presult contains output parameters needed by callers, such as the number
 * of tuples removed and the offsets of dead items on the page after
 * pruning.  heap_page_prune_and_freeze() is responsible for initializing
 * it.  Required by all callers.
 *
 * off_loc is the offset location required by the caller to use in error
 * callback.
 *
 * new_relfrozen_xid and new_relmin_mxid must be provided by the caller if
 * the HEAP_PAGE_PRUNE_FREEZE option is set in params.  On entry, they
 * contain the oldest XID and multi-XID seen on the relation so far.  They
 * will be updated with the oldest values present on the page after pruning.
 * After processing the whole relation, VACUUM can use these values as the
 * new relfrozenxid/relminmxid for the relation.
 */
void
heap_page_prune_and_freeze(PruneFreezeParams *params,
						   PruneFreezeResult *presult,
						   OffsetNumber *off_loc,
						   TransactionId *new_relfrozen_xid,
						   MultiXactId *new_relmin_mxid)
{
	Buffer		buffer = params->buffer;
	Page		page = BufferGetPage(buffer);
	PruneState	prstate;
	bool		do_freeze;
	bool		do_prune;
	bool		do_hint_prune;
	bool		did_tuple_hint_fpi;
	int64		fpi_before = pgWalUsage.wal_fpi;

	/* Initialize prstate */
	prune_freeze_setup(params,
					   new_relfrozen_xid ?
					   *new_relfrozen_xid : InvalidTransactionId,
					   new_relmin_mxid ?
					   *new_relmin_mxid : InvalidMultiXactId,
					   presult,
					   &prstate);

	/*
	 * Examine all line pointers and tuple visibility information to
	 * determine which line pointers should change state and which tuples
	 * may be frozen.  Prepare queue of state changes to later be executed
	 * in a critical section.
	 */
	prune_freeze_plan(RelationGetRelid(params->relation),
					  buffer, &prstate, off_loc);

	/*
	 * If checksums are enabled, calling heap_prune_satisfies_vacuum() while
	 * checking tuple visibility information in prune_freeze_plan() may have
	 * caused an FPI to be emitted.
	 */
	did_tuple_hint_fpi = fpi_before != pgWalUsage.wal_fpi;

	do_prune = prstate.nredirected > 0 ||
		prstate.ndead > 0 ||
		prstate.nunused > 0;

	/*
	 * Even if we don't prune anything, if we found a new value for the
	 * pd_prune_xid field or the page was marked full, we will update the
	 * hint bit.
	 */
	do_hint_prune = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
		PageIsFull(page);

	/*
	 * Decide if we want to go ahead with freezing according to the freeze
	 * plans we prepared, or not.
	 */
	do_freeze = heap_page_will_freeze(params->relation, buffer,
									  did_tuple_hint_fpi,
									  do_prune,
									  do_hint_prune,
									  &prstate);

	/*
	 * While scanning the line pointers, we did not clear
	 * all_visible/all_frozen when encountering LP_DEAD items because we
	 * wanted the decision whether or not to freeze the page to be unaffected
	 * by the short-term presence of LP_DEAD items.  These LP_DEAD items are
	 * effectively assumed to be LP_UNUSED items in the making.  It doesn't
	 * matter which vacuum heap pass (initial pass or final pass) ends up
	 * setting the page all-frozen, as long as the ongoing VACUUM does it.
	 *
	 * Now that we finished determining whether or not to freeze the page,
	 * update all_visible and all_frozen so that they reflect the true state
	 * of the page for setting PD_ALL_VISIBLE and VM bits.
	 */
	if (prstate.lpdead_items > 0)
		prstate.all_visible = prstate.all_frozen = false;

	Assert(!prstate.all_frozen || prstate.all_visible);

	/* Any error while applying the changes is critical */
	START_CRIT_SECTION();

	if (do_hint_prune)
	{
		/*
		 * Update the page's pd_prune_xid field to either zero, or the lowest
		 * XID of any soon-prunable tuple.
		 */
		((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

		/*
		 * Also clear the "page is full" flag, since there's no point in
		 * repeating the prune/defrag process until something else happens to
		 * the page.
		 */
		PageClearFull(page);

		/*
		 * If that's all we had to do to the page, this is a non-WAL-logged
		 * hint.  If we are going to freeze or prune the page, we will mark
		 * the buffer dirty below.
		 */
		if (!do_freeze && !do_prune)
			MarkBufferDirtyHint(buffer, true);
	}

	if (do_prune || do_freeze)
	{
		/* Apply the planned item changes and repair page fragmentation. */
		if (do_prune)
		{
			heap_page_prune_execute(buffer, false,
									prstate.redirected, prstate.nredirected,
									prstate.nowdead, prstate.ndead,
									prstate.nowunused, prstate.nunused);
		}

		if (do_freeze)
			heap_freeze_prepared_tuples(buffer, prstate.frozen, prstate.nfrozen);

		MarkBufferDirty(buffer);

		/*
		 * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did
		 */
		if (RelationNeedsWAL(params->relation))
		{
			/*
			 * The snapshotConflictHorizon for the whole record should be the
			 * most conservative of all the horizons calculated for any of
			 * the possible modifications.  If this record will prune tuples,
			 * any transactions on the standby older than the youngest xmax
			 * of the most recently removed tuple this record will prune will
			 * conflict.  If this record will freeze tuples, any transactions
			 * on the standby with xids older than the youngest tuple this
			 * record will freeze will conflict.
			 */
			TransactionId conflict_xid;

			if (TransactionIdFollows(prstate.frz_conflict_horizon,
									 prstate.latest_xid_removed))
				conflict_xid = prstate.frz_conflict_horizon;
			else
				conflict_xid = prstate.latest_xid_removed;

			log_heap_prune_and_freeze(params->relation, buffer,
									  InvalidBuffer,	/* vmbuffer */
									  0,	/* vmflags */
									  conflict_xid,
									  true, params->reason,
									  prstate.frozen, prstate.nfrozen,
									  prstate.redirected, prstate.nredirected,
									  prstate.nowdead, prstate.ndead,
									  prstate.nowunused, prstate.nunused);
		}
	}

	END_CRIT_SECTION();

	/* Copy information back for caller */
	presult->ndeleted = prstate.ndeleted;
	presult->nnewlpdead = prstate.ndead;
	presult->nfrozen = prstate.nfrozen;
	presult->live_tuples = prstate.live_tuples;
	presult->recently_dead_tuples = prstate.recently_dead_tuples;
	presult->all_visible = prstate.all_visible;
	presult->all_frozen = prstate.all_frozen;
	presult->hastup = prstate.hastup;

	/*
	 * For callers planning to update the visibility map, the conflict
	 * horizon for that record must be the newest xmin on the page.  However,
	 * if the page is completely frozen, there can be no conflict and the
	 * vm_conflict_horizon should remain InvalidTransactionId.  This includes
	 * the case that we just froze all the tuples; the prune-freeze record
	 * included the conflict XID already so the caller doesn't need it.
	 */
	if (presult->all_frozen)
		presult->vm_conflict_horizon = InvalidTransactionId;
	else
		presult->vm_conflict_horizon = prstate.visibility_cutoff_xid;

	presult->lpdead_items = prstate.lpdead_items;
	/* the presult->deadoffsets array was already filled in */

	if (prstate.attempt_freeze)
	{
		if (presult->nfrozen > 0)
		{
			*new_relfrozen_xid = prstate.pagefrz.FreezePageRelfrozenXid;
			*new_relmin_mxid = prstate.pagefrz.FreezePageRelminMxid;
		}
		else
		{
			*new_relfrozen_xid = prstate.pagefrz.NoFreezePageRelfrozenXid;
			*new_relmin_mxid = prstate.pagefrz.NoFreezePageRelminMxid;
		}
	}
}


/*
 * Perform visibility checks for heap pruning.
 */
static HTSV_Result
heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
{
	HTSV_Result res;
	TransactionId dead_after;

	res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);

	if (res != HEAPTUPLE_RECENTLY_DEAD)
		return res;

	/*
	 * For VACUUM, we must be sure to prune tuples with xmax older than
	 * OldestXmin -- a visibility cutoff determined at the beginning of
	 * vacuuming the relation.  OldestXmin is used for freezing determination
	 * and we cannot freeze dead tuples' xmaxes.
	 */
	if (prstate->cutoffs &&
		TransactionIdIsValid(prstate->cutoffs->OldestXmin) &&
		NormalTransactionIdPrecedes(dead_after, prstate->cutoffs->OldestXmin))
		return HEAPTUPLE_DEAD;

	/*
	 * Determine whether or not the tuple is considered dead when compared
	 * with the provided GlobalVisState.  On-access pruning does not provide
	 * VacuumCutoffs.  And for vacuum, even if the tuple's xmax is not older
	 * than OldestXmin, GlobalVisTestIsRemovableXid() could find the row dead
	 * if the GlobalVisState has been updated since the beginning of
	 * vacuuming the relation.
	 */
	if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
		return HEAPTUPLE_DEAD;

	return res;
}
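
/*
 * Worked example with illustrative numbers: suppose a tuple's deleting
 * transaction committed at XID 900, VACUUM computed OldestXmin = 1000, and
 * the GlobalVisState horizon still lags at 890.
 * HeapTupleSatisfiesVacuumHorizon() reports HEAPTUPLE_RECENTLY_DEAD with
 * dead_after = 900; the OldestXmin test above then promotes the result to
 * HEAPTUPLE_DEAD, even though the GlobalVisTestIsRemovableXid() test alone
 * would not (yet) allow removal.
 */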


/*
 * Pruning calculates tuple visibility once and saves the results in an
 * array of int8.  See PruneState.htsv for details.  This helper function is
 * meant to guard against examining visibility status array members which
 * have not yet been computed.
 */
static inline HTSV_Result
htsv_get_valid_status(int status)
{
	Assert(status >= HEAPTUPLE_DEAD &&
		   status <= HEAPTUPLE_DELETE_IN_PROGRESS);
	return (HTSV_Result) status;
}

/*
 * Prune specified line pointer or a HOT chain originating at line pointer.
 *
 * Tuple visibility information is provided in prstate->htsv.
 *
 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
 * the HOT chain is pruned by removing all DEAD tuples at the start of the
 * HOT chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD
 * tuple.  This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple
 * is really DEAD, our visibility test is just too coarse to detect it.
 *
 * Pruning must never leave behind a DEAD tuple that still has tuple
 * storage.  VACUUM isn't prepared to deal with that case.
 *
 * The root line pointer is redirected to the tuple immediately after the
 * latest DEAD tuple.  If all tuples in the chain are DEAD, the root line
 * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
 * tuple, which we treat as a chain of length 1.)
 *
 * We don't actually change the page here.  We just add entries to the
 * arrays in prstate showing the changes to be made.  Items to be redirected
 * are added to the redirected[] array (two entries per redirection); items
 * to be set to LP_DEAD state are added to nowdead[]; and items to be set to
 * LP_UNUSED state are added to nowunused[].  We perform bookkeeping of live
 * tuples, visibility etc. based on what the page will look like after the
 * changes applied.  All that bookkeeping is performed in the
 * heap_prune_record_*() subroutines.  The division of labor is that
 * heap_prune_chain() decides the fate of each tuple, ie. whether it's going
 * to be removed, redirected or left unchanged, and the
 * heap_prune_record_*() subroutines update PruneState based on that
 * outcome.
 */
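
/*
 * For example (illustrative offsets): given a chain 1 -> 2 -> 3 whose root
 * at offset 1 is a normal tuple, with HTSV results 1 = DEAD, 2 = DEAD and
 * 3 = LIVE, the loop below ends with nchain = 3 and ndeadchain = 2; the
 * process_chain code then redirects line pointer 1 to offset 3 and marks
 * item 2 LP_UNUSED.
 */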
static void
heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff,
				 OffsetNumber rootoffnum, PruneState *prstate)
{
	TransactionId priorXmax = InvalidTransactionId;
	ItemId		rootlp;
	OffsetNumber offnum;
	OffsetNumber chainitems[MaxHeapTuplesPerPage];

	/*
	 * After traversing the HOT chain, ndeadchain is the index in chainitems
	 * of the first live successor after the last dead item.
	 */
	int			ndeadchain = 0,
				nchain = 0;

	rootlp = PageGetItemId(page, rootoffnum);

	/* Start from the root tuple */
	offnum = rootoffnum;

	/* while not end of the chain */
	for (;;)
	{
		HeapTupleHeader htup;
		ItemId		lp;

		/* Sanity check (pure paranoia) */
		if (offnum < FirstOffsetNumber)
			break;

		/*
		 * An offset past the end of page's line pointer array is possible
		 * when the array was truncated (original item must have been unused)
		 */
		if (offnum > maxoff)
			break;

		/* If item is already processed, stop --- it must not be same chain */
		if (prstate->processed[offnum])
			break;

		lp = PageGetItemId(page, offnum);

		/*
		 * Unused item obviously isn't part of the chain.  Likewise, a dead
		 * line pointer can't be part of the chain.  Both of those cases were
		 * already marked as processed.
		 */
		Assert(ItemIdIsUsed(lp));
		Assert(!ItemIdIsDead(lp));

		/*
		 * If we are looking at the redirected root line pointer, jump to the
		 * first normal tuple in the chain.  If we find a redirect somewhere
		 * else, stop --- it must not be same chain.
		 */
		if (ItemIdIsRedirected(lp))
		{
			if (nchain > 0)
				break;			/* not at start of chain */
			chainitems[nchain++] = offnum;
			offnum = ItemIdGetRedirect(rootlp);
			continue;
		}

		Assert(ItemIdIsNormal(lp));

		htup = (HeapTupleHeader) PageGetItem(page, lp);

		/*
		 * Check the tuple XMIN against prior XMAX, if any
		 */
		if (TransactionIdIsValid(priorXmax) &&
			!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
			break;

		/*
		 * OK, this tuple is indeed a member of the chain.
		 */
		chainitems[nchain++] = offnum;

		switch (htsv_get_valid_status(prstate->htsv[offnum]))
		{
			case HEAPTUPLE_DEAD:

				/* Remember the last DEAD tuple seen */
				ndeadchain = nchain;
				HeapTupleHeaderAdvanceConflictHorizon(htup,
													  &prstate->latest_xid_removed);
				/* Advance to next chain member */
				break;

			case HEAPTUPLE_RECENTLY_DEAD:

				/*
				 * We don't need to advance the conflict horizon for
				 * RECENTLY_DEAD tuples, even if we are removing them.  This
				 * is because we only remove RECENTLY_DEAD tuples if they
				 * precede a DEAD tuple, and the DEAD tuple must have been
				 * inserted by a newer transaction than the RECENTLY_DEAD
				 * tuple by virtue of being later in the chain.  We will have
				 * advanced the conflict horizon for the DEAD tuple.
				 */

				/*
				 * Advance past RECENTLY_DEAD tuples just in case there's a
				 * DEAD one after them.  We have to make sure that we don't
				 * miss any DEAD tuples, since DEAD tuples that still have
				 * tuple storage after pruning will confuse VACUUM.
				 */
				break;

			case HEAPTUPLE_DELETE_IN_PROGRESS:
			case HEAPTUPLE_LIVE:
			case HEAPTUPLE_INSERT_IN_PROGRESS:
				goto process_chain;

			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
				goto process_chain;
		}

		/*
		 * If the tuple is not HOT-updated, then we are at the end of this
		 * HOT-update chain.
		 */
		if (!HeapTupleHeaderIsHotUpdated(htup))
			goto process_chain;

		/* HOT implies it can't have moved to different partition */
		Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

		/*
		 * Advance to next chain member.
		 */
		Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blockno);
		offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
		priorXmax = HeapTupleHeaderGetUpdateXid(htup);
	}

	if (ItemIdIsRedirected(rootlp) && nchain < 2)
	{
		/*
		 * We found a redirect item that doesn't point to a valid follow-on
		 * item.  This can happen if the loop in heap_page_prune_and_freeze()
		 * caused us to visit the dead successor of a redirect item before
		 * visiting the redirect item.  We can clean up by setting the
		 * redirect item to LP_DEAD state or LP_UNUSED if the caller
		 * indicated.
		 */
		heap_prune_record_dead_or_unused(prstate, rootoffnum, false);
		return;
	}

process_chain:

	if (ndeadchain == 0)
	{
		/*
		 * No DEAD tuple was found, so the chain is entirely composed of
		 * normal, unchanged tuples.  Leave it alone.
		 */
		int			i = 0;

		if (ItemIdIsRedirected(rootlp))
		{
			heap_prune_record_unchanged_lp_redirect(prstate, rootoffnum);
			i++;
		}
		for (; i < nchain; i++)
			heap_prune_record_unchanged_lp_normal(page, prstate, chainitems[i]);
	}
	else if (ndeadchain == nchain)
	{
		/*
		 * The entire chain is dead.  Mark the root line pointer LP_DEAD, and
		 * fully remove the other tuples in the chain.
		 */
		heap_prune_record_dead_or_unused(prstate, rootoffnum, ItemIdIsNormal(rootlp));
		for (int i = 1; i < nchain; i++)
			heap_prune_record_unused(prstate, chainitems[i], true);
	}
	else
	{
		/*
		 * We found a DEAD tuple in the chain.  Redirect the root line
		 * pointer to the first non-DEAD tuple, and mark as unused each
		 * intermediate item that we are able to remove from the chain.
		 */
		heap_prune_record_redirect(prstate, rootoffnum, chainitems[ndeadchain],
								   ItemIdIsNormal(rootlp));
		for (int i = 1; i < ndeadchain; i++)
			heap_prune_record_unused(prstate, chainitems[i], true);

		/* the rest of tuples in the chain are normal, unchanged tuples */
		for (int i = ndeadchain; i < nchain; i++)
			heap_prune_record_unchanged_lp_normal(page, prstate, chainitems[i]);
	}
}

/* Record lowest soon-prunable XID */
static void
heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
{
	/*
	 * This should exactly match the PageSetPrunable macro.  We can't store
	 * directly into the page header yet, so we update working state.
	 */
	Assert(TransactionIdIsNormal(xid));
	if (!TransactionIdIsValid(prstate->new_prune_xid) ||
		TransactionIdPrecedes(xid, prstate->new_prune_xid))
		prstate->new_prune_xid = xid;
}

/* Record line pointer to be redirected */
static void
heap_prune_record_redirect(PruneState *prstate,
						   OffsetNumber offnum, OffsetNumber rdoffnum,
						   bool was_normal)
{
	Assert(!prstate->processed[offnum]);
	prstate->processed[offnum] = true;

	/*
	 * Do not mark the redirect target here.  It needs to be counted
	 * separately as an unchanged tuple.
	 */

	Assert(prstate->nredirected < MaxHeapTuplesPerPage);
	prstate->redirected[prstate->nredirected * 2] = offnum;
	prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;

	prstate->nredirected++;

	/*
	 * If the root entry had been a normal tuple, we are deleting it, so
	 * count it in the result.  But changing a redirect (even to DEAD state)
	 * doesn't count.
	 */
	if (was_normal)
		prstate->ndeleted++;

	prstate->hastup = true;
}

/* Record line pointer to be marked dead */
static void
heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
					   bool was_normal)
{
	Assert(!prstate->processed[offnum]);
	prstate->processed[offnum] = true;

	Assert(prstate->ndead < MaxHeapTuplesPerPage);
	prstate->nowdead[prstate->ndead] = offnum;
	prstate->ndead++;

	/*
	 * Deliberately delay unsetting all_visible and all_frozen until later
	 * during pruning.  Removable dead tuples shouldn't preclude freezing the
	 * page.
	 */

	/* Record the dead offset for vacuum */
	prstate->deadoffsets[prstate->lpdead_items++] = offnum;

	/*
	 * If the root entry had been a normal tuple, we are deleting it, so
	 * count it in the result.  But changing a redirect (even to DEAD state)
	 * doesn't count.
	 */
	if (was_normal)
		prstate->ndeleted++;
}

/*
 * Depending on whether or not the caller set mark_unused_now to true, record
 * that a line pointer should be marked LP_DEAD or LP_UNUSED.  There are
 * other cases in which we will mark line pointers LP_UNUSED, but we will not
 * mark line pointers LP_DEAD if mark_unused_now is true.
 */
static void
heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
								 bool was_normal)
{
	/*
	 * If the caller set mark_unused_now to true, we can remove dead tuples
	 * during pruning instead of marking their line pointers dead.  Set this
	 * tuple's line pointer LP_UNUSED.  We hint that this option is less
	 * likely.
	 */
	if (unlikely(prstate->mark_unused_now))
		heap_prune_record_unused(prstate, offnum, was_normal);
	else
		heap_prune_record_dead(prstate, offnum, was_normal);
}

/* Record line pointer to be marked unused */
static void
heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal)
{
	Assert(!prstate->processed[offnum]);
	prstate->processed[offnum] = true;

	Assert(prstate->nunused < MaxHeapTuplesPerPage);
	prstate->nowunused[prstate->nunused] = offnum;
	prstate->nunused++;

	/*
	 * If the root entry had been a normal tuple, we are deleting it, so
	 * count it in the result.  But changing a redirect (even to DEAD state)
	 * doesn't count.
	 */
	if (was_normal)
		prstate->ndeleted++;
}

/*
 * Record an unused line pointer that is left unchanged.
 */
static void
heap_prune_record_unchanged_lp_unused(Page page, PruneState *prstate, OffsetNumber offnum)
{
	Assert(!prstate->processed[offnum]);
	prstate->processed[offnum] = true;
}

/*
 * Record line pointer that is left unchanged.  We consider freezing it, and
 * update bookkeeping of tuple counts and page visibility.
 */
static void
heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumber offnum)
{
	HeapTupleHeader htup;

	Assert(!prstate->processed[offnum]);
	prstate->processed[offnum] = true;

	prstate->hastup = true;		/* the page is not empty */

	/*
	 * The criteria for counting a tuple as live in this block need to match
	 * what analyze.c's acquire_sample_rows() does, otherwise VACUUM and
	 * ANALYZE may produce wildly different reltuples values, e.g. when there
	 * are many recently-dead tuples.
	 *
	 * The logic here is a bit simpler than acquire_sample_rows(), as VACUUM
	 * can't run inside a transaction block, which makes some cases
	 * impossible (e.g. in-progress insert from the same transaction).
	 *
	 * HEAPTUPLE_DEAD are handled by the other heap_prune_record_*()
	 * subroutines.  They don't count dead items like acquire_sample_rows()
	 * does, because we assume that all dead items will become LP_UNUSED
	 * before VACUUM finishes.  This difference is only superficial.  VACUUM
	 * effectively agrees with ANALYZE about DEAD items, in the end.  VACUUM
	 * won't remember LP_DEAD items, but only because they're not supposed
	 * to be left behind when it is done.  (Cases where we bypass index
	 * vacuuming will violate this optimistic assumption, but the overall
	 * impact of that should be negligible.)
	 */
	htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offnum));

	switch (prstate->htsv[offnum])
	{
		case HEAPTUPLE_LIVE:

			/*
			 * Count it as live.  Not only is this natural, but it's also
			 * what acquire_sample_rows() does.
			 */
			prstate->live_tuples++;

			/*
			 * Is the tuple definitely visible to all transactions?
			 *
			 * NB: Like with per-tuple hint bits, we can't set the
			 * PD_ALL_VISIBLE flag if the inserter committed asynchronously.
			 * See SetHintBits for more info.  Check that the tuple is hinted
			 * xmin-committed because of that.
			 */
			if (prstate->all_visible)
			{
				TransactionId xmin;

				if (!HeapTupleHeaderXminCommitted(htup))
				{
					prstate->all_visible = false;
					prstate->all_frozen = false;
					break;
				}

				/*
				 * The inserter definitely committed.  But is it old enough
				 * that everyone sees it as committed?  A FrozenTransactionId
				 * is seen as committed to everyone.  Otherwise, we check if
				 * there is a snapshot that considers this xid to still be
				 * running, and if so, we don't consider the page
				 * all-visible.
				 */
				xmin = HeapTupleHeaderGetXmin(htup);

				/*
				 * For now always use prstate->cutoffs for this test, because
				 * we only update 'all_visible' and 'all_frozen' when
				 * freezing is requested.  We could use
				 * GlobalVisTestIsRemovableXid instead, if a non-freezing
				 * caller wanted to set the VM bit.
				 */
				Assert(prstate->cutoffs);
				if (!TransactionIdPrecedes(xmin, prstate->cutoffs->OldestXmin))
				{
					prstate->all_visible = false;
					prstate->all_frozen = false;
					break;
				}

				/* Track newest xmin on page. */
				if (TransactionIdFollows(xmin, prstate->visibility_cutoff_xid) &&
					TransactionIdIsNormal(xmin))
					prstate->visibility_cutoff_xid = xmin;
			}
			break;

		case HEAPTUPLE_RECENTLY_DEAD:
			prstate->recently_dead_tuples++;
			prstate->all_visible = false;
			prstate->all_frozen = false;

			/*
			 * This tuple will soon become DEAD.  Update the hint field so
			 * that the page is reconsidered for pruning in future.
			 */
			heap_prune_record_prunable(prstate,
									   HeapTupleHeaderGetUpdateXid(htup));
			break;

		case HEAPTUPLE_INSERT_IN_PROGRESS:

			/*
			 * We do not count these rows as live, because we expect the
			 * inserting transaction to update the counters at commit, and we
			 * assume that will happen only after we report our results.
			 * This assumption is a bit shaky, but it is what
			 * acquire_sample_rows() does, so be consistent.
			 */
			prstate->all_visible = false;
			prstate->all_frozen = false;

			/*
			 * If we wanted to optimize for aborts, we might consider marking
			 * the page prunable when we see INSERT_IN_PROGRESS.  But we
			 * don't.  See related decisions about when to mark the page
			 * prunable in heapam.c.
			 */
			break;

		case HEAPTUPLE_DELETE_IN_PROGRESS:

			/*
			 * This is an expected case during concurrent vacuum.  Count such
			 * rows as live.  As above, we assume the deleting transaction
			 * will commit and update the counters after we report.
			 */
			prstate->live_tuples++;
			prstate->all_visible = false;
			prstate->all_frozen = false;

			/*
			 * This tuple may soon become DEAD.  Update the hint field so
			 * that the page is reconsidered for pruning in future.
			 */
			heap_prune_record_prunable(prstate,
									   HeapTupleHeaderGetUpdateXid(htup));
			break;

		default:

			/*
			 * DEAD tuples should've been passed to heap_prune_record_dead()
			 * or heap_prune_record_unused() instead.
			 */
			elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result %d",
				 prstate->htsv[offnum]);
			break;
	}

	/* Consider freezing any normal tuples which will not be removed */
	if (prstate->attempt_freeze)
	{
		bool		totally_frozen;

		if ((heap_prepare_freeze_tuple(htup,
									   prstate->cutoffs,
									   &prstate->pagefrz,
									   &prstate->frozen[prstate->nfrozen],
									   &totally_frozen)))
		{
			/* Save prepared freeze plan for later */
			prstate->frozen[prstate->nfrozen++].offset = offnum;
		}

		/*
		 * If any tuple isn't either totally frozen already or eligible to
		 * become totally frozen (according to its freeze plan), then the
		 * page definitely cannot be set all-frozen in the visibility map
		 * later on.
		 */
		if (!totally_frozen)
			prstate->all_frozen = false;
	}
}

/*
 * Record line pointer that was already LP_DEAD and is left unchanged.
 */
static void
heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber offnum)
{
	Assert(!prstate->processed[offnum]);
	prstate->processed[offnum] = true;

	/*
	 * Deliberately don't set hastup for LP_DEAD items.  We make the soft
	 * assumption that any LP_DEAD items encountered here will become
	 * LP_UNUSED later on, before count_nondeletable_pages is reached.  If we
	 * don't make this assumption then rel truncation will only happen every
	 * other VACUUM, at most.  Besides, VACUUM must treat
	 * hastup/nonempty_pages as provisional no matter how LP_DEAD items are
	 * handled (handled here, or handled later on).
	 *
	 * Similarly, don't unset all_visible and all_frozen until later, at the
	 * end of heap_page_prune_and_freeze().  This will allow us to attempt to
	 * freeze the page after pruning.  As long as we unset it before updating
	 * the visibility map, this will be correct.
	 */

	/* Record the dead offset for vacuum */
	prstate->deadoffsets[prstate->lpdead_items++] = offnum;
}

/*
 * Record LP_REDIRECT that is left unchanged.
 */
static void
heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum)
{
	/*
	 * A redirect line pointer doesn't count as a live tuple.
	 *
	 * If we leave a redirect line pointer in place, there will be another
	 * tuple on the page that it points to.  We will do the bookkeeping for
	 * that separately.  So we have nothing to do here, except remember that
	 * we processed this item.
	 */
	Assert(!prstate->processed[offnum]);
	prstate->processed[offnum] = true;
}

/*
 * Perform the actual page changes needed by heap_page_prune_and_freeze().
 *
 * If 'lp_truncate_only' is set, we are merely marking LP_DEAD line pointers
 * as unused, not redirecting or removing anything else.  The
 * PageRepairFragmentation() call is skipped in that case.
 *
 * If 'lp_truncate_only' is not set, the caller must hold a cleanup lock on
 * the buffer.  If it is set, an ordinary exclusive lock suffices.
 */
1664void
1665heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
1666 OffsetNumber *redirected, int nredirected,
1667 OffsetNumber *nowdead, int ndead,
1668 OffsetNumber *nowunused, int nunused)
1669{
1670 Page page = BufferGetPage(buffer);
1671 OffsetNumber *offnum;
1673
1674 /* Shouldn't be called unless there's something to do */
1675 Assert(nredirected > 0 || ndead > 0 || nunused > 0);
1676
1677 /* If 'lp_truncate_only', we can only remove already-dead line pointers */
1678 Assert(!lp_truncate_only || (nredirected == 0 && ndead == 0));
1679
1680 /* Update all redirected line pointers */
1681 offnum = redirected;
1682 for (int i = 0; i < nredirected; i++)
1683 {
1684 OffsetNumber fromoff = *offnum++;
1685 OffsetNumber tooff = *offnum++;
1686 ItemId fromlp = PageGetItemId(page, fromoff);
1688
1689#ifdef USE_ASSERT_CHECKING
1690
1691 /*
1692 * Any existing item that we set as an LP_REDIRECT (any 'from' item)
1693 * must be the first item from a HOT chain. If the item has tuple
1694 * storage then it can't be a heap-only tuple. Otherwise we are just
1695 * maintaining an existing LP_REDIRECT from an existing HOT chain that
1696 * has been pruned at least once before now.
1697 */
1698 if (!ItemIdIsRedirected(fromlp))
1699 {
1700 Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));
1701
1702 htup = (HeapTupleHeader) PageGetItem(page, fromlp);
1704 }
1705 else
1706 {
1707 /* We shouldn't need to redundantly set the redirect */
1708 Assert(ItemIdGetRedirect(fromlp) != tooff);
1709 }
1710
1711 /*
1712 * The item that we're about to set as an LP_REDIRECT (the 'from'
1713 * item) will point to an existing item (the 'to' item) that is
1714 * already a heap-only tuple. There can be at most one LP_REDIRECT
1715 * item per HOT chain.
1716 *
1717 * We need to keep around an LP_REDIRECT item (after original
1718 * non-heap-only root tuple gets pruned away) so that it's always
1719 * possible for VACUUM to easily figure out what TID to delete from
1720 * indexes when an entire HOT chain becomes dead. A heap-only tuple
1721 * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
1722 * tuple can.
1723 *
1724 * This check may miss problems, e.g. the target of a redirect could
1725 * be marked as unused subsequently. The page_verify_redirects() check
1726 * below will catch such problems.
1727 */
1728 tolp = PageGetItemId(page, tooff);
1729 Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
1730 htup = (HeapTupleHeader) PageGetItem(page, tolp);
1731 Assert(HeapTupleHeaderIsHeapOnly(htup));
1732#endif
1733
1734 ItemIdSetRedirect(fromlp, tooff);
1735 }
1736
1737 /* Update all now-dead line pointers */
1738 offnum = nowdead;
1739 for (int i = 0; i < ndead; i++)
1740 {
1741 OffsetNumber off = *offnum++;
1742 ItemId lp = PageGetItemId(page, off);
1743
1744#ifdef USE_ASSERT_CHECKING
1745
1746 /*
1747 * An LP_DEAD line pointer must be left behind when the original item
1748 * (which is dead to everybody) could still be referenced by a TID in
1749 * an index. This should never be necessary with any individual
1750 * heap-only tuple item, though. (It's not clear how much of a problem
1751 * that would be, but there is no reason to allow it.)
1752 */
1753 if (ItemIdHasStorage(lp))
1754 {
1755 Assert(ItemIdIsNormal(lp));
1756 htup = (HeapTupleHeader) PageGetItem(page, lp);
1757 Assert(!HeapTupleHeaderIsHeapOnly(htup));
1758 }
1759 else
1760 {
1761 /* Whole HOT chain becomes dead */
1762 Assert(ItemIdIsRedirected(lp));
1763 }
1764#endif
1765
1766 ItemIdSetDead(lp);
1767 }
1768
1769 /* Update all now-unused line pointers */
1770 offnum = nowunused;
1771 for (int i = 0; i < nunused; i++)
1772 {
1773 OffsetNumber off = *offnum++;
1774 ItemId lp = PageGetItemId(page, off);
1775
1776#ifdef USE_ASSERT_CHECKING
1777
1778 if (lp_truncate_only)
1779 {
1780 /* Setting LP_DEAD to LP_UNUSED in vacuum's second pass */
1781 Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
1782 }
1783 else
1784 {
1785 /*
1786 * When heap_page_prune_and_freeze() was called, mark_unused_now
1787 * may have been passed as true, which allows would-be LP_DEAD
1788 * items to be made LP_UNUSED instead. This is only possible if
1789 * the relation has no indexes. If there are any dead items, then
1790 * mark_unused_now was not true and every item being marked
1791 * LP_UNUSED must refer to a heap-only tuple.
1792 */
1793 if (ndead > 0)
1794 {
1795 Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
1796 htup = (HeapTupleHeader) PageGetItem(page, lp);
1797 Assert(HeapTupleHeaderIsHeapOnly(htup));
1798 }
1799 else
1800 Assert(ItemIdIsUsed(lp));
1801 }
1802
1803#endif
1804
1805 ItemIdSetUnused(lp);
1806 }
1807
1808 if (lp_truncate_only)
1809 PageTruncateLinePointerArray(page);
1810 else
1811 {
1812 /*
1813 * Finally, repair any fragmentation, and update the page's hint bit
1814 * about whether it has free pointers.
1815 */
1816 PageRepairFragmentation(page);
1817
1818 /*
1819 * Now that the page has been modified, assert that redirect items
1820 * still point to valid targets.
1821 */
1822 page_verify_redirects(page);
1823 }
1824}
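/*
 * Illustrative sketch, not part of pruneheap.c: driving
 * heap_page_prune_execute() directly. Note that 'redirected' holds
 * (from, to) offset number pairs, so that array is 2 * nredirected entries
 * long, while 'nowdead' and 'nowunused' are plain offset arrays. The
 * offsets below are made up; a real caller (heap_page_prune_and_freeze())
 * computes them while scanning the page. Guarded by a hypothetical
 * PRUNEHEAP_EXAMPLE macro.
 */
#ifdef PRUNEHEAP_EXAMPLE
static void
example_prune_execute(Buffer buffer)
{
	OffsetNumber redirected[2] = {1, 3};	/* redirect item 1 -> item 3 */
	OffsetNumber nowdead[1] = {2};			/* mark item 2 LP_DEAD */
	OffsetNumber nowunused[1] = {4};		/* mark item 4 LP_UNUSED */

	/* lp_truncate_only is false, so the caller must hold a cleanup lock */
	heap_page_prune_execute(buffer, false,
							redirected, 1,
							nowdead, 1,
							nowunused, 1);
}
#endif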
1825
1826
1827/*
1828 * If built with assertions, verify that all LP_REDIRECT items point to a
1829 * valid item.
1830 *
1831 * One way that bugs related to HOT pruning show up is as redirect items
1832 * pointing to removed tuples. It's not trivial to reliably check that marking an item
1833 * unused will not orphan a redirect item during heap_prune_chain() /
1834 * heap_page_prune_execute(), so we additionally check the whole page after
1835 * pruning. Without this check such bugs would typically only cause asserts
1836 * later, potentially well after the corruption has been introduced.
1837 *
1838 * Also check comments in heap_page_prune_execute()'s redirection loop.
1839 */
1840static void
1841 page_verify_redirects(Page page)
1842 {
1843#ifdef USE_ASSERT_CHECKING
1844 OffsetNumber offnum;
1845 OffsetNumber maxoff;
1846
1847 maxoff = PageGetMaxOffsetNumber(page);
1848 for (offnum = FirstOffsetNumber;
1849 offnum <= maxoff;
1850 offnum = OffsetNumberNext(offnum))
1851 {
1852 ItemId itemid = PageGetItemId(page, offnum);
1853 OffsetNumber targoff;
1854 ItemId targitem;
1855 HeapTupleHeader htup;
1856
1857 if (!ItemIdIsRedirected(itemid))
1858 continue;
1859
1860 targoff = ItemIdGetRedirect(itemid);
1861 targitem = PageGetItemId(page, targoff);
1862
1863 Assert(ItemIdIsUsed(targitem));
1864 Assert(ItemIdIsNormal(targitem));
1865 Assert(ItemIdHasStorage(targitem));
1866 htup = (HeapTupleHeader) PageGetItem(page, targitem);
1867 Assert(HeapTupleHeaderIsHeapOnly(htup));
1868 }
1869#endif
1870}
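/*
 * Illustrative sketch, not part of pruneheap.c, of the corruption this
 * check is meant to catch. If a faulty prune ever marked the target of a
 * redirect unused, the page could end up looking like:
 *
 *		item 1: LP_REDIRECT -> 2
 *		item 2: LP_UNUSED			<- orphaned redirect target
 *
 * and the Asserts above would then fail on item 2's ItemIdIsUsed() check
 * immediately after the faulty prune, rather than at some much later
 * access to the page.
 */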
1871
1872
1873/*
1874 * For all items in this page, find their respective root line pointers.
1875 * If item k is part of a HOT-chain with root at item j, then we set
1876 * root_offsets[k - 1] = j.
1877 *
1878 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
1879 * Unused entries are filled with InvalidOffsetNumber (zero).
1880 *
1881 * The function must be called with at least share lock on the buffer, to
1882 * prevent concurrent prune operations.
1883 *
1884 * Note: The information collected here is valid only as long as the caller
1885 * holds a pin on the buffer. Once the pin is released, a tuple might be
1886 * pruned away and its line pointer reused for a completely unrelated tuple.
1887 */
1888void
1889 heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
1890 {
1891 OffsetNumber offnum,
1892 maxoff;
1893
1894 MemSet(root_offsets, InvalidOffsetNumber,
1895 MaxHeapTuplesPerPage * sizeof(OffsetNumber));
1896
1897 maxoff = PageGetMaxOffsetNumber(page);
1898 for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
1899 {
1900 ItemId lp = PageGetItemId(page, offnum);
1901 HeapTupleHeader htup;
1902 OffsetNumber nextoffnum;
1903 TransactionId priorXmax;
1904
1905 /* skip unused and dead items */
1906 if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
1907 continue;
1908
1909 if (ItemIdIsNormal(lp))
1910 {
1911 htup = (HeapTupleHeader) PageGetItem(page, lp);
1912
1913 /*
1914 * Check if this tuple is part of a HOT-chain rooted at some other
1915 * tuple. If so, skip it for now; we'll process it when we find
1916 * its root.
1917 */
1918 if (HeapTupleHeaderIsHeapOnly(htup))
1919 continue;
1920
1921 /*
1922 * This is either a plain tuple or the root of a HOT-chain.
1923 * Remember it in the mapping.
1924 */
1925 root_offsets[offnum - 1] = offnum;
1926
1927 /* If it's not the start of a HOT-chain, we're done with it */
1928 if (!HeapTupleHeaderIsHotUpdated(htup))
1929 continue;
1930
1931 /* Set up to scan the HOT-chain */
1932 nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
1933 priorXmax = HeapTupleHeaderGetUpdateXid(htup);
1934 }
1935 else
1936 {
1937 /* Must be a redirect item. We do not set its root_offsets entry */
1938 Assert(ItemIdIsRedirected(lp));
1939 /* Set up to scan the HOT-chain */
1940 nextoffnum = ItemIdGetRedirect(lp);
1941 priorXmax = InvalidTransactionId;
1942 }
1943
1944 /*
1945 * Now follow the HOT-chain and collect other tuples in the chain.
1946 *
1947 * Note: Even though this is a nested loop, the complexity of the
1948 * function is O(N) because each tuple on the page is visited at most
1949 * twice: once in the outer loop and once while following HOT-chain
1950 * links.
1951 */
1952 for (;;)
1953 {
1954 /* Sanity check (pure paranoia) */
1955 if (nextoffnum < FirstOffsetNumber)
1956 break;
1957
1958 /*
1959 * An offset past the end of page's line pointer array is possible
1960 * when the array was truncated
1961 */
1962 if (nextoffnum > maxoff)
1963 break;
1964
1965 lp = PageGetItemId(page, nextoffnum);
1966
1967 /* Check for broken chains */
1968 if (!ItemIdIsNormal(lp))
1969 break;
1970
1971 htup = (HeapTupleHeader) PageGetItem(page, lp);
1972
1973 if (TransactionIdIsValid(priorXmax) &&
1974 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
1975 break;
1976
1977 /* Remember the root line pointer for this item */
1978 root_offsets[nextoffnum - 1] = offnum;
1979
1980 /* Advance to next chain member, if any */
1981 if (!HeapTupleHeaderIsHotUpdated(htup))
1982 break;
1983
1984 /* HOT implies it can't have moved to different partition */
1985 Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
1986
1987 nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
1988 priorXmax = HeapTupleHeaderGetUpdateXid(htup);
1989 }
1990 }
1991}
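/*
 * Illustrative sketch, not part of pruneheap.c: consuming the mapping built
 * above. root_offsets is indexed by offnum - 1 and unused entries are
 * InvalidOffsetNumber, per the header comment; the caller must keep the
 * buffer pinned (and hold at least a share lock to fend off concurrent
 * pruning) while the mapping is used. Guarded by a hypothetical
 * PRUNEHEAP_EXAMPLE macro.
 */
#ifdef PRUNEHEAP_EXAMPLE
static OffsetNumber
example_root_of(Page page, OffsetNumber offnum)
{
	OffsetNumber root_offsets[MaxHeapTuplesPerPage];

	heap_get_root_tuples(page, root_offsets);

	/* InvalidOffsetNumber: item is unused, dead, or a redirect item itself */
	return root_offsets[offnum - 1];
}
#endif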
1992
1993
1994/*
1995 * Compare the fields that describe the actions required to freeze a tuple
1996 * with the caller's open plan. If everything matches then the frz tuple's
1997 * plan is equivalent to the caller's plan.
1998 */
1999static inline bool
2000 heap_log_freeze_eq(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
2001 {
2002 if (plan->xmax == frz->xmax &&
2003 plan->t_infomask2 == frz->t_infomask2 &&
2004 plan->t_infomask == frz->t_infomask &&
2005 plan->frzflags == frz->frzflags)
2006 return true;
2007
2008 /* Caller must call heap_log_freeze_new_plan again for frz */
2009 return false;
2010}
2011
2012/*
2013 * Comparator used to deduplicate the freeze plans used in WAL records.
2014 */
2015static int
2016heap_log_freeze_cmp(const void *arg1, const void *arg2)
2017{
2018 HeapTupleFreeze *frz1 = (HeapTupleFreeze *) arg1;
2019 HeapTupleFreeze *frz2 = (HeapTupleFreeze *) arg2;
2020
2021 if (frz1->xmax < frz2->xmax)
2022 return -1;
2023 else if (frz1->xmax > frz2->xmax)
2024 return 1;
2025
2026 if (frz1->t_infomask2 < frz2->t_infomask2)
2027 return -1;
2028 else if (frz1->t_infomask2 > frz2->t_infomask2)
2029 return 1;
2030
2031 if (frz1->t_infomask < frz2->t_infomask)
2032 return -1;
2033 else if (frz1->t_infomask > frz2->t_infomask)
2034 return 1;
2035
2036 if (frz1->frzflags < frz2->frzflags)
2037 return -1;
2038 else if (frz1->frzflags > frz2->frzflags)
2039 return 1;
2040
2041 /*
2042 * heap_log_freeze_eq would consider these tuple-wise plans to be equal.
2043 * (So the tuples will share a single canonical freeze plan.)
2044 *
2045 * We tiebreak on page offset number to keep each freeze plan's page
2046 * offset number array individually sorted. (Unnecessary, but be tidy.)
2047 */
2048 if (frz1->offset < frz2->offset)
2049 return -1;
2050 else if (frz1->offset > frz2->offset)
2051 return 1;
2052
2053 Assert(false);
2054 return 0;
2055}
2056
2057/*
2058 * Start a new plan, initialized using the tuple-level actions. At least one
2059 * tuple will have its freezing steps described by the caller's plan during REDO.
2060 */
2061static inline void
2062 heap_log_freeze_new_plan(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
2063 {
2064 plan->xmax = frz->xmax;
2065 plan->t_infomask2 = frz->t_infomask2;
2066 plan->t_infomask = frz->t_infomask;
2067 plan->frzflags = frz->frzflags;
2068 plan->ntuples = 1; /* for now */
2069}
2070
2071/*
2072 * Deduplicate tuple-based freeze plans so that each distinct set of
2073 * processing steps is only stored once in the WAL record.
2074 * Called during original execution of freezing (for logged relations).
2075 *
2076 * The return value is the number of plans set in *plans_out for the caller.
2077 * Also writes an array of offset numbers into the *offsets_out output argument
2078 * (logically there is one array per freeze plan, but that's not of immediate
2079 * concern to our caller).
2080 */
2081static int
2082 heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
2083 xlhp_freeze_plan *plans_out,
2084 OffsetNumber *offsets_out)
2085{
2086 int nplans = 0;
2087
2088 /* Sort tuple-based freeze plans in the order required to deduplicate */
2089 qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp);
2090
2091 for (int i = 0; i < ntuples; i++)
2092 {
2093 HeapTupleFreeze *frz = tuples + i;
2094
2095 if (i == 0)
2096 {
2097 /* New canonical freeze plan starting with first tup */
2098 heap_log_freeze_new_plan(plans_out, frz);
2099 nplans++;
2100 }
2101 else if (heap_log_freeze_eq(plans_out, frz))
2102 {
2103 /* tup matches open canonical plan -- include tup in it */
2104 Assert(offsets_out[i - 1] < frz->offset);
2105 plans_out->ntuples++;
2106 }
2107 else
2108 {
2109 /* Tup doesn't match current plan -- done with it now */
2110 plans_out++;
2111
2112 /* New canonical freeze plan starting with this tup */
2113 heap_log_freeze_new_plan(plans_out, frz);
2114 nplans++;
2115 }
2116
2117 /*
2118 * Save page offset number in dedicated buffer in passing.
2119 *
2120 * REDO routine relies on the record's offset numbers array grouping
2121 * offset numbers by freeze plan. The sort order within each grouping
2122 * is ascending offset number order, just to keep things tidy.
2123 */
2124 offsets_out[i] = frz->offset;
2125 }
2126
2127 Assert(nplans > 0 && nplans <= ntuples);
2128
2129 return nplans;
2130}
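/*
 * Worked example with made-up values, to illustrate the deduplication
 * above: suppose five tuples at offsets 2, 4, 5, 7 and 9 need freezing,
 * where offsets 2, 5 and 9 share one set of actions (identical xmax,
 * infomask bits and frzflags) and offsets 4 and 7 share another. Assuming
 * the first group's field values sort lower under heap_log_freeze_cmp(),
 * the loop emits just two plans:
 *
 *		plans_out:   planA (ntuples = 3), planB (ntuples = 2)
 *		offsets_out: 2, 5, 9, 4, 7
 *
 * i.e. one offset array grouped by plan, each group in ascending offset
 * order, instead of five full per-tuple freeze plans in the WAL record.
 */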
2131
2132/*
2133 * Write an XLOG_HEAP2_PRUNE* WAL record
2134 *
2135 * This is used for several different page maintenance operations:
2136 *
2137 * - Page pruning, in VACUUM's 1st pass or on access: Some items are
2138 * redirected, some marked dead, and some removed altogether.
2139 *
2140 * - Freezing: Items are marked as 'frozen'.
2141 *
2142 * - Vacuum, 2nd pass: Items that are already LP_DEAD are marked as unused.
2143 *
2144 * They have enough commonalities that we use a single WAL record for them
2145 * all.
2146 *
2147 * If replaying the record requires a cleanup lock, pass cleanup_lock = true.
2148 * Replaying 'redirected' or 'dead' items always requires a cleanup lock, but
2149 * replaying 'unused' items depends on whether they were all previously marked
2150 * as dead.
2151 *
2152 * If the VM is being updated, vmflags will contain the bits to set. In this
2153 * case, vmbuffer should already have been updated and marked dirty and should
2154 * still be pinned and locked.
2155 *
2156 * Note: This function scribbles on the 'frozen' array.
2157 *
2158 * Note: This is called in a critical section, so be careful what you do here.
2159 */
2160void
2161 log_heap_prune_and_freeze(Relation relation, Buffer buffer,
2162 Buffer vmbuffer, uint8 vmflags,
2163 TransactionId conflict_xid,
2164 bool cleanup_lock,
2165 PruneReason reason,
2166 HeapTupleFreeze *frozen, int nfrozen,
2167 OffsetNumber *redirected, int nredirected,
2168 OffsetNumber *dead, int ndead,
2169 OffsetNumber *unused, int nunused)
2170{
2171 xl_heap_prune xlrec;
2172 XLogRecPtr recptr;
2173 uint8 info;
2174 uint8 regbuf_flags_heap;
2175
2176 /* The following local variables hold data registered in the WAL record: */
2177 xlhp_freeze_plan plans[MaxHeapTuplesPerPage];
2178 xlhp_freeze_plans freeze_plans;
2179 xlhp_prune_items redirect_items;
2180 xlhp_prune_items dead_items;
2181 xlhp_prune_items unused_items;
2182 OffsetNumber frz_offsets[MaxHeapTuplesPerPage];
2183 bool do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
2184 bool do_set_vm = vmflags & VISIBILITYMAP_VALID_BITS;
2185
2186 Assert((vmflags & VISIBILITYMAP_VALID_BITS) == vmflags);
2187
2188 xlrec.flags = 0;
2189 regbuf_flags_heap = REGBUF_STANDARD;
2190
2191 /*
2192 * We can avoid an FPI of the heap page if the only modification we are
2193 * making to it is to set PD_ALL_VISIBLE and checksums/wal_log_hints are
2194 * disabled. Note that if we explicitly skip an FPI, we must not stamp the
2195 * heap page with this record's LSN. Recovery skips records <= the stamped
2196 * LSN, so this could lead to skipping an earlier FPI needed to repair a
2197 * torn page.
2198 */
2199 if (!do_prune &&
2200 nfrozen == 0 &&
2201 (!do_set_vm || !XLogHintBitIsNeeded()))
2202 regbuf_flags_heap |= REGBUF_NO_IMAGE;
2203
2204 /*
2205 * Prepare data for the buffer. The arrays are not actually in the
2206 * buffer, but we pretend that they are. When XLogInsert stores a full
2207 * page image, the arrays can be omitted.
2208 */
2210 XLogRegisterBuffer(0, buffer, regbuf_flags_heap);
2211
2212 if (do_set_vm)
2213 XLogRegisterBuffer(1, vmbuffer, 0);
2214
2215 if (nfrozen > 0)
2216 {
2217 int nplans;
2218
2219 xlrec.flags |= XLHP_HAS_FREEZE_PLANS;
2220
2221 /*
2222 * Prepare deduplicated representation for use in the WAL record. This
2223 * destructively sorts frozen tuples array in-place.
2224 */
2225 nplans = heap_log_freeze_plan(frozen, nfrozen, plans, frz_offsets);
2226
2227 freeze_plans.nplans = nplans;
2228 XLogRegisterBufData(0, &freeze_plans,
2229 offsetof(xlhp_freeze_plans, plans));
2230 XLogRegisterBufData(0, plans,
2231 sizeof(xlhp_freeze_plan) * nplans);
2232 }
2233 if (nredirected > 0)
2234 {
2235 xlrec.flags |= XLHP_HAS_REDIRECTIONS;
2236
2237 redirect_items.ntargets = nredirected;
2238 XLogRegisterBufData(0, &redirect_items,
2239 offsetof(xlhp_prune_items, data));
2240 XLogRegisterBufData(0, redirected,
2241 sizeof(OffsetNumber[2]) * nredirected);
2242 }
2243 if (ndead > 0)
2244 {
2245 xlrec.flags |= XLHP_HAS_DEAD_ITEMS;
2246
2247 dead_items.ntargets = ndead;
2248 XLogRegisterBufData(0, &dead_items,
2249 offsetof(xlhp_prune_items, data));
2250 XLogRegisterBufData(0, dead,
2251 sizeof(OffsetNumber) * ndead);
2252 }
2253 if (nunused > 0)
2254 {
2255 xlrec.flags |= XLHP_HAS_NOW_UNUSED_ITEMS;
2256
2257 unused_items.ntargets = nunused;
2258 XLogRegisterBufData(0, &unused_items,
2259 offsetof(xlhp_prune_items, data));
2260 XLogRegisterBufData(0, unused,
2261 sizeof(OffsetNumber) * nunused);
2262 }
2263 if (nfrozen > 0)
2264 XLogRegisterBufData(0, frz_offsets,
2265 sizeof(OffsetNumber) * nfrozen);
2266
2267 /*
2268 * Prepare the main xl_heap_prune record. We already set the XLHP_HAS_*
2269 * flag above.
2270 */
2271 if (vmflags & VISIBILITYMAP_ALL_VISIBLE)
2272 {
2273 xlrec.flags |= XLHP_VM_ALL_VISIBLE;
2274 if (vmflags & VISIBILITYMAP_ALL_FROZEN)
2275 xlrec.flags |= XLHP_VM_ALL_FROZEN;
2276 }
2277 if (RelationIsAccessibleInLogicalDecoding(relation))
2278 xlrec.flags |= XLHP_IS_CATALOG_REL;
2279 if (TransactionIdIsValid(conflict_xid))
2280 xlrec.flags |= XLHP_HAS_CONFLICT_HORIZON;
2281 if (cleanup_lock)
2282 xlrec.flags |= XLHP_CLEANUP_LOCK;
2283 else
2284 {
2285 Assert(nredirected == 0 && ndead == 0);
2286 /* also, any items in 'unused' must've been LP_DEAD previously */
2287 }
2288 XLogRegisterData(&xlrec, SizeOfHeapPrune);
2289 if (TransactionIdIsValid(conflict_xid))
2290 XLogRegisterData(&conflict_xid, sizeof(TransactionId));
2291
2292 switch (reason)
2293 {
2294 case PRUNE_ON_ACCESS:
2295 info = XLOG_HEAP2_PRUNE_ON_ACCESS;
2296 break;
2297 case PRUNE_VACUUM_SCAN:
2298 info = XLOG_HEAP2_PRUNE_VACUUM_SCAN;
2299 break;
2300 case PRUNE_VACUUM_CLEANUP:
2301 info = XLOG_HEAP2_PRUNE_VACUUM_CLEANUP;
2302 break;
2303 default:
2304 elog(ERROR, "unrecognized prune reason: %d", (int) reason);
2305 break;
2306 }
2307 recptr = XLogInsert(RM_HEAP2_ID, info);
2308
2309 if (do_set_vm)
2310 {
2311 Assert(BufferIsDirty(vmbuffer));
2312 PageSetLSN(BufferGetPage(vmbuffer), recptr);
2313 }
2314
2315 /*
2316 * See comment at the top of the function about regbuf_flags_heap for
2317 * details on when we can advance the page LSN.
2318 */
2319 if (do_prune || nfrozen > 0 || (do_set_vm && XLogHintBitIsNeeded()))
2320 {
2321 Assert(BufferIsDirty(buffer));
2322 PageSetLSN(BufferGetPage(buffer), recptr);
2323 }
2324}
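/*
 * Illustrative summary, not part of pruneheap.c: the data registered above
 * lands in the WAL record in this order (each piece present only when the
 * corresponding XLHP_* flag is set in xlrec.flags):
 *
 *		main data:	xl_heap_prune
 *					TransactionId conflict_xid	(XLHP_HAS_CONFLICT_HORIZON)
 *		buffer 0:	xlhp_freeze_plans + plans	(XLHP_HAS_FREEZE_PLANS)
 *					xlhp_prune_items + (from, to) pairs	(XLHP_HAS_REDIRECTIONS)
 *					xlhp_prune_items + offsets	(XLHP_HAS_DEAD_ITEMS)
 *					xlhp_prune_items + offsets	(XLHP_HAS_NOW_UNUSED_ITEMS)
 *					freeze plan offset arrays	(when nfrozen > 0)
 *		buffer 1:	the visibility map page		(when vmflags sets bits)
 *
 * As noted above, the buffer 0 payload can be omitted entirely when
 * XLogInsert() decides to store a full-page image instead.
 */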