@@ -93,8 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
9393static void btvacuumscan (IndexVacuumInfo * info , IndexBulkDeleteResult * stats ,
9494 IndexBulkDeleteCallback callback , void * callback_state ,
9595 BTCycleId cycleid );
96- static void btvacuumpage (BTVacState * vstate , BlockNumber blkno ,
97- BlockNumber orig_blkno );
96+ static void btvacuumpage (BTVacState * vstate , BlockNumber scanblkno );
9897static BTVacuumPosting btreevacuumposting (BTVacState * vstate ,
9998 IndexTuple posting ,
10099 OffsetNumber updatedoffset ,
@@ -959,7 +958,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
959958 Relation rel = info -> index ;
960959 BTVacState vstate ;
961960 BlockNumber num_pages ;
962- BlockNumber blkno ;
961+ BlockNumber scanblkno ;
963962 bool needLock ;
964963
965964 /*
@@ -1009,7 +1008,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10091008 */
10101009 needLock = !RELATION_IS_LOCAL (rel );
10111010
1012- blkno = BTREE_METAPAGE + 1 ;
1011+ scanblkno = BTREE_METAPAGE + 1 ;
10131012 for (;;)
10141013 {
10151014 /* Get the current relation length */
@@ -1024,15 +1023,15 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10241023 num_pages );
10251024
10261025 /* Quit if we've scanned the whole relation */
1027- if (blkno >= num_pages )
1026+ if (scanblkno >= num_pages )
10281027 break ;
10291028 /* Iterate over pages, then loop back to recheck length */
1030- for (; blkno < num_pages ; blkno ++ )
1029+ for (; scanblkno < num_pages ; scanblkno ++ )
10311030 {
1032- btvacuumpage (& vstate , blkno , blkno );
1031+ btvacuumpage (& vstate , scanblkno );
10331032 if (info -> report_progress )
10341033 pgstat_progress_update_param (PROGRESS_SCAN_BLOCKS_DONE ,
1035- blkno );
1034+ scanblkno );
10361035 }
10371036 }
10381037
@@ -1076,31 +1075,33 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10761075/*
10771076 * btvacuumpage --- VACUUM one page
10781077 *
1079- * This processes a single page for btvacuumscan(). In some cases we
1080- * must go back and re-examine previously-scanned pages; this routine
1081- * recurses when necessary to handle that case.
1082- *
1083- * blkno is the page to process. orig_blkno is the highest block number
1084- * reached by the outer btvacuumscan loop (the same as blkno, unless we
1085- * are recursing to re-examine a previous page).
1078+ * This processes a single page for btvacuumscan(). In some cases we must
1079+ * backtrack to re-examine and VACUUM pages that were the scanblkno during
1080+ * a previous call here. This is how we handle page splits (that happened
1081+ * after our cycleid was acquired) whose right half page happened to reuse
1082+ * a block that we might have processed at some point before it was
1083+ * recycled (i.e. before the page split).
10861084 */
10871085static void
1088- btvacuumpage (BTVacState * vstate , BlockNumber blkno , BlockNumber orig_blkno )
1086+ btvacuumpage (BTVacState * vstate , BlockNumber scanblkno )
10891087{
10901088 IndexVacuumInfo * info = vstate -> info ;
10911089 IndexBulkDeleteResult * stats = vstate -> stats ;
10921090 IndexBulkDeleteCallback callback = vstate -> callback ;
10931091 void * callback_state = vstate -> callback_state ;
10941092 Relation rel = info -> index ;
1095- bool delete_now ;
1096- BlockNumber recurse_to ;
1093+ bool attempt_pagedel ;
1094+ BlockNumber blkno , backtrack_to ;
10971095 Buffer buf ;
10981096 Page page ;
1099- BTPageOpaque opaque = NULL ;
1097+ BTPageOpaque opaque ;
1098+
1099+ blkno = scanblkno ;
1100+
1101+ backtrack :
11001102
1101- restart :
1102- delete_now = false;
1103- recurse_to = P_NONE ;
1103+ attempt_pagedel = false;
1104+ backtrack_to = P_NONE ;
11041105
11051106 /* call vacuum_delay_point while not holding any buffer lock */
11061107 vacuum_delay_point ();
@@ -1115,24 +1116,59 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
11151116 info -> strategy );
11161117 LockBuffer (buf , BT_READ );
11171118 page = BufferGetPage (buf );
1119+ opaque = NULL ;
11181120 if (!PageIsNew (page ))
11191121 {
11201122 _bt_checkpage (rel , buf );
11211123 opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
11221124 }
11231125
1124- /*
1125- * If we are recursing, the only case we want to do anything with is a
1126- * live leaf page having the current vacuum cycle ID. Any other state
1127- * implies we already saw the page (eg, deleted it as being empty).
1128- */
1129- if (blkno != orig_blkno )
1126+ Assert (blkno <= scanblkno );
1127+ if (blkno != scanblkno )
11301128 {
1131- if (_bt_page_recyclable (page ) ||
1132- P_IGNORE (opaque ) ||
1133- !P_ISLEAF (opaque ) ||
1134- opaque -> btpo_cycleid != vstate -> cycleid )
1129+ /*
1130+ * We're backtracking.
1131+ *
1132+ * We followed a right link to a sibling leaf page (a page that
1133+ * happens to be from a block located before scanblkno). The only
1134+ * case we want to do anything with is a live leaf page having the
1135+ * current vacuum cycle ID.
1136+ *
1137+ * The page had better be in a state that's consistent with what we
1138+ * expect. Check for conditions that imply corruption in passing. It
1139+ * can't be half-dead because only an interrupted VACUUM process can
1140+ * leave pages in that state, so we'd definitely have dealt with it
1141+ * back when the page was the scanblkno page (half-dead pages are
1142+ * always marked fully deleted by _bt_pagedel()). This assumes that
1143+ * there can be only one vacuum process running at a time.
1144+ */
1145+ if (!opaque || !P_ISLEAF (opaque ) || P_ISHALFDEAD (opaque ))
11351146 {
1147+ Assert (false);
1148+ ereport (LOG ,
1149+ (errcode (ERRCODE_INDEX_CORRUPTED ),
1150+ errmsg_internal ("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"" ,
1151+ blkno , scanblkno , RelationGetRelationName (rel ))));
1152+ _bt_relbuf (rel , buf );
1153+ return ;
1154+ }
1155+
1156+ /*
1157+ * We may have already processed the page in an earlier call, when the
1158+ * page was scanblkno. This happens when the leaf page split occurred
1159+ * after the scan began, but before the right sibling page became the
1160+ * scanblkno.
1161+ *
1162+ * Page may also have been deleted by current btvacuumpage() call,
1163+ * since _bt_pagedel() sometimes deletes the right sibling page of
1164+ * scanblkno in passing (it does so after we decided where to
1165+ * backtrack to). We don't need to process this page as a deleted
1166+ * page a second time now (in fact, it would be wrong to count it as a
1167+ * deleted page in the bulk delete statistics a second time).
1168+ */
1169+ if (opaque -> btpo_cycleid != vstate -> cycleid || P_ISDELETED (opaque ))
1170+ {
1171+ /* Done with current scanblkno (and all lower split pages) */
11361172 _bt_relbuf (rel , buf );
11371173 return ;
11381174 }
@@ -1165,7 +1201,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
11651201 * Half-dead leaf page. Try to delete now. Might update
11661202 * oldestBtpoXact and pages_deleted below.
11671203 */
1168- delete_now = true;
1204+ attempt_pagedel = true;
11691205 }
11701206 else if (P_ISLEAF (opaque ))
11711207 {
@@ -1189,18 +1225,20 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
11891225 LockBufferForCleanup (buf );
11901226
11911227 /*
1192- * Check whether we need to recurse back to earlier pages. What we
1193- * are concerned about is a page split that happened since we started
1194- * the vacuum scan. If the split moved some tuples to a lower page
1195- * then we might have missed 'em. If so, set up for tail recursion.
1196- * (Must do this before possibly clearing btpo_cycleid below!)
1228+ * Check whether we need to backtrack to earlier pages. What we are
1229+ * concerned about is a page split that happened since we started the
1230+ * vacuum scan. If the split moved tuples on the right half of the
1231+ * split (i.e. the tuples that sort high) to a block that we already
1232+ * passed over, then we might have missed the tuples. We need to
1233+ * backtrack now. (Must do this before possibly clearing btpo_cycleid
1234+ * or deleting scanblkno page below!)
11971235 */
11981236 if (vstate -> cycleid != 0 &&
11991237 opaque -> btpo_cycleid == vstate -> cycleid &&
12001238 !(opaque -> btpo_flags & BTP_SPLIT_END ) &&
12011239 !P_RIGHTMOST (opaque ) &&
1202- opaque -> btpo_next < orig_blkno )
1203- recurse_to = opaque -> btpo_next ;
1240+ opaque -> btpo_next < scanblkno )
1241+ backtrack_to = opaque -> btpo_next ;
12041242
12051243 /*
12061244 * When each VACUUM begins, it determines an OldestXmin cutoff value.
@@ -1311,7 +1349,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13111349 */
13121350 if (ndeletable > 0 || nupdatable > 0 )
13131351 {
1314- Assert (nhtidsdead >= Max ( ndeletable , 1 ) );
1352+ Assert (nhtidsdead >= ndeletable + nupdatable );
13151353 _bt_delitems_vacuum (rel , buf , deletable , ndeletable , updatable ,
13161354 nupdatable );
13171355
@@ -1347,19 +1385,19 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13471385 /*
13481386 * If the leaf page is now empty, try to delete it; else count the
13491387 * live tuples (live table TIDs in posting lists are counted as
1350- * separate live tuples). We don't delete when recursing , though, to
1351- * avoid putting entries into freePages out-of-order (doesn't seem
1352- * worth any extra code to handle the case ).
1388+ * separate live tuples). We don't delete when backtracking , though,
1389+ * since that would require teaching _bt_pagedel() about backtracking
1390+ * (doesn't seem worth adding more complexity to deal with that ).
13531391 */
13541392 if (minoff > maxoff )
1355- delete_now = (blkno == orig_blkno );
1393+ attempt_pagedel = (blkno == scanblkno );
13561394 else
13571395 stats -> num_index_tuples += nhtidslive ;
13581396
1359- Assert (!delete_now || nhtidslive == 0 );
1397+ Assert (!attempt_pagedel || nhtidslive == 0 );
13601398 }
13611399
1362- if (delete_now )
1400+ if (attempt_pagedel )
13631401 {
13641402 MemoryContext oldcontext ;
13651403
@@ -1372,6 +1410,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13721410 * any page that a future call here from btvacuumscan is expected to
13731411 * count. There will be no double-counting.
13741412 */
1413+ Assert (blkno == scanblkno );
13751414 stats -> pages_deleted += _bt_pagedel (rel , buf , & vstate -> oldestBtpoXact );
13761415
13771416 MemoryContextSwitchTo (oldcontext );
@@ -1380,18 +1419,10 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13801419 else
13811420 _bt_relbuf (rel , buf );
13821421
1383- /*
1384- * This is really tail recursion, but if the compiler is too stupid to
1385- * optimize it as such, we'd eat an uncomfortably large amount of stack
1386- * space per recursion level (due to the arrays used to track details of
1387- * deletable/updatable items). A failure is improbable since the number
1388- * of levels isn't likely to be large ... but just in case, let's
1389- * hand-optimize into a loop.
1390- */
1391- if (recurse_to != P_NONE )
1422+ if (backtrack_to != P_NONE )
13921423 {
1393- blkno = recurse_to ;
1394- goto restart ;
1424+ blkno = backtrack_to ;
1425+ goto backtrack ;
13951426 }
13961427}
13971428
0 commit comments