PostgreSQL Source Code git master
tidbitmap.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * tidbitmap.c
4 * PostgreSQL tuple-id (TID) bitmap package
5 *
6 * This module provides bitmap data structures that are spiritually
7 * similar to Bitmapsets, but are specially adapted to store sets of
8 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 * Also, since we wish to be able to store very large tuple sets in
11 * memory with this data structure, we support "lossy" storage, in which
12 * we no longer remember individual tuple offsets on a page but only the
13 * fact that a particular page needs to be visited.
14 *
15 * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 * of memory. People pushing around tables of that size should have a
18 * couple of Mb to spare, so we don't worry about providing a second level
19 * of lossiness. In theory we could fall back to page ranges at some
20 * point, but for now that seems useless complexity.
21 *
22 * We also support the notion of candidate matches, or rechecking. This
23 * means we know that a search need visit only some tuples on a page,
24 * but we are not certain that all of those tuples are real matches.
25 * So the eventual heap scan must recheck the quals for these tuples only,
26 * rather than rechecking the quals for all tuples on the page as in the
27 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 * into a bitmap, and it can also happen internally when we AND a lossy
29 * and a non-lossy page.
30 *
31 *
32 * Copyright (c) 2003-2025, PostgreSQL Global Development Group
33 *
34 * IDENTIFICATION
35 * src/backend/nodes/tidbitmap.c
36 *
37 *-------------------------------------------------------------------------
38 */
39#include "postgres.h"
40
41#include <limits.h>
42
43#include "access/htup_details.h"
44#include "common/hashfn.h"
45#include "common/int.h"
46#include "nodes/bitmapset.h"
47#include "nodes/tidbitmap.h"
48#include "storage/lwlock.h"
49#include "utils/dsa.h"
50
51/*
52 * When we have to switch over to lossy storage, we use a data structure
53 * with one bit per page, where all pages having the same number DIV
54 * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
55 * and has the bit set for a given page, there must not be a per-page entry
56 * for that page in the page table.
57 *
58 * We actually store both exact pages and lossy chunks in the same hash
59 * table, using identical data structures. (This is because the memory
60 * management for hashtables doesn't easily/efficiently allow space to be
61 * transferred easily from one hashtable to another.) Therefore it's best
62 * if PAGES_PER_CHUNK is the same as TBM_MAX_TUPLES_PER_PAGE, or at least not
63 * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
64 * avoid expensive integer remainder operations. So, define it like this:
65 */
66#define PAGES_PER_CHUNK (BLCKSZ / 32)
67
68/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
69
70#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
71#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
72
73/* number of active words for an exact page: */
74#define WORDS_PER_PAGE ((TBM_MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
75/* number of active words for a lossy chunk: */
76#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
77
78/*
79 * The hashtable entries are represented by this data structure. For
80 * an exact page, blockno is the page number and bit k of the bitmap
81 * represents tuple offset k+1. For a lossy chunk, blockno is the first
82 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
83 * bit k represents page blockno+k. Note that it is not possible to
84 * have exact storage for the first page of a chunk if we are using
85 * lossy storage for any page in the chunk's range, since the same
86 * hashtable entry has to serve both purposes.
87 *
88 * recheck is used only on exact pages --- it indicates that although
89 * only the stated tuples need be checked, the full index qual condition
90 * must be checked for each (ie, these are candidate matches).
91 */
92typedef struct PagetableEntry
93{
94 BlockNumber blockno; /* page number (hashtable key) */
95 char status; /* hash entry status */
96 bool ischunk; /* T = lossy storage, F = exact */
97 bool recheck; /* should the tuples be rechecked? */
100
101/*
102 * Holds array of pagetable entries.
103 */
104typedef struct PTEntryArray
105{
106 pg_atomic_uint32 refcount; /* no. of iterator attached */
109
110/*
111 * We want to avoid the overhead of creating the hashtable, which is
112 * comparatively large, when not necessary. Particularly when we are using a
113 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
114 * long enough to accumulate one entry in such cases. We therefore avoid
115 * creating an actual hashtable until we need two pagetable entries. When
116 * just one pagetable entry is needed, we store it in a fixed field of
117 * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
118 * shrinks down to zero or one page again. So, status can be TBM_HASH even
119 * when nentries is zero or one.)
120 */
121typedef enum
122{
123 TBM_EMPTY, /* no hashtable, nentries == 0 */
124 TBM_ONE_PAGE, /* entry1 contains the single entry */
125 TBM_HASH, /* pagetable is valid, entry1 is not */
126} TBMStatus;
127
128/*
129 * Current iterating state of the TBM.
130 */
131typedef enum
132{
133 TBM_NOT_ITERATING, /* not yet converted to page and chunk array */
134 TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */
135 TBM_ITERATING_SHARED, /* converted to shared page and chunk array */
137
138/*
139 * Here is the representation for a whole TIDBitMap:
140 */
142{
143 NodeTag type; /* to make it a valid Node */
144 MemoryContext mcxt; /* memory context containing me */
145 TBMStatus status; /* see codes above */
146 struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
147 int nentries; /* number of entries in pagetable */
148 int maxentries; /* limit on same to meet maxbytes */
149 int npages; /* number of exact entries in pagetable */
150 int nchunks; /* number of lossy entries in pagetable */
151 TBMIteratingState iterating; /* tbm_begin_iterate called? */
152 uint32 lossify_start; /* offset to start lossifying hashtable at */
153 PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
154 /* these are valid when iterating is true: */
155 PagetableEntry **spages; /* sorted exact-page list, or NULL */
156 PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
157 dsa_pointer dsapagetable; /* dsa_pointer to the element array */
158 dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
159 dsa_pointer ptpages; /* dsa_pointer to the page array */
160 dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
161 dsa_area *dsa; /* reference to per-query dsa area */
162};
163
164/*
165 * When iterating over a backend-local bitmap in sorted order, a
166 * TBMPrivateIterator is used to track our progress. There can be several
167 * iterators scanning the same bitmap concurrently. Note that the bitmap
168 * becomes read-only as soon as any iterator is created.
169 */
171{
172 TIDBitmap *tbm; /* TIDBitmap we're iterating over */
173 int spageptr; /* next spages index */
174 int schunkptr; /* next schunks index */
175 int schunkbit; /* next bit to check in current schunk */
176};
177
178/*
179 * Holds the shared members of the iterator so that multiple processes
180 * can jointly iterate.
181 */
183{
184 int nentries; /* number of entries in pagetable */
185 int maxentries; /* limit on same to meet maxbytes */
186 int npages; /* number of exact entries in pagetable */
187 int nchunks; /* number of lossy entries in pagetable */
188 dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
189 dsa_pointer spages; /* dsa pointer to page array */
190 dsa_pointer schunks; /* dsa pointer to chunk array */
191 LWLock lock; /* lock to protect below members */
192 int spageptr; /* next spages index */
193 int schunkptr; /* next schunks index */
194 int schunkbit; /* next bit to check in current schunk */
196
197/*
198 * pagetable iteration array.
199 */
200typedef struct PTIterationArray
201{
202 pg_atomic_uint32 refcount; /* no. of iterator attached */
203 int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
205
206/*
207 * same as TBMPrivateIterator, but it is used for joint iteration, therefore
208 * this also holds a reference to the shared state.
209 */
211{
212 TBMSharedIteratorState *state; /* shared state */
213 PTEntryArray *ptbase; /* pagetable element array */
214 PTIterationArray *ptpages; /* sorted exact page index list */
215 PTIterationArray *ptchunks; /* sorted lossy page index list */
216};
217
218/* Local function prototypes */
219static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
220static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
221 const TIDBitmap *b);
222static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
223 BlockNumber pageno);
225static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
226static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
227static void tbm_lossify(TIDBitmap *tbm);
228static int tbm_comparator(const void *left, const void *right);
229static int tbm_shared_comparator(const void *left, const void *right,
230 void *arg);
231
232/* define hashtable mapping block numbers to PagetableEntry's */
233#define SH_USE_NONDEFAULT_ALLOCATOR
234#define SH_PREFIX pagetable
235#define SH_ELEMENT_TYPE PagetableEntry
236#define SH_KEY_TYPE BlockNumber
237#define SH_KEY blockno
238#define SH_HASH_KEY(tb, key) murmurhash32(key)
239#define SH_EQUAL(tb, a, b) a == b
240#define SH_SCOPE static inline
241#define SH_DEFINE
242#define SH_DECLARE
243#include "lib/simplehash.h"
244
245
246/*
247 * tbm_create - create an initially-empty bitmap
248 *
249 * The bitmap will live in the memory context that is CurrentMemoryContext
250 * at the time of this call. It will be limited to (approximately) maxbytes
251 * total memory consumption. If the DSA passed to this function is not NULL
252 * then the memory for storing elements of the underlying page table will
253 * be allocated from the DSA.
254 */
255TIDBitmap *
256tbm_create(Size maxbytes, dsa_area *dsa)
257{
258 TIDBitmap *tbm;
259
260 /* Create the TIDBitmap struct and zero all its fields */
261 tbm = makeNode(TIDBitmap);
262
264 tbm->status = TBM_EMPTY;
265
266 tbm->maxentries = tbm_calculate_entries(maxbytes);
267 tbm->lossify_start = 0;
268 tbm->dsa = dsa;
273
274 return tbm;
275}
276
277/*
278 * Actually create the hashtable. Since this is a moderately expensive
279 * proposition, we don't do it until we have to.
280 */
281static void
283{
284 Assert(tbm->status != TBM_HASH);
285 Assert(tbm->pagetable == NULL);
286
287 tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
288
289 /* If entry1 is valid, push it into the hashtable */
290 if (tbm->status == TBM_ONE_PAGE)
291 {
292 PagetableEntry *page;
293 bool found;
294 char oldstatus;
295
296 page = pagetable_insert(tbm->pagetable,
297 tbm->entry1.blockno,
298 &found);
299 Assert(!found);
300 oldstatus = page->status;
301 memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
302 page->status = oldstatus;
303 }
304
305 tbm->status = TBM_HASH;
306}
307
308/*
309 * tbm_free - free a TIDBitmap
310 */
311void
313{
314 if (tbm->pagetable)
315 pagetable_destroy(tbm->pagetable);
316 if (tbm->spages)
317 pfree(tbm->spages);
318 if (tbm->schunks)
319 pfree(tbm->schunks);
320 pfree(tbm);
321}
322
323/*
324 * tbm_free_shared_area - free shared state
325 *
326 * Free shared iterator state, Also free shared pagetable and iterator arrays
327 * memory if they are not referred by any of the shared iterator i.e recount
328 * is becomes 0.
329 */
330void
332{
333 TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
334 PTEntryArray *ptbase;
335 PTIterationArray *ptpages;
336 PTIterationArray *ptchunks;
337
338 if (DsaPointerIsValid(istate->pagetable))
339 {
340 ptbase = dsa_get_address(dsa, istate->pagetable);
341 if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
342 dsa_free(dsa, istate->pagetable);
343 }
344 if (DsaPointerIsValid(istate->spages))
345 {
346 ptpages = dsa_get_address(dsa, istate->spages);
347 if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
348 dsa_free(dsa, istate->spages);
349 }
350 if (DsaPointerIsValid(istate->schunks))
351 {
352 ptchunks = dsa_get_address(dsa, istate->schunks);
353 if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
354 dsa_free(dsa, istate->schunks);
355 }
356
357 dsa_free(dsa, dp);
358}
359
360/*
361 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
362 *
363 * If recheck is true, then the recheck flag will be set in the
364 * TBMIterateResult when any of these tuples are reported out.
365 */
366void
367tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids,
368 bool recheck)
369{
371 PagetableEntry *page = NULL; /* only valid when currblk is valid */
372
374 for (int i = 0; i < ntids; i++)
375 {
378 int wordnum,
379 bitnum;
380
381 /* safety check to ensure we don't overrun bit array bounds */
382 if (off < 1 || off > TBM_MAX_TUPLES_PER_PAGE)
383 elog(ERROR, "tuple offset out of range: %u", off);
384
385 /*
386 * Look up target page unless we already did. This saves cycles when
387 * the input includes consecutive tuples on the same page, which is
388 * common enough to justify an extra test here.
389 */
390 if (blk != currblk)
391 {
392 if (tbm_page_is_lossy(tbm, blk))
393 page = NULL; /* remember page is lossy */
394 else
395 page = tbm_get_pageentry(tbm, blk);
396 currblk = blk;
397 }
398
399 if (page == NULL)
400 continue; /* whole page is already marked */
401
402 if (page->ischunk)
403 {
404 /* The page is a lossy chunk header, set bit for itself */
405 wordnum = bitnum = 0;
406 }
407 else
408 {
409 /* Page is exact, so set bit for individual tuple */
410 wordnum = WORDNUM(off - 1);
411 bitnum = BITNUM(off - 1);
412 }
413 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
414 page->recheck |= recheck;
415
416 if (tbm->nentries > tbm->maxentries)
417 {
418 tbm_lossify(tbm);
419 /* Page could have been converted to lossy, so force new lookup */
420 currblk = InvalidBlockNumber;
421 }
422 }
423}
424
425/*
426 * tbm_add_page - add a whole page to a TIDBitmap
427 *
428 * This causes the whole page to be reported (with the recheck flag)
429 * when the TIDBitmap is scanned.
430 */
431void
433{
434 /* Enter the page in the bitmap, or mark it lossy if already present */
435 tbm_mark_page_lossy(tbm, pageno);
436 /* If we went over the memory limit, lossify some more pages */
437 if (tbm->nentries > tbm->maxentries)
438 tbm_lossify(tbm);
439}
440
441/*
442 * tbm_union - set union
443 *
444 * a is modified in-place, b is not changed
445 */
446void
448{
449 Assert(!a->iterating);
450 /* Nothing to do if b is empty */
451 if (b->nentries == 0)
452 return;
453 /* Scan through chunks and pages in b, merge into a */
454 if (b->status == TBM_ONE_PAGE)
455 tbm_union_page(a, &b->entry1);
456 else
457 {
458 pagetable_iterator i;
459 PagetableEntry *bpage;
460
461 Assert(b->status == TBM_HASH);
462 pagetable_start_iterate(b->pagetable, &i);
463 while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
464 tbm_union_page(a, bpage);
465 }
466}
467
468/* Process one page of b during a union op */
469static void
471{
472 PagetableEntry *apage;
473
474 if (bpage->ischunk)
475 {
476 /* Scan b's chunk, mark each indicated page lossy in a */
477 for (int wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
478 {
479 bitmapword w = bpage->words[wordnum];
480
481 if (w != 0)
482 {
483 BlockNumber pg;
484
485 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
486 while (w != 0)
487 {
488 if (w & 1)
490 pg++;
491 w >>= 1;
492 }
493 }
494 }
495 }
496 else if (tbm_page_is_lossy(a, bpage->blockno))
497 {
498 /* page is already lossy in a, nothing to do */
499 return;
500 }
501 else
502 {
503 apage = tbm_get_pageentry(a, bpage->blockno);
504 if (apage->ischunk)
505 {
506 /* The page is a lossy chunk header, set bit for itself */
507 apage->words[0] |= ((bitmapword) 1 << 0);
508 }
509 else
510 {
511 /* Both pages are exact, merge at the bit level */
512 for (int wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
513 apage->words[wordnum] |= bpage->words[wordnum];
514 apage->recheck |= bpage->recheck;
515 }
516 }
517
518 if (a->nentries > a->maxentries)
519 tbm_lossify(a);
520}
521
522/*
523 * tbm_intersect - set intersection
524 *
525 * a is modified in-place, b is not changed
526 */
527void
529{
530 Assert(!a->iterating);
531 /* Nothing to do if a is empty */
532 if (a->nentries == 0)
533 return;
534 /* Scan through chunks and pages in a, try to match to b */
535 if (a->status == TBM_ONE_PAGE)
536 {
537 if (tbm_intersect_page(a, &a->entry1, b))
538 {
539 /* Page is now empty, remove it from a */
540 Assert(!a->entry1.ischunk);
541 a->npages--;
542 a->nentries--;
543 Assert(a->nentries == 0);
544 a->status = TBM_EMPTY;
545 }
546 }
547 else
548 {
549 pagetable_iterator i;
550 PagetableEntry *apage;
551
552 Assert(a->status == TBM_HASH);
553 pagetable_start_iterate(a->pagetable, &i);
554 while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
555 {
556 if (tbm_intersect_page(a, apage, b))
557 {
558 /* Page or chunk is now empty, remove it from a */
559 if (apage->ischunk)
560 a->nchunks--;
561 else
562 a->npages--;
563 a->nentries--;
564 if (!pagetable_delete(a->pagetable, apage->blockno))
565 elog(ERROR, "hash table corrupted");
566 }
567 }
568 }
569}
570
571/*
572 * Process one page of a during an intersection op
573 *
574 * Returns true if apage is now empty and should be deleted from a
575 */
576static bool
578{
579 const PagetableEntry *bpage;
580
581 if (apage->ischunk)
582 {
583 /* Scan each bit in chunk, try to clear */
584 bool candelete = true;
585
586 for (int wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
587 {
588 bitmapword w = apage->words[wordnum];
589
590 if (w != 0)
591 {
592 bitmapword neww = w;
593 BlockNumber pg;
594 int bitnum;
595
596 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
597 bitnum = 0;
598 while (w != 0)
599 {
600 if (w & 1)
601 {
602 if (!tbm_page_is_lossy(b, pg) &&
603 tbm_find_pageentry(b, pg) == NULL)
604 {
605 /* Page is not in b at all, lose lossy bit */
606 neww &= ~((bitmapword) 1 << bitnum);
607 }
608 }
609 pg++;
610 bitnum++;
611 w >>= 1;
612 }
613 apage->words[wordnum] = neww;
614 if (neww != 0)
615 candelete = false;
616 }
617 }
618 return candelete;
619 }
620 else if (tbm_page_is_lossy(b, apage->blockno))
621 {
622 /*
623 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
624 * because the page 'b' is lossy, we don't know which ones. Therefore
625 * we mark 'a' as requiring rechecks, to indicate that at most those
626 * tuples set in 'a' are matches.
627 */
628 apage->recheck = true;
629 return false;
630 }
631 else
632 {
633 bool candelete = true;
634
635 bpage = tbm_find_pageentry(b, apage->blockno);
636 if (bpage != NULL)
637 {
638 /* Both pages are exact, merge at the bit level */
639 Assert(!bpage->ischunk);
640 for (int wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
641 {
642 apage->words[wordnum] &= bpage->words[wordnum];
643 if (apage->words[wordnum] != 0)
644 candelete = false;
645 }
646 apage->recheck |= bpage->recheck;
647 }
648 /* If there is no matching b page, we can just delete the a page */
649 return candelete;
650 }
651}
652
653/*
654 * tbm_is_empty - is a TIDBitmap completely empty?
655 */
656bool
658{
659 return (tbm->nentries == 0);
660}
661
662/*
663 * tbm_begin_private_iterate - prepare to iterate through a TIDBitmap
664 *
665 * The TBMPrivateIterator struct is created in the caller's memory context.
666 * For a clean shutdown of the iteration, call tbm_end_private_iterate; but
667 * it's okay to just allow the memory context to be released, too. It is
668 * caller's responsibility not to touch the TBMPrivateIterator anymore once
669 * the TIDBitmap is freed.
670 *
671 * NB: after this is called, it is no longer allowed to modify the contents
672 * of the bitmap. However, you can call this multiple times to scan the
673 * contents repeatedly, including parallel scans.
674 */
677{
678 TBMPrivateIterator *iterator;
679
681
682 /*
683 * Create the TBMPrivateIterator struct, with enough trailing space to
684 * serve the needs of the TBMIterateResult sub-struct.
685 */
686 iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator));
687 iterator->tbm = tbm;
688
689 /*
690 * Initialize iteration pointers.
691 */
692 iterator->spageptr = 0;
693 iterator->schunkptr = 0;
694 iterator->schunkbit = 0;
695
696 /*
697 * If we have a hashtable, create and fill the sorted page lists, unless
698 * we already did that for a previous iterator. Note that the lists are
699 * attached to the bitmap not the iterator, so they can be used by more
700 * than one iterator.
701 */
702 if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
703 {
704 pagetable_iterator i;
705 PagetableEntry *page;
706 int npages;
707 int nchunks;
708
709 if (!tbm->spages && tbm->npages > 0)
710 tbm->spages = (PagetableEntry **)
712 tbm->npages * sizeof(PagetableEntry *));
713 if (!tbm->schunks && tbm->nchunks > 0)
714 tbm->schunks = (PagetableEntry **)
716 tbm->nchunks * sizeof(PagetableEntry *));
717
718 npages = nchunks = 0;
719 pagetable_start_iterate(tbm->pagetable, &i);
720 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
721 {
722 if (page->ischunk)
723 tbm->schunks[nchunks++] = page;
724 else
725 tbm->spages[npages++] = page;
726 }
727 Assert(npages == tbm->npages);
728 Assert(nchunks == tbm->nchunks);
729 if (npages > 1)
730 qsort(tbm->spages, npages, sizeof(PagetableEntry *),
732 if (nchunks > 1)
733 qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
735 }
736
738
739 return iterator;
740}
741
742/*
743 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
744 *
745 * The necessary shared state will be allocated from the DSA passed to
746 * tbm_create, so that multiple processes can attach to it and iterate jointly.
747 *
748 * This will convert the pagetable hash into page and chunk array of the index
749 * into pagetable array.
750 */
753{
754 dsa_pointer dp;
756 PTEntryArray *ptbase = NULL;
757 PTIterationArray *ptpages = NULL;
758 PTIterationArray *ptchunks = NULL;
759
760 Assert(tbm->dsa != NULL);
762
763 /*
764 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
765 * lock, this will also be used by multiple worker for shared iterate.
766 */
767 dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
768 istate = dsa_get_address(tbm->dsa, dp);
769
770 /*
771 * If we're not already iterating, create and fill the sorted page lists.
772 * (If we are, the sorted page lists are already stored in the TIDBitmap,
773 * and we can just reuse them.)
774 */
775 if (tbm->iterating == TBM_NOT_ITERATING)
776 {
777 pagetable_iterator i;
778 PagetableEntry *page;
779 int idx;
780 int npages;
781 int nchunks;
782
783 /*
784 * Allocate the page and chunk array memory from the DSA to share
785 * across multiple processes.
786 */
787 if (tbm->npages)
788 {
789 tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
790 tbm->npages * sizeof(int));
791 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
792 pg_atomic_init_u32(&ptpages->refcount, 0);
793 }
794 if (tbm->nchunks)
795 {
796 tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
797 tbm->nchunks * sizeof(int));
798 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
799 pg_atomic_init_u32(&ptchunks->refcount, 0);
800 }
801
802 /*
803 * If TBM status is TBM_HASH then iterate over the pagetable and
804 * convert it to page and chunk arrays. But if it's in the
805 * TBM_ONE_PAGE mode then directly allocate the space for one entry
806 * from the DSA.
807 */
808 npages = nchunks = 0;
809 if (tbm->status == TBM_HASH)
810 {
811 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
812
813 pagetable_start_iterate(tbm->pagetable, &i);
814 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
815 {
816 idx = page - ptbase->ptentry;
817 if (page->ischunk)
818 ptchunks->index[nchunks++] = idx;
819 else
820 ptpages->index[npages++] = idx;
821 }
822
823 Assert(npages == tbm->npages);
824 Assert(nchunks == tbm->nchunks);
825 }
826 else if (tbm->status == TBM_ONE_PAGE)
827 {
828 /*
829 * In one page mode allocate the space for one pagetable entry,
830 * initialize it, and directly store its index (i.e. 0) in the
831 * page array.
832 */
833 tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
834 sizeof(PagetableEntry));
835 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
836 memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
837 ptpages->index[0] = 0;
838 }
839
840 if (ptbase != NULL)
841 pg_atomic_init_u32(&ptbase->refcount, 0);
842 if (npages > 1)
843 qsort_arg(ptpages->index, npages, sizeof(int),
845 if (nchunks > 1)
846 qsort_arg(ptchunks->index, nchunks, sizeof(int),
848 }
849
850 /*
851 * Store the TBM members in the shared state so that we can share them
852 * across multiple processes.
853 */
854 istate->nentries = tbm->nentries;
855 istate->maxentries = tbm->maxentries;
856 istate->npages = tbm->npages;
857 istate->nchunks = tbm->nchunks;
858 istate->pagetable = tbm->dsapagetable;
859 istate->spages = tbm->ptpages;
860 istate->schunks = tbm->ptchunks;
861
862 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
863 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
864 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
865
866 /*
867 * For every shared iterator referring to pagetable and iterator array,
868 * increase the refcount by 1 so that while freeing the shared iterator we
869 * don't free pagetable and iterator array until its refcount becomes 0.
870 */
871 if (ptbase != NULL)
873 if (ptpages != NULL)
874 pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
875 if (ptchunks != NULL)
876 pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
877
878 /* Initialize the iterator lock */
879 LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);
880
881 /* Initialize the shared iterator state */
882 istate->schunkbit = 0;
883 istate->schunkptr = 0;
884 istate->spageptr = 0;
885
887
888 return dp;
889}
890
891/*
892 * tbm_extract_page_tuple - extract the tuple offsets from a page
893 *
894 * Returns the number of offsets it filled in if <= max_offsets. Otherwise,
895 * fills in as many offsets as fit and returns the total number of offsets in
896 * the page.
897 */
898int
900 OffsetNumber *offsets,
901 uint32 max_offsets)
902{
903 PagetableEntry *page = iteritem->internal_page;
904 int ntuples = 0;
905
906 for (int wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
907 {
908 bitmapword w = page->words[wordnum];
909
910 if (w != 0)
911 {
912 int off = wordnum * BITS_PER_BITMAPWORD + 1;
913
914 while (w != 0)
915 {
916 if (w & 1)
917 {
918 if (ntuples < max_offsets)
919 offsets[ntuples] = (OffsetNumber) off;
920 ntuples++;
921 }
922 off++;
923 w >>= 1;
924 }
925 }
926 }
927
928 return ntuples;
929}
930
931/*
932 * tbm_advance_schunkbit - Advance the schunkbit
933 */
934static inline void
935tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
936{
937 int schunkbit = *schunkbitp;
938
939 while (schunkbit < PAGES_PER_CHUNK)
940 {
941 int wordnum = WORDNUM(schunkbit);
942 int bitnum = BITNUM(schunkbit);
943
944 if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
945 break;
946 schunkbit++;
947 }
948
949 *schunkbitp = schunkbit;
950}
951
952/*
953 * tbm_private_iterate - scan through next page of a TIDBitmap
954 *
955 * Caller must pass in a TBMIterateResult to be filled.
956 *
957 * Pages are guaranteed to be delivered in numerical order.
958 *
959 * Returns false when there are no more pages to scan and true otherwise. When
960 * there are no more pages to scan, tbmres->blockno is set to
961 * InvalidBlockNumber.
962 *
963 * If lossy is true, then the bitmap is "lossy" and failed to remember
964 * the exact tuples to look at on this page --- the caller must examine all
965 * tuples on the page and check if they meet the intended condition. If lossy
966 * is false, the caller must later extract the tuple offsets from the page
967 * pointed to by internal_page with tbm_extract_page_tuple.
968 *
969 * If tbmres->recheck is true, only the indicated tuples need be examined, but
970 * the condition must be rechecked anyway. (For ease of testing, recheck is
971 * always set true when lossy is true.)
972 */
973bool
975{
976 TIDBitmap *tbm = iterator->tbm;
977
979
980 /*
981 * If lossy chunk pages remain, make sure we've advanced schunkptr/
982 * schunkbit to the next set bit.
983 */
984 while (iterator->schunkptr < tbm->nchunks)
985 {
986 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
987 int schunkbit = iterator->schunkbit;
988
989 tbm_advance_schunkbit(chunk, &schunkbit);
990 if (schunkbit < PAGES_PER_CHUNK)
991 {
992 iterator->schunkbit = schunkbit;
993 break;
994 }
995 /* advance to next chunk */
996 iterator->schunkptr++;
997 iterator->schunkbit = 0;
998 }
999
1000 /*
1001 * If both chunk and per-page data remain, must output the numerically
1002 * earlier page.
1003 */
1004 if (iterator->schunkptr < tbm->nchunks)
1005 {
1006 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1007 BlockNumber chunk_blockno;
1008
1009 chunk_blockno = chunk->blockno + iterator->schunkbit;
1010 if (iterator->spageptr >= tbm->npages ||
1011 chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1012 {
1013 /* Return a lossy page indicator from the chunk */
1014 tbmres->blockno = chunk_blockno;
1015 tbmres->lossy = true;
1016 tbmres->recheck = true;
1017 tbmres->internal_page = NULL;
1018 iterator->schunkbit++;
1019 return true;
1020 }
1021 }
1022
1023 if (iterator->spageptr < tbm->npages)
1024 {
1025 PagetableEntry *page;
1026
1027 /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
1028 if (tbm->status == TBM_ONE_PAGE)
1029 page = &tbm->entry1;
1030 else
1031 page = tbm->spages[iterator->spageptr];
1032
1033 tbmres->internal_page = page;
1034 tbmres->blockno = page->blockno;
1035 tbmres->lossy = false;
1036 tbmres->recheck = page->recheck;
1037 iterator->spageptr++;
1038 return true;
1039 }
1040
1041 /* Nothing more in the bitmap */
1042 tbmres->blockno = InvalidBlockNumber;
1043 return false;
1044}
1045
1046/*
1047 * tbm_shared_iterate - scan through next page of a TIDBitmap
1048 *
1049 * As above, but this will iterate using an iterator which is shared
1050 * across multiple processes. We need to acquire the iterator LWLock,
1051 * before accessing the shared members.
1052 */
1053bool
1055{
1056 TBMSharedIteratorState *istate = iterator->state;
1057 PagetableEntry *ptbase = NULL;
1058 int *idxpages = NULL;
1059 int *idxchunks = NULL;
1060
1061 if (iterator->ptbase != NULL)
1062 ptbase = iterator->ptbase->ptentry;
1063 if (iterator->ptpages != NULL)
1064 idxpages = iterator->ptpages->index;
1065 if (iterator->ptchunks != NULL)
1066 idxchunks = iterator->ptchunks->index;
1067
1068 /* Acquire the LWLock before accessing the shared members */
1069 LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1070
1071 /*
1072 * If lossy chunk pages remain, make sure we've advanced schunkptr/
1073 * schunkbit to the next set bit.
1074 */
1075 while (istate->schunkptr < istate->nchunks)
1076 {
1077 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1078 int schunkbit = istate->schunkbit;
1079
1080 tbm_advance_schunkbit(chunk, &schunkbit);
1081 if (schunkbit < PAGES_PER_CHUNK)
1082 {
1083 istate->schunkbit = schunkbit;
1084 break;
1085 }
1086 /* advance to next chunk */
1087 istate->schunkptr++;
1088 istate->schunkbit = 0;
1089 }
1090
1091 /*
1092 * If both chunk and per-page data remain, must output the numerically
1093 * earlier page.
1094 */
1095 if (istate->schunkptr < istate->nchunks)
1096 {
1097 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1098 BlockNumber chunk_blockno;
1099
1100 chunk_blockno = chunk->blockno + istate->schunkbit;
1101
1102 if (istate->spageptr >= istate->npages ||
1103 chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1104 {
1105 /* Return a lossy page indicator from the chunk */
1106 tbmres->blockno = chunk_blockno;
1107 tbmres->lossy = true;
1108 tbmres->recheck = true;
1109 tbmres->internal_page = NULL;
1110 istate->schunkbit++;
1111
1112 LWLockRelease(&istate->lock);
1113 return true;
1114 }
1115 }
1116
1117 if (istate->spageptr < istate->npages)
1118 {
1119 PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1120
1121 tbmres->internal_page = page;
1122 tbmres->blockno = page->blockno;
1123 tbmres->lossy = false;
1124 tbmres->recheck = page->recheck;
1125 istate->spageptr++;
1126
1127 LWLockRelease(&istate->lock);
1128
1129 return true;
1130 }
1131
1132 LWLockRelease(&istate->lock);
1133
1134 /* Nothing more in the bitmap */
1135 tbmres->blockno = InvalidBlockNumber;
1136 return false;
1137}
1138
1139/*
1140 * tbm_end_private_iterate - finish an iteration over a TIDBitmap
1141 *
1142 * Currently this is just a pfree, but it might do more someday. (For
1143 * instance, it could be useful to count open iterators and allow the
1144 * bitmap to return to read/write status when there are no more iterators.)
1145 */
1146void
1148{
1149 pfree(iterator);
1150}
1151
1152/*
1153 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1154 *
1155 * This doesn't free any of the shared state associated with the iterator,
1156 * just our backend-private state.
1157 */
1158void
1160{
1161 pfree(iterator);
1162}
1163
1164/*
1165 * tbm_find_pageentry - find a PagetableEntry for the pageno
1166 *
1167 * Returns NULL if there is no non-lossy entry for the pageno.
1168 */
1169static const PagetableEntry *
1171{
1172 const PagetableEntry *page;
1173
1174 if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1175 return NULL;
1176
1177 if (tbm->status == TBM_ONE_PAGE)
1178 {
1179 page = &tbm->entry1;
1180 if (page->blockno != pageno)
1181 return NULL;
1182 Assert(!page->ischunk);
1183 return page;
1184 }
1185
1186 page = pagetable_lookup(tbm->pagetable, pageno);
1187 if (page == NULL)
1188 return NULL;
1189 if (page->ischunk)
1190 return NULL; /* don't want a lossy chunk header */
1191 return page;
1192}
1193
1194/*
1195 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1196 *
1197 * If new, the entry is marked as an exact (non-chunk) entry.
1198 *
1199 * This may cause the table to exceed the desired memory size. It is
1200 * up to the caller to call tbm_lossify() at the next safe point if so.
1201 */
1202static PagetableEntry *
1204{
1205 PagetableEntry *page;
1206 bool found;
1207
1208 if (tbm->status == TBM_EMPTY)
1209 {
1210 /* Use the fixed slot */
1211 page = &tbm->entry1;
1212 found = false;
1213 tbm->status = TBM_ONE_PAGE;
1214 }
1215 else
1216 {
1217 if (tbm->status == TBM_ONE_PAGE)
1218 {
1219 page = &tbm->entry1;
1220 if (page->blockno == pageno)
1221 return page;
1222 /* Time to switch from one page to a hashtable */
1224 }
1225
1226 /* Look up or create an entry */
1227 page = pagetable_insert(tbm->pagetable, pageno, &found);
1228 }
1229
1230 /* Initialize it if not present before */
1231 if (!found)
1232 {
1233 char oldstatus = page->status;
1234
1235 MemSet(page, 0, sizeof(PagetableEntry));
1236 page->status = oldstatus;
1237 page->blockno = pageno;
1238 /* must count it too */
1239 tbm->nentries++;
1240 tbm->npages++;
1241 }
1242
1243 return page;
1244}
1245
1246/*
1247 * tbm_page_is_lossy - is the page marked as lossily stored?
1248 */
1249static bool
1251{
1252 PagetableEntry *page;
1253 BlockNumber chunk_pageno;
1254 int bitno;
1255
1256 /* we can skip the lookup if there are no lossy chunks */
1257 if (tbm->nchunks == 0)
1258 return false;
1259 Assert(tbm->status == TBM_HASH);
1260
1261 bitno = pageno % PAGES_PER_CHUNK;
1262 chunk_pageno = pageno - bitno;
1263
1264 page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1265
1266 if (page != NULL && page->ischunk)
1267 {
1268 int wordnum = WORDNUM(bitno);
1269 int bitnum = BITNUM(bitno);
1270
1271 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1272 return true;
1273 }
1274 return false;
1275}
1276
1277/*
1278 * tbm_mark_page_lossy - mark the page number as lossily stored
1279 *
1280 * This may cause the table to exceed the desired memory size. It is
1281 * up to the caller to call tbm_lossify() at the next safe point if so.
1282 */
1283static void
1285{
1286 PagetableEntry *page;
1287 bool found;
1288 BlockNumber chunk_pageno;
1289 int bitno;
1290 int wordnum;
1291 int bitnum;
1292
1293 /* We force the bitmap into hashtable mode whenever it's lossy */
1294 if (tbm->status != TBM_HASH)
1296
1297 bitno = pageno % PAGES_PER_CHUNK;
1298 chunk_pageno = pageno - bitno;
1299
1300 /*
1301 * Remove any extant non-lossy entry for the page. If the page is its own
1302 * chunk header, however, we skip this and handle the case below.
1303 */
1304 if (bitno != 0)
1305 {
1306 if (pagetable_delete(tbm->pagetable, pageno))
1307 {
1308 /* It was present, so adjust counts */
1309 tbm->nentries--;
1310 tbm->npages--; /* assume it must have been non-lossy */
1311 }
1312 }
1313
1314 /* Look up or create entry for chunk-header page */
1315 page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1316
1317 /* Initialize it if not present before */
1318 if (!found)
1319 {
1320 char oldstatus = page->status;
1321
1322 MemSet(page, 0, sizeof(PagetableEntry));
1323 page->status = oldstatus;
1324 page->blockno = chunk_pageno;
1325 page->ischunk = true;
1326 /* must count it too */
1327 tbm->nentries++;
1328 tbm->nchunks++;
1329 }
1330 else if (!page->ischunk)
1331 {
1332 char oldstatus = page->status;
1333
1334 /* chunk header page was formerly non-lossy, make it lossy */
1335 MemSet(page, 0, sizeof(PagetableEntry));
1336 page->status = oldstatus;
1337 page->blockno = chunk_pageno;
1338 page->ischunk = true;
1339 /* we assume it had some tuple bit(s) set, so mark it lossy */
1340 page->words[0] = ((bitmapword) 1 << 0);
1341 /* adjust counts */
1342 tbm->nchunks++;
1343 tbm->npages--;
1344 }
1345
1346 /* Now set the original target page's bit */
1347 wordnum = WORDNUM(bitno);
1348 bitnum = BITNUM(bitno);
1349 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1350}
1351
1352/*
1353 * tbm_lossify - lose some information to get back under the memory limit
1354 */
1355static void
1357{
1358 pagetable_iterator i;
1359 PagetableEntry *page;
1360
1361 /*
1362 * XXX Really stupid implementation: this just lossifies pages in
1363 * essentially random order. We should be paying some attention to the
1364 * number of bits set in each page, instead.
1365 *
1366 * Since we are called as soon as nentries exceeds maxentries, we should
1367 * push nentries down to significantly less than maxentries, or else we'll
1368 * just end up doing this again very soon. We shoot for maxentries/2.
1369 */
1371 Assert(tbm->status == TBM_HASH);
1372
1373 pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1374 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1375 {
1376 if (page->ischunk)
1377 continue; /* already a chunk header */
1378
1379 /*
1380 * If the page would become a chunk header, we won't save anything by
1381 * converting it to lossy, so skip it.
1382 */
1383 if ((page->blockno % PAGES_PER_CHUNK) == 0)
1384 continue;
1385
1386 /* This does the dirty work ... */
1387 tbm_mark_page_lossy(tbm, page->blockno);
1388
1389 if (tbm->nentries <= tbm->maxentries / 2)
1390 {
1391 /*
1392 * We have made enough room. Remember where to start lossifying
1393 * next round, so we evenly iterate over the hashtable.
1394 */
1395 tbm->lossify_start = i.cur;
1396 break;
1397 }
1398
1399 /*
1400 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1401 * hashtable and may have deleted the non-lossy chunk. We can
1402 * continue the same hash table scan, since failure to visit one
1403 * element or visiting the newly inserted element, isn't fatal.
1404 */
1405 }
1406
1407 /*
1408 * With a big bitmap and small work_mem, it's possible that we cannot get
1409 * under maxentries. Again, if that happens, we'd end up uselessly
1410 * calling tbm_lossify over and over. To prevent this from becoming a
1411 * performance sink, force maxentries up to at least double the current
1412 * number of entries. (In essence, we're admitting inability to fit
1413 * within work_mem when we do this.) Note that this test will not fire if
1414 * we broke out of the loop early; and if we didn't, the current number of
1415 * entries is simply not reducible any further.
1416 */
1417 if (tbm->nentries > tbm->maxentries / 2)
1418 tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1419}
1420
1421/*
1422 * qsort comparator to handle PagetableEntry pointers.
1423 */
1424static int
1425tbm_comparator(const void *left, const void *right)
1426{
1427 BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1428 BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1429
1430 return pg_cmp_u32(l, r);
1431}
1432
1433/*
1434 * As above, but this will get index into PagetableEntry array. Therefore,
1435 * it needs to get actual PagetableEntry using the index before comparing the
1436 * blockno.
1437 */
1438static int
1439tbm_shared_comparator(const void *left, const void *right, void *arg)
1440{
1441 PagetableEntry *base = (PagetableEntry *) arg;
1442 PagetableEntry *lpage = &base[*(int *) left];
1443 PagetableEntry *rpage = &base[*(int *) right];
1444
1445 if (lpage->blockno < rpage->blockno)
1446 return -1;
1447 else if (lpage->blockno > rpage->blockno)
1448 return 1;
1449 return 0;
1450}
1451
1452/*
1453 * tbm_attach_shared_iterate
1454 *
1455 * Allocate a backend-private iterator and attach the shared iterator state
1456 * to it so that multiple processed can iterate jointly.
1457 *
1458 * We also converts the DSA pointers to local pointers and store them into
1459 * our private iterator.
1460 */
1463{
1464 TBMSharedIterator *iterator;
1465 TBMSharedIteratorState *istate;
1466
1467 /*
1468 * Create the TBMSharedIterator struct, with enough trailing space to
1469 * serve the needs of the TBMIterateResult sub-struct.
1470 */
1471 iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator));
1472
1473 istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1474
1475 iterator->state = istate;
1476
1477 iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1478
1479 if (istate->npages)
1480 iterator->ptpages = dsa_get_address(dsa, istate->spages);
1481 if (istate->nchunks)
1482 iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1483
1484 return iterator;
1485}
1486
1487/*
1488 * pagetable_allocate
1489 *
1490 * Callback function for allocating the memory for hashtable elements.
1491 * Allocate memory for hashtable elements, using DSA if available.
1492 */
1493static inline void *
1494pagetable_allocate(pagetable_hash *pagetable, Size size)
1495{
1496 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1497 PTEntryArray *ptbase;
1498
1499 if (tbm->dsa == NULL)
1500 return MemoryContextAllocExtended(pagetable->ctx, size,
1502
1503 /*
1504 * Save the dsapagetable reference in dsapagetableold before allocating
1505 * new memory so that pagetable_free can free the old entry.
1506 */
1507 tbm->dsapagetableold = tbm->dsapagetable;
1509 sizeof(PTEntryArray) + size,
1511 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1512
1513 return ptbase->ptentry;
1514}
1515
1516/*
1517 * pagetable_free
1518 *
1519 * Callback function for freeing hash table elements.
1520 */
1521static inline void
1522pagetable_free(pagetable_hash *pagetable, void *pointer)
1523{
1524 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1525
1526 /* pfree the input pointer if DSA is not available */
1527 if (tbm->dsa == NULL)
1528 pfree(pointer);
1529 else if (DsaPointerIsValid(tbm->dsapagetableold))
1530 {
1531 dsa_free(tbm->dsa, tbm->dsapagetableold);
1533 }
1534}
1535
1536/*
1537 * tbm_calculate_entries
1538 *
1539 * Estimate number of hashtable entries we can have within maxbytes.
1540 */
1541int
1543{
1544 Size nbuckets;
1545
1546 /*
1547 * Estimate number of hashtable entries we can have within maxbytes. This
1548 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1549 * for our purpose. Also count an extra Pointer per entry for the arrays
1550 * created during iteration readout.
1551 */
1552 nbuckets = maxbytes /
1553 (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1554 nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1555 nbuckets = Max(nbuckets, 16); /* sanity limit */
1556
1557 return (int) nbuckets;
1558}
1559
1560/*
1561 * Create a shared or private bitmap iterator and start iteration.
1562 *
1563 * `tbm` is only used to create the private iterator and dsa and dsp are only
1564 * used to create the shared iterator.
1565 *
1566 * Before invoking tbm_begin_iterate() to create a shared iterator, one
1567 * process must already have invoked tbm_prepare_shared_iterate() to create
1568 * and set up the TBMSharedIteratorState.
1569 */
1572{
1573 TBMIterator iterator = {0};
1574
1575 /* Allocate a private iterator and attach the shared state to it */
1576 if (DsaPointerIsValid(dsp))
1577 {
1578 iterator.shared = true;
1579 iterator.i.shared_iterator = tbm_attach_shared_iterate(dsa, dsp);
1580 }
1581 else
1582 {
1583 iterator.shared = false;
1585 }
1586
1587 return iterator;
1588}
1589
1590/*
1591 * Clean up shared or private bitmap iterator.
1592 */
1593void
1595{
1596 Assert(iterator && !tbm_exhausted(iterator));
1597
1598 if (iterator->shared)
1600 else
1602
1603 *iterator = (TBMIterator)
1604 {
1605 0
1606 };
1607}
1608
1609/*
1610 * Populate the next TBMIterateResult using the shared or private bitmap
1611 * iterator. Returns false when there is nothing more to scan.
1612 */
1613bool
1615{
1616 Assert(iterator);
1617 Assert(tbmres);
1618
1619 if (iterator->shared)
1620 return tbm_shared_iterate(iterator->i.shared_iterator, tbmres);
1621 else
1622 return tbm_private_iterate(iterator->i.private_iterator, tbmres);
1623}
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:262
static uint32 pg_atomic_sub_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
Definition: atomics.h:437
static void pg_atomic_init_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:219
static uint32 pg_atomic_add_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
Definition: atomics.h:422
uint32 bitmapword
Definition: bitmapset.h:44
#define BITS_PER_BITMAPWORD
Definition: bitmapset.h:43
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define Min(x, y)
Definition: c.h:1008
#define Max(x, y)
Definition: c.h:1002
char * Pointer
Definition: c.h:534
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:475
uint32_t uint32
Definition: c.h:543
#define MemSet(start, val, len)
Definition: c.h:1024
size_t Size
Definition: c.h:615
void * dsa_get_address(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:957
dsa_pointer dsa_allocate_extended(dsa_area *area, size_t size, int flags)
Definition: dsa.c:686
void dsa_free(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:841
#define dsa_allocate0(area, size)
Definition: dsa.h:113
uint64 dsa_pointer
Definition: dsa.h:62
#define dsa_allocate(area, size)
Definition: dsa.h:109
#define InvalidDsaPointer
Definition: dsa.h:78
#define DsaPointerIsValid(x)
Definition: dsa.h:106
#define DSA_ALLOC_HUGE
Definition: dsa.h:73
#define DSA_ALLOC_ZERO
Definition: dsa.h:75
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:30
#define MCXT_ALLOC_HUGE
Definition: fe_memutils.h:28
Assert(PointerIsAligned(start, uint64))
static int pg_cmp_u32(uint32 a, uint32 b)
Definition: int.h:719
int b
Definition: isn.c:74
int a
Definition: isn.c:73
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:698
@ LW_EXCLUSIVE
Definition: lwlock.h:112
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc0(Size size)
Definition: mcxt.c:1395
void * palloc(Size size)
Definition: mcxt.c:1365
MemoryContext CurrentMemoryContext
Definition: mcxt.c:160
void * MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
Definition: mcxt.c:1286
NodeTag
Definition: nodes.h:27
#define makeNode(_type_)
Definition: nodes.h:161
uint16 OffsetNumber
Definition: off.h:24
void * arg
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
#define qsort(a, b, c, d)
Definition: port.h:500
Definition: lwlock.h:42
pg_atomic_uint32 refcount
Definition: tidbitmap.c:106
PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:107
pg_atomic_uint32 refcount
Definition: tidbitmap.c:202
int index[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:203
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]
Definition: tidbitmap.c:98
BlockNumber blockno
Definition: tidbitmap.c:94
void * internal_page
Definition: tidbitmap.h:78
BlockNumber blockno
Definition: tidbitmap.h:63
TBMSharedIterator * shared_iterator
Definition: tidbitmap.h:56
union TBMIterator::@111 i
TBMPrivateIterator * private_iterator
Definition: tidbitmap.h:55
bool shared
Definition: tidbitmap.h:52
TIDBitmap * tbm
Definition: tidbitmap.c:172
dsa_pointer pagetable
Definition: tidbitmap.c:188
dsa_pointer spages
Definition: tidbitmap.c:189
dsa_pointer schunks
Definition: tidbitmap.c:190
TBMSharedIteratorState * state
Definition: tidbitmap.c:212
PTEntryArray * ptbase
Definition: tidbitmap.c:213
PTIterationArray * ptchunks
Definition: tidbitmap.c:215
PTIterationArray * ptpages
Definition: tidbitmap.c:214
dsa_pointer ptpages
Definition: tidbitmap.c:159
TBMIteratingState iterating
Definition: tidbitmap.c:151
int nentries
Definition: tidbitmap.c:147
dsa_pointer dsapagetableold
Definition: tidbitmap.c:158
struct pagetable_hash * pagetable
Definition: tidbitmap.c:146
PagetableEntry ** schunks
Definition: tidbitmap.c:156
MemoryContext mcxt
Definition: tidbitmap.c:144
int npages
Definition: tidbitmap.c:149
int maxentries
Definition: tidbitmap.c:148
uint32 lossify_start
Definition: tidbitmap.c:152
int nchunks
Definition: tidbitmap.c:150
NodeTag type
Definition: tidbitmap.c:143
dsa_pointer ptchunks
Definition: tidbitmap.c:160
TBMStatus status
Definition: tidbitmap.c:145
PagetableEntry entry1
Definition: tidbitmap.c:153
dsa_area * dsa
Definition: tidbitmap.c:161
dsa_pointer dsapagetable
Definition: tidbitmap.c:157
PagetableEntry ** spages
Definition: tidbitmap.c:155
Definition: dsa.c:348
Definition: type.h:96
void tbm_free(TIDBitmap *tbm)
Definition: tidbitmap.c:312
struct PTIterationArray PTIterationArray
bool tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
Definition: tidbitmap.c:1614
bool tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres)
Definition: tidbitmap.c:1054
#define WORDNUM(x)
Definition: tidbitmap.c:70
static void tbm_lossify(TIDBitmap *tbm)
Definition: tidbitmap.c:1356
bool tbm_is_empty(const TIDBitmap *tbm)
Definition: tidbitmap.c:657
void tbm_end_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:1594
#define WORDS_PER_PAGE
Definition: tidbitmap.c:74
TBMIteratingState
Definition: tidbitmap.c:132
@ TBM_ITERATING_SHARED
Definition: tidbitmap.c:135
@ TBM_NOT_ITERATING
Definition: tidbitmap.c:133
@ TBM_ITERATING_PRIVATE
Definition: tidbitmap.c:134
void tbm_end_shared_iterate(TBMSharedIterator *iterator)
Definition: tidbitmap.c:1159
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1250
static void * pagetable_allocate(pagetable_hash *pagetable, Size size)
Definition: tidbitmap.c:1494
dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:752
void tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:528
void tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:331
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1284
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
Definition: tidbitmap.c:470
#define BITNUM(x)
Definition: tidbitmap.c:71
static const PagetableEntry * tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1170
bool tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres)
Definition: tidbitmap.c:974
void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:432
static void tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
Definition: tidbitmap.c:935
TBMStatus
Definition: tidbitmap.c:122
@ TBM_EMPTY
Definition: tidbitmap.c:123
@ TBM_ONE_PAGE
Definition: tidbitmap.c:124
@ TBM_HASH
Definition: tidbitmap.c:125
TBMSharedIterator * tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:1462
#define PAGES_PER_CHUNK
Definition: tidbitmap.c:66
TBMIterator tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp)
Definition: tidbitmap.c:1571
static int tbm_shared_comparator(const void *left, const void *right, void *arg)
Definition: tidbitmap.c:1439
static int tbm_comparator(const void *left, const void *right)
Definition: tidbitmap.c:1425
static void tbm_create_pagetable(TIDBitmap *tbm)
Definition: tidbitmap.c:282
int tbm_extract_page_tuple(TBMIterateResult *iteritem, OffsetNumber *offsets, uint32 max_offsets)
Definition: tidbitmap.c:899
#define WORDS_PER_CHUNK
Definition: tidbitmap.c:76
void tbm_union(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:447
void tbm_end_private_iterate(TBMPrivateIterator *iterator)
Definition: tidbitmap.c:1147
TIDBitmap * tbm_create(Size maxbytes, dsa_area *dsa)
Definition: tidbitmap.c:256
struct PagetableEntry PagetableEntry
TBMPrivateIterator * tbm_begin_private_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:676
struct PTEntryArray PTEntryArray
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck)
Definition: tidbitmap.c:367
int tbm_calculate_entries(Size maxbytes)
Definition: tidbitmap.c:1542
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
Definition: tidbitmap.c:577
struct TBMSharedIteratorState TBMSharedIteratorState
static void pagetable_free(pagetable_hash *pagetable, void *pointer)
Definition: tidbitmap.c:1522
static PagetableEntry * tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1203
struct TBMIterator TBMIterator
#define TBM_MAX_TUPLES_PER_PAGE
Definition: tidbitmap.h:34
static bool tbm_exhausted(TBMIterator *iterator)
Definition: tidbitmap.h:118