summaryrefslogtreecommitdiffstats
path: root/src/backend/access/brin/brin_bloom.c
blob: 1ed0d8a3b8045a512eba7ec2298c146f075d3d0e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
/*
 * brin_bloom.c
 *		Implementation of Bloom opclass for BRIN
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * A BRIN opclass summarizing page range into a bloom filter.
 *
 * Bloom filters allow efficient testing whether a given page range contains
 * a particular value. Therefore, if we summarize each page range into a small
 * bloom filter, we can easily (and cheaply) test whether it contains values
 * we get later.
 *
 * The index only supports equality operators, similarly to hash indexes.
 * Bloom indexes are however much smaller, and support only bitmap scans.
 *
 * Note: Don't confuse this with bloom indexes, implemented in a contrib
 * module. That extension implements an entirely new AM, building a bloom
 * filter on multiple columns in a single row. This opclass works with an
 * existing AM (BRIN) and builds bloom filter on a column.
 *
 *
 * values vs. hashes
 * -----------------
 *
 * The original column values are not used directly, but are first hashed
 * using the regular type-specific hash function, producing a uint32 hash.
 * And this hash value is then added to the summary - i.e. it's hashed
 * again and added to the bloom filter.
 *
 * This allows the code to treat all data types (byval/byref/...) the same
 * way, with only minimal space requirements, because we're working with
 * hashes and not the original values. Everything is uint32.
 *
 * Of course, this assumes the built-in hash function is reasonably good,
 * without too many collisions etc. But that does seem to be the case, at
 * least based on past experience. After all, the same hash functions are
 * used for hash indexes, hash partitioning and so on.
 *
 *
 * hashing scheme
 * --------------
 *
 * Bloom filters require a number of independent hash functions. There are
 * different schemes how to construct them - for example we might use
 * hash_uint32_extended with random seeds, but that seems fairly expensive.
 * We use a scheme requiring only two functions described in this paper:
 *
 * Less Hashing, Same Performance:Building a Better Bloom Filter
 * Adam Kirsch, Michael Mitzenmacher, Harvard School of Engineering and
 * Applied Sciences, Cambridge, Massachusetts [DOI 10.1002/rsa.20208]
 *
 * The two hash functions h1 and h2 are calculated using hard-coded seeds,
 * and then combined using (h1 + i * h2) to generate the hash functions.
 *
 *
 * sizing the bloom filter
 * -----------------------
 *
 * Size of a bloom filter depends on the number of distinct values we will
 * store in it, and the desired false positive rate. The higher the number
 * of distinct values and/or the lower the false positive rate, the larger
 * the bloom filter. On the other hand, we want to keep the index as small
 * as possible - that's one of the basic advantages of BRIN indexes.
 *
 * Although the number of distinct elements (in a page range) depends on
 * the data, we can consider it fixed. This simplifies the trade-off to
 * just false positive rate vs. size.
 *
 * At the page range level, false positive rate is a probability the bloom
 * filter matches a random value. For the whole index (with sufficiently
 * many page ranges) it represents the fraction of the index ranges (and
 * thus fraction of the table to be scanned) matching the random value.
 *
 * Furthermore, the size of the bloom filter is subject to implementation
 * limits - it has to fit onto a single index page (8kB by default). As
 * the bitmap is inherently random (when "full" about half the bits is set
 * to 1, randomly), compression can't help very much.
 *
 * To reduce the size of a filter (to fit to a page), we have to either
 * accept higher false positive rate (undesirable), or reduce the number
 * of distinct items to be stored in the filter. We can't alter the input
 * data, of course, but we may make the BRIN page ranges smaller - instead
 * of the default 128 pages (1MB) we may build index with 16-page ranges,
 * or something like that. This should reduce the number of distinct values
 * in the page range, making the filter smaller (with fixed false positive
 * rate). Even for random data sets this should help, as the number of rows
 * per heap page is limited (to ~290 with very narrow tables, likely ~20
 * in practice).
 *
 * Of course, good sizing decisions depend on having the necessary data,
 * i.e. number of distinct values in a page range (of a given size) and
 * table size (to estimate cost change due to change in false positive
 * rate due to having larger index vs. scanning larger indexes). We may
 * not have that data - for example when building an index on empty table
 * it's not really possible. And for some data we only have estimates for
 * the whole table and we can only estimate per-range values (ndistinct).
 *
 * Another challenge is that while the bloom filter is per-column, it's
 * the whole index tuple that has to fit into a page. And for multi-column
 * indexes that may include pieces we have no control over (not necessarily
 * bloom filters, the other columns may use other BRIN opclasses). So it's
 * not entirely clear how to distribute the space between those columns.
 *
 * The current logic, implemented in brin_bloom_get_ndistinct, attempts to
 * make some basic sizing decisions, based on the size of BRIN ranges, and
 * the maximum number of rows per range.
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/brin/brin_bloom.c
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/brin.h"
#include "access/brin_internal.h"
#include "access/brin_page.h"
#include "access/brin_tuple.h"
#include "access/hash.h"
#include "access/htup_details.h"
#include "access/reloptions.h"
#include "access/stratnum.h"
#include "catalog/pg_type.h"
#include "catalog/pg_amop.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"

#include <math.h>

#define BloomEqualStrategyNumber	1

/*
 * Additional SQL level support functions. We only have one, which is
 * used to calculate hash of the input value.
 *
 * Procedure numbers must not use values reserved for BRIN itself; see
 * brin_internal.h.
 */
#define		BLOOM_MAX_PROCNUMS		1	/* maximum support procs we need */
#define		PROCNUM_HASH			11	/* required */

/*
 * Subtract this from procnum to obtain index in BloomOpaque arrays
 * (Must be equal to minimum of private procnums).
 */
#define		PROCNUM_BASE			11

/*
 * Storage type for BRIN's reloptions.
 */
typedef struct BloomOptions
{
	int32		vl_len_;		/* varlena header (do not touch directly!) */
	double		nDistinctPerRange;	/* number of distinct values per range */
	double		falsePositiveRate;	/* false positive for bloom filter */
} BloomOptions;

/*
 * The current min value (16) is somewhat arbitrary, but it's based
 * on the fact that the filter header is ~20B alone, which is about
 * the same as the filter bitmap for 16 distinct items with 1% false
 * positive rate. So by allowing lower values we'd not gain much. In
 * any case, the min should not be larger than MaxHeapTuplesPerPage
 * (~290), which is the theoretical maximum for single-page ranges.
 */
#define		BLOOM_MIN_NDISTINCT_PER_RANGE		16

/*
 * Used to determine number of distinct items, based on the number of rows
 * in a page range. The 10% is somewhat similar to what estimate_num_groups
 * does, so we use the same factor here.
 */
#define		BLOOM_DEFAULT_NDISTINCT_PER_RANGE	-0.1	/* 10% of values */

/*
 * Allowed range and default value for the false positive range. The exact
 * values are somewhat arbitrary, but were chosen considering the various
 * parameters (size of filter vs. page size, etc.).
 *
 * The lower the false-positive rate, the more accurate the filter is, but
 * it also gets larger - at some point this eliminates the main advantage
 * of BRIN indexes, which is the tiny size. At 0.01% the index is about
 * 10% of the table (assuming 290 distinct values per 8kB page).
 *
 * On the other hand, as the false-positive rate increases, larger part of
 * the table has to be scanned due to mismatches - at 25% we're probably
 * close to sequential scan being cheaper.
 */
#define		BLOOM_MIN_FALSE_POSITIVE_RATE	0.0001	/* 0.01% fp rate */
#define		BLOOM_MAX_FALSE_POSITIVE_RATE	0.25	/* 25% fp rate */
#define		BLOOM_DEFAULT_FALSE_POSITIVE_RATE	0.01	/* 1% fp rate */

#define BloomGetNDistinctPerRange(opts) \
	((opts) && (((BloomOptions *) (opts))->nDistinctPerRange != 0) ? \
	 (((BloomOptions *) (opts))->nDistinctPerRange) : \
	 BLOOM_DEFAULT_NDISTINCT_PER_RANGE)

#define BloomGetFalsePositiveRate(opts) \
	((opts) && (((BloomOptions *) (opts))->falsePositiveRate != 0.0) ? \
	 (((BloomOptions *) (opts))->falsePositiveRate) : \
	 BLOOM_DEFAULT_FALSE_POSITIVE_RATE)

/*
 * And estimate of the largest bloom we can fit onto a page. This is not
 * a perfect guarantee, for a couple of reasons. For example, the row may
 * be larger because the index has multiple columns.
 */
#define BloomMaxFilterSize \
	MAXALIGN_DOWN(BLCKSZ - \
				  (MAXALIGN(SizeOfPageHeaderData + \
							sizeof(ItemIdData)) + \
				   MAXALIGN(sizeof(BrinSpecialSpace)) + \
				   SizeOfBrinTuple))

/*
 * Seeds used to calculate two hash functions h1 and h2, which are then used
 * to generate k hashes using the (h1 + i * h2) scheme.
 */
#define BLOOM_SEED_1	0x71d924af
#define BLOOM_SEED_2	0xba48b314

/*
 * Bloom Filter
 *
 * Represents a bloom filter, built on hashes of the indexed values. That is,
 * we compute a uint32 hash of the value, and then store this hash into the
 * bloom filter (and compute additional hashes on it).
 *
 * XXX We could implement "sparse" bloom filters, keeping only the bytes that
 * are not entirely 0. But while indexes don't support TOAST, the varlena can
 * still be compressed. So this seems unnecessary, because the compression
 * should do the same job.
 *
 * XXX We can also watch the number of bits set in the bloom filter, and then
 * stop using it (and not store the bitmap, to save space) when the false
 * positive rate gets too high. But even if the false positive rate exceeds the
 * desired value, it still can eliminate some page ranges.
 */
typedef struct BloomFilter
{
	/* varlena header (do not touch directly!) */
	int32		vl_len_;

	/* space for various flags (unused for now) */
	uint16		flags;

	/* fields for the HASHED phase */
	uint8		nhashes;		/* number of hash functions */
	uint32		nbits;			/* number of bits in the bitmap (size) */
	uint32		nbits_set;		/* number of bits set to 1 */

	/* data of the bloom filter */
	char		data[FLEXIBLE_ARRAY_MEMBER];
} BloomFilter;


/*
 * bloom_init
 * 		Initialize the Bloom Filter, allocate all the memory.
 *
 * The filter is initialized with optimal size for ndistinct expected values
 * and the requested false positive rate. The filter is stored as varlena.
 */
static BloomFilter *
bloom_init(int ndistinct, double false_positive_rate)
{
	Size		len;
	BloomFilter *filter;

	int			nbits;			/* size of filter / number of bits */
	int			nbytes;			/* size of filter / number of bytes */

	double		k;				/* number of hash functions */

	Assert(ndistinct > 0);
	Assert(false_positive_rate > 0 && false_positive_rate < 1);

	/* sizing bloom filter: -(n * ln(p)) / (ln(2))^2 */
	nbits = ceil(-(ndistinct * log(false_positive_rate)) / pow(log(2.0), 2));

	/* round m to whole bytes */
	nbytes = ((nbits + 7) / 8);
	nbits = nbytes * 8;

	/*
	 * Reject filters that are obviously too large to store on a page.
	 *
	 * Initially the bloom filter is just zeroes and so very compressible, but
	 * as we add values it gets more and more random, and so less and less
	 * compressible. So initially everything fits on the page, but we might
	 * get surprising failures later - we want to prevent that, so we reject
	 * bloom filter that are obviously too large.
	 *
	 * XXX It's not uncommon to oversize the bloom filter a bit, to defend
	 * against unexpected data anomalies (parts of table with more distinct
	 * values per range etc.). But we still need to make sure even the
	 * oversized filter fits on page, if such need arises.
	 *
	 * XXX This check is not perfect, because the index may have multiple
	 * filters that are small individually, but too large when combined.
	 */
	if (nbytes > BloomMaxFilterSize)
		elog(ERROR, "the bloom filter is too large (%d > %zu)", nbytes,
			 BloomMaxFilterSize);

	/*
	 * round(log(2.0) * m / ndistinct), but assume round() may not be
	 * available on Windows
	 */
	k = log(2.0) * nbits / ndistinct;
	k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k);

	/*
	 * We allocate the whole filter. Most of it is going to be 0 bits, so the
	 * varlena is easy to compress.
	 */
	len = offsetof(BloomFilter, data) + nbytes;

	filter = (BloomFilter *) palloc0(len);

	filter->flags = 0;
	filter->nhashes = (int) k;
	filter->nbits = nbits;

	SET_VARSIZE(filter, len);

	return filter;
}


/*
 * bloom_add_value
 * 		Add value to the bloom filter.
 */
static BloomFilter *
bloom_add_value(BloomFilter *filter, uint32 value, bool *updated)
{
	int			i;
	uint64		h1,
				h2;

	/* compute the hashes, used for the bloom filter */
	h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits;
	h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits;

	/* compute the requested number of hashes */
	for (i = 0; i < filter->nhashes; i++)
	{
		/* h1 + h2 + f(i) */
		uint32		h = (h1 + i * h2) % filter->nbits;
		uint32		byte = (h / 8);
		uint32		bit = (h % 8);

		/* if the bit is not set, set it and remember we did that */
		if (!(filter->data[byte] & (0x01 << bit)))
		{
			filter->data[byte] |= (0x01 << bit);
			filter->nbits_set++;
			if (updated)
				*updated = true;
		}
	}

	return filter;
}


/*
 * bloom_contains_value
 * 		Check if the bloom filter contains a particular value.
 */
static bool
bloom_contains_value(BloomFilter *filter, uint32 value)
{
	int			i;
	uint64		h1,
				h2;

	/* calculate the two hashes */
	h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits;
	h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits;

	/* compute the requested number of hashes */
	for (i = 0; i < filter->nhashes; i++)
	{
		/* h1 + h2 + f(i) */
		uint32		h = (h1 + i * h2) % filter->nbits;
		uint32		byte = (h / 8);
		uint32		bit = (h % 8);

		/* if the bit is not set, the value is not there */
		if (!(filter->data[byte] & (0x01 << bit)))
			return false;
	}

	/* all hashes found in bloom filter */
	return true;
}

typedef struct BloomOpaque
{
	/*
	 * XXX At this point we only need a single proc (to compute the hash), but
	 * let's keep the array just like inclusion and minmax opclasses, for
	 * consistency. We may need additional procs in the future.
	 */
	FmgrInfo	extra_procinfos[BLOOM_MAX_PROCNUMS];
	bool		extra_proc_missing[BLOOM_MAX_PROCNUMS];
} BloomOpaque;

static FmgrInfo *bloom_get_procinfo(BrinDesc *bdesc, uint16 attno,
									uint16 procnum);


Datum
brin_bloom_opcinfo(PG_FUNCTION_ARGS)
{
	BrinOpcInfo *result;

	/*
	 * opaque->strategy_procinfos is initialized lazily; here it is set to
	 * all-uninitialized by palloc0 which sets fn_oid to InvalidOid.
	 *
	 * bloom indexes only store the filter as a single BYTEA column
	 */

	result = palloc0(MAXALIGN(SizeofBrinOpcInfo(1)) +
					 sizeof(BloomOpaque));
	result->oi_nstored = 1;
	result->oi_regular_nulls = true;
	result->oi_opaque = (BloomOpaque *)
		MAXALIGN((char *) result + SizeofBrinOpcInfo(1));
	result->oi_typcache[0] = lookup_type_cache(PG_BRIN_BLOOM_SUMMARYOID, 0);

	PG_RETURN_POINTER(result);
}

/*
 * brin_bloom_get_ndistinct
 *		Determine the ndistinct value used to size bloom filter.
 *
 * Adjust the ndistinct value based on the pagesPerRange value. First,
 * if it's negative, it's assumed to be relative to maximum number of
 * tuples in the range (assuming each page gets MaxHeapTuplesPerPage
 * tuples, which is likely a significant over-estimate). We also clamp
 * the value, not to over-size the bloom filter unnecessarily.
 *
 * XXX We can only do this when the pagesPerRange value was supplied.
 * If it wasn't, it has to be a read-only access to the index, in which
 * case we don't really care. But perhaps we should fall-back to the
 * default pagesPerRange value?
 *
 * XXX We might also fetch info about ndistinct estimate for the column,
 * and compute the expected number of distinct values in a range. But
 * that may be tricky due to data being sorted in various ways, so it
 * seems better to rely on the upper estimate.
 *
 * XXX We might also calculate a better estimate of rows per BRIN range,
 * instead of using MaxHeapTuplesPerPage (which probably produces values
 * much higher than reality).
 */
static int
brin_bloom_get_ndistinct(BrinDesc *bdesc, BloomOptions *opts)
{
	double		ndistinct;
	double		maxtuples;
	BlockNumber pagesPerRange;

	pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index);
	ndistinct = BloomGetNDistinctPerRange(opts);

	Assert(BlockNumberIsValid(pagesPerRange));

	maxtuples = MaxHeapTuplesPerPage * pagesPerRange;

	/*
	 * Similarly to n_distinct, negative values are relative - in this case to
	 * maximum number of tuples in the page range (maxtuples).
	 */
	if (ndistinct < 0)
		ndistinct = (-ndistinct) * maxtuples;

	/*
	 * Positive values are to be used directly, but we still apply a couple of
	 * safeties to avoid using unreasonably small bloom filters.
	 */
	ndistinct = Max(ndistinct, BLOOM_MIN_NDISTINCT_PER_RANGE);

	/*
	 * And don't use more than the maximum possible number of tuples, in the
	 * range, which would be entirely wasteful.
	 */
	ndistinct = Min(ndistinct, maxtuples);

	return (int) ndistinct;
}

/*
 * Examine the given index tuple (which contains partial status of a certain
 * page range) by comparing it to the given value that comes from another heap
 * tuple.  If the new value is outside the bloom filter specified by the
 * existing tuple values, update the index tuple and return true.  Otherwise,
 * return false and do not modify in this case.
 */
Datum
brin_bloom_add_value(PG_FUNCTION_ARGS)
{
	BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
	BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
	Datum		newval = PG_GETARG_DATUM(2);
	bool		isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3);
	BloomOptions *opts = (BloomOptions *) PG_GET_OPCLASS_OPTIONS();
	Oid			colloid = PG_GET_COLLATION();
	FmgrInfo   *hashFn;
	uint32		hashValue;
	bool		updated = false;
	AttrNumber	attno;
	BloomFilter *filter;

	Assert(!isnull);

	attno = column->bv_attno;

	/*
	 * If this is the first non-null value, we need to initialize the bloom
	 * filter. Otherwise just extract the existing bloom filter from
	 * BrinValues.
	 */
	if (column->bv_allnulls)
	{
		filter = bloom_init(brin_bloom_get_ndistinct(bdesc, opts),
							BloomGetFalsePositiveRate(opts));
		column->bv_values[0] = PointerGetDatum(filter);
		column->bv_allnulls = false;
		updated = true;
	}
	else
		filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]);

	/*
	 * Compute the hash of the new value, using the supplied hash function,
	 * and then add the hash value to the bloom filter.
	 */
	hashFn = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH);

	hashValue = DatumGetUInt32(FunctionCall1Coll(hashFn, colloid, newval));

	filter = bloom_add_value(filter, hashValue, &updated);

	column->bv_values[0] = PointerGetDatum(filter);

	PG_RETURN_BOOL(updated);
}

/*
 * Given an index tuple corresponding to a certain page range and a scan key,
 * return whether the scan key is consistent with the index tuple's bloom
 * filter.  Return true if so, false otherwise.
 */
Datum
brin_bloom_consistent(PG_FUNCTION_ARGS)
{
	BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
	BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
	ScanKey    *keys = (ScanKey *) PG_GETARG_POINTER(2);
	int			nkeys = PG_GETARG_INT32(3);
	Oid			colloid = PG_GET_COLLATION();
	AttrNumber	attno;
	Datum		value;
	Datum		matches;
	FmgrInfo   *finfo;
	uint32		hashValue;
	BloomFilter *filter;
	int			keyno;

	filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]);

	Assert(filter);

	matches = true;

	for (keyno = 0; keyno < nkeys; keyno++)
	{
		ScanKey		key = keys[keyno];

		/* NULL keys are handled and filtered-out in bringetbitmap */
		Assert(!(key->sk_flags & SK_ISNULL));

		attno = key->sk_attno;
		value = key->sk_argument;

		switch (key->sk_strategy)
		{
			case BloomEqualStrategyNumber:

				/*
				 * In the equality case (WHERE col = someval), we want to
				 * return the current page range if the minimum value in the
				 * range <= scan key, and the maximum value >= scan key.
				 */
				finfo = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH);

				hashValue = DatumGetUInt32(FunctionCall1Coll(finfo, colloid, value));
				matches &= bloom_contains_value(filter, hashValue);

				break;
			default:
				/* shouldn't happen */
				elog(ERROR, "invalid strategy number %d", key->sk_strategy);
				matches = 0;
				break;
		}

		if (!matches)
			break;
	}

	PG_RETURN_DATUM(matches);
}

/*
 * Given two BrinValues, update the first of them as a union of the summary
 * values contained in both.  The second one is untouched.
 *
 * XXX We assume the bloom filters have the same parameters for now. In the
 * future we should have 'can union' function, to decide if we can combine
 * two particular bloom filters.
 */
Datum
brin_bloom_union(PG_FUNCTION_ARGS)
{
	int			i;
	int			nbytes;
	BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
	BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
	BloomFilter *filter_a;
	BloomFilter *filter_b;

	Assert(col_a->bv_attno == col_b->bv_attno);
	Assert(!col_a->bv_allnulls && !col_b->bv_allnulls);

	filter_a = (BloomFilter *) PG_DETOAST_DATUM(col_a->bv_values[0]);
	filter_b = (BloomFilter *) PG_DETOAST_DATUM(col_b->bv_values[0]);

	/* make sure the filters use the same parameters */
	Assert(filter_a && filter_b);
	Assert(filter_a->nbits == filter_b->nbits);
	Assert(filter_a->nhashes == filter_b->nhashes);
	Assert((filter_a->nbits > 0) && (filter_a->nbits % 8 == 0));

	nbytes = (filter_a->nbits) / 8;

	/* simply OR the bitmaps */
	for (i = 0; i < nbytes; i++)
		filter_a->data[i] |= filter_b->data[i];

	PG_RETURN_VOID();
}

/*
 * Cache and return inclusion opclass support procedure
 *
 * Return the procedure corresponding to the given function support number
 * or null if it does not exist.
 */
static FmgrInfo *
bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
{
	BloomOpaque *opaque;
	uint16		basenum = procnum - PROCNUM_BASE;

	/*
	 * We cache these in the opaque struct, to avoid repetitive syscache
	 * lookups.
	 */
	opaque = (BloomOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;

	/*
	 * If we already searched for this proc and didn't find it, don't bother
	 * searching again.
	 */
	if (opaque->extra_proc_missing[basenum])
		return NULL;

	if (opaque->extra_procinfos[basenum].fn_oid == InvalidOid)
	{
		if (RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno,
												procnum)))
		{
			fmgr_info_copy(&opaque->extra_procinfos[basenum],
						   index_getprocinfo(bdesc->bd_index, attno, procnum),
						   bdesc->bd_context);
		}
		else
		{
			opaque->extra_proc_missing[basenum] = true;
			return NULL;
		}
	}

	return &opaque->extra_procinfos[basenum];
}

Datum
brin_bloom_options(PG_FUNCTION_ARGS)
{
	local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0);

	init_local_reloptions(relopts, sizeof(BloomOptions));

	add_local_real_reloption(relopts, "n_distinct_per_range",
							 "number of distinct items expected in a BRIN page range",
							 BLOOM_DEFAULT_NDISTINCT_PER_RANGE,
							 -1.0, INT_MAX, offsetof(BloomOptions, nDistinctPerRange));

	add_local_real_reloption(relopts, "false_positive_rate",
							 "desired false-positive rate for the bloom filters",
							 BLOOM_DEFAULT_FALSE_POSITIVE_RATE,
							 BLOOM_MIN_FALSE_POSITIVE_RATE,
							 BLOOM_MAX_FALSE_POSITIVE_RATE,
							 offsetof(BloomOptions, falsePositiveRate));

	PG_RETURN_VOID();
}

/*
 * brin_bloom_summary_in
 *		- input routine for type brin_bloom_summary.
 *
 * brin_bloom_summary is only used internally to represent summaries
 * in BRIN bloom indexes, so it has no operations of its own, and we
 * disallow input too.
 */
Datum
brin_bloom_summary_in(PG_FUNCTION_ARGS)
{
	/*
	 * brin_bloom_summary stores the data in binary form and parsing text
	 * input is not needed, so disallow this.
	 */
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary")));

	PG_RETURN_VOID();			/* keep compiler quiet */
}


/*
 * brin_bloom_summary_out
 *		- output routine for type brin_bloom_summary.
 *
 * BRIN bloom summaries are serialized into a bytea value, but we want
 * to output something nicer humans can understand.
 */
Datum
brin_bloom_summary_out(PG_FUNCTION_ARGS)
{
	BloomFilter *filter;
	StringInfoData str;

	/* detoast the data to get value with a full 4B header */
	filter = (BloomFilter *) PG_DETOAST_DATUM_PACKED(PG_GETARG_DATUM(0));

	initStringInfo(&str);
	appendStringInfoChar(&str, '{');

	appendStringInfo(&str, "mode: hashed  nhashes: %u  nbits: %u  nbits_set: %u",
					 filter->nhashes, filter->nbits, filter->nbits_set);

	appendStringInfoChar(&str, '}');

	PG_RETURN_CSTRING(str.data);
}

/*
 * brin_bloom_summary_recv
 *		- binary input routine for type brin_bloom_summary.
 */
Datum
brin_bloom_summary_recv(PG_FUNCTION_ARGS)
{
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary")));

	PG_RETURN_VOID();			/* keep compiler quiet */
}

/*
 * brin_bloom_summary_send
 *		- binary output routine for type brin_bloom_summary.
 *
 * BRIN bloom summaries are serialized in a bytea value (although the
 * type is named differently), so let's just send that.
 */
Datum
brin_bloom_summary_send(PG_FUNCTION_ARGS)
{
	return byteasend(fcinfo);
}