3838 * | DONE |
3939 * \===========/
4040 *
41+ *
42+ *
43+ * An N-key SkipScan would need 2^N null check stages when using the above scheme,
44+ * and is further complicated by having to change the searches for previous keys.
45+ *
46+ * So we made a decision to support multikey SkipScan in NOT NULL mode only.
47+ *
48+ * For N-key SkipScan we search with these predicates when current key = K:
49+ * (key_1 = prev_1),...,(key_K > prev_K),(key_K+1 IS NOT NULL)...(key_N IS NOT NULL)
50+ *
51+ * As all skip keys are NOT NULL, "IS NOT NULL" matches any value, so it fetches the next tuple for a key with no previous value.
52+ *
53+ * We start the search with K=1 i.e. with these predicates:
54+ * (key_1 IS NOT NULL),...,(key_N IS NOT NULL).
55+ *
56+ * When a tuple is fetched we set K=N, as we can now fill all previous values; the search becomes:
57+ * (key_1 = prev_1),...,(key_N > prev_N)
58+ *
59+ * When no tuple is fetched and K>1, we can relax the search and move to the previous key (K-1):
60+ * (key_1 = prev_1),...,(key_K-1 > prev_K-1),(key_K IS NOT NULL)...(key_N IS NOT NULL)
61+ *
62+ * When no tuple is fetched and K=1, we are done.
63+ *
64+ * Multikey SkipScan flowchart:
65+ * start (K=1)
66+ * | +---------+
67+ * | | |
68+ * v v |
69+ * +=================================+ |
70+ * | search for NOT NULL after K | |
71+ * +=================================+ |
72+ * | | |
73+ * | found value | |
74+ * v | |
75+ * +==============================+ | |
76+ * | search for values after prev | | |
77+ * +==============================+ | |
78+ * | | |
79+ * | no value | |
80+ * v v |
81+ * +======================+ |
82+ * | K=1 | K>1 |
83+ * v v |
84+ * /===========\ +=========+ |
85+ * | DONE | | K = K-1 |---+
86+ * \===========/ +=========+
87+ *
4188 */
4289
4390#include <postgres.h>
@@ -58,6 +105,7 @@ typedef enum SkipScanStage
58105 SS_NOT_NULL ,
59106 SS_VALUES ,
60107 SS_NULLS_LAST ,
108+ SS_PREV_KEY ,
61109 SS_END ,
62110} SkipScanStage ;
63111
@@ -93,6 +141,17 @@ typedef struct SkipScanState
93141 int num_skip_keys ;
94142 SkipKeyData * skip_keys ;
95143
144+ /* Skip key with ">" qual, coming after "=" skip quals for multikey SkipScan */
145+ int current_key ;
146+
147+ /* For Multikey SkipScan we keep copies of "sk_func" for "=" and ">" for keys 1..N-1
148+ * to be swapped during execution.
149+ */
150+ FmgrInfo * eq_funcs ;
151+ /* Will be filled after IndexScan scankeys have been initialized */
152+ FmgrInfo * comp_funcs ;
153+ StrategyNumber * comp_strategies ;
154+
96155 SkipScanStage stage ;
97156
98157 /* rescan required before getting next tuple */
@@ -157,31 +216,44 @@ skip_scan_begin(CustomScanState *node, EState *estate, int eflags)
157216 /* find position of our skip key
158217 * skip key is put as first key for the respective column in sort_indexquals
159218 */
160- ScanKey data = * state -> scan_keys ;
219+ ScanKey scankeydata = * state -> scan_keys ;
161220 int j = 0 ;
162221 for (int i = 0 ; i < * state -> num_scan_keys ; i ++ )
163222 {
164- if (data [i ].sk_flags == SK_ISNULL && data [i ].sk_attno == state -> skip_keys [j ].sk_attno )
223+ if (scankeydata [i ].sk_flags == SK_ISNULL &&
224+ scankeydata [i ].sk_attno == state -> skip_keys [j ].sk_attno )
165225 {
166- state -> skip_keys [j ++ ].skip_key = & data [i ];
226+ SkipKeyData * skipkeydata = & state -> skip_keys [j ++ ];
227+ skipkeydata -> skip_key = & scankeydata [i ];
228+ /* Set up ">" sk_func swaps for skip keys 1..N-1 */
229+ if (j < state -> num_skip_keys )
230+ {
231+ state -> comp_strategies [j - 1 ] = scankeydata [i ].sk_strategy ;
232+ fmgr_info_copy (& state -> comp_funcs [j - 1 ],
233+ & scankeydata [i ].sk_func ,
234+ CurrentMemoryContext );
235+ }
167236 if (j == state -> num_skip_keys )
168237 break ;
169238 }
170239 }
171240 if (j < state -> num_skip_keys )
172241 elog (ERROR , "ScanKey for skip qual not found" );
242+
243+ /* when we fetch the 1st tuple we update all skip keys, i.e. indexes 0 to N-1 */
244+ state -> current_key = 0 ;
173245}
174246
175247static bool
176248has_nulls_first (SkipScanState * state )
177249{
178- return state -> skip_keys [0 ].nulls == SKIPKEY_NULLS_FIRST ;
250+ return state -> skip_keys [0 ].nulls == SK_NULLS_FIRST ;
179251}
180252
181253static bool
182254has_nulls_last (SkipScanState * state )
183255{
184- return state -> skip_keys [0 ].nulls == SKIPKEY_NULLS_LAST ;
256+ return state -> skip_keys [0 ].nulls == SK_NULLS_LAST ;
185257}
186258
187259static void
@@ -223,18 +295,48 @@ skip_scan_rescan_index(SkipScanState *state)
223295static void
224296skip_scan_switch_stage (SkipScanState * state , SkipScanStage new_stage )
225297{
226- Assert (new_stage > state -> stage );
298+ Assert (new_stage > state -> stage || state -> num_skip_keys > 1 );
227299
228300 switch (new_stage )
229301 {
230302 case SS_NOT_NULL :
231- state -> skip_keys [0 ].skip_key -> sk_flags = SK_ISNULL | SK_SEARCHNOTNULL ;
232- state -> skip_keys [0 ].skip_key -> sk_argument = 0 ;
303+ for (int i = 0 ; i < state -> num_skip_keys ; i ++ )
304+ {
305+ state -> skip_keys [i ].skip_key -> sk_flags = SK_ISNULL | SK_SEARCHNOTNULL ;
306+ state -> skip_keys [i ].skip_key -> sk_argument = 0 ;
307+ }
308+ state -> needs_rescan = true;
309+ break ;
310+
311+ case SS_PREV_KEY :
312+ /* Done searching with ">" for this key: set this key to NOT NULL i.e. any value,
313+ * set previous "=" key to search with ">".
314+ */
315+ state -> skip_keys [state -> current_key ].skip_key -> sk_flags = SK_ISNULL | SK_SEARCHNOTNULL ;
316+ state -> current_key -- ;
317+ state -> skip_keys [state -> current_key ].skip_key -> sk_flags = 0 ;
318+ fmgr_info_copy (& state -> skip_keys [state -> current_key ].skip_key -> sk_func ,
319+ & state -> comp_funcs [state -> current_key ],
320+ CurrentMemoryContext );
321+ state -> skip_keys [state -> current_key ].skip_key -> sk_strategy =
322+ state -> comp_strategies [state -> current_key ];
233323 state -> needs_rescan = true;
234324 break ;
235325
236326 case SS_VALUES :
237- state -> skip_keys [0 ].skip_key -> sk_flags = 0 ;
327+ for (int i = 0 ; i < state -> num_skip_keys ; i ++ )
328+ {
329+ state -> skip_keys [i ].skip_key -> sk_flags = 0 ;
330+ /* reset all ">" back to "=" from the current key to N-1 */
331+ if (i >= state -> current_key && i < state -> num_skip_keys - 1 )
332+ {
333+ fmgr_info_copy (& state -> skip_keys [i ].skip_key -> sk_func ,
334+ & state -> eq_funcs [i ],
335+ CurrentMemoryContext );
336+ state -> skip_keys [i ].skip_key -> sk_strategy = BTEqualStrategyNumber ;
337+ }
338+ }
339+ state -> current_key = state -> num_skip_keys - 1 ;
238340 state -> needs_rescan = true;
239341 break ;
240342
@@ -256,31 +358,32 @@ skip_scan_switch_stage(SkipScanState *state, SkipScanStage new_stage)
256358static void
257359skip_scan_update_key (SkipScanState * state , TupleTableSlot * slot )
258360{
259- if (! state -> skip_keys [ 0 ]. prev_is_null && ! state -> skip_keys [ 0 ]. distinct_by_val )
361+ for ( int i = state -> current_key ; i < state -> num_skip_keys ; i ++ )
260362 {
261- Assert (state -> stage == SS_VALUES );
262- pfree (DatumGetPointer (state -> skip_keys [0 ].prev_datum ));
263- }
363+ if (!state -> skip_keys [i ].prev_is_null && !state -> skip_keys [i ].distinct_by_val )
364+ {
365+ Assert (state -> stage == SS_VALUES || state -> num_skip_keys > 1 );
366+ pfree (DatumGetPointer (state -> skip_keys [i ].prev_datum ));
367+ }
264368
265- MemoryContext old_ctx = MemoryContextSwitchTo (state -> ctx );
266- state -> skip_keys [0 ].prev_datum = slot_getattr (slot ,
267- state -> skip_keys [0 ].distinct_col_attnum ,
268- & state -> skip_keys [0 ].prev_is_null );
269- if (state -> skip_keys [0 ].prev_is_null )
270- {
271- state -> skip_keys [0 ].skip_key -> sk_flags = SK_ISNULL ;
272- state -> skip_keys [0 ].skip_key -> sk_argument = 0 ;
273- }
274- else
275- {
276- state -> skip_keys [0 ].prev_datum = datumCopy (state -> skip_keys [0 ].prev_datum ,
277- state -> skip_keys [0 ].distinct_by_val ,
278- state -> skip_keys [0 ].distinct_typ_len );
279- state -> skip_keys [0 ].skip_key -> sk_argument = state -> skip_keys [0 ].prev_datum ;
369+ MemoryContext old_ctx = MemoryContextSwitchTo (state -> ctx );
370+ state -> skip_keys [i ].prev_datum = slot_getattr (slot ,
371+ state -> skip_keys [i ].distinct_col_attnum ,
372+ & state -> skip_keys [i ].prev_is_null );
373+ if (state -> skip_keys [i ].prev_is_null )
374+ {
375+ state -> skip_keys [i ].skip_key -> sk_flags = SK_ISNULL ;
376+ state -> skip_keys [i ].skip_key -> sk_argument = 0 ;
377+ }
378+ else
379+ {
380+ state -> skip_keys [i ].prev_datum = datumCopy (state -> skip_keys [i ].prev_datum ,
381+ state -> skip_keys [i ].distinct_by_val ,
382+ state -> skip_keys [i ].distinct_typ_len );
383+ state -> skip_keys [i ].skip_key -> sk_argument = state -> skip_keys [i ].prev_datum ;
384+ }
385+ MemoryContextSwitchTo (old_ctx );
280386 }
281-
282- MemoryContextSwitchTo (old_ctx );
283-
284387 /* we need to do a rescan whenever we modify the ScanKey */
285388 state -> needs_rescan = true;
286389}
@@ -330,6 +433,7 @@ skip_scan_exec(CustomScanState *node)
330433 break ;
331434
332435 case SS_NOT_NULL :
436+ case SS_PREV_KEY :
333437 case SS_VALUES :
334438 child_state = linitial (state -> cscan_state .custom_ps );
335439 result = child_state -> ps .ExecProcNode (& child_state -> ps );
@@ -343,10 +447,10 @@ skip_scan_exec(CustomScanState *node)
343447 * also switch stage to look for values greater than
344448 * that in subsequent calls.
345449 */
346- if (state -> stage == SS_NOT_NULL )
450+ skip_scan_update_key (state , result );
451+ if (state -> stage == SS_NOT_NULL || state -> stage == SS_PREV_KEY )
347452 skip_scan_switch_stage (state , SS_VALUES );
348453
349- skip_scan_update_key (state , result );
350454 return result ;
351455 }
352456 else
@@ -356,9 +460,15 @@ skip_scan_exec(CustomScanState *node)
356460 * the skip constraint we are either done
357461 * for NULLS FIRST ordering or need to check
358462 * for NULLs if we have NULLS LAST ordering
463+ *
464+ * Or we can move back one key for multikey SkipScan to relax the search,
465+ * i.e. make current key NOT NULL (any value) and change previous search from
466+ * "=" to ">"
359467 */
360468 if (has_nulls_last (state ))
361469 skip_scan_switch_stage (state , SS_NULLS_LAST );
470+ else if (state -> current_key > 0 )
471+ skip_scan_switch_stage (state , SS_PREV_KEY );
362472 else
363473 skip_scan_switch_stage (state , SS_END );
364474 }
@@ -401,8 +511,11 @@ skip_scan_rescan(CustomScanState *node)
401511 else
402512 skip_scan_switch_stage (state , SS_NOT_NULL );
403513
404- state -> skip_keys [0 ].prev_is_null = true;
405- state -> skip_keys [0 ].prev_datum = 0 ;
514+ for (int i = 0 ; i < state -> num_skip_keys ; i ++ )
515+ {
516+ state -> skip_keys [i ].prev_is_null = true;
517+ state -> skip_keys [i ].prev_datum = 0 ;
518+ }
406519
407520 state -> needs_rescan = false;
408521 ScanState * child_state = linitial (state -> cscan_state .custom_ps );
@@ -435,24 +548,54 @@ tsl_skip_scan_state_create(CustomScan *cscan)
435548 }
436549 state -> stage = SS_BEGIN ;
437550
438- state -> num_skip_keys = list_length (cscan -> custom_private );
551+ /* set up N skipkeyinfos for N skip keys */
552+ List * skinfos = (List * ) linitial (cscan -> custom_private );
553+ state -> num_skip_keys = list_length (skinfos );
439554 state -> skip_keys = palloc (sizeof (SkipKeyData ) * state -> num_skip_keys );
440555
441556 ListCell * lc ;
442557 int i = 0 ;
443- foreach (lc , cscan -> custom_private )
558+ foreach (lc , skinfos )
444559 {
445560 List * skipkeyinfo = (List * ) lfirst (lc );
446561
447- state -> skip_keys [i ].distinct_col_attnum = linitial_int (skipkeyinfo );
448- state -> skip_keys [i ].distinct_by_val = lsecond_int (skipkeyinfo );
449- state -> skip_keys [i ].distinct_typ_len = lthird_int (skipkeyinfo );
450- state -> skip_keys [i ].nulls = lfourth_int (skipkeyinfo );
451- state -> skip_keys [i ].sk_attno = list_nth_int (skipkeyinfo , 4 );
562+ state -> skip_keys [i ].distinct_col_attnum = list_nth_int (skipkeyinfo , SK_DistinctColAttno );
563+ state -> skip_keys [i ].distinct_by_val = list_nth_int (skipkeyinfo , SK_DistinctByVal );
564+ state -> skip_keys [i ].distinct_typ_len = list_nth_int (skipkeyinfo , SK_DistinctTypeLen );
565+ state -> skip_keys [i ].nulls = list_nth_int (skipkeyinfo , SK_NullStatus );
566+ Assert (state -> num_skip_keys == 1 || state -> skip_keys [i ].nulls == SK_NOT_NULL );
567+ state -> skip_keys [i ].sk_attno = list_nth_int (skipkeyinfo , SK_IndexKeyAttno );
452568
453569 state -> skip_keys [i ].prev_is_null = true;
454570 i ++ ;
455571 }
572+
573+ state -> eq_funcs = NULL ;
574+ state -> comp_funcs = NULL ;
575+ state -> comp_strategies = NULL ;
576+
577+ /* set up N-1 equality ops for N skip keys if N>1 */
578+ if (state -> num_skip_keys > 1 )
579+ {
580+ /* Should have a list of N-1 equality op Oids for N skip keys if N>1 */
581+ Assert (list_length (cscan -> custom_private ) == 2 );
582+ List * eqoids = (List * ) lsecond (cscan -> custom_private );
583+
584+ state -> eq_funcs = palloc (sizeof (FmgrInfo ) * (state -> num_skip_keys - 1 ));
585+ state -> comp_funcs = palloc (sizeof (FmgrInfo ) * (state -> num_skip_keys - 1 ));
586+ state -> comp_strategies = palloc (sizeof (StrategyNumber ) * (state -> num_skip_keys - 1 ));
587+
588+ int i = 0 ;
589+ /* Set up "=" sk_funcs for keys 1..N-1 */
590+ foreach (lc , eqoids )
591+ {
592+ Oid eqoid = lfirst_oid (lc );
593+ Assert (OidIsValid (eqoid ));
594+ fmgr_info (eqoid , & state -> eq_funcs [i ++ ]);
595+ }
596+ Assert (i == state -> num_skip_keys - 1 );
597+ }
598+
456599 state -> cscan_state .methods = & skip_scan_state_methods ;
457600 return (Node * ) state ;
458601}
0 commit comments