From 9276312ccef820f1975c45b791addc64c38941b0 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:20:26 +0100 Subject: [PATCH] Vectorize predicates that use scalar array operations We can do this if the underlying scalar predicate is vectorizable, by running the vector predicate on each element of the array and combining the results. --- tsl/src/nodes/decompress_chunk/CMakeLists.txt | 1 + .../nodes/decompress_chunk/compressed_batch.c | 74 +++- tsl/src/nodes/decompress_chunk/exec.c | 12 +- tsl/src/nodes/decompress_chunk/planner.c | 92 +++-- .../decompress_chunk/pred_vector_array.c | 206 ++++++++++ .../pred_vector_const_arithmetic_single.c | 8 - .../decompress_chunk/vector_predicates.c | 13 +- .../decompress_chunk/vector_predicates.h | 8 +- tsl/test/expected/decompress_vector_qual.out | 351 +++++++++++++++++- tsl/test/sql/decompress_vector_qual.sql | 96 +++++ 10 files changed, 808 insertions(+), 53 deletions(-) create mode 100644 tsl/src/nodes/decompress_chunk/pred_vector_array.c diff --git a/tsl/src/nodes/decompress_chunk/CMakeLists.txt b/tsl/src/nodes/decompress_chunk/CMakeLists.txt index 0ef8c79a67a..ab122f1dfd6 100644 --- a/tsl/src/nodes/decompress_chunk/CMakeLists.txt +++ b/tsl/src/nodes/decompress_chunk/CMakeLists.txt @@ -6,6 +6,7 @@ set(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/decompress_chunk.c ${CMAKE_CURRENT_SOURCE_DIR}/exec.c ${CMAKE_CURRENT_SOURCE_DIR}/planner.c + ${CMAKE_CURRENT_SOURCE_DIR}/pred_vector_array.c ${CMAKE_CURRENT_SOURCE_DIR}/qual_pushdown.c ${CMAKE_CURRENT_SOURCE_DIR}/vector_predicates.c) target_sources(${TSL_LIBRARY_NAME} PRIVATE ${SOURCES}) diff --git a/tsl/src/nodes/decompress_chunk/compressed_batch.c b/tsl/src/nodes/decompress_chunk/compressed_batch.c index 5875d23f9e0..b3f1825cb5a 100644 --- a/tsl/src/nodes/decompress_chunk/compressed_batch.c +++ b/tsl/src/nodes/decompress_chunk/compressed_batch.c @@ -183,6 +183,16 @@ compute_vector_quals(DecompressChunkState *chunk_state, DecompressBatchState *ba const int bitmap_bytes = sizeof(uint64) * ((batch_state->total_batch_rows + 63) / 64); batch_state->vector_qual_result = palloc(bitmap_bytes); memset(batch_state->vector_qual_result, 0xFF, bitmap_bytes); + if (batch_state->total_batch_rows % 64 != 0) + { + /* + * We have to zero out the bits for past-the-end elements in the last + * bitmap word. Since all predicates are ANDed to the result bitmap, + * we can do it here once instead of doing it in each predicate. + */ + const uint64 mask = ((uint64) -1) >> (64 - batch_state->total_batch_rows % 64); + batch_state->vector_qual_result[batch_state->total_batch_rows / 64] = mask; + } /* * Compute the quals. @@ -190,14 +200,37 @@ compute_vector_quals(DecompressChunkState *chunk_state, DecompressBatchState *ba ListCell *lc; foreach (lc, chunk_state->vectorized_quals_constified) { - /* For now we only support "Var ? Const" predicates. */ - OpExpr *oe = castNode(OpExpr, lfirst(lc)); - Var *var = castNode(Var, linitial(oe->args)); - Const *constnode = castNode(Const, lsecond(oe->args)); + /* + * For now we support "Var ? Const" predicates and + * ScalarArrayOperations. 
+ */ + List *args = NULL; + RegProcedure vector_const_opcode = InvalidOid; + ScalarArrayOpExpr *saop = NULL; + OpExpr *opexpr = NULL; + if (IsA(lfirst(lc), ScalarArrayOpExpr)) + { + saop = castNode(ScalarArrayOpExpr, lfirst(lc)); + args = saop->args; + vector_const_opcode = get_opcode(saop->opno); + } + else + { + opexpr = castNode(OpExpr, lfirst(lc)); + args = opexpr->args; + vector_const_opcode = get_opcode(opexpr->opno); + } + + /* + * Find the vector_const predicate. + */ + VectorPredicate *vector_const_predicate = get_vector_const_predicate(vector_const_opcode); + Assert(vector_const_predicate != NULL); /* * Find the compressed column referred to by the Var. */ + Var *var = castNode(Var, linitial(args)); DecompressChunkColumnDescription *column_description = NULL; int column_index = 0; for (; column_index < chunk_state->num_total_columns; column_index++) @@ -273,20 +306,37 @@ compute_vector_quals(DecompressChunkState *chunk_state, DecompressBatchState *ba predicate_result = &default_value_predicate_result; } - /* Find and compute the predicate. */ - void (*predicate)(const ArrowArray *, Datum, uint64 *restrict) = - get_vector_const_predicate(get_opcode(oe->opno)); - Ensure(predicate != NULL, - "vectorized predicate not found for postgres predicate %d", - get_opcode(oe->opno)); - /* * The vectorizable predicates should be STRICT, so we shouldn't see null * constants here. */ + Const *constnode = castNode(Const, lsecond(args)); Ensure(!constnode->constisnull, "vectorized predicate called for a null value"); - predicate(vector, constnode->constvalue, predicate_result); + /* + * At last, compute the predicate. + */ + if (saop) + { + vector_array_predicate(vector_const_predicate, + saop->useOr, + vector, + constnode->constvalue, + predicate_result); + } + else + { + vector_const_predicate(vector, constnode->constvalue, predicate_result); + } + + /* Account for nulls which shouldn't pass the predicate. */ + const size_t n = vector->length; + const size_t n_words = (n + 63) / 64; + const uint64 *restrict validity = (uint64 *restrict) vector->buffers[0]; + for (size_t i = 0; i < n_words; i++) + { + predicate_result[i] &= validity[i]; + } /* Process the result. */ if (column_values->arrow == NULL) diff --git a/tsl/src/nodes/decompress_chunk/exec.c b/tsl/src/nodes/decompress_chunk/exec.c index 7ced5805810..28a54b47faf 100644 --- a/tsl/src/nodes/decompress_chunk/exec.c +++ b/tsl/src/nodes/decompress_chunk/exec.c @@ -532,8 +532,16 @@ decompress_chunk_begin(CustomScanState *node, EState *estate, int eflags) } } - OpExpr *opexpr = castNode(OpExpr, constified); - Ensure(IsA(lsecond(opexpr->args), Const), + List *args; + if (IsA(constified, OpExpr)) + { + args = castNode(OpExpr, constified)->args; + } + else + { + args = castNode(ScalarArrayOpExpr, constified)->args; + } + Ensure(IsA(lsecond(args), Const), "failed to evaluate runtime constant in vectorized filter"); chunk_state->vectorized_quals_constified = lappend(chunk_state->vectorized_quals_constified, constified); diff --git a/tsl/src/nodes/decompress_chunk/planner.c b/tsl/src/nodes/decompress_chunk/planner.c index d9fefbc6f0d..f159d0d0893 100644 --- a/tsl/src/nodes/decompress_chunk/planner.c +++ b/tsl/src/nodes/decompress_chunk/planner.c @@ -431,34 +431,57 @@ is_not_runtime_constant(Node *node) static Node * make_vectorized_qual(DecompressChunkPath *path, Node *qual) { - /* Only simple "Var op Const" binary predicates for now. 
*/ - if (!IsA(qual, OpExpr)) + /* + * Currently we vectorize some "Var op Const" binary predicates, + * and scalar array operations with these predicates. + */ + if (!IsA(qual, OpExpr) && !IsA(qual, ScalarArrayOpExpr)) { return NULL; } - OpExpr *o = castNode(OpExpr, qual); + List *args = NIL; + OpExpr *opexpr = NULL; + Oid opno = InvalidOid; + ScalarArrayOpExpr *saop = NULL; + if (IsA(qual, OpExpr)) + { + opexpr = castNode(OpExpr, qual); + args = opexpr->args; + opno = opexpr->opno; + } + else + { + saop = castNode(ScalarArrayOpExpr, qual); + args = saop->args; + opno = saop->opno; + } - if (list_length(o->args) != 2) + if (list_length(args) != 2) { return NULL; } - if (IsA(lsecond(o->args), Var)) + if (opexpr && IsA(lsecond(args), Var)) { - /* Try to commute the operator if the constant is on the right. */ - Oid commutator_opno = get_commutator(o->opno); - if (OidIsValid(commutator_opno)) + /* + * Try to commute the operator if we have Var on the right. + */ + opno = get_commutator(opno); + if (!OidIsValid(opno)) { - o = (OpExpr *) copyObject(o); - o->opno = commutator_opno; - /* - * opfuncid is a cache, we can set it to InvalidOid like the - * CommuteOpExpr() does. - */ - o->opfuncid = InvalidOid; - o->args = list_make2(lsecond(o->args), linitial(o->args)); + return NULL; } + + opexpr = (OpExpr *) copyObject(opexpr); + opexpr->opno = opno; + /* + * opfuncid is a cache, we can set it to InvalidOid like the + * CommuteOpExpr() does. + */ + opexpr->opfuncid = InvalidOid; + args = list_make2(lsecond(args), linitial(args)); + opexpr->args = args; } /* @@ -466,12 +489,12 @@ make_vectorized_qual(DecompressChunkPath *path, Node *qual) * side is a constant or can be evaluated to a constant at run time (e.g. * contains stable functions). */ - if (!IsA(linitial(o->args), Var) || is_not_runtime_constant(lsecond(o->args))) + if (!IsA(linitial(args), Var) || is_not_runtime_constant(lsecond(args))) { return NULL; } - Var *var = castNode(Var, linitial(o->args)); + Var *var = castNode(Var, linitial(args)); Assert((Index) var->varno == path->info->chunk_rel->relid); /* @@ -485,13 +508,26 @@ make_vectorized_qual(DecompressChunkPath *path, Node *qual) return NULL; } - Oid opcode = get_opcode(o->opno); - if (get_vector_const_predicate(opcode)) + Oid opcode = get_opcode(opno); + if (!get_vector_const_predicate(opcode)) { - return (Node *) o; + return NULL; } - return NULL; +#if PG14_GE + if (saop) + { + if (saop->hashfuncid) + { + /* + * Don't vectorize if the planner decided to build a hash table. + */ + return NULL; + } + } +#endif + + return opexpr ? 
(Node *) opexpr : (Node *) saop; } /* @@ -861,10 +897,16 @@ decompress_chunk_plan_create(PlannerInfo *root, RelOptInfo *rel, CustomPath *pat { elog(ERROR, "debug: encountered vector quals when they are disabled"); } - else if (ts_guc_debug_require_vector_qual == RVQ_Only && - list_length(decompress_plan->scan.plan.qual) > 0) + else if (ts_guc_debug_require_vector_qual == RVQ_Only) { - elog(ERROR, "debug: encountered non-vector quals when they are disabled"); + if (list_length(decompress_plan->scan.plan.qual) > 0) + { + elog(ERROR, "debug: encountered non-vector quals when they are disabled"); + } + if (list_length(vectorized_quals) == 0) + { + elog(ERROR, "debug: did not encounter vector quals when they are required"); + } } #endif diff --git a/tsl/src/nodes/decompress_chunk/pred_vector_array.c b/tsl/src/nodes/decompress_chunk/pred_vector_array.c new file mode 100644 index 00000000000..da154644d9b --- /dev/null +++ b/tsl/src/nodes/decompress_chunk/pred_vector_array.c @@ -0,0 +1,206 @@ +/* + * This file and its contents are licensed under the Timescale License. + * Please see the included NOTICE for copyright information and + * LICENSE-TIMESCALE for a copy of the license. + */ + +#include + +#include "compression/arrow_c_data_interface.h" + +#include "vector_predicates.h" + +#include "compression/compression.h" + +/* + * Vectorized implementation of ScalarArrayOpExpr. Applies scalar_predicate for + * vector and each element of array, combines the result according to "is_or" + * flag. Written along the lines of ExecEvalScalarArrayOp(). + */ +static inline void +vector_array_predicate_impl(VectorPredicate *vector_const_predicate, bool is_or, + const ArrowArray *vector, Datum array, uint64 *restrict final_result) +{ + const size_t result_bits = vector->length; + const size_t result_words = (result_bits + 63) / 64; + + uint64 *restrict array_result = NULL; + /* + * For OR, we need an intermediate storage to accumulate the results + * from all elements. + * For AND, we can apply predicate for each element to the final result. + */ + uint64 array_result_storage[(GLOBAL_MAX_ROWS_PER_COMPRESSION + 63) / 64]; + if (is_or) + { + array_result = array_result_storage; + for (size_t i = 0; i < result_words; i++) + { + array_result_storage[i] = 0; + } + + if (vector->length % 64 != 0) + { + /* + * Set the bits for past-the-end elements to 1. This way it's more + * convenient to check for early exit, and the final result should + * have them already set to 0 so it doesn't matter. + */ + const uint64 mask = ((uint64) -1) << (vector->length % 64); + array_result[vector->length / 64] = mask; + } + } + + ArrayType *arr = DatumGetArrayTypeP(array); + + int16 typlen; + bool typbyval; + char typalign; + get_typlenbyvalalign(ARR_ELEMTYPE(arr), &typlen, &typbyval, &typalign); + + const char *array_data = (const char *) ARR_DATA_PTR(arr); + const size_t nitems = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); + const uint64 *restrict array_null_bitmap = (uint64 *) ARR_NULLBITMAP(arr); + + for (size_t array_index = 0; array_index < nitems; array_index++) + { + if (array_null_bitmap != NULL && !arrow_row_is_valid(array_null_bitmap, array_index)) + { + /* + * This array element is NULL. We can't avoid NULLS when evaluating + * the stable functions at run time, so we have to support them. + * This is a predicate, not a generic scalar array operation, so + * thankfully we return a non-nullable bool. 
+ * For ANY: null | true = true, null | false = null, so this means + * we can skip the null element and continue evaluation. + * For ALL: null & true = null, null & false = false, so this means + * that for each row the condition goes to false, and we don't have + * to evaluate the next elements. + */ + if (is_or) + { + continue; + } + + for (size_t word = 0; word < result_words; word++) + { + final_result[word] = 0; + } + return; + } + Datum constvalue = fetch_att(array_data, typbyval, typlen); + array_data = att_addlength_pointer(array_data, typlen, array_data); + array_data = (char *) att_align_nominal(array_data, typalign); + + /* + * For OR, we also need an intermediate storage for predicate result + * for each array element, since the predicates AND their result. + * + * For AND, we can and apply predicate for each array element to the + * final result. + */ + uint64 single_result_storage[(GLOBAL_MAX_ROWS_PER_COMPRESSION + 63) / 64]; + uint64 *restrict single_result; + if (is_or) + { + single_result = single_result_storage; + for (size_t outer = 0; outer < result_words; outer++) + { + single_result[outer] = -1; + } + } + else + { + single_result = final_result; + } + + vector_const_predicate(vector, constvalue, single_result); + + if (is_or) + { + for (size_t outer = 0; outer < result_words; outer++) + { + array_result[outer] |= single_result[outer]; + } + } + + /* + * On big arrays, we want to sometimes check if we can exit early, + * to avoid being slower than the non-vectorized version which exits + * at first possibility. The frequency is chosen by benchmarking. + * In debug mode, do this more frequently to simplify testing. + */ +#ifdef NDEBUG + if (array_index > 0 && array_index % 16 == 0) +#else + if (array_index > 0 && array_index % 3 == 0) +#endif + { + if (is_or) + { + /* + * Note that we have set the bits for past-the-end rows in + * array_result to 1, so we can use simple AND here. + */ + uint64 all_rows_match = -1; + for (size_t word = 0; word < result_words; word++) + { + all_rows_match &= array_result[word]; + } + if (all_rows_match == -1ULL) + { + return; + } + } + else + { + uint64 any_rows_match = 0; + for (size_t word = 0; word < result_words; word++) + { + any_rows_match |= final_result[word]; + } + if (any_rows_match == 0) + { + return; + } + } + } + } + + if (is_or) + { + for (size_t outer = 0; outer < result_words; outer++) + { + /* + * The tail bits corresponding to past-the-end rows when n % 64 != 0 + * should be already zeroed out in the final_result. + */ + final_result[outer] &= array_result[outer]; + } + } +} + +/* + * This is a thin wrapper to nudge the compiler to specialize the AND version + * which is much simpler than the OR version. 
+ */ +static pg_noinline void +vector_array_predicate_and(VectorPredicate *scalar_predicate, const ArrowArray *vector, Datum array, + uint64 *restrict result) +{ + vector_array_predicate_impl(scalar_predicate, /* is_or = */ false, vector, array, result); +} + +void +vector_array_predicate(VectorPredicate *scalar_predicate, bool is_or, const ArrowArray *vector, + Datum array, uint64 *restrict result) +{ + if (is_or) + { + vector_array_predicate_impl(scalar_predicate, /* is_or = */ true, vector, array, result); + } + else + { + vector_array_predicate_and(scalar_predicate, vector, array, result); + } +} diff --git a/tsl/src/nodes/decompress_chunk/pred_vector_const_arithmetic_single.c b/tsl/src/nodes/decompress_chunk/pred_vector_const_arithmetic_single.c index d6c41c4bfc3..d89f54eebfe 100644 --- a/tsl/src/nodes/decompress_chunk/pred_vector_const_arithmetic_single.c +++ b/tsl/src/nodes/decompress_chunk/pred_vector_const_arithmetic_single.c @@ -27,14 +27,6 @@ FUNCTION_NAME(PREDICATE_NAME, VECTOR_CTYPE, { const size_t n = arrow->length; - /* Account for nulls which shouldn't pass the predicate. */ - const size_t n_words = (n + 63) / 64; - const uint64 *restrict validity = (uint64 *restrict) arrow->buffers[0]; - for (size_t i = 0; i < n_words; i++) - { - result[i] &= validity[i]; - } - /* Now run the predicate itself. */ const CONST_CTYPE constvalue = CONST_CONVERSION(constdatum); const VECTOR_CTYPE *restrict vector = (VECTOR_CTYPE *restrict) arrow->buffers[1]; diff --git a/tsl/src/nodes/decompress_chunk/vector_predicates.c b/tsl/src/nodes/decompress_chunk/vector_predicates.c index 3f91a0a915f..f225383ec3f 100644 --- a/tsl/src/nodes/decompress_chunk/vector_predicates.c +++ b/tsl/src/nodes/decompress_chunk/vector_predicates.c @@ -13,19 +13,26 @@ #include #include -#include "compat/compat.h" #include "compression/arrow_c_data_interface.h" #include "vector_predicates.h" +#include "compat/compat.h" +#include "compression/compression.h" +#include "debug_assert.h" + +/* + * We include all implementations of vector-const predicates here. No separate + * declarations for them to reduce the amount of macro template magic. + */ #include "pred_vector_const_arithmetic_all.c" /* * Look up the vectorized implementation for a Postgres predicate, specified by * its Oid in pg_proc. Note that this Oid is different from the opcode. 
*/ -void (*get_vector_const_predicate(Oid pg_predicate))(const ArrowArray *, const Datum, - uint64 *restrict) +VectorPredicate * +get_vector_const_predicate(Oid pg_predicate) { switch (pg_predicate) { diff --git a/tsl/src/nodes/decompress_chunk/vector_predicates.h b/tsl/src/nodes/decompress_chunk/vector_predicates.h index f00d72dfe44..56d7809585f 100644 --- a/tsl/src/nodes/decompress_chunk/vector_predicates.h +++ b/tsl/src/nodes/decompress_chunk/vector_predicates.h @@ -10,5 +10,9 @@ #pragma once -void (*get_vector_const_predicate(Oid pg_predicate))(const ArrowArray *, const Datum, - uint64 *restrict); +typedef void(VectorPredicate)(const ArrowArray *, Datum, uint64 *restrict); + +VectorPredicate *get_vector_const_predicate(Oid pg_predicate); + +void vector_array_predicate(VectorPredicate *scalar_predicate, bool is_or, const ArrowArray *vector, + Datum array, uint64 *restrict result); diff --git a/tsl/test/expected/decompress_vector_qual.out b/tsl/test/expected/decompress_vector_qual.out index 430a67507bc..4ccf378e15a 100644 --- a/tsl/test/expected/decompress_vector_qual.out +++ b/tsl/test/expected/decompress_vector_qual.out @@ -1,6 +1,7 @@ -- This file and its contents are licensed under the Timescale License. -- Please see the included NOTICE for copyright information and -- LICENSE-TIMESCALE for a copy of the license. +\c :TEST_DBNAME :ROLE_SUPERUSER create table vectorqual(metric1 int8, ts timestamp, metric2 int8, device int8); select create_hypertable('vectorqual', 'ts'); WARNING: column type "timestamp without time zone" used for "ts" does not follow best practices @@ -184,6 +185,29 @@ select count(*) from vectorqual where !!metric3; 5 (1 row) +-- Custom operator on column that supports bulk decompression is not vectorized. +set timescaledb.debug_require_vector_qual to 'forbid'; +create function int4eqq(int4, int4) returns bool as 'int4eq' language internal; +create operator === (function = 'int4eqq', rightarg = int4, leftarg = int4); +select count(*) from vectorqual where metric3 === 777; + count +------- + 2 +(1 row) + +select count(*) from vectorqual where metric3 === any(array[777, 888]); + count +------- + 2 +(1 row) + +-- It also doesn't have a commutator. +select count(*) from vectorqual where 777 === metric3; + count +------- + 2 +(1 row) + -- NullTest is not vectorized. set timescaledb.debug_require_vector_qual to 'forbid'; select count(*) from vectorqual where metric4 is null; @@ -198,6 +222,331 @@ select count(*) from vectorqual where metric4 is not null; 2 (1 row) +-- Scalar array operators are vectorized if the operator is vectorizable. +set timescaledb.debug_require_vector_qual to 'only'; +select count(*) from vectorqual where metric3 = any(array[777, 888]); /* default value */ + count +------- + 2 +(1 row) + +select count(*) from vectorqual where metric4 = any(array[44, 55]) /* default null */; + count +------- + 1 +(1 row) + +select count(*) from vectorqual where metric2 > any(array[-1, -2, -3]) /* any */; + count +------- + 5 +(1 row) + +select count(*) from vectorqual where metric2 > all(array[-1, -2, -3]) /* all */; + count +------- + 5 +(1 row) + +-- Also have to support null array elements, because they are impossible to +-- prevent in stable expressions. 
+set timescaledb.debug_require_vector_qual to 'only'; +select count(*) from vectorqual where metric2 = any(array[null::int]) /* any with null element */; + count +------- + 0 +(1 row) + +select count(*) from vectorqual where metric2 = any(array[22, null]) /* any with null element */; + count +------- + 1 +(1 row) + +select count(*) from vectorqual where metric2 = any(array[null, 32]) /* any with null element */; + count +------- + 1 +(1 row) + +select count(*) from vectorqual where metric2 = any(array[22, null, 32]) /* any with null element */; + count +------- + 2 +(1 row) + +select count(*) from vectorqual where metric2 = all(array[null::int]) /* all with null element */; + count +------- + 0 +(1 row) + +select count(*) from vectorqual where metric2 = all(array[22, null]) /* all with null element */; + count +------- + 0 +(1 row) + +select count(*) from vectorqual where metric2 = all(array[null, 32]) /* all with null element */; + count +------- + 0 +(1 row) + +select count(*) from vectorqual where metric2 = all(array[22, null, 32]) /* all with null element */; + count +------- + 0 +(1 row) + +-- Check early exit. +reset timescaledb.debug_require_vector_qual; +create table singlebatch(like vectorqual); +select create_hypertable('singlebatch', 'ts'); +WARNING: column type "timestamp without time zone" used for "ts" does not follow best practices + create_hypertable +-------------------------- + (3,public,singlebatch,t) +(1 row) + +alter table singlebatch set (timescaledb.compress); +insert into singlebatch select '2022-02-02 02:02:02', metric2, device, metric3, metric4, tag from vectorqual; +select count(compress_chunk(x, true)) from show_chunks('singlebatch') x; + count +------- + 1 +(1 row) + +set timescaledb.debug_require_vector_qual to 'only'; +-- Uncomment to generate the test reference w/o the vector optimizations. 
+-- set timescaledb.enable_bulk_decompression to off; +-- set timescaledb.debug_require_vector_qual to 'forbid'; +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 22]); + count +------- + 1 +(1 row) + +select count(*) from singlebatch where metric2 = any(array[0, 22, 0, 0, 0]); + count +------- + 1 +(1 row) + +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 0]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 != any(array[0, 0, 0, 0, 0]); + count +------- + 5 +(1 row) + +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 0]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 <= all(array[12, 0, 12, 12, 12]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 12]); + count +------- + 1 +(1 row) + +select count(*) from singlebatch where metric3 = 777 and metric2 = any(array[0, 0, 0, 0, 22]); + count +------- + 1 +(1 row) + +select count(*) from singlebatch where metric3 = 777 and metric2 = any(array[0, 22, 0, 0, 0]); + count +------- + 1 +(1 row) + +select count(*) from singlebatch where metric3 = 777 and metric2 = any(array[0, 0, 0, 0, 0]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric3 = 777 and metric2 != any(array[0, 0, 0, 0, 0]); + count +------- + 2 +(1 row) + +select count(*) from singlebatch where metric3 = 777 and metric2 <= all(array[12, 12, 12, 12, 0]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric3 = 777 and metric2 <= all(array[12, 0, 12, 12, 12]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric3 = 777 and metric2 <= all(array[12, 12, 12, 12, 12]); + count +------- + 1 +(1 row) + +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 22]) and metric3 = 777; + count +------- + 1 +(1 row) + +select count(*) from singlebatch where metric2 = any(array[0, 22, 0, 0, 0]) and metric3 = 777; + count +------- + 1 +(1 row) + +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 0]) and metric3 = 777; + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 != any(array[0, 0, 0, 0, 0]) and metric3 = 777; + count +------- + 2 +(1 row) + +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 0]) and metric3 = 777; + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 <= all(array[12, 0, 12, 12, 12]) and metric3 = 777; + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 12]) and metric3 = 777; + count +------- + 1 +(1 row) + +select count(*) from singlebatch where metric3 != 777 and metric2 = any(array[0, 0, 0, 0, 22]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric3 != 777 and metric2 = any(array[0, 22, 0, 0, 0]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric3 != 777 and metric2 = any(array[0, 0, 0, 0, 0]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric3 != 777 and metric2 != any(array[0, 0, 0, 0, 0]); + count +------- + 3 +(1 row) + +select count(*) from singlebatch where metric3 != 777 and metric2 <= all(array[12, 12, 12, 12, 0]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric3 != 777 and metric2 <= all(array[12, 0, 12, 12, 12]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric3 != 777 and 
metric2 <= all(array[12, 12, 12, 12, 12]); + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 22]) and metric3 != 777; + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 = any(array[0, 22, 0, 0, 0]) and metric3 != 777; + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 0]) and metric3 != 777; + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 != any(array[0, 0, 0, 0, 0]) and metric3 != 777; + count +------- + 3 +(1 row) + +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 0]) and metric3 != 777; + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 <= all(array[12, 0, 12, 12, 12]) and metric3 != 777; + count +------- + 0 +(1 row) + +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 12]) and metric3 != 777; + count +------- + 0 +(1 row) + +reset timescaledb.enable_bulk_decompression; +reset timescaledb.debug_require_vector_qual; +-- Comparison with other column not vectorized. +set timescaledb.debug_require_vector_qual to 'forbid'; +select count(*) from vectorqual where metric3 = metric4; + count +------- + 0 +(1 row) + +select count(*) from vectorqual where metric3 = any(array[metric4]); + count +------- + 0 +(1 row) + -- Vectorized filters also work if we have only stable functions on the right -- side that can be evaluated to a constant at run time. set timescaledb.debug_require_vector_qual to 'only'; @@ -270,7 +619,7 @@ select create_hypertable('date_table', 'ts'); NOTICE: adding not-null constraint to column "ts" create_hypertable ------------------------- - (3,public,date_table,t) + (5,public,date_table,t) (1 row) alter table date_table set (timescaledb.compress); diff --git a/tsl/test/sql/decompress_vector_qual.sql b/tsl/test/sql/decompress_vector_qual.sql index a4317a9a64d..9cee66531a0 100644 --- a/tsl/test/sql/decompress_vector_qual.sql +++ b/tsl/test/sql/decompress_vector_qual.sql @@ -2,6 +2,8 @@ -- Please see the included NOTICE for copyright information and -- LICENSE-TIMESCALE for a copy of the license. +\c :TEST_DBNAME :ROLE_SUPERUSER + create table vectorqual(metric1 int8, ts timestamp, metric2 int8, device int8); select create_hypertable('vectorqual', 'ts'); alter table vectorqual set (timescaledb.compress, timescaledb.compress_segmentby = 'device'); @@ -35,6 +37,7 @@ select count(*) from vectorqual where metric4 >= 0 /* nulls shouldn't pass the q set timescaledb.debug_require_vector_qual to 'forbid'; select count(*) from vectorqual where device = 1 /* can't apply vector ops to the segmentby column */; + -- Test columns that don't support bulk decompression. alter table vectorqual add column tag text; insert into vectorqual(ts, device, metric2, metric3, metric4, tag) values ('2025-01-01 00:00:00', 5, 52, 53, 54, 'tag5'); @@ -67,12 +70,105 @@ create operator !! (function = 'bool', rightarg = int4); select count(*) from vectorqual where !!metric3; +-- Custom operator on column that supports bulk decompression is not vectorized. +set timescaledb.debug_require_vector_qual to 'forbid'; +create function int4eqq(int4, int4) returns bool as 'int4eq' language internal; +create operator === (function = 'int4eqq', rightarg = int4, leftarg = int4); +select count(*) from vectorqual where metric3 === 777; +select count(*) from vectorqual where metric3 === any(array[777, 888]); + +-- It also doesn't have a commutator. 
+select count(*) from vectorqual where 777 === metric3; + + -- NullTest is not vectorized. set timescaledb.debug_require_vector_qual to 'forbid'; select count(*) from vectorqual where metric4 is null; select count(*) from vectorqual where metric4 is not null; +-- Scalar array operators are vectorized if the operator is vectorizable. +set timescaledb.debug_require_vector_qual to 'only'; +select count(*) from vectorqual where metric3 = any(array[777, 888]); /* default value */ +select count(*) from vectorqual where metric4 = any(array[44, 55]) /* default null */; +select count(*) from vectorqual where metric2 > any(array[-1, -2, -3]) /* any */; +select count(*) from vectorqual where metric2 > all(array[-1, -2, -3]) /* all */; + +-- Also have to support null array elements, because they are impossible to +-- prevent in stable expressions. +set timescaledb.debug_require_vector_qual to 'only'; +select count(*) from vectorqual where metric2 = any(array[null::int]) /* any with null element */; +select count(*) from vectorqual where metric2 = any(array[22, null]) /* any with null element */; +select count(*) from vectorqual where metric2 = any(array[null, 32]) /* any with null element */; +select count(*) from vectorqual where metric2 = any(array[22, null, 32]) /* any with null element */; +select count(*) from vectorqual where metric2 = all(array[null::int]) /* all with null element */; +select count(*) from vectorqual where metric2 = all(array[22, null]) /* all with null element */; +select count(*) from vectorqual where metric2 = all(array[null, 32]) /* all with null element */; +select count(*) from vectorqual where metric2 = all(array[22, null, 32]) /* all with null element */; + +-- Check early exit. +reset timescaledb.debug_require_vector_qual; +create table singlebatch(like vectorqual); +select create_hypertable('singlebatch', 'ts'); +alter table singlebatch set (timescaledb.compress); +insert into singlebatch select '2022-02-02 02:02:02', metric2, device, metric3, metric4, tag from vectorqual; +select count(compress_chunk(x, true)) from show_chunks('singlebatch') x; + +set timescaledb.debug_require_vector_qual to 'only'; +-- Uncomment to generate the test reference w/o the vector optimizations. 
+-- set timescaledb.enable_bulk_decompression to off; +-- set timescaledb.debug_require_vector_qual to 'forbid'; + +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 22]); +select count(*) from singlebatch where metric2 = any(array[0, 22, 0, 0, 0]); +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 0]); +select count(*) from singlebatch where metric2 != any(array[0, 0, 0, 0, 0]); +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 0]); +select count(*) from singlebatch where metric2 <= all(array[12, 0, 12, 12, 12]); +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 12]); + +select count(*) from singlebatch where metric3 = 777 and metric2 = any(array[0, 0, 0, 0, 22]); +select count(*) from singlebatch where metric3 = 777 and metric2 = any(array[0, 22, 0, 0, 0]); +select count(*) from singlebatch where metric3 = 777 and metric2 = any(array[0, 0, 0, 0, 0]); +select count(*) from singlebatch where metric3 = 777 and metric2 != any(array[0, 0, 0, 0, 0]); +select count(*) from singlebatch where metric3 = 777 and metric2 <= all(array[12, 12, 12, 12, 0]); +select count(*) from singlebatch where metric3 = 777 and metric2 <= all(array[12, 0, 12, 12, 12]); +select count(*) from singlebatch where metric3 = 777 and metric2 <= all(array[12, 12, 12, 12, 12]); + +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 22]) and metric3 = 777; +select count(*) from singlebatch where metric2 = any(array[0, 22, 0, 0, 0]) and metric3 = 777; +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 0]) and metric3 = 777; +select count(*) from singlebatch where metric2 != any(array[0, 0, 0, 0, 0]) and metric3 = 777; +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 0]) and metric3 = 777; +select count(*) from singlebatch where metric2 <= all(array[12, 0, 12, 12, 12]) and metric3 = 777; +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 12]) and metric3 = 777; + +select count(*) from singlebatch where metric3 != 777 and metric2 = any(array[0, 0, 0, 0, 22]); +select count(*) from singlebatch where metric3 != 777 and metric2 = any(array[0, 22, 0, 0, 0]); +select count(*) from singlebatch where metric3 != 777 and metric2 = any(array[0, 0, 0, 0, 0]); +select count(*) from singlebatch where metric3 != 777 and metric2 != any(array[0, 0, 0, 0, 0]); +select count(*) from singlebatch where metric3 != 777 and metric2 <= all(array[12, 12, 12, 12, 0]); +select count(*) from singlebatch where metric3 != 777 and metric2 <= all(array[12, 0, 12, 12, 12]); +select count(*) from singlebatch where metric3 != 777 and metric2 <= all(array[12, 12, 12, 12, 12]); + +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 22]) and metric3 != 777; +select count(*) from singlebatch where metric2 = any(array[0, 22, 0, 0, 0]) and metric3 != 777; +select count(*) from singlebatch where metric2 = any(array[0, 0, 0, 0, 0]) and metric3 != 777; +select count(*) from singlebatch where metric2 != any(array[0, 0, 0, 0, 0]) and metric3 != 777; +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 0]) and metric3 != 777; +select count(*) from singlebatch where metric2 <= all(array[12, 0, 12, 12, 12]) and metric3 != 777; +select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 12]) and metric3 != 777; + +reset timescaledb.enable_bulk_decompression; +reset timescaledb.debug_require_vector_qual; + + +-- Comparison with other 
column not vectorized. +set timescaledb.debug_require_vector_qual to 'forbid'; +select count(*) from vectorqual where metric3 = metric4; +select count(*) from vectorqual where metric3 = any(array[metric4]); + + -- Vectorized filters also work if we have only stable functions on the right -- side that can be evaluated to a constant at run time. set timescaledb.debug_require_vector_qual to 'only';
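
Not part of the patch: below is a standalone, compilable sketch of the bitmap-combining idea described in the commit message and implemented in pred_vector_array.c -- evaluate the vectorized scalar predicate once per array element, then OR the per-element result bitmaps for ANY or AND them for ALL, and finally mask off the past-the-end bits of the last word. The row data, the toy eq_predicate() and the fixed sizes are illustrative assumptions, not the TimescaleDB API.

/*
 * Standalone sketch of the ANY/ALL bitmap combination: run a vectorized
 * scalar predicate once per array element and OR (ANY) or AND (ALL) the
 * per-element bitmaps into the combined result.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define N_ROWS 5
#define N_WORDS ((N_ROWS + 63) / 64)

/* Toy stand-in for a vectorized "Var = Const" predicate over a decompressed column. */
static void
eq_predicate(const int64_t *rows, size_t n, int64_t constant, uint64_t *restrict result)
{
	for (size_t i = 0; i < n; i++)
	{
		if (rows[i] != constant)
			result[i / 64] &= ~(UINT64_C(1) << (i % 64));
	}
}

int
main(void)
{
	const int64_t rows[N_ROWS] = { 11, 22, 33, 22, 55 };
	const int64_t array_elements[] = { 22, 55 }; /* e.g. metric = ANY(ARRAY[22, 55]) */
	const size_t n_elements = sizeof(array_elements) / sizeof(array_elements[0]);
	const bool is_or = true; /* ANY combines with OR, ALL combines with AND. */

	/* For OR start from all-zeros, for AND from all-ones. */
	uint64_t combined[N_WORDS];
	for (size_t w = 0; w < N_WORDS; w++)
		combined[w] = is_or ? 0 : UINT64_MAX;

	for (size_t e = 0; e < n_elements; e++)
	{
		/* Each element starts from an all-ones bitmap, as if it were a lone filter. */
		uint64_t single[N_WORDS];
		for (size_t w = 0; w < N_WORDS; w++)
			single[w] = UINT64_MAX;

		eq_predicate(rows, N_ROWS, array_elements[e], single);

		for (size_t w = 0; w < N_WORDS; w++)
			combined[w] = is_or ? (combined[w] | single[w]) : (combined[w] & single[w]);
	}

	/* Zero the bits for past-the-end rows in the last bitmap word. */
	if (N_ROWS % 64 != 0)
		combined[N_ROWS / 64] &= UINT64_MAX >> (64 - N_ROWS % 64);

	for (size_t i = 0; i < N_ROWS; i++)
		printf("row %zu passes: %d\n", i, (int) ((combined[i / 64] >> (i % 64)) & 1));

	return 0;
}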