apache · walterddr · Nov 15, 2022 · Nov 7, 2022 · Nov 8, 2022 · Nov 15, 2022
diff --git a/...ation-tests/src/test/java/org/apache/pinot/integration/tests/SSBQueryIntegrationTest.java b/...ation-tests/src/test/java/org/apache/pinot/integration/tests/SSBQueryIntegrationTest.java
@@ -44,7 +44,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.testng.Assert;
-import org.testng.SkipException;
 import org.testng.annotations.AfterClass;
 import org.testng.annotations.BeforeClass;
 import org.testng.annotations.DataProvider;
@@ -54,7 +53,6 @@
 
 public class SSBQueryIntegrationTest extends BaseClusterIntegrationTest {
   private static final Logger LOGGER = LoggerFactory.getLogger(SSBQueryIntegrationTest.class);
-  private static final int MIN_AVAILABLE_CORE_REQUIREMENT = 4;
   private static final Map<String, String> SSB_QUICKSTART_TABLE_RESOURCES = ImmutableMap.of(
       "customer", "examples/batch/ssb/customer",
       "dates", "examples/batch/ssb/dates",
@@ -66,10 +64,6 @@ public class SSBQueryIntegrationTest extends BaseClusterIntegrationTest {
   @BeforeClass
   public void setUp()
       throws Exception {
-    if (Runtime.getRuntime().availableProcessors() < MIN_AVAILABLE_CORE_REQUIREMENT) {
-      throw new SkipException("Skip SSB query testing. Insufficient core count: "
-          + Runtime.getRuntime().availableProcessors());
-    }
     TestUtils.ensureDirectoriesExistAndEmpty(_tempDir, _segmentDir, _tarDir);
 
     // Start the Pinot cluster

diff --git a/pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/QueryRunner.java b/pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/QueryRunner.java
@@ -50,11 +50,15 @@
 import org.apache.pinot.query.mailbox.MultiplexingMailboxService;
 import org.apache.pinot.query.planner.StageMetadata;
 import org.apache.pinot.query.planner.stage.MailboxSendNode;
+import org.apache.pinot.query.planner.stage.StageNode;
 import org.apache.pinot.query.runtime.blocks.TransferableBlock;
 import org.apache.pinot.query.runtime.blocks.TransferableBlockUtils;
-import org.apache.pinot.query.runtime.executor.WorkerQueryExecutor;
+import org.apache.pinot.query.runtime.executor.OpChainSchedulerService;
 import org.apache.pinot.query.runtime.operator.MailboxSendOperator;
+import org.apache.pinot.query.runtime.operator.OpChain;
 import org.apache.pinot.query.runtime.plan.DistributedStagePlan;
+import org.apache.pinot.query.runtime.plan.PhysicalPlanVisitor;
+import org.apache.pinot.query.runtime.plan.PlanRequestContext;
 import org.apache.pinot.query.runtime.plan.ServerRequestPlanVisitor;
 import org.apache.pinot.query.runtime.plan.server.ServerPlanRequestContext;
 import org.apache.pinot.query.service.QueryConfig;
@@ -76,7 +80,6 @@ public class QueryRunner {
   private static final Logger LOGGER = LoggerFactory.getLogger(QueryRunner.class);
   // This is a temporary before merging the 2 type of executor.
   private ServerQueryExecutorV1Impl _serverExecutor;
-  private WorkerQueryExecutor _workerExecutor;
   private HelixManager _helixManager;
   private ZkHelixPropertyStore<ZNRecord> _helixPropertyStore;
   private MailboxService<TransferableBlock> _mailboxService;
@@ -98,8 +101,6 @@ public void init(PinotConfiguration config, InstanceDataManager instanceDataMana
       _mailboxService = MultiplexingMailboxService.newInstance(_hostname, _port, config);
       _serverExecutor = new ServerQueryExecutorV1Impl();
       _serverExecutor.init(config, instanceDataManager, serverMetrics);
-      _workerExecutor = new WorkerQueryExecutor();
-      _workerExecutor.init(config, serverMetrics, _mailboxService, _hostname, _port);
     } catch (Exception e) {
       throw new RuntimeException(e);
     }
@@ -109,16 +110,14 @@ public void start() {
     _helixPropertyStore = _helixManager.getHelixPropertyStore();
     _mailboxService.start();
     _serverExecutor.start();
-    _workerExecutor.start();
   }
 
   public void shutDown() {
-    _workerExecutor.shutDown();
     _serverExecutor.shutDown();
     _mailboxService.shutdown();
   }
 
-  public void processQuery(DistributedStagePlan distributedStagePlan, ExecutorService executorService,
+  public void processQuery(DistributedStagePlan distributedStagePlan, OpChainSchedulerService scheduler,
       Map<String, String> requestMetadataMap) {
     if (isLeafStage(distributedStagePlan)) {
       // TODO: make server query request return via mailbox, this is a hack to gather the non-streaming data table
@@ -132,7 +131,7 @@ public void processQuery(DistributedStagePlan distributedStagePlan, ExecutorServ
       for (ServerPlanRequestContext requestContext : serverQueryRequests) {
         ServerQueryRequest request = new ServerQueryRequest(requestContext.getInstanceRequest(),
             new ServerMetrics(PinotMetricUtils.getPinotMetricsRegistry()), System.currentTimeMillis());
-        serverQueryResults.add(processServerQuery(request, executorService));
+        serverQueryResults.add(processServerQuery(request, scheduler.getWorkerPool()));
       }
 
       MailboxSendNode sendNode = (MailboxSendNode) distributedStagePlan.getStageRoot();
@@ -148,7 +147,11 @@ public void processQuery(DistributedStagePlan distributedStagePlan, ExecutorServ
         LOGGER.debug("Acquired transferable block: {}", blockCounter++);
       }
     } else {
-      _workerExecutor.processQuery(distributedStagePlan, requestMetadataMap, executorService);
+      long requestId = Long.parseLong(requestMetadataMap.get("REQUEST_ID"));
+      StageNode stageRoot = distributedStagePlan.getStageRoot();
+      OpChain rootOperator = PhysicalPlanVisitor.build(stageRoot, new PlanRequestContext(
+          _mailboxService, requestId, stageRoot.getStageId(), _hostname, _port, distributedStagePlan.getMetadataMap()));
+      scheduler.register(rootOperator);
     }
   }
 

diff --git a/...query-runtime/src/main/java/org/apache/pinot/query/runtime/executor/OpChainScheduler.java b/...query-runtime/src/main/java/org/apache/pinot/query/runtime/executor/OpChainScheduler.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.executor;
+
+import org.apache.pinot.query.mailbox.MailboxIdentifier;
+import org.apache.pinot.query.runtime.operator.OpChain;
+
+
+/**
+ * An interface that defines different scheduling strategies to work with the
+ * {@link OpChainSchedulerService}: the service registers operator chains with the
+ * scheduler and asks it which chain to run next.
+ *
+ * <p>Implementations do not need to be thread safe. The {@link OpChainSchedulerService}
+ * serializes its calls into the scheduler through its monitor, so these methods are
+ * never called concurrently by the service - therefore all implementations may use
+ * data structures that are not concurrent.
+ */
+public interface OpChainScheduler {
+
+  /**
+   * Makes an operator chain available for scheduling. The {@link OpChainSchedulerService}
+   * calls this both for chains that are submitted to it and for chains that yielded
+   * before completing and need to be scheduled again.
+   *
+   * @param operatorChain the operator chain to register
+   */
+  void register(OpChain operatorChain);
+
+  /**
+   * This method is called whenever {@code mailbox} has new data available to consume.
+   * This can be useful for advanced scheduling algorithms; simple strategies may
+   * ignore it.
+   *
+   * @param mailbox the mailbox ID
+   */
+  void onDataAvailable(MailboxIdentifier mailbox);
+
+  /**
+   * @return whether or not there is any work for the scheduler to do, i.e. whether
+   *         {@link #next()} may be called
+   */
+  boolean hasNext();
+
+  /**
+   * @return the next operator chain to process
+   * @throws java.util.NoSuchElementException if {@link #hasNext()} returns false
+   *         prior to this call
+   */
+  OpChain next();
+}
diff --git a/...untime/src/main/java/org/apache/pinot/query/runtime/executor/OpChainSchedulerService.java b/...untime/src/main/java/org/apache/pinot/query/runtime/executor/OpChainSchedulerService.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.executor;
+
+import com.google.common.util.concurrent.AbstractExecutionThreadService;
+import com.google.common.util.concurrent.Monitor;
+import java.util.concurrent.ExecutorService;
+import org.apache.pinot.common.request.context.ThreadTimer;
+import org.apache.pinot.core.util.trace.TraceRunnable;
+import org.apache.pinot.query.mailbox.MailboxIdentifier;
+import org.apache.pinot.query.runtime.blocks.TransferableBlock;
+import org.apache.pinot.query.runtime.operator.OpChain;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * This class provides the implementation for scheduling multistage queries on a single node based
+ * on the {@link OpChainScheduler} logic that is passed in. Multistage queries support partial execution
+ * and will return a NOOP metadata block as a "yield" signal, indicating that the next operator
+ * chain ({@link OpChainScheduler#next()}) will be requested.
+ *
+ * <p>Note that a yielded operator chain will be re-registered with the underlying scheduler.
+ *
+ * <p>All access to the underlying {@link OpChainScheduler} goes through a single {@link Monitor},
+ * so the scheduler itself is never called concurrently by this service. The operator chains are
+ * executed on the worker pool, outside of the monitor.
+ */
+@SuppressWarnings("UnstableApiUsage")
+public class OpChainSchedulerService extends AbstractExecutionThreadService {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(OpChainSchedulerService.class);
+
+  private final OpChainScheduler _scheduler;
+  private final ExecutorService _workerPool;
+
+  // anything that is guarded by this monitor should be non-blocking
+  private final Monitor _monitor = new Monitor();
+
+  // satisfied when there is a chain to hand out, or when the service is no longer running
+  // (so that the scheduling loop in run() can wake up and exit)
+  protected final Monitor.Guard _hasNextOrClosing = new Monitor.Guard(_monitor) {
+    @Override
+    public boolean isSatisfied() {
+      return _scheduler.hasNext() || !isRunning();
+    }
+  };
+
+  /**
+   * @param scheduler the strategy that decides which registered operator chain runs next
+   * @param workerPool the pool on which operator chains are executed
+   */
+  public OpChainSchedulerService(OpChainScheduler scheduler, ExecutorService workerPool) {
+    _scheduler = scheduler;
+    _workerPool = workerPool;
+  }
+
+  @Override
+  protected void triggerShutdown() {
+    // this will just notify all waiters that the scheduler is shutting down
+    _monitor.enter();
+    _monitor.leave();
+  }
+
+  /**
+   * The scheduling loop: waits until the scheduler has an operator chain available (or the
+   * service is shutting down), takes the next chain and submits it to the worker pool.
+   */
+  @Override
+  protected void run()
+      throws Exception {
+    while (isRunning()) {
+      _monitor.enterWhen(_hasNextOrClosing);
+      try {
+        if (!isRunning()) {
+          // the guard was satisfied because the service is closing, not because there is work
+          return;
+        }
+
+        OpChain operatorChain = _scheduler.next();
+        // only the submission happens under the monitor; the chain itself runs on the worker pool
+        _workerPool.submit(new TraceRunnable() {
+          @Override
+          public void runJob() {
+            executeUntilYield(operatorChain);
+          }
+        });
+      } finally {
+        _monitor.leave();
+      }
+    }
+  }
+
+  /**
+   * Pulls blocks from the root operator of {@code operatorChain} until it either yields
+   * (returns a NOOP block) or completes (returns an end-of-stream block). A chain that yielded
+   * is re-registered for scheduling; a chain that failed with an exception is logged and
+   * not re-registered.
+   *
+   * @param operatorChain the chain to execute
+   */
+  private void executeUntilYield(OpChain operatorChain) {
+    try {
+      ThreadTimer timer = operatorChain.getAndStartTimer();
+
+      // so long as there's work to be done, keep getting the next block
+      // when the operator chain returns a NOOP block, then yield the execution
+      // of this to another worker
+      TransferableBlock result = operatorChain.getRoot().nextBlock();
+      while (!result.isNoOpBlock() && !result.isEndOfStreamBlock()) {
+        LOGGER.debug("Got block with {} rows.", result.getNumRows());
+        result = operatorChain.getRoot().nextBlock();
+      }
+
+      if (result.isEndOfStreamBlock()) {
+        LOGGER.info("Execution time: {}", timer.getThreadTimeNs());
+      } else {
+        // not complete, needs to re-register for scheduling
+        register(operatorChain);
+      }
+    } catch (Exception e) {
+      LOGGER.error("Failed to execute query!", e);
+    }
+  }
+
+  /**
+   * Register a new operator chain with the scheduler.
+   *
+   * @param operatorChain the chain to register
+   */
+  public final void register(OpChain operatorChain) {
+    _monitor.enter();
+    try {
+      _scheduler.register(operatorChain);
+    } finally {
+      _monitor.leave();
+    }
+  }
+
+  /**
+   * This method should be called whenever data is available in a given mailbox.
+   * Implementations of this method should be idempotent, it may be called in the
+   * scenario that no mail is available.
+   *
+   * @param mailbox the identifier of the mailbox that now has data
+   */
+  public final void onDataAvailable(MailboxIdentifier mailbox) {
+    _monitor.enter();
+    try {
+      _scheduler.onDataAvailable(mailbox);
+    } finally {
+      _monitor.leave();
+    }
+  }
+
+  // TODO: remove this method after we pipe down the proper executor pool to the v1 engine
+  public ExecutorService getWorkerPool() {
+    return _workerPool;
+  }
+}
diff --git a/...ry-runtime/src/main/java/org/apache/pinot/query/runtime/executor/RoundRobinScheduler.java b/...ry-runtime/src/main/java/org/apache/pinot/query/runtime/executor/RoundRobinScheduler.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.executor;
+
+import java.util.LinkedList;
+import java.util.Queue;
+import org.apache.pinot.query.mailbox.MailboxIdentifier;
+import org.apache.pinot.query.runtime.operator.OpChain;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * An {@link OpChainScheduler} that hands out operator chains in the order in which
+ * they were registered. A chain that is registered again is appended to the end of
+ * the queue, behind all chains that are already waiting.
+ *
+ * <p>This class is backed by a non-concurrent queue and performs no synchronization
+ * of its own; callers must not invoke its methods concurrently.
+ */
+public class RoundRobinScheduler implements OpChainScheduler {
+  private static final Logger LOGGER = LoggerFactory.getLogger(RoundRobinScheduler.class);
+
+  // chains waiting to be scheduled, oldest registration first
+  private final Queue<OpChain> _opChainQueue = new LinkedList<>();
+
+  /**
+   * Appends the operator chain to the end of the queue.
+   *
+   * @param operatorChain the operator chain to register
+   */
+  @Override
+  public void register(OpChain operatorChain) {
+    _opChainQueue.add(operatorChain);
+  }
+
+  @Override
+  public void onDataAvailable(MailboxIdentifier mailbox) {
+    // do nothing - this doesn't change order of execution
+  }
+
+  /**
+   * @return true if at least one operator chain is waiting in the queue
+   */
+  @Override
+  public boolean hasNext() {
+    return !_opChainQueue.isEmpty();
+  }
+
+  /**
+   * Removes and returns the operator chain at the head of the queue.
+   *
+   * @return the operator chain that has been waiting the longest
+   * @throws java.util.NoSuchElementException if no operator chain is registered
+   */
+  @Override
+  public OpChain next() {
+    // remove() instead of poll(): poll() would silently return null on an empty queue,
+    // whereas the OpChainScheduler contract requires a NoSuchElementException
+    return _opChainQueue.remove();
+  }
+}