exec: Find ARG_MAX with binary search after E2BIG

tavianator · tavianator · commit ac1d28ea6bf9 · 2021-11-17T14:49:04.000-05:00
Previously we would shrink the command by one argument at a time until a
successful execution.  This is okay if the ARG_MAX estimate is just a
little bit off, but is terribly slow when it's off by a lot.

One situation where it's very far off is when a 32-bit build of bfs
launches a 64-bit binary.  In this case, bfs thinks the argv pointers
are 4 bytes, while they actually take up 8 bytes.  The performance is
quite bad:

    $ time ./bfs-64 ~/code/linux -exec echo {} + &gt;/dev/null
    ./bfs-64 ~/code/linux -exec echo {} + &gt; /dev/null  0.03s user 0.07s system 99% cpu 0.095 total
    $ time ./bfs-32 ~/code/linux -exec echo {} + &gt;/dev/null
    ./bfs-32 ~/code/linux -exec echo {} + &gt; /dev/null  0.08s user 10.33s system 100% cpu 10.390 total

After this change, performance is much better:

    $ time ./bfs-32 ~/code/linux -exec echo {} + &gt;/dev/null
    ./bfs-32 ~/code/linux -exec echo {} + &gt; /dev/null  0.03s user 0.08s system 99% cpu 0.110 total
diff --git a/exec.c b/exec.c
@@ -59,8 +59,6 @@ static void bfs_exec_debug(const struct bfs_exec *execbuf, const char *format, .
 	va_end(args);
 }
 
-extern char **environ;
-
 /** Determine the size of a single argument, for comparison to arg_max. */
 static size_t bfs_exec_arg_size(const char *arg) {
 	return sizeof(arg) + strlen(arg) + 1;
@@ -79,6 +77,7 @@ static size_t bfs_exec_arg_max(const struct bfs_exec *execbuf) {
 	}
 
 	// We have to share space with the environment variables
+	extern char **environ;
 	for (char **envp = environ; *envp; ++envp) {
 		arg_max -= bfs_exec_arg_size(*envp);
 	}
@@ -131,6 +130,7 @@ struct bfs_exec *bfs_exec_parse(const struct bfs_ctx *ctx, char **argv, enum bfs
 	execbuf->argv_cap = 0;
 	execbuf->arg_size = 0;
 	execbuf->arg_max = 0;
+	execbuf->arg_min = 0;
 	execbuf->wd_fd = -1;
 	execbuf->wd_path = NULL;
 	execbuf->wd_len = 0;
@@ -183,6 +183,7 @@ struct bfs_exec *bfs_exec_parse(const struct bfs_ctx *ctx, char **argv, enum bfs
 		execbuf->argc = execbuf->tmpl_argc - 1;
 
 		execbuf->arg_max = bfs_exec_arg_max(execbuf);
+		execbuf->arg_min = execbuf->arg_max;
 	}
 
 	return execbuf;
@@ -461,6 +462,54 @@ static bool bfs_exec_args_remain(const struct bfs_exec *execbuf) {
 	return execbuf->argc >= execbuf->tmpl_argc;
 }
 
+/** Compute the current ARG_MAX estimate for binary search. */
+static size_t bfs_exec_estimate_max(const struct bfs_exec *execbuf) {
+	size_t min = execbuf->arg_min;
+	size_t max = execbuf->arg_max;
+	return min + (max - min)/2;
+}
+
+/** Update the ARG_MAX lower bound from a successful execution. */
+static void bfs_exec_update_min(struct bfs_exec *execbuf) {
+	if (execbuf->arg_size > execbuf->arg_min) {
+		execbuf->arg_min = execbuf->arg_size;
+
+		// Don't let min exceed max
+		if (execbuf->arg_min > execbuf->arg_max) {
+			execbuf->arg_min = execbuf->arg_max;
+		}
+
+		size_t estimate = bfs_exec_estimate_max(execbuf);
+		bfs_exec_debug(execbuf, "ARG_MAX between [%zu, %zu], trying %zu\n",
+		               execbuf->arg_min, execbuf->arg_max, estimate);
+	}
+}
+
+/** Update the ARG_MAX upper bound from a failed execution. */
+static size_t bfs_exec_update_max(struct bfs_exec *execbuf) {
+	bfs_exec_debug(execbuf, "Got E2BIG, shrinking argument list...\n");
+
+	if (execbuf->arg_size < execbuf->arg_max) {
+		execbuf->arg_max = execbuf->arg_size;
+
+		// Don't let min exceed max
+		if (execbuf->arg_min > execbuf->arg_max) {
+			execbuf->arg_min = execbuf->arg_max;
+		}
+	}
+
+	if (execbuf->arg_size <= execbuf->arg_min) {
+		// Lower bound was wrong, restart binary search.
+		execbuf->arg_min = 0;
+	}
+
+	// Binary search for a more precise bound
+	size_t estimate = bfs_exec_estimate_max(execbuf);
+	bfs_exec_debug(execbuf, "ARG_MAX between [%zu, %zu], trying %zu\n",
+	               execbuf->arg_min, execbuf->arg_max, estimate);
+	return estimate;
+}
+
 /** Execute the pending command from a BFS_EXEC_MULTI execbuf. */
 static int bfs_exec_flush(struct bfs_exec *execbuf) {
 	int ret = 0, error = 0;
@@ -470,30 +519,31 @@ static int bfs_exec_flush(struct bfs_exec *execbuf) {
 		execbuf->argv[execbuf->argc] = NULL;
 		ret = bfs_exec_spawn(execbuf);
 		error = errno;
-		if (ret == 0 || error != E2BIG) {
+		if (ret == 0) {
+			bfs_exec_update_min(execbuf);
+			break;
+		} else if (error != E2BIG) {
 			break;
 		}
 
 		// Try to recover from E2BIG by trying fewer and fewer arguments
 		// until they fit
-		bfs_exec_debug(execbuf, "Got E2BIG, shrinking argument list...\n");
-		execbuf->argv[execbuf->argc] = execbuf->argv[execbuf->argc - 1];
-		execbuf->arg_size -= bfs_exec_arg_size(execbuf->argv[execbuf->argc]);
-		--execbuf->argc;
+		size_t new_max = bfs_exec_update_max(execbuf);
+		while (execbuf->arg_size > new_max) {
+			execbuf->argv[execbuf->argc] = execbuf->argv[execbuf->argc - 1];
+			execbuf->arg_size -= bfs_exec_arg_size(execbuf->argv[execbuf->argc]);
+			--execbuf->argc;
+		}
 	}
-	size_t new_argc = execbuf->argc;
-	size_t new_size = execbuf->arg_size;
 
+	size_t new_argc = execbuf->argc;
 	for (size_t i = execbuf->tmpl_argc - 1; i < new_argc; ++i) {
 		free(execbuf->argv[i]);
 	}
 	execbuf->argc = execbuf->tmpl_argc - 1;
 	execbuf->arg_size = 0;
 
 	if (new_argc < orig_argc) {
-		execbuf->arg_max = new_size;
-		bfs_exec_debug(execbuf, "ARG_MAX: %zu\n", execbuf->arg_max);
-
 		// If we recovered from E2BIG, there are unused arguments at the
 		// end of the list
 		for (size_t i = new_argc + 1; i <= orig_argc; ++i) {
@@ -526,10 +576,11 @@ static bool bfs_exec_changed_dirs(const struct bfs_exec *execbuf, const struct B
 
 /** Check if we need to flush the execbuf because we're too big. */
 static bool bfs_exec_would_overflow(const struct bfs_exec *execbuf, const char *arg) {
+	size_t arg_max = bfs_exec_estimate_max(execbuf);
 	size_t next_size = execbuf->arg_size + bfs_exec_arg_size(arg);
-	if (next_size > execbuf->arg_max) {
+	if (next_size > arg_max) {
 		bfs_exec_debug(execbuf, "Command size (%zu) would exceed maximum (%zu), executing buffered command\n",
-		               next_size, execbuf->arg_max);
+		               next_size, arg_max);
 		return true;
 	}
 
diff --git a/exec.h b/exec.h
@@ -63,6 +63,8 @@ struct bfs_exec {
 	size_t arg_size;
 	/** Maximum arg_size before E2BIG. */
 	size_t arg_max;
+	/** Lower bound for arg_max. */
+	size_t arg_min;
 
 	/** A file descriptor for the working directory, for BFS_EXEC_CHDIR. */
 	int wd_fd;