Skip to content
/ git Public
forked from git/git

Commit fe21c6b

Browse files
dschogitster
authored andcommitted
mingw: reencode environment variables on the fly (UTF-16 <-> UTF-8)
On Windows, the authoritative environment is encoded in UTF-16. In Git for Windows, we convert that to UTF-8 (because UTF-16 is *such* a foreign idea to Git that its source code is unprepared for it). Previously, out of performance concerns, we converted the entire environment to UTF-8 in one fell swoop at the beginning, and upon putenv() and run_command() converted it back. Having a private copy of the environment comes with its own perils: when a library used by Git's source code tries to modify the environment, it does not really work (in Git for Windows' case, libcurl, see https://github.com/git-for-windows/git/compare/bcad1e6d58^...bcad1e6d58^2 for a glimpse of the issues). Hence, it makes our environment handling substantially more robust if we switch to on-the-fly-conversion in `getenv()`/`putenv()` calls. Based on an initial version in the MSVC context by Jeff Hostetler, this patch makes it so. Surprisingly, this has a *positive* effect on speed: at the time when the current code was written, we tested the performance, and there were *so many* `getenv()` calls that it seemed better to convert everything in one go. In the meantime, though, Git has obviously been cleaned up a bit with regards to `getenv()` calls so that the Git processes spawned by the test suite use an average of only 40 `getenv()`/`putenv()` calls over the process lifetime. Speaking of the entire test suite: the total time spent in the re-encoding in the current code takes about 32.4 seconds (out of 113 minutes runtime), whereas the code introduced in this patch takes only about 8.2 seconds in total. Not much, but it proves that we need not be concerned about the performance impact introduced by this patch. Helped-by: Jeff Hostetler <[email protected]> Signed-off-by: Johannes Schindelin <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 665177e commit fe21c6b

File tree

2 files changed

+196
-116
lines changed

2 files changed

+196
-116
lines changed

compat/mingw.c

+168-112
Original file line numberDiff line numberDiff line change
@@ -1040,44 +1040,121 @@ static char *path_lookup(const char *cmd, int exe_only)
10401040
return prog;
10411041
}
10421042

1043-
static int do_putenv(char **env, const char *name, int size, int free_old);
1043+
static const wchar_t *wcschrnul(const wchar_t *s, wchar_t c)
1044+
{
1045+
while (*s && *s != c)
1046+
s++;
1047+
return s;
1048+
}
1049+
1050+
/* Compare only keys */
1051+
static int wenvcmp(const void *a, const void *b)
1052+
{
1053+
wchar_t *p = *(wchar_t **)a, *q = *(wchar_t **)b;
1054+
size_t p_len, q_len;
1055+
1056+
/* Find the keys */
1057+
p_len = wcschrnul(p, L'=') - p;
1058+
q_len = wcschrnul(q, L'=') - q;
1059+
1060+
/* If the length differs, include the shorter key's NUL */
1061+
if (p_len < q_len)
1062+
p_len++;
1063+
else if (p_len > q_len)
1064+
p_len = q_len + 1;
1065+
1066+
return _wcsnicmp(p, q, p_len);
1067+
}
10441068

1045-
/* used number of elements of environ array, including terminating NULL */
1046-
static int environ_size = 0;
1047-
/* allocated size of environ array, in bytes */
1048-
static int environ_alloc = 0;
1069+
/* We need a stable sort to convert the environment between UTF-16 <-> UTF-8 */
1070+
#ifndef INTERNAL_QSORT
1071+
#include "qsort.c"
1072+
#endif
10491073

10501074
/*
1051-
* Create environment block suitable for CreateProcess. Merges current
1052-
* process environment and the supplied environment changes.
1075+
* Build an environment block combining the inherited environment
1076+
* merged with the given list of settings.
1077+
*
1078+
* Values of the form "KEY=VALUE" in deltaenv override inherited values.
1079+
* Values of the form "KEY" in deltaenv delete inherited values.
1080+
*
1081+
* Multiple entries in deltaenv for the same key are explicitly allowed.
1082+
*
1083+
* We return a contiguous block of UNICODE strings with a final trailing
1084+
* zero word.
10531085
*/
10541086
static wchar_t *make_environment_block(char **deltaenv)
10551087
{
1056-
wchar_t *wenvblk = NULL;
1057-
char **tmpenv;
1058-
int i = 0, size = environ_size, wenvsz = 0, wenvpos = 0;
1088+
wchar_t *wenv = GetEnvironmentStringsW(), *wdeltaenv, *result, *p;
1089+
size_t wlen, s, delta_size, size;
1090+
1091+
wchar_t **array = NULL;
1092+
size_t alloc = 0, nr = 0, i;
1093+
1094+
size = 1; /* for extra NUL at the end */
1095+
1096+
/* If there is no deltaenv to apply, simply return a copy. */
1097+
if (!deltaenv || !*deltaenv) {
1098+
for (p = wenv; p && *p; ) {
1099+
size_t s = wcslen(p) + 1;
1100+
size += s;
1101+
p += s;
1102+
}
1103+
1104+
ALLOC_ARRAY(result, size);
1105+
memcpy(result, wenv, size * sizeof(*wenv));
1106+
FreeEnvironmentStringsW(wenv);
1107+
return result;
1108+
}
1109+
1110+
/*
1111+
* If there is a deltaenv, let's accumulate all keys into `array`,
1112+
* sort them using the stable git_qsort() and then copy, skipping
1113+
* duplicate keys
1114+
*/
1115+
for (p = wenv; p && *p; ) {
1116+
ALLOC_GROW(array, nr + 1, alloc);
1117+
s = wcslen(p) + 1;
1118+
array[nr++] = p;
1119+
p += s;
1120+
size += s;
1121+
}
1122+
1123+
/* (over-)assess size needed for wchar version of deltaenv */
1124+
for (delta_size = 0, i = 0; deltaenv[i]; i++)
1125+
delta_size += strlen(deltaenv[i]) * 2 + 1;
1126+
ALLOC_ARRAY(wdeltaenv, delta_size);
1127+
1128+
/* convert the deltaenv, appending to array */
1129+
for (i = 0, p = wdeltaenv; deltaenv[i]; i++) {
1130+
ALLOC_GROW(array, nr + 1, alloc);
1131+
wlen = xutftowcs(p, deltaenv[i], wdeltaenv + delta_size - p);
1132+
array[nr++] = p;
1133+
p += wlen + 1;
1134+
}
10591135

1060-
while (deltaenv && deltaenv[i])
1061-
i++;
1136+
git_qsort(array, nr, sizeof(*array), wenvcmp);
1137+
ALLOC_ARRAY(result, size + delta_size);
10621138

1063-
/* copy the environment, leaving space for changes */
1064-
ALLOC_ARRAY(tmpenv, size + i);
1065-
memcpy(tmpenv, environ, size * sizeof(char*));
1139+
for (p = result, i = 0; i < nr; i++) {
1140+
/* Skip any duplicate keys; last one wins */
1141+
while (i + 1 < nr && !wenvcmp(array + i, array + i + 1))
1142+
i++;
10661143

1067-
/* merge supplied environment changes into the temporary environment */
1068-
for (i = 0; deltaenv && deltaenv[i]; i++)
1069-
size = do_putenv(tmpenv, deltaenv[i], size, 0);
1144+
/* Skip "to delete" entry */
1145+
if (!wcschr(array[i], L'='))
1146+
continue;
10701147

1071-
/* create environment block from temporary environment */
1072-
for (i = 0; tmpenv[i]; i++) {
1073-
size = 2 * strlen(tmpenv[i]) + 2; /* +2 for final \0 */
1074-
ALLOC_GROW(wenvblk, (wenvpos + size) * sizeof(wchar_t), wenvsz);
1075-
wenvpos += xutftowcs(&wenvblk[wenvpos], tmpenv[i], size) + 1;
1148+
size = wcslen(array[i]) + 1;
1149+
memcpy(p, array[i], size * sizeof(*p));
1150+
p += size;
10761151
}
1077-
/* add final \0 terminator */
1078-
wenvblk[wenvpos] = 0;
1079-
free(tmpenv);
1080-
return wenvblk;
1152+
*p = L'\0';
1153+
1154+
free(array);
1155+
free(wdeltaenv);
1156+
FreeEnvironmentStringsW(wenv);
1157+
return result;
10811158
}
10821159

10831160
struct pinfo_t {
@@ -1319,87 +1396,83 @@ int mingw_kill(pid_t pid, int sig)
13191396
}
13201397

13211398
/*
1322-
* Compare environment entries by key (i.e. stopping at '=' or '\0').
1399+
* UTF-8 versions of getenv(), putenv() and unsetenv().
1400+
* Internally, they use the CRT's stock UNICODE routines
1401+
* to avoid data loss.
13231402
*/
1324-
static int compareenv(const void *v1, const void *v2)
1403+
char *mingw_getenv(const char *name)
13251404
{
1326-
const char *e1 = *(const char**)v1;
1327-
const char *e2 = *(const char**)v2;
1405+
#define GETENV_MAX_RETAIN 30
1406+
static char *values[GETENV_MAX_RETAIN];
1407+
static int value_counter;
1408+
int len_key, len_value;
1409+
wchar_t *w_key;
1410+
char *value;
1411+
wchar_t w_value[32768];
13281412

1329-
for (;;) {
1330-
int c1 = *e1++;
1331-
int c2 = *e2++;
1332-
c1 = (c1 == '=') ? 0 : tolower(c1);
1333-
c2 = (c2 == '=') ? 0 : tolower(c2);
1334-
if (c1 > c2)
1335-
return 1;
1336-
if (c1 < c2)
1337-
return -1;
1338-
if (c1 == 0)
1339-
return 0;
1340-
}
1341-
}
1413+
if (!name || !*name)
1414+
return NULL;
13421415

1343-
static int bsearchenv(char **env, const char *name, size_t size)
1344-
{
1345-
unsigned low = 0, high = size;
1346-
while (low < high) {
1347-
unsigned mid = low + ((high - low) >> 1);
1348-
int cmp = compareenv(&env[mid], &name);
1349-
if (cmp < 0)
1350-
low = mid + 1;
1351-
else if (cmp > 0)
1352-
high = mid;
1353-
else
1354-
return mid;
1416+
len_key = strlen(name) + 1;
1417+
/* We cannot use xcalloc() here because that uses getenv() itself */
1418+
w_key = calloc(len_key, sizeof(wchar_t));
1419+
if (!w_key)
1420+
die("Out of memory, (tried to allocate %u wchar_t's)", len_key);
1421+
xutftowcs(w_key, name, len_key);
1422+
len_value = GetEnvironmentVariableW(w_key, w_value, ARRAY_SIZE(w_value));
1423+
if (!len_value && GetLastError() == ERROR_ENVVAR_NOT_FOUND) {
1424+
free(w_key);
1425+
return NULL;
13551426
}
1356-
return ~low; /* not found, return 1's complement of insert position */
1427+
free(w_key);
1428+
1429+
len_value = len_value * 3 + 1;
1430+
/* We cannot use xcalloc() here because that uses getenv() itself */
1431+
value = calloc(len_value, sizeof(char));
1432+
if (!value)
1433+
die("Out of memory, (tried to allocate %u bytes)", len_value);
1434+
xwcstoutf(value, w_value, len_value);
1435+
1436+
/*
1437+
* We return `value` which is an allocated value and the caller is NOT
1438+
* expecting to have to free it, so we keep a round-robin array,
1439+
* invalidating the buffer after GETENV_MAX_RETAIN getenv() calls.
1440+
*/
1441+
free(values[value_counter]);
1442+
values[value_counter++] = value;
1443+
if (value_counter >= ARRAY_SIZE(values))
1444+
value_counter = 0;
1445+
1446+
return value;
13571447
}
13581448

1359-
/*
1360-
* If name contains '=', then sets the variable, otherwise it unsets it
1361-
* Size includes the terminating NULL. Env must have room for size + 1 entries
1362-
* (in case of insert). Returns the new size. Optionally frees removed entries.
1363-
*/
1364-
static int do_putenv(char **env, const char *name, int size, int free_old)
1449+
int mingw_putenv(const char *namevalue)
13651450
{
1366-
int i = bsearchenv(env, name, size - 1);
1451+
int size;
1452+
wchar_t *wide, *equal;
1453+
BOOL result;
13671454

1368-
/* optionally free removed / replaced entry */
1369-
if (i >= 0 && free_old)
1370-
free(env[i]);
1455+
if (!namevalue || !*namevalue)
1456+
return 0;
13711457

1372-
if (strchr(name, '=')) {
1373-
/* if new value ('key=value') is specified, insert or replace entry */
1374-
if (i < 0) {
1375-
i = ~i;
1376-
memmove(&env[i + 1], &env[i], (size - i) * sizeof(char*));
1377-
size++;
1378-
}
1379-
env[i] = (char*) name;
1380-
} else if (i >= 0) {
1381-
/* otherwise ('key') remove existing entry */
1382-
size--;
1383-
memmove(&env[i], &env[i + 1], (size - i) * sizeof(char*));
1458+
size = strlen(namevalue) * 2 + 1;
1459+
wide = calloc(size, sizeof(wchar_t));
1460+
if (!wide)
1461+
die("Out of memory, (tried to allocate %u wchar_t's)", size);
1462+
xutftowcs(wide, namevalue, size);
1463+
equal = wcschr(wide, L'=');
1464+
if (!equal)
1465+
result = SetEnvironmentVariableW(wide, NULL);
1466+
else {
1467+
*equal = L'\0';
1468+
result = SetEnvironmentVariableW(wide, equal + 1);
13841469
}
1385-
return size;
1386-
}
1470+
free(wide);
13871471

1388-
char *mingw_getenv(const char *name)
1389-
{
1390-
char *value;
1391-
int pos = bsearchenv(environ, name, environ_size - 1);
1392-
if (pos < 0)
1393-
return NULL;
1394-
value = strchr(environ[pos], '=');
1395-
return value ? &value[1] : NULL;
1396-
}
1472+
if (!result)
1473+
errno = err_win_to_posix(GetLastError());
13971474

1398-
int mingw_putenv(const char *namevalue)
1399-
{
1400-
ALLOC_GROW(environ, (environ_size + 1) * sizeof(char*), environ_alloc);
1401-
environ_size = do_putenv(environ, namevalue, environ_size, 1);
1402-
return 0;
1475+
return result ? 0 : -1;
14031476
}
14041477

14051478
/*
@@ -2261,17 +2334,6 @@ void mingw_startup(void)
22612334
maxlen = wcslen(wargv[0]);
22622335
for (i = 1; i < argc; i++)
22632336
maxlen = max(maxlen, wcslen(wargv[i]));
2264-
for (i = 0; wenv[i]; i++)
2265-
maxlen = max(maxlen, wcslen(wenv[i]));
2266-
2267-
/*
2268-
* nedmalloc can't free CRT memory, allocate resizable environment
2269-
* list. Note that xmalloc / xmemdupz etc. call getenv, so we cannot
2270-
* use it while initializing the environment itself.
2271-
*/
2272-
environ_size = i + 1;
2273-
environ_alloc = alloc_nr(environ_size * sizeof(char*));
2274-
environ = malloc_startup(environ_alloc);
22752337

22762338
/* allocate buffer (wchar_t encodes to max 3 UTF-8 bytes) */
22772339
maxlen = 3 * maxlen + 1;
@@ -2280,14 +2342,8 @@ void mingw_startup(void)
22802342
/* convert command line arguments and environment to UTF-8 */
22812343
for (i = 0; i < argc; i++)
22822344
__argv[i] = wcstoutfdup_startup(buffer, wargv[i], maxlen);
2283-
for (i = 0; wenv[i]; i++)
2284-
environ[i] = wcstoutfdup_startup(buffer, wenv[i], maxlen);
2285-
environ[i] = NULL;
22862345
free(buffer);
22872346

2288-
/* sort environment for O(log n) getenv / putenv */
2289-
qsort(environ, i, sizeof(char*), compareenv);
2290-
22912347
/* fix Windows specific environment settings */
22922348
setup_windows_environment();
22932349

compat/mingw.h

+28-4
Original file line numberDiff line numberDiff line change
@@ -257,11 +257,35 @@ char *mingw_mktemp(char *template);
257257
char *mingw_getcwd(char *pointer, int len);
258258
#define getcwd mingw_getcwd
259259

260+
#ifdef NO_UNSETENV
261+
#error "NO_UNSETENV is incompatible with the Windows-specific startup code!"
262+
#endif
263+
264+
/*
265+
* We bind *env() routines (even the mingw_ ones) to private mingw_ versions.
266+
* These talk to the CRT using UNICODE/wchar_t, but maintain the original
267+
* narrow-char API.
268+
*
269+
* Note that the MSCRT maintains both ANSI (getenv()) and UNICODE (_wgetenv())
270+
* routines and stores both versions of each environment variable in parallel
271+
* (and secretly updates both when you set one or the other), but it uses CP_ACP
272+
* to do the conversion rather than CP_UTF8.
273+
*
274+
* Since everything in the git code base is UTF8, we define the mingw_ routines
275+
* to access the CRT using the UNICODE routines and manually convert them to
276+
* UTF8. This also avoids round-trip problems.
277+
*
278+
* This also helps with our linkage, since "_wenviron" is publicly exported
279+
* from the CRT. But to access "_environ" we would have to statically link
280+
* to the CRT (/MT).
281+
*
282+
* We require NO_SETENV (and let gitsetenv() call our mingw_putenv).
283+
*/
284+
#define getenv mingw_getenv
285+
#define putenv mingw_putenv
286+
#define unsetenv mingw_putenv
260287
char *mingw_getenv(const char *name);
261-
#define getenv mingw_getenv
262-
int mingw_putenv(const char *namevalue);
263-
#define putenv mingw_putenv
264-
#define unsetenv mingw_putenv
288+
int mingw_putenv(const char *name);
265289

266290
int mingw_gethostname(char *host, int namelen);
267291
#define gethostname mingw_gethostname

0 commit comments

Comments
 (0)