From b1142f816c837a786d8cf0d123a8cd3e06904e8b Mon Sep 17 00:00:00 2001 From: L Lllvvuu Date: Wed, 23 Aug 2023 03:54:05 -0700 Subject: [PATCH] feat: regex sampler This fixes #152 except in an edge case where both `pattern` and `maxLength` are used, and the sampler skips over the valid length range. It introduces a tiny dependency (<10 KB uncompressed) which doesn't really have any prominent competitors. --- package-lock.json | 52 +++++++++++++++++++++++- package.json | 3 +- src/samplers/string.js | 85 +++++++++++++++++++++++++++++++++++----- test/unit/string.spec.js | 16 ++++++++ 4 files changed, 145 insertions(+), 11 deletions(-) diff --git a/package-lock.json b/package-lock.json index 249528a..91024ee 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,7 +10,8 @@ "license": "MIT", "dependencies": { "@types/json-schema": "^7.0.7", - "json-pointer": "0.6.2" + "json-pointer": "0.6.2", + "randexp": "^0.5.3" }, "devDependencies": { "@babel/core": "^7.7.2", @@ -4633,6 +4634,14 @@ "integrity": "sha1-eufhqUgTRSnzdosvsT9Mj6kqV88=", "dev": true }, + "node_modules/drange": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/drange/-/drange-1.1.1.tgz", + "integrity": "sha512-pYxfDYpued//QpnLIm4Avk7rsNtAtQkUES2cwAYSvD/wd2pKD71gN2Ebj3e7klzXwjocvE8c5vx/1fxwpqmSxA==", + "engines": { + "node": ">=4" + } + }, "node_modules/duplexer2": { "version": "0.1.4", "resolved": "https://registry.npmjs.org/duplexer2/-/duplexer2-0.1.4.tgz", @@ -12311,6 +12320,26 @@ } ] }, + "node_modules/randexp": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/randexp/-/randexp-0.5.3.tgz", + "integrity": "sha512-U+5l2KrcMNOUPYvazA3h5ekF80FHTUG+87SEAmHZmolh1M+i/WyTCxVzmi+tidIa1tM4BSe8g2Y/D3loWDjj+w==", + "dependencies": { + "drange": "^1.0.2", + "ret": "^0.2.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/randexp/node_modules/ret": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/ret/-/ret-0.2.2.tgz", + "integrity": 
"sha512-M0b3YWQs7R3Z917WRQy1HHA7Ba7D8hvZg6UE5mLykJxQVE2ju0IXbGlaHPPlkY+WN7wFP+wUMXmBFA0aV6vYGQ==", + "engines": { + "node": ">=4" + } + }, "node_modules/randombytes": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", @@ -19518,6 +19547,11 @@ "integrity": "sha1-eufhqUgTRSnzdosvsT9Mj6kqV88=", "dev": true }, + "drange": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/drange/-/drange-1.1.1.tgz", + "integrity": "sha512-pYxfDYpued//QpnLIm4Avk7rsNtAtQkUES2cwAYSvD/wd2pKD71gN2Ebj3e7klzXwjocvE8c5vx/1fxwpqmSxA==" + }, "duplexer2": { "version": "0.1.4", "resolved": "https://registry.npmjs.org/duplexer2/-/duplexer2-0.1.4.tgz", @@ -25544,6 +25578,22 @@ "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", "dev": true }, + "randexp": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/randexp/-/randexp-0.5.3.tgz", + "integrity": "sha512-U+5l2KrcMNOUPYvazA3h5ekF80FHTUG+87SEAmHZmolh1M+i/WyTCxVzmi+tidIa1tM4BSe8g2Y/D3loWDjj+w==", + "requires": { + "drange": "^1.0.2", + "ret": "^0.2.0" + }, + "dependencies": { + "ret": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/ret/-/ret-0.2.2.tgz", + "integrity": "sha512-M0b3YWQs7R3Z917WRQy1HHA7Ba7D8hvZg6UE5mLykJxQVE2ju0IXbGlaHPPlkY+WN7wFP+wUMXmBFA0aV6vYGQ==" + } + } + }, "randombytes": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", diff --git a/package.json b/package.json index 1454af9..ea52786 100644 --- a/package.json +++ b/package.json @@ -88,6 +88,7 @@ }, "dependencies": { "@types/json-schema": "^7.0.7", - "json-pointer": "0.6.2" + "json-pointer": "0.6.2", + "randexp": "^0.5.3" } } diff --git a/src/samplers/string.js b/src/samplers/string.js index 77a6618..e1ac835 100644 --- a/src/samplers/string.js +++ b/src/samplers/string.js @@ -1,8 +1,70 @@ 'use strict'; +import RandExp from 'randexp'; + import { ensureMinLength, toRFCDateTime, 
uuid } from '../utils'; const passwordSymbols = 'qwerty!@#$%^123456'; +const MAX_REGEX_SAMPLES = 100; + +function sampleRegex(pattern, min, max) { + let res; + let i = 0; + let length; + let prevLength; + + // Increase length of the sample until it satisfies the minimum. + do { + RandExp.prototype.randInt = (from, to) => Math.min(from + i, to); + res = new RandExp(pattern).gen(); + prevLength = length; + length = res.length; + i++; + } while (length < min && i < MAX_REGEX_SAMPLES); + + // Handle case where we went past the maximum. + // Example: /\d*\d*foo/, will sample foo, 11foo, 2222foo, etc. + // + // HACK: RandExp doesn't expose an API to set the value of a specific sample, + // so we'll just fuzz it. If no satisfying string is found, + // we prefer to return a string that is too long than to return a string + // that doesn't fit the regex. + if (max && max >= min && res.length > max) { + // Let N be the number of * or + in the regex. + // The probability that N coin flips, each with probability k/N, come up k heads + // is ~ sqrt(N/(2 * pi * k * (N - k))) by Stirling's approximation. + // In the worst case this is ~ sqrt(2 / (pi * N)), by taking k = N/2, + // so if N < 63 then we can hit an exact length with probability > 0.1, + // which means that with 100 samples we can hit an exact match + // with probability > 0.99997. + const targetProbability = ((min + max) / 2 - prevLength) / (length - prevLength) + + for (let j = 0; j < MAX_REGEX_SAMPLES; j++) { + RandExp.prototype.randInt = (from, to) => Math.max( + from, + Math.min(from + i - 2 + (Math.random() < targetProbability ? 
1 : 0), to), + ); + const candidate = new RandExp(pattern).gen(); + if (candidate.length >= min) { + if (candidate.length <= max) { + return candidate; + } else if (candidate.length < res.length) { + res = candidate; + } + } + } + } + + return res; +} + +function truncateString(str, min, max) { + let res = ensureMinLength(str, min); + if (max && res.length > max) { + res = res.substring(0, max); + } + return res; +} function emailSample() { return 'user@example.com'; @@ -42,12 +104,10 @@ function timeSample(min, max) { return commonDateTimeSample({ min, max, omitTime: false, omitDate: true }).slice(1); } -function defaultSample(min, max) { - let res = ensureMinLength('string', min); - if (max && res.length > max) { - res = res.substring(0, max); - } - return res; +function defaultSample(min, max, _propertyName, pattern) { + return pattern + ? sampleRegex(pattern, min, max) + : truncateString('string', min, max) } function ipv4Sample() { @@ -96,8 +156,10 @@ function relativeJsonPointerSample() { return '1/relative/json/pointer'; } -function regexSample() { - return '/regex/'; +function regexSample(min, max, _propertyName, pattern) { + return pattern + ? 
sampleRegex(pattern, min, max) + : truncateString('/regex/', min, max) } const stringFormats = { @@ -127,5 +189,10 @@ export function sampleString(schema, options, spec, context) { let format = schema.format || 'default'; let sampler = stringFormats[format] || defaultSample; let propertyName = context && context.propertyName; - return sampler(schema.minLength | 0, schema.maxLength, propertyName); + return sampler( + schema.minLength || 0, + schema.maxLength, + propertyName, + schema.pattern, + ); } diff --git a/test/unit/string.spec.js b/test/unit/string.spec.js index 63d4a15..29eb664 100644 --- a/test/unit/string.spec.js +++ b/test/unit/string.spec.js @@ -130,6 +130,22 @@ describe('sampleString', () => { expect(res).to.equal('fb4274c7-4fcd-4035-8958-a680548957ff'); }); + it('should return valid string for regex with min and max', () => { + const regex = 'foo-\\d+\\d+\\d+\\d+\\d+\\d+\\d+\\d+\\d+\\d+-bar'; // 10 reps + let schema = { + format: 'regex', + pattern: regex, + minLength: 509, + maxLength: 509, + }; + res = sampleString(schema, null, null, {propertyName: 'fooId'}); + expect(res).to.match(/foo-\d{501}-bar/); + + schema = { ...schema, minLength: 513, maxLength: 513 }; + res = sampleString(schema, null, null, {propertyName: 'fooId'}); + expect(res).to.match(/foo-\d{505}-bar/); + }); + it.each([ 'email', // 'idn-email', // unsupported by ajv-formats