Skip to content

Commit

Permalink
Update data extract handling of Discovery responses (#142)
Browse files Browse the repository at this point in the history
- Update default similarity-check url
- Strip html tags and urls from each discovery passage

Signed-off-by: Sean Sundberg <[email protected]>
  • Loading branch information
seansund authored Nov 7, 2023
1 parent 83de908 commit 6fa4bd1
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 15 deletions.
24 changes: 16 additions & 8 deletions src/services/data-extraction/data-extraction.impl.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import * as process from "process";
import {IamAuthenticator, IamTokenManager} from "ibm-cloud-sdk-core";
import DiscoveryV2 = require("ibm-watson/discovery/v2");
const striptags = require("striptags");
const stripTags = require("striptags");
import axios from "axios";

import {DataExtractionApi} from "./data-extraction.api";
Expand All @@ -10,7 +10,7 @@ import {createDiscoveryV2} from "../../utils/discovery-v2";
import {DataExtractionConfig, DataExtractionCsv} from "./data-extraction.csv";
import {kycCaseSummaryApi, KycCaseSummaryApi} from "../kyc-case-summary";
import {DataExtractionResultModel} from "../../models";
import {first, GenAiModel, GenerativeResponse} from "../../utils";
import {first, GenAiModel, GenerativeResponse, stripUrls} from "../../utils";
import PQueue from "../../utils/p-queue";

/**
 * Concurrency setting read once at module load from the
 * FIND_PASSAGE_CONCURRENCY environment variable; falls back to 8 when the
 * variable is unset or empty.
 * NOTE(review): presumably the limit for parallel passage lookups — confirm at the use site.
 */
const concurrency = Number.parseInt(process.env.FIND_PASSAGE_CONCURRENCY || '8')
Expand Down Expand Up @@ -146,7 +146,7 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends, Contex
async queryDiscovery(customer: string, config: DataExtractionConfig, backends: WatsonBackends): Promise<string> {
const naturalLanguageQuery = config.question + ' ' + customer;

const passagesPerDocument = true;
const passagesPerDocument: boolean = true;
const response: DiscoveryV2.Response<DiscoveryV2.QueryResponse> = await backends.discovery.query({
projectId: this.backendConfig.discoveryProjectId,
naturalLanguageQuery,
Expand All @@ -159,9 +159,7 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends, Contex
}
})

const passages: string[] = !passagesPerDocument
? this.handleDiscoveryPassages(response.result)
: this.handleDiscoveryResult(response.result, customer);
const passages: string[] = this.handleDiscoveryResponse(response.result, customer, passagesPerDocument)

console.log('Finding relevant passages')

Expand All @@ -174,6 +172,16 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends, Contex
return text;
}

/**
 * Collects the passages from a Discovery query result and cleans each one.
 *
 * @param result - the Discovery query response to read passages from
 * @param subject - forwarded to handleDiscoveryResult when passagesPerDocument is true
 * @param passagesPerDocument - when false the passages come from
 *   handleDiscoveryPassages(result); otherwise from handleDiscoveryResult(result, subject)
 * @returns the passages with HTML tags removed first and URLs removed second
 */
handleDiscoveryResponse(result: DiscoveryV2.QueryResponse, subject: string, passagesPerDocument: boolean): string[] {
  const passages: string[] = !passagesPerDocument
    ? this.handleDiscoveryPassages(result)
    : this.handleDiscoveryResult(result, subject);

  // Call the cleaners with exactly one argument. Array.prototype.map also
  // supplies (index, array); striptags interprets its 2nd and 3rd arguments as
  // the allowed-tags list and the tag replacement, so passing it directly to
  // map would substitute every removed tag with the stringified passages array.
  return passages
    .map(passage => stripTags(passage))
    .map(passage => stripUrls(passage));
}

filterDocuments(result: DiscoveryV2.QueryResponse, subject: string): DiscoveryV2.QueryResult[] {
return result.results.filter(val => {
const organizations = extractEntities(val.enriched_text, 'Organization')
Expand All @@ -198,7 +206,7 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends, Contex
}

async findRelevantPassages(question: string, passages: string[]): Promise<string> {
const url = process.env.RELEVANT_PASSAGES_URL || 'https://similarity-check.18xu6cedovu0.us-south.codeengine.appdomain.cloud/api/find_relevant_passage'
const url = process.env.RELEVANT_PASSAGES_URL || 'https://similarity-check.18z7sftfb1j5.us-south.codeengine.appdomain.cloud/api/find_relevant_passage'

if (passages.length === 1) {
return passages[0]
Expand All @@ -220,7 +228,7 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends, Contex
.catch(err => {
console.error('Error getting relevant passages: ', {err})

return striptags(passages.join('\n'))
return passages.join('\n')
})

console.log('Found relevant passage: ', {relevantPassage})
Expand Down
1 change: 1 addition & 0 deletions src/utils/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ export * from './gen-ai-model';
export * from './stream-to-buffer';
export * from './url-to-stream';
export * from './validate-url';
export * from './strip-urls';
6 changes: 3 additions & 3 deletions src/utils/p-queue/index.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import {EventEmitter} from 'eventemitter3';
import pTimeout, {TimeoutError} from '../p-timeout';
import {Queue, RunFunction} from './queue.js';
import PriorityQueue from './priority-queue.js';
import {QueueAddOptions, Options, TaskOptions} from './options.js';
import {Queue, RunFunction} from './queue';
import PriorityQueue from './priority-queue';
import {QueueAddOptions, Options, TaskOptions} from './options';

type Task<TaskResultType> =
| ((options: TaskOptions) => PromiseLike<TaskResultType>)
Expand Down
2 changes: 1 addition & 1 deletion src/utils/p-queue/options.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {Queue, RunFunction} from './queue.js';
import {Queue, RunFunction} from './queue';

interface TimeoutOptions {
/**
Expand Down
6 changes: 3 additions & 3 deletions src/utils/p-queue/priority-queue.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import {Queue, RunFunction} from './queue.js';
import lowerBound from './lower-bound.js';
import {QueueAddOptions} from './options.js';
import {Queue, RunFunction} from './queue';
import lowerBound from './lower-bound';
import {QueueAddOptions} from './options';

export interface PriorityQueueOptions extends QueueAddOptions {
priority?: number;
Expand Down
1 change: 1 addition & 0 deletions src/utils/strip-urls/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export * from './strip-urls'
37 changes: 37 additions & 0 deletions src/utils/strip-urls/strip-urls.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import {stripUrls} from "./strip-urls";

// Spec for stripUrls: every http/https URL is removed together with any
// non-whitespace characters attached to it; surrounding whitespace is kept.
describe('strip-urls', () => {
  describe('Given stripUrls', () => {
    const url = 'https://host.com/path/to/file.html'

    describe('when "{url}" provided', () => {
      test('then return ""', () => {
        expect(stripUrls(url)).toEqual('')
      })
    })

    describe('when "http://host.com/path/to/file.html" provided', () => {
      test('then return ""', () => {
        expect(stripUrls('http://host.com/path/to/file.html')).toEqual('')
      })
    })

    // Trailing punctuation glued to the URL is part of the match and is removed too.
    describe('when "This is a test {url}." provided', () => {
      test('then return "This is a test "', () => {
        expect(stripUrls(`This is a test ${url}.`)).toEqual('This is a test ')
      })
    })

    describe('when "This is a test {url})" provided', () => {
      test('then return "This is a test "', () => {
        expect(stripUrls(`This is a test ${url})`)).toEqual('This is a test ')
      })
    })

    // Only the URL itself is removed here: the space before it and the space
    // after it both remain, so the result contains two consecutive spaces.
    describe('when "This is a test {url} )" provided', () => {
      test('then return "This is a test  )"', () => {
        expect(stripUrls(`This is a test ${url} )`)).toEqual('This is a test  )')
      })
    })
  })
})
3 changes: 3 additions & 0 deletions src/utils/strip-urls/strip-urls.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/**
 * Matches "http://" or "https://" followed by one or more characters that are
 * either non-whitespace or a newline. Because "\n" is in the character class,
 * a match continues across line breaks and only stops at other whitespace
 * (e.g. a space) or the end of the text.
 */
const URL_PATTERN = /https?:\/\/[\n\S]+/g;

/**
 * Removes every http/https URL from the given text.
 *
 * Characters attached directly to the URL (such as a trailing "." or ")") are
 * removed with it; whitespace around the URL is left in place.
 *
 * @param text - the text to clean
 * @returns the text with all URL matches replaced by the empty string
 */
export const stripUrls = (text: string): string => text.replace(URL_PATTERN, '');

0 comments on commit 6fa4bd1

Please sign in to comment.