This repository has been archived by the owner on Aug 17, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprefix_search.go
313 lines (287 loc) · 9.09 KB
/
prefix_search.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
package main
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
)
type context struct {
searchDirA string // First search directory to include
searchDirB string // Second search directory to include
prefixCache map[string]prefixResult // Cache of results for prefixes
outFile *os.File // File for writing out results
notFoundPrefixes map[string]int // Counts of sequences not found by prefix
curPrefix string // curPrefix for some cache optimization
}
// Example of a caller function for matching sequences from a big file to
// smaller files found in the search directories.
func matchSequencesCaller() error {
home := getUserHome()
var err error
// Setup
input := home + "/sequence_lists/blast/db/FASTA/nr.gz.trimmed.sorted" +
".reduced.txt"
output := home + "/sequence_lists/blast/db/FASTA/nr_run_1.txt"
ctx := context{}
ctx.outFile, err = os.Create(output)
if err != nil {
return handle("Error in creating outfile", err)
}
ctx.searchDirA = home + "/sequence_lists/genbank_reduced"
ctx.searchDirB = home + "/sequence_lists/refseq_trimmed"
ctx.prefixCache = make(map[string]prefixResult)
ctx.notFoundPrefixes = make(map[string]int)
if err = matchSequences(&ctx, input); err != nil {
return handle("Error in running match sequence routine", err)
}
// Prefixes not found and the counts of missing sequences (point values)
fmt.Println("NOT FOUND COUNTS:")
notFoundTotal := 0
for k, v := range ctx.notFoundPrefixes {
notFoundTotal += v
c := strconv.Itoa(v)
fmt.Println(k + ": " + c)
}
// Total number of sequences that weren't matched
c := strconv.Itoa(notFoundTotal)
fmt.Println("Not found total: " + c)
return err
}
// matchSequences reads in accession numbers and ranges from an input file
// and matches the point values or ranges to the same accession numbers in
// files in a search directory.
func matchSequences(ctx *context, input string) error {
file, err := os.Open(input)
if err != nil {
return handle("Error in opening input file.", err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
// Print header
str := fmt.Sprintf("%-15s | %13s | %s", "Target", "Found in range", "In file")
writeLine(str, ctx.outFile)
// Go line-by-line
for scanner.Scan() {
line := scanner.Text()
if !strings.Contains(line, ": ") || !strings.Contains(line, "_") {
continue
}
parts := strings.Split(line, ": ")
prefixToFind := parts[0]
if prefixToFind == "" {
continue
}
valToFind := parts[1]
if !strings.Contains(valToFind, "-") {
// Dealing with a point value
findSingleValue(ctx, prefixToFind, valToFind)
} else {
// Dealing with a range
findRange(ctx, prefixToFind, valToFind)
}
}
return err
}
// Matches a single accession number (prefix and number) to files in the
// search directory.
func findSingleValue(ctx *context, prefix string, toFind string) error {
num, err := strconv.Atoi(toFind)
if err != nil {
return handle("Error in converting to int.", err)
}
res, err := accessionSearch(ctx, prefix, num)
if res != "" {
out := fmt.Sprintf("%s%-13d | %s", prefix, num, res)
writeLine(out, ctx.outFile)
} else {
out := fmt.Sprintf("%s%d not found.", prefix, num)
writeLine(out, ctx.outFile)
ctx.notFoundPrefixes[prefix] += 1 // Update not found counts
}
return err
}
// Matches an accession number range (e.g. XM_: 100-150) to files in the
// search directory.
func findRange(ctx *context, prefix string, toFind string) error {
p := strings.Split(toFind, "-")
startNum, startRes, err := rangePiece(ctx, prefix, p[0])
if err != nil {
return handle("Error in finding results for range start.", err)
}
endNum, endRes, err := rangePiece(ctx, prefix, p[1])
if err != nil {
return handle("Error in finding results for range end.", err)
}
if strings.Contains(startRes, "-") && startRes == endRes {
// Result was a range, and the start/end numbers matched to the same range.
// This means that all the intermediate range values must also be included
// in the result.
out := fmt.Sprintf("%s%-13s | %s", prefix, toFind, startRes)
writeLine(out, ctx.outFile)
} else {
// Otherwise just go through the range sequentially and check each point
// value.
for i := startNum; i <= endNum; i++ {
if err = findSingleValue(ctx, prefix, strconv.Itoa(i)); err != nil {
return handle("Error in searching for point value.", err)
}
}
}
return err
}
// Writes a line to stdout and the results file.
func writeLine(input string, outFile *os.File) error {
fmt.Println(input)
if _, err := outFile.WriteString(input + "\n"); err != nil {
return handle("Error in writing line.", err)
}
return nil
}
// A prefixResult represents the matches from searching for a prefix.
// - accessionNums is a list of the number values/ranges found with the same
// prefix.
// - valueToFile is a mapping of values/ranges to the file name in which the
// match was found.
type prefixResult struct {
accessionNums []string
valueToFile map[string]string
}
// Gets the results of a search for a prefix to all the matching accession
// numbers in the search directory.
func prefixToResults(ctx *context, prefix string) (prefixResult, error) {
// Setup
var err error
accessionNums := []string{}
valueToFile := make(map[string]string)
res, present := ctx.prefixCache[prefix]
if present {
return res, err
}
// Get results from disk by calling sift.
dest := ctx.searchDirA
if strings.Contains(prefix, "_") {
// Underscore is only for the Refseq files
dest = ctx.searchDirB
}
template := "sift '%s' '%s' -w --binary-skip | sort -k2 -n"
cmd := fmt.Sprintf(template, prefix, dest)
stdout, _, err := commandVerboseOnErr(cmd)
if err != nil {
return res, handle("Error in calling search utility", err)
}
// Process output
lines := strings.Split(stdout, "\n")
if len(lines) == 0 { // No results. Return with empty values.
return res, err
}
// Make mapping of accession num/ranges to filename.
// Make an ascending array of the accession num/ranges.
for _, line := range lines {
if strings.Contains(line, ": ") {
pieces := strings.Split(line, ": ")
key := pieces[1]
accessionNums = append(accessionNums, key)
// Format the file names:lines
snip := pieces[0][len(ctx.searchDirA):]
snip = snip[:len(snip)-len(prefix)-1]
valueToFile[key] = snip
}
}
// Add to cache
res = prefixResult{accessionNums, valueToFile}
ctx.prefixCache[prefix] = res
return res, err
}
// accessionSearch matches a single prefix and target num to matches in the
// search directory.
func accessionSearch(ctx *context, prefix string, targetNum int) (string,
error) {
// Setup
var err error
accessionNums := []string{}
valueToFile := make(map[string]string)
// Get prefix to file search results
if prefix != ctx.curPrefix {
// Clear the cache when the prefix changes due to memory issues.
ctx.curPrefix = prefix
ctx.prefixCache = make(map[string]prefixResult)
}
prefixRes, err := prefixToResults(ctx, prefix)
if err != nil {
return "", handle("Error in getting file results for the prefix", err)
}
accessionNums = prefixRes.accessionNums
valueToFile = prefixRes.valueToFile
// Call the binary search of the results
res, err := arraySearch(accessionNums, targetNum)
if err != nil {
return "", handle("Error in searching array", err)
}
// Format results
if res > 0 && res < len(accessionNums) {
matched := accessionNums[res]
resFile := valueToFile[matched]
resFile = resFile[:len(resFile)-4]
out := fmt.Sprintf("%-13s | %s", accessionNums[res], resFile)
return out, err
}
return "", err
}
// Modified binary search on the array where toFind is a point value and
// toSearch contains either point values or ranges.
func arraySearch(toSearch []string, toFind int) (int, error) {
n := len(toSearch)
low, high := 0, n-1
for low <= high {
mid := low + (high-low)/2 // Midpoint
lookAt := toSearch[mid]
if !strings.Contains(lookAt, "-") {
// Point val in array. Standard binary search iterations.
lookAtVal, err := strconv.Atoi(lookAt)
if err != nil {
return 0, handle("Problem converting number", err)
}
if lookAtVal > toFind {
high = mid - 1
} else if lookAtVal < toFind {
low = mid + 1
} else { // Found
return mid, err
}
} else {
// Range val in array
p := strings.Split(lookAt, "-")
bottom, err := strconv.Atoi(p[0]) // Lower bound
if err != nil {
return 0, handle("Problem converting number", err)
}
top, err := strconv.Atoi(p[1]) // Upper bound
if err != nil {
return 0, handle("Problem converting number", err)
}
if bottom <= toFind && toFind <= top {
return mid, err // Found in current range
} else if toFind < bottom {
high = mid - 1
} else if top < toFind {
low = mid + 1
}
}
}
return -1, nil // Not found.
}
// rangePiece gets the single value accession number search results for a
// piece of a range.
func rangePiece(ctx *context, prefix string, input string) (int, string,
error) {
num, err := strconv.Atoi(input)
if err != nil {
return 0, "", handle("Error in converting to int.", err)
}
res, err := accessionSearch(ctx, prefix, num)
if err != nil {
return 0, "", handle("Error in accession number search.", err)
}
return num, res, err
}