Skip to content

Commit

Permalink
[MRG] add --num-results/-n to gather (#1047)
Browse files Browse the repository at this point in the history
* add --num-results/-n to gather

* add test for --num-results

* use tempdir decorator for new test
  • Loading branch information
ctb authored Jun 29, 2020
1 parent 0e2d259 commit 40b5c35
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 1 deletion.
6 changes: 5 additions & 1 deletion sourmash/cli/gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ def subparser(subparsers):
subparser.add_argument(
'-d', '--debug', action='store_true'
)
subparser.add_argument(
'-n', '--num-results', default=None, type=int, metavar='N',
help='number of results to report (default: terminate at --threshold-bp)'
)
subparser.add_argument(
'--traverse-directory', action='store_true',
help='search all signatures underneath directories'
Expand All @@ -32,7 +36,7 @@ def subparser(subparsers):
)
subparser.add_argument(
'--threshold-bp', metavar='REAL', type=float, default=5e4,
help='reporting threshold (in bp) for estimated overlap with remaining query (default=50,000)'
help='reporting threshold (in bp) for estimated overlap with remaining query (default=50kb)'
)
subparser.add_argument(
'--output-unassigned', metavar='FILE',
Expand Down
7 changes: 7 additions & 0 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,7 @@ def gather(args):
weighted_missed = 1
new_max_hash = query.minhash.max_hash
next_query = query

for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
if not len(found): # first result? print header.
if query.minhash.track_abundance and not args.ignore_abundance:
Expand Down Expand Up @@ -635,9 +636,15 @@ def gather(args):
name)
found.append(result)

if args.num_results and len(found) >= args.num_results:
break


# basic reporting
print_results('\nfound {} matches total;', len(found))
if args.num_results and len(found) == args.num_results:
print_results('(truncated gather because --num-results={})',
args.num_results)

print_results('the recovered matches hit {:.1f}% of the query',
(1 - weighted_missed) * 100)
Expand Down
32 changes: 32 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -2563,6 +2563,38 @@ def test_gather_metagenome():
'NC_011294.1 Salmonella enterica subsp...' in out))


@utils.in_tempdir
def test_gather_metagenome_num_results(c):
    """Check that ``gather --num-results 10`` reports exactly ten matches.

    Builds an index named ``gcf_all`` (k=21) from the ``gather/GCF*.sig``
    test signatures, runs ``gather`` against ``gather/combined.sig`` with
    ``--num-results 10``, and verifies both the match count and the
    truncation notice in the command output.

    ``c`` is the object supplied by the ``utils.in_tempdir`` decorator; this
    test uses its ``run_sourmash``, ``output`` and ``last_result`` members.
    """
    # set a threshold on the number of results to be reported by gather
    testdata_glob = utils.get_test_data('gather/GCF*.sig')
    testdata_sigs = glob.glob(testdata_glob)

    query_sig = utils.get_test_data('gather/combined.sig')

    cmd = ['index', 'gcf_all', '-k', '21']
    cmd.extend(testdata_sigs)

    c.run_sourmash(*cmd)

    assert os.path.exists(c.output('gcf_all.sbt.json'))

    # Build the argument list directly instead of formatting a string and
    # splitting on spaces: a test-data path containing a space would
    # otherwise be broken into several bogus arguments.
    cmd = ['gather', query_sig, 'gcf_all', '-k', '21', '--num-results', '10']
    c.run_sourmash(*cmd)

    print(c.last_result.out)
    print(c.last_result.err)

    out = c.last_result.out

    assert 'found 10 matches total' in out
    assert '(truncated gather because --num-results=10)' in out
    assert 'the recovered matches hit 99.4% of the query' in out
    # Separate asserts so a failure pinpoints which substring is missing.
    assert '4.9 Mbp 33.2% 100.0%' in out
    assert 'NC_003198.1 Salmonella enterica subsp...' in out
    assert '4.3 Mbp 2.1% 7.3% NC_006511.1 Salmonella enterica subsp' in out


def test_gather_metagenome_threshold_bp():
# set a threshold on the gather output
with utils.TempDirectory() as location:
Expand Down

0 comments on commit 40b5c35

Please sign in to comment.