From 40b5c35dee590b8f1d3f75e1f82d18bd64bcd894 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 29 Jun 2020 12:39:23 -0700 Subject: [PATCH] [MRG] add --num-results/-n to gather (#1047) * add --num-results/-n to gather * add test for --num-results * use tempdir decorator for new test --- sourmash/cli/gather.py | 6 +++++- sourmash/commands.py | 7 +++++++ tests/test_sourmash.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/sourmash/cli/gather.py b/sourmash/cli/gather.py index dc72328629..16d16e7396 100644 --- a/sourmash/cli/gather.py +++ b/sourmash/cli/gather.py @@ -17,6 +17,10 @@ def subparser(subparsers): subparser.add_argument( '-d', '--debug', action='store_true' ) + subparser.add_argument( + '-n', '--num-results', default=None, type=int, metavar='N', + help='number of results to report (default: terminate at --threshold-bp)' + ) subparser.add_argument( '--traverse-directory', action='store_true', help='search all signatures underneath directories' @@ -32,7 +36,7 @@ def subparser(subparsers): ) subparser.add_argument( '--threshold-bp', metavar='REAL', type=float, default=5e4, - help='reporting threshold (in bp) for estimated overlap with remaining query (default=50,000)' + help='reporting threshold (in bp) for estimated overlap with remaining query (default=50kb)' ) subparser.add_argument( '--output-unassigned', metavar='FILE', diff --git a/sourmash/commands.py b/sourmash/commands.py index 7ca0d92813..e3b0bd381a 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -607,6 +607,7 @@ def gather(args): weighted_missed = 1 new_max_hash = query.minhash.max_hash next_query = query + for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance): if not len(found): # first result? print header. if query.minhash.track_abundance and not args.ignore_abundance: @@ -635,9 +636,15 @@ def gather(args): name) found.append(result) + if args.num_results and len(found) >= args.num_results: + break + # basic reporting print_results('\nfound {} matches total;', len(found)) + if args.num_results and len(found) == args.num_results: + print_results('(truncated gather because --num-results={})', + args.num_results) print_results('the recovered matches hit {:.1f}% of the query', (1 - weighted_missed) * 100) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index bb9a9d5d5f..3d022cb134 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -2563,6 +2563,38 @@ def test_gather_metagenome(): 'NC_011294.1 Salmonella enterica subsp...' in out)) +@utils.in_tempdir +def test_gather_metagenome_num_results(c): + # set a threshold on the number of results to be reported by gather + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = ['index', 'gcf_all', '-k', '21'] + cmd.extend(testdata_sigs) + + c.run_sourmash(*cmd) + + assert os.path.exists(c.output('gcf_all.sbt.json')) + + cmd = 'gather {} gcf_all -k 21 --num-results 10'.format(query_sig) + cmd = cmd.split(' ') + c.run_sourmash(*cmd) + + print(c.last_result.out) + print(c.last_result.err) + + out = c.last_result.out + + assert 'found 10 matches total' in out + assert '(truncated gather because --num-results=10)' in out + assert 'the recovered matches hit 99.4% of the query' in out + assert all(('4.9 Mbp 33.2% 100.0%' in out, + 'NC_003198.1 Salmonella enterica subsp...' in out)) + assert '4.3 Mbp 2.1% 7.3% NC_006511.1 Salmonella enterica subsp' in out + + def test_gather_metagenome_threshold_bp(): # set a threshold on the gather output with utils.TempDirectory() as location: