From a0da8ea815662c4f543d6ba781a0d4061e18820e Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 3 Feb 2025 16:17:49 -0800 Subject: [PATCH] GFA walks with no interval; fixes vgteam/vg#4517 --- LICENSE | 2 +- src/utils.cpp | 13 ++++-- tests/test_gfa.cpp | 19 ++++++++ tests/test_utils.cpp | 102 ++++++++++++++++++++++++++----------------- 4 files changed, 91 insertions(+), 45 deletions(-) diff --git a/LICENSE b/LICENSE index 315b6a5..0eac1f3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2019, 2020, 2021, 2022, 2023, 2024 Jouni Siren and other authors +Copyright (c) 2019, 2020, 2021, 2022, 2023, 2024, 2025 Jouni Siren and other authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/src/utils.cpp b/src/utils.cpp index 8df8651..a24f991 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -869,10 +869,17 @@ MetadataBuilder::add_walk(const std::string& sample, const std::string& haplotyp // Start position as fragment identifier. 
size_t phase_block = PathMetadata::NO_PHASE_BLOCK; - try { phase_block = std::stoul(start); } - catch(const std::invalid_argument&) + if (start == "*") { - throw std::runtime_error("MetadataBuilder: Invalid start position " + start); + phase_block = 0; + } + else + { + try { phase_block = std::stoul(start); } + catch(const std::invalid_argument&) + { + throw std::runtime_error("MetadataBuilder: Invalid start position " + start); + } } // Add as a haplotype diff --git a/tests/test_gfa.cpp b/tests/test_gfa.cpp index fc156eb..89242ba 100644 --- a/tests/test_gfa.cpp +++ b/tests/test_gfa.cpp @@ -873,6 +873,25 @@ TEST_F(GBWTMetadata, Walks) this->check_metadata(index.metadata, expected_metadata); } +TEST_F(GBWTMetadata, WalksNoInterval) +{ + auto gfa_parse = gfa_to_gbwt("gfas/components_walks_no_interval.gfa"); + const gbwt::GBWT& index = *(gfa_parse.first); + + gbwt::Metadata expected_metadata; + std::vector<std::string> samples = { "sample" }; + expected_metadata.setSamples(samples); + expected_metadata.setHaplotypes(2); + expected_metadata.setContigs(this->names); + expected_metadata.addPath(0, 0, 1, 0); + expected_metadata.addPath(0, 0, 2, 0); + expected_metadata.addPath(0, 1, 1, 0); + expected_metadata.addPath(0, 1, 2, 0); + + ASSERT_TRUE(index.hasMetadata()) << "No GBWT metadata was created"; + this->check_metadata(index.metadata, expected_metadata); +} + TEST_F(GBWTMetadata, WalksAndPaths) { auto gfa_parse = gfa_to_gbwt("gfas/example_walks.gfa"); diff --git a/tests/test_utils.cpp b/tests/test_utils.cpp index 53f32e5..bc07fb9 100644 --- a/tests/test_utils.cpp +++ b/tests/test_utils.cpp @@ -170,14 +170,6 @@ TEST_F(SourceTest, TranslateSegments) //------------------------------------------------------------------------------ -struct StandAlonePathName -{ - std::string sample; - std::string contig; - size_t haplotype; - size_t fragment; -}; - class MetadataBuilderTest : public ::testing::Test { public: @@ -192,7 +184,7 @@ class MetadataBuilderTest : public ::testing::Test void 
create_example( std::vector<std::string>& samples, std::vector<std::string>& contigs, - std::vector<StandAlonePathName>& paths, + std::vector<gbwt::FullPathName>& paths, bool generic_reference) const { std::string reference_sample = (generic_reference ? REFERENCE_PATH_SAMPLE_NAME : "GRCh38"); @@ -218,7 +210,7 @@ class MetadataBuilderTest : public ::testing::Test void add_hg004( std::vector<std::string>& samples, - std::vector<StandAlonePathName>& paths) const + std::vector<gbwt::FullPathName>& paths) const { samples.push_back("HG004"); paths.push_back({ "HG004", "chr1", 1, 0 }); @@ -227,59 +219,76 @@ class MetadataBuilderTest : public ::testing::Test paths.push_back({ "HG004", "chr2", 2, 0 }); } - size_t get_job(const StandAlonePathName& path) const + size_t get_job(const gbwt::FullPathName& path) const { - if(path.contig == "chr1") { return 0; } - if(path.contig == "chr2") { return 1; } + if(path.contig_name == "chr1") { return 0; } + if(path.contig_name == "chr2") { return 1; } return 0; } - void add_haplotypes(MetadataBuilder& builder, const std::vector<StandAlonePathName>& paths, size_t from, bool assign_job) + void add_haplotypes(MetadataBuilder& builder, const std::vector<gbwt::FullPathName>& paths, size_t from, bool assign_job) { for(size_t i = from; i < paths.size(); i++) { - const StandAlonePathName& path = paths[i]; + const gbwt::FullPathName& path = paths[i]; size_t job = (assign_job ? 
get_job(path) : 0); - if(path.sample == REFERENCE_PATH_SAMPLE_NAME) + if(path.sample_name == REFERENCE_PATH_SAMPLE_NAME) + { + builder.add_generic_path(path.contig_name, job); + } + else + { + builder.add_haplotype(path.sample_name, path.contig_name, path.haplotype, path.offset, job); + } + } + } + + void add_walks(MetadataBuilder& builder, const std::vector<gbwt::FullPathName>& paths) + { + for(const gbwt::FullPathName& path : paths) + { + if(path.sample_name == REFERENCE_PATH_SAMPLE_NAME) { - builder.add_generic_path(path.contig, job); + builder.add_generic_path(path.contig_name); } else { - builder.add_haplotype(path.sample, path.contig, path.haplotype, path.fragment, job); + std::string haplotype = std::to_string(path.haplotype); + std::string start = std::to_string(path.offset); + builder.add_walk(path.sample_name, haplotype, path.contig_name, start); } } } - void add_walks(MetadataBuilder& builder, const std::vector<StandAlonePathName>& paths) + void add_walks_no_interval(MetadataBuilder& builder, const std::vector<gbwt::FullPathName>& paths) { - for(const StandAlonePathName& path : paths) + std::string no_interval = "*"; + for(const gbwt::FullPathName& path : paths) { - if(path.sample == REFERENCE_PATH_SAMPLE_NAME) + if(path.sample_name == REFERENCE_PATH_SAMPLE_NAME) { - builder.add_generic_path(path.contig); + builder.add_generic_path(path.contig_name); } else { std::string haplotype = std::to_string(path.haplotype); - std::string start = std::to_string(path.fragment); - builder.add_walk(path.sample, haplotype, path.contig, start); + builder.add_walk(path.sample_name, haplotype, path.contig_name, no_interval); } } } - void add_named_paths(MetadataBuilder& builder, const std::vector<StandAlonePathName>& paths) + void add_named_paths(MetadataBuilder& builder, const std::vector<gbwt::FullPathName>& paths) { - for(const StandAlonePathName& path : paths) + for(const gbwt::FullPathName& path : paths) { std::string name; - if(path.sample == REFERENCE_PATH_SAMPLE_NAME) + if(path.sample_name == REFERENCE_PATH_SAMPLE_NAME) { - name = path.contig; + name = 
path.contig_name; } else { - name = path.sample + "#" + std::to_string(path.haplotype) + "#" + path.contig; + name = path.sample_name + "#" + std::to_string(path.haplotype) + "#" + path.contig_name; } builder.add_path(name); } @@ -289,7 +298,7 @@ class MetadataBuilderTest : public ::testing::Test const gbwt::Metadata& metadata, const std::vector<std::string>& samples, const std::vector<std::string>& contigs, - const std::vector<StandAlonePathName>& paths) const + const std::vector<gbwt::FullPathName>& paths) const { ASSERT_EQ(metadata.samples(), samples.size()) << "Invalid number of samples"; for(size_t i = 0; i < samples.size(); i++) @@ -307,10 +316,10 @@ class MetadataBuilderTest : public ::testing::Test for(size_t i = 0; i < paths.size(); i++) { gbwt::PathName path = metadata.path(i); - EXPECT_EQ(metadata.sample(path.sample), paths[i].sample) << "Invalid sample name for path " << i; - EXPECT_EQ(metadata.contig(path.contig), paths[i].contig) << "Invalid contig name for path " << i; + EXPECT_EQ(metadata.sample(path.sample), paths[i].sample_name) << "Invalid sample name for path " << i; + EXPECT_EQ(metadata.contig(path.contig), paths[i].contig_name) << "Invalid contig name for path " << i; EXPECT_EQ(path.phase, paths[i].haplotype) << "Invalid haplotype for path " << i; - EXPECT_EQ(path.count, paths[i].fragment) << "Invalid fragment for path " << i; + EXPECT_EQ(path.count, paths[i].offset) << "Invalid offset for path " << i; } } }; @@ -323,7 +332,7 @@ TEST_F(MetadataBuilderTest, Empty) TEST_F(MetadataBuilderTest, GenericPathsAndHaplotypes) { std::vector<std::string> samples, contigs; - std::vector<StandAlonePathName> paths; + std::vector<gbwt::FullPathName> paths; this->create_example(samples, contigs, paths, true); MetadataBuilder builder; @@ -334,7 +343,7 @@ TEST_F(MetadataBuilderTest, GenericPathsAndHaplotypes) TEST_F(MetadataBuilderTest, GFAPathsAndWalks) { std::vector<std::string> samples, contigs; - std::vector<StandAlonePathName> paths; + std::vector<gbwt::FullPathName> paths; this->create_example(samples, contigs, paths, true); MetadataBuilder builder; @@ -342,10 +351,21 @@ TEST_F(MetadataBuilderTest, GFAPathsAndWalks) 
this->check_metadata(builder.get_metadata(), samples, contigs, paths); } +TEST_F(MetadataBuilderTest, GFAWalksNoInterval) +{ + std::vector<std::string> samples, contigs; + std::vector<gbwt::FullPathName> paths; + this->create_example(samples, contigs, paths, true); + + MetadataBuilder builder; + this->add_walks_no_interval(builder, paths); + this->check_metadata(builder.get_metadata(), samples, contigs, paths); +} + TEST_F(MetadataBuilderTest, PanSN) { std::vector<std::string> samples, contigs; - std::vector<StandAlonePathName> paths; + std::vector<gbwt::FullPathName> paths; this->create_example(samples, contigs, paths, false); MetadataBuilder builder( @@ -360,7 +380,7 @@ TEST_F(MetadataBuilderTest, PanSN) TEST_F(MetadataBuilderTest, Clear) { std::vector<std::string> samples, contigs; - std::vector<StandAlonePathName> paths; + std::vector<gbwt::FullPathName> paths; this->create_example(samples, contigs, paths, true); MetadataBuilder builder; @@ -372,7 +392,7 @@ TEST_F(MetadataBuilderTest, Clear) TEST_F(MetadataBuilderTest, MultipleFormats) { std::vector<std::string> samples, contigs; - std::vector<StandAlonePathName> paths; + std::vector<gbwt::FullPathName> paths; this->create_example(samples, contigs, paths, true); MetadataBuilder builder; @@ -390,7 +410,7 @@ TEST_F(MetadataBuilderTest, MultipleFormats) TEST_F(MetadataBuilderTest, FromMetadata) { std::vector<std::string> samples, contigs; - std::vector<StandAlonePathName> paths; + std::vector<gbwt::FullPathName> paths; this->create_example(samples, contigs, paths, true); size_t old_paths = paths.size(); @@ -407,13 +427,13 @@ TEST_F(MetadataBuilderTest, FromMetadata) TEST_F(MetadataBuilderTest, MultipleJobs) { std::vector<std::string> samples, contigs; - std::vector<StandAlonePathName> paths; + std::vector<gbwt::FullPathName> paths; this->create_example(samples, contigs, paths, true); MetadataBuilder builder; this->add_haplotypes(builder, paths, 0, true); - std::vector<StandAlonePathName> reordered_paths; + std::vector<gbwt::FullPathName> reordered_paths; for(size_t job = 0; job < contigs.size(); job++) { for(size_t i = 0; i < paths.size(); i++)