From 9cc7ac618b4c837a73c22aa6bd236b0039f26ae9 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Thu, 22 Oct 2020 23:49:04 +0000 Subject: [PATCH 1/4] package: use a consistent timestamp For each entry in the tar archive, we generate a new timestamp. Normally cargo will be fast enough that we get a consistent timestamp, but that need not be the case. There's very little reason to produce different timestamps for different files and it's slightly more efficient not to need to make multiple queries, so let's instead generate a single timestamp for all entries that we generate. --- src/cargo/ops/cargo_package.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/cargo/ops/cargo_package.rs b/src/cargo/ops/cargo_package.rs index a32c67004dc..e3f4025148a 100644 --- a/src/cargo/ops/cargo_package.rs +++ b/src/cargo/ops/cargo_package.rs @@ -472,6 +472,13 @@ fn check_repo_state( } } +fn timestamp() -> u64 { + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() +} + fn tar( ws: &Workspace<'_>, ar_files: Vec, @@ -491,6 +498,7 @@ fn tar( let base_name = format!("{}-{}", pkg.name(), pkg.version()); let base_path = Path::new(&base_name); + let time = timestamp(); for ar_file in ar_files { let ArchiveFile { rel_path, @@ -525,12 +533,7 @@ fn tar( }; header.set_entry_type(EntryType::file()); header.set_mode(0o644); - header.set_mtime( - SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_secs(), - ); + header.set_mtime(time); header.set_size(contents.len() as u64); header.set_cksum(); ar.append_data(&mut header, &ar_path, contents.as_bytes()) From 436b9eb85d9c02f6cfbd285136dfea73cfaf8172 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Sun, 15 Nov 2020 21:38:26 +0000 Subject: [PATCH 2/4] package: honor SOURCE_DATE_EPOCH For projects supporting reproducible builds, it's possible to set the timestamp used in artifacts by setting SOURCE_DATE_EPOCH to a decimal Unix timestamp. 
This is helpful because it allows users to produce the exact same artifact, regardless of when the project was built, and it also means that services which generate crates from source can generate a consistent crate without having to store previously built artifacts. For all these reasons, let's honor the SOURCE_DATE_EPOCH environment variable if it's set and use the current timestamp if it's not. --- src/cargo/ops/cargo_package.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/cargo/ops/cargo_package.rs b/src/cargo/ops/cargo_package.rs index e3f4025148a..2056f8cf6a5 100644 --- a/src/cargo/ops/cargo_package.rs +++ b/src/cargo/ops/cargo_package.rs @@ -473,6 +473,11 @@ fn check_repo_state( } fn timestamp() -> u64 { + if let Ok(var) = std::env::var("SOURCE_DATE_EPOCH") { + if let Ok(stamp) = var.parse() { + return stamp; + } + } SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap() .as_secs() From e46ca84b6c4e5cfd58edb425307214ba9d687cab Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Sun, 15 Nov 2020 21:43:39 +0000 Subject: [PATCH 3/4] package: canonicalize tar headers for crate packages Currently, when reading a file from disk, we include several pieces of data from the on-disk file, including the user and group names and IDs, the device major and minor, the mode, and the timestamp. This means that our archives differ between systems, sometimes in unhelpful ways. In addition, most users probably did not intend to share information about their user and group settings, operating system and disk type, and umask. While these aren't huge privacy leaks, cargo doesn't use them when extracting archives, so there's no value to including them. Since using consistent data means that our archives are reproducible and don't leak user data, both of which are desirable features, let's canonicalize the header to strip out identifying information. 
We set the user and group information to 0 and root, since that's the only user that's typically consistent among Unix systems. Setting these values doesn't create a security risk since tar can't change the ownership of files when it's running as a normal unprivileged user. Similarly, we set the device major and minor to 0. There is no useful value here that's portable across systems, and it does not affect extraction in any way. We also set the timestamp to the same one that we use for generated files. This is probably the biggest loss of relevant data, but considering that cargo doesn't otherwise use it and honoring it makes the archives unreproducible, we canonicalize it as well. Finally, we canonicalize the mode of an item we're storing by looking at the executable bit and using mode 755 if it's set and mode 644 if it's not. We already use 644 as the default for generated files, and this is the same algorithm that Git uses to determine whether a file should be considered executable. The tests don't test this case because there's no portable way to create executable files on Windows. --- src/cargo/ops/cargo_package.rs | 20 ++++++++++++++++ tests/testsuite/package.rs | 43 ++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/src/cargo/ops/cargo_package.rs b/src/cargo/ops/cargo_package.rs index 2056f8cf6a5..5c9bf73a0ee 100644 --- a/src/cargo/ops/cargo_package.rs +++ b/src/cargo/ops/cargo_package.rs @@ -484,6 +484,23 @@ fn timestamp() -> u64 { .as_secs() } +fn canonicalize_header(header: &mut Header) { + // Let's not include information about the user or their system here. 
+ header.set_username("root").unwrap(); + header.set_groupname("root").unwrap(); + header.set_uid(0); + header.set_gid(0); + header.set_device_major(0).unwrap(); + header.set_device_minor(0).unwrap(); + + let mode = if header.mode().unwrap() & 0o100 != 0 { + 0o755 + } else { + 0o644 + }; + header.set_mode(mode); +} + fn tar( ws: &Workspace<'_>, ar_files: Vec, @@ -524,6 +541,8 @@ fn tar( format!("could not learn metadata for: `{}`", disk_path.display()) })?; header.set_metadata(&metadata); + header.set_mtime(time); + canonicalize_header(&mut header); header.set_cksum(); ar.append_data(&mut header, &ar_path, &mut file) .chain_err(|| { @@ -540,6 +559,7 @@ fn tar( header.set_mode(0o644); header.set_mtime(time); header.set_size(contents.len() as u64); + canonicalize_header(&mut header); header.set_cksum(); ar.append_data(&mut header, &ar_path, contents.as_bytes()) .chain_err(|| format!("could not archive source file `{}`", rel_str))?; diff --git a/tests/testsuite/package.rs b/tests/testsuite/package.rs index 96b6d8a18ce..d3900ef65ae 100644 --- a/tests/testsuite/package.rs +++ b/tests/testsuite/package.rs @@ -6,8 +6,10 @@ use cargo_test_support::registry::{self, Package}; use cargo_test_support::{ basic_manifest, cargo_process, git, path2url, paths, project, symlink_supported, t, }; +use flate2::read::GzDecoder; use std::fs::{self, read_to_string, File}; use std::path::Path; +use tar::Archive; #[cargo_test] fn simple() { @@ -1917,3 +1919,44 @@ src/main.rs )) .run(); } + +#[cargo_test] +fn reproducible_output() { + let p = project() + .file( + "Cargo.toml", + r#" + [project] + name = "foo" + version = "0.0.1" + authors = [] + exclude = ["*.txt"] + license = "MIT" + description = "foo" + "#, + ) + .file("src/main.rs", r#"fn main() { println!("hello"); }"#) + .build(); + + // Timestamp is arbitrary and is the same used by git format-patch. 
+ p.cargo("package") + .env("SOURCE_DATE_EPOCH", "1000684800") + .run(); + assert!(p.root().join("target/package/foo-0.0.1.crate").is_file()); + + let f = File::open(&p.root().join("target/package/foo-0.0.1.crate")).unwrap(); + let decoder = GzDecoder::new(f); + let mut archive = Archive::new(decoder); + for ent in archive.entries().unwrap() { + let ent = ent.unwrap(); + let header = ent.header(); + assert_eq!(header.mode().unwrap(), 0o644); + assert_eq!(header.uid().unwrap(), 0); + assert_eq!(header.gid().unwrap(), 0); + assert_eq!(header.mtime().unwrap(), 1000684800); + assert_eq!(header.username().unwrap().unwrap(), "root"); + assert_eq!(header.groupname().unwrap().unwrap(), "root"); + assert_eq!(header.device_major().unwrap().unwrap(), 0); + assert_eq!(header.device_minor().unwrap().unwrap(), 0); + } +} From 449ead05dbb8ceb502bdd9e715b414f1c70a08f3 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Tue, 17 Nov 2020 23:51:23 +0000 Subject: [PATCH 4/4] package: canonicalize tar headers for crate packages Currently, when reading a file from disk, we include several pieces of data from the on-disk file, including the user and group names and IDs, the device major and minor, the mode, and the timestamp. This means that our archives differ between systems, sometimes in unhelpful ways. In addition, most users probably did not intend to share information about their user and group settings, operating system and disk type, and umask. While these aren't huge privacy leaks, cargo doesn't use them when extracting archives, so there's no value to including them. Since using consistent data means that our archives are reproducible and don't leak user data, both of which are desirable features, let's canonicalize the header to strip out identifying information. Omit the inclusion of the timestamp for generated files and tell the tar crate to copy deterministic data. That will omit all of the data we don't care about and also canonicalize the mode properly. 
Our tests don't check the specifics of certain fields because they differ between the generated files and the files that are archived from the disk format. They are still canonicalized correctly for each type, however. --- src/cargo/ops/cargo_package.rs | 39 ++-------------------------------- tests/testsuite/package.rs | 15 ++++--------- 2 files changed, 6 insertions(+), 48 deletions(-) diff --git a/src/cargo/ops/cargo_package.rs b/src/cargo/ops/cargo_package.rs index 5c9bf73a0ee..4e19a30b91e 100644 --- a/src/cargo/ops/cargo_package.rs +++ b/src/cargo/ops/cargo_package.rs @@ -5,12 +5,11 @@ use std::io::SeekFrom; use std::path::{Path, PathBuf}; use std::rc::Rc; use std::sync::Arc; -use std::time::SystemTime; use flate2::read::GzDecoder; use flate2::{Compression, GzBuilder}; use log::debug; -use tar::{Archive, Builder, EntryType, Header}; +use tar::{Archive, Builder, EntryType, Header, HeaderMode}; use crate::core::compiler::{BuildConfig, CompileMode, DefaultExecutor, Executor}; use crate::core::{Feature, Shell, Verbosity, Workspace}; @@ -472,35 +471,6 @@ fn check_repo_state( } } -fn timestamp() -> u64 { - if let Ok(var) = std::env::var("SOURCE_DATE_EPOCH") { - if let Ok(stamp) = var.parse() { - return stamp; - } - } - SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_secs() -} - -fn canonicalize_header(header: &mut Header) { - // Let's not include information about the user or their system here. 
- header.set_username("root").unwrap(); - header.set_groupname("root").unwrap(); - header.set_uid(0); - header.set_gid(0); - header.set_device_major(0).unwrap(); - header.set_device_minor(0).unwrap(); - - let mode = if header.mode().unwrap() & 0o100 != 0 { - 0o755 - } else { - 0o644 - }; - header.set_mode(mode); -} - fn tar( ws: &Workspace<'_>, ar_files: Vec, @@ -520,7 +490,6 @@ fn tar( let base_name = format!("{}-{}", pkg.name(), pkg.version()); let base_path = Path::new(&base_name); - let time = timestamp(); for ar_file in ar_files { let ArchiveFile { rel_path, @@ -540,9 +509,7 @@ fn tar( let metadata = file.metadata().chain_err(|| { format!("could not learn metadata for: `{}`", disk_path.display()) })?; - header.set_metadata(&metadata); - header.set_mtime(time); - canonicalize_header(&mut header); + header.set_metadata_in_mode(&metadata, HeaderMode::Deterministic); header.set_cksum(); ar.append_data(&mut header, &ar_path, &mut file) .chain_err(|| { @@ -557,9 +524,7 @@ fn tar( }; header.set_entry_type(EntryType::file()); header.set_mode(0o644); - header.set_mtime(time); header.set_size(contents.len() as u64); - canonicalize_header(&mut header); header.set_cksum(); ar.append_data(&mut header, &ar_path, contents.as_bytes()) .chain_err(|| format!("could not archive source file `{}`", rel_str))?; diff --git a/tests/testsuite/package.rs b/tests/testsuite/package.rs index d3900ef65ae..79fde9a26a2 100644 --- a/tests/testsuite/package.rs +++ b/tests/testsuite/package.rs @@ -1938,10 +1938,7 @@ fn reproducible_output() { .file("src/main.rs", r#"fn main() { println!("hello"); }"#) .build(); - // Timestamp is arbitrary and is the same used by git format-patch. 
- p.cargo("package") - .env("SOURCE_DATE_EPOCH", "1000684800") - .run(); + p.cargo("package").run(); assert!(p.root().join("target/package/foo-0.0.1.crate").is_file()); let f = File::open(&p.root().join("target/package/foo-0.0.1.crate")).unwrap(); @@ -1951,12 +1948,8 @@ fn reproducible_output() { let ent = ent.unwrap(); let header = ent.header(); assert_eq!(header.mode().unwrap(), 0o644); - assert_eq!(header.uid().unwrap(), 0); - assert_eq!(header.gid().unwrap(), 0); - assert_eq!(header.mtime().unwrap(), 1000684800); - assert_eq!(header.username().unwrap().unwrap(), "root"); - assert_eq!(header.groupname().unwrap().unwrap(), "root"); - assert_eq!(header.device_major().unwrap().unwrap(), 0); - assert_eq!(header.device_minor().unwrap().unwrap(), 0); + assert_eq!(header.mtime().unwrap(), 0); + assert_eq!(header.username().unwrap().unwrap(), ""); + assert_eq!(header.groupname().unwrap().unwrap(), ""); } }