Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(package): implement vcs status cache #14985

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
feat(package): implement vcs status cache
It may not be worthy for the majority.
Single file `git status` is generally fast.

On some slow file systems or lower-end machines,
skipping file system access might be helpful.

However, slow file system access usually happen on Window.
And we'll only have large amount of `git status` when
#14981 merges and the repo contains lots of symlinks.
Given symlink handling story is crazy in real world Windows,
I doubt anybody will hit the performance issue without this.
  • Loading branch information
weihanglo committed Dec 31, 2024
commit 87eba66c3404bcdf8cc808175dda28ddc11efb98
104 changes: 82 additions & 22 deletions src/cargo/ops/cargo_package/vcs.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Helpers to gather the VCS information for `cargo package`.

use std::collections::HashMap;
use std::path::Path;
use std::path::PathBuf;

Expand Down Expand Up @@ -39,14 +40,20 @@ pub struct GitVcsInfo {
pub struct VcsInfoBuilder<'a, 'gctx> {
ws: &'a Workspace<'gctx>,
opts: &'a PackageOpts<'gctx>,
/// Map each git workdir path to the workdir's git status cache.
caches: HashMap<PathBuf, RepoStatusCache>,
}

impl<'a, 'gctx> VcsInfoBuilder<'a, 'gctx> {
pub fn new(
ws: &'a Workspace<'gctx>,
opts: &'a PackageOpts<'gctx>,
) -> VcsInfoBuilder<'a, 'gctx> {
VcsInfoBuilder { ws, opts }
VcsInfoBuilder {
ws,
opts,
caches: Default::default(),
}
}

/// Builds an [`VcsInfo`] for the given `pkg` and its associated `src_files`.
Expand All @@ -55,7 +62,53 @@ impl<'a, 'gctx> VcsInfoBuilder<'a, 'gctx> {
pkg: &Package,
src_files: &[PathEntry],
) -> CargoResult<Option<VcsInfo>> {
check_repo_state(pkg, src_files, self.ws.gctx(), self.opts)
check_repo_state(pkg, src_files, self.ws.gctx(), self.opts, &mut self.caches)
}
}

/// Cache of git status inquries for a Git workdir.
struct RepoStatusCache {
repo: git2::Repository,
/// Status of each file path relative to the git workdir path.
cache: HashMap<PathBuf, git2::Status>,
}

impl RepoStatusCache {
fn new(repo: git2::Repository) -> RepoStatusCache {
RepoStatusCache {
repo,
cache: Default::default(),
}
}

/// Like [`git2::Repository::status_file`] but with cache.
fn status_file(&mut self, path: &Path) -> CargoResult<git2::Status> {
use std::collections::hash_map::Entry;
match self.cache.entry(path.into()) {
Entry::Occupied(entry) => {
tracing::trace!(
target: "cargo_package_vcs_cache",
"git status cache hit for `{}` at workdir `{}`",
path.display(),
self.repo.workdir().unwrap().display()
);
Ok(*entry.get())
}
Entry::Vacant(entry) => {
tracing::trace!(
target: "cargo_package_vcs_cache",
"git status cache miss for `{}` at workdir `{}`",
path.display(),
self.repo.workdir().unwrap().display()
);
let status = self.repo.status_file(path)?;
Ok(*entry.insert(status))
}
}
}

fn workdir(&self) -> &Path {
self.repo.workdir().unwrap()
}
}

Expand All @@ -72,6 +125,7 @@ pub fn check_repo_state(
src_files: &[PathEntry],
gctx: &GlobalContext,
opts: &PackageOpts<'_>,
caches: &mut HashMap<PathBuf, RepoStatusCache>,
) -> CargoResult<Option<VcsInfo>> {
let Ok(repo) = git2::Repository::discover(p.root()) else {
gctx.shell().verbose(|shell| {
Expand All @@ -91,14 +145,20 @@ pub fn check_repo_state(
};

debug!("found a git repo at `{}`", workdir.display());

let cache = caches
.entry(workdir.to_path_buf())
.or_insert_with(|| RepoStatusCache::new(repo));

let path = p.manifest_path();
let path = paths::strip_prefix_canonical(path, workdir).unwrap_or_else(|_| path.to_path_buf());
let Ok(status) = repo.status_file(&path) else {
let path =
paths::strip_prefix_canonical(path, cache.workdir()).unwrap_or_else(|_| path.to_path_buf());
let Ok(status) = cache.status_file(&path) else {
gctx.shell().verbose(|shell| {
shell.warn(format!(
"no (git) Cargo.toml found at `{}` in workdir `{}`",
path.display(),
workdir.display()
cache.workdir().display()
))
})?;
// No checked-in `Cargo.toml` found. This package may be irrelevant.
Expand All @@ -111,27 +171,27 @@ pub fn check_repo_state(
shell.warn(format!(
"found (git) Cargo.toml ignored at `{}` in workdir `{}`",
path.display(),
workdir.display()
cache.workdir().display()
))
})?;
// An ignored `Cargo.toml` found. This package may be irrelevant.
// Have to assume it is clean.
return Ok(None);
}

warn_symlink_checked_out_as_plain_text_file(gctx, src_files, &repo)?;
warn_symlink_checked_out_as_plain_text_file(gctx, src_files, &cache)?;

debug!(
"found (git) Cargo.toml at `{}` in workdir `{}`",
path.display(),
workdir.display(),
cache.workdir().display(),
);
let path_in_vcs = path
.parent()
.and_then(|p| p.to_str())
.unwrap_or("")
.replace("\\", "/");
let Some(git) = git(p, gctx, src_files, &repo, &opts)? else {
let Some(git) = git(p, gctx, src_files, cache, &opts)? else {
// If the git repo lacks essensial field like `sha1`, and since this field exists from the beginning,
// then don't generate the corresponding file in order to maintain consistency with past behavior.
return Ok(None);
Expand All @@ -156,9 +216,10 @@ pub fn check_repo_state(
fn warn_symlink_checked_out_as_plain_text_file(
gctx: &GlobalContext,
src_files: &[PathEntry],
repo: &git2::Repository,
cache: &RepoStatusCache,
) -> CargoResult<()> {
if repo
if cache
.repo
.config()
.and_then(|c| c.get_bool("core.symlinks"))
.unwrap_or(true)
Expand All @@ -171,7 +232,7 @@ fn warn_symlink_checked_out_as_plain_text_file(
shell.warn(format_args!(
"found symbolic links that may be checked out as regular files for git repo at `{}`\n\
This might cause the `.crate` file to include incorrect or incomplete files",
repo.workdir().unwrap().display(),
cache.workdir().display(),
))?;
let extra_note = if cfg!(windows) {
"\nAnd on Windows, enable the Developer Mode to support symlinks"
Expand All @@ -191,7 +252,7 @@ fn git(
pkg: &Package,
gctx: &GlobalContext,
src_files: &[PathEntry],
repo: &git2::Repository,
cache: &mut RepoStatusCache,
opts: &PackageOpts<'_>,
) -> CargoResult<Option<GitVcsInfo>> {
// This is a collection of any dirty or untracked files. This covers:
Expand All @@ -200,10 +261,10 @@ fn git(
// - ignored (in case the user has an `include` directive that
// conflicts with .gitignore).
let mut dirty_files = Vec::new();
collect_statuses(repo, &mut dirty_files)?;
collect_statuses(&cache.repo, &mut dirty_files)?;
// Include each submodule so that the error message can provide
// specifically *which* files in a submodule are modified.
status_submodules(repo, &mut dirty_files)?;
status_submodules(&cache.repo, &mut dirty_files)?;

// Find the intersection of dirty in git, and the src_files that would
// be packaged. This is a lazy n^2 check, but seems fine with
Expand All @@ -213,7 +274,7 @@ fn git(
.iter()
.filter(|src_file| dirty_files.iter().any(|path| src_file.starts_with(path)))
.map(|p| p.as_ref())
.chain(dirty_metadata_paths(pkg, repo)?.iter())
.chain(dirty_metadata_paths(pkg, cache)?.iter())
.map(|path| {
pathdiff::diff_paths(path, cwd)
.as_ref()
Expand All @@ -226,10 +287,10 @@ fn git(
if !dirty || opts.allow_dirty {
// Must check whetherthe repo has no commit firstly, otherwise `revparse_single` would fail on bare commit repo.
// Due to lacking the `sha1` field, it's better not record the `GitVcsInfo` for consistency.
if repo.is_empty()? {
if cache.repo.is_empty()? {
return Ok(None);
}
let rev_obj = repo.revparse_single("HEAD")?;
let rev_obj = cache.repo.revparse_single("HEAD")?;
Ok(Some(GitVcsInfo {
sha1: rev_obj.id().to_string(),
dirty,
Expand All @@ -252,9 +313,8 @@ fn git(
/// This is required because those paths may link to a file outside the
/// current package root, but still under the git workdir, affecting the
/// final packaged `.crate` file.
fn dirty_metadata_paths(pkg: &Package, repo: &git2::Repository) -> CargoResult<Vec<PathBuf>> {
fn dirty_metadata_paths(pkg: &Package, repo: &mut RepoStatusCache) -> CargoResult<Vec<PathBuf>> {
let mut dirty_files = Vec::new();
let workdir = repo.workdir().unwrap();
let root = pkg.root();
let meta = pkg.manifest().metadata();
for path in [&meta.license_file, &meta.readme] {
Expand All @@ -266,12 +326,12 @@ fn dirty_metadata_paths(pkg: &Package, repo: &git2::Repository) -> CargoResult<V
// Inside package root. Don't bother checking git status.
continue;
}
if let Ok(rel_path) = paths::strip_prefix_canonical(abs_path.as_path(), workdir) {
if let Ok(rel_path) = paths::strip_prefix_canonical(abs_path.as_path(), repo.workdir()) {
// Outside package root but under git workdir,
if repo.status_file(&rel_path)? != git2::Status::CURRENT {
dirty_files.push(if abs_path.is_symlink() {
// For symlinks, shows paths to symlink sources
workdir.join(rel_path)
repo.workdir().join(rel_path)
} else {
abs_path
});
Expand Down
16 changes: 13 additions & 3 deletions tests/testsuite/package.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1185,6 +1185,7 @@ fn vcs_status_check_for_each_workspace_member() {
"#,
)
.file("hobbit", "...")
.file("README.md", "")
.file(
"isengard/Cargo.toml",
r#"
Expand All @@ -1194,6 +1195,7 @@ fn vcs_status_check_for_each_workspace_member() {
homepage = "saruman"
description = "saruman"
license = "MIT"
readme = "../README.md"
"#,
)
.file("isengard/src/lib.rs", "")
Expand All @@ -1206,6 +1208,7 @@ fn vcs_status_check_for_each_workspace_member() {
homepage = "sauron"
description = "sauron"
license = "MIT"
readme = "../README.md"
"#,
)
.file("mordor/src/lib.rs", "")
Expand All @@ -1228,10 +1231,15 @@ fn vcs_status_check_for_each_workspace_member() {

// Ensure dirty files be reported only for one affected package.
p.cargo("package --workspace --no-verify")
.env("CARGO_LOG", "cargo_package_vcs_cache=trace")
.with_status(101)
.with_stderr_data(str![[r#"
[..] TRACE cargo_package_vcs_cache: git status cache miss for `isengard/Cargo.toml` at workdir `[ROOT]/foo/`
[..] TRACE cargo_package_vcs_cache: git status cache miss for `README.md` at workdir `[ROOT]/foo/`
[PACKAGING] isengard v0.0.0 ([ROOT]/foo/isengard)
[PACKAGED] 5 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)
[PACKAGED] 6 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)
[..] TRACE cargo_package_vcs_cache: git status cache miss for `mordor/Cargo.toml` at workdir `[ROOT]/foo/`
[..] TRACE cargo_package_vcs_cache: git status cache hit for `README.md` at workdir `[ROOT]/foo/`
[ERROR] 2 files in the working directory contain changes that were not yet committed into git:

mordor/src/lib.rs
Expand All @@ -1246,9 +1254,9 @@ to proceed despite this and include the uncommitted changes, pass the `--allow-d
p.cargo("package --workspace --no-verify --allow-dirty")
.with_stderr_data(str![[r#"
[PACKAGING] isengard v0.0.0 ([ROOT]/foo/isengard)
[PACKAGED] 5 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)
[PACKAGING] mordor v0.0.0 ([ROOT]/foo/mordor)
[PACKAGED] 6 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)
[PACKAGING] mordor v0.0.0 ([ROOT]/foo/mordor)
[PACKAGED] 7 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)

"#]])
.run();
Expand All @@ -1263,6 +1271,7 @@ to proceed despite this and include the uncommitted changes, pass the `--allow-d
"Cargo.toml.orig",
"src/lib.rs",
"Cargo.lock",
"README.md",
],
[(
".cargo_vcs_info.json",
Expand Down Expand Up @@ -1290,6 +1299,7 @@ to proceed despite this and include the uncommitted changes, pass the `--allow-d
"src/lib.rs",
"src/main.rs",
"Cargo.lock",
"README.md",
],
[(
".cargo_vcs_info.json",
Expand Down
Loading