refactor: move iterating over files to Path extension trait
ci/woodpecker/push/lint Pipeline was successful Details
ci/woodpecker/push/clippy Pipeline failed Details
ci/woodpecker/push/build Pipeline was successful Details

incremental-backups
Jef Roosens 2023-06-17 12:08:46 +02:00
parent 5275356353
commit f7235fb342
Signed by: Jef Roosens
GPG Key ID: B75D4F293C7052DB
2 changed files with 164 additions and 82 deletions

View File

@ -1,3 +1,4 @@
use crate::server::path::PathExt;
use chrono::{Local, Utc}; use chrono::{Local, Utc};
use flate2::write::GzEncoder; use flate2::write::GzEncoder;
use flate2::Compression; use flate2::Compression;
@ -14,52 +15,6 @@ extern "C" {
fn getegid() -> u32; fn getegid() -> u32;
} }
/// List all files in `src_dir` and all child directories.
/// List all files in `src_dir` and all child directories.
///
/// Paths in the returned set are relative to `src_dir`. Any entry (file or
/// directory) whose name is "cache" is skipped entirely.
fn files(src_dir: PathBuf) -> io::Result<HashSet<PathBuf>> {
    let mut found = HashSet::new();
    let mut pending = vec![src_dir.clone()];

    while let Some(dir) = pending.pop() {
        for entry in dir.read_dir()? {
            let entry = entry?;

            // "cache" entries are deliberately excluded from the listing.
            if entry.file_name() == "cache" {
                continue;
            }

            let path = entry.path();

            if entry.file_type()?.is_dir() {
                pending.push(path);
            } else {
                found.insert(path.strip_prefix(&src_dir).unwrap().to_path_buf());
            }
        }
    }

    Ok(found)
}
/// Check whether a file has been modified since the given timestamp.
///
/// Note that this function will *only* return true if it can determine with certainty that the
/// file has not been modified. If any errors occur while obtaining the required metadata (e.g. if
/// the file system does not support this metadata), this function will return false.
/// Check whether a file has been modified since the given timestamp.
///
/// Note that this function will *only* return true if it can determine with certainty that the
/// file has not been modified. If any errors occur while obtaining the required metadata (e.g. if
/// the file system does not support this metadata), this function will return false.
fn not_modified_since<T: AsRef<Path>>(time: chrono::DateTime<Utc>, path: T) -> bool {
    // Chain the two fallible lookups; any failure means we cannot prove the
    // file is unmodified, so we conservatively report false.
    path.as_ref()
        .metadata()
        .and_then(|metadata| metadata.modified())
        .map(|last_modified| {
            let last_modified: chrono::DateTime<Utc> = last_modified.into();
            last_modified.with_timezone(&Local) < time
        })
        .unwrap_or(false)
}
#[derive(Debug, PartialEq, Serialize, Deserialize)] #[derive(Debug, PartialEq, Serialize, Deserialize)]
pub enum BackupType { pub enum BackupType {
Full, Full,
@ -162,11 +117,7 @@ impl Backup {
/// Returns a pointer to this backup's previous backup by cloning the Arc pointer. /// Returns a pointer to this backup's previous backup by cloning the Arc pointer.
pub fn previous(&self) -> Option<Arc<Self>> { pub fn previous(&self) -> Option<Arc<Self>> {
if let Some(previous) = &self.previous { self.previous.as_ref().map(Arc::clone)
Some(Arc::clone(&previous))
} else {
None
}
} }
/// Calculate the full state of the backup by applying all its ancestors' delta's in order, /// Calculate the full state of the backup by applying all its ancestors' delta's in order,
@ -193,7 +144,7 @@ impl Backup {
None None
} else if let Some(previous) = &self.previous { } else if let Some(previous) = &self.previous {
if n == 1 { if n == 1 {
Some(Arc::clone(&previous)) Some(Arc::clone(previous))
} else { } else {
previous.ancestor(n - 1) previous.ancestor(n - 1)
} }
@ -232,26 +183,27 @@ impl Backup {
let enc = GzEncoder::new(tar_gz, Compression::default()); let enc = GzEncoder::new(tar_gz, Compression::default());
let mut ar = tar::Builder::new(enc); let mut ar = tar::Builder::new(enc);
let mut added: HashMap<PathBuf, HashSet<PathBuf>> = HashMap::new(); let mut delta = BackupDelta::new();
for (dir_in_tar, src_dir) in dirs { for (dir_in_tar, src_dir) in dirs {
let files = files(src_dir.clone())?; let mut added_files: HashSet<PathBuf> = HashSet::new();
for path in &files { for entry in src_dir.read_dir_recursive()?.ignored("cache").files() {
ar.append_path_with_name(src_dir.join(path), dir_in_tar.join(path))?; let path = entry?.path();
let stripped = path.strip_prefix(&src_dir).unwrap();
ar.append_path_with_name(&path, dir_in_tar.join(stripped))?;
added_files.insert(stripped.to_path_buf());
} }
added.insert(dir_in_tar, files); delta.added.insert(dir_in_tar, added_files);
} }
Ok(Backup { Ok(Backup {
previous: None, previous: None,
type_: BackupType::Full, type_: BackupType::Full,
start_time, start_time,
delta: BackupDelta { delta,
added,
removed: HashMap::new(),
},
}) })
} }
@ -274,17 +226,19 @@ impl Backup {
let mut delta = BackupDelta::new(); let mut delta = BackupDelta::new();
for (dir_in_tar, src_dir) in dirs { for (dir_in_tar, src_dir) in dirs {
let files = files(src_dir.clone())?; let mut all_files: HashSet<PathBuf> = HashSet::new();
let added_files = files let mut added_files: HashSet<PathBuf> = HashSet::new();
.iter()
// This explicit negation is because we wish to also include files for which we
// couldn't determine the last modified time
.filter(|p| !not_modified_since(previous.start_time, src_dir.join(p)))
.cloned()
.collect::<HashSet<PathBuf>>();
for path in added_files.iter() { for entry in src_dir.read_dir_recursive()?.ignored("cache").files() {
ar.append_path_with_name(src_dir.join(path), dir_in_tar.join(path))?; let path = entry?.path();
let stripped = path.strip_prefix(&src_dir).unwrap();
if !path.not_modified_since(previous.start_time) {
ar.append_path_with_name(&path, dir_in_tar.join(stripped))?;
added_files.insert(stripped.to_path_buf());
}
all_files.insert(stripped.to_path_buf());
} }
delta.added.insert(dir_in_tar.clone(), added_files); delta.added.insert(dir_in_tar.clone(), added_files);
@ -292,7 +246,7 @@ impl Backup {
if let Some(previous_files) = previous_state.get(&dir_in_tar) { if let Some(previous_files) = previous_state.get(&dir_in_tar) {
delta.removed.insert( delta.removed.insert(
dir_in_tar, dir_in_tar,
previous_files.difference(&files).cloned().collect(), previous_files.difference(&all_files).cloned().collect(),
); );
} }
} }

View File

@ -1,19 +1,147 @@
use chrono::Utc; use chrono::{Local, Utc};
use std::collections::HashSet; use std::collections::HashSet;
use std::path::PathBuf; use std::ffi::OsString;
use std::fs::DirEntry;
use std::path::{Path, PathBuf};
use std::{fs, io}; use std::{fs, io};
struct ReadDirRecursive { pub struct ReadDirRecursive {
ignored_dirs: HashSet<PathBuf>, ignored: HashSet<OsString>,
read_dir: Option<fs::ReadDir>, read_dir: fs::ReadDir,
stack: Vec<fs::ReadDir>, dir_stack: Vec<PathBuf>,
files_only: bool,
} }
impl ReadDirRecursive { impl ReadDirRecursive {
// pub fn new() /// Start the iterator for a new directory
pub fn start<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let path = path.as_ref();
let read_dir = path.read_dir()?;
Ok(ReadDirRecursive {
ignored: HashSet::new(),
read_dir,
dir_stack: Vec::new(),
files_only: false,
})
} }
trait PathExt { pub fn ignored<S: Into<OsString>>(mut self, s: S) -> Self {
fn modified_since(timestamp: chrono::DateTime<Utc>) -> bool; self.ignored.insert(s.into());
fn read_dir_recusive() -> ReadDirRecursive;
self
}
/// Configure the iterator to only yield entries that are regular files.
///
/// Directories are still traversed; they are simply not returned as items.
/// Entries whose file type cannot be determined are also withheld.
pub fn files(mut self) -> Self {
self.files_only = true;
self
}
/// Tries to populate the `read_dir` field with a new `ReadDir` instance to consume.
///
/// Returns `Ok(true)` if a queued directory was opened, `Ok(false)` if the
/// queue is empty, and an error if opening the directory failed.
fn next_read_dir(&mut self) -> io::Result<bool> {
    match self.dir_stack.pop() {
        Some(dir) => {
            self.read_dir = dir.read_dir()?;
            Ok(true)
        }
        None => Ok(false),
    }
}
/// Convenience method to add a new directory to the stack.
///
/// Entries that errored, or that are not directories, are left alone here;
/// errored entries are surfaced by the iterator itself.
fn push_entry(&mut self, entry: &io::Result<DirEntry>) {
    if let Ok(entry) = entry {
        let path = entry.path();

        if path.is_dir() {
            self.dir_stack.push(path);
        }
    }
}
/// Determine whether an entry should be returned by the iterator.
///
/// Errors are always returned so the consumer can observe them. Otherwise an
/// entry is withheld when its file name is in the ignore set, or when
/// `files_only` is set and the entry is not known to be a regular file.
fn should_return(&self, entry: &io::Result<DirEntry>) -> bool {
    let entry = match entry {
        Ok(entry) => entry,
        // Errors are passed through so the consumer can handle them.
        Err(_) => return true,
    };

    if self.ignored.contains(&entry.file_name()) {
        return false;
    }

    if self.files_only {
        // If the file type can't be determined, we don't return the entry.
        matches!(entry.file_type(), Ok(file_type) if file_type.is_file())
    } else {
        true
    }
}
}
/// Stack-based iteration over a directory tree (depth-first over
/// directories: pending directories are popped LIFO from `dir_stack`).
/// I/O errors for individual entries are yielded as `Err` items rather than
/// aborting the iteration.
impl Iterator for ReadDirRecursive {
type Item = io::Result<DirEntry>;
fn next(&mut self) -> Option<Self::Item> {
loop {
// First, we try to consume the current directory's items
while let Some(entry) = self.read_dir.next() {
// Subdirectories are queued for later traversal, regardless of filtering.
self.push_entry(&entry);
if self.should_return(&entry) {
return Some(entry);
}
}
// If we get an error while setting up a new directory, we return this, otherwise we
// keep trying to consume the directories
match self.next_read_dir() {
// A new directory was opened; drain it on the next pass of the loop.
Ok(true) => (),
// There's no more directories to traverse, so the iterator is done
Ok(false) => return None,
Err(e) => return Some(Err(e)),
}
}
}
}
pub trait PathExt {
/// Confirm whether the file has not been modified since the given timestamp.
///
/// This function will only return true if it can determine with certainty that the file hasn't
/// been modified.
///
/// # Arguments
///
/// * `timestamp` - Timestamp to compare the file's modified time with
///
/// # Returns
///
/// True if the file is known for certain not to have been modified since `timestamp`; false
/// otherwise (including when the required metadata could not be read).
fn not_modified_since(&self, timestamp: chrono::DateTime<Utc>) -> bool;
/// An extension of `read_dir` that runs through the entire underlying directory structure.
///
/// NOTE(review): pending directories are kept on a LIFO stack, so the traversal is
/// depth-first over directories, not breadth-first as previously documented.
fn read_dir_recursive(&self) -> io::Result<ReadDirRecursive>;
}
impl PathExt for Path {
fn not_modified_since(&self, timestamp: chrono::DateTime<Utc>) -> bool {
if let Ok(metadata) = self.metadata() {
if let Ok(last_modified) = metadata.modified() {
let t: chrono::DateTime<Utc> = last_modified.into();
let t = t.with_timezone(&Local);
return t < timestamp;
}
}
false
}
fn read_dir_recursive(&self) -> io::Result<ReadDirRecursive> {
ReadDirRecursive::start(self)
}
} }