indexer: Fix link resolution and hardlink handling for rpm

Unlike tar, cpio does not have a separate entry for each directory, so
the link resolution can't assume that directory entries exist for each
path component.

I also mistakenly assumed that cpio handled hardlinks similarly to tar,
but that's clearly not the case. libarchive does help a bit, but these
differences still suck.
This commit is contained in:
Yorhel 2017-01-18 13:07:26 +01:00
parent 608f79eb93
commit 8235fb28b8
4 changed files with 79 additions and 12 deletions

View file

@ -253,6 +253,10 @@ impl<'a> ArchiveEntry<'a> {
_ => FileType::Other, _ => FileType::Other,
} }
} }
pub fn nlink(&self) -> u32 {
unsafe { ffi::archive_entry_nlink(self.e) }
}
} }

View file

@ -38,6 +38,40 @@ use archive::{walk,ArchiveEntry,FileType};
* (* So apparently some man pages are close to 10MB...) * (* So apparently some man pages are close to 10MB...)
*/ */
/* How links are handled by libarchive and how we deal with that:
*
* symlinks:
* Simply seen as a file entry containing a path that can be resolved to another file entry. Path
* can be absolute or relative. Symlink can come before or after the referenced file.
* (Handling of this is described above)
*
* tar hardlinks:
* The first occurrence of the file is seen as a regular file. For any future occurrences,
* libarchive will give us the path of the first occurrence in archive_entry_hardlink(), which
* archive.rs will then merge into a Link() entry.
* Basically, hard links can be handled in the exact same way as symlinks.
*
* cpio hardlinks (old): Hard links are duplicated. No special handling needed.
*
* cpio hardlinks (new):
* A. The first occurrence of the file is seen as a regular file with size=0, nlinks>1
* B. Any later occurrences of the file are seen as a hardlink to the first occurrence.
* C. The last occurrence of the file is also seen as a hardlink to the first occurrence, but
* actually has the correct file size and body.
* How to handle:
* - Entry (A) is inserted in the FileList as an 'Hardlink'
* - Entry (B) is inserted as a normal 'Link' to (A)
* - Entry (C), detected by having size>0, and handled as follows:
* - The entry itself is inserted as a normal 'Link' to (A)
* - If (A) is interesting, the contents of the entry are processed with (A)'s path, and (A)'s
* entry is changed to 'Handled'
* - Then you end up with one 'Handled' entry and several 'Link' entries pointing to it. The link
* resolution mechanism will handle the rest.
* Note that this will not work if (A)'s path itself is not interesting. In that case the archive
* will be read a second time, but all entries pointing to (A), either through hardlinks or
* symlinks pointing to hardlinks, will resolve to the empty file entry. I strongly doubt there
* is a package archive like that.
*/
#[derive(Clone,Debug,PartialEq,Eq)] #[derive(Clone,Debug,PartialEq,Eq)]
pub enum EntryType { pub enum EntryType {
@ -48,6 +82,8 @@ pub enum EntryType {
Regular, Regular,
// Link to another file (interesting or not is irrelevant) // Link to another file (interesting or not is irrelevant)
Link(String), Link(String),
// A hardlink for which we don't have the file contents (yet)
Hardlink,
// Directory; need this information when resolving links // Directory; need this information when resolving links
Directory, Directory,
// Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves to // Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves to
@ -71,13 +107,14 @@ impl FileList {
* *
* interest_cb: Called on every path in the archive, should return whether the file is * interest_cb: Called on every path in the archive, should return whether the file is
* interesting (i.e. whether we want to know its contents). * interesting (i.e. whether we want to know its contents).
* entry_cb: Called once on every entry in the archive.
* file_cb: Called on every regular file for which interest_cb() showed an interest. * file_cb: Called on every regular file for which interest_cb() showed an interest.
* The callback accepts multiple path names, but this function will only provide one. * The callback accepts multiple path names, but this function will only provide one.
* *
* Returns a FileList struct that can be used to retreive all interesting non-regular files. * Returns a FileList struct that can be used to retreive all interesting non-regular files.
*/ */
pub fn read<F,G>(ent: Option<ArchiveEntry>, mut interest_cb: F, mut file_cb: G) -> Result<FileList> pub fn read<F,G,H>(ent: Option<ArchiveEntry>, interest_cb: F, mut entry_cb: G, mut file_cb: H) -> Result<FileList>
where F: FnMut(&ArchiveEntry) -> bool, G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()> where F: Fn(&str) -> bool, G: FnMut(&ArchiveEntry), H: FnMut(&[&str], &mut ArchiveEntry) -> Result<()>
{ {
let mut fl = FileList { let mut fl = FileList {
seen: HashMap::new(), seen: HashMap::new(),
@ -90,7 +127,7 @@ impl FileList {
None => { warn!("Invalid UTF-8 filename in archive"); return Ok(true) } None => { warn!("Invalid UTF-8 filename in archive"); return Ok(true) }
}; };
let ft = e.filetype(); let ft = e.filetype();
trace!("Archive entry: {:10} {} {:?}", e.size(), path, ft); trace!("Archive entry: {:10} #{} {} {:?}", e.size(), e.nlink(), path, ft);
// We ought to throw away the result of the previous entry with the same name and use // We ought to throw away the result of the previous entry with the same name and use
// this new entry instead, but fuck it. This case is too rare, so let's just warn. // this new entry instead, but fuck it. This case is too rare, so let's just warn.
@ -99,9 +136,13 @@ impl FileList {
return Ok(true); return Ok(true);
} }
entry_cb(&e);
let et = match ft { let et = match ft {
FileType::File => { FileType::File => {
if interest_cb(&e) { if e.size() == 0 && e.nlink() > 1 {
EntryType::Hardlink
} else if interest_cb(&path) {
let pathv = [&path as &str]; let pathv = [&path as &str];
try!(file_cb(&pathv[..], &mut e)); try!(file_cb(&pathv[..], &mut e));
EntryType::Handled EntryType::Handled
@ -110,7 +151,18 @@ impl FileList {
} }
}, },
FileType::Link(l) => { FileType::Link(l) => {
if interest_cb(&e) { // cpio hard link for which we have the file contents
if e.size() > 0 && e.nlink() > 1 {
if let Some((EntryType::Hardlink, ocomp)) = fl.resolve_link(&path, &l, 32) {
let opath = ocomp.join("/");
if interest_cb(&opath) {
let pathv = [&opath as &str];
try!(file_cb(&pathv[..], &mut e));
*fl.seen.get_mut(&opath).unwrap() = EntryType::Handled;
}
}
}
if interest_cb(&path) {
fl.links.push(path.clone()); fl.links.push(path.clone());
} }
EntryType::Link(l) EntryType::Link(l)
@ -160,8 +212,14 @@ impl FileList {
// If it's a directory, we're good // If it's a directory, we're good
Some(&EntryType::Directory) => (), Some(&EntryType::Directory) => (),
// If we didn't find anything but this isn't the last item, then it's possible that
// this is a directory for which no explicit entry was created. This happens with
// RPM files. Just continue and see if we find anything if we add more components.
None if i != comp.len()-1 => (),
// If it's a file or man page, it must be the last item. // If it's a file or man page, it must be the last item.
Some(& ref x@ EntryType::Regular) | Some(& ref x@ EntryType::Regular) |
Some(& ref x@ EntryType::Hardlink) |
Some(& ref x@ EntryType::Handled) => return Some(& ref x@ EntryType::Handled) => return
if i == comp.len()-1 { if i == comp.len()-1 {
Some((x.clone(), dest)) Some((x.clone(), dest))
@ -176,6 +234,7 @@ impl FileList {
// Same as above, with dirs we can continue, files have to be last // Same as above, with dirs we can continue, files have to be last
Some((EntryType::Directory, d)) => dest = d, Some((EntryType::Directory, d)) => dest = d,
x@Some((EntryType::Regular, _)) | x@Some((EntryType::Regular, _)) |
x@Some((EntryType::Hardlink, _)) |
x@Some((EntryType::Handled, _)) => return x@Some((EntryType::Handled, _)) => return
if i == comp.len()-1 { x } if i == comp.len()-1 { x }
else { else {
@ -215,7 +274,9 @@ impl FileList {
Some((EntryType::Regular, d)) => { Some((EntryType::Regular, d)) => {
let dstr = d.join("/"); let dstr = d.join("/");
missed.entry(dstr).or_insert_with(Vec::new).push(p.to_string()); missed.entry(dstr).or_insert_with(Vec::new).push(p.to_string());
} },
Some((EntryType::Hardlink, _)) =>
error!("Interesting link resolved to uninteresting hardlink: {}-> {}", p, dest),
_ => (), _ => (),
} }
} }
@ -260,7 +321,8 @@ mod tests {
let arch = Archive::open_archive(&mut f).unwrap(); let arch = Archive::open_archive(&mut f).unwrap();
let mut cnt = 0; let mut cnt = 0;
FileList::read(arch, FileList::read(arch,
|p| p.path().unwrap().starts_with("man/man"), |p| p.starts_with("man/man"),
|_| (),
|p,e| { |p,e| {
assert_eq!(cnt, 0); assert_eq!(cnt, 0);
cnt += 1; cnt += 1;
@ -290,7 +352,7 @@ mod tests {
assert_eq!(res("man/man1/symlinkbefore.1"), helloworld); assert_eq!(res("man/man1/symlinkbefore.1"), helloworld);
assert_eq!(res("man/man6/symlinkafter.6"), helloworld); assert_eq!(res("man/man6/symlinkafter.6"), helloworld);
assert_eq!(res("man/man1/badsymlink1.1"), None); //assert_eq!(res("man/man1/badsymlink1.1"), None);
assert_eq!(res("man/man1/badsymlink2.1"), None); assert_eq!(res("man/man1/badsymlink2.1"), None);
assert_eq!(res("man/man1/badsymlink3.1"), None); assert_eq!(res("man/man1/badsymlink3.1"), None);
assert_eq!(res("man/man1/badsymlink4.1"), None); assert_eq!(res("man/man1/badsymlink4.1"), None);
@ -313,6 +375,7 @@ mod tests {
assert_eq!(r.0, p.to_string()); assert_eq!(r.0, p.to_string());
assert_eq!(r.1, "man/man3/helloworld.3".to_string()); assert_eq!(r.1, "man/man3/helloworld.3".to_string());
}; };
res("man/man1/badsymlink1.1");
res("man/man1/doublesymlink1.1"); res("man/man1/doublesymlink1.1");
res("man/man1/doublesymlink2.1"); res("man/man1/doublesymlink2.1");
res("man/man1/symlinkbefore.1"); res("man/man1/symlinkbefore.1");

View file

@ -184,10 +184,7 @@ fn index_pkg(tr: &postgres::GenericConnection, mut opt: PkgOpt, verid: i32) -> s
}; };
let missed = with_pkg(&mut opt, |e, opt| { let missed = with_pkg(&mut opt, |e, opt| {
archread::FileList::read(e, |ent: &ArchiveEntry| { archread::FileList::read(e, man::ismanpath, |ent| opt.date.update(ent), &indexfunc)
opt.date.update(ent);
man::ismanpath(ent.path().unwrap())
}, &indexfunc)
})?.links(|src, dest| { insert_link(tr, verid, src, dest) }); })?.links(|src, dest| { insert_link(tr, verid, src, dest) });
if let Some(missed) = missed { if let Some(missed) = missed {

View file

@ -44,6 +44,9 @@ ln man3/helloworld.3 man6/hardlink.6
ln -s ../man3/helloworld.3 man1/symlinkbefore.1 ln -s ../man3/helloworld.3 man1/symlinkbefore.1
ln -s ../man3/helloworld.3 man6/symlinkafter.6 ln -s ../man3/helloworld.3 man6/symlinkafter.6
# Technically badsymlink1.1 is bad, but we can't detect this because package
# archives don't necessarily list all directories. So it will have to be
# considered valid.
ln -s notadir/../../man3/helloworld.3 man1/badsymlink1.1 ln -s notadir/../../man3/helloworld.3 man1/badsymlink1.1
ln -s man3/helloworld.3 man1/badsymlink2.1 ln -s man3/helloworld.3 man1/badsymlink2.1
ln -s ../man3/helloworld.3/. man1/badsymlink3.1 ln -s ../man3/helloworld.3/. man1/badsymlink3.1