diff options
| author | Marc Vertes <mvertes@free.fr> | 2024-10-02 22:43:42 +0200 |
|---|---|---|
| committer | Marc Vertes <mvertes@free.fr> | 2024-10-02 22:43:42 +0200 |
| commit | 282149e530d1d19fc9903b0a688de5b794540f48 (patch) | |
| tree | 4074b7d6bff00f9f92b271c7bfa03f79bad7e65c | |
| parent | 9ec19922addee0137f083c66b458c6aefe59191a (diff) | |
fix chunkify
| -rw-r--r-- | README.md | 46 | ||||
| -rw-r--r-- | main.go | 256 |
2 files changed, 277 insertions, 25 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..50604d0 --- /dev/null +++ b/README.md @@ -0,0 +1,46 @@ +# bb: basic backup + +Incremental encrypted backup system + +## Current design + +1. cksum original (sha256) +2. compress (gzip) +3. encrypt (aes256) +4. split in cksumed chunks. chunks are named from the hmac of + encrypted+compressed +5. build index of chunks +6. compress (gzip) and encrypt (aes) index +7. return index cksum + +Good: +- chunks are named from their compressed/crypted hmac. + +Problems: +- the salt (or iv in aes) must be set to 0. Weak encryption. +- dedup occurs only for append only files. The same chunk content will lead to + a different hmac if located at a different offset. + +To fix: +- chunk before compression +- name chunks from cksum of uncompressed/unencrypted data. +- then compress and encrypt (in this order). + +Chunk encryption can use randomized cipher, but a hmac must be added at end of +file (before encrypt) to check integrity without having to decrypt/decompress. + +## What tarsnap is doing + +1. cksum original (sha256) +2. build chunks of variable size +3. cksum uncompressed unencrypted chunks +4. compress chunk (deflate) +5. 
encrypt chunk (rsa2048) + HMAC + + +## References + +- tarsnap: https://www.tarsnap.com https://github.com/tarsnap/tarsnap +- chunker: https://github.com/karinushka/chunker +- borg: https://borgbackup.org +- rclone: https://rclone.org @@ -2,27 +2,57 @@ package main import ( "bufio" + "bytes" + "compress/flate" + "crypto/aes" + "crypto/cipher" + "crypto/rand" "crypto/sha256" + "errors" + "flag" "fmt" + "io" "io/fs" "log" "net/url" "os" + "path" "path/filepath" "time" ) type metadata struct { size, mtime, mode int64 - // sum [sha256.Size]byte - sum []byte + sum [sha256.Size]byte } type metamap map[string]metadata +const chunkMax = 1 << 17 // 131072 + func getIndex(root string, mm metamap) (index string, err error) { - filesystem := os.DirFS(root) - err = fs.WalkDir(filesystem, ".", func(path string, d fs.DirEntry, err error) error { + rfs := os.DirFS(root) + + // Get the stored private encryption key. + key, err := fs.ReadFile(rfs, filepath.Join(".bb", "key")) + if err != nil { + return "", err + } + log.Printf("key: %x\n", key) + + // Get the exclude file. 
+ xc, err := getExclude(root) + if err != nil { + return "", err + } + xc = append(xc, ".bb") + log.Println("xc", xc) + + // Walk the file tree to perform: + // - identification of changed files since previous backups + // - blockification of changed files + // - construction of backup index + err = fs.WalkDir(rfs, ".", func(path string, d fs.DirEntry, err error) error { if err != nil { return err } @@ -30,23 +60,30 @@ func getIndex(root string, mm metamap) (index string, err error) { if err != nil { return err } + if isExcluded(path, d.Name(), xc) { + return fs.SkipDir + } if info.IsDir() { - if d.Name() == ".bb" { - return fs.SkipDir - } return nil } - b, err := os.ReadFile(path) - if err != nil { - return err - } - - // md := metadata{info.Size(), info.ModTime().Unix(), int64(info.Mode()), sha256.Sum256(b)} - a := sha256.Sum256(b) - md := metadata{info.Size(), info.ModTime().Unix(), int64(info.Mode()), a[:]} + md := metadata{size: info.Size(), mtime: info.ModTime().Unix(), mode: int64(info.Mode())} - log.Println(path, md, mm[path]) + // Recompute file checksum only if file size, mode or mtime has changed. + if o, ok := mm[path]; ok && o.size == md.size && o.mtime == md.mtime && o.mode == md.mode { + md.sum = o.sum + } else { + b, err := os.ReadFile(path) + if err != nil { + return err + } + md.sum = sha256.Sum256(b) + log.Printf("archive %s %x\n", path, md.sum) + // chunk here, could be done in goroutine. 
+ if err := chunkify(root, fmt.Sprintf("%x", md.sum), b, key); err != nil { + return err + } + } index += fmt.Sprintf("%s %d %d %o %x\n", url.PathEscape(path), md.size, md.mtime, md.mode, md.sum) return nil @@ -54,6 +91,38 @@ func getIndex(root string, mm metamap) (index string, err error) { return index, err } +func getExclude(root string) (str []string, err error) { + f, err := os.Open(filepath.Join(root, ".bb", "exclude")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return str, err + } + defer f.Close() + scan := bufio.NewScanner(f) + for scan.Scan() { + if s := scan.Text(); len(s) > 0 { + str = append(str, s) + } + } + return str, scan.Err() +} + +func isExcluded(path, base string, excludes []string) bool { + for _, x := range excludes { + if match(base, x) || match(path, x) { + return true + } + } + return false +} + +func match(pattern, name string) bool { + if matched, err := path.Match(pattern, name); err != nil { + panic(err) + } else { + return matched + } +} + func readIndex(path string) (md metamap, err error) { f, err := os.Open(path) if err != nil { @@ -63,32 +132,49 @@ func readIndex(path string) (md metamap, err error) { md = metamap{} scan := bufio.NewScanner(f) for scan.Scan() { - var p string - var d metadata - n, err := fmt.Sscanf(scan.Text(), "%s %d %d %o %64x", &p, &d.size, &d.mtime, &d.mode, &d.sum) + var ( + p string + d metadata + s []byte + ) + n, err := fmt.Sscanf(scan.Text(), "%s %d %d %o %64x", &p, &d.size, &d.mtime, &d.mode, &s) if err != nil || n != 5 { return md, err } + copy(d.sum[:], s) path, err := url.PathUnescape(p) if err != nil { return md, err } md[path] = d } - if err := scan.Err(); err != nil { - return md, err - } - return md, nil + return md, scan.Err() } func initBB(root string) (current, previous string, err error) { - if err = os.MkdirAll(filepath.Join(root, ".bb", "data"), 0o750); err != nil { + if err = os.MkdirAll(filepath.Join(root, ".bb"), 0o750); err != nil { return "", "", err } - prevs, _ := 
fs.Glob(os.DirFS(root), filepath.Join(".bb", "index-*")) + rfs := os.DirFS(root) + + // Create a private encryption key if it doesn't already exist. + if _, err := fs.Stat(rfs, filepath.Join(".bb", "key")); errors.Is(err, fs.ErrNotExist) { + buf := make([]byte, 32) + if _, err := rand.Read(buf); err != nil { + return "", "", err + } + if err := os.WriteFile(filepath.Join(root, ".bb", "key"), buf, 0o600); err != nil { + return "", "", err + } + } + + // Retrieve the most recent backup index name. + prevs, _ := fs.Glob(rfs, filepath.Join(".bb", "index-*")) if len(prevs) > 0 { previous = prevs[len(prevs)-1] } + + // Create a current backup index. now := time.Now() y, m, d := now.Date() h, mn, s := now.Clock() @@ -96,12 +182,132 @@ return current, previous, nil } +// chunkify reads data and writes fixed size encrypted compressed blocks. +func chunkify(root, name string, data, key []byte) error { + // Steps: + // 1. checksum source file (done by caller), this will be the file index name + // 2. split in chunks. For each chunk, do: + // 1. checksum the chunk, before compression/encryption. This will be the chunk name. + // 2. compress the chunk (deflate) + // 3. encrypt and authenticate the result (aes-gcm) + // 4. write in chunk name (see above) + // 5. add chunk name to file index + // 3. compress and encrypt file index as above. + + if len(data) <= chunkMax { + return flatenc(root, name, data, key) + } + + // Split data in fixed size chunks. + chunks := split(data, chunkMax) + index := []byte{} + log.Println("chunkify", name) + log.Println("nchunks:", len(chunks), len(chunks[0])) + + for i, c := range chunks { + sum := sha256.Sum256(c) + index = append(index, sum[:]...) 
+ if err := flatenc(root, fmt.Sprintf("%x", sum), c, key); err != nil { + return fmt.Errorf("chunkify %s block %d: %w", name, i, err) + } + } + log.Println("file index:", name) + return flatenc(root, name, index, key) +} + +func flatenc(root, name string, data, key []byte) error { + // Flatten data. + var buf bytes.Buffer + zw, _ := flate.NewWriter(&buf, flate.DefaultCompression) + if _, err := zw.Write(data); err != nil { + return fmt.Errorf("flatenc flatten write: %w", err) + } + if err := zw.Close(); err != nil { + return fmt.Errorf("flatenc flatten close: %w", err) + } + + // Encrypt and authenticate flattened data. + cb, err := aes.NewCipher(key) + if err != nil { + return fmt.Errorf("flatenc cipher: %w", err) + } + aesgcm, err := cipher.NewGCM(cb) + if err != nil { + return fmt.Errorf("flatenc gcm: %w", err) + } + iv := make([]byte, aesgcm.NonceSize()) + if _, err := rand.Read(iv); err != nil { + return fmt.Errorf("flatenc iv: %w", err) + } + log.Printf("iv: %d %x\n", len(iv), iv) + enc := aesgcm.Seal(nil, iv, buf.Bytes(), nil) + + // Write the result to a file named from checksum of original content. 
+ return writeCksumFile(filepath.Join(root, ".bb", "chunks"), name, append(iv, enc...)) +} + +func unflatenc(root, name string, key []byte) (raw []byte, err error) { + enc, err := os.ReadFile(filepath.Join(root, ".bb", "chunks", name[:2], name[2:])) + if err != nil { + return nil, err + } + cb, err := aes.NewCipher(key) + if err != nil { + return nil, fmt.Errorf("unflatenc cipher: %w", err) + } + aesgcm, err := cipher.NewGCM(cb) + if err != nil { + return nil, fmt.Errorf("unflatenc gcm: %w", err) + } + l := aesgcm.NonceSize() + dec, err := aesgcm.Open(nil, enc[:l], enc[l:], nil) + if err != nil { + return nil, fmt.Errorf("unflatenc open: %w", err) + } + return io.ReadAll(flate.NewReader(bytes.NewBuffer(dec))) +} + +func split(data []byte, size int) (chunks [][]byte) { + offset := 0 + for offset+size < len(data) { + chunks = append(chunks, data[offset:offset+size]) + offset += size + } + chunks = append(chunks, data[offset:]) + return chunks +} + +func writeCksumFile(prefix, name string, data []byte) error { + head, tail := name[:2], name[2:] + if err := os.MkdirAll(filepath.Join(prefix, head), 0o750); err != nil { + return err + } + return os.WriteFile(filepath.Join(prefix, head, tail), data, 0o640) +} + func main() { log.SetFlags(log.Lshortfile) + + rfile := flag.String("read", "", "a cksum file") + flag.Parse() + wd, err := os.Getwd() if err != nil { log.Fatal(err) } + + if *rfile != "" { + log.Println("rfile:", *rfile) + key, err := os.ReadFile(filepath.Join(wd, ".bb", "key")) + if err != nil { + log.Fatal(err) + } + d, err := unflatenc(wd, *rfile, key) + log.Println("d", err, string(d)) + // log.Printf("d: %d %v %x\n", len(d), err, d) + return + } + index, oldindex, err := initBB(wd) if err != nil { log.Fatal(err) |
