From 8f3b91ba32bb13190211bab7429d06760b418074 Mon Sep 17 00:00:00 2001
From: Alexander Neumann <alexander@bumpern.de>
Date: Sun, 26 Apr 2015 15:36:49 +0200
Subject: [PATCH] Add packer, update Design.md

---
 doc/Design.md     | 120 ++++++++++++---------
 pack/pack.go      | 270 ++++++++++++++++++++++++++++++++++++++++++++++
 pack/pack_test.go | 111 +++++++++++++++++++
 3 files changed, 450 insertions(+), 51 deletions(-)
 create mode 100644 pack/pack.go
 create mode 100644 pack/pack_test.go

diff --git a/doc/Design.md b/doc/Design.md
index ff28b207b..93805c339 100644
--- a/doc/Design.md
+++ b/doc/Design.md
@@ -64,6 +64,10 @@ The basic layout of a sample restic repository is shown below:
 
     /tmp/restic-repo
     ├── data
+    │   ├── 21
+    │   │   └── 2159dd48f8a24f33c307b750592773f8b71ff8d11452132a7b2e2a6a01611be1
+    │   ├── 32
+    │   │   └── 32ea976bc30771cebad8285cd99120ac8786f9ffd42141d452458089985043a5
     │   ├── 59
     │   │   └── 59fe4bcde59bd6222eba87795e35a90d82cd2f138a27b6835032b7b58173a426
     │   ├── 73
@@ -71,25 +75,14 @@ The basic layout of a sample restic repository is shown below:
     │   [...]
     ├── id
     ├── index
-    │   └── c38f5fb68307c6a3e3aa945d556e325dc38f5fb68307c6a3e3aa945d556e325d
+    │   ├── c38f5fb68307c6a3e3aa945d556e325dc38f5fb68307c6a3e3aa945d556e325d
+    │   └── ca171b1b7394d90d330b265d90f506f9984043b342525f019788f97e745c71fd
     ├── keys
     │   └── b02de829beeb3c01a63e6b25cbd421a98fef144f03b9a02e46eff9e2ca3f0bd7
     ├── locks
     ├── snapshots
     │   └── 22a5af1bdc6e616f8a29579458c49627e01b32210d09adb288d1ecda7c5711ec
     ├── tmp
-    ├── trees
-    │   ├── 21
-    │   │   └── 2159dd48f8a24f33c307b750592773f8b71ff8d11452132a7b2e2a6a01611be1
-    │   ├── 32
-    │   │   └── 32ea976bc30771cebad8285cd99120ac8786f9ffd42141d452458089985043a5
-    │   ├── 95
-    │   │   └── 95f75feb05a7cc73e328b2efa668b1ea68f65fece55a93bc65aff6cd0bcfeefc
-    │   ├── b8
-    │   │   └── b8138ab08a4722596ac89c917827358da4672eac68e3c03a8115b88dbf4bfb59
-    │   ├── e0
-    │   │   └── e01150928f7ad24befd6ec15b087de1b9e0f92edabd8e5cabb3317f8b20ad044
-    │   [...]
     └── version
 
 A repository can be initialized with the `restic init` command, e.g.:
@@ -99,39 +92,47 @@ A repository can be initialized with the `restic init` command, e.g.:
 Pack Format
 -----------
 
-All files in the repository except Key, Tree and Data files just contain raw
-data, stored as `IV || Ciphertext || MAC`. Tree and Data files may contain
-several Blobs of data. The format is described in the following.
+All files in the repository except Key and Data files just contain raw data,
+stored as `IV || Ciphertext || MAC`. Data files may contain one or more Blobs
+of data. The format is described in the following.
 
-A Pack starts with a nonce and a header, the header describes the content and
-is encrypted and signed. The Pack's structure is as follows:
+The Pack's structure is as follows:
 
-    NONCE || Header_Length ||
-    IV_Header || Ciphertext_Header || MAC_Header  ||
-    IV_Blob_1 || Ciphertext_Blob_1 || MAC_Blob_1 ||
-    [...]
-    IV_Blob_n || Ciphertext_Blob_n || MAC_Blob_n ||
-    MAC
+    EncryptedBlob1 || ... || EncryptedBlobN || EncryptedHeader || Header_Length
 
-`NONCE` consists of 16 bytes and `Header_Length` is a four byte integer in
-little-endian encoding.
+At the end of the Pack is a header, which describes the content and is
+encrypted and signed. `Header_Length` is the length of the encrypted header
+encoded as a four byte integer in little-endian encoding.
 
-All the parts (`Ciphertext_Header`, `Ciphertext_Blob1` etc.) are signed and
-encrypted independently. In addition, the complete pack is signed using
-`NONCE`. This enables repository reorganisation without having to touch the
-encrypted Blobs. In addition it also allows efficient indexing, for only the
-header needs to be read in order to find out which Blobs are contained in the
-Pack. Since the header is signed, authenticity of the header can be checked
-without having to read the complete Pack.
+All the blobs (`EncryptedBlob1`, `EncryptedBlobN` etc.) are signed and
+encrypted independently. This enables repository reorganisation without having
+to touch the encrypted Blobs. In addition it also allows efficient indexing,
+for only the header needs to be read in order to find out which Blobs are
+contained in the Pack. Since the header is signed, authenticity of the header
+can be checked without having to read the complete Pack.
 
 After decryption, a Pack's header consists of the following elements:
 
-    Length(IV_Blob_1+Ciphertext_Blob1+MAC_Blob_1)  || Hash(Plaintext_Blob_1) ||
+    Type_Blob1 || Length(EncryptedBlob1) || Hash(Plaintext_Blob1) ||
     [...]
-    Length(IV_Blob_n+Ciphertext_Blob_n+MAC_Blob_n) || Hash(Plaintext_Blob_n) ||
+    Type_BlobN || Length(EncryptedBlobN) || Hash(Plaintext_BlobN) ||
 
 This is enough to calculate the offsets for all the Blobs in the Pack. Length
-is the length of a Blob as a four byte integer in little-endian format.
+is the length of a Blob as a four byte integer in little-endian format. The
+type field is a one byte field and labels the content of a blob according to
+the following table:
+
+ Type | Meaning
+ -----|---------
+    0 | data
+    1 | tree
+
+All other types are invalid; more types may be added in the future.
+
+For reconstructing the index or parsing a pack without an index, first the last
+four bytes must be read in order to find the length of the header. Afterwards,
+the header can be read and parsed, which yields all plaintext hashes, types,
+offsets and lengths of all included blobs.
 
 Indexing
 --------
@@ -139,23 +140,40 @@ Indexing
 Index files contain information about Data and Tree Blobs and the Packs they
 are contained in and store this information in the repository. When the local
 cached index is not accessible any more, the index files can be downloaded and
-used to reconstruct the index. The index Blobs are encrypted and signed like
-Data and Tree Blobs, so the outer structure is `IV || Ciphertext || MAC` again.
-The plaintext consists of a JSON document like the following:
+used to reconstruct the index. The files are encrypted and signed like Data and
+Tree Blobs, so the outer structure is `IV || Ciphertext || MAC` again. The
+plaintext consists of a JSON document like the following:
 
-    [
-      {
-         "id": "73d04e6125cf3c28a299cc2f3cca3b78ceac396e4fcf9575e34536b26782413c",
-         "blobs": [
-            "3ec79977ef0cf5de7b08cd12b874cd0f62bbaf7f07f3497a5b1bbcc8cb39b1ce",
-            "9ccb846e60d90d4eb915848add7aa7ea1e4bbabfc60e573db9f7bfb2789afbae",
-            "d3dc577b4ffd38cc4b32122cabf8655a0223ed22edfd93b353dc0c3f2b0fdf66"
-         ]
-      }
-    ]
+    [ {
+      "id": "73d04e6125cf3c28a299cc2f3cca3b78ceac396e4fcf9575e34536b26782413c",
+      "blobs": [
+        {
+          "id": "3ec79977ef0cf5de7b08cd12b874cd0f62bbaf7f07f3497a5b1bbcc8cb39b1ce",
+          "type": "data",
+          "offset": 0,
+          "length": 25
+        },{
+          "id": "9ccb846e60d90d4eb915848add7aa7ea1e4bbabfc60e573db9f7bfb2789afbae",
+          "type": "tree",
+          "offset": 38,
+          "length": 100
+        },
+        {
+          "id": "d3dc577b4ffd38cc4b32122cabf8655a0223ed22edfd93b353dc0c3f2b0fdf66",
+          "type": "data",
+          "offset": 150,
+          "length": 123
+        }
+      ]
+    } ]
 
-This JSON document lists all the Blobs with contents. In this example, the Pack
-`73d04e61` contains three Blobs, the plaintext hashes are listed afterwards.
+This JSON document lists Blobs with contents. In this example, the Pack
+`73d04e61` contains two data Blobs and one tree Blob; for each Blob, the
+plaintext hash, type, offset and length are listed.
+
+There may be an arbitrary number of index files, containing information on
+non-disjoint sets of Packs. The number of packs described in a single file is
+chosen so that the file size is kept below 8 MiB.
 
 Keys, Encryption and MAC
 ------------------------
diff --git a/pack/pack.go b/pack/pack.go
new file mode 100644
index 000000000..b1a3e7ef7
--- /dev/null
+++ b/pack/pack.go
@@ -0,0 +1,270 @@
+package pack
+
+import (
+	"crypto/sha256"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"sync"
+
+	"github.com/restic/restic/backend"
+	"github.com/restic/restic/crypto"
+)
+
+type BlobType uint8 // BlobType labels the content of a blob within a pack.
+
+const (
+	Data BlobType = 0 // blob is labelled as data
+	Tree BlobType = 1 // blob is labelled as tree; typed explicitly, a bare "= 1" would be an untyped constant
+)
+
+// MarshalJSON encodes t as the JSON string "data" or "tree".
+func (t BlobType) MarshalJSON() ([]byte, error) {
+	if t == Data {
+		return []byte(`"data"`), nil
+	}
+	if t == Tree {
+		return []byte(`"tree"`), nil
+	}
+	return nil, errors.New("unknown blob type")
+}
+
+// UnmarshalJSON sets t from the JSON string "data" or "tree".
+func (t *BlobType) UnmarshalJSON(buf []byte) error {
+	if string(buf) == `"data"` {
+		*t = Data
+		return nil
+	}
+	if string(buf) == `"tree"` {
+		*t = Tree
+		return nil
+	}
+	return errors.New("unknown blob type")
+}
+
+// Blob describes a single blob within a pack.
+type Blob struct {
+	Type   BlobType   // label of the blob's content (Data or Tree)
+	Length uint32     // number of bytes the blob occupies in the pack
+	ID     backend.ID // ID recorded for the blob in the pack header
+	Offset uint       // position of the blob's first byte within the pack
+}
+
+// GetReader seeks rd to the offset of blob e and returns a reader that
+// yields at most e.Length bytes from there.
+func (e Blob) GetReader(rd io.ReadSeeker) (io.Reader, error) {
+	start := int64(e.Offset)
+	if _, err := rd.Seek(start, 0); err != nil {
+		return nil, err
+	}
+
+	return io.LimitReader(rd, int64(e.Length)), nil
+}
+
+// Packer is used to create a new Pack.
+type Packer struct {
+	blobs []Blob // one entry per call to Add, in call order
+
+	bytes uint                   // bytes accounted for so far (blob data, plus header after Finalize)
+	k     *crypto.Key            // key passed to crypto.EncryptTo for the header
+	wr    io.Writer              // writer handed to NewPacker
+	hw    *backend.HashingWriter // hashing writer around wr; all pack output goes through it
+
+	m sync.Mutex // held by Add, Finalize, ID, Size, Count and Blobs
+}
+
+// NewPacker returns a Packer that writes to w; k is used to encrypt the header.
+func NewPacker(k *crypto.Key, w io.Writer) *Packer {
+	hw := backend.NewHashingWriter(w, sha256.New())
+	return &Packer{k: k, wr: w, hw: hw}
+}
+
+// Add copies the data read from rd into the pack and records it as a blob of
+// type t with ID id. It returns the number of bytes copied.
+func (p *Packer) Add(t BlobType, id backend.ID, rd io.Reader) (int64, error) {
+	p.m.Lock()
+	defer p.m.Unlock()
+
+	written, err := io.Copy(p.hw, rd)
+
+	// the blob is recorded and counted even if the copy returned an error;
+	// its offset is the number of bytes counted before this call
+	entry := Blob{Type: t, ID: id, Length: uint32(written), Offset: p.bytes}
+	p.blobs = append(p.blobs, entry)
+	p.bytes += uint(written)
+
+	return written, err
+}
+
+var entrySize = binary.Size(BlobType(0)) + binary.Size(uint32(0)) + backend.IDSize // size in bytes of one header entry (type, length, ID)
+
+// headerEntry is the fixed-size record read and written with encoding/binary for each blob in the header.
+type headerEntry struct {
+	Type   BlobType             // blob type
+	Length uint32               // blob length, taken from Blob.Length
+	ID     [backend.IDSize]byte // blob ID as a fixed-size array
+}
+
+// Finalize writes the encrypted header for all added blobs, followed by the
+// header length, and thereby completes the pack. Returned is the complete
+// number of bytes written, including the header. After Finalize() has
+// finished, the ID of this pack can be obtained by calling ID().
+func (p *Packer) Finalize() (int64, error) {
+	p.m.Lock()
+	defer p.m.Unlock()
+
+	// total starts with the bytes counted for the blobs and grows with each
+	// part of the header that is written
+	total := int64(p.bytes)
+
+	// data written to hdr is passed through crypto.EncryptTo into the pack
+	hdr := crypto.EncryptTo(p.k, p.hw)
+
+	// one fixed-size entry per blob, in the order the blobs were added
+	for i := range p.blobs {
+		var entry headerEntry
+		entry.Type = p.blobs[i].Type
+		entry.Length = p.blobs[i].Length
+		copy(entry.ID[:], p.blobs[i].ID)
+
+		if err := binary.Write(hdr, binary.LittleEndian, entry); err != nil {
+			return total, err
+		}
+
+		total += int64(entrySize)
+	}
+
+	// closing the writer finalizes the encrypted header
+	if err := hdr.Close(); err != nil {
+		return total, err
+	}
+
+	// account for crypto overhead
+	total += crypto.Extension
+
+	// the length field at the very end covers the encrypted header: all
+	// entries plus the crypto overhead
+	headerLen := uint32(len(p.blobs)*entrySize + crypto.Extension)
+	if err := binary.Write(p.hw, binary.LittleEndian, headerLen); err != nil {
+		return total, err
+	}
+
+	total += int64(binary.Size(uint32(0)))
+
+	p.bytes = uint(total)
+
+	return total, nil
+}
+
+// ID returns the current sum of the hashing writer, the ID of all data written so far.
+func (p *Packer) ID() backend.ID {
+	p.m.Lock()
+	defer p.m.Unlock()
+	sum := p.hw.Sum(nil)
+	return sum
+}
+
+// Size reports the number of bytes written so far.
+func (p *Packer) Size() uint {
+	p.m.Lock()
+	defer p.m.Unlock()
+	size := p.bytes
+	return size
+}
+
+// Count reports how many blobs have been added to this packer.
+func (p *Packer) Count() int {
+	p.m.Lock()
+	defer p.m.Unlock()
+	count := len(p.blobs)
+	return count
+}
+
+// Blobs returns the packer's own slice of recorded blobs (not a copy).
+func (p *Packer) Blobs() []Blob {
+	p.m.Lock()
+	defer p.m.Unlock()
+	recorded := p.blobs
+	return recorded
+}
+
+// Writer returns the writer that was passed to NewPacker (not the hashing writer around it).
+func (p *Packer) Writer() io.Writer {
+	return p.wr
+}
+
+func (p *Packer) String() string { // String describes the packer by blob and byte count.
+	return fmt.Sprintf("<Packer %d blobs, %d bytes>", len(p.blobs), p.bytes) // NOTE(review): read without holding p.m
+}
+
+// Unpacker is used to read individual blobs from a pack.
+type Unpacker struct {
+	rd      io.ReadSeeker // pack to read from
+	Entries []Blob        // blobs in the pack: passed in by the caller or parsed from the header
+	k       *crypto.Key   // key the header was decrypted with
+}
+
+// NewUnpacker returns an Unpacker for the pack in rd. The header is decrypted
+// with k; if entries is nil, the list of blobs is parsed from the header.
+func NewUnpacker(k *crypto.Key, entries []Blob, rd io.ReadSeeker) (*Unpacker, error) {
+	var err error
+	ls := binary.Size(uint32(0))
+
+	// the header length is stored in the last ls bytes (whence 2: from the end)
+	_, err = rd.Seek(-int64(ls), 2)
+	if err != nil {
+		return nil, fmt.Errorf("seeking to read header length failed: %v", err)
+	}
+
+	// read length
+	var l uint32
+	err = binary.Read(rd, binary.LittleEndian, &l)
+	if err != nil {
+		return nil, fmt.Errorf("reading header length failed: %v", err)
+	}
+
+	// reset to the beginning of the header, l bytes before the length field
+	_, err = rd.Seek(-int64(ls)-int64(l), 2)
+	if err != nil {
+		return nil, fmt.Errorf("seeking to read header failed: %v", err)
+	}
+
+	// read header; this happens even when entries were passed in
+	hrd, err := crypto.DecryptFrom(k, io.LimitReader(rd, int64(l)))
+	if err != nil {
+		return nil, err
+	}
+
+	if entries == nil {
+		pos := uint(0)
+		for {
+			e := headerEntry{}
+			err = binary.Read(hrd, binary.LittleEndian, &e)
+			if err == io.EOF { // no further entries in the header
+				break
+			}
+
+			if err != nil {
+				return nil, err
+			}
+
+			entries = append(entries, Blob{
+				Type:   e.Type,
+				Length: e.Length,
+				ID:     e.ID[:],
+				Offset: pos,
+			})
+
+			pos += uint(e.Length) // the header stores lengths only; offsets are the running sum
+		}
+	}
+
+	p := &Unpacker{
+		rd:      rd,
+		k:       k,
+		Entries: entries,
+	}
+
+	return p, nil
+}
diff --git a/pack/pack_test.go b/pack/pack_test.go
new file mode 100644
index 000000000..c3d9c26bf
--- /dev/null
+++ b/pack/pack_test.go
@@ -0,0 +1,111 @@
+package pack_test
+
+import (
+	"bytes"
+	"crypto/rand"
+	"crypto/sha256"
+	"encoding/binary"
+	"encoding/json"
+	"io"
+	"io/ioutil"
+	"testing"
+
+	"github.com/restic/restic/backend"
+	"github.com/restic/restic/crypto"
+	"github.com/restic/restic/pack"
+	. "github.com/restic/restic/test"
+)
+
+var lengths = []int{23, 31650, 25860, 10928, 13769, 19862, 5211, 127, 13690, 30231} // sizes in bytes of the random blobs packed by TestCreatePack
+
+// var lengths = []int{200}
+
+// TestCreatePack packs random blobs, checks the size reported by the packer
+// and reads every blob back through an Unpacker.
+func TestCreatePack(t *testing.T) {
+	type Buf struct {
+		data []byte
+		id   backend.ID
+	}
+
+	bufs := []Buf{}
+	for _, l := range lengths {
+		b := make([]byte, l)
+		_, err := io.ReadFull(rand.Reader, b)
+		OK(t, err)
+		h := sha256.Sum256(b)
+		bufs = append(bufs, Buf{data: b, id: h[:]})
+	}
+
+	file := bytes.NewBuffer(nil)
+
+	// create random keys
+	k := crypto.NewKey()
+
+	// pack blobs, failing the test if a blob cannot be added
+	p := pack.NewPacker(k, file)
+	for _, b := range bufs {
+		_, err := p.Add(pack.Tree, b.id, bytes.NewReader(b.data))
+		OK(t, err)
+	}
+
+	// write file
+	n, err := p.Finalize()
+	OK(t, err)
+
+	written := 0
+	// data
+	for _, l := range lengths {
+		written += l
+	}
+	// header length
+	written += binary.Size(uint32(0))
+	// header
+	written += len(lengths) * (binary.Size(pack.BlobType(0)) + binary.Size(uint32(0)) + backend.IDSize)
+	// header crypto
+	written += crypto.Extension
+
+	// check length
+	Equals(t, int64(written), n)
+	Equals(t, uint(written), p.Size())
+
+	// read and parse it again
+	rd := bytes.NewReader(file.Bytes())
+	np, err := pack.NewUnpacker(k, nil, rd)
+	OK(t, err)
+	Equals(t, len(np.Entries), len(bufs))
+
+	for i, b := range bufs {
+		e := np.Entries[i]
+		Equals(t, b.id, e.ID)
+		brd, err := e.GetReader(rd)
+		OK(t, err)
+		data, err := ioutil.ReadAll(brd)
+		OK(t, err)
+
+		Assert(t, bytes.Equal(b.data, data), "data for blob %v doesn't match", i)
+	}
+}
+
+var blobTypeJson = []struct { // table for TestBlobTypeJSON
+	t   pack.BlobType // blob type to encode
+	res string        // JSON text expected for t
+}{
+	{pack.Data, `"data"`},
+	{pack.Tree, `"tree"`},
+}
+
+// TestBlobTypeJSON checks JSON encoding and decoding of every blob type.
+func TestBlobTypeJSON(t *testing.T) {
+	for _, tc := range blobTypeJson {
+		// encoding must produce the expected JSON string
+		encoded, err := json.Marshal(tc.t)
+		OK(t, err)
+		Equals(t, tc.res, string(encoded))
+
+		// decoding that string must yield the blob type again
+		var decoded pack.BlobType
+		OK(t, json.Unmarshal([]byte(tc.res), &decoded))
+		Equals(t, tc.t, decoded)
+	}
+}