
Commit e5895a6

Aleksandr Snopov authored and vitalif committed
add patch support
1 parent 36cfc1e commit e5895a6

File tree: 4 files changed, +308 -7 lines

README.md

Lines changed: 35 additions & 0 deletions
@@ -75,6 +75,7 @@ See also [Common Issues](#common-issues).
 | No readahead on random read | + | - | + | - | + |
 | Server-side copy on append | + | - | - | * | + |
 | Server-side copy on update | + | - | - | * | - |
+| Partial object updates | +* | - | - | - | - |
 | xattrs without extra RTT | +* | - | - | - | + |
 | Dir preload on file lookup | + | - | - | - | - |
 | Fast recursive listings | + | - | * | - | + |
@@ -92,6 +93,21 @@ See also [Common Issues](#common-issues).
 
 \* xattrs without extra RTT only work with Yandex S3 (--list-type=ext-v1).
 
+\* Partial object updates only work with Yandex S3.
+
+## Partial object updates
+
+With Yandex S3 it is possible to do partial object updates (data only) without server-side copy or reupload.
+Currently the feature is enabled with the `--enable-patch` flag; in the future it will be enabled by default for YC S3.
+
+Enabling patch uploads has the following benefits:
+- Fast [fsync](#fsync): since nothing needs to be copied, fsync becomes much cheaper
+- Support for [concurrent updates](#concurrent-updates)
+- Better memory utilization: less intermediate state needs to be cached, so more memory is available for the (meta)data cache
+- Better performance with big files
+
+Note: new files, metadata changes and renames are still flushed to S3 as multipart uploads.
+
 # Installation
 
 * Pre-built binaries:
@@ -215,6 +231,25 @@ fio -name=test -ioengine=libaio -direct=1 -bs=4M -iodepth=1 -fallocate=none \
 
 ## Concurrent Updates
 
+### Yandex S3
+
+When using Yandex S3, it is possible to concurrently update a single object/file from multiple hosts
+using the PATCH method (`--enable-patch`). However, concurrent changes are not reported back to the clients,
+so in order to see the actual object contents you need to stop all writes and refresh the inode cache (see below).
+
+It is strongly advised that clients on different hosts write data at non-overlapping offsets and
+align writes with object part boundaries to avoid conflicts (see the sketch below). If conflicts can't
+be avoided entirely, they are resolved with the last-writer-wins (LWW) strategy. When a conflict can't
+be resolved, you can choose to drop the cached update (`--drop-patch-conflicts`); otherwise the write will be retried later.
+
+Conflicts are reported in the log as follows:
+
+```
+main.WARNING Failed to patch range %d-%d of file %s (inode %d) due to concurrent updates
+```
+
+### Other clouds
+
 GeeseFS doesn't support concurrent updates of the same file from multiple hosts. If you try to
 do that you should guarantee that one host calls `fsync()` on the modified file and then waits
 for at least `--stat-cache-ttl` (1 minute by default) before allowing other hosts to start
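A minimal sketch of the write pattern recommended above, where each host owns a disjoint, part-aligned region of a shared file on a GeeseFS mount. The 5 MiB part size and the mount path are illustrative assumptions, not values taken from this commit; real part boundaries depend on the mount's part-size settings.

```
// Hypothetical usage sketch: each host writes only inside its own
// part-aligned region, so concurrent PATCH requests never overlap.
package main

import (
	"log"
	"os"
)

const partSize = 5 << 20 // assumed part size, for illustration only

func writeRegion(path string, region int64, data []byte) error {
	f, err := os.OpenFile(path, os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer f.Close()
	if _, err := f.WriteAt(data, region*partSize); err != nil {
		return err
	}
	// With --enable-patch, fsync needs no server-side copy and stays cheap.
	return f.Sync()
}

func main() {
	// Host A uses region 0; host B would use region 1, and so on.
	if err := writeRegion("/mnt/geesefs/shared.bin", 0, make([]byte, partSize)); err != nil {
		log.Fatal(err)
	}
}
```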

internal/file.go

Lines changed: 264 additions & 5 deletions
@@ -21,6 +21,7 @@ import (
 	"os"
 	"path"
 	"sort"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"syscall"
@@ -1516,7 +1517,25 @@ func (inode *Inode) SendUpload() bool {
 		}
 	}
 
-	if inode.Attributes.Size <= inode.fs.flags.SinglePartMB*1024*1024 && inode.mpu == nil {
+	if inode.IsFlushing >= inode.fs.flags.MaxParallelParts {
+		return false
+	}
+
+	smallFile := inode.Attributes.Size <= inode.fs.flags.SinglePartMB*1024*1024
+	canPatch := inode.fs.flags.UsePatch &&
+		// Can only patch modified inodes with completed MPUs.
+		inode.CacheState == ST_MODIFIED && inode.mpu == nil &&
+		// In the current implementation we should not patch big simple objects. Reupload them as multiparts first.
+		// If the current ETag is unknown, try patching anyway, so that we don't trigger an unnecessary MPU.
+		(inode.uploadedAsMultipart() || inode.knownETag == "" || smallFile) &&
+		// Currently PATCH does not support truncates. If the file was truncated, reupload it.
+		inode.knownSize <= inode.Attributes.Size
+
+	if canPatch {
+		return inode.patchObjectRanges()
+	}
+
+	if smallFile && inode.mpu == nil {
 		// Don't flush small files with active file handles (if not under memory pressure)
 		if inode.IsFlushing == 0 && (inode.fileHandles == 0 || inode.forceFlush || atomic.LoadInt32(&inode.fs.wantFree) > 0) {
 			// Don't accidentally trigger a parallel multipart flush
@@ -1528,12 +1547,12 @@ func (inode *Inode) SendUpload() bool {
 			return false
 		}
 
-	if inode.IsFlushing >= inode.fs.flags.MaxParallelParts {
-		return false
-	}
-
 	// Initiate multipart upload, if not yet
 	if inode.mpu == nil {
+		// Wait for other updates to complete.
+		if inode.IsFlushing > 0 {
+			return false
+		}
 		inode.IsFlushing += inode.fs.flags.MaxParallelParts
 		atomic.AddInt64(&inode.fs.activeFlushers, 1)
 		go func() {
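For intuition, the `canPatch` condition introduced in the hunk above can be read as a pure predicate. Its multipart check relies on the S3 convention that multipart ETags contain a dash (e.g. `<hash>-<parts>`), which is what `uploadedAsMultipart()` tests. A standalone sketch with simplified stand-in fields, not the real `Inode` type:

```
package main

import (
	"fmt"
	"strings"
)

// Simplified stand-ins for the real Inode/flags fields.
type fileState struct {
	usePatch  bool   // flags.UsePatch
	modified  bool   // CacheState == ST_MODIFIED
	mpuActive bool   // inode.mpu != nil
	knownETag string
	knownSize uint64
	size      uint64 // Attributes.Size
	singleMax uint64 // SinglePartMB*1024*1024
}

func canPatch(s fileState) bool {
	smallFile := s.size <= s.singleMax
	// Multipart ETags look like "<hash>-<parts>", hence the dash test.
	multipart := strings.Contains(s.knownETag, "-")
	return s.usePatch && s.modified && !s.mpuActive &&
		(multipart || s.knownETag == "" || smallFile) &&
		s.knownSize <= s.size // PATCH can't shrink objects; truncates need a reupload
}

func main() {
	fmt.Println(canPatch(fileState{
		usePatch: true, modified: true,
		knownETag: "9a0364b9e99bb480dd25e1f0284c8555-7",
		knownSize: 64 << 20, size: 80 << 20, singleMax: 5 << 20,
	})) // true: a completed multipart object grown locally
}
```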
@@ -1662,6 +1681,246 @@ func (inode *Inode) SendUpload() bool {
 	return initiated
 }
 
+func (inode *Inode) uploadedAsMultipart() bool {
+	return strings.Contains(inode.knownETag, "-")
+}
+
+func (inode *Inode) patchObjectRanges() (initiated bool) {
+	smallFile := inode.Attributes.Size <= inode.fs.flags.SinglePartMB*1024*1024
+	wantFlush := inode.fileHandles == 0 || inode.forceFlush || atomic.LoadInt32(&inode.fs.wantFree) > 0
+
+	if smallFile && wantFlush {
+		if inode.flushLimitsExceeded() {
+			return
+		}
+		var flushBufs []*FileBuffer
+		for _, buf := range inode.buffers {
+			if buf.state == BUF_DIRTY {
+				flushBufs = append(flushBufs, buf)
+			}
+		}
+		inode.patchSimpleObj(flushBufs)
+		return true
+	}
+
+	updatedPartID := inode.fs.partNum(inode.lastWriteEnd)
+	endPartID := inode.fs.partNum(inode.Attributes.Size - 1)
+
+	var prevSize uint64
+	for part := uint64(0); part <= endPartID; part++ {
+		if inode.flushLimitsExceeded() {
+			break
+		}
+
+		partStart, partSize := inode.fs.partRange(part)
+		// In its current implementation PATCH doesn't support ranges with start offset larger than object size.
+		if partStart > inode.knownSize {
+			break
+		}
+
+		partEnd, rangeBorder := partStart+partSize, partSize != prevSize
+		appendPatch, newPart := partEnd > inode.knownSize, partStart == inode.knownSize
+
+		// When entering a new part range, we can't immediately switch to the new part size,
+		// because we need to init a new part first.
+		if newPart && rangeBorder && prevSize > 0 {
+			partEnd, partSize = partStart+prevSize, prevSize
+		}
+		prevSize = partSize
+
+		smallTail := appendPatch && inode.Attributes.Size-partStart < partSize
+		if smallTail && !wantFlush {
+			break
+		}
+
+		partLocked := inode.IsRangeLocked(partStart, partEnd, true)
+		if !wantFlush && part == updatedPartID || partLocked {
+			continue
+		}
+
+		var flushBufs []*FileBuffer
+		for pos := locateBuffer(inode.buffers, partStart); pos < len(inode.buffers); pos++ {
+			buf := inode.buffers[pos]
+			if buf.offset >= partEnd {
+				break
+			}
+			if buf.state != BUF_DIRTY || buf.zero && !wantFlush && !appendPatch {
+				continue
+			}
+
+			if buf.offset < partStart {
+				inode.splitBuffer(pos, partStart-buf.offset)
+				continue
+			}
+			if buf.offset+buf.length > partEnd {
+				inode.splitBuffer(pos, partEnd-buf.offset)
+			}
+
+			flushBufs = append(flushBufs, buf)
+		}
+
+		if len(flushBufs) != 0 {
+			inode.patchPart(partStart, partSize, flushBufs)
+			initiated = true
+		}
+	}
+	return
+}
+
+func (inode *Inode) flushLimitsExceeded() bool {
+	return atomic.LoadInt64(&inode.fs.activeFlushers) >= inode.fs.flags.MaxFlushers ||
+		inode.IsFlushing >= inode.fs.flags.MaxParallelParts
+}
+
+func (inode *Inode) patchSimpleObj(bufs []*FileBuffer) {
+	inode.LockRange(0, inode.Attributes.Size, true)
+	inode.IsFlushing += inode.fs.flags.MaxParallelParts
+	atomic.AddInt64(&inode.fs.activeFlushers, 1)
+
+	go func() {
+		inode.mu.Lock()
+		inode.patchFromBuffers(bufs, 0)
+
+		inode.UnlockRange(0, inode.Attributes.Size, true)
+		inode.IsFlushing -= inode.fs.flags.MaxParallelParts
+		inode.mu.Unlock()
+
+		atomic.AddInt64(&inode.fs.activeFlushers, -1)
+		inode.fs.WakeupFlusher()
+	}()
+}
+
+func (inode *Inode) patchPart(partOffset, partSize uint64, bufs []*FileBuffer) {
+	inode.LockRange(partOffset, partSize, true)
+	inode.IsFlushing++
+	atomic.AddInt64(&inode.fs.activeFlushers, 1)
+
+	go func() {
+		inode.mu.Lock()
+		inode.patchFromBuffers(bufs, partSize)
+
+		inode.UnlockRange(partOffset, partSize, true)
+		inode.IsFlushing--
+		inode.mu.Unlock()
+
+		atomic.AddInt64(&inode.fs.activeFlushers, -1)
+		inode.fs.WakeupFlusher()
+	}()
+}
+
+func (inode *Inode) patchFromBuffers(bufs []*FileBuffer, partSize uint64) {
+	if len(bufs) == 0 {
+		return
+	}
+
+	first, last := bufs[0], bufs[len(bufs)-1]
+	offset, size := first.offset, last.offset+last.length-first.offset
+
+	var bufsSize uint64
+	for _, b := range bufs {
+		bufsSize += b.length
+	}
+	contiguous := bufsSize == size
+
+	// If bufs is a contiguous range of buffers then we can send them as PATCH immediately,
+	// otherwise we need to read missing ranges first.
+	var reader io.ReadSeeker
+	if contiguous {
+		r := NewMultiReader()
+		for _, buf := range bufs {
+			if !buf.zero {
+				r.AddBuffer(buf.data)
+			} else {
+				r.AddZero(buf.length)
+			}
+		}
+		reader = r
+	} else {
+		key := inode.FullName()
+		_, err := inode.LoadRange(offset, size, 0, true)
+		if err != nil {
+			switch mapAwsError(err) {
+			case syscall.ENOENT, syscall.ERANGE:
+				s3Log.Warnf("File %s (inode %d) is deleted or resized remotely, discarding all local changes", key, inode.Id)
+				inode.resetCache()
+			default:
+				log.Errorf("Failed to load range %d-%d of file %s (inode %d) to patch it: %s", offset, offset+size, key, inode.Id, err)
+			}
+			return
+		}
+		// File size or inode state may have been changed again, abort patch. These are local changes,
+		// so we don't need to drop any cached state here.
+		if inode.Attributes.Size < offset || inode.CacheState != ST_MODIFIED {
+			log.Warnf("Local state of file %s (inode %d) changed, aborting patch", key, inode.Id)
+			return
+		}
+		reader, _ = inode.GetMultiReader(offset, size)
+	}
+
+	if ok := inode.sendPatch(offset, size, reader, partSize); !ok {
+		return
+	}
+
+	for _, b := range bufs {
+		b.state, b.dirtyID = BUF_CLEAN, 0
+	}
+	if !inode.isStillDirty() {
+		inode.SetCacheState(ST_CACHED)
+	}
+}
+
+func (inode *Inode) sendPatch(offset, size uint64, r io.ReadSeeker, partSize uint64) bool {
+	cloud, key := inode.cloud()
+	if inode.oldParent != nil {
+		_, key = inode.oldParent.cloud()
+		key = appendChildName(key, inode.oldName)
+	}
+	log.Debugf("Patching range %d-%d of file %s (inode %d)", offset, offset+size, key, inode.Id)
+
+	inode.mu.Unlock()
+	inode.fs.addInflightChange(key)
+	resp, err := cloud.PatchBlob(&PatchBlobInput{
+		Key:            key,
+		Offset:         offset,
+		Size:           size,
+		AppendPartSize: int64(partSize),
+		Body:           r,
+	})
+	inode.fs.completeInflightChange(key)
+	inode.mu.Lock()
+
+	// File was deleted while we were flushing it
+	if inode.CacheState == ST_DELETED {
+		return false
+	}
+
+	inode.recordFlushError(err)
+	if err != nil {
+		switch mapAwsError(err) {
+		case syscall.ENOENT, syscall.ERANGE:
+			s3Log.Warnf("File %s (inode %d) is deleted or resized remotely, discarding all local changes", key, inode.Id)
+			inode.resetCache()
+		case syscall.EBUSY:
+			s3Log.Warnf("Failed to patch range %d-%d of file %s (inode %d) due to concurrent updates", offset, offset+size, key, inode.Id)
+			if inode.fs.flags.DropPatchConflicts {
+				inode.discardChanges(offset, size)
+			}
+		default:
+			log.Errorf("Failed to patch range %d-%d of file %s (inode %d): %s", offset, offset+size, key, inode.Id, err)
+		}
+		return false
+	}
+
+	log.Debugf("Successfully patched range %d-%d of file %s (inode %d), etag: %s", offset, offset+size, key, inode.Id, NilStr(resp.ETag))
+	inode.updateFromFlush(MaxUInt64(inode.knownSize, offset+size), resp.ETag, resp.LastModified, nil)
+	return true
+}
+
+func (inode *Inode) discardChanges(offset, size uint64) {
+	allocated := inode.removeRange(offset, size, BUF_DIRTY)
+	inode.fs.bufferPool.Use(allocated, true)
+}
+
 func (inode *Inode) isStillDirty() bool {
 	if inode.userMetadataDirty != 0 || inode.oldParent != nil {
 		return true
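`patchObjectRanges` relies on two helpers outside this diff, `fs.partNum` and `fs.partRange`. A rough sketch of what they could look like under the simplifying assumption of one fixed part size; the `rangeBorder` handling above shows the real implementation deals with parts of varying sizes, which this sketch ignores.

```
package main

import "fmt"

// Assumed fixed part size, for illustration only.
const partSize uint64 = 5 << 20

// partNum maps a byte offset to the part containing it.
func partNum(offset uint64) uint64 { return offset / partSize }

// partRange returns the start offset and size of a part.
func partRange(part uint64) (start, size uint64) {
	return part * partSize, partSize
}

func main() {
	p := partNum(12 << 20) // a byte at offset 12 MiB lands in part 2
	start, size := partRange(p)
	fmt.Println(p, start, size) // 2 10485760 5242880
}
```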

internal/goofys.go

Lines changed: 4 additions & 1 deletion
@@ -34,8 +34,9 @@ import (
 
 	"github.com/jacobsa/fuse/fuseops"
 
-	"github.com/sirupsen/logrus"
 	"net/http"
+
+	"github.com/sirupsen/logrus"
 )
 
 // goofys is a Filey System written in Go. All the backend data is
@@ -1126,6 +1127,8 @@ func mapAwsError(err error) error {
 		return syscall.ENXIO
 	case "BucketAlreadyOwnedByYou":
 		return syscall.EEXIST
+	case "ConcurrentUpdatesPatchConflict", "ObjectVersionPatchConflict":
+		return syscall.EBUSY
 	}
 
 	if reqErr, ok := err.(awserr.RequestFailure); ok {
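The two new cases map Yandex S3 patch-conflict error codes to `EBUSY`, which `sendPatch` in `internal/file.go` then branches on. A reduced sketch of that consumer side, with the error-code extraction simplified (the real `mapAwsError` inspects `awserr` values, not bare strings):

```
package main

import (
	"errors"
	"fmt"
	"syscall"
)

// mapPatchCode mirrors the two cases added above.
func mapPatchCode(code string) error {
	switch code {
	case "ConcurrentUpdatesPatchConflict", "ObjectVersionPatchConflict":
		return syscall.EBUSY
	}
	return errors.New(code)
}

func main() {
	if err := mapPatchCode("ObjectVersionPatchConflict"); err == syscall.EBUSY {
		// This is the branch sendPatch takes: log the conflict and either
		// drop the cached update (--drop-patch-conflicts) or retry later.
		fmt.Println("concurrent update conflict")
	}
}
```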

internal/handles.go

Lines changed: 5 additions & 1 deletion
@@ -213,10 +213,14 @@ func (inode *Inode) SetFromBlobItem(item *BlobItemOutput) {
 	inode.mu.Lock()
 	defer inode.mu.Unlock()
 
+	patchInProgress := inode.fs.flags.UsePatch && inode.mpu == nil && inode.CacheState == ST_MODIFIED && inode.IsFlushing > 0
 	// We always just drop our local cache when inode size or etag changes remotely
 	// It's the simplest method of conflict resolution
 	// Otherwise we may not be able to make a correct object version
-	if item.ETag != nil && inode.knownETag != *item.ETag || item.Size != inode.knownSize {
+	//
+	// If ongoing patch requests exist, then concurrent etag changes are normal. In the current
+	// implementation it is hard to reliably distinguish actual data conflicts from concurrent patch updates.
+	if !patchInProgress && (item.ETag != nil && inode.knownETag != *item.ETag || item.Size != inode.knownSize) {
 		if inode.CacheState != ST_CACHED && (inode.knownETag != "" || inode.knownSize > 0) {
 			s3Log.Warnf("Conflict detected (inode %v): server-side ETag or size of %v"+
 				" (%v, %v) differs from local (%v, %v). File is changed remotely, dropping cache",
