Skip to content

Commit 71e7d73

Browse files
authored
feat: litt unlock command (#1823)
* Add LittDB cli command to unlock LittDB file system * added comment * implemented unlock Signed-off-by: Cody Littley <[email protected]> * Wire in lock purging * make suggested changes * made suggested change Signed-off-by: Cody Littley <[email protected]> * lint --------- Signed-off-by: Cody Littley <[email protected]>
1 parent d156212 commit 71e7d73

File tree

9 files changed

+328
-1
lines changed

9 files changed

+328
-1
lines changed

litt/cli/litt_cli.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,21 @@ import (
1010
"github.com/urfave/cli/v2"
1111
)
1212

13+
// TODO (cody.littley): convert all commands to use flags stored in these variables
14+
var (
15+
srcFlag = &cli.StringSliceFlag{
16+
Name: "src",
17+
Aliases: []string{"s"},
18+
Usage: "Source paths where the DB data is found, at least one is required.",
19+
Required: true,
20+
}
21+
forceFlag = &cli.BoolFlag{
22+
Name: "force",
23+
Aliases: []string{"f"},
24+
Usage: "Force the operation without prompting for confirmation.",
25+
}
26+
)
27+
1328
// buildCLIParser creates a command line parser for the LittDB CLI tool.
1429
func buildCLIParser(logger logging.Logger) *cli.App {
1530
app := &cli.App{
@@ -267,6 +282,16 @@ func buildCLIParser(logger logging.Logger) *cli.App {
267282
},
268283
Action: nil, // syncCommand, // TODO this will be added in a follow up PR
269284
},
285+
{
286+
Name: "unlock",
287+
Usage: "Manually delete LittDB lock files. Dangerous if used improperly, use with caution.",
288+
ArgsUsage: "--src <path1> ... --src <pathN> [--force]",
289+
Flags: []cli.Flag{
290+
srcFlag,
291+
forceFlag,
292+
},
293+
Action: unlockCommand,
294+
},
270295
},
271296
}
272297
return app

litt/cli/unlock.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package main
2+
3+
import (
4+
"bufio"
5+
"fmt"
6+
"os"
7+
"strings"
8+
9+
"github.com/Layr-Labs/eigenda/common"
10+
"github.com/Layr-Labs/eigenda/litt/disktable"
11+
"github.com/urfave/cli/v2"
12+
)
13+
14+
// called by the CLI to unlock a LittDB file system.
15+
func unlockCommand(ctx *cli.Context) error {
16+
logger, err := common.NewLogger(common.DefaultConsoleLoggerConfig())
17+
if err != nil {
18+
return fmt.Errorf("failed to create logger: %w", err)
19+
}
20+
sources := ctx.StringSlice(srcFlag.Name)
21+
22+
if len(sources) == 0 {
23+
return fmt.Errorf("at least one source path is required")
24+
}
25+
26+
force := ctx.Bool(forceFlag.Name)
27+
if !force {
28+
magicString := "I know what I am doing"
29+
logger.Warnf("About to delete LittDB lock files. This is potentially dangerous. "+
30+
"Type \"%s\" to continue, or use "+
31+
"the --force flag.", magicString)
32+
reader := bufio.NewReader(os.Stdin)
33+
input, err := reader.ReadString('\n')
34+
if err != nil {
35+
return fmt.Errorf("failed to read input: %w", err)
36+
}
37+
input = strings.TrimSuffix(input, "\n")
38+
if input != magicString {
39+
return fmt.Errorf("unlock operation aborted")
40+
}
41+
}
42+
43+
err = disktable.Unlock(logger, sources)
44+
if err != nil {
45+
return fmt.Errorf("failed to unlock LittDB files: %w", err)
46+
}
47+
return nil
48+
}

litt/disktable/unlock.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package disktable
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"path/filepath"
7+
"strings"
8+
9+
"github.com/Layr-Labs/eigenda/litt/util"
10+
"github.com/Layr-Labs/eigensdk-go/logging"
11+
)
12+
13+
// Unlocks a LittDB file system.
14+
//
15+
// DANGER: calling this method opens the door for unsafe concurrent operations on LittDB files.
16+
// With great power comes great responsibility.
17+
func Unlock(logger logging.Logger, sourcePaths []string) error {
18+
for _, sourcePath := range sourcePaths {
19+
err := filepath.WalkDir(sourcePath, func(path string, d os.DirEntry, err error) error {
20+
if err != nil {
21+
return err
22+
}
23+
if d.IsDir() {
24+
return nil
25+
}
26+
27+
if strings.HasSuffix(path, util.LockfileName) {
28+
logger.Infof("Removing lock file %s", path)
29+
if removeErr := os.Remove(path); removeErr != nil {
30+
logger.Error("Failed to remove lock file", "path", path, "error", removeErr)
31+
return fmt.Errorf("failed to remove lock file %s: %w", path, removeErr)
32+
}
33+
}
34+
35+
return nil
36+
})
37+
38+
if err != nil {
39+
return fmt.Errorf("failed to walk directory %s: %w", sourcePath, err)
40+
}
41+
}
42+
43+
return nil
44+
}

litt/littbuilder/db_impl.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010

1111
"github.com/Layr-Labs/eigenda/common"
1212
"github.com/Layr-Labs/eigenda/litt"
13+
"github.com/Layr-Labs/eigenda/litt/disktable"
1314
"github.com/Layr-Labs/eigenda/litt/metrics"
1415
"github.com/Layr-Labs/eigenda/litt/util"
1516
"github.com/Layr-Labs/eigensdk-go/logging"
@@ -103,14 +104,24 @@ func NewDB(config *litt.Config) (litt.DB, error) {
103104
// NewDBUnsafe creates a new DB instance with a custom table builder. This is intended for unit test use,
104105
// and should not be considered a stable API.
105106
func NewDBUnsafe(config *litt.Config, tableBuilder TableBuilderFunc) (litt.DB, error) {
106-
107107
for _, rootPath := range config.Paths {
108108
err := util.EnsureDirectoryExists(rootPath, config.Fsync)
109109
if err != nil {
110110
return nil, fmt.Errorf("error ensuring directory %s exists: %w", rootPath, err)
111111
}
112112
}
113113

114+
if config.PurgeLocks {
115+
config.Logger.Warnf("Purging LittDB locks from paths %v", config.Paths)
116+
err := disktable.Unlock(config.Logger, config.Paths)
117+
if err != nil {
118+
return nil, fmt.Errorf("error purging locks: %w", err)
119+
}
120+
config.Logger.Infof("Locks purged successfully")
121+
} else {
122+
config.Logger.Infof("Not purging locks, continuing with existing locks")
123+
}
124+
114125
releaseLocks, err := util.LockDirectories(config.Logger, config.Paths, util.LockfileName, config.Fsync)
115126
if err != nil {
116127
return nil, fmt.Errorf("error acquiring locks on paths %v: %w", config.Paths, err)

litt/littdb_config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,11 @@ type Config struct {
156156
// of database snapshots. Failing to clean up the hard linked files referenced by the symlinks will result in a
157157
// disk space leak.
158158
SnapshotDirectory string
159+
160+
// If true, then purge all lock files prior to starting the database. This is potentially dangerous, as it will
161+
// permit multiple databases to be opened against the same data directories. If ever there are two LittDB
162+
// instances running against the same data directories, data corruption is almost a certainty.
163+
PurgeLocks bool
159164
}
160165

161166
// DefaultConfig returns a Config with default values.
@@ -197,6 +202,7 @@ func DefaultConfigNoPaths() *Config {
197202
MetricsNamespace: "litt",
198203
MetricsPort: 9101,
199204
MetricsUpdateInterval: time.Second,
205+
PurgeLocks: false,
200206
}
201207
}
202208

litt/test/unlock_test.go

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
package test
2+
3+
import (
4+
"os"
5+
"path"
6+
"path/filepath"
7+
"strings"
8+
"testing"
9+
10+
testrandom "github.com/Layr-Labs/eigenda/common/testutils/random"
11+
"github.com/Layr-Labs/eigenda/litt"
12+
"github.com/Layr-Labs/eigenda/litt/disktable"
13+
"github.com/Layr-Labs/eigenda/litt/littbuilder"
14+
"github.com/Layr-Labs/eigenda/litt/util"
15+
"github.com/stretchr/testify/require"
16+
)
17+
18+
// Note: this test is defined in the test package to avoid circular dependencies.
19+
20+
func TestUnlock(t *testing.T) {
21+
testDir := t.TempDir()
22+
rand := testrandom.NewTestRandom()
23+
volumes := []string{path.Join(testDir, "volume1"), path.Join(testDir, "volume2"), path.Join(testDir, "volume3")}
24+
25+
config, err := litt.DefaultConfig(volumes...)
26+
config.Fsync = false // Disable fsync for faster tests
27+
config.TargetSegmentFileSize = 100
28+
config.ShardingFactor = uint32(len(volumes))
29+
require.NoError(t, err)
30+
31+
db, err := littbuilder.NewDB(config)
32+
require.NoError(t, err)
33+
34+
table, err := db.GetTable("test_table")
35+
require.NoError(t, err)
36+
37+
expectedData := make(map[string][]byte)
38+
39+
// Write some data
40+
for i := 0; i < 100; i++ {
41+
key := rand.PrintableBytes(32)
42+
value := rand.PrintableVariableBytes(1, 100)
43+
44+
expectedData[string(key)] = value
45+
err = table.Put(key, value)
46+
require.NoError(t, err, "Failed to put data in table")
47+
}
48+
49+
// Look for lock files. We should see one for each volume.
50+
lockFileCount := 0
51+
err = filepath.Walk(testDir, func(path string, info os.FileInfo, err error) error {
52+
if err != nil {
53+
return err
54+
}
55+
if info.IsDir() {
56+
return nil
57+
}
58+
if strings.HasSuffix(path, util.LockfileName) {
59+
lockFileCount++
60+
}
61+
return nil
62+
})
63+
require.NoError(t, err)
64+
require.Equal(t, 3, lockFileCount)
65+
66+
// Unlock the DB. This should remove all lock files, but leave other files intact.
67+
err = disktable.Unlock(config.Logger, volumes)
68+
require.NoError(t, err, "Failed to unlock the database")
69+
70+
// There should be no lock files left.
71+
lockFileCount = 0
72+
err = filepath.Walk(testDir, func(path string, info os.FileInfo, err error) error {
73+
if err != nil {
74+
return err
75+
}
76+
if info.IsDir() {
77+
return nil
78+
}
79+
if strings.HasSuffix(path, util.LockfileName) {
80+
lockFileCount++
81+
}
82+
return nil
83+
})
84+
require.NoError(t, err)
85+
require.Equal(t, 0, lockFileCount, "There should be no lock files left after unlocking")
86+
87+
// Calling unlock again should not cause any issues.
88+
err = disktable.Unlock(config.Logger, volumes)
89+
require.NoError(t, err, "Failed to unlock the database again")
90+
91+
// Verify that the data is still intact.
92+
for key, expectedValue := range expectedData {
93+
value, ok, err := table.Get([]byte(key))
94+
require.NoError(t, err, "Failed to get data from table")
95+
require.True(t, ok, "Failed to get data from table")
96+
require.Equal(t, expectedValue, value, "Data mismatch for key %s", key)
97+
}
98+
99+
// Restart the database and verify the data again.
100+
err = db.Close()
101+
require.NoError(t, err)
102+
103+
db, err = littbuilder.NewDB(config)
104+
require.NoError(t, err)
105+
106+
table, err = db.GetTable("test_table")
107+
require.NoError(t, err)
108+
109+
for key, expectedValue := range expectedData {
110+
value, ok, err := table.Get([]byte(key))
111+
require.NoError(t, err, "Failed to get data from table after restart")
112+
require.True(t, ok, "Failed to get data from table after restart")
113+
require.Equal(t, expectedValue, value, "Data mismatch for key %s after restart", key)
114+
}
115+
116+
err = db.Close()
117+
require.NoError(t, err, "Failed to close the database after restart")
118+
}
119+
120+
func TestPurgeLocks(t *testing.T) {
121+
testDir := t.TempDir()
122+
rand := testrandom.NewTestRandom()
123+
volumes := []string{path.Join(testDir, "volume1", path.Join(testDir, "volume2"), path.Join(testDir, "volume3"))}
124+
125+
config, err := litt.DefaultConfig(volumes...)
126+
config.Fsync = false // Disable fsync for faster tests
127+
config.TargetSegmentFileSize = 100
128+
require.NoError(t, err)
129+
130+
db, err := littbuilder.NewDB(config)
131+
require.NoError(t, err)
132+
133+
table, err := db.GetTable("test_table")
134+
require.NoError(t, err)
135+
136+
expectedData := make(map[string][]byte)
137+
138+
// Write some data
139+
for i := 0; i < 100; i++ {
140+
key := rand.PrintableBytes(32)
141+
value := rand.PrintableVariableBytes(1, 100)
142+
143+
expectedData[string(key)] = value
144+
err = table.Put(key, value)
145+
require.NoError(t, err, "Failed to put data in table")
146+
}
147+
148+
// Opening a second instance of the database should fail due to existing locks.
149+
_, err = littbuilder.NewDB(config)
150+
require.Error(t, err, "Expected error when opening a second instance of the database with existing locks")
151+
152+
// Open a new instance of the database at the same time. Normally this is not possible, but it becomes possible
153+
// when we purge locks.
154+
config.PurgeLocks = true
155+
db2, err := littbuilder.NewDB(config)
156+
require.NoError(t, err, "Failed to open a second instance of the database")
157+
158+
// This test doesn't bother to verify the table data, since we are in unsafe territory now with multiple instances
159+
// of the database running at the same time.
160+
161+
err = db.Close()
162+
require.NoError(t, err, "Failed to close the first instance of the database")
163+
err = db2.Close()
164+
require.NoError(t, err)
165+
}

node/config.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,25 @@ type Config struct {
136136
// Directories do not need to be on the same filesystem.
137137
LittDBStoragePaths []string
138138

139+
// If true, then purge LittDB locks on startup. Potentially useful to get rid of zombie lock files,
140+
// but also dangerous (multiple LittDB processes operating on the same files can lead to data corruption).
141+
//
142+
// When LittDB starts up, it attempts to create lock files. When a validator is forcefully shut down, lock files
143+
// may be left behind. At startup time, if LittDB observes existing lock files, it first checks to see
144+
// if the process that created the lock files is still running. The lock files contain the creator's PID, and so
145+
// LittDB checks to see if there is any process with that PID still running.
146+
//
147+
// Although it should be rare, it's possible that another process may be started with the same PID as the
148+
// PID used to create the lock files. When this happens, LittDB will be prevented from starting up out of
149+
// fear of another process trying to access the same files, even though the original process that created the
150+
// lock files is no longer running. If that happens, this flag is a safe way to force LittDB to start up
151+
// without being blocked by those lock files. BE VERY CERTAIN THAT THE OTHER PROCESS IS ACTUALLY DEAD!
152+
// If two instances of LittDB are running on the same files, it WILL lead to data corruption.
153+
//
154+
// An alternate way to clear the LittDB lock files is via the LittDB CLI with the "litt unlock" command.
155+
// Run "litt unlock --help" for more information.
156+
LittUnsafePurgeLocks bool
157+
139158
// The rate limit for the number of bytes served by the GetChunks API if the data is in the cache.
140159
// Unit is in megabytes per second.
141160
GetChunksHotCacheReadLimitMB float64
@@ -393,6 +412,7 @@ func NewConfig(ctx *cli.Context) (*Config, error) {
393412
LittDBReadCacheSizeGB: ctx.GlobalFloat64(flags.LittDBReadCacheSizeGBFlag.Name),
394413
LittDBReadCacheSizeFraction: ctx.GlobalFloat64(flags.LittDBReadCacheSizeFractionFlag.Name),
395414
LittDBStoragePaths: ctx.GlobalStringSlice(flags.LittDBStoragePathsFlag.Name),
415+
LittUnsafePurgeLocks: ctx.GlobalBool(flags.LittUnsafePurgeLocksFlag.Name),
396416
DownloadPoolSize: ctx.GlobalInt(flags.DownloadPoolSizeFlag.Name),
397417
GetChunksHotCacheReadLimitMB: ctx.GlobalFloat64(flags.GetChunksHotCacheReadLimitMBFlag.Name),
398418
GetChunksHotBurstLimitMB: ctx.GlobalFloat64(flags.GetChunksHotBurstLimitMBFlag.Name),

0 commit comments

Comments
 (0)