Skip to content
19 changes: 19 additions & 0 deletions litt/cli/litt_cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@ import (
"github.com/urfave/cli/v2"
)

// TODO (cody.littley): convert all commands to use flags stored in these variables
var (
	// srcFlag is the mandatory "--src" (alias "-s") flag. It is a string-slice flag, so it may be
	// repeated on the command line to supply several source paths.
	srcFlag = &cli.StringSliceFlag{
		Required: true,
		Name:     "src",
		Aliases:  []string{"s"},
		Usage:    "Source paths where the DB data is found, at least one is required.",
	}
)

// buildCLIParser creates a command line parser for the LittDB CLI tool.
func buildCLIParser(logger logging.Logger) *cli.App {
app := &cli.App{
Expand Down Expand Up @@ -267,6 +277,15 @@ func buildCLIParser(logger logging.Logger) *cli.App {
},
Action: nil, // syncCommand, // TODO this will be added in a follow up PR
},
{
Name: "unlock",
Usage: "Manually delete LittDB lock files. Dangerous if used improperly, use with caution.",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add some sort of confirmation flow where the user has to say "yes" and if they want to skip there's the force-unlock or --force flag for skipping that flow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added. You must either type "I know what I am doing" or include the --force flag.

ArgsUsage: "--src <path1> ... --src <pathN>",
Flags: []cli.Flag{
srcFlag,
},
Action: unlockCommand,
},
},
}
return app
Expand Down
24 changes: 24 additions & 0 deletions litt/cli/unlock.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package main

import (
"fmt"

"github.com/Layr-Labs/eigenda/common"
"github.com/Layr-Labs/eigenda/litt/disktable"
"github.com/urfave/cli/v2"
)

// unlockCommand is the CLI action for unlocking a LittDB file system. It reads the source paths
// supplied through srcFlag and passes them, together with a console logger, to disktable.Unlock.
//
// An error is returned when the logger cannot be created, when no source path was supplied, or when
// disktable.Unlock itself fails.
func unlockCommand(ctx *cli.Context) error {
	paths := ctx.StringSlice(srcFlag.Name)

	logger, loggerErr := common.NewLogger(common.DefaultConsoleLoggerConfig())
	switch {
	case loggerErr != nil:
		return fmt.Errorf("failed to create logger: %w", loggerErr)
	case len(paths) == 0:
		// Guard against an empty slice before touching the file system.
		return fmt.Errorf("at least one source path is required")
	}

	return disktable.Unlock(logger, paths)
}
44 changes: 44 additions & 0 deletions litt/disktable/unlock.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package disktable

import (
"fmt"
"os"
"path/filepath"
"strings"

"github.com/Layr-Labs/eigenda/litt/util"
"github.com/Layr-Labs/eigensdk-go/logging"
)

// Unlock removes the lock files of a LittDB file system.
//
// Each entry of sourcePaths is walked recursively. Every non-directory entry whose path ends with
// util.LockfileName is deleted; all other files are left untouched. Each removal is announced on the
// provided logger.
//
// A lock file that no longer exists by the time it is removed (e.g. it was deleted concurrently while
// the walk was in progress) is not treated as an error, since the desired end state has been reached.
//
// The first error encountered while walking or removing aborts the operation and is returned, wrapped
// with the offending path. The error is not additionally logged here; reporting it is left to the caller.
//
// DANGER: calling this method opens the door for unsafe concurrent operations on LittDB files.
// With great power comes great responsibility.
func Unlock(logger logging.Logger, sourcePaths []string) error {
	for _, sourcePath := range sourcePaths {
		err := filepath.WalkDir(sourcePath, func(path string, d os.DirEntry, err error) error {
			if err != nil {
				return err
			}
			if d.IsDir() || !strings.HasSuffix(path, util.LockfileName) {
				return nil
			}

			logger.Infof("Removing lock file %s", path)
			removeErr := os.Remove(path)
			if removeErr != nil && !os.IsNotExist(removeErr) {
				return fmt.Errorf("failed to remove lock file %s: %w", path, removeErr)
			}

			return nil
		})

		if err != nil {
			return fmt.Errorf("failed to walk directory %s: %w", sourcePath, err)
		}
	}

	return nil
}
13 changes: 12 additions & 1 deletion litt/littbuilder/db_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/Layr-Labs/eigenda/common"
"github.com/Layr-Labs/eigenda/litt"
"github.com/Layr-Labs/eigenda/litt/disktable"
"github.com/Layr-Labs/eigenda/litt/metrics"
"github.com/Layr-Labs/eigenda/litt/util"
"github.com/Layr-Labs/eigensdk-go/logging"
Expand Down Expand Up @@ -103,14 +104,24 @@ func NewDB(config *litt.Config) (litt.DB, error) {
// NewDBUnsafe creates a new DB instance with a custom table builder. This is intended for unit test use,
// and should not be considered a stable API.
func NewDBUnsafe(config *litt.Config, tableBuilder TableBuilderFunc) (litt.DB, error) {

for _, rootPath := range config.Paths {
err := util.EnsureDirectoryExists(rootPath, config.Fsync)
if err != nil {
return nil, fmt.Errorf("error ensuring directory %s exists: %w", rootPath, err)
}
}

if config.PurgeLocks {
config.Logger.Warnf("Purging LittDB locks from paths %v", config.Paths)
err := disktable.Unlock(config.Logger, config.Paths)
if err != nil {
return nil, fmt.Errorf("error purging locks: %w", err)
}
config.Logger.Infof("Locks purged successfully")
} else {
config.Logger.Infof("Not purging locks, continuing with existing locks")
}

releaseLocks, err := util.LockDirectories(config.Logger, config.Paths, util.LockfileName, config.Fsync)
if err != nil {
return nil, fmt.Errorf("error acquiring locks on paths %v: %w", config.Paths, err)
Expand Down
6 changes: 6 additions & 0 deletions litt/littdb_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ type Config struct {
// of database snapshots. Failing to clean up the hard linked files referenced by the symlinks will result in a
// disk space leak.
SnapshotDirectory string

// If true, then purge all lock files prior to starting the database. This is potentially dangerous, as it will
// permit multiple databases to be opened against the same data directories. If ever there are two LittDB
// instances running against the same data directories, data corruption is almost a certainty.
PurgeLocks bool
}

// DefaultConfig returns a Config with default values.
Expand Down Expand Up @@ -197,6 +202,7 @@ func DefaultConfigNoPaths() *Config {
MetricsNamespace: "litt",
MetricsPort: 9101,
MetricsUpdateInterval: time.Second,
PurgeLocks: false,
}
}

Expand Down
163 changes: 163 additions & 0 deletions litt/test/unlock_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
package test

import (
"os"
"path"
"path/filepath"
"strings"
"testing"

testrandom "github.com/Layr-Labs/eigenda/common/testutils/random"
"github.com/Layr-Labs/eigenda/litt"
"github.com/Layr-Labs/eigenda/litt/disktable"
"github.com/Layr-Labs/eigenda/litt/littbuilder"
"github.com/Layr-Labs/eigenda/litt/util"
"github.com/stretchr/testify/require"
)

// Note: this test is defined in the test package to avoid circular dependencies.

// TestUnlock verifies that disktable.Unlock removes every lock file below the DB's volumes, that it can
// be called repeatedly without error, and that the data written before unlocking remains readable both
// from the running DB and after a restart.
func TestUnlock(t *testing.T) {
	testDir := t.TempDir()
	rand := testrandom.NewTestRandom()
	// Three distinct volumes; each path.Join call must be closed before the next element.
	volumes := []string{
		path.Join(testDir, "volume1"),
		path.Join(testDir, "volume2"),
		path.Join(testDir, "volume3"),
	}

	config, err := litt.DefaultConfig(volumes...)
	// Check the error before touching config, otherwise a failure would surface as a nil dereference.
	require.NoError(t, err)
	config.Fsync = false // Disable fsync for faster tests
	config.TargetSegmentFileSize = 100

	db, err := littbuilder.NewDB(config)
	require.NoError(t, err)

	table, err := db.GetTable("test_table")
	require.NoError(t, err)

	expectedData := make(map[string][]byte)

	// Write some data
	for i := 0; i < 100; i++ {
		key := rand.PrintableBytes(32)
		value := rand.PrintableVariableBytes(1, 100)

		expectedData[string(key)] = value
		err = table.Put(key, value)
		require.NoError(t, err, "Failed to put data in table")
	}

	// countLockFiles walks the whole test directory and returns the number of non-directory entries whose
	// path ends with the lock file name.
	countLockFiles := func() int {
		count := 0
		walkErr := filepath.Walk(testDir, func(filePath string, info os.FileInfo, err error) error {
			if err != nil {
				return err
			}
			if !info.IsDir() && strings.HasSuffix(filePath, util.LockfileName) {
				count++
			}
			return nil
		})
		require.NoError(t, walkErr)
		return count
	}

	// Look for lock files. We should see one for each volume.
	require.Equal(t, len(volumes), countLockFiles(), "There should be one lock file per volume")

	// Unlock the DB. This should remove all lock files, but leave other files intact.
	err = disktable.Unlock(config.Logger, volumes)
	require.NoError(t, err, "Failed to unlock the database")

	// There should be no lock files left.
	require.Equal(t, 0, countLockFiles(), "There should be no lock files left after unlocking")

	// Calling unlock again should not cause any issues.
	err = disktable.Unlock(config.Logger, volumes)
	require.NoError(t, err, "Failed to unlock the database again")

	// Verify that the data is still intact.
	for key, expectedValue := range expectedData {
		value, ok, err := table.Get([]byte(key))
		require.NoError(t, err, "Failed to get data from table")
		require.True(t, ok, "Failed to get data from table")
		require.Equal(t, expectedValue, value, "Data mismatch for key %s", key)
	}

	// Restart the database and verify the data again.
	err = db.Close()
	require.NoError(t, err)

	db, err = littbuilder.NewDB(config)
	require.NoError(t, err)

	table, err = db.GetTable("test_table")
	require.NoError(t, err)

	for key, expectedValue := range expectedData {
		value, ok, err := table.Get([]byte(key))
		require.NoError(t, err, "Failed to get data from table after restart")
		require.True(t, ok, "Failed to get data from table after restart")
		require.Equal(t, expectedValue, value, "Data mismatch for key %s after restart", key)
	}

	err = db.Close()
	require.NoError(t, err, "Failed to close the database after restart")
}

// TestPurgeLocks verifies that a second DB instance cannot be opened against directories that are already
// locked, and that setting config.PurgeLocks allows the second instance to open anyway.
func TestPurgeLocks(t *testing.T) {
	testDir := t.TempDir()
	rand := testrandom.NewTestRandom()
	// Three distinct volumes; each path.Join call must be closed before the next element.
	volumes := []string{
		path.Join(testDir, "volume1"),
		path.Join(testDir, "volume2"),
		path.Join(testDir, "volume3"),
	}

	config, err := litt.DefaultConfig(volumes...)
	// Check the error before touching config, otherwise a failure would surface as a nil dereference.
	require.NoError(t, err)
	config.Fsync = false // Disable fsync for faster tests
	config.TargetSegmentFileSize = 100

	db, err := littbuilder.NewDB(config)
	require.NoError(t, err)

	table, err := db.GetTable("test_table")
	require.NoError(t, err)

	// Write some data. The contents are never read back by this test, so they are not recorded.
	for i := 0; i < 100; i++ {
		key := rand.PrintableBytes(32)
		value := rand.PrintableVariableBytes(1, 100)

		err = table.Put(key, value)
		require.NoError(t, err, "Failed to put data in table")
	}

	// Opening a second instance of the database should fail due to existing locks.
	_, err = littbuilder.NewDB(config)
	require.Error(t, err, "Expected error when opening a second instance of the database with existing locks")

	// Open a new instance of the database at the same time. Normally this is not possible, but it becomes possible
	// when we purge locks.
	config.PurgeLocks = true
	db2, err := littbuilder.NewDB(config)
	require.NoError(t, err, "Failed to open a second instance of the database")

	// This test doesn't bother to verify the table data, since we are in unsafe territory now with multiple instances
	// of the database running at the same time.

	err = db.Close()
	require.NoError(t, err, "Failed to close the first instance of the database")
	err = db2.Close()
	require.NoError(t, err)
}
5 changes: 5 additions & 0 deletions node/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ type Config struct {
// Directories do not need to be on the same filesystem.
LittDBStoragePaths []string

// If true, then purge LittDB locks on startup. Potentially useful to get rid of zombie lock files,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't quite understand the scenario in which zombie lock files appear? Is it due to some sort of ungraceful termination with containers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expanded the documentation here:

	// If true, then purge LittDB locks on startup. Potentially useful to get rid of zombie lock files,
	// but also dangerous (multiple LittDB processes operating on the same files can lead to data corruption).
	//
	// When LittDB starts up, it attempts to create lock files. When a validator is forcefully shut down, lock files 
	// may be left behind. At startup time, if LittDB observes existing lock files, it first checks to see
	// if the process that created the lock files is still running. The lock files contain the creator's PID, and so 
	// LittDB checks to see if there is any process with that PID still running.
	//
	// Although it should be rare, it's possible that another process may be started with the same PID as the
	// PID used to create the lock files. When this happens, LittDB will be prevented from starting up out of
	// fear of another process trying to access the same files, even though the original process that created the 
	// lock files is no longer running. If that happens, this flag is a safe way to force LittDB to start up
	// without being blocked by those lock files. BE VERY CERTAIN THAT THE OTHER PROCESS IS ACTUALLY DEAD!
	// If two instances of LittDB are running on the same files, it WILL lead to data corruption.
	//
	// An alternate way to clear the LittDB lock files is via the LittDB CLI with the "litt unlock" command.
	// Run "litt unlock --help" for more information.
	LittUnsafePurgeLocks bool

// but also dangerous (multiple LittDB processes operating on the same files can lead to data corruption).
LittUnsafePurgeLocks bool

// The rate limit for the number of bytes served by the GetChunks API if the data is in the cache.
// Unit is in megabytes per second.
GetChunksHotCacheReadLimitMB float64
Expand Down Expand Up @@ -393,6 +397,7 @@ func NewConfig(ctx *cli.Context) (*Config, error) {
LittDBReadCacheSizeGB: ctx.GlobalFloat64(flags.LittDBReadCacheSizeGBFlag.Name),
LittDBReadCacheSizeFraction: ctx.GlobalFloat64(flags.LittDBReadCacheSizeFractionFlag.Name),
LittDBStoragePaths: ctx.GlobalStringSlice(flags.LittDBStoragePathsFlag.Name),
LittUnsafePurgeLocks: ctx.GlobalBool(flags.LittUnsafePurgeLocksFlag.Name),
DownloadPoolSize: ctx.GlobalInt(flags.DownloadPoolSizeFlag.Name),
GetChunksHotCacheReadLimitMB: ctx.GlobalFloat64(flags.GetChunksHotCacheReadLimitMBFlag.Name),
GetChunksHotBurstLimitMB: ctx.GlobalFloat64(flags.GetChunksHotBurstLimitMBFlag.Name),
Expand Down
7 changes: 7 additions & 0 deletions node/flags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,12 @@ var (
Required: false,
EnvVar: common.PrefixEnvVar(EnvVarPrefix, "LITT_DB_STORAGE_PATHS"),
}
LittUnsafePurgeLocksFlag = cli.BoolFlag{
Name: common.PrefixFlag(FlagPrefix, "litt-unsafe-purge-locks"),
Usage: "Unsafe flag to purge locks in LittDB. Use with caution, as it may lead to data loss or corruption.",
Required: false,
EnvVar: common.PrefixEnvVar(EnvVarPrefix, "LITT_UNSAFE_PURGE_LOCKS"),
}
DownloadPoolSizeFlag = cli.IntFlag{
Name: common.PrefixFlag(FlagPrefix, "download-pool-size"),
Usage: "The size of the download pool. The default value is 16.",
Expand Down Expand Up @@ -630,6 +636,7 @@ var optionalFlags = []cli.Flag{
EigenDADirectoryFlag,
BlsOperatorStateRetrieverFlag,
EigenDAServiceManagerFlag,
LevelDBDisableSeeksCompactionV1Flag,
}

func init() {
Expand Down
1 change: 1 addition & 0 deletions node/validator_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ func NewValidatorStore(
littConfig.MetricsNamespace = littDBMetricsPrefix
littConfig.Logger = logger
littConfig.DoubleWriteProtection = config.LittDBDoubleWriteProtection
littConfig.PurgeLocks = config.LittUnsafePurgeLocks
if err != nil {
return nil, fmt.Errorf("failed to create new litt config: %w", err)
}
Expand Down
Loading