-
Notifications
You must be signed in to change notification settings - Fork 5.9k
PServer recovery from checkpoint #2741
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
5a223a4
c7d1547
ece97bf
64d9df9
f3f42a2
4cd9a42
bc5acdd
3805634
24b4531
4cbafae
8f00457
ed3097d
e26dcbf
01c22ef
c20c24a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,3 +19,5 @@ third_party/ | |
|
||
# clion workspace. | ||
cmake-build-* | ||
CMakeFiles | ||
cmake_install.cmake |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ import ( | |
"time" | ||
|
||
"github.com/namsral/flag" | ||
"github.com/topicai/candy" | ||
|
||
"github.com/PaddlePaddle/Paddle/go/pserver" | ||
log "github.com/sirupsen/logrus" | ||
|
@@ -25,42 +26,42 @@ func main() { | |
flag.Parse() | ||
|
||
level, err := log.ParseLevel(*logLevel) | ||
if err != nil { | ||
panic(err) | ||
} | ||
candy.Must(err) | ||
|
||
log.SetLevel(level) | ||
|
||
var idx int | ||
var cp *pserver.Checkpoint | ||
if *index >= 0 { | ||
idx = *index | ||
} else { | ||
timeout := time.Second * time.Duration((*etcdTimeout)) | ||
e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout) | ||
idx, err = e.Register() | ||
candy.Must(err) | ||
cp, err = pserver.NewCheckpoint(idx, checkpointPath, e) | ||
if err != nil { | ||
panic(err) | ||
log.Infof("Fetch checkpoint failed, %s\n", err) | ||
} | ||
} | ||
|
||
s, err := pserver.NewService(idx) | ||
if err != nil { | ||
panic(err) | ||
var s *pserver.Service | ||
var err error | ||
if cp != nil { | ||
s, err = pserver.NewServiceFromCheckpoint(idx, cp) | ||
|
||
} else { | ||
s, err = pserver.NewService(idx) | ||
} | ||
candy.Must(err) | ||
|
||
err = rpc.Register(s) | ||
if err != nil { | ||
panic(err) | ||
} | ||
candy.Must(err) | ||
|
||
rpc.HandleHTTP() | ||
l, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) | ||
if err != nil { | ||
panic(err) | ||
} | ||
candy.Must(err) | ||
|
||
log.Infof("start pserver at port %d", *port) | ||
err = http.Serve(l, nil) | ||
|
||
if err != nil { | ||
panic(err) | ||
} | ||
candy.Must(err) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,8 +16,10 @@ import ( | |
const ( | ||
// PsDesired is the etcd path that stores the desired pserver count. | ||
PsDesired = "/ps_desired" | ||
// PsPath is the base dir under which pservers store their | ||
// addresses. | ||
PsPath = "/ps/" | ||
// PsCheckpoint is the etcd path prefix under which checkpoint information is stored. | ||
PsCheckpoint = "/checkpoints/" | ||
) | ||
|
||
// EtcdClient is the etcd client that the pserver uses for fault | ||
|
@@ -186,3 +188,20 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { | |
|
||
return idx, nil | ||
} | ||
|
||
// GetCheckpoints gets the checkpoint information by the specified pserver id | ||
func (e *EtcdClient) GetCheckpointInfo(string idx) (string, error) { | ||
|
||
key := PsCheckpoint + idx | ||
ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout)) | ||
|
||
resp, err := c.Get(ctx, key) | ||
cancel() | ||
if err != nil { | ||
return "", err | ||
} | ||
kvs := resp.Kvs | ||
if len(kvs) == 0 { | ||
return "", nil | ||
} | ||
v := kvs[0].Value | ||
return string(v), nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,12 @@ | ||
package pserver | ||
|
||
import (
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strconv"
	"sync"
)
|
||
|
@@ -49,13 +53,58 @@ type Service struct { | |
optMap map[string]*optimizer | ||
} | ||
|
||
// Checkpoint saves the checkpoint for pserver | ||
type Checkpoint struct { | ||
UUID string `json:"uuid"` | ||
MD5 string `json:"md5"` | ||
Timestamp string `json:"timestamp"` | ||
State []byte | ||
|
||
ParameterWithConfig | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree with your comment on the other PR, type CheckpointInfo struct {
UUID string
Path string
MD5 string
Timestamp int64
} Maybe we can have another type like: type ParameterCheckpoint struct {
ParameterWithConfig
State []byte
}
type Checkpoint []ParameterCheckpoint How do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea! I think separate There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rename type CheckpointMeta struct {}
type CheckpointData struct {}
type Checkpoint struct {
meta CheckpointMeta
data CheckpointData
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cool,
It looks simpler? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. @typhoonzero What do you think about this approach? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed with @Yancey1989. Do we need to separate There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree!
Maybe not; it's simple enough to load meta and data into the struct separately. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is only a type here, without a variable name. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to https://golang.org/ref/spec#Struct_types, a field without an explicit field name is an anonymous field, so I can initialize the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it. A brilliant feature of Go. |
||
} | ||
|
||
// NewCheckpoint creates a new checkpoint. | ||
func NewCheckpoint(idx int, cpPath string, e *EtcdClient) (*Checkpoint, error) { | ||
|
||
v, err := e.GetCheckpointInfo(idx) | ||
if err != nil { | ||
return nil, err | ||
} | ||
var cp Checkpoint | ||
if err = json.Unmarshal(v, &cp); err != nil { | ||
return nil, err | ||
} | ||
fn := filepath.Join(cpPath, cp.UUID) | ||
if _, err = os.Stat(fn); os.IsNotExist(err) { | ||
return nil, err | ||
} | ||
|
||
f, err := os.Open(fn) | ||
if err != nil { | ||
return nil, err | ||
} | ||
defer f.Close() | ||
|
||
buf, err := ioutil.ReadAll(f) | ||
if err != nil { | ||
return nil, err | ||
} | ||
// TODO: create checkpoint from file | ||
|
||
|
||
return nil, nil | ||
} | ||
|
||
// NewServiceFromCheckpoint creates a new service with the specified checkpoint | ||
func NewServiceFromCheckpoint(idx int, cp *Checkpoint) (*Service, error) { | ||
// TODO: create service from checkpoint | ||
return nil, nil | ||
} | ||
|
||
// NewService creates a new service, will bypass etcd registration if no | ||
// endpoints specified. | ||
func NewService(idx int) (*Service, error) { | ||
s := &Service{ | ||
idx: idx, | ||
} | ||
s.optMap = make(map[string]*optimizer) | ||
s.optMap = make(map[string]*optimizer) | ||
s.initialized = make(chan struct{}) | ||
return s, nil | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe
log.Errorf
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.