@@ -521,6 +521,9 @@ class IPCTransport : public RDMATransport {
521
521
val = Environment::Get ()->find (" BYTEPS_PARTITION_BYTES" );
522
522
byteps_partition_bytes_ = val ? atoi (val) : 4096000 ;
523
523
524
+ val = Environment::Get ()->find (" BYTEPS_ENCODING_SCHEME_VERSION" );
525
+ encoding_scheme_version_ = val ? atoi (val) : 0 ;
526
+
524
527
val = Environment::Get ()->find (" BYTEPS_LOCAL_SIZE" );
525
528
auto byteps_local_size = val ? atoi (val) : 8 ;
526
529
byteps_partition_bytes_ = RoundUp (
@@ -637,6 +640,24 @@ class IPCTransport : public RDMATransport {
637
640
std::lock_guard<std::mutex> lock (shm_mu_);
638
641
auto worker_key = DecodeWorkerKey (key);
639
642
auto seq_num = worker_key % (1 << 16 );
643
+ // Total key space is [0, 2^64 - 1]
644
+ // It is divided among N PS servers; for now we assume N <= 2^16
645
+ // Then we have 2^48 key space left.
646
+ // Encoding scheme version 0:
647
+ // Then we have 2^48 key space left (top 16 bits for different servers)
648
+ // MXNet server has a bug dealing with keys larger than 2^32
649
+ // Below we support up to 2^16 tensors, and up to 2^16 partitions per
650
+ // tensor
651
+ // Encoding scheme version 1:
652
+ // Top 16 bits out of the 48 bits encode the sender rank
653
+ // Mid 16 bits out of the 48 bits encode the tensor id
654
+ // The next 6 bits encode request types (pushpull, send, etc)
655
+ // The last 10 bits encode the partition id
656
+ // Therefore, we support up to 2^16 tensors, and up to 2^10 partitions per
657
+ // tensor
658
+ if (encoding_scheme_version_ == 1 ) {
659
+ seq_num = worker_key % (1 << 10 );
660
+ }
640
661
auto base_key = worker_key - seq_num;
641
662
uint64_t offset = byteps_partition_bytes_ * seq_num;
642
663
if (key_shm_addr_.find (base_key) != key_shm_addr_.end ()) {
@@ -657,8 +678,8 @@ class IPCTransport : public RDMATransport {
657
678
CHECK_NE (base_ptr, (void *)-1 ) << strerror (errno);
658
679
key_shm_addr_[base_key] = base_ptr;
659
680
660
- PS_VLOG (1 ) << " open Shared Memory: " << shm_name << " , offset=" << offset
661
- << " , (in bytes) size=" << total_shm_size;
681
+ PS_VLOG (1 ) << " open Shared Memory: " << shm_name << " offset=" << offset
682
+ << " (in bytes) size=" << total_shm_size;
662
683
return (void *)((char *)key_shm_addr_[base_key] + offset);
663
684
}
664
685
@@ -675,7 +696,7 @@ class IPCTransport : public RDMATransport {
675
696
std::unordered_map<uint64_t , void *> key_shm_addr_;
676
697
677
698
bool enable_async_copy_;
678
-
699
+ int encoding_scheme_version_ = 0 ;
679
700
}; // class IPCTransport
680
701
681
702
}; // namespace ps
0 commit comments