-
Notifications
You must be signed in to change notification settings - Fork 152
feat: leveldb #1034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
feat: leveldb #1034
Changes from 1 commit
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
""" | ||
@author: cunyue | ||
@file: __init__.py | ||
@time: 2025/6/5 14:03 | ||
@description: 同步本地数据到云端 | ||
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
""" | ||
@author: cunyue | ||
@file: backup.py | ||
@file: crypto.py | ||
@time: 2025/6/2 15:07 | ||
@description: 日志备份回调 | ||
""" | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
""" | ||
@author: cunyue | ||
@file: datastore.py | ||
@time: 2025/6/5 16:32 | ||
@description: 记录的数据遵循LevelDB格式:https://github.com/google/leveldb/blob/main/doc/log_format.md | ||
我们使用 crc32 计算数据校验和,crc32 相对轻量,且计算速度较快 | ||
字符编码使用 utf-8, 确保数据兼容性 (这与 LevelDB 的规范有所冲突,如果有必要,未来可以升级版本并通过LEVELDBLOG_HEADER_VERSION兼容) | ||
这为后续引入 protobuf 或其他序列化格式打下基础 | ||
DataStore 大致代码借鉴自 W&B | ||
""" | ||
|
||
import os | ||
import struct | ||
import zlib | ||
from typing import Optional, Any, IO | ||
|
||
# Size in bytes of a record header, packed as "<IHB":
# 4-byte checksum + 2-byte payload length + 1-byte record type.
LEVELDBLOG_HEADER_LEN = 7
# The log is organised in fixed-size blocks of this many bytes.
LEVELDBLOG_BLOCK_LEN = 32768
# Largest payload that fits in a single block next to its record header.
LEVELDBLOG_DATA_LEN = LEVELDBLOG_BLOCK_LEN - LEVELDBLOG_HEADER_LEN

# Record types: FULL holds a whole logical record; FIRST / MIDDLE / LAST
# are the fragments of a logical record that spans several blocks.
(
    LEVELDBLOG_FULL,
    LEVELDBLOG_FIRST,
    LEVELDBLOG_MIDDLE,
    LEVELDBLOG_LAST,
) = range(1, 5)

# File header fields: 4-byte identifier, 2-byte magic number, 1-byte version.
LEVELDBLOG_HEADER_IDENT = ":SWL"
LEVELDBLOG_HEADER_MAGIC = 0xE1D6  # zlib.crc32(bytes("SwanLab", 'utf-8')) & 0xffff
LEVELDBLOG_HEADER_VERSION = 0
|
||
|
||
def strtobytes(x: str) -> bytes:
    """Encode a string as UTF-8 bytes.

    :param x: the text to encode
    :return: the UTF-8 encoded bytes of ``x``
    """
    return bytes(x, "utf-8")
|
||
|
||
def bytestostr(x):
    """Decode UTF-8 encoded bytes into a string."""
    decoded = str(x, encoding="utf-8")
    return decoded
|
||
|
||
class DataStore:
    """Reader/writer for a block-structured record log.

    The file starts with a 7-byte header (identifier, magic, version) and is
    followed by records.  Each record is a 7-byte header (CRC32 checksum,
    payload length, record type) plus its payload.  Records never straddle a
    ``LEVELDBLOG_BLOCK_LEN`` boundary: a logical record that does not fit in
    the current block is split into FIRST / MIDDLE / LAST fragments, and a
    block tail too small to hold a record header is filled with zero bytes.

    Use ``open_for_write`` + ``write`` to produce a file and
    ``open_for_scan`` + ``scan`` to read one back.
    """

    def __init__(self):
        self._filename: Optional[str] = None
        self._fp: Optional[IO[Any]] = None
        # Current offset within the file.
        self._index: int = 0
        # Offset up to which the file has been flushed.
        self._flush_offset = 0
        # CRC32 of each record-type byte, pre-computed once and stored at the
        # index equal to the type; used as the seed when checksumming a payload
        # so the checksum covers both the type and the data.
        self._crc = [0] * (LEVELDBLOG_LAST + 1)
        for x in range(1, LEVELDBLOG_LAST + 1):
            self._crc[x] = zlib.crc32(strtobytes(chr(x))) & 0xFFFFFFFF

        # Whether the file was opened in scan (read) mode.
        self._opened_for_scan = False
        # Size of the file in bytes (only set in scan mode).
        self._size_bytes: int = 0

    # ---------------------------------- read ----------------------------------

    def open_for_scan(self, filename: str):
        """Open an existing log file for scanning and validate its header.

        :param filename: path of the log file to read
        :raises AssertionError: if the file is shorter than the file header
        :raises Exception: if the file header does not match the expected one
        """
        self._filename = filename
        # NOTE(review): scanning only reads, yet the file is opened read/write;
        # kept as-is to preserve existing behavior.
        self._fp = open(filename, "r+b")
        self._index = 0
        self._size_bytes = os.stat(filename).st_size
        self._opened_for_scan = True
        self._read_header()

    def _read_header(self):
        """Read and validate the file header, advancing the offset past it."""
        header = self._fp.read(LEVELDBLOG_HEADER_LEN)
        assert (
            len(header) == LEVELDBLOG_HEADER_LEN
        ), f"header is {len(header)} bytes instead of the expected {LEVELDBLOG_HEADER_LEN}"
        ident, magic, version = struct.unpack("<4sHB", header)
        if ident != strtobytes(LEVELDBLOG_HEADER_IDENT):
            raise Exception("Invalid header")
        if magic != LEVELDBLOG_HEADER_MAGIC:
            raise Exception("Invalid header")
        if version != LEVELDBLOG_HEADER_VERSION:
            raise Exception("Invalid header")
        self._index += len(header)

    def _scan_record(self) -> Optional[tuple[int, bytes]]:
        """Read one physical record at the current position.

        :return: ``(record_type, payload)``, or ``None`` at end of file
        :raises AssertionError: if the file is not open for scanning, the
            record header is truncated, or the checksum does not match
        """
        assert self._opened_for_scan, "file not open for scanning"
        # 1. Read the record header; an empty read means end of file.
        header = self._fp.read(LEVELDBLOG_HEADER_LEN)
        if len(header) == 0:
            return None
        assert (
            len(header) == LEVELDBLOG_HEADER_LEN
        ), f"record header is {len(header)} bytes instead of the expected {LEVELDBLOG_HEADER_LEN}"
        # 2. Parse the header and verify the payload against its checksum.
        checksum, data_length, data_type = struct.unpack("<IHB", header)
        self._index += LEVELDBLOG_HEADER_LEN
        data = self._fp.read(data_length)
        checksum_computed = zlib.crc32(data, self._crc[data_type]) & 0xFFFFFFFF
        assert checksum == checksum_computed, "record checksum is invalid, data may be corrupt"
        self._index += data_length
        # 3. Hand back the record.
        return data_type, data

    def scan(self) -> Optional[str]:
        """Read the next logical record from the log.

        :return: the record decoded as UTF-8, or ``None`` when the end of the
            file is reached (including in the middle of a fragmented record)
        :raises AssertionError: on invalid padding, a corrupt record, or an
            unexpected record type
        """
        # 1. Inverse of the writer's padding step: if the rest of the current
        #    block cannot hold a record header, it must be zero padding.
        offset = self._index % LEVELDBLOG_BLOCK_LEN
        space_left = LEVELDBLOG_BLOCK_LEN - offset
        if space_left < LEVELDBLOG_HEADER_LEN:
            pad = self._fp.read(space_left)
            if not pad:
                # write() only emits the padding at the start of the *next*
                # write, so a file whose last record ends near a block boundary
                # legitimately stops here without padding: this is EOF.
                return None
            # The padding must be all zero bytes.
            assert pad == strtobytes("\x00" * space_left), "invalid padding"
            self._index += space_left
        # 2. Read one physical record.
        record = self._scan_record()
        if record is None:  # eof
            return None
        dtype, data = record
        if dtype == LEVELDBLOG_FULL:
            return bytestostr(data)
        # 3. Fragmented record: keep reading until the LAST fragment.  No
        #    padding check is done between fragments; write() sizes the FIRST
        #    and MIDDLE fragments so that each ends exactly on a block boundary.
        assert dtype == LEVELDBLOG_FIRST, f"expected record to be type {LEVELDBLOG_FIRST} but found {dtype}"
        while True:
            record = self._scan_record()
            if record is None:  # eof
                return None
            dtype, new_data = record
            if dtype == LEVELDBLOG_LAST:
                data += new_data
                break
            assert dtype == LEVELDBLOG_MIDDLE, f"expected record to be type {LEVELDBLOG_MIDDLE} but found {dtype}"
            data += new_data
        return bytestostr(data)

    # ---------------------------------- write ----------------------------------

    def open_for_write(self, filename: str):
        """Create a new log file and write the file header.

        :param filename: path of the file to create; it must not exist yet
            (the file is opened in exclusive-creation mode)
        """
        self._filename = filename
        self._fp = open(filename, "xb")
        # Start counting from the beginning of the new file, mirroring
        # open_for_scan, so block alignment is right even on a reused instance.
        self._index = 0
        # Write the file header; its length equals LEVELDBLOG_HEADER_LEN.
        data = struct.pack(
            "<4sHB",
            strtobytes(LEVELDBLOG_HEADER_IDENT),
            LEVELDBLOG_HEADER_MAGIC,
            LEVELDBLOG_HEADER_VERSION,
        )
        assert len(data) == LEVELDBLOG_HEADER_LEN, f"header size is {len(data)} bytes, expected {LEVELDBLOG_HEADER_LEN}"
        self._fp.write(data)
        self._index += len(data)

    def _write_record(self, data: bytes, data_type: int = LEVELDBLOG_FULL):
        """Write one physical record (header + payload) at the current offset.

        :param data: payload; together with the header it must fit in the
            remainder of the current block
        :param data_type: one of the ``LEVELDBLOG_*`` record types
        :raises AssertionError: if the record does not fit in the current block
        """
        assert len(data) + LEVELDBLOG_HEADER_LEN <= (
            LEVELDBLOG_BLOCK_LEN - self._index % LEVELDBLOG_BLOCK_LEN
        ), "not enough space to write new records"
        data_length = len(data)
        # The checksum is the CRC32 over the record type followed by the data.
        checksum = zlib.crc32(data, self._crc[data_type]) & 0xFFFFFFFF
        # Header layout "<IHB": checksum, payload length, record type.
        # I: unsigned int (4 bytes), H: unsigned short (2 bytes), B: unsigned char (1 byte)
        self._fp.write(struct.pack("<IHB", checksum, data_length, data_type))
        if data_length:
            self._fp.write(data)
        self._index += LEVELDBLOG_HEADER_LEN + len(data)

    def write(self, s: str) -> tuple[int, int, int]:
        """Append one logical record to the log and sync it to disk.

        :param s: the data to write; must be a string
        :return: ``(start_offset, current_offset, flushed_offset)``
        """
        data = strtobytes(s)
        # 1. Work out where we are inside the current block.
        start_offset = self._index
        offset = self._index % LEVELDBLOG_BLOCK_LEN
        space_left = LEVELDBLOG_BLOCK_LEN - offset
        data_used = 0
        data_left = len(data)
        # 2. Not even a header fits: zero-pad and move on to the next block.
        if space_left < LEVELDBLOG_HEADER_LEN:
            pad = "\x00" * space_left
            self._fp.write(strtobytes(pad))
            self._index += space_left
            space_left = LEVELDBLOG_BLOCK_LEN
        # 3. The whole record fits in the current block: write it as FULL.
        if data_left + LEVELDBLOG_HEADER_LEN <= space_left:
            self._write_record(data)
        # 4. Otherwise split it (note we may be in the middle of a block).
        else:
            # 4.1 FIRST fragment fills the rest of the current block, so the
            #     following fragments start on a block boundary.
            data_room = space_left - LEVELDBLOG_HEADER_LEN
            self._write_record(data[:data_room], LEVELDBLOG_FIRST)
            data_used += data_room
            data_left -= data_room
            assert data_left, "data_left should be non-zero"
            # 4.2 MIDDLE fragments each occupy one whole block.
            while data_left > LEVELDBLOG_DATA_LEN:
                self._write_record(
                    data[data_used : data_used + LEVELDBLOG_DATA_LEN],
                    LEVELDBLOG_MIDDLE,
                )
                data_used += LEVELDBLOG_DATA_LEN
                data_left -= LEVELDBLOG_DATA_LEN
            # 4.3 LAST fragment carries whatever remains.
            self._write_record(data[data_used:], LEVELDBLOG_LAST)
        # Flush and fsync so the complete record reaches the disk.
        self._fp.flush()
        os.fsync(self._fp.fileno())
        self._flush_offset = self._index

        return start_offset, self._index, self._flush_offset

    # ---------------------------------- helpers ----------------------------------

    def ensure_flushed(self) -> None:
        """Flush buffered writes of the underlying file object."""
        self._fp.flush()

    def close(self):
        """Close the underlying file handle; a no-op if no file was opened."""
        if self._fp is not None:
            self._fp.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.