Commit ea9cb44

aarch64: fix zfs support
This patch fixes ZFS support on aarch64. As issue #1131 explains, the ZFS page scanner clears the access flag in the PTEs of the memory-mapped chunks of files it scans. On Intel, the CPU automatically sets the flag on the first access (read or write) to such a page. On ARM, however, this must be done in software if the CPU lacks that capability (the RPi 4 and Odroid machines I have been using do not have it, possibly due to a QEMU limitation).

To set the access flag in software, this patch enhances the page fault handler to detect faults caused by a cleared access flag and to perform a manual page walk down to the leaf PTE, based on the virtual address retrieved from far_el1. It then sets the access flag of the PTE, and also the dirty flag if the fault was triggered by a write. Finally, it writes the PTE back to memory and issues the necessary `dsb ishst` to force completion of the writes to the page table entries, followed by a flush of the CPU pipeline.

This patch also adjusts `scripts/build` to support building ZFS images on ARM and makes ZFS the default filesystem there, as it is on x86_64. Besides running all unit tests on a ZFS image, I have verified that more involved tests like misc-zfs-io.cc pass as well.

Fixes #1131

Signed-off-by: Waldemar Kozaczuk <[email protected]>
1 parent d8d2719 commit ea9cb44
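
The hardware capability the commit message refers to is advertised by the ID_AA64MMFR1_EL1 register. As a point of reference, here is a minimal sketch of how a kernel could probe for it; this check is not part of the patch and the helper name is made up, but the field layout is from the ARMv8 architecture:

    #include <cstdint>

    // Hypothetical probe, not from this patch. ID_AA64MMFR1_EL1 bits [3:0]
    // (HAFDBS) report hardware management of page-table flags:
    //   0 - none: the OS must set the access/dirty flags in software
    //   1 - hardware sets the access flag
    //   2 - hardware sets both the access flag and the dirty state
    static inline bool cpu_updates_access_flag_in_hardware()
    {
        uint64_t mmfr1;
        asm volatile("mrs %0, id_aa64mmfr1_el1" : "=r"(mmfr1));
        return (mmfr1 & 0xf) >= 1;
    }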

File tree

3 files changed: +74 −13 lines

arch/aarch64/mmu.cc

Lines changed: 57 additions & 0 deletions

@@ -14,6 +14,59 @@
 #include "arch-cpu.hh"
 #include "exceptions.hh"
 
+#define ACCESS_FLAG_FAULT_LEVEL_3(esr) ((esr & 0b0111111) == 0x0b) // 0xb = 0b1011 indicates level 3
+#define ACCESS_FLAG_FAULT_LEVEL_3_WHEN_WRITE(esr) ((esr & 0b1111111) == 0x4b) // bit 6 (WnR) set: fault on write
+
+TRACEPOINT(trace_mmu_vm_access_flag_fault, "addr=%p", void *);
+
+template <typename T>
+T* phys_to_virt_cast(mmu::phys pa)
+{
+    void *virt = mmu::phys_mem + pa;
+    return static_cast<T*>(virt);
+}
+
+static void handle_access_flag_fault(exception_frame *ef, u64 addr) {
+    trace_mmu_vm_access_flag_fault((void*)addr);
+
+    // The access bit of a PTE (Page Table Entry) at level 3 got cleared, and we
+    // need to set it to handle this page fault. Therefore we do a page walk to
+    // navigate all the way down to level 3 and identify the relevant PTE.
+
+    // Start with the root PTE
+    auto root_pt = mmu::get_root_pt(addr);
+    auto root_ptep = mmu::hw_ptep<4>::force(root_pt);
+
+    // Identify the PTEP (PTE pointer) at level 0 (the template parameter is reversed):
+    // first identify the ptep table at this level ...
+    auto l3_ptep_table = mmu::hw_ptep<3>::force(phys_to_virt_cast<mmu::pt_element<3>>(root_ptep.read().next_pt_addr()));
+    // ... then access the ptep at the index encoded in the virtual address
+    auto l3_ptep = l3_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 3));
+
+    // Identify the PTEP at level 1 (first the ptep table, then the relevant ptep)
+    auto l2_ptep_table = mmu::hw_ptep<2>::force(phys_to_virt_cast<mmu::pt_element<2>>(l3_ptep.read().next_pt_addr()));
+    auto l2_ptep = l2_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 2));
+
+    // Identify the PTEP at level 2 (first the ptep table, then the relevant ptep)
+    auto l1_ptep_table = mmu::hw_ptep<1>::force(phys_to_virt_cast<mmu::pt_element<1>>(l2_ptep.read().next_pt_addr()));
+    auto l1_ptep = l1_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 1));
+
+    // Identify the PTEP at level 3 (first the ptep table, then the relevant ptep)
+    auto l0_ptep_table = mmu::hw_ptep<0>::force(phys_to_virt_cast<mmu::pt_element<0>>(l1_ptep.read().next_pt_addr()));
+    auto l0_ptep = l0_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 0));
+
+    // Read the leaf PTE
+    auto leaf_pte = l0_ptep.read();
+
+    leaf_pte.set_accessed(true);
+    if (ACCESS_FLAG_FAULT_LEVEL_3_WHEN_WRITE(ef->esr)) {
+        leaf_pte.set_dirty(true);
+    }
+
+    l0_ptep.write(leaf_pte);
+    mmu::synchronize_page_table_modifications();
+}
+
 void page_fault(exception_frame *ef)
 {
     sched::fpu_lock fpu;

@@ -39,6 +92,10 @@ void page_fault(exception_frame *ef)
         abort("trying to execute null pointer");
     }
 
+    if (ACCESS_FLAG_FAULT_LEVEL_3(ef->esr)) {
+        return handle_access_flag_fault(ef, addr);
+    }
+
     /* vm_fault might sleep, so check that the thread is preemptable,
      * and that interrupts in the saved pstate are enabled.
      * Then enable interrupts for the vm_fault.
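
The handler above ends with mmu::synchronize_page_table_modifications(), whose body is not part of this diff. A minimal sketch of what such a helper plausibly does, assuming it matches the barrier sequence described in the commit message (`dsb ishst` to complete the PTE writes, plus a pipeline flush):

    // Assumed implementation, inferred from the commit message; the actual
    // OSv helper may differ.
    static inline void synchronize_page_table_modifications()
    {
        asm volatile("dsb ishst" ::: "memory"); // complete stores to page table entries
        asm volatile("isb" ::: "memory");       // flush the CPU pipeline
    }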

scripts/build

Lines changed: 9 additions & 11 deletions

@@ -190,15 +190,7 @@ host_arch=$(uname -m)
 
 # Default manifest
 manifest=bootfs.manifest.skel
-if [[ "$host_arch" == "aarch64" || "$arch" == "aarch64" ]]; then
-    # We default to ROFS as ZFS is not supported on ARM until the issue #1131 is fixed
-    fs_type=${vars[fs]-rofs}
-    if [[ "$fs_type" == "rofs" ]]; then
-        vars[create_disk]="true"
-    fi
-else
-    fs_type=${vars[fs]-zfs}
-fi
+fs_type=${vars[fs]-zfs}
 usrskel_arg=
 case $fs_type in
 zfs)

@@ -215,6 +207,10 @@ ramfs)
     exit 2
 esac
 
+if [[ "$host_arch" == "aarch64" || "$arch" == "aarch64" ]]; then
+    vars[create_disk]="true"
+fi
+
 if test -n "${vars[usrskel]}"
 then
     # Override default skel

@@ -305,7 +301,9 @@ if [[ ${vars[create_disk]} == "true" ]]; then
     bare="$SRC"/scripts/disk.bin
     raw_disk=disk
     qcow2_disk=disk
-    upload_kernel_mode="-k"
+    if [[ "$arch" == 'x64' ]]; then
+        upload_kernel_mode="-k"
+    fi
 else
     partition_offset=$kernel_end
     bare=loader.img

@@ -318,7 +316,7 @@ create_zfs_disk() {
     "$SRC"/scripts/imgedit.py setpartition "-f raw ${raw_disk}.raw" 2 $partition_offset $partition_size
     qemu-img convert -f raw -O qcow2 $raw_disk.raw $qcow2_disk.img
     qemu-img resize $qcow2_disk.img ${image_size}b >/dev/null 2>&1
-    "$SRC"/scripts/upload_manifest.py -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" $upload_kernel_mode
+    "$SRC"/scripts/upload_manifest.py --arch=$arch -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" $upload_kernel_mode
 }
 
 create_rofs_disk() {
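
With the ARM-specific branching removed, ZFS becomes the default filesystem on aarch64 just as on x86_64, and the disk image is now always created on ARM. Judging by the `vars` syntax the script uses (e.g. `${vars[fs]-zfs}`), a cross-build would then look something like `./scripts/build arch=aarch64 fs=zfs` (the exact invocation may vary), with the `-k` direct-kernel upload mode reserved for x64.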

scripts/upload_manifest.py

Lines changed: 8 additions & 2 deletions

@@ -7,6 +7,8 @@
 import io
 StringIO = io.StringIO
 
+host_arch = os.uname().machine
+
 def find_free_port():
     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
         s.bind(('localhost', 0))

@@ -137,7 +139,11 @@ def main():
         make_option('-k',
                     dest='kernel',
                     action='store_true',
-                    help='run OSv in direct kernel mode')
+                    help='run OSv in direct kernel mode'),
+        make_option('--arch',
+                    dest='arch',
+                    default=host_arch,
+                    help="specify QEMU architecture: x86_64, aarch64")
     ])
 
     (options, args) = opt.parse_args()

@@ -155,7 +161,7 @@ def main():
         kernel_mode_flag = '-k --kernel-path build/release/loader-stripped.elf'
     else:
         kernel_mode_flag = ''
-    osv = subprocess.Popen('cd ../..; scripts/run.py %s --vnc none -m 512 -c1 -i "%s" --block-device-cache unsafe -s -e "--norandom --nomount --noinit /tools/mkfs.so; /tools/cpiod.so --prefix /zfs/zfs/; /zfs.so set compression=off osv" --forward tcp:127.0.0.1:%s-:10000' % (kernel_mode_flag,image_path,upload_port), shell=True, stdout=subprocess.PIPE)
+    osv = subprocess.Popen('cd ../..; scripts/run.py %s --arch=%s --vnc none -m 512 -c1 -i "%s" --block-device-cache unsafe -s -e "--norandom --nomount --noinit /tools/mkfs.so; /tools/cpiod.so --prefix /zfs/zfs/; /zfs.so set compression=off osv" --forward tcp:127.0.0.1:%s-:10000' % (kernel_mode_flag,options.arch,image_path,upload_port), shell=True, stdout=subprocess.PIPE)
 
     upload(osv, manifest, depends, upload_port)
0 commit comments

Comments
 (0)