Skip to content

Commit 253919f

Browse files
foxeng, wkozaczuk
authored and committed
virtio-fs: implement dax window manager
For details on the manager's policy, please see fs/virtiofs/virtiofs_dax.hh. Signed-off-by: Fotis Xenakis <[email protected]> Message-Id: <AM0PR03MB62924F7C3278ED342493D529A6960@AM0PR03MB6292.eurprd03.prod.outlook.com>
1 parent 0db76eb commit 253919f

File tree

3 files changed

+397
-19
lines changed

3 files changed

+397
-19
lines changed

Makefile

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -536,23 +536,23 @@ bsd += bsd/porting/mmu.o
536536
bsd += bsd/porting/pcpu.o
537537
bsd += bsd/porting/bus_dma.o
538538
bsd += bsd/porting/kobj.o
539-
bsd += bsd/sys/netinet/if_ether.o
540-
bsd += bsd/sys/compat/linux/linux_socket.o
541-
bsd += bsd/sys/compat/linux/linux_ioctl.o
542-
bsd += bsd/sys/net/if_ethersubr.o
543-
bsd += bsd/sys/net/if_llatbl.o
544-
bsd += bsd/sys/net/radix.o
545-
bsd += bsd/sys/net/route.o
546-
bsd += bsd/sys/net/raw_cb.o
547-
bsd += bsd/sys/net/raw_usrreq.o
548-
bsd += bsd/sys/net/rtsock.o
549-
bsd += bsd/sys/net/netisr.o
550-
bsd += bsd/sys/net/netisr1.o
551-
bsd += bsd/sys/net/if_dead.o
552-
bsd += bsd/sys/net/if_clone.o
553-
bsd += bsd/sys/net/if_loop.o
554-
bsd += bsd/sys/net/if.o
555-
bsd += bsd/sys/net/pfil.o
539+
bsd += bsd/sys/netinet/if_ether.o
540+
bsd += bsd/sys/compat/linux/linux_socket.o
541+
bsd += bsd/sys/compat/linux/linux_ioctl.o
542+
bsd += bsd/sys/net/if_ethersubr.o
543+
bsd += bsd/sys/net/if_llatbl.o
544+
bsd += bsd/sys/net/radix.o
545+
bsd += bsd/sys/net/route.o
546+
bsd += bsd/sys/net/raw_cb.o
547+
bsd += bsd/sys/net/raw_usrreq.o
548+
bsd += bsd/sys/net/rtsock.o
549+
bsd += bsd/sys/net/netisr.o
550+
bsd += bsd/sys/net/netisr1.o
551+
bsd += bsd/sys/net/if_dead.o
552+
bsd += bsd/sys/net/if_clone.o
553+
bsd += bsd/sys/net/if_loop.o
554+
bsd += bsd/sys/net/if.o
555+
bsd += bsd/sys/net/pfil.o
556556
bsd += bsd/sys/net/routecache.o
557557
bsd += bsd/sys/netinet/in.o
558558
bsd += bsd/sys/netinet/in_pcb.o
@@ -1771,7 +1771,8 @@ fs_objs += rofs/rofs_vfsops.o \
17711771
rofs/rofs_common.o
17721772

17731773
fs_objs += virtiofs/virtiofs_vfsops.o \
1774-
virtiofs/virtiofs_vnops.o
1774+
virtiofs/virtiofs_vnops.o \
1775+
virtiofs/virtiofs_dax.o
17751776

17761777
fs_objs += pseudofs/pseudofs.o
17771778
fs_objs += procfs/procfs_vnops.o
@@ -1978,7 +1979,7 @@ libuutil-objects = $(foreach file, $(libuutil-file-list), $(out)/bsd/cddl/contri
19781979

19791980
define libuutil-includes
19801981
bsd/cddl/contrib/opensolaris/lib/libuutil/common
1981-
bsd/cddl/compat/opensolaris/include
1982+
bsd/cddl/compat/opensolaris/include
19821983
bsd/sys/cddl/contrib/opensolaris/uts/common
19831984
bsd/sys/cddl/compat/opensolaris
19841985
bsd/cddl/contrib/opensolaris/head

fs/virtiofs/virtiofs_dax.cc

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
/*
2+
* Copyright (C) 2020 Fotis Xenakis
3+
*
4+
* This work is open source software, licensed under the terms of the
5+
* BSD license as described in the LICENSE file in the top-level directory.
6+
*/
7+
8+
#include <algorithm>
9+
#include <mutex>
10+
11+
#include <osv/debug.h>
12+
#include <osv/uio.h>
13+
14+
#include "fuse_kernel.h"
15+
#include "virtiofs.hh"
16+
#include "virtiofs_dax.hh"
17+
#include "virtiofs_i.hh"
18+
19+
namespace virtiofs {
20+
21+
/*
 * Copies file data for inode out of the DAX window into uio, starting at
 * uio.uio_offset.
 *
 * If the chunk containing uio.uio_offset is not yet mapped, a new mapping is
 * requested first: either covering the rest of the file (aggressive) or just
 * enough chunks for read_amt bytes. Existing mappings may be evicted to make
 * room.
 *
 * At most read_amt bytes are copied, but possibly fewer: the copy is capped
 * by the bytes available in the mapping that was found or established.
 *
 * The whole operation runs with _lock held.
 *
 * Returns 0 on success, otherwise the error reported by map() or uiomove().
 */
int dax_manager::read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt,
    struct uio& uio, bool aggressive)
{
    std::lock_guard<mutex> guard {_lock};

    // Split the file offset into a chunk index and an offset within that chunk
    const chunk fstart = uio.uio_offset / _chunk_size;
    const off_t coffset = uio.uio_offset % _chunk_size;

    mapping_part mp;
    const bool cached = find(inode.nodeid, fstart, mp);
    if (!cached) {
        // Bytes to map, counted from the start of the first chunk
        size_t to_map = coffset;
        // Aggressive: the rest of the file. Otherwise: just read_amt.
        to_map += aggressive ? (inode.attr.size - uio.uio_offset) : read_amt;

        // Round up to whole chunks
        chunk nchunks = to_map / _chunk_size;
        if (to_map % _chunk_size > 0) {
            nchunks++;
        }

        // NOTE: This relies on the fact that requesting a mapping longer than
        // the remaining file works (see mmap() on the host). If that didn't
        // work, we would have to request exact mappings (byte-granularity,
        // rather than chunk-granularity).
        const int map_error = map(inode.nodeid, file_handle, nchunks, fstart,
            mp, true);
        if (map_error) {
            return map_error;
        }
    }

    // Location of the requested data inside the window, and how much of it the
    // mapping can actually serve
    auto req_data = _window->addr + (mp.mstart * _chunk_size) + coffset;
    auto read_amt_act = std::min<size_t>(read_amt,
        (mp.nchunks * _chunk_size) - coffset);

    if (cached) {
        // Requested data (at least some initial) was already mapped
        virtiofs_debug("inode %lld, found in DAX (foffset=%lld, len=%lld, "
            "moffset=%lld)\n", inode.nodeid, uio.uio_offset, read_amt_act,
            (mp.mstart * _chunk_size) + coffset);
    }

    // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. volatile
    // accesses). From the spec: "Drivers map this shared memory region with
    // writeback caching as if it were regular RAM."
    const int copy_error = uiomove(const_cast<void*>(req_data), read_amt_act,
        &uio);
    if (copy_error) {
        kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid);
    }
    return copy_error;
}
78+
79+
int dax_manager::map(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
80+
chunk fstart, mapping_part& mapped, bool evict)
81+
{
82+
// If necessary, unmap just enough chunks
83+
auto empty = _window_chunks - first_empty();
84+
if (evict && empty < nchunks) {
85+
mapping_part mp;
86+
auto error = unmap(nchunks - empty, mp, false);
87+
if (error) {
88+
return error;
89+
}
90+
empty += mp.nchunks;
91+
}
92+
auto to_map = std::min<chunk>(nchunks, empty);
93+
if (to_map == 0) {
94+
// The window is full and evict is false, or nchunks is 0
95+
mapped.mstart = _window_chunks - empty;
96+
mapped.nchunks = 0;
97+
return (nchunks == 0) ? 0 : ENOBUFS;
98+
}
99+
100+
// Map new chunks
101+
auto mstart = _window_chunks - empty;
102+
auto error = map_ll(nodeid, file_handle, to_map, fstart, mstart);
103+
if (error) {
104+
return error;
105+
}
106+
if (!_mappings.empty()) {
107+
auto& m {_mappings.back()};
108+
if (m.nodeid == nodeid && m.fstart + m.nchunks == fstart) {
109+
// Extend previous mapping
110+
m.nchunks += to_map;
111+
mapped.mstart = mstart;
112+
mapped.nchunks = to_map;
113+
return 0;
114+
}
115+
}
116+
_mappings.emplace_back(nodeid, to_map, fstart, mstart);
117+
mapped.mstart = mstart;
118+
mapped.nchunks = to_map;
119+
return 0;
120+
}
121+
122+
/*
 * Releases up to nchunks chunks from the end of the window, walking the
 * recorded mappings from last to first. Whole mappings are removed while they
 * fit in the remaining budget; the first one that doesn't fit is shortened
 * instead.
 *
 * When deep is true the released window range is also removed through
 * unmap_ll() before any bookkeeping is changed; if that fails, the recorded
 * mappings are left untouched.
 *
 * On return, unmapped.mstart is the new first empty chunk of the window and
 * unmapped.nchunks the number of chunks released.
 *
 * Returns 0 on success (including nchunks == 0), ENODATA if chunks were
 * requested but nothing could be released, or the error from unmap_ll().
 */
int dax_manager::unmap(chunk nchunks, mapping_part& unmapped, bool deep)
{
    // Phase 1: work out what has to change, without modifying anything yet
    chunk released = 0;               // total chunks that will be released
    chunk trimmed = 0;                // chunks cut off the surviving last mapping
    auto cut {_mappings.cend()};      // first mapping to be erased entirely

    auto rit {_mappings.crbegin()};
    while (released < nchunks && rit != _mappings.crend()) {
        const chunk remaining = nchunks - released;
        if (rit->nchunks > remaining) {
            // Only part of this mapping is needed: shorten it later
            trimmed = remaining;
            released = nchunks;
        } else {
            // The whole mapping goes away
            cut = rit.base() - 1;
            released += rit->nchunks;
        }
        rit++;
    }

    if (released == 0) {
        // The window is empty, or nchunks is 0
        unmapped.mstart = first_empty();
        unmapped.nchunks = 0;
        return (nchunks == 0) ? 0 : ENODATA;
    }

    // Phase 2: apply the changes
    if (deep) {
        auto error = unmap_ll(released, first_empty() - released);
        if (error) {
            return error;
        }
    }
    _mappings.erase(cut, _mappings.cend());
    if (trimmed > 0) {
        _mappings.back().nchunks -= trimmed;
    }

    unmapped.mstart = first_empty();
    unmapped.nchunks = released;
    return 0;
}
165+
166+
int dax_manager::map_ll(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
167+
chunk fstart, chunk mstart)
168+
{
169+
assert(mstart + nchunks <= _window_chunks);
170+
171+
// NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, from
172+
// the spec: "Alignment constraints for FUSE_SETUPMAPPING and
173+
// FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT
174+
// negotiation"):
175+
// - foffset: multiple of map_alignment from FUSE_INIT
176+
// - len: not larger than remaining file?
177+
// - moffset: multiple of map_alignment from FUSE_INIT
178+
// In practice, map_alignment is the host's page size, because foffset and
179+
// moffset are passed to mmap() on the host. These are satisfied by
180+
// _chunk_size being a multiple of map_alignment.
181+
182+
std::unique_ptr<fuse_setupmapping_in> in_args {
183+
new (std::nothrow) fuse_setupmapping_in()};
184+
if (!in_args) {
185+
return ENOMEM;
186+
}
187+
in_args->fh = file_handle;
188+
in_args->foffset = fstart * _chunk_size;
189+
in_args->len = nchunks * _chunk_size;
190+
in_args->flags = 0; // Read-only
191+
in_args->moffset = mstart * _chunk_size;
192+
193+
virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, "
194+
"moffset=%lld)\n", nodeid, in_args->foffset, in_args->len,
195+
in_args->moffset);
196+
auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_SETUPMAPPING,
197+
nodeid, in_args.get(), sizeof(*in_args), nullptr, 0);
198+
if (error) {
199+
kprintf("[virtiofs] inode %lld, mapping setup failed\n", nodeid);
200+
return error;
201+
}
202+
203+
return 0;
204+
}
205+
206+
/*
 * Sends a FUSE_REMOVEMAPPING request removing nchunks chunks from the window,
 * starting at window chunk mstart. Does not touch the manager's bookkeeping.
 *
 * Returns 0 on success, ENOMEM if the request arguments cannot be allocated,
 * or the error reported by the request.
 */
int dax_manager::unmap_ll(chunk nchunks, chunk mstart)
{
    // The range to remove must lie inside the window
    assert(mstart + nchunks <= _window_chunks);

    // NOTE: FUSE_REMOVEMAPPING accepts a fuse_removemapping_in followed by
    // fuse_removemapping_in.count fuse_removemapping_one arguments in general.
    // Here exactly one fuse_removemapping_one is sent.
    auto in_args_size = sizeof(fuse_removemapping_in) +
        sizeof(fuse_removemapping_one);

    // The buffer comes from array new, so it must be owned by the array form of
    // unique_ptr to be released with delete[] (unique_ptr<u8> would call plain
    // delete, which is undefined behaviour for array allocations).
    std::unique_ptr<u8[]> in_args {new (std::nothrow) u8[in_args_size]};
    if (!in_args) {
        return ENOMEM;
    }

    // Build both structures in place, back to back, as the request expects.
    // NOTE(review): r_one is placed directly after r_in; if
    // sizeof(fuse_removemapping_in) is not a multiple of
    // alignof(fuse_removemapping_one) this object is misaligned -- confirm
    // against fuse_kernel.h and the targets this must run on.
    auto r_in = new (in_args.get()) fuse_removemapping_in();
    auto r_one = new (in_args.get() + sizeof(fuse_removemapping_in))
        fuse_removemapping_one();
    r_in->count = 1;
    r_one->moffset = mstart * _chunk_size;
    r_one->len = nchunks * _chunk_size;

    // The nodeid is irrelevant for the current implementation of
    // FUSE_REMOVEMAPPING. If it needed to be set, would we need to make a
    // request per inode?
    uint64_t nodeid = 0;

    virtiofs_debug("inode %lld, removing mapping (moffset=%lld, len=%lld)\n",
        nodeid, r_one->moffset, r_one->len);
    auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_REMOVEMAPPING,
        nodeid, in_args.get(), in_args_size, nullptr, 0);
    if (error) {
        kprintf("[virtiofs] inode %lld, mapping removal failed\n", nodeid);
        return error;
    }

    return 0;
}
241+
242+
bool dax_manager::find(uint64_t nodeid, chunk fstart, mapping_part& found) const
243+
{
244+
for (auto& m : _mappings) {
245+
if (m.nodeid == nodeid &&
246+
m.fstart <= fstart &&
247+
m.fstart + m.nchunks > fstart) {
248+
249+
// m contains fstart
250+
auto excess = fstart - m.fstart; // excess contained in m
251+
found.nchunks = m.nchunks - excess;
252+
found.mstart = m.mstart + excess;
253+
return true;
254+
}
255+
}
256+
return false;
257+
}
258+
259+
/*
 * Returns the window chunk right after the last recorded mapping, or 0 when
 * no mappings are recorded.
 */
dax_manager::chunk dax_manager::first_empty() const
{
    if (!_mappings.empty()) {
        auto& last {_mappings.back()};
        return last.mstart + last.nchunks;
    }
    return 0;
}
267+
268+
}

0 commit comments

Comments
 (0)