Skip to content

Commit 7a2eaf2

Browse files
foxeng authored and wkozaczuk committed
virtio-fs: add basic read using the DAX window
When the DAX window is available from the device, the filesystem prefers to use it instead of the regular FUSE_READ request. If that fails, FUSE_READ is used as a fallback. To use the DAX window, a part of the file is mapped to it with FUSE_SETUPMAPPING, the contents are copied from it to the user buffers and the mapping is cleaned up with FUSE_REMOVEMAPPING. In this naive implementation, the window is used for a single mapping at a time, with no caching or readahead. Signed-off-by: Fotis Xenakis <[email protected]> Message-Id: <VI1PR03MB438337071D63DDD39C416ED7A6BB0@VI1PR03MB4383.eurprd03.prod.outlook.com>
1 parent cf78fa9 commit 7a2eaf2

File tree

1 file changed

+148
-23
lines changed

1 file changed

+148
-23
lines changed

fs/virtiofs/virtiofs_vnops.cc

Lines changed: 148 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,12 @@
2323
#include <sys/types.h>
2424
#include <osv/device.h>
2525
#include <osv/sched.hh>
26+
#include <osv/mmio.hh>
27+
#include <osv/contiguous_alloc.hh>
2628

2729
#include "virtiofs.hh"
2830
#include "virtiofs_i.hh"
31+
#include "drivers/virtio-fs.hh"
2932

3033
static constexpr uint32_t OPEN_FLAGS = O_RDONLY;
3134

@@ -183,14 +186,142 @@ static int virtiofs_readlink(struct vnode* vnode, struct uio* uio)
183186
return uiomove(link_path.get(), strlen(link_path.get()), uio);
184187
}
185188

189+
// Read @read_amt bytes from @inode, using the DAX window.
190+
static int virtiofs_read_direct(virtiofs_inode& inode, u64 file_handle,
191+
u64 read_amt, fuse_strategy& strategy, struct uio& uio)
192+
{
193+
auto* drv = static_cast<virtio::fs*>(strategy.drv);
194+
auto* dax = drv->get_dax();
195+
// Enter the critical path: setup mapping -> read -> remove mapping
196+
std::lock_guard<mutex> guard {dax->lock};
197+
198+
// Setup mapping
199+
// NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, from
200+
// the spec: "Alignment constraints for FUSE_SETUPMAPPING and
201+
// FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT
202+
// negotiation"):
203+
// - foffset: multiple of map_alignment from FUSE_INIT
204+
// - len: not larger than remaining file?
205+
// - moffset: multiple of map_alignment from FUSE_INIT
206+
// In practice, map_alignment is the host's page size, because foffset and
207+
// moffset are passed to mmap() on the host.
208+
std::unique_ptr<fuse_setupmapping_in> in_args {
209+
new (std::nothrow) fuse_setupmapping_in()};
210+
if (!in_args) {
211+
return ENOMEM;
212+
}
213+
in_args->fh = file_handle;
214+
in_args->flags = 0;
215+
uint64_t moffset = 0;
216+
in_args->moffset = moffset;
217+
218+
auto map_align = drv->get_map_alignment();
219+
if (map_align < 0) {
220+
kprintf("[virtiofs] inode %lld, map alignment not set\n", inode.nodeid);
221+
return ENOTSUP;
222+
}
223+
uint64_t alignment = 1ul << map_align;
224+
auto foffset = align_down(static_cast<uint64_t>(uio.uio_offset), alignment);
225+
in_args->foffset = foffset;
226+
227+
// The possible excess part of the file mapped due to alignment constraints
228+
// NOTE: map_excess <= alignemnt
229+
auto map_excess = uio.uio_offset - foffset;
230+
if (moffset + map_excess >= dax->len) {
231+
// No usable room in DAX window due to map_excess
232+
return ENOBUFS;
233+
}
234+
// Actual read amount is read_amt, or what fits in the DAX window
235+
auto read_amt_act = std::min<uint64_t>(read_amt,
236+
dax->len - moffset - map_excess);
237+
in_args->len = read_amt_act + map_excess;
238+
239+
virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, "
240+
"moffset=%lld)\n", inode.nodeid, in_args->foffset,
241+
in_args->len, in_args->moffset);
242+
auto error = fuse_req_send_and_receive_reply(&strategy, FUSE_SETUPMAPPING,
243+
inode.nodeid, in_args.get(), sizeof(*in_args), nullptr, 0);
244+
if (error) {
245+
kprintf("[virtiofs] inode %lld, mapping setup failed\n", inode.nodeid);
246+
return error;
247+
}
248+
249+
// Read from the DAX window
250+
// NOTE: It shouldn't be necessary to use the mmio* interface (i.e. volatile
251+
// accesses). From the spec: "Drivers map this shared memory region with
252+
// writeback caching as if it were regular RAM."
253+
// The location of the requested data in the DAX window
254+
auto req_data = dax->addr + moffset + map_excess;
255+
error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio);
256+
if (error) {
257+
kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid);
258+
return error;
259+
}
260+
261+
// Remove mapping
262+
// NOTE: This is only necessary when FUSE_SETUPMAPPING fails. From the spec:
263+
// "If the device runs out of resources the FUSE_SETUPMAPPING request fails
264+
// until resources are available again following FUSE_REMOVEMAPPING."
265+
auto r_in_args_size = sizeof(fuse_removemapping_in) +
266+
sizeof(fuse_removemapping_one);
267+
std::unique_ptr<u8> r_in_args {new (std::nothrow) u8[r_in_args_size]};
268+
if (!r_in_args) {
269+
return ENOMEM;
270+
}
271+
auto r_in = new (r_in_args.get()) fuse_removemapping_in();
272+
auto r_one = new (r_in_args.get() + sizeof(fuse_removemapping_in))
273+
fuse_removemapping_one();
274+
r_in->count = 1;
275+
r_one->moffset = in_args->moffset;
276+
r_one->len = in_args->len;
277+
278+
virtiofs_debug("inode %lld, removing mapping (moffset=%lld, len=%lld)\n",
279+
inode.nodeid, r_one->moffset, r_one->len);
280+
error = fuse_req_send_and_receive_reply(&strategy, FUSE_REMOVEMAPPING,
281+
inode.nodeid, r_in_args.get(), r_in_args_size, nullptr, 0);
282+
if (error) {
283+
kprintf("[virtiofs] inode %lld, mapping removal failed\n",
284+
inode.nodeid);
285+
return error;
286+
}
287+
288+
return 0;
289+
}
290+
291+
// Read @read_amt bytes from @inode, using the fallback FUSE_READ mechanism.
292+
static int virtiofs_read_fallback(virtiofs_inode& inode, u64 file_handle,
293+
u32 read_amt, u32 flags, fuse_strategy& strategy, struct uio& uio)
294+
{
295+
std::unique_ptr<fuse_read_in> in_args {new (std::nothrow) fuse_read_in()};
296+
std::unique_ptr<void, std::function<void(void*)>> buf {
297+
memory::alloc_phys_contiguous_aligned(read_amt,
298+
alignof(std::max_align_t)), memory::free_phys_contiguous_aligned };
299+
if (!in_args | !buf) {
300+
return ENOMEM;
301+
}
302+
in_args->fh = file_handle;
303+
in_args->offset = uio.uio_offset;
304+
in_args->size = read_amt;
305+
in_args->flags = flags;
306+
307+
virtiofs_debug("inode %lld, reading %lld bytes at offset %lld\n",
308+
inode.nodeid, read_amt, uio.uio_offset);
309+
auto error = fuse_req_send_and_receive_reply(&strategy, FUSE_READ,
310+
inode.nodeid, in_args.get(), sizeof(*in_args), buf.get(), read_amt);
311+
if (error) {
312+
kprintf("[virtiofs] inode %lld, read failed\n", inode.nodeid);
313+
return error;
314+
}
315+
316+
return uiomove(buf.get(), read_amt, &uio);
317+
}
318+
186319
// TODO: Optimize it to reduce number of exits to host (each
187320
// fuse_req_send_and_receive_reply()) by reading eagerly "ahead/around" just
188321
// like ROFS does and caching it
189322
static int virtiofs_read(struct vnode* vnode, struct file* fp, struct uio* uio,
190323
int ioflag)
191324
{
192-
auto* inode = static_cast<virtiofs_inode*>(vnode->v_data);
193-
194325
// Can't read directories
195326
if (vnode->v_type == VDIR) {
196327
return EISDIR;
@@ -212,32 +343,26 @@ static int virtiofs_read(struct vnode* vnode, struct file* fp, struct uio* uio,
212343
return 0;
213344
}
214345

346+
auto* inode = static_cast<virtiofs_inode*>(vnode->v_data);
347+
auto* file_data = static_cast<virtiofs_file_data*>(fp->f_data);
348+
auto* strategy = static_cast<fuse_strategy*>(vnode->v_mount->m_data);
349+
215350
// Total read amount is what they requested, or what is left
216351
auto read_amt = std::min<uint64_t>(uio->uio_resid,
217352
inode->attr.size - uio->uio_offset);
218-
std::unique_ptr<u8[]> buf {new (std::nothrow) u8[read_amt]};
219-
std::unique_ptr<fuse_read_in> in_args {new (std::nothrow) fuse_read_in()};
220-
if (!buf || !in_args) {
221-
return ENOMEM;
222-
}
223-
auto* f_data = static_cast<virtiofs_file_data*>(file_data(fp));
224-
in_args->fh = f_data->file_handle;
225-
in_args->offset = uio->uio_offset;
226-
in_args->size = read_amt;
227-
in_args->flags = ioflag;
228353

229-
virtiofs_debug("inode %lld, reading %lld bytes at offset %lld\n",
230-
inode->nodeid, read_amt, uio->uio_offset);
354+
auto* drv = static_cast<virtio::fs*>(strategy->drv);
355+
if (drv->get_dax()) {
356+
// Try to read from DAX
357+
if (!virtiofs_read_direct(*inode, file_data->file_handle, read_amt,
358+
*strategy, *uio)) {
231359

232-
auto* strategy = static_cast<fuse_strategy*>(vnode->v_mount->m_data);
233-
auto error = fuse_req_send_and_receive_reply(strategy, FUSE_READ,
234-
inode->nodeid, in_args.get(), sizeof(*in_args), buf.get(), read_amt);
235-
if (error) {
236-
kprintf("[virtiofs] inode %lld, read failed\n", inode->nodeid);
237-
return error;
360+
return 0;
361+
}
238362
}
239-
240-
return uiomove(buf.get(), read_amt, uio);
363+
// DAX unavailable or failed, use fallback
364+
return virtiofs_read_fallback(*inode, file_data->file_handle, read_amt,
365+
ioflag, *strategy, *uio);
241366
}
242367

243368
static int virtiofs_readdir(struct vnode* vnode, struct file* fp,
@@ -307,7 +432,7 @@ struct vnops virtiofs_vnops = {
307432
virtiofs_truncate, /* truncate - returns error when called */
308433
virtiofs_link, /* link - returns error when called */
309434
virtiofs_arc, /* arc */ //TODO: Implement to allow memory re-use when
310-
// mapping files, investigate using virtio-fs DAX
435+
// mapping files
311436
virtiofs_fallocate, /* fallocate - returns error when called */
312437
virtiofs_readlink, /* read link */
313438
virtiofs_symlink /* symbolic link - returns error when called */

0 commit comments

Comments
 (0)