2323#include < sys/types.h>
2424#include < osv/device.h>
2525#include < osv/sched.hh>
26+ #include < osv/mmio.hh>
27+ #include < osv/contiguous_alloc.hh>
2628
2729#include " virtiofs.hh"
2830#include " virtiofs_i.hh"
31+ #include " drivers/virtio-fs.hh"
2932
3033static constexpr uint32_t OPEN_FLAGS = O_RDONLY;
3134
@@ -183,14 +186,142 @@ static int virtiofs_readlink(struct vnode* vnode, struct uio* uio)
183186 return uiomove (link_path.get (), strlen (link_path.get ()), uio);
184187}
185188
189+ // Read @read_amt bytes from @inode, using the DAX window.
190+ static int virtiofs_read_direct (virtiofs_inode& inode, u64 file_handle,
191+ u64 read_amt, fuse_strategy& strategy, struct uio & uio)
192+ {
193+ auto * drv = static_cast <virtio::fs*>(strategy.drv );
194+ auto * dax = drv->get_dax ();
195+ // Enter the critical path: setup mapping -> read -> remove mapping
196+ std::lock_guard<mutex> guard {dax->lock };
197+
198+ // Setup mapping
199+ // NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, from
200+ // the spec: "Alignment constraints for FUSE_SETUPMAPPING and
201+ // FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT
202+ // negotiation"):
203+ // - foffset: multiple of map_alignment from FUSE_INIT
204+ // - len: not larger than remaining file?
205+ // - moffset: multiple of map_alignment from FUSE_INIT
206+ // In practice, map_alignment is the host's page size, because foffset and
207+ // moffset are passed to mmap() on the host.
208+ std::unique_ptr<fuse_setupmapping_in> in_args {
209+ new (std::nothrow) fuse_setupmapping_in ()};
210+ if (!in_args) {
211+ return ENOMEM;
212+ }
213+ in_args->fh = file_handle;
214+ in_args->flags = 0 ;
215+ uint64_t moffset = 0 ;
216+ in_args->moffset = moffset;
217+
218+ auto map_align = drv->get_map_alignment ();
219+ if (map_align < 0 ) {
220+ kprintf (" [virtiofs] inode %lld, map alignment not set\n " , inode.nodeid );
221+ return ENOTSUP;
222+ }
223+ uint64_t alignment = 1ul << map_align;
224+ auto foffset = align_down (static_cast <uint64_t >(uio.uio_offset ), alignment);
225+ in_args->foffset = foffset;
226+
227+ // The possible excess part of the file mapped due to alignment constraints
228+ // NOTE: map_excess <= alignemnt
229+ auto map_excess = uio.uio_offset - foffset;
230+ if (moffset + map_excess >= dax->len ) {
231+ // No usable room in DAX window due to map_excess
232+ return ENOBUFS;
233+ }
234+ // Actual read amount is read_amt, or what fits in the DAX window
235+ auto read_amt_act = std::min<uint64_t >(read_amt,
236+ dax->len - moffset - map_excess);
237+ in_args->len = read_amt_act + map_excess;
238+
239+ virtiofs_debug (" inode %lld, setting up mapping (foffset=%lld, len=%lld, "
240+ " moffset=%lld)\n " , inode.nodeid , in_args->foffset ,
241+ in_args->len , in_args->moffset );
242+ auto error = fuse_req_send_and_receive_reply (&strategy, FUSE_SETUPMAPPING,
243+ inode.nodeid , in_args.get (), sizeof (*in_args), nullptr , 0 );
244+ if (error) {
245+ kprintf (" [virtiofs] inode %lld, mapping setup failed\n " , inode.nodeid );
246+ return error;
247+ }
248+
249+ // Read from the DAX window
250+ // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. volatile
251+ // accesses). From the spec: "Drivers map this shared memory region with
252+ // writeback caching as if it were regular RAM."
253+ // The location of the requested data in the DAX window
254+ auto req_data = dax->addr + moffset + map_excess;
255+ error = uiomove (const_cast <void *>(req_data), read_amt_act, &uio);
256+ if (error) {
257+ kprintf (" [virtiofs] inode %lld, uiomove failed\n " , inode.nodeid );
258+ return error;
259+ }
260+
261+ // Remove mapping
262+ // NOTE: This is only necessary when FUSE_SETUPMAPPING fails. From the spec:
263+ // "If the device runs out of resources the FUSE_SETUPMAPPING request fails
264+ // until resources are available again following FUSE_REMOVEMAPPING."
265+ auto r_in_args_size = sizeof (fuse_removemapping_in) +
266+ sizeof (fuse_removemapping_one);
267+ std::unique_ptr<u8 > r_in_args {new (std::nothrow) u8 [r_in_args_size]};
268+ if (!r_in_args) {
269+ return ENOMEM;
270+ }
271+ auto r_in = new (r_in_args.get ()) fuse_removemapping_in ();
272+ auto r_one = new (r_in_args.get () + sizeof (fuse_removemapping_in))
273+ fuse_removemapping_one ();
274+ r_in->count = 1 ;
275+ r_one->moffset = in_args->moffset ;
276+ r_one->len = in_args->len ;
277+
278+ virtiofs_debug (" inode %lld, removing mapping (moffset=%lld, len=%lld)\n " ,
279+ inode.nodeid , r_one->moffset , r_one->len );
280+ error = fuse_req_send_and_receive_reply (&strategy, FUSE_REMOVEMAPPING,
281+ inode.nodeid , r_in_args.get (), r_in_args_size, nullptr , 0 );
282+ if (error) {
283+ kprintf (" [virtiofs] inode %lld, mapping removal failed\n " ,
284+ inode.nodeid );
285+ return error;
286+ }
287+
288+ return 0 ;
289+ }
290+
291+ // Read @read_amt bytes from @inode, using the fallback FUSE_READ mechanism.
292+ static int virtiofs_read_fallback (virtiofs_inode& inode, u64 file_handle,
293+ u32 read_amt, u32 flags, fuse_strategy& strategy, struct uio & uio)
294+ {
295+ std::unique_ptr<fuse_read_in> in_args {new (std::nothrow) fuse_read_in ()};
296+ std::unique_ptr<void , std::function<void (void *)>> buf {
297+ memory::alloc_phys_contiguous_aligned (read_amt,
298+ alignof (std::max_align_t )), memory::free_phys_contiguous_aligned };
299+ if (!in_args | !buf) {
300+ return ENOMEM;
301+ }
302+ in_args->fh = file_handle;
303+ in_args->offset = uio.uio_offset ;
304+ in_args->size = read_amt;
305+ in_args->flags = flags;
306+
307+ virtiofs_debug (" inode %lld, reading %lld bytes at offset %lld\n " ,
308+ inode.nodeid , read_amt, uio.uio_offset );
309+ auto error = fuse_req_send_and_receive_reply (&strategy, FUSE_READ,
310+ inode.nodeid , in_args.get (), sizeof (*in_args), buf.get (), read_amt);
311+ if (error) {
312+ kprintf (" [virtiofs] inode %lld, read failed\n " , inode.nodeid );
313+ return error;
314+ }
315+
316+ return uiomove (buf.get (), read_amt, &uio);
317+ }
318+
186319// TODO: Optimize it to reduce number of exits to host (each
187320// fuse_req_send_and_receive_reply()) by reading eagerly "ahead/around" just
188321// like ROFS does and caching it
189322static int virtiofs_read (struct vnode * vnode, struct file * fp, struct uio * uio,
190323 int ioflag)
191324{
192- auto * inode = static_cast <virtiofs_inode*>(vnode->v_data );
193-
194325 // Can't read directories
195326 if (vnode->v_type == VDIR) {
196327 return EISDIR;
@@ -212,32 +343,26 @@ static int virtiofs_read(struct vnode* vnode, struct file* fp, struct uio* uio,
212343 return 0 ;
213344 }
214345
346+ auto * inode = static_cast <virtiofs_inode*>(vnode->v_data );
347+ auto * file_data = static_cast <virtiofs_file_data*>(fp->f_data );
348+ auto * strategy = static_cast <fuse_strategy*>(vnode->v_mount ->m_data );
349+
215350 // Total read amount is what they requested, or what is left
216351 auto read_amt = std::min<uint64_t >(uio->uio_resid ,
217352 inode->attr .size - uio->uio_offset );
218- std::unique_ptr<u8 []> buf {new (std::nothrow) u8 [read_amt]};
219- std::unique_ptr<fuse_read_in> in_args {new (std::nothrow) fuse_read_in ()};
220- if (!buf || !in_args) {
221- return ENOMEM;
222- }
223- auto * f_data = static_cast <virtiofs_file_data*>(file_data (fp));
224- in_args->fh = f_data->file_handle ;
225- in_args->offset = uio->uio_offset ;
226- in_args->size = read_amt;
227- in_args->flags = ioflag;
228353
229- virtiofs_debug (" inode %lld, reading %lld bytes at offset %lld\n " ,
230- inode->nodeid , read_amt, uio->uio_offset );
354+ auto * drv = static_cast <virtio::fs*>(strategy->drv );
355+ if (drv->get_dax ()) {
356+ // Try to read from DAX
357+ if (!virtiofs_read_direct (*inode, file_data->file_handle , read_amt,
358+ *strategy, *uio)) {
231359
232- auto * strategy = static_cast <fuse_strategy*>(vnode->v_mount ->m_data );
233- auto error = fuse_req_send_and_receive_reply (strategy, FUSE_READ,
234- inode->nodeid , in_args.get (), sizeof (*in_args), buf.get (), read_amt);
235- if (error) {
236- kprintf (" [virtiofs] inode %lld, read failed\n " , inode->nodeid );
237- return error;
360+ return 0 ;
361+ }
238362 }
239-
240- return uiomove (buf.get (), read_amt, uio);
363+ // DAX unavailable or failed, use fallback
364+ return virtiofs_read_fallback (*inode, file_data->file_handle , read_amt,
365+ ioflag, *strategy, *uio);
241366}
242367
243368static int virtiofs_readdir (struct vnode * vnode, struct file * fp,
@@ -307,7 +432,7 @@ struct vnops virtiofs_vnops = {
307432 virtiofs_truncate, /* truncate - returns error when called */
308433 virtiofs_link, /* link - returns error when called */
309434 virtiofs_arc, /* arc */ // TODO: Implement to allow memory re-use when
310- // mapping files, investigate using virtio-fs DAX
435+ // mapping files
311436 virtiofs_fallocate, /* fallocate - returns error when called */
312437 virtiofs_readlink, /* read link */
313438 virtiofs_symlink /* symbolic link - returns error when called */
0 commit comments