Skip to content

Commit 6d608dd

Browse files
Fix I/O replay timing
fio supports replaying I/Os from a previous invocation, captured either via blktrace or `write_iolog`. That support includes simulating the timing between I/Os by using the running timestamps from the capture log, recreating that timing by inserting delays in the replay submission path if necessary. The calculation for how much delay to insert is based on how long it takes to complete the most recent replayed I/O, as compared to the how long that same I/O took in the original workload recorded in the log. The issue is in iolog_delay(). It calculates the replay I/O execution time starting from just after issuing I/O to the kernel in td_io_queue() until the start of iolog_delay(). This excludes the time it actually took for the kernel to process the submission, which depending on the I/O engine can be significant. Because this time was under-measured, iolog_delay()'s added more delay than needed, since it thought the replay was executing too fast vs the logged I/O. This issue causes the total replay to take 2x or longer than the original workload due to the progressively-larger drift from these excessive delays. The fix is to change iolog_delay() to use the full execution path time for an I/O, including the kernel submission time. This is accomplished by removing the existing points where the replay I/O clock is started (td_io_queue and the async engines) and moving that into iolog_delay(). This means we're no longer technically matching the precise stage in the I/O path for replay vs the original workload, but this doesn't really matter since that's only an initial time skew reflected in the very first I/O operation. After that we only need to match the inter-I/O pacing. This change includes fixes to additional issues found in iolog_delay(): 1) The logic that measures how long usec_sleep() takes relative to how much sleep was requested was comparing against the original log delay value instead of the value adjusted by the last replay execution I/O, causing an under-adjustment when usec_sleep() took longer than requested, creating more total delay than necessary. 2) The logic wasn't accumulating the usec_sleep() adjustment from the above step across more than one replayed I/O. It would reset the adjustment to zero on the next I/O. I guess the logic was assuming the adjustment would never be larger than a single log delay interval. It also wasn't accumulating the replay I/O execution time across more than one interval, so for example if a replay I/O took 3x longer than the log I/O it would only adjust that down to a zero delay for one replay I/O, causing the 2x difference to never be accounted for and thus add to the total running replay time. Link: #1537 (comment) Signed-off-by: Adam Horshack ([email protected])
1 parent 051b578 commit 6d608dd

File tree

7 files changed

+17
-66
lines changed

7 files changed

+17
-66
lines changed

engines/io_uring.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -613,12 +613,6 @@ static void fio_ioring_queued(struct thread_data *td, int start, int nr)
613613

614614
start++;
615615
}
616-
617-
/*
618-
* only used for iolog
619-
*/
620-
if (td->o.read_iolog_file)
621-
memcpy(&td->last_issue, &now, sizeof(now));
622616
}
623617

624618
static int fio_ioring_commit(struct thread_data *td)

engines/libaio.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -368,12 +368,6 @@ static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us,
368368
memcpy(&io_u->issue_time, &now, sizeof(now));
369369
io_u_queued(td, io_u);
370370
}
371-
372-
/*
373-
* only used for iolog
374-
*/
375-
if (td->o.read_iolog_file)
376-
memcpy(&td->last_issue, &now, sizeof(now));
377371
}
378372

379373
static int fio_libaio_commit(struct thread_data *td)

engines/librpma_fio.c

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -597,16 +597,6 @@ int librpma_fio_client_commit(struct thread_data *td)
597597
}
598598
}
599599

600-
if ((fill_time = fio_fill_issue_time(td))) {
601-
fio_gettime(&now, NULL);
602-
603-
/*
604-
* only used for iolog
605-
*/
606-
if (td->o.read_iolog_file)
607-
memcpy(&td->last_issue, &now, sizeof(now));
608-
609-
}
610600
/* move executed io_us from queued[] to flight[] */
611601
for (i = 0; i < ccd->io_u_queued_nr; i++) {
612602
struct io_u *io_u = ccd->io_us_queued[i];

engines/rdma.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -832,12 +832,6 @@ static void fio_rdmaio_queued(struct thread_data *td, struct io_u **io_us,
832832
memcpy(&io_u->issue_time, &now, sizeof(now));
833833
io_u_queued(td, io_u);
834834
}
835-
836-
/*
837-
* only used for iolog
838-
*/
839-
if (td->o.read_iolog_file)
840-
memcpy(&td->last_issue, &now, sizeof(now));
841835
}
842836

843837
static int fio_rdmaio_commit(struct thread_data *td)

init.c

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -628,13 +628,6 @@ static int fixup_options(struct thread_data *td)
628628
}
629629
#endif
630630

631-
if (o->write_iolog_file && o->read_iolog_file) {
632-
log_err("fio: read iolog overrides write_iolog\n");
633-
free(o->write_iolog_file);
634-
o->write_iolog_file = NULL;
635-
ret |= warnings_fatal;
636-
}
637-
638631
if (o->zone_mode == ZONE_MODE_NONE && o->zone_size) {
639632
log_err("fio: --zonemode=none and --zonesize are not compatible.\n");
640633
ret |= 1;

ioengines.c

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -358,16 +358,8 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u)
358358

359359
if (td_ioengine_flagged(td, FIO_SYNCIO) ||
360360
async_ioengine_sync_trim(td, io_u)) {
361-
if (fio_fill_issue_time(td)) {
361+
if (fio_fill_issue_time(td))
362362
fio_gettime(&io_u->issue_time, NULL);
363-
364-
/*
365-
* only used for iolog
366-
*/
367-
if (td->o.read_iolog_file)
368-
memcpy(&td->last_issue, &io_u->issue_time,
369-
sizeof(io_u->issue_time));
370-
}
371363
}
372364

373365

@@ -444,16 +436,8 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u)
444436
if (!td_ioengine_flagged(td, FIO_SYNCIO) &&
445437
!async_ioengine_sync_trim(td, io_u)) {
446438
if (fio_fill_issue_time(td) &&
447-
!td_ioengine_flagged(td, FIO_ASYNCIO_SETS_ISSUE_TIME)) {
439+
!td_ioengine_flagged(td, FIO_ASYNCIO_SETS_ISSUE_TIME))
448440
fio_gettime(&io_u->issue_time, NULL);
449-
450-
/*
451-
* only used for iolog
452-
*/
453-
if (td->o.read_iolog_file)
454-
memcpy(&td->last_issue, &io_u->issue_time,
455-
sizeof(io_u->issue_time));
456-
}
457441
}
458442

459443
return ret;

iolog.c

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -80,23 +80,23 @@ void log_file(struct thread_data *td, struct fio_file *f,
8080

8181
static void iolog_delay(struct thread_data *td, unsigned long delay)
8282
{
83-
uint64_t usec = utime_since_now(&td->last_issue);
84-
unsigned long orig_delay = delay;
83+
uint64_t usec;
84+
unsigned long delay_wanted;
8585
uint64_t this_delay;
86-
struct timespec ts;
86+
struct timespec ts, ts_after_sleep;
8787

88-
if (delay < td->time_offset) {
89-
td->time_offset = 0;
90-
return;
91-
}
88+
fio_gettime(&ts, NULL);
89+
usec = utime_since(&td->last_issue, &ts) + td->time_offset;
9290

93-
delay -= td->time_offset;
94-
if (delay < usec)
91+
if (delay <= usec) {
92+
td->time_offset = usec-delay;
93+
td->last_issue = ts;
9594
return;
95+
}
9696

9797
delay -= usec;
9898

99-
fio_gettime(&ts, NULL);
99+
delay_wanted = delay;
100100
while (delay && !td->terminate) {
101101
this_delay = delay;
102102
if (this_delay > 500000)
@@ -106,11 +106,13 @@ static void iolog_delay(struct thread_data *td, unsigned long delay)
106106
delay -= this_delay;
107107
}
108108

109-
usec = utime_since_now(&ts);
110-
if (usec > orig_delay)
111-
td->time_offset = usec - orig_delay;
109+
fio_gettime(&ts_after_sleep, NULL);
110+
usec = utime_since(&ts, &ts_after_sleep);
111+
if (usec > delay_wanted)
112+
td->time_offset = usec - delay_wanted;
112113
else
113114
td->time_offset = 0;
115+
td->last_issue = ts_after_sleep;
114116
}
115117

116118
static int ipo_special(struct thread_data *td, struct io_piece *ipo)

0 commit comments

Comments
 (0)