diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index fa8e7fa691db..c11dc5f1aa0f 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -57,6 +57,7 @@
 #include
 #include
 #include
+#include <sys/vdev_anyraid.h>
 #include
 #include
 #include
@@ -110,6 +111,7 @@ enum {
 	ARG_ALLOCATED = 256,
 	ARG_BLOCK_BIN_MODE,
 	ARG_BLOCK_CLASSES,
+	ARG_ANYRAID_MAP,
 };
 
 static const char cmdname[] = "zdb";
@@ -742,9 +744,10 @@ usage(void)
 	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
 	    "\t%s -E [-A] word0:word1:...:word15\n"
 	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
-	    "<poolname>\n\n",
+	    "<poolname>\n"
+	    "\t%s --anyraid-map <poolname> [<vdev> ...]\n\n",
 	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
-	    cmdname, cmdname, cmdname, cmdname, cmdname);
+	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "    Dataset name must include at least one "
 	    "separator character '/' or '@'\n");
@@ -9311,7 +9314,8 @@ zdb_read_block(char *thing, spa_t *spa)
 
 		if ((zio_checksum_table[ck].ci_flags &
 		    ZCHECKSUM_FLAG_EMBEDDED) ||
-		    ck == ZIO_CHECKSUM_NOPARITY) {
+		    ck == ZIO_CHECKSUM_NOPARITY ||
+		    ck == ZIO_CHECKSUM_ANYRAID_MAP) {
 			continue;
 		}
 		BP_SET_CHECKSUM(bp, ck);
@@ -9432,10 +9436,482 @@ dummy_get_file_info(dmu_object_type_t bonustype, const void *data,
 	abort();
 }
 
+static int
+numlen(uint64_t v)
+{
+	char buf[32];
+	snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)v);
+	return (strlen(buf));
+}
+
+static void
+print_separator_line(int cols, int colwidth, boolean_t *print,
+    boolean_t *final)
+{
+	char buf[64];
+	ASSERT3U(colwidth * strlen("─"), <, sizeof (buf) - 2);
+	int len = 0, off = 0;
+	// Create a buffer with the cell separator to make later code simpler.
+	while (len < colwidth) {
+		len++;
+		int n = snprintf(buf + off, sizeof (buf) - off, "─");
+		ASSERT(n > 0 && n < sizeof (buf) - off);
+		off += n;
+	}
+
+	for (int i = 0; i < cols; i++) {
+		/*
+		 * Skip cells that we don't need to print. If the previous cell
+		 * also wasn't printed, add an extra space for the separator
+		 * column.
+		 */
+		if (!print[i]) {
+			int extra_width = 0;
+			if (i == 0 || !print[i - 1])
+				extra_width++;
+			(void) printf("%*s", colwidth + extra_width, "");
+			continue;
+		}
+
+		// Calculate the right shape for the corner of the cells.
+		const char *left_c, *right_c;
+		if (i == 0 || !print[i - 1]) {
+			left_c = (final[i] && (i == 0 || final[i - 1])) ?
+			    "└" : "├";
+		} else {
+			left_c = "";
+		}
+		if (i == cols - 1 || !print[i + 1]) {
+			right_c =
+			    (final[i] && (i == cols - 1 || final[i + 1])) ?
+			    "┘" : "┤";
+		} else {
+			right_c =
+			    (final[i] && (i == cols - 1 || final[i + 1])) ?
+			    "┴" : "┼";
+		}
+		(void) printf("%s%s%s", left_c, buf, right_c);
+	}
+	(void) printf("\n");
+}
+
+static void
+zdb_print_anyraid_tile_layout(vdev_t *vd)
+{
+	ASSERT3P(vd->vdev_ops, ==, &vdev_anyraid_ops);
+	vdev_anyraid_t *var = vd->vdev_tsd;
+	int cols = vd->vdev_children;
+	int textwidth = MAX(8, numlen(avl_numnodes(&var->vd_tile_map)) +
+	    (var->vd_nparity > 0 ? numlen(var->vd_nparity + 1) + 1 : 0));
+	int colwidth = textwidth + 2;
+
+	// Create and populate table with all the values we need to print.
+ char ***table = malloc(sizeof (*table) * cols); + for (int i = 0; i < cols; i++) { + table[i] = calloc(var->vd_children[i]->van_capacity + 1, + sizeof (**table)); + } + + anyraid_tile_t *cur = avl_first(&var->vd_tile_map); + while (cur) { + int p = 0; + for (anyraid_tile_node_t *node = list_head(&cur->at_list); + node; node = list_next(&cur->at_list, node)) { + ASSERT3U(p, <=, var->vd_nparity + 1); + char **next = + &(table[node->atn_disk][node->atn_offset]); + *next = malloc(textwidth + 1); + int len = snprintf(*next, textwidth, "%d", + cur->at_tile_id); + if (var->vd_nparity > 0) { + (void) snprintf((*next) + len, textwidth - len, + "-%d", p); + } + p++; + } + ASSERT3U(p, ==, var->vd_nparity + 1); + cur = AVL_NEXT(&var->vd_tile_map, cur); + } + + // These are needed to generate the separator lines + boolean_t *printed = malloc(sizeof (*printed) * cols); + boolean_t *final = malloc(sizeof (*final) * cols); + // Print the header row + for (int i = 0; i < cols; i++) { + if (i == 0) + (void) printf("│"); + (void) printf(" %*d ", textwidth, i); + (void) printf("│"); + printed[i] = B_TRUE; + final[i] = B_FALSE; + } + (void) printf("\n"); + print_separator_line(cols, colwidth, printed, final); + + // Print out the actual tile map, one row at a time. + for (int i = 0; ; i++) { + int last_printed = INT_MAX; + for (int v = 0; v < cols; v++) { + if (final[v]) { + ASSERT3U(i, >=, + var->vd_children[v]->van_capacity + 1); + int extra_width = 0; + if (v == 0 || !printed[v - 1]) + extra_width++; + (void) printf("%*s", + colwidth + extra_width, ""); + printed[v] = B_FALSE; + continue; + } + if (i + 1 == var->vd_children[v]->van_capacity + 1) + final[v] = B_TRUE; + if (v - 1 != last_printed) + (void) printf("│"); + char *value = table[v][i]; + (void) printf(" %*s │", textwidth, value ? value : + ""); + last_printed = v; + } + + if (last_printed == INT_MAX) + break; + (void) printf("\n"); + print_separator_line(cols, colwidth, printed, final); + } + (void) printf("\n"); + for (int i = 0; i < cols; i++) { + for (int j = 0; j < var->vd_children[i]->van_capacity + 1; j++) + if (table[i][j]) + free(table[i][j]); + free(table[i]); + } + free(table); +} + +static void +free_header(anyraid_header_t *header, uint64_t header_size) { + fnvlist_free(header->ah_nvl); + abd_return_buf(header->ah_abd, header->ah_buf, header_size); + abd_free(header->ah_abd); +} + +/* + * Print one of the anyraid maps from the given vdev child. This prints the + * mapping entries themselves, rather than the kernel's interpretation of them, + * which can be useful for debugging. + */ +static void +print_anyraid_mapping(vdev_t *vd, int child, int mapping, + anyraid_header_t *header) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + vdev_t *cvd = vd->vdev_child[child]; + uint64_t ashift = cvd->vdev_ashift; + spa_t *spa = vd->vdev_spa; + int error = 0; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + + uint64_t header_offset = VDEV_LABEL_START_SIZE + + mapping * VDEV_ANYRAID_SINGLE_MAP_SIZE(ashift); + uint64_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(ashift); + uint64_t map_offset = header_offset + header_size; + + nvlist_t *hnvl = header->ah_nvl; + // Look up and print map metadata. 
+	uint16_t version;
+	if (nvlist_lookup_uint16(hnvl, VDEV_ANYRAID_HEADER_VERSION,
+	    &version) != 0) {
+		(void) printf("No version\n");
+		free_header(header, header_size);
+		return;
+	}
+
+	uint64_t tile_size;
+	if (nvlist_lookup_uint64(hnvl, VDEV_ANYRAID_HEADER_TILE_SIZE,
+	    &tile_size) != 0) {
+		(void) printf("No tile size\n");
+		free_header(header, header_size);
+		return;
+	}
+
+	uint32_t map_length;
+	if (nvlist_lookup_uint32(hnvl, VDEV_ANYRAID_HEADER_LENGTH,
+	    &map_length) != 0) {
+		(void) printf("No map length\n");
+		free_header(header, header_size);
+		return;
+	}
+
+	uint64_t written_txg = 0;
+	if (nvlist_lookup_uint64(hnvl, VDEV_ANYRAID_HEADER_TXG,
+	    &written_txg) != 0)
+		(void) printf("No valid TXG\n");
+
+	uint8_t disk_id = 0;
+	if (nvlist_lookup_uint8(hnvl, VDEV_ANYRAID_HEADER_DISK,
+	    &disk_id) != 0)
+		(void) printf("No valid disk ID\n");
+
+	(void) printf("version: %6d\ttile size: %#8llx\ttxg: %llu\n",
+	    version, (u_longlong_t)tile_size, (u_longlong_t)written_txg);
+	(void) printf("map length: %6u\tdisk id: %3u\n", map_length, disk_id);
+
+	// Read in and print the actual mapping data
+	zio_t *rio = zio_root(spa, NULL, NULL, flags);
+	abd_t *map_abds[VDEV_ANYRAID_MAP_COPIES] = {0};
+	int i;
+	for (i = 0; i <= (map_length / SPA_MAXBLOCKSIZE); i++) {
+		zio_eck_t *cksum = (zio_eck_t *)
+		    &header->ah_buf[VDEV_ANYRAID_NVL_BYTES(ashift) +
+		    i * sizeof (*cksum)];
+		map_abds[i] = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_TRUE);
+		zio_nowait(zio_read_phys(rio, cvd, map_offset +
+		    i * SPA_MAXBLOCKSIZE, SPA_MAXBLOCKSIZE, map_abds[i],
+		    ZIO_CHECKSUM_ANYRAID_MAP, NULL, cksum,
+		    ZIO_PRIORITY_SYNC_READ, flags, B_FALSE));
+	}
+	i--;
+	if ((error = zio_wait(rio))) {
+		(void) printf("Could not read map: %s\n", strerror(error));
+		for (; i >= 0; i--)
+			abd_free(map_abds[i]);
+		free_header(header, header_size);
+		return;
+	}
+	free_header(header, header_size);
+
+	uint32_t map = -1, cur_tile = 0;
+	/*
+	 * For now, all entries are the size of a uint32_t. If that
+	 * ever changes, we need better logic here.
+ */ + uint32_t size = sizeof (uint32_t); + uint8_t *map_buf = NULL; + uint8_t par_cnt = 0; + for (uint32_t off = 0; off < map_length; off += size) { + int next_map = off / SPA_MAXBLOCKSIZE; + if (map != next_map) { + // switch maps + if (map != -1) { + abd_return_buf(map_abds[map], map_buf, + SPA_MAXBLOCKSIZE); + } + map_buf = abd_borrow_buf(map_abds[next_map], + SPA_MAXBLOCKSIZE); + map = next_map; + } + uint32_t mo = off % SPA_MAXBLOCKSIZE; + anyraid_map_entry_t *entry = + (anyraid_map_entry_t *)(map_buf + mo); + uint8_t type = ame_get_type(entry); + uint8_t *buf; + boolean_t allocated = B_FALSE; + if (size > SPA_MAXBLOCKSIZE - mo) { + buf = kmem_alloc(size, KM_SLEEP); + uint8_t rem = SPA_MAXBLOCKSIZE - mo; + allocated = B_TRUE; + memcpy(buf, map_buf + mo, rem); + // switch maps + if (map != -1) { + abd_return_buf(map_abds[map], map_buf, + SPA_MAXBLOCKSIZE); + } + map_buf = abd_borrow_buf(map_abds[next_map], + SPA_MAXBLOCKSIZE); + map = next_map; + memcpy(buf + rem, map_buf, size - rem); + } else { + buf = map_buf + mo; + } + entry = (anyraid_map_entry_t *)buf; + switch (type) { + case AMET_SKIP: { + anyraid_map_skip_entry_t *amse = + &entry->ame_u.ame_amse; + ASSERT0(par_cnt); + cur_tile += amse_get_skip_count(amse); + (void) printf("skip %u\n", + amse_get_skip_count(amse)); + break; + } + case AMET_LOC: { + anyraid_map_loc_entry_t *amle = + &entry->ame_u.ame_amle; + if (par_cnt == 0) { + (void) printf("loc %u:", cur_tile); + cur_tile++; + } + (void) printf("\td%u o%u,", amle_get_disk(amle), + amle_get_offset(amle)); + par_cnt = (par_cnt + 1) % (var->vd_nparity + 1); + if (par_cnt == 0) + (void) printf("\n"); + break; + } + default: + (void) printf("Invalid entry type %d, " + "aborting\n", type); + break; + } + if (allocated) + kmem_free(buf, size); + } + if (map_buf) + abd_return_buf(map_abds[map], map_buf, SPA_MAXBLOCKSIZE); + + var->vd_tile_size = tile_size; + + for (; i >= 0; i--) + abd_free(map_abds[i]); + + return; + +} + +/* + * Print the anyraid maps on disk. With verbosity == 2, we use the normal + * mapping-selection logic that we use during import; with higher verbosity, we + * print them all. 
+ */ +static void +zdb_print_anyraid_ondisk_maps(vdev_t *vd, int verbosity) +{ + int child = 0; + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + if (verbosity == 2) { + anyraid_header_t header; + int mapping; + uint64_t txg; + int error = vdev_anyraid_pick_best_mapping( + vd->vdev_child[child], &txg, &header, &mapping); + if (error != 0) { + (void) printf("Could not print mapping: %s\n", + strerror(error)); + spa_config_exit(spa, SCL_ZIO, FTAG); + return; + } + (void) printf("anyraid map %d:\n", mapping); + print_anyraid_mapping(vd, child, mapping, &header); + } else if (verbosity == 3) { + for (int i = 0; i < VDEV_ANYRAID_MAP_COPIES; i++) { + (void) printf("anyraid map %d:\n", i); + anyraid_header_t header; + int error = vdev_anyraid_open_header( + vd->vdev_child[child], i, &header); + if (error != 0) { + (void) printf("Could not print mapping: %s\n", + strerror(error)); + spa_config_exit(spa, SCL_ZIO, FTAG); + return; + } + print_anyraid_mapping(vd, child, i, &header); + } + } else { + for (; child < vd->vdev_children; child++) { + for (int i = 0; i < VDEV_ANYRAID_MAP_COPIES; i++) { + (void) printf("anyraid map %d %d:\n", child, i); + anyraid_header_t header; + int error = vdev_anyraid_open_header( + vd->vdev_child[child], i, &header); + if (error != 0) { + (void) printf("Could not print " + "mapping: %s\n", strerror(error)); + continue; + } + print_anyraid_mapping(vd, child, i, &header); + } + } + + } + spa_config_exit(spa, SCL_ZIO, FTAG); +} + +/* + * Print the loaded version of the map for the provided anyraid vdev. + */ +static void +zdb_dump_anyraid_map_vdev(vdev_t *vd, int verbosity) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_anyraid_ops); + vdev_anyraid_t *var = vd->vdev_tsd; + + (void) printf("\t%-5s%11llu %s %#16llx\n", + "vdev", (u_longlong_t)vd->vdev_id, + "tile_size", (u_longlong_t)var->vd_tile_size); + (void) printf("\t%-8s%8llu", "tiles", + (u_longlong_t)avl_numnodes(&var->vd_tile_map)); + if (var->vd_checkpoint_tile != UINT32_MAX) { + (void) printf(". %-12s %10u\n", "checkpoint tile", + var->vd_checkpoint_tile); + } else { + (void) printf("\n"); + } + + (void) printf("\t%16s %12s %13s\n", "----------------", + "------------", "-------------"); + + anyraid_tile_t *cur = avl_first(&var->vd_tile_map); + anyraid_tile_node_t *curn = cur != NULL ? + list_head(&cur->at_list) : NULL; + while (cur) { + (void) printf("\t%-8s%8llu %-8s%04llx %-11s%02llx\n", + "tile", (u_longlong_t)cur->at_tile_id, + "offset", (u_longlong_t)curn->atn_offset, + "disk", (u_longlong_t)curn->atn_disk); + curn = list_next(&cur->at_list, curn); + if (curn == NULL) { + cur = AVL_NEXT(&var->vd_tile_map, cur); + curn = cur != NULL ? list_head(&cur->at_list) : NULL; + } + } + + (void) printf("\n"); + if (verbosity > 0) + zdb_print_anyraid_tile_layout(vd); + + if (verbosity > 1) + zdb_print_anyraid_ondisk_maps(vd, verbosity); +} + +static int +zdb_dump_anyraid_map(char *vdev_str, spa_t *spa, int verbosity) +{ + vdev_t *rvd, *vd; + + /* A specific vdev. */ + if (vdev_str != NULL) { + vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev_str); + if (vd == NULL) { + (void) printf("Invalid vdev: %s\n", vdev_str); + return (EINVAL); + } + if (vd->vdev_ops != &vdev_anyraid_ops && + (vd->vdev_parent == NULL || + (vd = vd->vdev_parent)->vdev_ops != &vdev_anyraid_ops)) { + (void) printf("Not an anyraid vdev: %s\n", vdev_str); + return (EINVAL); + } + + (void) printf("\nAnyRAID tiles:\n"); + zdb_dump_anyraid_map_vdev(vd, verbosity); + return (0); + } + + (void) printf("\nAnyRAID tiles:\n"); + /* All anyraid vdevs. 
*/ + rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vd = rvd->vdev_child[c]; + if (vd->vdev_ops == &vdev_anyraid_ops) + zdb_dump_anyraid_map_vdev(vd, verbosity); + } + return (0); +} + int main(int argc, char **argv) { - int c; + int c, long_index; + boolean_t opt_anyraid_map = B_FALSE; int dump_all = 1; int verbose = 0; int error = 0; @@ -9539,12 +10015,14 @@ main(int argc, char **argv) ARG_BLOCK_BIN_MODE}, {"class", required_argument, NULL, ARG_BLOCK_CLASSES}, + {"anyraid-map", no_argument, NULL, + ARG_ANYRAID_MAP}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ", - long_options, NULL)) != -1) { + long_options, &long_index)) != -1) { switch (c) { case 'b': case 'B': @@ -9705,6 +10183,10 @@ main(int argc, char **argv) free(buf); break; } + case ARG_ANYRAID_MAP: + opt_anyraid_map = B_TRUE; + dump_all = 0; + break; default: usage(); break; @@ -10115,6 +10597,16 @@ main(int argc, char **argv) argc--; if (dump_opt['r']) { error = zdb_copy_object(os, object, argv[1]); + } else if (opt_anyraid_map) { + if (argc == 0) + error = zdb_dump_anyraid_map(NULL, spa, verbose); + else + for (int i = 0; i < argc; i++) { + error = zdb_dump_anyraid_map(argv[i], spa, + verbose); + if (error != 0) + break; + } } else if (!dump_opt['R']) { flagbits['d'] = ZOR_FLAG_DIRECTORY; flagbits['f'] = ZOR_FLAG_PLAIN_FILE; diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 222b5524669e..6495d7592691 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -78,6 +78,7 @@ #include "zpool_util.h" #include #include +#include /* * For any given vdev specification, we can have multiple errors. The @@ -431,7 +432,8 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b, { if ((strcmp(a->zprl_type, "raidz") == 0 || strcmp(a->zprl_type, "draid") == 0) && - strcmp(b->zprl_type, "mirror") == 0) { + (strcmp(b->zprl_type, "mirror") == 0 || + strcmp(b->zprl_type, "anymirror") == 0)) { *raidz = a; *mirror = b; return (B_TRUE); @@ -527,11 +529,11 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_children = 0; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || - strcmp(type, VDEV_TYPE_DRAID) == 0) { + strcmp(type, VDEV_TYPE_DRAID) == 0 || + strcmp(type, VDEV_TYPE_ANYRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); - assert(rep.zprl_parity != 0); } else { rep.zprl_parity = 0; } @@ -541,6 +543,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) * already reported an error for this spec, so don't * bother doing it again. 
*/ + const char *orig_type = type; type = NULL; dontreport = 0; vdev_size = -1LL; @@ -646,7 +649,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) if (!dontreport && (vdev_size != -1LL && (llabs(size - vdev_size) > - ZPOOL_FUZZ))) { + ZPOOL_FUZZ)) && strcmp(orig_type, + VDEV_TYPE_ANYRAID) != 0) { if (ret != NULL) free(ret); ret = NULL; @@ -726,19 +730,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) else return (NULL); } - } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != - 0) { - if (ret != NULL) - free(ret); - ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication level: " - "both %s and %s vdevs are " - "present\n"), - lastrep.zprl_type, rep.zprl_type); - else - return (NULL); } else if (lastrep.zprl_parity != rep.zprl_parity) { if (ret) free(ret); @@ -754,7 +745,9 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type); else return (NULL); - } else if (lastrep.zprl_children != rep.zprl_children) { + } else if (lastrep.zprl_children != + rep.zprl_children && strcmp(rep.zprl_type, + VDEV_TYPE_ANYRAID) != 0) { if (ret) free(ret); ret = NULL; @@ -1200,7 +1193,7 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, } /* - * Returns the parity level extracted from a raidz or draid type. + * Returns the parity level extracted from a raidz, anyraid, or draid type. * If the parity cannot be determined zero is returned. */ static int @@ -1228,6 +1221,22 @@ get_parity(const char *type) return (0); } } + } else if (strncmp(type, VDEV_TYPE_ANYRAID, + strlen(VDEV_TYPE_ANYRAID)) == 0) { + p = type + strlen(VDEV_TYPE_ANYRAID); + + if (*p == '\0') { + /* when unspecified default to 1-parity mirror */ + return (1); + } else { + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || *end != '\0' || + parity < 0 || parity > VDEV_ANYRAID_MAXPARITY) { + return (-1); + } + } } else if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { p = type + strlen(VDEV_TYPE_DRAID); @@ -1285,6 +1294,17 @@ is_grouping(const char *type, int *mindev, int *maxdev) if (maxdev != NULL) *maxdev = INT_MAX; + if (strncmp(type, VDEV_TYPE_ANYRAID, strlen(VDEV_TYPE_ANYRAID)) == 0) { + nparity = get_parity(type); + if (nparity < 0) + return (NULL); + if (mindev != NULL) + *mindev = nparity + 1; + if (maxdev != NULL) + *maxdev = 255; + return (VDEV_TYPE_ANYRAID); + } + if (strcmp(type, "mirror") == 0) { if (mindev != NULL) *mindev = 2; @@ -1319,6 +1339,22 @@ is_grouping(const char *type, int *mindev, int *maxdev) return (NULL); } +static int +anyraid_config_by_type(nvlist_t *nv, const char *type) +{ + uint64_t nparity = 0; + + if (strncmp(type, VDEV_TYPE_ANYRAID, strlen(VDEV_TYPE_ANYRAID)) != 0) + return (EINVAL); + + nparity = (uint64_t)get_parity(type); + + fnvlist_add_uint8(nv, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, VAP_MIRROR); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); + + return (0); +} + /* * Extract the configuration parameters encoded in the dRAID type and * use them to generate a dRAID configuration. The expected format is: @@ -1527,9 +1563,9 @@ construct_spec(nvlist_t *props, int argc, char **argv) nv = NULL; /* - * If it's a mirror, raidz, or draid the subsequent arguments - * are its leaves -- until we encounter the next mirror, - * raidz or draid. + * If it's a mirror, raidz, anyraid, or draid the subsequent + * arguments are its leaves -- until we encounter the next + * mirror, raidz, anyraid, or draid. 
*/ if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; @@ -1596,7 +1632,12 @@ construct_spec(nvlist_t *props, int argc, char **argv) } if (is_log) { - if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + /* + * TODO: only AnyRAID mirror is expected to be + * allowed. + */ + if (strcmp(type, VDEV_TYPE_MIRROR) != 0 && + strcmp(type, VDEV_TYPE_ANYRAID) != 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: unsupported 'log' " @@ -1690,6 +1731,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } + if (strcmp(type, VDEV_TYPE_ANYRAID) == 0) { + if (anyraid_config_by_type(nv, fulltype) + != 0) { + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + } if (strcmp(type, VDEV_TYPE_DRAID) == 0) { if (draid_config_by_type(nv, fulltype, children) != 0) { diff --git a/cmd/ztest.c b/cmd/ztest.c index 89b1f68606ea..857ccadf2aa3 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -106,6 +106,7 @@ #include #include #include +#include #include #include #include @@ -188,7 +189,7 @@ typedef struct ztest_shared_opts { int zo_raid_do_expand; int zo_raid_children; int zo_raid_parity; - char zo_raid_type[8]; + char zo_raid_type[16]; int zo_draid_data; int zo_draid_spares; int zo_datasets; @@ -278,6 +279,7 @@ extern uint64_t raidz_expand_max_reflow_bytes; extern uint_t raidz_expand_pause_point; extern boolean_t ddt_prune_artificial_age; extern boolean_t ddt_dump_prune_histogram; +extern uint64_t zfs_anyraid_min_tile_size; static ztest_shared_opts_t *ztest_shared_opts; @@ -673,10 +675,12 @@ fatal(int do_perror, const char *message, ...) fatal_msg = buf; /* to ease debugging */ out: - if (ztest_dump_core) + if (ztest_dump_core) { abort(); - else + } else { + // NOTE: Not safe if we've called kernel_fini already dump_debug_buffer(); + } exit(3); } @@ -769,7 +773,7 @@ static ztest_option_t option_table[] = { DEFAULT_RAID_CHILDREN, NULL}, { 'R', "raid-parity", "INTEGER", "Raid parity", DEFAULT_RAID_PARITY, NULL}, - { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", + { 'K', "raid-kind", "raidz|eraidz|draid|anymirror|random", "Raid kind", NO_DEFAULT, "random"}, { 'D', "draid-data", "INTEGER", "Number of draid data drives", DEFAULT_DRAID_DATA, NULL}, @@ -1119,7 +1123,7 @@ process_options(int argc, char **argv) } if (strcmp(raid_kind, "random") == 0) { - switch (ztest_random(3)) { + switch (ztest_random(4)) { case 0: raid_kind = "raidz"; break; @@ -1129,6 +1133,9 @@ process_options(int argc, char **argv) case 2: raid_kind = "draid"; break; + case 3: + raid_kind = "anymirror"; + break; } if (ztest_opts.zo_verbose >= 3) @@ -1180,11 +1187,25 @@ process_options(int argc, char **argv) zo->zo_raid_parity = MIN(zo->zo_raid_parity, zo->zo_raid_children - 1); - } else /* using raidz */ { - ASSERT0(strcmp(raid_kind, "raidz")); + } else if (strcmp(raid_kind, "raidz") == 0) { + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } else if (strcmp(raid_kind, "anymirror") == 0) { + uint64_t min_devsize; + + /* With fewer disks use 1G, otherwise 512M is OK */ + min_devsize = (ztest_opts.zo_raid_children < 16) ? 
+ (1ULL << 30) : (512ULL << 20); + if (zo->zo_vdev_size < min_devsize) + zo->zo_vdev_size = min_devsize; zo->zo_raid_parity = MIN(zo->zo_raid_parity, zo->zo_raid_children - 1); + + (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_ANYRAID, + sizeof (zo->zo_raid_type)); + } else { + fatal(B_FALSE, "invalid raid kind %s", raid_kind); } zo->zo_vdevtime = @@ -1375,6 +1396,9 @@ make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size, fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + } else if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_ANYRAID) == 0) { + fnvlist_add_uint8(raid, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, + VAP_MIRROR); } for (c = 0; c < r; c++) @@ -3166,7 +3190,8 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) return; /* dRAID added after feature flags, skip upgrade test. */ - if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0 || + strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_ANYRAID) == 0) return; mutex_enter(&ztest_vdev_lock); @@ -3790,28 +3815,44 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (ztest_opts.zo_raid_children > 1) { if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); + else if (strcmp(oldvd->vdev_ops->vdev_op_type, "anymirror") == + 0) + ASSERT3P(oldvd->vdev_ops, ==, &vdev_anyraid_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); oldvd = oldvd->vdev_child[leaf % raidz_children]; } + if (!replacing && oldvd->vdev_parent->vdev_ops == &vdev_anyraid_ops) { + oldvd = oldvd->vdev_parent; + } + /* * If we're already doing an attach or replace, oldvd may be a - * mirror vdev -- in which case, pick a random child. + * mirror vdev -- in which case, pick a random child. For anyraid vdevs, + * attachment occurs at the parent level. */ - while (oldvd->vdev_children != 0) { + while (oldvd->vdev_children != 0 && oldvd->vdev_ops != + &vdev_anyraid_ops) { oldvd_has_siblings = B_TRUE; ASSERT3U(oldvd->vdev_children, >=, 2); oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } oldguid = oldvd->vdev_guid; - oldsize = vdev_get_min_asize(oldvd); + oldsize = vdev_get_min_attach_size(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; oldvd_is_special = oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; - (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); + if (oldvd->vdev_path == NULL) { + ASSERT3P(oldvd->vdev_ops, ==, &vdev_anyraid_ops); + snprintf(oldpath, MAXPATHLEN, "%s-%llu", + oldvd->vdev_ops->vdev_op_type, + (u_longlong_t)oldvd->vdev_id); + } else { + (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); + } pvd = oldvd->vdev_parent; pguid = pvd->vdev_guid; @@ -3820,7 +3861,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * to the detach the pool is scrubbed in order to prevent creating * unrepairable blocks as a result of the data corruption injection. */ - if (oldvd_has_siblings && ztest_random(2) == 0) { + if (oldvd_has_siblings && oldvd->vdev_ops != &vdev_anyraid_ops && + ztest_random(2) == 0) { spa_config_exit(spa, SCL_ALL, FTAG); error = ztest_scrub_impl(spa); @@ -3884,7 +3926,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * If newvd is a distributed spare and it's being attached to a * dRAID which is not its parent it should fail with ENOTSUP. 
*/ - if (pvd->vdev_ops != &vdev_mirror_ops && + if (oldvd->vdev_ops == &vdev_anyraid_ops) + expected_error = 0; + else if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || pvd->vdev_ops == &vdev_replacing_ops || pvd->vdev_ops == &vdev_spare_ops)) @@ -3896,7 +3940,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) expected_error = replacing ? 0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; - else if (!newvd_is_dspare && newsize < oldsize) + else if (newsize < oldsize && !(newvd_is_dspare || + (pvd->vdev_ops == &vdev_anyraid_ops && + newsize < pvd->vdev_ops->vdev_op_min_asize(pvd, oldvd)))) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; @@ -3917,8 +3963,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * When supported select either a healing or sequential resilver. */ boolean_t rebuilding = B_FALSE; - if (pvd->vdev_ops == &vdev_mirror_ops || - pvd->vdev_ops == &vdev_root_ops) { + if (oldvd->vdev_ops != &vdev_anyraid_ops && + (pvd->vdev_ops == &vdev_mirror_ops || + pvd->vdev_ops == &vdev_root_ops)) { rebuilding = !!ztest_random(2); } @@ -8999,6 +9046,9 @@ main(int argc, char **argv) metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; + zfs_anyraid_min_tile_size = MIN(zfs_anyraid_min_tile_size, + ztest_opts.zo_vdev_size / 8); + if (zs->zs_do_init) ztest_run_init(); else diff --git a/include/Makefile.am b/include/Makefile.am index 7588cd0aedc9..8b74413ced77 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -100,6 +100,7 @@ COMMON_H = \ sys/unique.h \ sys/uuid.h \ sys/vdev.h \ + sys/vdev_anyraid.h \ sys/vdev_disk.h \ sys/vdev_draid.h \ sys/vdev_file.h \ @@ -107,6 +108,7 @@ COMMON_H = \ sys/vdev_indirect_births.h \ sys/vdev_indirect_mapping.h \ sys/vdev_initialize.h \ + sys/vdev_mirror.h \ sys/vdev_raidz.h \ sys/vdev_raidz_impl.h \ sys/vdev_rebuild.h \ diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h index e49ada399694..ac320869cdc2 100644 --- a/include/os/linux/kernel/linux/mod_compat.h +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -38,6 +38,7 @@ typedef const struct kernel_param zfs_kernel_param_t; enum scope_prefix_types { zfs, + zfs_anyraid, zfs_arc, zfs_brt, zfs_condense, diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 830c8455bb1a..08f8cac2e470 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -388,6 +388,9 @@ typedef enum { VDEV_PROP_SIT_OUT, VDEV_PROP_AUTOSIT, VDEV_PROP_SLOW_IO_EVENTS, + VDEV_PROP_ANYRAID_CAP_TILES, + VDEV_PROP_ANYRAID_NUM_TILES, + VDEV_PROP_ANYRAID_TILE_SIZE, VDEV_NUM_PROPS } vdev_prop_t; @@ -907,10 +910,14 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" #define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" +/* ANYRAID configuration */ +#define ZPOOL_CONFIG_ANYRAID_PARITY_TYPE "anyraid_parity_type" + #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_ANYRAID "anymirror" #define VDEV_TYPE_DRAID "draid" #define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" @@ -922,6 +929,8 @@ typedef struct zpool_load_policy { #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" +#define VDEV_ANYRAID_MAXPARITY 3 + #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_DRAID_MAXPARITY 3 diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 
6ce995d0a086..9b30e4721df6 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -82,6 +82,8 @@ typedef enum trace_alloc_type { (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ METASLAB_WEIGHT_CLAIM) +#define METASLAB_MAX_WEIGHT (METASLAB_WEIGHT_TYPE - 1) + /* * The metaslab weight is used to encode the amount of free space in a * metaslab, such that the "best" metaslab appears first when sorting the diff --git a/include/sys/spa.h b/include/sys/spa.h index 2a4cc60c4aa8..691a3287deb7 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1083,9 +1083,12 @@ extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); +extern uint64_t spa_load_max_txg(spa_t *spa); +extern uint64_t spa_current_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); +extern uint64_t spa_load_txg(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max); @@ -1160,7 +1163,9 @@ extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern boolean_t spa_has_checkpoint(spa_t *spa); +extern uint64_t spa_checkpoint_txg(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); +extern boolean_t spa_importing_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); extern uint64_t spa_min_claim_txg(spa_t *spa); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 510474d6c085..c4a4388bfccb 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -191,9 +191,17 @@ extern uint64_t vdev_queue_last_offset(vdev_t *vd); extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); extern boolean_t vdev_queue_pool_busy(spa_t *spa); +typedef enum vdev_config_sync_status { + VDEV_CONFIG_KEEP_CHECKPOINT, + VDEV_CONFIG_CREATING_CHECKPOINT, + VDEV_CONFIG_NO_CHECKPOINT, + VDEV_CONFIG_REWINDING_CHECKPOINT +} vdev_config_sync_status_t; + extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); +extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, + vdev_config_sync_status_t status); extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); diff --git a/include/sys/vdev_anyraid.h b/include/sys/vdev_anyraid.h new file mode 100644 index 000000000000..ebe715d46830 --- /dev/null +++ b/include/sys/vdev_anyraid.h @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2025, Klara Inc.
+ */
+
+#ifndef _SYS_VDEV_ANYRAID_H
+#define _SYS_VDEV_ANYRAID_H
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+typedef enum vdev_anyraid_parity_type {
+	VAP_MIRROR, // includes raid0, i.e. a 0-parity mirror
+	VAP_TYPES,
+} vdev_anyraid_parity_type_t;
+
+typedef struct vdev_anyraid_node {
+	avl_node_t van_node;
+	uint8_t van_id;
+	uint16_t van_next_offset;
+	// Note: store capacity - 1 for rollover reasons
+	uint16_t van_capacity;
+} vdev_anyraid_node_t;
+
+typedef struct vdev_anyraid {
+	vdev_anyraid_parity_type_t vd_parity_type;
+	/*
+	 * The parity of the anymirror vdev; 0 for raid0, or the number of
+	 * mirrors.
+	 */
+	uint_t vd_nparity;
+	uint64_t vd_tile_size;
+
+	krwlock_t vd_lock;
+	avl_tree_t vd_tile_map;
+	avl_tree_t vd_children_tree;
+	uint32_t vd_checkpoint_tile;
+	vdev_anyraid_node_t **vd_children;
+} vdev_anyraid_t;
+
+typedef struct anyraid_tile_node {
+	list_node_t atn_node;
+	uint8_t atn_disk;
+	uint16_t atn_offset;
+} anyraid_tile_node_t;
+
+typedef struct anyraid_tile {
+	avl_node_t at_node;
+	uint32_t at_tile_id;
+	list_t at_list;
+} anyraid_tile_t;
+
+/*
+ * The on-disk structure of the anyraid tile map is VDEV_ANYRAID_MAP_COPIES
+ * copies of the following layout. We store the tile map on every disk, and
+ * each TXG we update a different copy (txg % VDEV_ANYRAID_MAP_COPIES).
+ *
+ * First, we start with a MAX(8KiB, 1 << ashift) tile that stores a packed
+ * nvlist containing the header. The header contains a version number, a disk
+ * ID, a TXG, the tile size (in bytes), the stripe width/parity of the
+ * tiles, the length of the mapping (in bytes), the pool guid, and the
+ * checksum of the mapping. This header block has an embedded checksum that
+ * uses the normal ZIO_CHECKSUM_LABEL algorithm.
+ *
+ * Then, there is a tile of size VDEV_ANYRAID_MAP_SIZE. This stores the actual
+ * mapping. It is a series of entries. Right now, there are two entry types:
+ *
+ * 0: Skip entries represent a gap in logical tile IDs. From the current
+ * tile ID, add the value stored in the upper 24 bits of the skip entry.
+ *
+ * 1: Location entries represent a mapped tile. Each one represents a single
+ * physical tile backing the current logical tile. There can be multiple
+ * physical tiles for one logical tile; that number is the stripe width/
+ * parity from the header. These entries contain an 8-bit disk ID and a
+ * 16-bit offset on that disk.
+ *
+ * Here is an example of what the mapping looks like on disk. This is for a
+ * 1-parity mirror anyraid device:
+ *
+ * +----------+----------+----------+----------+----------+----------+
+ * |  Tile 0  |  Tile 0  |  Tile 1  |  Tile 1  |  Tile 2  |  Tile 2  |
+ * | Parity 0 | Parity 1 | Parity 0 | Parity 1 | Parity 0 | Parity 1 |
+ * |  Disk 0  |  Disk 1  |  Disk 0  |  Disk 2  |  Disk 0  |  Disk 1  |
+ * | Offset 0 | Offset 0 | Offset 1 | Offset 0 | Offset 2 | Offset 1 |
+ * +----------+----------+----------+----------+----------+----------+
+ *
+ * Note that each of these entries actually only contains the "disk" and
+ * "offset" fields on-disk; the "tile" and "parity" information is derived from
+ * context (since the entries are stored in tile/offset order, with no gaps
+ * unless a skip entry is present).
+ *
+ * New entry types will be added eventually to store information like parity
+ * changes.
+ *
+ * Because the mapping can be larger than SPA_MAXBLOCKSIZE, it has to be
+ * written in multiple IOs; each IO-sized region has its own checksum, which
+ * is stored in the header block (using the ZIO_CHECKSUM_ANYRAID_MAP
+ * algorithm).
+ */
+
+/*
+ * ==========================================================================
+ * Header-related definitions
+ * ==========================================================================
+ */
+#define	VDEV_ANYRAID_HEADER_VERSION	"version"
+#define	VDEV_ANYRAID_HEADER_DISK	"disk"
+#define	VDEV_ANYRAID_HEADER_TXG		"txg"
+#define	VDEV_ANYRAID_HEADER_TILE_SIZE	"tile_size"
+#define	VDEV_ANYRAID_HEADER_LENGTH	"length"
+#define	VDEV_ANYRAID_HEADER_CHECKPOINT	"checkpoint_txg"
+#define	VDEV_ANYRAID_HEADER_DISK_SIZES	"sizes"
+/*
+ * We store the pool guid to prevent disks being reused from an old pool from
+ * causing any issues.
+ */
+#define	VDEV_ANYRAID_HEADER_GUID	"guid"
+
+#define	VDEV_ANYRAID_MAP_HEADER_SIZE(ashift)	MAX(8 * 1024, 1ULL << (ashift))
+
+#define	VDEV_ANYRAID_NVL_BYTES(ashift) \
+	(VDEV_ANYRAID_MAP_HEADER_SIZE(ashift) - \
+	(VDEV_ANYRAID_MAP_COPIES + 1) * sizeof (zio_eck_t))
+
+/*
+ * ==========================================================================
+ * Mapping-related definitions
+ * ==========================================================================
+ */
+typedef enum anyraid_map_entry_type {
+	AMET_SKIP = 0,
+	AMET_LOC = 1,
+	AMET_TYPES
+} anyraid_map_entry_type_t;
+
+#define	AME_TYPE_BITS	8
+
+/*
+ * ==========================================================================
+ * Skip entry definitions and functions
+ * ==========================================================================
+ */
+typedef uint32_t anyraid_map_skip_entry_t;
+
+#define	AMSE_TILE_BITS	24
+
+static inline void
+amse_set_type(anyraid_map_skip_entry_t *amse)
+{
+	BF32_SET(*amse, 0, AME_TYPE_BITS, AMET_SKIP);
+}
+
+static inline void
+amse_set_skip_count(anyraid_map_skip_entry_t *amse, uint32_t skip_count)
+{
+	BF32_SET(*amse, AME_TYPE_BITS, AMSE_TILE_BITS, skip_count);
+}
+
+static inline uint32_t
+amse_get_skip_count(anyraid_map_skip_entry_t *amse)
+{
+	return (BF32_GET(*amse, AME_TYPE_BITS, AMSE_TILE_BITS));
+}
+
+/*
+ * ==========================================================================
+ * Location entry definitions and functions
+ * ==========================================================================
+ */
+typedef uint32_t anyraid_map_loc_entry_t;
+
+#define	AMLE_DISK_BITS		8
+#define	AMLE_OFFSET_BITS	16
+
+static inline void
+amle_set_type(anyraid_map_loc_entry_t *amle)
+{
+	BF32_SET(*amle, 0, AME_TYPE_BITS, AMET_LOC);
+}
+
+static inline void
+amle_set_disk(anyraid_map_loc_entry_t *amle, uint8_t disk)
+{
+	BF32_SET(*amle, AME_TYPE_BITS, AMLE_DISK_BITS, disk);
+}
+
+static inline uint32_t
+amle_get_disk(anyraid_map_loc_entry_t *amle)
+{
+	return (BF32_GET(*amle, AME_TYPE_BITS, AMLE_DISK_BITS));
+}
+
+static inline void
+amle_set_offset(anyraid_map_loc_entry_t *amle, uint16_t offset)
+{
+	BF32_SET(*amle, (AME_TYPE_BITS + AMLE_DISK_BITS), AMLE_OFFSET_BITS,
+	    offset);
+}
+
+static inline uint32_t
+amle_get_offset(anyraid_map_loc_entry_t *amle)
+{
+	return (BF32_GET(*amle, (AME_TYPE_BITS + AMLE_DISK_BITS),
+	    AMLE_OFFSET_BITS));
+}
+
+/*
+ * ==========================================================================
+ * Overall mapping definitions
+ * ==========================================================================
+ */
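As a quick sanity check on the bit layout above: location entries put AMET_LOC in the low 8 type bits, the disk ID in the next 8, and the offset in the high 16, while skip entries carry a 24-bit count above the type bits. A minimal standalone sketch (illustration only, not part of the patch; plain shifts stand in for the kernel's BF32_* helpers) that encodes the first two location entries from the example table plus a 3-tile skip:

/*
 * Sketch: encode tile 0's two mirror copies (disk 0/offset 0 and
 * disk 1/offset 0) and a skip of 3 unmapped logical tiles, then decode
 * one entry back. Uses plain shifts instead of BF32_SET/BF32_GET.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t e0 = 1u | (0u << 8) | (0u << 16);	/* loc: disk 0, off 0 */
	uint32_t e1 = 1u | (1u << 8) | (0u << 16);	/* loc: disk 1, off 0 */
	uint32_t skip = 0u | (3u << 8);			/* skip 3 tiles */

	printf("e0=%#010x e1=%#010x skip=%#010x\n", e0, e1, skip);
	printf("e1: type=%u disk=%u offset=%u\n",
	    e1 & 0xffu, (e1 >> 8) & 0xffu, e1 >> 16);
	/* e0=0x00000001 e1=0x00000101 skip=0x00000300 */
	/* e1: type=1 disk=1 offset=0 */
	return (0);
}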
+
+typedef struct anyraid_map_entry {
+	union {
+		anyraid_map_skip_entry_t ame_amse;
+		anyraid_map_loc_entry_t ame_amle;
+	} ame_u;
+} anyraid_map_entry_t;
+
+static inline anyraid_map_entry_type_t
+ame_get_type(anyraid_map_entry_t *ame)
+{
+	return (BF32_GET(ame->ame_u.ame_amle, 0, AME_TYPE_BITS));
+}
+
+#define	VDEV_ANYRAID_MAX_DISKS	(1 << 8)
+#define	VDEV_ANYRAID_MAX_TPD	(1 << 16)
+#define	VDEV_ANYRAID_MAX_TILES	(VDEV_ANYRAID_MAX_DISKS * VDEV_ANYRAID_MAX_TPD)
+/*
+ * The worst case scenario here is that we have a loc entry for every single
+ * tile (0 skips). At that point, we're using 4 bytes per tile.
+ * That gives us 2^24 * 4 bytes = 64 MiB to store the entire map.
+ */
+#define	VDEV_ANYRAID_MAP_SIZE	(sizeof (anyraid_map_loc_entry_t) * \
+	VDEV_ANYRAID_MAX_TILES)
+#define	VDEV_ANYRAID_SINGLE_MAP_SIZE(ashift) \
+	((VDEV_ANYRAID_MAP_HEADER_SIZE(ashift) + VDEV_ANYRAID_MAP_SIZE))
+#define	VDEV_ANYRAID_MAP_COPIES	4
+#define	VDEV_ANYRAID_START_COPIES	(VDEV_ANYRAID_MAP_COPIES / 2)
+#define	VDEV_ANYRAID_TOTAL_MAP_SIZE(ashift)	(VDEV_ANYRAID_MAP_COPIES * \
+	VDEV_ANYRAID_SINGLE_MAP_SIZE(ashift))
+#define	VDEV_ANYRAID_START_OFFSET(ashift)	(VDEV_ANYRAID_START_COPIES * \
+	VDEV_ANYRAID_SINGLE_MAP_SIZE(ashift))
+
+_Static_assert(VDEV_ANYRAID_TOTAL_MAP_SIZE(9) % SPA_MINBLOCKSIZE == 0, "");
+_Static_assert(VDEV_ANYRAID_TOTAL_MAP_SIZE(12) % SPA_MINBLOCKSIZE == 0, "");
+_Static_assert(VDEV_ANYRAID_MAP_SIZE % SPA_MAXBLOCKSIZE == 0, "");
+
+/*
+ * ==========================================================================
+ * Externally-accessed function definitions
+ * ==========================================================================
+ */
+void vdev_anyraid_write_map_sync(vdev_t *vd, zio_t *pio, uint64_t txg,
+    uint64_t *good_writes, int flags, vdev_config_sync_status_t status);
+
+void vdev_anyraid_expand(vdev_t *tvd, vdev_t *newvd);
+boolean_t vdev_anyraid_mapped(vdev_t *vd, uint64_t offset);
+
+/*
+ * These functions are exposed for ZDB.
+ */ + +typedef struct anyraid_header { + abd_t *ah_abd; + char *ah_buf; + nvlist_t *ah_nvl; +} anyraid_header_t; + +int vdev_anyraid_pick_best_mapping(vdev_t *cvd, + uint64_t *out_txg, anyraid_header_t *out_header, int *out_mapping); +int vdev_anyraid_open_header(vdev_t *cvd, int header, + anyraid_header_t *out_header); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_ANYRAID_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index afaa401343d9..8f1e0e197b74 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -70,7 +70,8 @@ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg); -typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); +typedef uint64_t vdev_min_asize_func_t(vdev_t *pvd, vdev_t *cvd); +typedef uint64_t vdev_min_attach_size_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); @@ -94,6 +95,7 @@ typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, uint64_t size, uint64_t max_segment); typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, uint64_t *sizep); +typedef void vdev_metaslab_size_func_t(vdev_t *vd, uint64_t *shiftp); typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); typedef uint64_t vdev_nparity_func_t(vdev_t *vd); typedef uint64_t vdev_ndisks_func_t(vdev_t *vd); @@ -106,6 +108,7 @@ typedef const struct vdev_ops { vdev_asize_func_t *vdev_op_psize_to_asize; vdev_asize_func_t *vdev_op_asize_to_psize; vdev_min_asize_func_t *vdev_op_min_asize; + vdev_min_attach_size_func_t *vdev_op_min_attach_size; vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; @@ -121,6 +124,7 @@ typedef const struct vdev_ops { vdev_nparity_func_t *vdev_op_nparity; vdev_ndisks_func_t *vdev_op_ndisks; vdev_kobj_post_evt_func_t *vdev_op_kobj_evt_post; + vdev_metaslab_size_func_t *vdev_op_metaslab_size; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -617,6 +621,9 @@ extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; +extern vdev_ops_t vdev_anyraid_ops; + +extern zio_vsd_ops_t vdev_mirror_vsd_ops; /* * Common size functions @@ -625,8 +632,10 @@ extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); extern uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); -extern uint64_t vdev_default_min_asize(vdev_t *vd); +extern uint64_t vdev_default_min_asize(vdev_t *pvd, vdev_t *cvd); +extern uint64_t vdev_default_min_attach_size(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); +extern uint64_t vdev_get_min_attach_size(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); extern uint64_t vdev_get_nparity(vdev_t *vd); extern uint64_t vdev_get_ndisks(vdev_t *vd); diff --git a/include/sys/vdev_mirror.h b/include/sys/vdev_mirror.h new file mode 100644 index 000000000000..f48cc333e8e0 --- /dev/null +++ b/include/sys/vdev_mirror.h @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * 
Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara Inc. + */ + +#ifndef _SYS_VDEV_MIRROR_H +#define _SYS_VDEV_MIRROR_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Virtual device vector for mirroring. + */ +typedef struct mirror_child { + vdev_t *mc_vd; + abd_t *mc_abd; + uint64_t mc_offset; + int mc_error; + int mc_load; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; + uint8_t mc_rebuilding; +} mirror_child_t; + +typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; + int mm_children; + boolean_t mm_resilvering; + boolean_t mm_rebuilding; + boolean_t mm_root; + mirror_child_t mm_child[]; +} mirror_map_t; + +mirror_map_t *vdev_mirror_map_alloc(int children, boolean_t resilvering, + boolean_t root); +void vdev_mirror_io_start_impl(zio_t *zio, mirror_map_t *mm); +void vdev_mirror_io_done(zio_t *zio); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_MIRROR_H */ diff --git a/include/sys/zio.h b/include/sys/zio.h index acb0a03a36b2..55d7f8bf4f77 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -107,6 +107,7 @@ enum zio_checksum { ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, ZIO_CHECKSUM_BLAKE3, + ZIO_CHECKSUM_ANYRAID_MAP, ZIO_CHECKSUM_FUNCTIONS }; @@ -213,6 +214,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_ALLOC_THROTTLED (1ULL << 14) +#define ZIO_FLAG_ZILWRITE (1ULL << 15) #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -220,29 +222,29 @@ typedef uint64_t zio_flag_t; /* * Flags inherited by vdev children. */ -#define ZIO_FLAG_IO_RETRY (1ULL << 15) /* must be first for INHERIT */ -#define ZIO_FLAG_PROBE (1ULL << 16) -#define ZIO_FLAG_TRYHARD (1ULL << 17) -#define ZIO_FLAG_OPTIONAL (1ULL << 18) -#define ZIO_FLAG_DIO_READ (1ULL << 19) +#define ZIO_FLAG_IO_RETRY (1ULL << 16) /* must be first for INHERIT */ +#define ZIO_FLAG_PROBE (1ULL << 17) +#define ZIO_FLAG_TRYHARD (1ULL << 18) +#define ZIO_FLAG_OPTIONAL (1ULL << 19) +#define ZIO_FLAG_DIO_READ (1ULL << 20) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. 
*/ -#define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */ -#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21) -#define ZIO_FLAG_IO_BYPASS (1ULL << 22) -#define ZIO_FLAG_IO_REWRITE (1ULL << 23) -#define ZIO_FLAG_RAW_COMPRESS (1ULL << 24) -#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25) -#define ZIO_FLAG_GANG_CHILD (1ULL << 26) -#define ZIO_FLAG_DDT_CHILD (1ULL << 27) -#define ZIO_FLAG_GODFATHER (1ULL << 28) -#define ZIO_FLAG_NOPWRITE (1ULL << 29) -#define ZIO_FLAG_REEXECUTED (1ULL << 30) -#define ZIO_FLAG_DELEGATED (1ULL << 31) -#define ZIO_FLAG_PREALLOCATED (1ULL << 32) +#define ZIO_FLAG_DONT_QUEUE (1ULL << 21) /* must be first for INHERIT */ +#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 22) +#define ZIO_FLAG_IO_BYPASS (1ULL << 23) +#define ZIO_FLAG_IO_REWRITE (1ULL << 24) +#define ZIO_FLAG_RAW_COMPRESS (1ULL << 25) +#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 26) +#define ZIO_FLAG_GANG_CHILD (1ULL << 27) +#define ZIO_FLAG_DDT_CHILD (1ULL << 28) +#define ZIO_FLAG_GODFATHER (1ULL << 29) +#define ZIO_FLAG_NOPWRITE (1ULL << 30) +#define ZIO_FLAG_REEXECUTED (1ULL << 31) +#define ZIO_FLAG_DELEGATED (1ULL << 32) +#define ZIO_FLAG_PREALLOCATED (1ULL << 33) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index f07ad2605e31..b68c712943c4 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -140,8 +140,8 @@ extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern void zio_checksum_compute(zio_t *, enum zio_checksum, struct abd *, uint64_t); -extern int zio_checksum_error_impl(spa_t *, const blkptr_t *, enum zio_checksum, - struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); +extern int zio_checksum_error_impl(zio_t *, enum zio_checksum, struct abd *, + uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa); diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 56382ca85b55..c44671673fbd 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -90,6 +90,7 @@ typedef enum spa_feature { SPA_FEATURE_DYNAMIC_GANG_HEADER, SPA_FEATURE_BLOCK_CLONING_ENDIAN, SPA_FEATURE_PHYSICAL_REWRITE, + SPA_FEATURE_ANYRAID, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 232265237f54..479e32b5469b 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -640,7 +640,7 @@ - + @@ -6027,7 +6027,10 @@ - + + + + @@ -6310,7 +6313,8 @@ - + + @@ -9521,8 +9525,8 @@ - - + + @@ -9600,7 +9604,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 756d701e2d97..a593891cfbac 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1223,7 +1223,8 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) strncmp(pool, "raidz", 5) == 0 || strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || - strcmp(pool, "log") == 0)) { + strcmp(pool, "log") == 0 || + strncmp(pool, "anymirror", 9) == 0)) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); @@ -1614,6 +1615,18 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, "minimum size (%s)"), buf); } return (zfs_error(hdl, EZFS_BADDEV, errbuf)); + case ENOLCK: + /* + * This occurs when one of the devices is an 
anyraid + * device that can't hold a single tile. + * Unfortunately, we can't detect which device was the + * problem device since there's no reliable way to + * determine device size from userland. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more anyraid devices cannot store " + "any tiles")); + return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ENOSPC: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -1848,7 +1861,18 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot, boolean_t check_ashift) } (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; - + case ENOLCK: + /* + * This occurs when one of the devices is an anyraid + * device that can't hold a single tile. + * Unfortunately, we can't detect which device was the + * problem device since there's no reliable way to + * determine device size from userland. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more anyraid devices cannot store " + "any tiles")); + return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to add these vdevs")); @@ -3197,7 +3221,8 @@ zpool_vdev_is_interior(const char *name) strncmp(name, VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || strncmp(name, VDEV_TYPE_ROOT, strlen(VDEV_TYPE_ROOT)) == 0 || - strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) + strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0 || + strncmp(name, VDEV_TYPE_ANYRAID, strlen(VDEV_TYPE_ANYRAID)) == 0) return (B_TRUE); if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 && @@ -3774,6 +3799,15 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; + case ENOLCK: + /* + * This occurs when one of the devices is an anyraid + * device that can't hold a single tile. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "new device cannot store any tiles")); + return (zfs_error(hdl, EZFS_BADDEV, errbuf)); + case ENAMETOOLONG: /* * The resulting top-level vdev spec won't fit in the label. @@ -4557,9 +4591,11 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, path = type; /* - * If it's a raidz device, we need to stick in the parity level. + * If it's a raidz or anyraid device, we need to stick in the + * parity level. 
 	 */
-	if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
+	if (strcmp(path, VDEV_TYPE_RAIDZ) == 0 ||
+	    strcmp(path, VDEV_TYPE_ANYRAID) == 0) {
 		value = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY);
 		(void) snprintf(buf, sizeof (buf), "%s%llu", path,
 		    (u_longlong_t)value);
@@ -5446,6 +5482,10 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
 	if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) {
 		src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE);
 		intval = fnvlist_lookup_uint64(nv, ZPROP_VALUE);
+	} else if (prop == VDEV_PROP_ANYRAID_CAP_TILES ||
+	    prop == VDEV_PROP_ANYRAID_NUM_TILES ||
+	    prop == VDEV_PROP_ANYRAID_TILE_SIZE) {
+		return (ENOENT);
 	} else {
 		src = ZPROP_SRC_DEFAULT;
 		intval = vdev_prop_default_numeric(prop);
@@ -5476,6 +5516,7 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
 	case VDEV_PROP_BYTES_FREE:
 	case VDEV_PROP_BYTES_CLAIM:
 	case VDEV_PROP_BYTES_TRIM:
+	case VDEV_PROP_ANYRAID_TILE_SIZE:
 		if (literal) {
 			(void) snprintf(buf, len, "%llu",
 			    (u_longlong_t)intval);
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index aeacc595b363..7097e7053e2b 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -148,6 +148,7 @@ nodist_libzpool_la_SOURCES = \
 	module/zfs/vdev_label.c \
 	module/zfs/vdev_mirror.c \
 	module/zfs/vdev_missing.c \
+	module/zfs/vdev_anyraid.c \
 	module/zfs/vdev_queue.c \
 	module/zfs/vdev_raidz.c \
 	module/zfs/vdev_raidz_math.c \
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 60ec56b4d1f6..da79854f0425 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -655,6 +655,15 @@ Logical ashift for file-based devices.
 .It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64
 Physical ashift for file-based devices.
 .
+.It Sy zfs_anyraid_min_tile_size Ns = Ns Sy 16 GiB Pq u64
+Minimum size of the tiles that anyraid will use to do its mapping.
+Smaller tile sizes let data be spread more evenly across devices, and make
+smaller devices use more of their capacity.
+Larger tile sizes allow for larger disks to be used in the future, since a
+given device can only store 16384 tiles.
+The minimum valid tile size is 16 MiB, since a metaslab always needs to be
+able to fit in a single tile.
+.
 .It Sy zap_iterate_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int
 If set, when we start iterating over a ZAP object, prefetch the entire
 object (all leaf blocks).
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index b54abcd3ecc9..54f6abe71ef6 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -194,6 +194,21 @@ If this device should perform new allocations, used to disable a device when
 it is scheduled for later removal.
 See
 .Xr zpool-remove 8 .
+.It anyraid_tile_capacity
+Only valid for
+.Sy AnyRAID
+vdevs and their leaf vdevs.
+The number of physical tiles that the vdev can hold.
+.It anyraid_tile_count
+Only valid for
+.Sy AnyRAID
+vdevs and their leaf vdevs.
+The number of physical tiles that are currently allocated on the vdev.
+.It anyraid_tile_size
+Only valid for
+.Sy AnyRAID
+vdevs and their leaf vdevs.
+The size of the tiles in use on this vdev.
 .El
 .Ss User Properties
 In addition to the standard native properties, ZFS supports arbitrary user
diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7
index 21bd72351209..6a8bff5a6086 100644
--- a/man/man7/zpoolconcepts.7
+++ b/man/man7/zpoolconcepts.7
@@ -165,6 +165,30 @@ An error is returned when the provided number of children differs.
 The number of distributed hot spares.
 Defaults to zero.
.El +.It Sy anymirror , anymirror0 , anymirror1 , anymirror2 +A new device type that allows for mirror-parity redundancy while using devices +of different sizes. +An AnyRAID vdev works by dividing each of the underlying disks that make it up +into +.Sy tiles \. +The tiles are then each mirrored at the desired parity level. +This allows for full redundancy, since tiles are allocated from independent +disks, while enabling maximum space usage by allocating more tiles from the +disks with the most free space. +In addition, the device can be expanded by attaching new disks, and new tiles +will be allocated from those disks. +The vdev class as a whole is referred to as AnyRAID; anymirror vdevs +specifically use mirror-style parity. +Future work will also add anyraidz, which will use the same basic tile +architecture, but use raidz-style parity. +.Sy anymirror +is a synonym for +.Sy anymirror1 +, which is the 2-way mirror parity version (1 parity tile). +.Sy anymirror2 +is a 3-way mirror (2 parity tiles), while +.Sy anymirror0 +is striped (no parity tiles), and is primarily intended for testing. .It Sy spare A pseudo-vdev which keeps track of available hot spares for a pool. For more information, see the diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index f51e24fa849c..a77bf112a13b 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -95,6 +95,9 @@ .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl U Ar cache .Ar poolname +.Nm +.Fl -anyraid-map +.Ar poolname Op Ar vdev Ns … . .Sh DESCRIPTION The @@ -440,6 +443,8 @@ Display histograms of per-vdev BRT refcounts. Dump the contents of the block reference tables. .It Fl u , -uberblock Display the current uberblock. +.It Fl -anyraid-map +Display information about the mappings of one or all anyraid vdevs in the pool. 
.El .Pp Other options: diff --git a/module/Kbuild.in b/module/Kbuild.in index 95313c984178..5958b76476cd 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -385,6 +385,7 @@ ZFS_OBJS := \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ + vdev_anyraid.o \ vdev_queue.o \ vdev_raidz.o \ vdev_raidz_math.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index c20fdc0c483b..870f54a4d970 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -335,6 +335,7 @@ SRCS+= abd.c \ vdev_label.c \ vdev_mirror.c \ vdev_missing.c \ + vdev_anyraid.c \ vdev_queue.c \ vdev_raidz.c \ vdev_raidz_math_avx2.c \ diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 393bfaa65ff5..b1fe3e1800f3 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -93,6 +93,8 @@ #include SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, anyraid, CTLFLAG_RW, 0, + "ZFS AnyRAID VDEV"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0, diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index bbd1dafc69be..c75f4443afd9 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1288,6 +1288,7 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_geom_io_start, .vdev_op_io_done = vdev_geom_io_done, diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 1bd3500e9f66..b4259313822c 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1290,6 +1290,7 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 6ba9892eeb64..e3a96fad32ee 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -810,6 +810,10 @@ zpool_feature_init(void) ZFEATURE_TYPE_BOOLEAN, physical_rewrite_deps, sfeatures); } + zfeature_register(SPA_FEATURE_ANYRAID, + "com.klarasystems:anyraid", "anyraid", "Support for anyraid VDEV", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index deb0547c1084..7770f44c083e 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -445,7 +445,8 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0 || - strcmp(pool, "draid") == 0) { + strcmp(pool, "draid") == 0 || + strcmp(pool, "anymirror") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 4826237b23e8..8c3ac0ae0874 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -436,6 +436,15 @@ vdev_prop_init(void) zprop_register_number(VDEV_PROP_BYTES_TRIM, "trim_bytes", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "TRIMBYTE", B_FALSE, sfeatures); + 
zprop_register_number(VDEV_PROP_ANYRAID_CAP_TILES, + "anyraid_tile_capacity", 0, PROP_READONLY, ZFS_TYPE_VDEV, + "", "TILECAP", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_ANYRAID_NUM_TILES, + "anyraid_tile_count", 0, PROP_READONLY, ZFS_TYPE_VDEV, + "", "NUMTILES", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_ANYRAID_TILE_SIZE, + "anyraid_tile_size", 0, PROP_READONLY, ZFS_TYPE_VDEV, + "", "TILESIZE", B_FALSE, sfeatures); /* default numeric properties */ zprop_register_number(VDEV_PROP_CHECKSUM_N, "checksum_n", UINT64_MAX, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index b864d9035974..f9c766aff0c7 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1507,9 +1507,8 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ - return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, - BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, - zio->io_offset, NULL) == 0); + return (zio_checksum_error_impl(zio, BP_GET_CHECKSUM(zio->io_bp), + zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 5690f8afad00..19b7527d4376 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2052,7 +2052,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, - dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_ZILWRITE, + zb)); return (0); } @@ -2220,8 +2221,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL, - dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, - &zb)); + dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_ZILWRITE, &zb)); return (0); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 3f649ffb44e4..b2ada5e980b6 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -3250,7 +3251,8 @@ metaslab_space_weight(metaslab_t *msp) * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ - if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { + if ((!vd->vdev_nonrot && metaslab_lba_weighting_enabled) || + vd->vdev_ops == &vdev_anyraid_ops) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } @@ -3410,6 +3412,23 @@ metaslab_segment_weight(metaslab_t *msp) weight = metaslab_weight_from_spacemap(msp); } + /* + * Anyraid vdevs strongly prefer allocations from earlier regions, in + * order to prevent premature region placement. While this optimization + * is not usually good for segment-based weighting, we enable it for + * that case specifically. 
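+	 *
+	 * For example (illustrative numbers), with vdev_ms_count = 100,
+	 * metaslabs 0-24 get their weight index raised by 3, 25-49 by 2,
+	 * 50-74 by 1, and 75-99 are left as-is.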
+	 */
+	vdev_t *vd = mg->mg_vd;
+	if ((vd->vdev_ops == &vdev_anyraid_ops ||
+	    metaslab_lba_weighting_enabled) &&
+	    WEIGHT_GET_INDEX(weight) > SPA_MAXBLOCKSHIFT) {
+		uint64_t id = msp->ms_id;
+		uint64_t count = vd->vdev_ms_count;
+		WEIGHT_SET_INDEX(weight, WEIGHT_GET_INDEX(weight) + 3 -
+		    ((id * 4) / count));
+		weight = MIN(weight, METASLAB_MAX_WEIGHT);
+	}
+
 	/*
 	 * If the metaslab was active the last time we calculated its weight
 	 * then keep it active. We want to consume the entire region that
@@ -3430,7 +3449,8 @@ metaslab_segment_weight(metaslab_t *msp)
  * weights we rely on the entire weight (excluding the weight-type bit).
  */
 static boolean_t
-metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
+metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard,
+    boolean_t mapped)
 {
 	/*
 	 * This case will usually but not always get caught by the checks below;
@@ -3441,6 +3461,17 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
 	if (unlikely(msp->ms_new))
 		return (B_FALSE);
 
+	/*
+	 * This I/O needs to be written to a stable location and be retrievable
+	 * before the next TXG syncs. This is the case for ZIL writes. In that
+	 * case, if we're using an anyraid vdev, we can't use a tile that isn't
+	 * mapped yet.
+	 */
+	if (mapped && msp->ms_group->mg_vd->vdev_ops == &vdev_anyraid_ops) {
+		return (vdev_anyraid_mapped(msp->ms_group->mg_vd,
+		    msp->ms_start));
+	}
+
 	/*
 	 * If the metaslab is loaded, ms_max_size is definitive and we can use
 	 * the fast check. If it's not, the ms_max_size is a lower bound (once
@@ -4891,8 +4922,8 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
 static metaslab_t *
 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
     dva_t *dva, int d, uint64_t asize, int allocator,
-    boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
-    boolean_t *was_active)
+    boolean_t try_hard, boolean_t mapped, zio_alloc_list_t *zal,
+    metaslab_t *search, boolean_t *was_active)
 {
 	avl_index_t idx;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -4910,7 +4941,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
 		}
 		tries++;
 
-		if (!metaslab_should_allocate(msp, asize, try_hard)) {
+		if (!metaslab_should_allocate(msp, asize, try_hard, mapped)) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			continue;
@@ -4991,7 +5022,7 @@ metaslab_active_mask_verify(metaslab_t *msp)
 static uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t max_asize, uint64_t txg,
-    dva_t *dva, int d, int allocator, boolean_t try_hard,
+    dva_t *dva, int d, int allocator, boolean_t try_hard, boolean_t mapped,
     uint64_t *actual_asize)
 {
 	metaslab_t *msp = NULL;
@@ -5067,7 +5098,7 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 		} else {
 			msp = find_valid_metaslab(mg, activation_weight, dva, d,
-			    asize, allocator, try_hard, zal, search,
+			    asize, allocator, try_hard, mapped, zal, search,
 			    &was_active);
 		}
 
@@ -5173,7 +5204,7 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
 		 * can accurately determine if the allocation attempt should
 		 * proceed.
 		 */
-		if (!metaslab_should_allocate(msp, asize, try_hard)) {
+		if (!metaslab_should_allocate(msp, asize, try_hard, mapped)) {
 			/* Passivate this metaslab and select a new one.
*/ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -5267,7 +5298,7 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); + ASSERT(!metaslab_should_allocate(msp, asize, try_hard, mapped)); mutex_exit(&msp->ms_lock); } @@ -5422,8 +5453,12 @@ metaslab_alloc_dva_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, uint64_t max_asize = vdev_psize_to_asize_txg(vd, max_psize, txg); ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift)); + boolean_t mapped = B_FALSE; + if (flags & METASLAB_ZIL) + mapped = B_TRUE; + uint64_t offset = metaslab_group_alloc(mg, zal, asize, - max_asize, txg, dva, d, allocator, try_hard, + max_asize, txg, dva, d, allocator, try_hard, mapped, &asize); if (offset != -1ULL) { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 34de3f1d9525..607fc82066b1 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -5499,7 +5500,8 @@ spa_ld_checkpoint_rewind(spa_t *spa) if (svdcount == SPA_SYNC_MIN_VDEVS) break; } - error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); + error = vdev_config_sync(svd, svdcount, spa->spa_first_txg, + VDEV_CONFIG_REWINDING_CHECKPOINT); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); @@ -6891,6 +6893,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, for (int i = 0; i < ndraid; i++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + for (uint64_t i = 0; i < rvd->vdev_children; i++) + if (rvd->vdev_child[i]->vdev_ops == &vdev_anyraid_ops) + spa_feature_incr(spa, SPA_FEATURE_ANYRAID, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -7484,13 +7490,26 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); } +/* + * This is called as a synctask to increment the anyraid feature flag + */ +static void +spa_anyraid_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + uint64_t nanyraid = (uint64_t)(uintptr_t)arg; + + for (int i = 0; i < nanyraid; i++) + spa_feature_incr(spa, SPA_FEATURE_ANYRAID, tx); +} + /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { - uint64_t txg, ndraid = 0; + uint64_t txg, ndraid = 0, nanyraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -7624,6 +7643,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) dmu_tx_commit(tx); } + for (uint64_t i = 0; i < vd->vdev_children; i++) + if (vd->vdev_child[i]->vdev_ops == &vdev_anyraid_ops) + nanyraid++; + if (nanyraid > 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, + spa_anyraid_feature_incr, + (void *)(uintptr_t)nanyraid, tx); + dmu_tx_commit(tx); + } + /* * We have to be careful when adding new vdevs to an existing pool. 
* If other threads start allocating from these vdevs before we @@ -7790,6 +7822,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, NULL, txg, ENODEV)); boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; + boolean_t anyraid = oldvd->vdev_ops == &vdev_anyraid_ops; if (raidz) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) @@ -7802,11 +7835,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); } - } else if (!oldvd->vdev_ops->vdev_op_leaf) { + } else if (!anyraid && !oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); } - if (raidz) + if (raidz || anyraid) pvd = oldvd; else pvd = oldvd->vdev_parent; @@ -7854,6 +7887,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (tvd->vdev_ops != &vdev_mirror_ops && tvd->vdev_ops != &vdev_root_ops && + tvd->vdev_ops != &vdev_anyraid_ops && tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } @@ -7867,10 +7901,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && - !raidz) + !raidz && !anyraid) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - pvops = &vdev_mirror_ops; + if (anyraid) + pvops = &vdev_anyraid_ops; + else + pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot @@ -7912,8 +7949,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. */ - vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; - if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) + if (newvd->vdev_asize < vdev_get_min_attach_size(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -7960,6 +7996,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); oldvdpath = spa_strdup(tmp); kmem_strfree(tmp); + } else if (anyraid) { + char *tmp = kmem_asprintf(VDEV_TYPE_ANYRAID "%u-%u", + (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); + oldvdpath = spa_strdup(tmp); + kmem_strfree(tmp); } else { oldvdpath = spa_strdup(oldvd->vdev_path); } @@ -7987,7 +8028,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. 
*/ - if (!raidz && pvd->vdev_ops != pvops) { + if (!raidz && !anyraid && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); @@ -8045,6 +8086,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, newvd, tx); dmu_tx_commit(tx); + } else if (anyraid) { + vdev_anyraid_expand(tvd, newvd); + vdev_dirty(tvd, VDD_DTL, newvd, txg); + tvd->vdev_expanding = B_TRUE; + vdev_reopen(tvd); + spa->spa_ccw_fail_time = 0; + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } else { vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); @@ -10329,6 +10377,13 @@ spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg = tx->tx_txg; + vdev_config_sync_status_t status; + if (dmu_tx_get_txg(tx) == spa->spa_checkpoint_txg + 1) + status = VDEV_CONFIG_CREATING_CHECKPOINT; + else if (spa->spa_checkpoint_txg == 0) + status = VDEV_CONFIG_NO_CHECKPOINT; + else + status = VDEV_CONFIG_KEEP_CHECKPOINT; for (;;) { int error = 0; @@ -10362,10 +10417,10 @@ spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) if (svdcount == SPA_SYNC_MIN_VDEVS) break; } - error = vdev_config_sync(svd, svdcount, txg); + error = vdev_config_sync(svd, svdcount, txg, status); } else { error = vdev_config_sync(rvd->vdev_child, - rvd->vdev_children, txg); + rvd->vdev_children, txg, status); } if (error == 0) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index bf22d2eb68e7..cf59f0a9acb7 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1875,6 +1875,18 @@ spa_syncing_txg(spa_t *spa) return (spa->spa_syncing_txg); } +uint64_t +spa_load_max_txg(spa_t *spa) +{ + return (spa->spa_load_max_txg); +} + +uint64_t +spa_current_txg(spa_t *spa) +{ + return (spa->spa_uberblock.ub_txg); +} + /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. 
@@ -2323,6 +2335,12 @@ spa_dirty_data(spa_t *spa) return (spa->spa_dsl_pool->dp_dirty_total); } +uint64_t +spa_load_txg(spa_t *spa) +{ + return (spa->spa_load_txg); +} + /* * ========================================================================== * SPA Import Progress Routines @@ -3042,6 +3060,12 @@ spa_has_checkpoint(spa_t *spa) return (spa->spa_checkpoint_txg != 0); } +uint64_t +spa_checkpoint_txg(spa_t *spa) +{ + return (spa->spa_checkpoint_txg); +} + boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { @@ -3049,6 +3073,13 @@ spa_importing_readonly_checkpoint(spa_t *spa) spa->spa_mode == SPA_MODE_READ); } +boolean_t +spa_importing_checkpoint(spa_t *spa) +{ + return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && + spa->spa_uberblock.ub_checkpoint_txg != 0); +} + uint64_t spa_min_claim_txg(spa_t *spa) { @@ -3151,6 +3182,7 @@ EXPORT_SYMBOL(spa_syncing_txg); EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); +EXPORT_SYMBOL(spa_load_txg); EXPORT_SYMBOL(spa_freeze_txg); EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); @@ -3193,8 +3225,10 @@ EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); EXPORT_SYMBOL(spa_importing_readonly_checkpoint); +EXPORT_SYMBOL(spa_importing_checkpoint); EXPORT_SYMBOL(spa_min_claim_txg); EXPORT_SYMBOL(spa_suspend_async_destroy); +EXPORT_SYMBOL(spa_checkpoint_txg); EXPORT_SYMBOL(spa_has_checkpoint); EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 2754769eb759..e30375626669 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -55,11 +55,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include "zfs_prop.h" @@ -279,6 +279,7 @@ static vdev_ops_t *const vdev_ops_table[] = { &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, + &vdev_anyraid_ops, NULL }; @@ -345,6 +346,21 @@ vdev_derive_alloc_bias(const char *bias) return (alloc_bias); } +uint64_t +vdev_default_min_attach_size(vdev_t *vd) +{ + return (vdev_get_min_asize(vd)); +} + +uint64_t +vdev_get_min_attach_size(vdev_t *vd) +{ + vdev_t *pvd = vd->vdev_parent; + if (vd == vd->vdev_top) + pvd = vd; + return (pvd->vdev_ops->vdev_op_min_attach_size(pvd)); +} + uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg) { @@ -377,9 +393,10 @@ vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) } uint64_t -vdev_default_min_asize(vdev_t *vd) +vdev_default_min_asize(vdev_t *pvd, vdev_t *cvd) { - return (vd->vdev_min_asize); + (void) cvd; + return (pvd->vdev_min_asize); } /* @@ -408,7 +425,7 @@ vdev_get_min_asize(vdev_t *vd) return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, uint64_t)); - return (pvd->vdev_ops->vdev_op_min_asize(pvd)); + return (pvd->vdev_ops->vdev_op_min_asize(pvd, vd)); } void @@ -903,6 +920,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } + + /* spa_vdev_add() expects feature to be enabled */ + if (ops == &vdev_anyraid_ops && + spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, SPA_FEATURE_ANYRAID)) { + return (SET_ERROR(ENOTSUP)); + } } /* @@ -3013,6 +3037,8 @@ vdev_metaslab_set_size(vdev_t *vd) if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } + if (vd->vdev_ops->vdev_op_metaslab_size) + 
vd->vdev_ops->vdev_op_metaslab_size(vd, &ms_shift); vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); @@ -6712,6 +6738,68 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) break; } break; + case VDEV_PROP_ANYRAID_CAP_TILES: + { + vdev_t *pvd = vd->vdev_parent; + uint64_t total = 0; + if (vd->vdev_ops == &vdev_anyraid_ops) { + vdev_anyraid_t *var = vd->vdev_tsd; + for (int i = 0; i < vd->vdev_children; + i++) { + total += var->vd_children[i] + ->van_capacity + 1; + } + } else if (pvd && pvd->vdev_ops == + &vdev_anyraid_ops) { + vdev_anyraid_t *var = pvd->vdev_tsd; + total = var->vd_children[vd->vdev_id] + ->van_capacity + 1; + } else { + continue; + } + vdev_prop_add_list(outnvl, propname, + NULL, total, ZPROP_SRC_NONE); + continue; + } + case VDEV_PROP_ANYRAID_NUM_TILES: + { + vdev_t *pvd = vd->vdev_parent; + uint64_t total = 0; + if (vd->vdev_ops == &vdev_anyraid_ops) { + vdev_anyraid_t *var = vd->vdev_tsd; + for (int i = 0; i < vd->vdev_children; + i++) { + total += var->vd_children[i] + ->van_next_offset; + } + } else if (pvd && pvd->vdev_ops == + &vdev_anyraid_ops) { + vdev_anyraid_t *var = pvd->vdev_tsd; + total = var->vd_children[vd->vdev_id] + ->van_next_offset; + } else { + continue; + } + vdev_prop_add_list(outnvl, propname, + NULL, total, ZPROP_SRC_NONE); + continue; + } + case VDEV_PROP_ANYRAID_TILE_SIZE: + { + vdev_t *pvd = vd->vdev_parent; + vdev_anyraid_t *var = NULL; + if (vd->vdev_ops == &vdev_anyraid_ops) { + var = vd->vdev_tsd; + } else if (pvd && pvd->vdev_ops == + &vdev_anyraid_ops) { + var = pvd->vdev_tsd; + } else { + continue; + } + vdev_prop_add_list(outnvl, propname, + NULL, var->vd_tile_size, ZPROP_SRC_NONE); + continue; + } default: err = ENOENT; break; diff --git a/module/zfs/vdev_anyraid.c b/module/zfs/vdev_anyraid.c new file mode 100644 index 000000000000..28b9dee1a46f --- /dev/null +++ b/module/zfs/vdev_anyraid.c @@ -0,0 +1,1558 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Klara Inc. + */ + +/* + * Anyraid vdevs are a way to get the benefits of mirror (and, in the future, + * raidz) vdevs while using disks with mismatched sizes. The primary goal of + * this feature is maximizing the available space of the provided devices. + * Performance is secondary to that goal; nice to have, but not required. This + * feature is also designed to work on modern hard drives: while the feature + * will work on drives smaller than 1TB, the default tuning values are + * optimized for drives of at least that size. + * + * Anyraid works by splitting the vdev into "tiles". 
Each tile is the same + * size; by default, 1/64th of the size of the smallest disk in the vdev, or + * 16GiB, whichever is larger. A tile represents an area of + * logical-to-physical mapping: bytes within that logical tile are stored + * physically together. Subsequent tiles may be stored in different locations + * on the same disk, or different disks altogether. A mapping is stored on each + * disk to enable the vdev to be read normally. + * + * When parity is not considered, this provides some small benefits (device + * removal within the vdev is not yet implemented, but is very feasible, as is + * rebalancing data onto new disks), but is not generally recommended. However, + * if parity is considered, it is more useful. With mirror parity P, each + * tile is allocated onto P separate disks, providing the reliability and + * performance characteristics of a mirror vdev. In addition, because each tile + * can be allocated separately, smaller drives can work together to mirror + * larger ones dynamically and seamlessly. + * + * The mapping for these tiles is stored in a special area at the start of + * each device. Each disk has 4 full copies of the tile map, which rotate + * per txg in a similar manner to uberblocks. The tile map itself is 64MiB, + * plus a small header (~8KiB) before it. + * + * The exact space that is allocatable in an anyraid vdev is not easy to + * calculate in the general case. It's a variant of the bin-packing problem, so + * an optimal solution is complex. However, this case seems to be a sub-problem + * where greedy algorithms give optimal solutions, so that is what we do here. + * Each tile is allocated from the P disks that have the most available + * capacity. This does mean that calculating the size of a disk requires + * running the allocation algorithm until completion, but for the relatively + * small number of tiles we are working with, an O(n * log n) runtime is + * acceptable. + * + * Currently, there is a limit of 2^24 tiles in an anyraid vdev: 2^8 disks, + * and 2^16 tiles per disk. This means that by default, the largest device + * that can be fully utilized by an anyraid vdev is 1024 times the size of the + * smallest device that was present during device creation. This is not a + * fundamental limit, and could be expanded in the future. However, this does + * affect the size of the tile map. Currently, the tile map can always + * store all tiles without running out of space; 2^24 4-byte entries is 2^26 + * bytes = 64MiB. Expanding the maximum number of tiles per disk or disks per + * vdev would necessarily involve either expanding the tile map or adding + * handling for the tile map running out of space. + * + * When it comes to performance, there is a tradeoff. While the per-disk I/O + * rates are equivalent to using mirrors (because only a small amount of extra + * logic is used on top of the mirror code), the overall vdev throughput may + * not be. This is because the actively used tiles may be allocated to the + * same devices, leaving other devices idle for writes. This is especially true + * as the variation in drive sizes increases. To some extent, this problem is + * fundamental: writes fill up disks. If we want to fill all the disks, smaller + * disks will not be able to satisfy as many writes. Rewrite- and read-heavy + * workloads will encounter this problem to a lesser extent. The performance + * downsides can be mitigated with smaller tile sizes, larger metaslabs, + * and more active metaslab allocators. 
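+ *
+ * As an illustrative example of the greedy sizing (ignoring map and label
+ * overhead and any rounding): an anymirror1 vdev built from 8 TiB, 4 TiB,
+ * and 2 TiB disks with a 128 GiB tile has 64, 32, and 16 tiles available.
+ * Each logical tile must land on two different disks, and the two smaller
+ * disks can only supply 48 tiles between them, so the greedy pairing yields
+ * 48 mirrored tiles, or 6 TiB of usable space.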
+ * + * Checkpoints are currently supported by storing the maximum allocated tile + * at the time of the checkpoint, and then discarding all tiles after that + * when a checkpoint is rolled back. Because device addition is forbidden while + * a checkpoint is outstanding, no more complex logic is required. + * + * Currently, anyraid vdevs only work with mirror-type parity. However, plans + * for future work include: + * Raidz-type parity + * Anyraid vdev shrinking via device removal + * Rebalancing after device addition + * + * Possible future work also includes: + * Enabling rebalancing with an outstanding checkpoint + * Trim and initialize beyond the end of the allocated tiles + * Store device asizes so we can make better allocation decisions while a + * device is faulted + */ + +#include +#include +#include +#include +#include + +/* + * The smallest allowable tile size. Shrinking this is mostly useful for + * testing. Increasing it may be useful if you plan to add much larger disks to + * an array in the future, and want to be sure their full capacity will be + * usable. + */ +uint64_t zfs_anyraid_min_tile_size = (16ULL << 30); +/* + * This controls how many tiles we have per disk (based on the smallest disk + * present at creation time) + */ +int anyraid_disk_shift = 6; + +static inline uint64_t +vdev_anyraid_header_offset(vdev_t *vd, int id) +{ + uint64_t full_size = VDEV_ANYRAID_SINGLE_MAP_SIZE(vd->vdev_ashift); + if (id < VDEV_ANYRAID_START_COPES) + return (VDEV_LABEL_START_SIZE + id * full_size); + else + return (vd->vdev_psize - VDEV_LABEL_END_SIZE - + (VDEV_ANYRAID_MAP_COPIES - id) * full_size); +} + +static inline int +anyraid_tile_compare(const void *p1, const void *p2) +{ + const anyraid_tile_t *r1 = p1, *r2 = p2; + + return (TREE_CMP(r1->at_tile_id, r2->at_tile_id)); +} + +static inline int +anyraid_child_compare(const void *p1, const void *p2) +{ + const vdev_anyraid_node_t *van1 = p1, *van2 = p2; + + int cmp = TREE_CMP(van2->van_capacity - van2->van_next_offset, + van1->van_capacity - van1->van_next_offset); + if (cmp != 0) + return (cmp); + + return (TREE_CMP(van1->van_id, van2->van_id)); +} + +/* + * Initialize private VDEV specific fields from the nvlist. 
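+ * The fields consumed here are ZPOOL_CONFIG_CHILDREN (to size and create
+ * the per-child bookkeeping), ZPOOL_CONFIG_NPARITY, and
+ * ZPOOL_CONFIG_ANYRAID_PARITY_TYPE; only VAP_MIRROR parity is accepted for
+ * now.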
+ */ +static int +vdev_anyraid_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + (void) spa; + uint_t children; + nvlist_t **child; + int error = nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &children); + if (error != 0 || children > VDEV_ANYRAID_MAX_DISKS) + return (SET_ERROR(EINVAL)); + + uint64_t nparity; + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) != 0) + return (SET_ERROR(EINVAL)); + + vdev_anyraid_parity_type_t parity_type = VAP_TYPES; + if (nvlist_lookup_uint8(nv, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, + (uint8_t *)&parity_type) != 0) + return (SET_ERROR(EINVAL)); + if (parity_type != VAP_MIRROR) + return (SET_ERROR(ENOTSUP)); + + vdev_anyraid_t *var = kmem_zalloc(sizeof (*var), KM_SLEEP); + var->vd_parity_type = parity_type; + var->vd_nparity = nparity; + rw_init(&var->vd_lock, NULL, RW_DEFAULT, NULL); + avl_create(&var->vd_tile_map, anyraid_tile_compare, + sizeof (anyraid_tile_t), offsetof(anyraid_tile_t, at_node)); + avl_create(&var->vd_children_tree, anyraid_child_compare, + sizeof (vdev_anyraid_node_t), + offsetof(vdev_anyraid_node_t, van_node)); + + var->vd_children = kmem_zalloc(sizeof (*var->vd_children) * children, + KM_SLEEP); + for (int c = 0; c < children; c++) { + vdev_anyraid_node_t *van = kmem_zalloc(sizeof (*van), KM_SLEEP); + van->van_id = c; + avl_add(&var->vd_children_tree, van); + var->vd_children[c] = van; + } + + *tsd = var; + return (0); +} + +static void +vdev_anyraid_fini(vdev_t *vd) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + avl_destroy(&var->vd_tile_map); + + vdev_anyraid_node_t *node; + void *cookie = NULL; + while ((node = avl_destroy_nodes(&var->vd_children_tree, &cookie))) { + kmem_free(node, sizeof (*node)); + } + avl_destroy(&var->vd_children_tree); + + rw_destroy(&var->vd_lock); + kmem_free(var->vd_children, + sizeof (*var->vd_children) * vd->vdev_children); + kmem_free(var, sizeof (*var)); +} + +/* + * Add ANYRAID specific fields to the config nvlist. + */ +static void +vdev_anyraid_config_generate(vdev_t *vd, nvlist_t *nv) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_anyraid_ops); + vdev_anyraid_t *var = vd->vdev_tsd; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, var->vd_nparity); + fnvlist_add_uint8(nv, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, + (uint8_t)var->vd_parity_type); +} + +/* + * Import/open related functions. + */ + +/* + * Add an entry to the tile map for the provided tile. 
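+ * Entries arrive in groups of vd_nparity + 1: the first entry of a group
+ * allocates a new anyraid_tile_t, and every entry (including the first)
+ * appends one disk/offset pair to that tile's list. pat_cnt tracks our
+ * position within the current group.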
+ */ +static void +create_tile_entry(vdev_anyraid_t *var, anyraid_map_loc_entry_t *amle, + uint8_t *pat_cnt, anyraid_tile_t **out_at, uint32_t *cur_tile) +{ + uint8_t disk = amle_get_disk(amle); + uint16_t offset = amle_get_offset(amle); + anyraid_tile_t *at = *out_at; + + if (*pat_cnt == 0) { + at = kmem_alloc(sizeof (*at), KM_SLEEP); + at->at_tile_id = *cur_tile; + avl_add(&var->vd_tile_map, at); + list_create(&at->at_list, + sizeof (anyraid_tile_node_t), + offsetof(anyraid_tile_node_t, atn_node)); + + (*cur_tile)++; + } + + anyraid_tile_node_t *atn = kmem_alloc(sizeof (*atn), KM_SLEEP); + atn->atn_disk = disk; + atn->atn_offset = offset; + list_insert_tail(&at->at_list, atn); + *pat_cnt = (*pat_cnt + 1) % (var->vd_nparity + 1); + + vdev_anyraid_node_t *van = var->vd_children[disk]; + avl_remove(&var->vd_children_tree, van); + van->van_next_offset = MAX(van->van_next_offset, offset + 1); + avl_add(&var->vd_children_tree, van); + *out_at = at; +} + +static void +child_read_done(zio_t *zio) +{ + zio_t *pio = zio_unique_parent(zio); + abd_t **cbp = pio->io_private; + + if (zio->io_error == 0) { + mutex_enter(&pio->io_lock); + if (*cbp == NULL) + *cbp = zio->io_abd; + else + abd_free(zio->io_abd); + mutex_exit(&pio->io_lock); + } else { + abd_free(zio->io_abd); + } +} + +static void +child_read(zio_t *zio, vdev_t *vd, uint64_t offset, uint64_t size, + int checksum, void *private, int flags) +{ + for (int c = 0; c < vd->vdev_children; c++) { + child_read(zio, vd->vdev_child[c], offset, size, checksum, + private, flags); + } + + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + zio_nowait(zio_read_phys(zio, vd, offset, size, + abd_alloc_linear(size, B_TRUE), checksum, + child_read_done, private, ZIO_PRIORITY_SYNC_READ, flags, + B_FALSE)); + } +} + +/* + * This function is non-static for ZDB, and shouldn't be used for anything else. + * Utility function that issues the read for the header and parses out the + * nvlist. + */ +int +vdev_anyraid_open_header(vdev_t *cvd, int header, anyraid_header_t *out_header) +{ + spa_t *spa = cvd->vdev_spa; + uint64_t ashift = cvd->vdev_ashift; + uint64_t header_offset = vdev_anyraid_header_offset(cvd, header); + uint64_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(ashift); + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + + abd_t *header_abd = NULL; + zio_t *rio = zio_root(spa, NULL, &header_abd, flags); + child_read(rio, cvd, header_offset, header_size, ZIO_CHECKSUM_LABEL, + NULL, flags); + + int error; + if ((error = zio_wait(rio)) != 0) { + zfs_dbgmsg("Error %d reading anyraid header %d on vdev %s", + error, header, cvd->vdev_path); + abd_free(header_abd); + return (error); + } + + char *header_buf = abd_borrow_buf(header_abd, header_size); + nvlist_t *header_nvl; + error = nvlist_unpack(header_buf, header_size, &header_nvl, + KM_SLEEP); + if (error != 0) { + zfs_dbgmsg("Error %d unpacking anyraid header %d on vdev %s", + error, header, cvd->vdev_path); + abd_return_buf(header_abd, header_buf, header_size); + abd_free(header_abd); + return (error); + } + out_header->ah_abd = header_abd; + out_header->ah_buf = header_buf; + out_header->ah_nvl = header_nvl; + + return (0); +} + +static void +free_header(anyraid_header_t *header, uint64_t header_size) { + fnvlist_free(header->ah_nvl); + abd_return_buf(header->ah_abd, header->ah_buf, header_size); + abd_free(header->ah_abd); +} + +/* + * This function is non-static for ZDB, and shouldn't be used for anything else. 
+ * + * Iterate over all the copies of the map for the given child vdev and select + * the best one. + */ +int +vdev_anyraid_pick_best_mapping(vdev_t *cvd, uint64_t *out_txg, + anyraid_header_t *out_header, int *out_mapping) +{ + spa_t *spa = cvd->vdev_spa; + uint64_t ashift = cvd->vdev_ashift; + int error = 0; + uint64_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(ashift); + + int best_mapping = -1; + uint64_t best_txg = 0; + anyraid_header_t best_header = {0}; + boolean_t checkpoint_rb = spa_importing_checkpoint(spa); + + for (int i = 0; i < VDEV_ANYRAID_MAP_COPIES; i++) { + anyraid_header_t header; + error = vdev_anyraid_open_header(cvd, i, &header); + + if (error) + continue; + + nvlist_t *hnvl = header.ah_nvl; + uint16_t version; + if ((error = nvlist_lookup_uint16(hnvl, + VDEV_ANYRAID_HEADER_VERSION, &version)) != 0) { + free_header(&header, header_size); + zfs_dbgmsg("Anyraid header %d on vdev %s: missing " + "version", i, cvd->vdev_path); + continue; + } + if (version != 0) { + free_header(&header, header_size); + error = SET_ERROR(ENOTSUP); + zfs_dbgmsg("Anyraid header %d on vdev %s: invalid " + "version", i, cvd->vdev_path); + continue; + } + + uint64_t pool_guid = 0; + if (nvlist_lookup_uint64(hnvl, VDEV_ANYRAID_HEADER_GUID, + &pool_guid) != 0 || pool_guid != spa_guid(spa)) { + free_header(&header, header_size); + error = SET_ERROR(EINVAL); + zfs_dbgmsg("Anyraid header %d on vdev %s: guid " + "mismatch: %llu %llu", i, cvd->vdev_path, + (u_longlong_t)pool_guid, + (u_longlong_t)spa_guid(spa)); + continue; + } + + uint64_t written_txg; + if (nvlist_lookup_uint64(hnvl, VDEV_ANYRAID_HEADER_TXG, + &written_txg) != 0) { + free_header(&header, header_size); + error = SET_ERROR(EINVAL); + zfs_dbgmsg("Anyraid header %d on vdev %s: no txg", + i, cvd->vdev_path); + continue; + } + /* + * If we're reopening, the current txg hasn't been synced out + * yet; look for one txg earlier. + */ + uint64_t min_txg = spa_current_txg(spa) - + (cvd->vdev_parent->vdev_reopening ? 
1 : 0); + if ((written_txg < min_txg && !checkpoint_rb) || + written_txg > spa_load_max_txg(spa)) { + free_header(&header, header_size); + error = SET_ERROR(EINVAL); + zfs_dbgmsg("Anyraid header %d on vdev %s: txg %llu out " + "of bounds (%llu, %llu)", i, cvd->vdev_path, + (u_longlong_t)written_txg, + (u_longlong_t)min_txg, + (u_longlong_t)spa_load_max_txg(spa)); + continue; + } + if (written_txg > best_txg) { + best_txg = written_txg; + best_mapping = i; + if (best_header.ah_nvl) + free_header(&best_header, header_size); + + best_header = header; + } else { + free_header(&header, header_size); + } + } + + if (best_txg != 0) { + *out_txg = best_txg; + *out_mapping = best_mapping; + *out_header = best_header; + return (0); + } + ASSERT(error); + return (error); +} + +static int +anyraid_open_existing(vdev_t *vd, uint64_t child, uint16_t **child_capacities) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + vdev_t *cvd = vd->vdev_child[child]; + uint64_t ashift = cvd->vdev_ashift; + spa_t *spa = vd->vdev_spa; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + uint64_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(ashift); + boolean_t checkpoint_rb = spa_importing_checkpoint(spa); + + anyraid_header_t header; + int mapping; + uint64_t txg; + int error = vdev_anyraid_pick_best_mapping(cvd, &txg, &header, + &mapping); + if (error) + return (error); + + uint8_t disk_id; + if (nvlist_lookup_uint8(header.ah_nvl, VDEV_ANYRAID_HEADER_DISK, + &disk_id) != 0) { + zfs_dbgmsg("Error opening anyraid vdev %llu: No disk ID", + (u_longlong_t)vd->vdev_id); + free_header(&header, header_size); + return (SET_ERROR(EINVAL)); + } + + uint64_t tile_size; + if (nvlist_lookup_uint64(header.ah_nvl, VDEV_ANYRAID_HEADER_TILE_SIZE, + &tile_size) != 0) { + zfs_dbgmsg("Error opening anyraid vdev %llu: No tile size", + (u_longlong_t)vd->vdev_id); + free_header(&header, header_size); + return (SET_ERROR(EINVAL)); + } + + uint32_t map_length; + if (nvlist_lookup_uint32(header.ah_nvl, VDEV_ANYRAID_HEADER_LENGTH, + &map_length) != 0) { + zfs_dbgmsg("Error opening anyraid vdev %llu: No map length", + (u_longlong_t)vd->vdev_id); + free_header(&header, header_size); + return (SET_ERROR(EINVAL)); + } + + uint16_t *caps = NULL; + uint_t count; + if (nvlist_lookup_uint16_array(header.ah_nvl, + VDEV_ANYRAID_HEADER_DISK_SIZES, &caps, &count) != 0) { + zfs_dbgmsg("Error opening anyraid vdev %llu: No child sizes", + (u_longlong_t)vd->vdev_id); + free_header(&header, header_size); + return (SET_ERROR(EINVAL)); + } + if (count != vd->vdev_children) { + zfs_dbgmsg("Error opening anyraid vdev %llu: Incorrect child " + "count %u vs %u", (u_longlong_t)vd->vdev_id, count, + (uint_t)vd->vdev_children); + free_header(&header, header_size); + return (SET_ERROR(EINVAL)); + } + + *child_capacities = kmem_alloc(sizeof (*caps) * count, KM_SLEEP); + memcpy(*child_capacities, caps, sizeof (*caps) * count); + if (vd->vdev_reopening) { + free_header(&header, header_size); + return (0); + } + + var->vd_checkpoint_tile = UINT32_MAX; + (void) nvlist_lookup_uint32(header.ah_nvl, + VDEV_ANYRAID_HEADER_CHECKPOINT, &var->vd_checkpoint_tile); + + /* + * Because the tile map is 64 MiB and the maximum IO size is 16MiB, + * we may need to issue up to 4 reads to read in the whole thing. + * Similarly, when processing the mapping, we need to iterate across + * the 4 separate buffers. 
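+	 *
+	 * (For example, a map_length of 40 MiB becomes three 16 MiB reads at
+	 * offsets 0, 16 MiB, and 32 MiB past the map header, each verified
+	 * against its own embedded checksum from the header.)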
+	 */
+	zio_t *rio = zio_root(spa, NULL, NULL, flags);
+	abd_t *map_abds[VDEV_ANYRAID_MAP_COPIES] = {0};
+	uint64_t header_offset = vdev_anyraid_header_offset(cvd, mapping);
+	uint64_t map_offset = header_offset + header_size;
+	int i;
+	for (i = 0; i <= (map_length / SPA_MAXBLOCKSIZE); i++) {
+		zio_eck_t *cksum = (zio_eck_t *)
+		    &header.ah_buf[VDEV_ANYRAID_NVL_BYTES(ashift) +
+		    i * sizeof (*cksum)];
+		zio_t *nio = zio_null(rio, spa, cvd, NULL, &map_abds[i], flags);
+		child_read(nio, cvd, map_offset + i * SPA_MAXBLOCKSIZE,
+		    SPA_MAXBLOCKSIZE, ZIO_CHECKSUM_ANYRAID_MAP, cksum, flags);
+		zio_nowait(nio);
+	}
+	i--;
+
+	if ((error = zio_wait(rio))) {
+		for (; i >= 0; i--)
+			abd_free(map_abds[i]);
+		free_header(&header, header_size);
+		zfs_dbgmsg("Error opening anyraid vdev %llu: map read error %d",
+		    (u_longlong_t)vd->vdev_id, error);
+		return (error);
+	}
+	free_header(&header, header_size);
+
+	uint32_t map = -1, cur_tile = 0;
+	/*
+	 * For now, all entries are the size of a uint32_t. If that
+	 * ever changes, the logic here needs to be altered to work for
+	 * adaptive sizes, including entries split across 16MiB boundaries.
+	 */
+	uint32_t size = sizeof (anyraid_map_loc_entry_t);
+	uint8_t *map_buf = NULL;
+	uint8_t pat_cnt = 0;
+	anyraid_tile_t *at = NULL;
+	for (uint32_t off = 0; off < map_length; off += size) {
+		if (checkpoint_rb && cur_tile > var->vd_checkpoint_tile &&
+		    pat_cnt == 0)
+			break;
+
+		int next_map = off / SPA_MAXBLOCKSIZE;
+		if (map != next_map) {
+			// switch maps
+			if (map != -1) {
+				abd_return_buf(map_abds[map], map_buf,
+				    SPA_MAXBLOCKSIZE);
+			}
+			map_buf = abd_borrow_buf(map_abds[next_map],
+			    SPA_MAXBLOCKSIZE);
+			map = next_map;
+
+#ifdef _ZFS_BIG_ENDIAN
+			uint32_t length = map_length -
+			    next_map * SPA_MAXBLOCKSIZE;
+			byteswap_uint32_array(map_buf, MIN(length,
+			    SPA_MAXBLOCKSIZE));
+#endif
+		}
+		anyraid_map_entry_t *entry =
+		    (anyraid_map_entry_t *)(map_buf + (off % SPA_MAXBLOCKSIZE));
+		uint8_t type = ame_get_type(entry);
+		switch (type) {
+		case AMET_SKIP: {
+			anyraid_map_skip_entry_t *amse =
+			    &entry->ame_u.ame_amse;
+			ASSERT0(pat_cnt);
+			cur_tile += amse_get_skip_count(amse);
+			break;
+		}
+		case AMET_LOC: {
+			anyraid_map_loc_entry_t *amle =
+			    &entry->ame_u.ame_amle;
+			create_tile_entry(var, amle, &pat_cnt, &at,
+			    &cur_tile);
+			break;
+		}
+		default:
+			PANIC("Invalid entry type %d", type);
+		}
+	}
+	if (map_buf)
+		abd_return_buf(map_abds[map], map_buf, SPA_MAXBLOCKSIZE);
+
+	var->vd_tile_size = tile_size;
+
+	for (; i >= 0; i--)
+		abd_free(map_abds[i]);
+
+	/*
+	 * Now that we have the tile map read in, we have to reopen the
+	 * children to properly set and handle the min_asize.
+	 */
+	for (i = 0; i < vd->vdev_children; i++) {
+		vdev_t *cvd = vd->vdev_child[i];
+		vdev_reopen(cvd);
+	}
+
+	int lasterror = 0;
+	int numerrors = 0;
+	for (int c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (cvd->vdev_open_error != 0) {
+			lasterror = cvd->vdev_open_error;
+			numerrors++;
+			continue;
+		}
+	}
+
+	if (numerrors > var->vd_nparity) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+		return (lasterror);
+	}
+
+	return (0);
+}
+
+/*
+ * When creating a new anyraid vdev, this function calculates the tile size
+ * to use. We take (by default) 1/64th of the size of the smallest disk or 16
+ * GiB, whichever is larger.
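+ * For example (illustrative numbers): a 10 TiB smallest disk gives a
+ * candidate size of 10 TiB / 64 = 160 GiB, which is above the 16 GiB floor
+ * and is then rounded up to a 256 GiB power-of-two tile.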
+ */ +static int +anyraid_calculate_size(vdev_t *vd) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + + uint64_t smallest_disk_size = UINT64_MAX; + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + smallest_disk_size = MIN(smallest_disk_size, cvd->vdev_asize); + } + + uint64_t disk_shift = anyraid_disk_shift; + uint64_t min_size = zfs_anyraid_min_tile_size; + if (smallest_disk_size < 1 << disk_shift || + smallest_disk_size < min_size) { + return (SET_ERROR(ENOLCK)); + } + + + ASSERT3U(smallest_disk_size, !=, UINT64_MAX); + uint64_t tile_size = smallest_disk_size >> disk_shift; + tile_size = MAX(tile_size, min_size); + var->vd_tile_size = 1ULL << (highbit64(tile_size - 1)); + + /* + * Later, we're going to cap the metaslab size at the tile + * size, so we need a tile to hold at least enough to store a + * max-size block, or we'll assert in that code. + */ + if (var->vd_tile_size < SPA_MAXBLOCKSIZE) + return (SET_ERROR(ENOSPC)); + return (0); +} + +struct tile_count { + avl_node_t node; + int disk; + int remaining; +}; + +static int +rc_compar(const void *a, const void *b) +{ + const struct tile_count *ra = a; + const struct tile_count *rb = b; + + int cmp = TREE_CMP(rb->remaining, ra->remaining); + if (cmp != 0) + return (cmp); + return (TREE_CMP(rb->disk, ra->disk)); +} + +/* + * I think the only way to calculate the asize for anyraid devices is to + * actually run the allocation algorithm and see what we end up with. It's a + * variant of the bin-packing problem, which is NP-hard. Thankfully + * a first-fit descending algorithm seems to give optimal results for this + * variant. + */ +static uint64_t +calculate_asize(vdev_t *vd, uint64_t *num_tiles) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + + if (var->vd_nparity == 0) { + uint64_t count = 0; + for (int c = 0; c < vd->vdev_children; c++) { + count += num_tiles[c]; + } + return (count * var->vd_tile_size); + } + + /* + * Sort the disks by the number of additional tiles they can store. + */ + avl_tree_t t; + avl_create(&t, rc_compar, sizeof (struct tile_count), + offsetof(struct tile_count, node)); + for (int c = 0; c < vd->vdev_children; c++) { + if (num_tiles[c] == 0) { + ASSERT(vd->vdev_child[c]->vdev_open_error); + continue; + } + struct tile_count *rc = kmem_alloc(sizeof (*rc), KM_SLEEP); + rc->disk = c; + rc->remaining = num_tiles[c] - + var->vd_children[c]->van_next_offset; + avl_add(&t, rc); + } + + uint32_t map_width = var->vd_nparity + 1; + uint64_t count = avl_numnodes(&var->vd_tile_map); + struct tile_count **cur = kmem_alloc(sizeof (*cur) * map_width, + KM_SLEEP); + for (;;) { + /* Grab the nparity + 1 children with the most free capacity */ + for (int c = 0; c < map_width; c++) { + struct tile_count *rc = avl_first(&t); + ASSERT(rc); + cur[c] = rc; + avl_remove(&t, rc); + } + struct tile_count *rc = cur[map_width - 1]; + struct tile_count *next = avl_first(&t); + uint64_t next_rem = next == NULL ? 0 : next->remaining; + ASSERT3U(next_rem, <=, rc->remaining); + /* If one of the top N + 1 has no capacity left, we're done */ + if (rc->remaining == 0) + break; + + /* + * This is a performance optimization; if the child with the + * lowest free capacity of the ones we've selected has N more + * capacity than the next child, the next N iterations would + * all select the same children. So to save time, we add N + * tiles right now and reduce our iteration count. 
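+		 * For example, with parity 1 and remaining capacities
+		 * (10, 8, 5), we select the disks with 10 and 8 free tiles;
+		 * the next-best disk has 5, so the next 8 - 5 = 3 iterations
+		 * would select the same pair. We charge all 3 tiles at once,
+		 * leaving (7, 5, 5).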
+		 */
+		uint64_t this_iter = MAX(1, rc->remaining - next_rem);
+		count += this_iter;
+
+		/* Re-add the selected children with their reduced capacity */
+		for (int c = 0; c < map_width; c++) {
+			ASSERT3U(cur[c]->remaining, >=, this_iter);
+			cur[c]->remaining -= this_iter;
+			avl_add(&t, cur[c]);
+		}
+	}
+	for (int c = 0; c < map_width; c++)
+		kmem_free(cur[c], sizeof (**cur));
+	kmem_free(cur, sizeof (*cur) * map_width);
+	void *cookie = NULL;
+	struct tile_count *node;
+
+	while ((node = avl_destroy_nodes(&t, &cookie)) != NULL)
+		kmem_free(node, sizeof (*node));
+	avl_destroy(&t);
+	return (count * var->vd_tile_size);
+}
+
+static int
+vdev_anyraid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	vdev_anyraid_t *var = vd->vdev_tsd;
+	int lasterror = 0;
+	int numerrors = 0;
+
+	vdev_open_children(vd);
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (cvd->vdev_open_error != 0) {
+			lasterror = cvd->vdev_open_error;
+			numerrors++;
+			continue;
+		}
+	}
+
+	/*
+	 * If we have more faulted disks than parity, we can't open the device.
+	 */
+	if (numerrors > var->vd_nparity) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+		return (lasterror);
+	}
+
+	uint16_t *child_capacities = NULL;
+	if (vd->vdev_reopening) {
+		child_capacities = kmem_alloc(sizeof (*child_capacities) *
+		    vd->vdev_children, KM_SLEEP);
+		for (uint64_t c = 0; c < vd->vdev_children; c++) {
+			child_capacities[c] = var->vd_children[c]->van_capacity;
+		}
+	} else if (spa_load_state(vd->vdev_spa) != SPA_LOAD_CREATE &&
+	    spa_load_state(vd->vdev_spa) != SPA_LOAD_ERROR &&
+	    spa_load_state(vd->vdev_spa) != SPA_LOAD_NONE) {
+		for (uint64_t c = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+			if (cvd->vdev_open_error != 0)
+				continue;
+			if ((lasterror = anyraid_open_existing(vd, c,
+			    &child_capacities)) == 0)
+				break;
+		}
+		if (lasterror)
+			return (lasterror);
+	} else if ((lasterror = anyraid_calculate_size(vd))) {
+		return (lasterror);
+	}
+
+	uint64_t max_size = VDEV_ANYRAID_MAX_TPD * var->vd_tile_size;
+
+	/*
+	 * Calculate the number of tiles each child could fit, then use that
+	 * to calculate the asize and min_asize.
+	 */
+	uint64_t *num_tiles = kmem_zalloc(vd->vdev_children *
+	    sizeof (*num_tiles), KM_SLEEP);
+	for (int c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		uint64_t casize;
+		if (cvd->vdev_open_error == 0) {
+			vdev_set_min_asize(cvd);
+			casize = MIN(max_size, cvd->vdev_asize -
+			    VDEV_ANYRAID_TOTAL_MAP_SIZE(cvd->vdev_ashift));
+		} else {
+			ASSERT(child_capacities);
+			casize = (child_capacities[c] + 1) * var->vd_tile_size;
+		}
+
+		num_tiles[c] = casize / var->vd_tile_size;
+		avl_remove(&var->vd_children_tree, var->vd_children[c]);
+		/*
+		 * We store the capacity minus 1, since a vdev can never have
+		 * 0 tiles but can have 2^16 (which would overflow a
+		 * uint16_t).
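+		 * For example, a child that can hold 65536 tiles is stored
+		 * as van_capacity = 65535, the largest value a uint16_t can
+		 * represent.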
+ */ + var->vd_children[c]->van_capacity = num_tiles[c] - 1; + avl_add(&var->vd_children_tree, var->vd_children[c]); + } + *asize = calculate_asize(vd, num_tiles); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + uint64_t cmasize; + if (cvd->vdev_open_error == 0) { + cmasize = MIN(max_size, cvd->vdev_max_asize - + VDEV_ANYRAID_TOTAL_MAP_SIZE(cvd->vdev_ashift)); + } else { + cmasize = (child_capacities[c] + 1) * var->vd_tile_size; + } + + num_tiles[c] = cmasize / var->vd_tile_size; + } + *max_asize = calculate_asize(vd, num_tiles); + + if (child_capacities) { + kmem_free(child_capacities, sizeof (*child_capacities) * + vd->vdev_children); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) + continue; + + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); + } + return (0); +} + +/* + * We cap the metaslab size at the tile size. This prevents us from having to + * split IOs across multiple tiles, which would be complex extra logic for + * little gain. + */ +static void +vdev_anyraid_metaslab_size(vdev_t *vd, uint64_t *shiftp) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + *shiftp = MIN(*shiftp, highbit64(var->vd_tile_size) - 1); +} + +static void +vdev_anyraid_close(vdev_t *vd) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] != NULL) + vdev_close(vd->vdev_child[c]); + } + if (vd->vdev_reopening) + return; + anyraid_tile_t *tile = NULL; + void *cookie = NULL; + while ((tile = avl_destroy_nodes(&var->vd_tile_map, &cookie))) { + if (var->vd_nparity != 0) { + anyraid_tile_node_t *atn = NULL; + while ((atn = list_remove_head(&tile->at_list))) { + kmem_free(atn, sizeof (*atn)); + } + list_destroy(&tile->at_list); + } + kmem_free(tile, sizeof (*tile)); + } +} + +/* + * I/O related functions. + */ + +/* + * Configure the mirror_map and then hand the write off to the normal mirror + * logic. 
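+ *
+ * Each mirror child is pointed at the same tile-relative offset on a
+ * different disk: its physical offset is the anyraid data start, plus
+ * atn_offset whole tiles, plus the zio's offset within the logical tile.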
+ */ +static void +vdev_anyraid_mirror_start(zio_t *zio, anyraid_tile_t *tile) +{ + vdev_t *vd = zio->io_vd; + vdev_anyraid_t *var = vd->vdev_tsd; + mirror_map_t *mm = vdev_mirror_map_alloc(var->vd_nparity + 1, B_FALSE, + B_FALSE); + uint64_t rsize = var->vd_tile_size; + + anyraid_tile_node_t *atn = list_head(&tile->at_list); + for (int c = 0; c < mm->mm_children; c++) { + ASSERT(atn); + mirror_child_t *mc = &mm->mm_child[c]; + mc->mc_vd = vd->vdev_child[atn->atn_disk]; + mc->mc_offset = VDEV_ANYRAID_START_OFFSET(vd->vdev_ashift) + + atn->atn_offset * rsize + zio->io_offset % rsize; + ASSERT3U(mc->mc_offset, <, mc->mc_vd->vdev_psize - + VDEV_LABEL_END_SIZE); + mm->mm_rebuilding = mc->mc_rebuilding = B_FALSE; + atn = list_next(&tile->at_list, atn); + } + ASSERT(atn == NULL); + + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; + + vdev_mirror_io_start_impl(zio, mm); +} + +typedef struct anyraid_map { + abd_t *am_abd; +} anyraid_map_t; + +static void +vdev_anyraid_map_free_vsd(zio_t *zio) +{ + anyraid_map_t *mm = zio->io_vsd; + abd_free(mm->am_abd); + mm->am_abd = NULL; + kmem_free(mm, sizeof (*mm)); +} + +const zio_vsd_ops_t vdev_anyraid_vsd_ops = { + .vsd_free = vdev_anyraid_map_free_vsd, +}; + +static void +vdev_anyraid_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); +} + +static void +vdev_anyraid_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_anyraid_t *var = vd->vdev_tsd; + uint64_t rsize = var->vd_tile_size; + + uint64_t start_tile_id = zio->io_offset / rsize; + anyraid_tile_t search; + search.at_tile_id = start_tile_id; + avl_index_t where; + rw_enter(&var->vd_lock, RW_READER); + anyraid_tile_t *tile = avl_find(&var->vd_tile_map, &search, + &where); + + /* + * If we're doing an I/O somewhere that hasn't been allocated yet, we + * may need to allocate a new tile. Upgrade to a write lock so we can + * safely modify the data structure, and then check if someone else + * beat us to it. 
+ */ + if (tile == NULL) { + rw_exit(&var->vd_lock); + rw_enter(&var->vd_lock, RW_WRITER); + tile = avl_find(&var->vd_tile_map, &search, &where); + } + if (tile == NULL) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + zfs_dbgmsg("Allocating tile %llu for zio %px", + (u_longlong_t)start_tile_id, zio); + tile = kmem_alloc(sizeof (*tile), KM_SLEEP); + tile->at_tile_id = start_tile_id; + list_create(&tile->at_list, sizeof (anyraid_tile_node_t), + offsetof(anyraid_tile_node_t, atn_node)); + + uint_t width = var->vd_nparity + 1; + vdev_anyraid_node_t **vans = kmem_alloc(sizeof (*vans) * width, + KM_SLEEP); + for (int i = 0; i < width; i++) { + vans[i] = avl_first(&var->vd_children_tree); + avl_remove(&var->vd_children_tree, vans[i]); + + anyraid_tile_node_t *atn = + kmem_alloc(sizeof (*atn), KM_SLEEP); + atn->atn_disk = vans[i]->van_id; + atn->atn_offset = + vans[i]->van_next_offset++; + list_insert_tail(&tile->at_list, atn); + } + for (int i = 0; i < width; i++) + avl_add(&var->vd_children_tree, vans[i]); + + kmem_free(vans, sizeof (*vans) * width); + avl_insert(&var->vd_tile_map, tile, where); + } + rw_exit(&var->vd_lock); + + ASSERT3U(zio->io_offset % rsize + zio->io_size, <=, + var->vd_tile_size); + + if (var->vd_nparity > 0) { + vdev_anyraid_mirror_start(zio, tile); + zio_execute(zio); + return; + } + + anyraid_tile_node_t *atn = list_head(&tile->at_list); + vdev_t *cvd = vd->vdev_child[atn->atn_disk]; + uint64_t child_offset = atn->atn_offset * rsize + + zio->io_offset % rsize; + child_offset += VDEV_ANYRAID_START_OFFSET(vd->vdev_ashift); + + anyraid_map_t *mm = kmem_alloc(sizeof (*mm), KM_SLEEP); + mm->am_abd = abd_get_offset(zio->io_abd, 0); + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_anyraid_vsd_ops; + + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, child_offset, + mm->am_abd, zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_anyraid_child_done, zio); + zio_nowait(cio); + + zio_execute(zio); +} + +static void +vdev_anyraid_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_anyraid_t *var = vd->vdev_tsd; + + if (var->vd_nparity > 0) + vdev_mirror_io_done(zio); +} + +static void +vdev_anyraid_state_change(vdev_t *vd, int faulted, int degraded) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + if (faulted > var->vd_nparity) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + } else if (degraded + faulted != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + } else { + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } +} + +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. The function + * assumes that at least one DTL is dirty which implies that full stripe + * width blocks must be resilvered. 
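+ * + * For example (editor's illustration): with a 64 GiB tile size, a DVA at + * offset 0x1800000000 falls in tile 1, so we return B_TRUE iff any disk + * holding one of tile 1's copies has a non-empty DTL_PARTIAL.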
+ */ +static boolean_t +vdev_anyraid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + (void) psize; + vdev_anyraid_t *var = vd->vdev_tsd; + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + uint64_t start_tile_id = DVA_GET_OFFSET(dva) / var->vd_tile_size; + anyraid_tile_t search; + search.at_tile_id = start_tile_id; + avl_index_t where; + rw_enter(&var->vd_lock, RW_READER); + anyraid_tile_t *tile = avl_find(&var->vd_tile_map, &search, + &where); + rw_exit(&var->vd_lock); + ASSERT(tile); + + for (anyraid_tile_node_t *atn = list_head(&tile->at_list); + atn != NULL; atn = list_next(&tile->at_list, atn)) { + vdev_t *cvd = vd->vdev_child[atn->atn_disk]; + + if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Right now, we don't translate anything beyond the end of the allocated + * ranges for the target leaf vdev. This means that trim and initialize won't + * affect those areas on anyraid devices. Given the target use case, this is + * not a significant concern, but a rework of the xlate logic could enable this + * in the future. + */ +static void +vdev_anyraid_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) +{ + vdev_t *anyraidvd = cvd->vdev_parent; + ASSERT3P(anyraidvd->vdev_ops, ==, &vdev_anyraid_ops); + vdev_anyraid_t *var = anyraidvd->vdev_tsd; + uint64_t rsize = var->vd_tile_size; + + uint64_t start_tile_id = logical_rs->rs_start / rsize; + ASSERT3U(start_tile_id, ==, (logical_rs->rs_end - 1) / rsize); + anyraid_tile_t search; + search.at_tile_id = start_tile_id; + avl_index_t where; + rw_enter(&var->vd_lock, RW_READER); + anyraid_tile_t *tile = avl_find(&var->vd_tile_map, &search, + &where); + rw_exit(&var->vd_lock); + // This tile doesn't exist yet + if (tile == NULL) { + physical_rs->rs_start = physical_rs->rs_end = 0; + return; + } + anyraid_tile_node_t *atn = list_head(&tile->at_list); + for (; atn != NULL; atn = list_next(&tile->at_list, atn)) + if (anyraidvd->vdev_child[atn->atn_disk] == cvd) + break; + // The tile exists, but isn't stored on this child + if (atn == NULL) { + physical_rs->rs_start = physical_rs->rs_end = 0; + return; + } + + uint64_t child_offset = atn->atn_offset * rsize + + logical_rs->rs_start % rsize; + child_offset += VDEV_ANYRAID_START_OFFSET(anyraidvd->vdev_ashift); + uint64_t size = logical_rs->rs_end - logical_rs->rs_start; + + physical_rs->rs_start = child_offset; + physical_rs->rs_end = child_offset + size; + remain_rs->rs_start = 0; + remain_rs->rs_end = 0; +} + +static uint64_t +vdev_anyraid_nparity(vdev_t *vd) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + return (var->vd_nparity); +} + +static uint64_t +vdev_anyraid_ndisks(vdev_t *vd) +{ + return (vd->vdev_children); +} + +/* + * Functions related to syncing out the tile map each TXG. 
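+ * + * On disk, each map copy is a header (an XDR-packed nvlist followed by one + * zio_eck_t checksum per SPA_MAXBLOCKSIZE map block) and then a stream of + * packed entries. As an illustrative sketch (editor's note; field widths + * elided), a map for tiles {0, 1, 5} on a two-disk anymirror1 encodes + * roughly: + * + *   loc(disk 0, off 0) loc(disk 1, off 0)   <- tile 0 + *   loc(disk 0, off 1) loc(disk 1, off 1)   <- tile 1 + *   skip(3)                                 <- tiles 2-4 unallocated + *   loc(disk 0, off 2) loc(disk 1, off 2)   <- tile 5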
+ */ +static boolean_t +map_write_loc_entry(anyraid_tile_node_t *atn, void *buf, uint32_t *offset) +{ + anyraid_map_loc_entry_t *entry = (void *)((char *)buf + *offset); + amle_set_type(entry); + amle_set_disk(entry, atn->atn_disk); + amle_set_offset(entry, atn->atn_offset); + *offset += sizeof (*entry); + return (*offset == SPA_MAXBLOCKSIZE); +} + +static boolean_t +map_write_skip_entry(uint32_t tile, void *buf, uint32_t *offset, + uint32_t prev_id) +{ + anyraid_map_skip_entry_t *entry = (void *)((char *)buf + *offset); + amse_set_type(entry); + /* + * A prev_id of UINT32_MAX (nothing written yet) wraps around to a + * skip count of exactly tile, covering tiles [0, tile). + */ + amse_set_skip_count(entry, tile - prev_id - 1); + *offset += sizeof (*entry); + return (*offset == SPA_MAXBLOCKSIZE); +} + +static void +anyraid_map_write_done(zio_t *zio) +{ + abd_free(zio->io_abd); +} + +static void +map_write_issue(zio_t *zio, vdev_t *vd, uint64_t base_offset, + uint8_t idx, uint32_t length, abd_t *abd, zio_eck_t *cksum_out, + int flags) +{ +#ifdef _ZFS_BIG_ENDIAN + /* Byteswap in place; return the borrowed buffer with its changes. */ + void *buf = abd_borrow_buf(abd, SPA_MAXBLOCKSIZE); + byteswap_uint32_array(buf, length); + abd_return_buf_copy(abd, buf, SPA_MAXBLOCKSIZE); +#else + (void) length; +#endif + + zio_nowait(zio_write_phys(zio, vd, base_offset + + idx * VDEV_ANYRAID_MAP_SIZE + + VDEV_ANYRAID_MAP_HEADER_SIZE(vd->vdev_ashift), SPA_MAXBLOCKSIZE, + abd, ZIO_CHECKSUM_ANYRAID_MAP, anyraid_map_write_done, cksum_out, + ZIO_PRIORITY_SYNC_WRITE, flags, B_FALSE)); +} + +static void +vdev_anyraid_write_map_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (zio->io_error == 0 && good_writes != NULL) + atomic_inc_64(good_writes); +} + +void +vdev_anyraid_write_map_sync(vdev_t *vd, zio_t *pio, uint64_t txg, + uint64_t *good_writes, int flags, vdev_config_sync_status_t status) +{ + vdev_t *anyraidvd = vd->vdev_parent; + ASSERT3P(anyraidvd->vdev_ops, ==, &vdev_anyraid_ops); + spa_t *spa = vd->vdev_spa; + vdev_anyraid_t *var = anyraidvd->vdev_tsd; + uint32_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(vd->vdev_ashift); + uint32_t nvl_bytes = VDEV_ANYRAID_NVL_BYTES(vd->vdev_ashift); + uint8_t update_target = txg % VDEV_ANYRAID_MAP_COPIES; + uint64_t base_offset = vdev_anyraid_header_offset(vd, update_target); + + abd_t *header_abd = + abd_alloc_linear(header_size, B_TRUE); + abd_zero(header_abd, header_size); + void *header_buf = abd_borrow_buf(header_abd, header_size); + zio_eck_t *cksums = (zio_eck_t *)&((char *)header_buf)[nvl_bytes]; + + abd_t *map_abd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_TRUE); + uint8_t written = 0; + void *buf = abd_borrow_buf(map_abd, SPA_MAXBLOCKSIZE); + + rw_enter(&var->vd_lock, RW_READER); + anyraid_tile_t *cur = avl_first(&var->vd_tile_map); + anyraid_tile_node_t *curn = cur != NULL ? + list_head(&cur->at_list) : NULL; + uint32_t buf_offset = 0, prev_id = UINT32_MAX; + zio_t *zio = zio_root(spa, NULL, NULL, flags); + /* Write out each sub-tile in turn */ + while (cur) { + if (status == VDEV_CONFIG_REWINDING_CHECKPOINT && + cur->at_tile_id > var->vd_checkpoint_tile) + break; + + anyraid_tile_t *next = AVL_NEXT(&var->vd_tile_map, cur); + IMPLY(prev_id != UINT32_MAX, cur->at_tile_id >= prev_id); + /* + * Determine if we need to write a skip entry before the + * current one.
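+ * + * Two cases require one: the very first entry when tile 0 was never + * allocated (prev_id is still UINT32_MAX), and any later gap in the tile + * ids. For example (editor's note), after writing tile 1 (prev_id == 1), + * tile 5 first emits skip(5 - 1 - 1 = 3) and only then its location entries.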
+ */ + boolean_t skip = + (prev_id == UINT32_MAX && cur->at_tile_id != 0) || + (prev_id != UINT32_MAX && cur->at_tile_id > prev_id + 1); + if ((skip && map_write_skip_entry(cur->at_tile_id, buf, + &buf_offset, prev_id)) || + (!skip && map_write_loc_entry(curn, buf, &buf_offset))) { + // Let the final write handle it + if (next == NULL) + break; + abd_return_buf_copy(map_abd, buf, SPA_MAXBLOCKSIZE); + map_write_issue(zio, vd, base_offset, written, + buf_offset, map_abd, &cksums[written], flags); + + map_abd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_TRUE); + written++; + ASSERT3U(written, <, + VDEV_ANYRAID_MAP_SIZE / SPA_MAXBLOCKSIZE); + buf = abd_borrow_buf(map_abd, SPA_MAXBLOCKSIZE); + buf_offset = 0; + } + prev_id = cur->at_tile_id; + /* + * Advance the current sub-tile; if it moves us past the end + * of the current list of sub-tiles, start the next tile. + */ + if (!skip) { + curn = list_next(&cur->at_list, curn); + if (curn == NULL) { + cur = next; + curn = cur != NULL ? + list_head(&cur->at_list) : NULL; + } + } + } + + if (status == VDEV_CONFIG_NO_CHECKPOINT || + status == VDEV_CONFIG_REWINDING_CHECKPOINT) { + var->vd_checkpoint_tile = UINT32_MAX; + } else if (status == VDEV_CONFIG_CREATING_CHECKPOINT) { + anyraid_tile_t *at = avl_last(&var->vd_tile_map); + ASSERT(at); + var->vd_checkpoint_tile = at->at_tile_id; + } + rw_exit(&var->vd_lock); + + abd_return_buf_copy(map_abd, buf, SPA_MAXBLOCKSIZE); + map_write_issue(zio, vd, base_offset, written, buf_offset, map_abd, + &cksums[written], flags); + + if (zio_wait(zio)) + return; + + // Populate the header + uint16_t *sizes = kmem_zalloc(sizeof (*sizes) * + anyraidvd->vdev_children, KM_SLEEP); + uint64_t disk_id = 0; + for (uint64_t i = 0; i < anyraidvd->vdev_children; i++) { + if (anyraidvd->vdev_child[i] == vd) + disk_id = i; + sizes[i] = var->vd_children[i]->van_capacity; + } + ASSERT3U(disk_id, <, anyraidvd->vdev_children); + nvlist_t *header = fnvlist_alloc(); + fnvlist_add_uint16(header, VDEV_ANYRAID_HEADER_VERSION, 0); + fnvlist_add_uint8(header, VDEV_ANYRAID_HEADER_DISK, disk_id); + fnvlist_add_uint64(header, VDEV_ANYRAID_HEADER_TXG, txg); + fnvlist_add_uint64(header, VDEV_ANYRAID_HEADER_GUID, spa_guid(spa)); + fnvlist_add_uint64(header, VDEV_ANYRAID_HEADER_TILE_SIZE, + var->vd_tile_size); + fnvlist_add_uint32(header, VDEV_ANYRAID_HEADER_LENGTH, + written * SPA_MAXBLOCKSIZE + buf_offset); + fnvlist_add_uint16_array(header, VDEV_ANYRAID_HEADER_DISK_SIZES, sizes, + anyraidvd->vdev_children); + + if (var->vd_checkpoint_tile != UINT32_MAX) { + fnvlist_add_uint32(header, VDEV_ANYRAID_HEADER_CHECKPOINT, + var->vd_checkpoint_tile); + } + size_t packed_size; + char *packed = NULL; + VERIFY0(nvlist_pack(header, &packed, &packed_size, NV_ENCODE_XDR, + KM_SLEEP)); + ASSERT3U(packed_size, <, nvl_bytes); + memcpy(header_buf, packed, packed_size); + fnvlist_pack_free(packed, packed_size); + abd_return_buf_copy(header_abd, header_buf, header_size); + + // Write out the header + zio_t *header_zio = zio_write_phys(pio, vd, base_offset, header_size, + header_abd, ZIO_CHECKSUM_LABEL, vdev_anyraid_write_map_done, + good_writes, ZIO_PRIORITY_SYNC_WRITE, flags, B_FALSE); + zio_nowait(header_zio); + abd_free(header_abd); +} + +static uint64_t +vdev_anyraid_min_attach_size(vdev_t *vd) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_anyraid_ops); + ASSERT3U(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER), !=, 0); + vdev_anyraid_t *var = vd->vdev_tsd; + ASSERT(var->vd_tile_size); + return (VDEV_ANYRAID_TOTAL_MAP_SIZE(vd->vdev_ashift) + + var->vd_tile_size); +} + 
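+/* + * Editor's illustrative sketch (not part of this change, hence compiled + * out): the number of tiles a device of a given allocatable size can + * contribute, mirroring the capacity math used when opening and expanding + * the vdev. + */ +#if 0 +static uint64_t +anyraid_tiles_for_size(uint64_t asize, uint64_t ashift, uint64_t tile_size) +{ + /* Space that is not reserved for the on-disk tile map copies. */ + uint64_t usable = asize - VDEV_ANYRAID_TOTAL_MAP_SIZE(ashift); + /* VDEV_ANYRAID_MAX_TPD caps the tiles any single device may hold. */ + return (MIN(usable / tile_size, VDEV_ANYRAID_MAX_TPD)); +} +#endif +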
+static uint64_t +vdev_anyraid_min_asize(vdev_t *pvd, vdev_t *cvd) +{ + ASSERT3P(pvd->vdev_ops, ==, &vdev_anyraid_ops); + ASSERT3U(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_READER), !=, 0); + vdev_anyraid_t *var = pvd->vdev_tsd; + if (var->vd_tile_size == 0) + return (VDEV_ANYRAID_TOTAL_MAP_SIZE(cvd->vdev_ashift)); + + rw_enter(&var->vd_lock, RW_READER); + uint64_t size = VDEV_ANYRAID_TOTAL_MAP_SIZE(cvd->vdev_ashift) + + (var->vd_children[cvd->vdev_id]->van_capacity + 1) * + var->vd_tile_size; + rw_exit(&var->vd_lock); + return (size); +} + +void +vdev_anyraid_expand(vdev_t *tvd, vdev_t *newvd) +{ + vdev_anyraid_t *var = tvd->vdev_tsd; + uint64_t old_children = tvd->vdev_children - 1; + + ASSERT3U(spa_config_held(tvd->vdev_spa, SCL_ALL, RW_WRITER), ==, + SCL_ALL); + vdev_anyraid_node_t **nc = kmem_alloc(tvd->vdev_children * sizeof (*nc), + KM_SLEEP); + vdev_anyraid_node_t *newchild = kmem_alloc(sizeof (*newchild), + KM_SLEEP); + newchild->van_id = newvd->vdev_id; + newchild->van_next_offset = 0; + uint64_t max_size = VDEV_ANYRAID_MAX_TPD * var->vd_tile_size; + newchild->van_capacity = (MIN(max_size, (newvd->vdev_asize - + VDEV_ANYRAID_TOTAL_MAP_SIZE(newvd->vdev_ashift))) / + var->vd_tile_size) - 1; + rw_enter(&var->vd_lock, RW_WRITER); + memcpy(nc, var->vd_children, old_children * sizeof (*nc)); + kmem_free(var->vd_children, old_children * sizeof (*nc)); + var->vd_children = nc; + var->vd_children[old_children] = newchild; + avl_add(&var->vd_children_tree, newchild); + rw_exit(&var->vd_lock); +} + +boolean_t +vdev_anyraid_mapped(vdev_t *vd, uint64_t offset) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + anyraid_tile_t search; + search.at_tile_id = offset / var->vd_tile_size; + + rw_enter(&var->vd_lock, RW_READER); + anyraid_tile_t *tile = avl_find(&var->vd_tile_map, &search, NULL); + boolean_t result = tile == NULL; + rw_exit(&var->vd_lock); + + return (result); +} + +/* + * Return the maximum asize for a rebuild zio in the provided range + * given the following constraints. 
An anyraid chunk may not: + * + * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or + * - Span anyraid tiles + */ +static uint64_t +vdev_anyraid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, + uint64_t max_segment) +{ + vdev_anyraid_t *var = vd->vdev_tsd; + ASSERT3P(vd->vdev_ops, ==, &vdev_anyraid_ops); + + uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), + SPA_MAXBLOCKSIZE); + + if (start / var->vd_tile_size != + (start + psize) / var->vd_tile_size) { + psize = P2ROUNDUP(start, var->vd_tile_size) - start; + } + + return (MIN(asize, vdev_psize_to_asize(vd, psize))); +} + +vdev_ops_t vdev_anyraid_ops = { + .vdev_op_init = vdev_anyraid_init, + .vdev_op_fini = vdev_anyraid_fini, + .vdev_op_open = vdev_anyraid_open, + .vdev_op_close = vdev_anyraid_close, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, + .vdev_op_min_asize = vdev_anyraid_min_asize, + .vdev_op_min_attach_size = vdev_anyraid_min_attach_size, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_anyraid_io_start, + .vdev_op_io_done = vdev_anyraid_io_done, + .vdev_op_state_change = vdev_anyraid_state_change, + .vdev_op_need_resilver = vdev_anyraid_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_anyraid_xlate, + .vdev_op_rebuild_asize = vdev_anyraid_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_anyraid_config_generate, + .vdev_op_nparity = vdev_anyraid_nparity, + .vdev_op_ndisks = vdev_anyraid_ndisks, + .vdev_op_metaslab_size = vdev_anyraid_metaslab_size, + .vdev_op_type = VDEV_TYPE_ANYRAID, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ +}; + + +ZFS_MODULE_PARAM(zfs_anyraid, zfs_anyraid_, min_tile_size, U64, ZMOD_RW, + "Minimum tile size for anyraid"); \ No newline at end of file diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 8588cfee3f7d..857d76413166 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1164,14 +1164,22 @@ vdev_draid_get_astart(vdev_t *vd, const uint64_t start) * 1 / (children - nspares) of its asize.
*/ static uint64_t -vdev_draid_min_asize(vdev_t *vd) +vdev_draid_min_asize(vdev_t *pvd, vdev_t *cvd) { - vdev_draid_config_t *vdc = vd->vdev_tsd; + (void) cvd; + vdev_draid_config_t *vdc = pvd->vdev_tsd; - ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3P(pvd->vdev_ops, ==, &vdev_draid_ops); return (VDEV_DRAID_REFLOW_RESERVE + - (vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); + (pvd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); +} + +static uint64_t +vdev_draid_min_attach_size(vdev_t *vd) +{ + ASSERT3U(vd->vdev_top, ==, vd); + return (vdev_draid_min_asize(vd, vd->vdev_child[0])); } /* @@ -2343,6 +2351,7 @@ vdev_ops_t vdev_draid_ops = { .vdev_op_psize_to_asize = vdev_draid_psize_to_asize, .vdev_op_asize_to_psize = vdev_draid_asize_to_psize, .vdev_op_min_asize = vdev_draid_min_asize, + .vdev_op_min_attach_size = vdev_draid_min_attach_size, .vdev_op_min_alloc = vdev_draid_min_alloc, .vdev_op_io_start = vdev_draid_io_start, .vdev_op_io_done = vdev_draid_io_done, @@ -2835,6 +2844,7 @@ vdev_ops_t vdev_draid_spare_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_draid_spare_io_start, .vdev_op_io_done = vdev_draid_spare_io_done, diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 20b4db65ec06..3cae11e436d0 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -317,6 +317,7 @@ vdev_ops_t vdev_file_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 7538f471e63c..5f2b55047149 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1870,6 +1870,7 @@ vdev_ops_t vdev_indirect_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7e222eac5edc..c3c1d767c321 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -145,6 +145,7 @@ #include #include #include +#include #include #include #include @@ -1851,6 +1852,75 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) return (good_writes >= 1 ? 0 : EIO); } +/* + * Write the extra data of the specified vdev. 
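+ * + * "Extra data" is per-vdev metadata that must be durable before this txg's + * uberblock is written. At present the only producer is anyraid, whose tile + * map is written out by vdev_anyraid_write_map_sync(); the recursion visits + * every leaf and dispatches based on the leaf's parent.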
+ */ +static void +vdev_extra_sync(zio_t *zio, uint64_t *good_writes, vdev_t *vd, int flags, + uint64_t txg, vdev_config_sync_status_t status) +{ + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_extra_sync(zio, good_writes, vd->vdev_child[c], flags, txg, + status); + } + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (!vdev_writeable(vd)) + return; + + if (vd->vdev_parent->vdev_ops == &vdev_anyraid_ops) { + vdev_anyraid_write_map_sync(vd, zio, txg, good_writes, flags, + status); + } +} + +/* Sync the extra data of all vdevs in svd[] */ +static int +vdev_extra_sync_list(vdev_t **svd, int svdcount, int flags, uint64_t txg, + vdev_config_sync_status_t status) +{ + spa_t *spa = svd[0]->vdev_spa; + zio_t *zio; + uint64_t good_writes = 0; + + boolean_t have_extra = B_FALSE; + + for (int i = 0; i < svdcount; i++) { + if (svd[i]->vdev_ops == &vdev_anyraid_ops) { + have_extra = B_TRUE; + break; + } + } + if (!have_extra) + return (0); + + zio = zio_root(spa, NULL, NULL, flags); + + for (int v = 0; v < svdcount; v++) + vdev_extra_sync(zio, &good_writes, svd[v], flags, txg, status); + + (void) zio_wait(zio); + + /* + * Flush the extra data to disk before the uberblocks are updated, so + * that no uberblock can ever reference a tile map that is not yet + * durable on stable storage. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (int v = 0; v < svdcount; v++) { + if (vdev_writeable(svd[v])) { + zio_flush(zio, svd[v]); + } + } + + (void) zio_wait(zio); + + return (good_writes >= 1 ? 0 : EIO); +} + /* * On success, increment the count of good writes for our top-level vdev. */ @@ -2034,7 +2104,8 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) * at any time, you can just call it again, and it will resume its work. */ int -vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, + vdev_config_sync_status_t status) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; @@ -2112,6 +2183,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) goto retry; } + if ((error = vdev_extra_sync_list(svd, svdcount, flags, txg, + status)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_extra_sync_list() returned error %d " + "for pool '%s' when syncing out the extra data " + "of dirty vdevs", error, spa_name(spa)); + } + goto retry; + } + /* * Sync the uberblocks to all vdevs in svd[]. * If the system dies in the middle of this step, there are two cases diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 18efdaac006f..8aeff63e1dbe 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -39,6 +39,7 @@ #include #include #include +#include /* * Vdev mirror kstats */ @@ -99,31 +100,6 @@ vdev_mirror_stat_fini(void) } } -/* - * Virtual device vector for mirroring.
- */ -typedef struct mirror_child { - vdev_t *mc_vd; - abd_t *mc_abd; - uint64_t mc_offset; - int mc_error; - int mc_load; - uint8_t mc_tried; - uint8_t mc_skipped; - uint8_t mc_speculative; - uint8_t mc_rebuilding; -} mirror_child_t; - -typedef struct mirror_map { - int *mm_preferred; - int mm_preferred_cnt; - int mm_children; - boolean_t mm_resilvering; - boolean_t mm_rebuilding; - boolean_t mm_root; - mirror_child_t mm_child[]; -} mirror_map_t; - static const int vdev_mirror_shift = 21; /* @@ -152,7 +128,7 @@ vdev_mirror_map_size(int children) sizeof (int) * children); } -static inline mirror_map_t * +mirror_map_t * vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) { mirror_map_t *mm; @@ -175,7 +151,7 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } -static const zio_vsd_ops_t vdev_mirror_vsd_ops = { +zio_vsd_ops_t vdev_mirror_vsd_ops = { .vsd_free = vdev_mirror_map_free, }; @@ -601,24 +577,12 @@ vdev_mirror_child_select(zio_t *zio) return (-1); } -static void -vdev_mirror_io_start(zio_t *zio) +void +vdev_mirror_io_start_impl(zio_t *zio, mirror_map_t *mm) { - mirror_map_t *mm; mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_init(zio); - zio->io_vsd = mm; - zio->io_vsd_ops = &vdev_mirror_vsd_ops; - - if (mm == NULL) { - ASSERT(!spa_trust_config(zio->io_spa)); - ASSERT(zio->io_type == ZIO_TYPE_READ); - zio_execute(zio); - return; - } - if (zio->io_type == ZIO_TYPE_READ) { if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { /* @@ -650,7 +614,6 @@ vdev_mirror_io_start(zio_t *zio) vdev_mirror_child_done, mc)); first = B_FALSE; } - zio_execute(zio); return; } /* @@ -690,6 +653,25 @@ vdev_mirror_io_start(zio_t *zio) zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); } +} + +static void +vdev_mirror_io_start(zio_t *zio) +{ + mirror_map_t *mm; + + mm = vdev_mirror_map_init(zio); + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; + + if (mm == NULL) { + ASSERT(!spa_trust_config(zio->io_spa)); + ASSERT(zio->io_type == ZIO_TYPE_READ); + zio_execute(zio); + return; + } + + vdev_mirror_io_start_impl(zio, mm); zio_execute(zio); } @@ -708,7 +690,7 @@ vdev_mirror_worst_error(mirror_map_t *mm) return (error[0] ? 
error[0] : error[1]); } -static void +void vdev_mirror_io_done(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; @@ -975,6 +957,7 @@ vdev_ops_t vdev_mirror_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, @@ -1001,6 +984,7 @@ vdev_ops_t vdev_replacing_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, @@ -1027,6 +1011,7 @@ vdev_ops_t vdev_spare_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index c62faef2d05c..ac6866bdcec0 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -88,6 +88,7 @@ vdev_ops_t vdev_missing_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, @@ -114,6 +115,7 @@ vdev_ops_t vdev_hole_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 5fe70ec2b1d5..18c6b9a33310 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2332,10 +2332,18 @@ vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) * so each child must provide at least 1/Nth of its asize. 
*/ static uint64_t -vdev_raidz_min_asize(vdev_t *vd) +vdev_raidz_min_asize(vdev_t *pvd, vdev_t *cvd) { - return ((vd->vdev_min_asize + vd->vdev_children - 1) / - vd->vdev_children); + (void) cvd; + return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / + pvd->vdev_children); +} + +static uint64_t +vdev_raidz_min_attach_size(vdev_t *vd) +{ + ASSERT3U(vd->vdev_top, ==, vd); + return (vdev_raidz_min_asize(vd, vd->vdev_child[0])); } /* @@ -5466,6 +5474,7 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_psize_to_asize = vdev_raidz_psize_to_asize, .vdev_op_asize_to_psize = vdev_raidz_asize_to_psize, .vdev_op_min_asize = vdev_raidz_min_asize, + .vdev_op_min_attach_size = vdev_raidz_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 30be1f851eb3..931115b5494e 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -525,6 +525,7 @@ vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, { ASSERT(vd->vdev_ops == &vdev_draid_ops || vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_anyraid_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 21a81d6d25b9..55e059e1d8b0 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -150,6 +150,7 @@ vdev_ops_t vdev_root_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = NULL, /* not applicable to the root */ .vdev_op_io_done = NULL, /* not applicable to the root */ diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 74373f759cec..276c8663f983 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4260,6 +4260,8 @@ zio_dva_allocate(zio_t *zio) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; + if (zio->io_flags & ZIO_FLAG_ZILWRITE) + flags |= METASLAB_ZIL; /* * If not already chosen, choose an appropriate allocation class. 
@@ -5137,7 +5139,9 @@ zio_checksum_generate(zio_t *zio) if (checksum == ZIO_CHECKSUM_OFF) return (zio); - ASSERT(checksum == ZIO_CHECKSUM_LABEL); + ASSERTF(checksum == ZIO_CHECKSUM_LABEL || + checksum == ZIO_CHECKSUM_ANYRAID_MAP, + "checksum not label or anyraid map: %px %d", zio, checksum); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); @@ -5169,7 +5173,10 @@ zio_checksum_verify(zio_t *zio) if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); - ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); + ASSERTF(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL || + zio->io_prop.zp_checksum == ZIO_CHECKSUM_ANYRAID_MAP, + "checksum not label or anyraid map: %px %d", zio, + zio->io_prop.zp_checksum); } ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 1d0646a61185..1585744651af 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -206,6 +206,8 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}, + {{abd_checksum_sha256, abd_checksum_sha256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA, "anyraid_map"}, }; /* @@ -408,6 +410,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, abd_copy_from_buf_off(abd, &cksum, eck_offset + offsetof(zio_eck_t, zec_cksum), sizeof (zio_cksum_t)); + } else if (checksum == ZIO_CHECKSUM_ANYRAID_MAP) { + zio_eck_t *eck = (zio_eck_t *)(zio->io_private); + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], + &cksum); + eck->zec_cksum = cksum; + memcpy(&eck->zec_magic, &zec_magic, sizeof (zec_magic)); } else { saved = bp->blk_cksum; ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], @@ -419,13 +427,14 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, } int -zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, - enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset, - zio_bad_cksum_t *info) +zio_checksum_error_impl(zio_t *zio, enum zio_checksum checksum, abd_t *abd, + uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum; zio_eck_t eck; + spa_t *spa = zio->io_spa; + const blkptr_t *bp = zio->io_bp; int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) @@ -433,8 +442,8 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, zio_checksum_template_init(checksum, spa); - IMPLY(bp == NULL, ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED); - IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL); + IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL || + checksum == ZIO_CHECKSUM_ANYRAID_MAP); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_cksum_t verifier; @@ -498,6 +507,12 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); } + } else if (checksum == ZIO_CHECKSUM_ANYRAID_MAP) { + eck = *(zio_eck_t *)(zio->io_private); + byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC)); + expected_cksum = eck.zec_cksum; + ci->ci_func[byteswap](abd, size, + spa->spa_cksum_tmpls[checksum], &actual_cksum); } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; @@ -548,24 +563,24 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint64_t size = bp ?
BP_GET_PSIZE(bp) : zio->io_size; uint64_t offset = zio->io_offset; abd_t *data = zio->io_abd; - spa_t *spa = zio->io_spa; if (bp && BP_IS_GANG(bp)) { - if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + if (spa_feature_is_active(zio->io_spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)) size = zio->io_size; else size = SPA_OLD_GANGBLOCKSIZE; } - error = zio_checksum_error_impl(spa, bp, checksum, data, size, - offset, info); + error = zio_checksum_error_impl(zio, checksum, data, size, offset, + info); if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) { /* * It's possible that this is an old gang block. Rerun * the checksum with the old size; if that passes, then * update the gangblocksize appropriately. */ - error = zio_checksum_error_impl(spa, bp, checksum, data, + error = zio_checksum_error_impl(zio, checksum, data, SPA_OLD_GANGBLOCKSIZE, offset, info); if (error == 0) { ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a69c6e3c8dd7..06b215f2ae91 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -40,6 +40,14 @@ tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_013_pos', 'alloc_class_016_pos'] tags = ['functional', 'alloc_class'] +[tests/functional/anyraid] +tests = [ 'anyraid_clean_mirror_001_pos', 'anyraid_clean_mirror_002_pos', + 'anyraid_clean_mirror_003_pos', 'anyraid_tile_layout', 'anyraid_checkpoint', + 'anyraid_faildisk_write_replace_resilver', + 'anyraid_offline_write_online_resilver', + 'anyraid_special_vdev_001_pos', 'anyraid_special_vdev_002_pos'] +tags = ['functional', 'anyraid'] + [tests/functional/append] tests = ['file_append', 'threadsappend_001_pos'] tags = ['functional', 'append'] @@ -401,7 +409,8 @@ tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] -tests = ['zpool_attach_001_neg', 'attach-o_ashift'] +tests = ['zpool_attach_001_neg', 'zpool_attach_002_pos', 'zpool_attach_003_pos', + 'attach-o_ashift'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] @@ -418,7 +427,10 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', - 'zpool_create_encrypted', 'zpool_create_crypt_combos', + 'zpool_create_anyraid_001_pos', 'zpool_create_anyraid_002_pos', + 'zpool_create_anyraid_003_pos', 'zpool_create_anyraid_004_pos', + 'zpool_create_anyraid_005_neg', 'zpool_create_encrypted', + 'zpool_create_crypt_combos', 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', @@ -449,6 +461,7 @@ tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos', + 'zpool_export_anyraid_001_pos', 'zpool_export_parallel_pos', 'zpool_export_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_export'] @@ -504,6 +517,7 @@ tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_attach_detach_add_remove', + 'zpool_initialize_anyraid_attach', 
'zpool_initialize_fault_export_import_online', 'zpool_initialize_import_export', 'zpool_initialize_multiple_pools', @@ -517,7 +531,6 @@ tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_unsupported_vdevs', 'zpool_initialize_verify_checksums', 'zpool_initialize_verify_initialized'] -pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index 249b415029c4..0a6cdebbb869 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -255,7 +255,7 @@ tests = ['zpool_add_002_pos', 'zpool_add_003_pos', tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] -tests = ['zpool_attach_001_neg'] +tests = ['zpool_attach_001_neg', 'zpool_attach_002_pos'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] @@ -269,10 +269,11 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', - 'zpool_create_encrypted', - 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', - 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', - 'zpool_create_features_005_pos'] + 'zpool_create_anyraid_001_pos', 'zpool_create_anyraid_003_pos', + 'zpool_create_anyraid_004_pos', 'zpool_create_anyraid_005_neg', + 'zpool_create_encrypted', 'zpool_create_features_001_pos', + 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', + 'zpool_create_features_004_neg', 'zpool_create_features_005_pos'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] @@ -318,7 +319,6 @@ tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_online_offline'] -pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] diff --git a/tests/zfs-tests/include/default.cfg.in b/tests/zfs-tests/include/default.cfg.in index 4e009acaff91..5b0bb04fd229 100644 --- a/tests/zfs-tests/include/default.cfg.in +++ b/tests/zfs-tests/include/default.cfg.in @@ -140,6 +140,10 @@ export MAX_FINDDISKSNUM=6 # Default minimum size for file based vdevs in the test suite export MINVDEVSIZE=$((256 * 1024 * 1024)) +# AnyRAID requires much larger vdevs by design, since every vdev must +# hold its map copies plus at least one tile of the minimum tile size +export MINVDEVSIZE2=$((24 * 1024 * 1024 * 1024)) + # Minimum vdev size possible as defined in the OS export SPA_MINDEVSIZE=$((64 * 1024 * 1024)) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 8b30b9b91641..9683a247014f 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -789,6 +789,23 @@ function assert (($@)) || log_fail "$@" } +function get_file_size +{ + typeset filename="$1" + + if is_linux; then + if [ -b "$filename" ] ; then + filesize=$(blockdev --getsize64 $filename) + else + filesize=$(stat -c %s $filename) + fi + else + filesize=$(stat -s $filename | awk '{print $8}' | grep -o '[0-9]\+') + fi + + echo $filesize +} + # # Function to format partition size of a disk # Given a disk cxtxdx reduces all partitions # @@ -1599,6 +1616,15 @@ function create_pool #pool devs_list if is_global_zone ; then [[ -d /$pool ]] && rm -rf /$pool + + for internal_vd in "$@" ; do + if [[
"$internal_vd" =~ "loop" ]] ; then + # If the device is a loopback, remove previously + # allocated data. + punch_hole 0 $(get_file_size /dev/$internal_vd) \ + /dev/$internal_vd + fi + done log_must zpool create -f $pool $@ fi @@ -1856,7 +1882,7 @@ function verify_pool function get_disklist # pool { echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \ - grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") + grep -vEe '^-----' -e "^(mirror|raidz[1-3]|anymirror|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") } # @@ -2218,6 +2244,30 @@ BEGIN { FS="."; } echo $unused } +function create_sparse_files +{ + typeset prefix=$1 + typeset -i count=$2 + typeset size=$3 + + log_must mkdir -p $TESTDIR/sparse_files + + typeset sfiles="" + for (( i=0; i { typeset group=$1 diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 54b50c9dba77..38dd44e1fc6a 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -18,6 +18,7 @@ UNAME=$(uname) cat <<%%%% | ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount +ANYRAID_MIN_TILE_SIZE anyraid.min_tile_size zfs_anyraid_min_tile_size ARC_MAX arc.max zfs_arc_max ARC_MIN arc.min zfs_arc_min ASYNC_BLOCK_MAX_BLOCKS async_block_max_blocks zfs_async_block_max_blocks diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 23284234cdf7..bf0b48c8724b 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -89,6 +89,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/acl/acl_common.kshlib \ functional/alloc_class/alloc_class.cfg \ functional/alloc_class/alloc_class.kshlib \ + functional/anyraid/anyraid_common.kshlib \ + functional/anyraid/default.cfg \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ functional/bclone/bclone.cfg \ @@ -434,6 +436,17 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/alloc_class/alloc_class_016_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ + functional/anyraid/anyraid_checkpoint.ksh \ + functional/anyraid/anyraid_clean_mirror_001_pos.ksh \ + functional/anyraid/anyraid_clean_mirror_002_pos.ksh \ + functional/anyraid/anyraid_clean_mirror_003_pos.ksh \ + functional/anyraid/anyraid_faildisk_write_replace_resilver.ksh \ + functional/anyraid/anyraid_offline_write_online_resilver.ksh \ + functional/anyraid/anyraid_tile_layout.ksh \ + functional/anyraid/anyraid_special_vdev_001_pos.ksh \ + functional/anyraid/anyraid_special_vdev_002_pos.ksh \ + functional/anyraid/cleanup.ksh \ + functional/anyraid/setup.ksh \ functional/append/file_append.ksh \ functional/append/threadsappend_001_pos.ksh \ functional/append/cleanup.ksh \ @@ -1037,6 +1050,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_attach/cleanup.ksh \ functional/cli_root/zpool_attach/setup.ksh \ functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \ + functional/cli_root/zpool_attach/zpool_attach_002_pos.ksh \ + functional/cli_root/zpool_attach/zpool_attach_003_pos.ksh \ functional/cli_root/zpool/cleanup.ksh \ functional/cli_root/zpool_clear/cleanup.ksh \ functional/cli_root/zpool_clear/setup.ksh \ @@ -1070,6 +1085,11 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_create/zpool_create_022_pos.ksh \ functional/cli_root/zpool_create/zpool_create_023_neg.ksh \ 
functional/cli_root/zpool_create/zpool_create_024_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_001_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_002_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_003_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_004_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_005_neg.ksh \ functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \ @@ -1116,6 +1136,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ + functional/cli_root/zpool_export/zpool_export_anyraid_001_pos.ksh \ functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh \ functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ @@ -1187,7 +1208,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh \ functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ + functional/cli_root/zpool_initialize/setup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ + functional/cli_root/zpool_initialize/zpool_initialize_anyraid_attach.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh \ diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_checkpoint.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_checkpoint.ksh new file mode 100755 index 000000000000..76a68a9e4ba9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_checkpoint.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Anyraid works correctly with checkpoints +# +# STRATEGY: +# 1. Create an anyraid vdev +# 2. Take a checkpoint +# 3. Allocate more space +# 4. Roll back to the checkpoint +# 5. 
Verify that the tile map looks like what it did originally +# + +verify_runnable "global" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_assert "Anyraid works correctly with checkpoints" + +log_must create_pool $TESTPOOL anymirror1 $DISKS +log_must zdb --anyraid-map $TESTPOOL + +map=$(zdb --anyraid-map $TESTPOOL) +log_must zpool checkpoint $TESTPOOL + +log_must file_write -o create -f /$TESTPOOL/f1 -b 1048576 -c 2048 -d R + +log_must zpool export $TESTPOOL +log_must zpool import --rewind-to-checkpoint $TESTPOOL +map2=$(zdb --anyraid-map $TESTPOOL) +log_must test "$map" == "$map2" + +log_pass "Anyraid works correctly with checkpoints" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_001_pos.ksh new file mode 100755 index 000000000000..a97621aab1ef --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_001_pos.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID mirror1 can survive having 1 failed disk. +# +# STRATEGY: +# 1. Write several files to the ZFS filesystem mirror. +# 2. Overwrite one of the disks of the mirror with zeroes. +# 3. Verify that all the file contents are unchanged on the file system. +# + +verify_runnable "global" + +log_assert "AnyRAID mirror1 can survive having 1 failed disk" + +log_must create_sparse_files "disk" 3 $DEVSIZE + +clean_mirror_spec_cases "anymirror1 $disk0 $disk1" \ + "$disk0" \ + "$disk1" + +clean_mirror_spec_cases "anymirror1 $disk0 $disk1 $disk2" \ + "$disk0" \ + "$disk1" \ + "$disk2" + +log_pass "AnyRAID mirror1 can survive having 1 failed disk" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_002_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_002_pos.ksh new file mode 100755 index 000000000000..2edbac7773c3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_002_pos.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License.
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID mirror2 can survive having 1-2 failed disks. +# +# STRATEGY: +# 1. Write several files to the ZFS filesystem mirror. +# 2. Overwrite the selected disks of the mirror with zeroes. +# 3. Verify that all the file contents are unchanged on the file system. +# + +verify_runnable "global" + +log_assert "AnyRAID mirror2 can survive having 1-2 failed disks" + +log_must create_sparse_files "disk" 4 $DEVSIZE + +clean_mirror_spec_cases "anymirror2 $disk0 $disk1 $disk2" \ + "$disk0" \ + "$disk1" \ + "$disk2" \ + "\"$disk0 $disk1\"" \ + "\"$disk0 $disk2\"" \ + "\"$disk1 $disk2\"" + +clean_mirror_spec_cases "anymirror2 $disk0 $disk1 $disk2 $disk3" \ + "$disk0" \ + "$disk1" \ + "$disk2" \ + "$disk3" \ + "\"$disk0 $disk1\"" \ + "\"$disk0 $disk2\"" \ + "\"$disk0 $disk3\"" \ + "\"$disk1 $disk2\"" \ + "\"$disk1 $disk3\"" \ + "\"$disk2 $disk3\"" + +log_pass "AnyRAID mirror2 can survive having 1-2 failed disks" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_003_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_003_pos.ksh new file mode 100755 index 000000000000..05d6606db03c --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_003_pos.ksh @@ -0,0 +1,62 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID mirror3 can survive having 1-3 failed disks. +# +# STRATEGY: +# 1. Write several files to the ZFS filesystem mirror. +# 2. Overwrite the selected disks of the mirror with zeroes. +# 3. Verify that all the file contents are unchanged on the file system.
+# + +verify_runnable "global" + +log_assert "AnyRAID mirror3 can survive having 1-3 failed disks" + +log_must create_sparse_files "disk" 4 $DEVSIZE + +clean_mirror_spec_cases "anymirror3 $disk0 $disk1 $disk2 $disk3" \ + "$disk0" \ + "$disk1" \ + "$disk2" \ + "$disk3" \ + "\"$disk0 $disk1\"" \ + "\"$disk0 $disk2\"" \ + "\"$disk0 $disk3\"" \ + "\"$disk1 $disk2\"" \ + "\"$disk1 $disk3\"" \ + "\"$disk2 $disk3\"" \ + "\"$disk0 $disk1 $disk2\"" \ + "\"$disk0 $disk1 $disk3\"" \ + "\"$disk0 $disk2 $disk3\"" \ + "\"$disk1 $disk2 $disk3\"" + +log_pass "AnyRAID mirror3 can survive having 1-3 failed disks" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_common.kshlib b/tests/zfs-tests/tests/functional/anyraid/anyraid_common.kshlib new file mode 100644 index 000000000000..1b4f7d15451c --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_common.kshlib @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/anyraid/default.cfg + +function wipe_some_disks_and_verify_content_is_still_okay +{ + typeset pool=$1 + shift + + typeset atfile=0 + set -A files + set -A cksums + typeset newcksum + + while (( atfile < FILE_COUNT )); do + files[$atfile]=/$pool/file.$atfile + log_must file_write -o create -f ${files[$atfile]} \ + -b $FILE_SIZE -c 1 + cksums[$atfile]=$(xxh128digest ${files[$atfile]}) + (( atfile = atfile + 1 )) + done + + for disk in $@; do + log_must punch_hole $((DD_BLOCK * 8)) $((DD_BLOCK * (DD_COUNT - 128))) $disk + done + + # + # Flush out the cache so that we ensure we're reading from disk. 
+ # + log_must zpool status + log_must zpool export $pool + log_must zpool import -d $(dirname $1) + log_must zpool import -d $(dirname $1) $pool + + atfile=0 + typeset -i failedcount=0 + while (( atfile < FILE_COUNT )); do + newcksum=$(xxh128digest ${files[$atfile]}) + if [[ $newcksum != ${cksums[$atfile]} ]]; then + (( failedcount = failedcount + 1 )) + log_note "Wrong checksum of ${files[$atfile]}" + fi + (( atfile = atfile + 1 )) + done + + if (( failedcount > 0 )); then + log_fail "$failedcount of the $FILE_COUNT files did not" \ + "have the same checksum before and after" + fi + + log_must zpool status + log_must zpool scrub $TESTPOOL + log_must wait_scrubbed $TESTPOOL + log_must zpool status +} + +function clean_mirror_spec_cases +{ + typeset poolspec=$1 + shift + + typeset tcases + eval "typeset -a tcases=($*)" + + log_note "pool specification: $poolspec" + + for tcase in "${tcases[@]}"; do + log_note "failed disk case: $tcase" + log_must zpool create -f $TESTPOOL $poolspec + # $tcase may name several disks, so leave it unquoted. + wipe_some_disks_and_verify_content_is_still_okay $TESTPOOL $tcase + poolexists $TESTPOOL && destroy_pool $TESTPOOL + done +} diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_faildisk_write_replace_resilver.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_faildisk_write_replace_resilver.ksh new file mode 100755 index 000000000000..efa1d4e0a845 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_faildisk_write_replace_resilver.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID mirror can resilver a replaced disk. +# +# STRATEGY: +# 1. Fail one disk. +# 2. Write new data to the pool. +# 3. Get that disk replaced and resilvered. +# 4. Repeat to verify sequential resilvering.
+# + +verify_runnable "global" + +log_assert "AnyRAID mirror can resilver a replaced disk" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + + +# anymirror1 + +for replace_flags in '' '-s'; do + + log_must create_sparse_files "disk" 3 $DEVSIZE + log_must create_sparse_files "spare" 1 $DEVSIZE + log_must zpool create -O compress=off -f $TESTPOOL anymirror1 $disks + log_must zfs set primarycache=none $TESTPOOL + + # Write initial data + log_must file_write -o create -f /$TESTPOOL/file1.bin -b 1048576 -c 256 -d Z + + # Fail one disk + log_must truncate -s0 $disk0 + + # Read initial data, write new data + log_must dd if=/$TESTPOOL/file1.bin of=/dev/null bs=1M count=256 + log_must file_write -o create -f /$TESTPOOL/file1.bin -b 1048576 -c 256 -d Y + + # Check that disk is faulted + zpool status + log_must check_state $TESTPOOL $disk0 "faulted" + + # Initiate disk replacement + log_must zpool replace -f $replace_flags $TESTPOOL $disk0 $spare0 + + # Wait until resilvering is done and the pool is back online + for i in {1..60}; do + check_state $TESTPOOL "" "online" && break + sleep 1 + done + zpool status + log_must check_state $TESTPOOL "" "online" + + destroy_pool $TESTPOOL + +done + +log_pass "AnyRAID mirror can resilver a replaced disk" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_write_online_resilver.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_write_online_resilver.ksh new file mode 100755 index 000000000000..f36efc443f79 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_write_online_resilver.ksh @@ -0,0 +1,128 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID mirror can resilver a disk after it gets back online. +# +# STRATEGY: +# 1. Offline one disk. +# 2. Write to the pool. +# 3. Get that disk back online. +# 4. Get it resilvered. 
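+#
+# The sequence below is repeated for anymirror1, anymirror2, and anymirror3
+# pools, offlining one, two, and three disks respectively, which is the
+# most each parity level is expected to tolerate.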
+# + +verify_runnable "global" + +log_assert "AnyRAID mirror can resilver a disk after it gets back online" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# anymirror1 + +log_must create_sparse_files "disk" 3 $DEVSIZE +log_must zpool create -f $TESTPOOL anymirror1 $disks + +log_must zpool offline $TESTPOOL $disk0 +log_must check_state $TESTPOOL $disk0 "offline" +log_must check_state $TESTPOOL "" "degraded" + +log_must dd if=/dev/urandom of=/$TESTPOOL/file.bin bs=1M count=128 +log_must zpool online $TESTPOOL $disk0 +log_must check_state $TESTPOOL $disk0 "online" +for i in {1..60}; do + check_state $TESTPOOL "" "online" && break + sleep 1 +done +zpool status +log_must check_state $TESTPOOL "" "online" + +log_must destroy_pool $TESTPOOL + + +# anymirror2 + +log_must create_sparse_files "disk" 5 $DEVSIZE +log_must zpool create -f $TESTPOOL anymirror2 $disks + +log_must zpool offline $TESTPOOL $disk0 +log_must zpool offline $TESTPOOL $disk1 +log_must check_state $TESTPOOL $disk0 "offline" +log_must check_state $TESTPOOL $disk1 "offline" +log_must check_state $TESTPOOL "" "degraded" + +log_must dd if=/dev/urandom of=/$TESTPOOL/file.bin bs=1M count=128 +log_must zpool online $TESTPOOL $disk0 +log_must zpool online $TESTPOOL $disk1 +log_must check_state $TESTPOOL $disk0 "online" +log_must check_state $TESTPOOL $disk1 "online" +for i in {1..60}; do + check_state $TESTPOOL "" "online" && break + sleep 1 +done +zpool status +log_must check_state $TESTPOOL "" "online" + +log_must destroy_pool $TESTPOOL + + +# anymirror3 + +log_must create_sparse_files "disk" 7 $DEVSIZE +log_must zpool create -f $TESTPOOL anymirror3 $disks + +log_must zpool offline $TESTPOOL $disk0 +log_must zpool offline $TESTPOOL $disk1 +log_must zpool offline $TESTPOOL $disk2 +log_must check_state $TESTPOOL $disk0 "offline" +log_must check_state $TESTPOOL $disk1 "offline" +log_must check_state $TESTPOOL $disk2 "offline" +log_must check_state $TESTPOOL "" "degraded" + +log_must dd if=/dev/urandom of=/$TESTPOOL/file.bin bs=1M count=128 +log_must zpool online $TESTPOOL $disk0 +log_must zpool online $TESTPOOL $disk1 +log_must zpool online $TESTPOOL $disk2 +log_must check_state $TESTPOOL $disk0 "online" +log_must check_state $TESTPOOL $disk1 "online" +log_must check_state $TESTPOOL $disk2 "online" +for i in {1..60}; do + check_state $TESTPOOL "" "online" && break + sleep 1 +done +zpool status +log_must check_state $TESTPOOL "" "online" + +log_must destroy_pool $TESTPOOL + +log_pass "AnyRAID mirror can resilver a disk after it gets back online" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_001_pos.ksh new file mode 100755 index 000000000000..c4e8728d1334 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_001_pos.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# Verify a variety of AnyRAID pools with a special VDEV mirror. +# +# STRATEGY: +# 1. Create an AnyRAID pool with a special VDEV mirror. +# 2. Write to it, sync. +# 3. Export and re-import the pool. +# 4. Verify that all the file contents are unchanged on the file system. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} +log_onexit cleanup + +log_assert "Verify a variety of AnyRAID pools with a special VDEV mirror" + +log_must create_sparse_files "disk" 4 $DEVSIZE +log_must create_sparse_files "sdisk" 2 $DEVSIZE + +typeset oldcksum +typeset newcksum +for parity in {0..3}; do + log_must zpool create -f $TESTPOOL anymirror$parity $disks special mirror $sdisks + log_must poolexists $TESTPOOL + log_must zfs set special_small_blocks=4k $TESTPOOL + + log_must dd if=/dev/urandom of=/$TESTPOOL/file.bin bs=1M count=1 + log_must dd if=/dev/urandom of=/$TESTPOOL/small.bin bs=4k count=1 + oldcksum=$(xxh128digest /$TESTPOOL/file.bin) + oldsmallcksum=$(xxh128digest /$TESTPOOL/small.bin) + log_must zpool export $TESTPOOL + + log_must zpool import -d $(dirname $disk0) $TESTPOOL + newcksum=$(xxh128digest /$TESTPOOL/file.bin) + newsmallcksum=$(xxh128digest /$TESTPOOL/small.bin) + + log_must test "$oldcksum" = "$newcksum" + log_must test "$oldsmallcksum" = "$newsmallcksum" + + log_must destroy_pool $TESTPOOL +done + +log_pass "Verify a variety of AnyRAID pools with a special VDEV mirror" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_002_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_002_pos.ksh new file mode 100755 index 000000000000..c00b26d37f2c --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_002_pos.ksh @@ -0,0 +1,73 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# Verify a variety of AnyRAID pools with a special VDEV AnyRAID. +# +# STRATEGY: +# 1. Create an AnyRAID pool with a special VDEV AnyRAID. +# 2. Write to it, sync. +# 3. Export and re-import the pool. +# 4. Verify that all the file contents are unchanged on the file system. 
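+# (Verification compares the xxh128 digest of the test file taken before
+# the export against one taken after the re-import.)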
+# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} +log_onexit cleanup + +log_assert "Verify a variety of AnyRAID pools with a special VDEV AnyRAID" + +log_must create_sparse_files "disk" 4 $DEVSIZE +log_must create_sparse_files "sdisk" 4 $DEVSIZE + +typeset oldcksum +typeset newcksum +for parity in {0..3}; do + log_must zpool create $TESTPOOL anymirror$parity $disks special \ + anymirror$parity $sdisks + log_must poolexists $TESTPOOL + + log_must dd if=/dev/urandom of=/$TESTPOOL/file.bin bs=1M count=128 + oldcksum=$(xxh128digest /$TESTPOOL/file.bin) + log_must zpool export $TESTPOOL + + log_must zpool import -d $(dirname $disk0) $TESTPOOL + newcksum=$(xxh128digest /$TESTPOOL/file.bin) + + log_must test "$oldcksum" = "$newcksum" + + log_must destroy_pool $TESTPOOL +done + +log_pass "Verify a variety of AnyRAID pools with a special VDEV AnyRAID" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_tile_layout.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_tile_layout.ksh new file mode 100755 index 000000000000..a405745e493c --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_tile_layout.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Anyraid disks intelligently select which tiles to use +# +# STRATEGY: +# 1. Create an anymirror1 vdev with 1 large disk and 2 small disks +# 2. Verify that the full space can be used +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL2 + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 +} + +log_onexit cleanup + +log_must create_pool $TESTPOOL $DISKS + +log_must truncate -s 512M /$TESTPOOL/vdev_file.{0,1,2} +log_must truncate -s 1G /$TESTPOOL/vdev_file.3 +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "Anyraid disks intelligently select which tiles to use" + +log_must create_pool $TESTPOOL2 anymirror1 /$TESTPOOL/vdev_file.{0,1,2,3} + +cap=$(zpool get -Hp -o value size $TESTPOOL2) +[[ "$cap" -eq $((9 * 64 * 1024 * 1024)) ]] || \ + log_fail "Incorrect space for anyraid vdev: $cap" + +# +# This should just about fill the pool, when you account for the 128MiB of +# reserved slop space. If the space isn't being selected intelligently, we +# would hit ENOSPC 64MiB early. 
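+#
+# A rough sketch of the arithmetic, assuming 64MiB tiles as set above:
+#   pool size: 9 tiles * 64MiB = 576MiB
+#   slop:      128MiB reserved
+#   usable:    576MiB - 128MiB = 448MiB
+#   written:   (64 * 7 - 1) * 1MiB = 447MiB, just under the limit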
+# +log_must dd if=/dev/urandom of=/$TESTPOOL2/f1 bs=1M count=$((64 * 7 - 1)) + +log_pass "Anyraid disks intelligently select which tiles to use" diff --git a/tests/zfs-tests/tests/functional/anyraid/cleanup.ksh b/tests/zfs-tests/tests/functional/anyraid/cleanup.ksh new file mode 100755 index 000000000000..0e239571f23a --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/cleanup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/anyraid/default.cfg + +poolexists $TESTPOOL && destroy_pool $TESTPOOL + +log_must delete_sparse_files +restore_tunable ANYRAID_MIN_TILE_SIZE diff --git a/tests/zfs-tests/tests/functional/anyraid/default.cfg b/tests/zfs-tests/tests/functional/anyraid/default.cfg new file mode 100644 index 000000000000..db3db19fb7aa --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/default.cfg @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +export DEVSIZE=4294967296 +export DD_BLOCK=$(( 64 * 1024 )) +export DD_COUNT=$(( DEVSIZE / DD_BLOCK )) + +export FILE_COUNT=10 +export FILE_SIZE=$(( 1024 * 1024 )) diff --git a/tests/zfs-tests/tests/functional/anyraid/setup.ksh b/tests/zfs-tests/tests/functional/anyraid/setup.ksh new file mode 100755 index 000000000000..3e923fdbb0ff --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/setup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/anyraid/default.cfg + +verify_runnable "global" + +save_tunable ANYRAID_MIN_TILE_SIZE +set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib index 08795a7ea257..05b087854627 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib @@ -65,6 +65,7 @@ function setup_filesystem #disklist #pool #fs #mntpoint #type #vdev if [[ $vdev != "" && \ $vdev != "mirror" && \ $vdev != "raidz" && \ + $vdev != "anymirror" && \ $vdev != "draid" ]] ; then log_note "Wrong vdev: (\"$vdev\")" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh index 3c16a6f97f4a..4ffcd5cda088 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh @@ -33,4 +33,6 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib +delete_sparse_files + log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh index 82d19e850f28..8d6107ee3a86 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh @@ -49,32 +49,32 @@ verify_runnable "global" function cleanup { poolexists $TESTPOOL && destroy_pool $TESTPOOL - rm -f $disk0 $disk1 } log_assert "'zpool add ...' can add devices to the pool." 
log_onexit cleanup -set -A keywords "" "mirror" "raidz" "raidz1" "draid:1s" "draid1:1s" "spare" +set -A keywords "" "mirror" "raidz" "raidz1" "anymirror" "anymirror1" "anymirror2" "anymirror3" "draid:1s" "draid1:1s" "spare" + +create_sparse_files "disk" 4 $MINVDEVSIZE2 +create_sparse_files "extradisk" 4 $MINVDEVSIZE2 pooldevs="${DISK0} \ \"${DISK0} ${DISK1}\" \ \"${DISK0} ${DISK1} ${DISK2}\"" mirrordevs="\"${DISK0} ${DISK1}\"" raidzdevs="\"${DISK0} ${DISK1}\"" +anyraiddevs="\"${extradisks}\"" draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" -disk0=$TEST_BASE_DIR/disk0 -disk1=$TEST_BASE_DIR/disk1 -disk2=$TEST_BASE_DIR/disk2 -truncate -s $MINVDEVSIZE $disk0 $disk1 $disk2 typeset -i i=0 typeset vdev eval set -A poolarray $pooldevs eval set -A mirrorarray $mirrordevs eval set -A raidzarray $raidzdevs +eval set -A anyraidarray $anyraiddevs eval set -A draidarray $draiddevs while (( $i < ${#keywords[*]} )); do @@ -111,6 +111,16 @@ while (( $i < ${#keywords[*]} )); do destroy_pool "$TESTPOOL" done + ;; + anyraid*) + for vdev in "${anyraidarray[@]}"; do + create_pool "$TESTPOOL" "${keywords[i]}" $disks + log_must poolexists "$TESTPOOL" + log_must zpool add "$TESTPOOL" ${keywords[i]} $vdev + log_must vdevs_in_pool "$TESTPOOL" "$vdev" + destroy_pool "$TESTPOOL" + done + ;; draid:1s|draid1:1s) for vdev in "${draidarray[@]}"; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh index 2e1590faf8f5..97749bf6f1c6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh @@ -57,17 +57,19 @@ log_assert "'zpool add' should fail if vdevs are the same or vdev is " \ log_onexit cleanup -create_pool $TESTPOOL $DISK0 +create_sparse_files "disk" 2 $MINVDEVSIZE2 + +create_pool $TESTPOOL $disk0 log_must poolexists $TESTPOOL -log_mustnot zpool add -f $TESTPOOL $DISK0 +log_mustnot zpool add -f $TESTPOOL $disk0 -for type in "" "mirror" "raidz" "draid" "spare" "log" "dedup" "special" "cache" +for type in "" "mirror" "raidz" "anymirror" "draid" "spare" "log" "dedup" "special" "cache" do - log_mustnot zpool add -f $TESTPOOL $type $DISK0 $DISK1 - log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK0 $DISK1 - log_mustnot zpool add -f $TESTPOOL $type $DISK1 $DISK1 - log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK1 $DISK1 + log_mustnot zpool add -f $TESTPOOL $type $disk0 $disk1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $disk0 $disk1 + log_mustnot zpool add -f $TESTPOOL $type $disk1 $disk1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $disk1 $disk1 done log_pass "'zpool add' get fail as expected if vdevs are the same or vdev is " \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_002_pos.ksh new file mode 100755 index 000000000000..8ab23d7a3598 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_002_pos.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# 'zpool attach' works to expand mirrors and anyraid vdevs
+#
+# STRATEGY:
+# 1. Create a normal striped pool
+# 2. Verify that attaching creates a mirror
+# 3. Verify that attaching again creates a wider mirror
+# 4. Create an anyraid vdev
+# 5. Verify that attaching expands the anyraid vdev
+#
+
+verify_runnable "global"
+
+cleanup() {
+	poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
+	restore_tunable ANYRAID_MIN_TILE_SIZE
+}
+
+log_onexit cleanup
+
+log_must truncate -s 8G /$TESTPOOL/vdev_file.{0,1,2,3}
+save_tunable ANYRAID_MIN_TILE_SIZE
+set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824
+
+log_assert "'zpool attach' works to expand mirrors and anyraid vdevs"
+
+log_must create_pool $TESTPOOL2 /$TESTPOOL/vdev_file.0
+log_must zpool attach $TESTPOOL2 /$TESTPOOL/vdev_file.0 /$TESTPOOL/vdev_file.1
+log_must eval "zpool list -v $TESTPOOL2 | grep \" mirror\""
+log_must eval "zpool list -v $TESTPOOL2 | grep \" .*_file.0\""
+log_must eval "zpool list -v $TESTPOOL2 | grep \" .*_file.1\""
+log_must zpool attach $TESTPOOL2 /$TESTPOOL/vdev_file.0 /$TESTPOOL/vdev_file.2
+log_must eval "zpool list -v $TESTPOOL2 | grep \" .*_file.2\""
+log_must zpool destroy $TESTPOOL2
+
+log_must create_pool $TESTPOOL2 anymirror1 /$TESTPOOL/vdev_file.{0,1,2}
+log_must zpool attach $TESTPOOL2 anymirror-0 /$TESTPOOL/vdev_file.3
+log_must eval "zpool list -v $TESTPOOL2 | grep \" .*_file.3\""
+
+log_pass "'zpool attach' works to expand mirrors and anyraid vdevs"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_003_pos.ksh
new file mode 100755
index 000000000000..2287e8c83b99
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_003_pos.ksh
@@ -0,0 +1,99 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# 'zpool attach' expands size correctly with anyraid vdevs.
+#
+# STRATEGY:
+# 1. Create an anymirror1 vdev with small disks
+# 2. Attach a larger disk
+# 3. Verify that not all the new space can be used
+# 4. Attach another larger disk
+# 5. Verify that all space is now usable
+# 6. Repeat steps 1-5 with anymirror2
+#
+
+verify_runnable "global"
+
+cleanup() {
+	poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
+	rm -f /$TESTPOOL/vdev_file.*
+	restore_tunable ANYRAID_MIN_TILE_SIZE
+}
+
+log_onexit cleanup
+
+log_must truncate -s 512M /$TESTPOOL/vdev_file.{0,1,2,3}
+log_must truncate -s 2G /$TESTPOOL/vdev_file.{4,5,6}
+save_tunable ANYRAID_MIN_TILE_SIZE
+set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864
+
+log_assert "'zpool attach' expands size correctly with anyraid vdevs"
+
+log_must create_pool $TESTPOOL2 anymirror1 /$TESTPOOL/vdev_file.{0,1,2}
+
+cap=$(zpool get -Hp -o value size $TESTPOOL2)
+log_must zpool attach $TESTPOOL2 anymirror1-0 /$TESTPOOL/vdev_file.4
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+
+[[ "$new_cap" -eq $((3 * 64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool attach $TESTPOOL2 anymirror1-0 /$TESTPOOL/vdev_file.5
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+[[ "$new_cap" -eq $(((2048 - 256 - 64) * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool destroy $TESTPOOL2
+log_must create_pool $TESTPOOL2 anymirror2 /$TESTPOOL/vdev_file.{0,1,2,3}
+
+cap=$(zpool get -Hp -o value size $TESTPOOL2)
+log_must zpool attach $TESTPOOL2 anymirror2-0 /$TESTPOOL/vdev_file.4
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+
+[[ "$new_cap" -eq $((64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool attach $TESTPOOL2 anymirror2-0 /$TESTPOOL/vdev_file.5
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+[[ "$new_cap" -eq $((256 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool attach $TESTPOOL2 anymirror2-0 /$TESTPOOL/vdev_file.6
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+[[ "$new_cap" -eq $(((2048 - 256 - 64) * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_pass "'zpool attach' expands size correctly with anyraid vdevs"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh
index f504d15fc0c3..428c769444cf 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh
@@ -34,5 +34,7 @@
.
$STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib cleanup_devices $DISKS +delete_sparse_files +rm -rf $TESTDIR $TESTDIR1 log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib index ecab30ed3925..bbe68f8db24f 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib @@ -36,7 +36,7 @@ # Given a pool vdevs list, create the pool,verify the created pool, # and destroy the pool # $1, pool name -# $2, pool type, mirror, raidz, or none +# $2, pool type, mirror, raidz, anyraid, draid or none # $3, vdevs list # function create_pool_test diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh index ad30c0fc87f9..879e38c5257b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh @@ -49,8 +49,6 @@ verify_runnable "global" function cleanup { poolexists $TESTPOOL && destroy_pool $TESTPOOL - - rm -f $disk1 $disk2 } log_assert "'zpool create ...' can successfully create" \ @@ -58,16 +56,16 @@ log_assert "'zpool create ...' can successfully create" \ log_onexit cleanup -typeset disk1=$(create_blockfile $FILESIZE) -typeset disk2=$(create_blockfile $FILESIZE) +create_sparse_files "disk" 4 $MINVDEVSIZE2 pooldevs="${DISK0} \ \"${DISK0} ${DISK1}\" \ \"${DISK0} ${DISK1} ${DISK2}\" \ - \"$disk1 $disk2\"" + \"$disk0 $disk1\"" mirrordevs="\"${DISK0} ${DISK1}\" \ $raidzdevs \ - \"$disk1 $disk2\"" + \"$disk0 $disk1\"" +anyraiddevs="\"$disk0 $disk1 $disk2 $disk3\"" raidzdevs="\"${DISK0} ${DISK1} ${DISK2}\"" draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" @@ -75,6 +73,11 @@ create_pool_test "$TESTPOOL" "" "$pooldevs" create_pool_test "$TESTPOOL" "mirror" "$mirrordevs" create_pool_test "$TESTPOOL" "raidz" "$raidzdevs" create_pool_test "$TESTPOOL" "raidz1" "$raidzdevs" +create_pool_test "$TESTPOOL" "anymirror" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anymirror0" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anymirror1" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anymirror2" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anymirror3" "$anyraiddevs" create_pool_test "$TESTPOOL" "draid" "$draiddevs" log_pass "'zpool create ...' success." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh index f0c2e69a0c0f..d0ac83c2b9f7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh @@ -55,7 +55,7 @@ log_assert "'zpool create [-R root][-m mountpoint] ...' can create "an alternate pool or a new pool mounted at the specified mountpoint." 
log_onexit cleanup -set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" "draid" "draid2" +set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" "anymirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "draid" "draid2" # # cleanup the pools created in previous case if zpool_create_004_pos timedout @@ -69,7 +69,7 @@ rm -rf $TESTDIR log_must mkdir -p $TESTDIR typeset -i i=1 while (( i < 5 )); do - log_must truncate -s $FILESIZE $TESTDIR/file.$i + log_must truncate -s $MINVDEVSIZE2 $TESTDIR/file.$i (( i = i + 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh index adc47c48de28..44bc6077c407 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh @@ -60,7 +60,7 @@ mntpnt=$(get_prop mountpoint $TESTPOOL) typeset -i i=0 while ((i < 10)); do - log_must truncate -s $MINVDEVSIZE $mntpnt/vdev$i + log_must truncate -s $MINVDEVSIZE2 $mntpnt/vdev$i eval vdev$i=$mntpnt/vdev$i ((i += 1)) @@ -98,6 +98,12 @@ set -A valid_args \ "raidz2 $vdev0 $vdev1 $vdev2 spare $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ "raidz3 $vdev0 $vdev1 $vdev2 $vdev3 \ mirror $vdev4 $vdev5 $vdev6 $vdev7" \ + "anymirror0 $vdev0" \ + "anymirror0 $vdev0 $vdev1 anymirror0 $vdev2 $vdev3" \ + "anymirror1 $vdev0 $vdev1 anymirror1 $vdev2 $vdev3" \ + "anymirror2 $vdev0 $vdev1 $vdev2 anymirror2 $vdev3 $vdev4 $vdev5" \ + "anymirror2 $vdev0 $vdev1 $vdev2 $vdev3 anymirror2 $vdev4 $vdev5 $vdev6" \ + "anymirror3 $vdev0 $vdev1 $vdev2 $vdev3 anymirror3 $vdev4 $vdev5 $vdev6 $vdev7" \ "draid $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4" \ "draid $vdev0 $vdev1 $vdev2 raidz1 $vdev3 $vdev4 $vdev5" \ "draid $vdev0 $vdev1 $vdev2 draid1 $vdev3 $vdev4 $vdev5" \ @@ -133,6 +139,9 @@ set -A forced_args \ spare $vdev4 raidz2 $vdev5 $vdev6 $vdev7" \ "mirror $vdev0 $vdev1 draid $vdev2 $vdev3 $vdev4 \ draid2 $vdev5 $vdev6 $vdev7 $vdev8 spare $vdev9" \ + "anymirror0 $vdev0 anymirror $vdev1 $vdev2" \ + "anymirror1 $vdev0 $vdev1 anymirror2 $vdev2 $vdev3 $vdev4" \ + "anymirror3 $vdev0 $vdev1 $vdev2 $vdev3 anymirror0 $vdev4" \ "draid $vdev0 $vdev1 $vdev2 $vdev3 \ draid2 $vdev4 $vdev5 $vdev6 $vdev7 $vdev8" \ "draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh index 2e377bc3b522..9f88941932a7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh @@ -56,6 +56,11 @@ set -A args "" "-?" "-n" "-f" "-nf" "-fn" "-f -n" "--f" "-e" "-s" \ "$TESTPOOL mirror" "$TESTPOOL raidz" "$TESTPOOL mirror raidz" \ "$TESTPOOL raidz1" "$TESTPOOL mirror raidz1" \ "$TESTPOOL draid1" "$TESTPOOL mirror draid1" \ + "$TESTPOOL anymirror" "$TESTPOOL mirror anymirror" \ + "$TESTPOOL anymirror0" "$TESTPOOL mirror anymirror0" \ + "$TESTPOOL anymirror1 $DISK0" \ + "$TESTPOOL anymirror2 $DISK0 $DISK1" \ + "$TESTPOOL anymirror3 $DISK0 $DISK1 $DISK2" \ "$TESTPOOL mirror c?t?d?" "$TESTPOOL mirror $DISK0 c0t1d?" 
\ "$TESTPOOL RAIDZ $DISK0 $DISK1" \ "$TESTPOOL $DISK0 log $DISK1 log $DISK2" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh index 7656f5bb4fdf..1ebfd5bc8d16 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh @@ -39,10 +39,12 @@ # devices, 'zpool create' should failed. # # STRATEGY: -# 1. Loop to create the following three kinds of pools. +# 1. Loop to create the following kinds of pools: # - Regular pool # - Mirror # - Raidz +# - AnyRAID +# - dRAID # 2. Create two pools but using the same disks, expect failed. # 3. Create one pool but using the same disks twice, expect failed. # @@ -62,13 +64,15 @@ log_assert "Create a pool with same devices twice or create two pools with " \ "same devices, 'zpool create' should fail." log_onexit cleanup +create_sparse_files "file" 4 $MINVDEVSIZE2 + unset NOINUSE_CHECK typeset opt -for opt in "" "mirror" "raidz" "draid"; do +for opt in "" "mirror" "raidz" "anymirror" "draid"; do if [[ $opt == "" ]]; then - typeset disks=$DISK0 + typeset disks=$file0 else - typeset disks=$DISKS + typeset disks=$files fi # Create two pools but using the same disks. @@ -78,7 +82,7 @@ for opt in "" "mirror" "raidz" "draid"; do # Create two pools and part of the devices were overlapped create_pool $TESTPOOL $opt $disks - log_mustnot zpool create -f $TESTPOOL1 $opt $DISK0 + log_mustnot zpool create -f $TESTPOOL1 $opt $file0 destroy_pool $TESTPOOL # Create one pool but using the same disks twice. diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh index 6d43227481bf..188e8768fda5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh @@ -73,6 +73,7 @@ set -A args \ "$TOOSMALL $TESTDIR/file1" "$TESTPOOL1 $TESTDIR/file1 $TESTDIR/file2" \ "$TOOSMALL mirror $TESTDIR/file1 $TESTDIR/file2" \ "$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2" \ + "$TOOSMALL anymirror0 $TESTDIR/file1" \ "$TOOSMALL draid $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3" typeset -i i=0 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_001_pos.ksh new file mode 100755 index 000000000000..8b1ae0b23a82 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_001_pos.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Create a variety of AnyRAID pools using the minimal vdev syntax.
+#
+# STRATEGY:
+# 1. Create the required number of file vdevs.
+# 2. Create a few pools of various parities using the anymirror* syntax.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+}
+
+log_assert "'zpool create ...' can create a pool."
+log_onexit cleanup
+
+create_sparse_files "disk" 4 $MINVDEVSIZE2
+
+# Verify the default parity
+log_must zpool create $TESTPOOL anymirror $disks
+log_must poolexists $TESTPOOL
+destroy_pool $TESTPOOL
+
+# Verify specified parity
+for parity in {0..3}; do
+	log_must zpool create $TESTPOOL anymirror$parity $disks
+	log_must poolexists $TESTPOOL
+	destroy_pool $TESTPOOL
+done
+
+log_pass "'zpool create ...' can create a pool."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_002_pos.ksh
new file mode 100755
index 000000000000..4e1d6cf682a2
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_002_pos.ksh
@@ -0,0 +1,69 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Create an AnyRAID pool using the maximum number of vdevs (255), then
+# verify that creating a pool with 256 vdevs fails as expected.
+#
+# STRATEGY:
+# 1. Create 256 file vdevs.
+# 2. Verify pools with a valid number of vdevs succeed.
+# 3. Verify a pool which exceeds the maximum number of vdevs fails.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+
+	log_pos rm -f $all_vdevs
+	log_pos rmdir $TESTDIR
+}
+
+log_assert "'zpool create anyraid ...' can create a pool with the maximum number of vdevs."
+log_onexit cleanup
+
+all_vdevs=$(echo $TESTDIR/file.{01..256})
+
+mkdir $TESTDIR
+log_must truncate -s $MINVDEVSIZE2 $all_vdevs
+
+# Verify that pools with 254 and 255 vdevs can be created.
+for (( i=254; i<=255; i++ )); do
+	log_must zpool create $TESTPOOL anymirror3 \
+	    $(echo $TESTDIR/file.{01..$i})
+	log_must destroy_pool $TESTPOOL
+done
+
+# A pool with 256 vdevs exceeds the maximum AnyRAID vdev count (255).
+log_mustnot zpool create $TESTPOOL anymirror3 $(echo $TESTDIR/file.{01..256})
+
+log_pass "'zpool create anyraid ...' can create a pool with the maximum number of vdevs."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_003_pos.ksh
new file mode 100755
index 000000000000..6d292f9d420d
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_003_pos.ksh
@@ -0,0 +1,61 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib
+
+#
+# DESCRIPTION:
+# Verify that an AnyRAID vdev can be created using disks of different sizes.
+#
+# STRATEGY:
+# 1. Create a pool using disks of different sizes.
+# 2. Verify the pool is created successfully.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+}
+
+log_assert "'zpool create anyraid* ...' can create a pool with disks of various sizes."
+log_onexit cleanup
+
+create_sparse_files "Adisk" 3 $(( $MINVDEVSIZE2 * 1 ))
+create_sparse_files "Bdisk" 2 $(( $MINVDEVSIZE2 * 2 ))
+create_sparse_files "Cdisk" 1 $(( $MINVDEVSIZE2 * 3 ))
+ls -lh $Adisks $Bdisks $Cdisks
+
+for parity in {0..3}; do
+	log_must zpool create $TESTPOOL anymirror$parity $Cdisks $Adisks $Bdisks
+	log_must poolexists $TESTPOOL
+	destroy_pool $TESTPOOL
+done
+
+log_pass "'zpool create anyraid* ...' can create a pool with disks of various sizes."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_004_pos.ksh
new file mode 100755
index 000000000000..0cb3e106c1c1
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_004_pos.ksh
@@ -0,0 +1,57 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib + +# +# DESCRIPTION: +# Verify that AnyRAID vdevs of different sizes can be mixed in a pool +# +# STRATEGY: +# 1. Create a pool with two anyraid vdevs with different disk counts +# 2. Verify the pool created successfully +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "Pools can have multiple anyraid children with different disk counts" +log_onexit cleanup + +create_sparse_files "disk" 5 $MINVDEVSIZE2 + +# Verify the default parity +log_must zpool create $TESTPOOL anymirror $disk0 $disk1 $disk2 anymirror $disk3 $disk4 +log_must poolexists $TESTPOOL +destroy_pool $TESTPOOL + +log_pass "Pools can have multiple anyraid children with different disk counts." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_005_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_005_neg.ksh new file mode 100755 index 000000000000..7b34cac12f88 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_005_neg.ksh @@ -0,0 +1,56 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Run negative tests relating to anyraid vdevs and pool creation +# +# STRATEGY: +# 1. Try to create a pool with an invalid parity string +# 2. Try to create a pool with too large a parity +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "anyraid vdev specifications detect problems correctly" +log_onexit cleanup + +create_sparse_files "disk" 4 $MINVDEVSIZE2 + +log_mustnot zpool create $TESTPOOL anymirrorq $disks +log_mustnot zpool create $TESTPOOL anymirrorq1 $disks +log_mustnot zpool create $TESTPOOL anymirror-1 $disks +log_mustnot zpool create $TESTPOOL anymirror4 $disks + +log_pass "anyraid vdev specifications detect problems correctly" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/cleanup.ksh index 66de31744a96..5dce6bec18fd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/cleanup.ksh @@ -28,4 +28,5 @@ . 
$STF_SUITE/include/libtest.shlib

+delete_sparse_files
default_cleanup
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_anyraid_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_anyraid_001_pos.ksh
new file mode 100755
index 000000000000..7eabefc46dc9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_anyraid_001_pos.ksh
@@ -0,0 +1,61 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# An exported AnyRAID pool should no longer be visible in 'zpool list'.
+#
+# STRATEGY:
+# 1. Create an AnyRAID pool.
+# 2. Export the pool.
+# 3. Verify the pool is no longer present in the list output.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+}
+
+log_assert "Verify an AnyRAID pool can be exported."
+log_onexit cleanup
+
+poolexists $TESTPOOL && destroy_pool $TESTPOOL
+
+create_sparse_files "disk" 4 $MINVDEVSIZE2
+
+log_must zpool create $TESTPOOL anymirror3 $disks
+log_must poolexists $TESTPOOL
+log_must zpool export $TESTPOOL
+
+poolexists $TESTPOOL && \
+	log_fail "$TESTPOOL unexpectedly found in 'zpool list' output."
+
+log_pass "Successfully exported an AnyRAID pool."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index bdf5fdf85cff..05dde1fea8dd 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -93,6 +93,7 @@ typeset -a properties=(
	"feature@redaction_list_spill"
	"feature@dynamic_gang_header"
	"feature@physical_rewrite"
+	"feature@anyraid"
)

if is_linux || is_freebsd; then
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh
index ce1c103cd3c3..3fe1fea0bc3a 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh
@@ -43,15 +43,18 @@
# 3. Create a draid2 pool C with dev2/3/4/5. Then destroy it.
# 4. Create a raidz pool D with dev3/4. Then destroy it.
# 5. Create a stripe pool E with dev4. Then destroy it.
-# 6. Verify 'zpool import -D -a' recover all the pools.
+# 6. Create an anyraid pool F with dev6. Then destroy it.
+# 7. Verify 'zpool import -D -a' recovers all the pools.
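+# (The 24G backing file for pool F is presumably sized so the AnyRAID
+# member can hold at least one tile at the default tile size; the stock
+# $DEVICE_DIR files are smaller than that.)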
# verify_runnable "global" +VDEV6="$DEVICE_DIR/disk6_anyraid" + function cleanup { typeset dt - for dt in $poolE $poolD $poolC $poolB $poolA; do + for dt in $poolF $poolE $poolD $poolC $poolB $poolA; do destroy_pool $dt done @@ -67,7 +70,7 @@ log_assert "'zpool -D -a' can import all the specified directories " \ "destroyed pools." log_onexit cleanup -poolA=poolA.$$; poolB=poolB.$$; poolC=poolC.$$; poolD=poolD.$$; poolE=poolE.$$ +poolA=poolA.$$; poolB=poolB.$$; poolC=poolC.$$; poolD=poolD.$$; poolE=poolE.$$; poolF=poolF.$$; log_must zpool create $poolA mirror $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4 log_must zpool destroy $poolA @@ -84,9 +87,13 @@ log_must zpool destroy $poolD log_must zpool create $poolE $VDEV4 log_must zpool destroy $poolE +truncate -s 24G $VDEV6 +log_must zpool create $poolF anymirror0 $VDEV6 +log_must zpool destroy $poolF + log_must zpool import -d $DEVICE_DIR -D -f -a -for dt in $poolA $poolB $poolC $poolD $poolE; do +for dt in $poolA $poolB $poolC $poolD $poolE $poolF; do log_must datasetexists $dt done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh index a3beee135954..b4204014d573 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh @@ -29,4 +29,6 @@ verify_runnable "global" +restore_tunable ANYRAID_MIN_TILE_SIZE + default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/setup.ksh new file mode 100755 index 000000000000..1210475b12f7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/setup.ksh @@ -0,0 +1,35 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +save_tunable ANYRAID_MIN_TILE_SIZE +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_anyraid_attach.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_anyraid_attach.ksh new file mode 100755 index 000000000000..dd4616670183 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_anyraid_attach.ksh @@ -0,0 +1,56 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Attaching data devices works with initializing for AnyRAID1. +# +# STRATEGY: +# 1. Create an AnyRAID1 pool. +# 2. Start initializing of the first disk. +# 3. Attach a third disk, ensure initializing continues. +# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" + +log_must zpool create -f $TESTPOOL anymirror1 $DISK1 $DISK2 + +log_must zpool initialize $TESTPOOL $DISK1 +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" + +log_must zpool attach $TESTPOOL anymirror1-0 $DISK3 +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Lost initializing progress on AnyRAID1 attach" +progress="$new_progress" + +log_pass "Attaching data devices works with initializing for AnyRAID1" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh index 26c369be5bee..c37cc2016eec 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh @@ -30,31 +30,42 @@ # 1. Create a pool with a two-way mirror. # 2. Start initializing, fault, export, import, online and verify along # the way that the initializing was cancelled and not restarted. +# 3. Repeat for AnyRAID1. 
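+# (For the AnyRAID1 pass, 2GiB of data is written and deleted first;
+# presumably this forces tiles to be allocated so that initializing and
+# the later state checks have mapped space to operate on.)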
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh index 26c369be5bee..c37cc2016eec 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh @@ -30,31 +30,42 @@ # 1. Create a pool with a two-way mirror. # 2. Start initializing, fault, export, import, online and verify along # the way that the initializing was cancelled and not restarted. +# 3. Repeat for AnyRAID1. # DISK1="$(echo $DISKS | cut -d' ' -f1)" DISK2="$(echo $DISKS | cut -d' ' -f2)" -log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 +for type in "mirror" "anymirror1"; do + log_must zpool create -O compress=off -f $TESTPOOL $type $DISK1 $DISK2 + if [[ "$type" == "anymirror1" ]]; then + log_must dd if=/dev/zero of=/$TESTPOOL/f1 bs=1M count=2k + log_must zpool sync + log_must rm /$TESTPOOL/f1 + fi -log_must zpool initialize $TESTPOOL $DISK1 -progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$progress" ]] && log_fail "Initializing did not start" + log_must zpool initialize $TESTPOOL $DISK1 + progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$progress" ]] && log_fail "Initializing did not start" -log_must zpool offline -f $TESTPOOL $DISK1 -log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" -log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + log_must zpool offline -f $TESTPOOL $DISK1 + log_must zpool sync $TESTPOOL + log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" + log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" -log_must zpool export $TESTPOOL -log_must zpool import $TESTPOOL + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL -log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" -log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" + log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" -log_must zpool online $TESTPOOL $DISK1 -log_must zpool clear $TESTPOOL $DISK1 -log_must check_vdev_state $TESTPOOL $DISK1 "ONLINE" -log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + log_must zpool online $TESTPOOL $DISK1 + log_must zpool clear $TESTPOOL $DISK1 + log_must check_vdev_state $TESTPOOL $DISK1 "ONLINE" + log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL +done log_pass "Initializing behaves as expected at each step of:" \ "initialize + fault + export + import + online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh index 341f4f75cf7d..7f386a9c9ec3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh @@ -32,7 +32,7 @@ # Initializing automatically resumes across import/export. # # STRATEGY: -# 1. Create a one-disk pool. +# 1. Create a pool. # 2. Start initializing and verify that initializing is active. # 3. Export the pool. # 4. Import the pool. @@ -40,40 +40,52 @@ # 6. Suspend initializing. # 7. Repeat steps 3-4. # 8. Verify that progress does not regress but initializing is still suspended. +# 9. Repeat for other VDEV types.
# -DISK1=${DISKS%% *} +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" -log_must zpool create -f $TESTPOOL $DISK1 -log_must zpool initialize $TESTPOOL +for type in "" "anymirror1"; do + if [[ "$type" = "" ]]; then + VDEVS="$DISK1" + elif [[ "$type" = "anymirror1" ]]; then + VDEVS="$DISK1 $DISK2" + fi -sleep 2 + log_must zpool create -f $TESTPOOL $type $VDEVS + log_must zpool initialize $TESTPOOL -progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$progress" ]] && log_fail "Initializing did not start" + sleep 2 -log_must zpool export $TESTPOOL -log_must zpool import $TESTPOOL + progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$progress" ]] && log_fail "Initializing did not start" -new_progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$new_progress" ]] && log_fail "Initializing did not restart after import" -[[ "$progress" -le "$new_progress" ]] || \ - log_fail "Initializing lost progress after import" -log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL -log_must zpool initialize -s $TESTPOOL $DISK1 -action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ - sed 's/.*ed at \(.*\)).*/\1/g')" -log_must zpool export $TESTPOOL -log_must zpool import $TESTPOOL -new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ - sed 's/.*ed at \(.*\)).*/\1/g') -[[ "$action_date" != "$new_action_date" ]] && \ - log_fail "Initializing action date did not persist across export/import" + new_progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$new_progress" ]] && log_fail "Initializing did not restart after import" + [[ "$progress" -le "$new_progress" ]] || \ + log_fail "Initializing lost progress after import" + log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" -[[ "$new_progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ - log_fail "Initializing lost progress after import" + log_must zpool initialize -s $TESTPOOL $DISK1 + action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g')" + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g') + [[ "$action_date" != "$new_action_date" ]] && \ + log_fail "Initializing action date did not persist across export/import" -log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + [[ "$new_progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initializing lost progress after import" + + log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL +done log_pass "Initializing retains state as expected across export/import" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh index 89eace601577..33c747edc6c7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh @@ -36,32 +36,45 @@ # 2. Start initializing, offline, export, import, online and verify that # initializing state is preserved / initializing behaves as expected # at each step. +# 3. Repeat for other VDEV types. 
# DISK1="$(echo $DISKS | cut -d' ' -f1)" DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" -log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 +for type in "mirror" "anymirror1"; do -log_must zpool initialize $TESTPOOL $DISK1 -log_must zpool offline $TESTPOOL $DISK1 -progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$progress" ]] && log_fail "Initializing did not start" -log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + if [[ "$type" =~ "anymirror" ]]; then + export disks="$DISK1 $DISK2 $DISK3" + else + export disks="$DISK1 $DISK2" + fi + log_must zpool create -f $TESTPOOL $type $disks -log_must zpool export $TESTPOOL -log_must zpool import $TESTPOOL + log_must zpool initialize $TESTPOOL $DISK1 + log_must zpool offline $TESTPOOL $DISK1 + progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$progress" ]] && log_fail "Initializing did not start" + log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" -new_progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$new_progress" ]] && log_fail "Initializing did not start after import" -[[ "$new_progress" -ge "$progress" ]] || \ - log_fail "Initializing lost progress after import" -log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL -log_must zpool online $TESTPOOL $DISK1 -new_progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ "$new_progress" -ge "$progress" ]] || \ - log_fail "Initializing lost progress after online" + new_progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$new_progress" ]] && log_fail "Initializing did not start after import" + [[ "$new_progress" -ge "$progress" ]] || \ + log_fail "Initializing lost progress after import" + log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + + log_must zpool online $TESTPOOL $DISK1 + new_progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ "$new_progress" -ge "$progress" ]] || \ + log_fail "Initializing lost progress after online" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Initializing behaves as expected at each step of:" \ "initialize + offline + export + import + online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh index 10721c1f6cb2..614fb1149425 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh @@ -23,6 +23,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2025 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib @@ -39,37 +40,57 @@ # 5. Verify that initializing resumes and progress does not regress. # 6. Suspend initializing. # 7. Repeat steps 3-4 and verify that initializing does not resume. +# 8. 
Repeat the scenario for other VDEV types. # DISK1=${DISKS%% *} DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" -log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 -log_must zpool initialize $TESTPOOL $DISK1 +log_onexit_push zpool status -v -log_must zpool offline $TESTPOOL $DISK1 +for type in "mirror" "anymirror1"; do -progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$progress" ]] && log_fail "Initializing did not start" + if [[ "$type" == "mirror" ]]; then + log_must zpool create -f $TESTPOOL $type $DISK1 $DISK2 + else + log_must zpool create -f $TESTPOOL $type $DISK1 $DISK2 $DISK3 + log_must file_write -o create -f /$TESTPOOL/f1 -b 1048576 -c 400 -d R + log_must zpool sync + log_must rm /$TESTPOOL/f1 + log_must zpool sync + fi + log_must zinject -D 10:1 -d $DISK1 -T write $TESTPOOL + log_must zpool initialize $TESTPOOL $DISK1 -log_must zpool online $TESTPOOL $DISK1 + log_must zpool offline $TESTPOOL $DISK1 -new_progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$new_progress" ]] && \ - log_fail "Initializing did not restart after onlining" -[[ "$progress" -le "$new_progress" ]] || \ - log_fail "Initializing lost progress after onlining" -log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$progress" ]] && log_fail "Initializing did not start" -log_must zpool initialize -s $TESTPOOL $DISK1 -action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ - sed 's/.*ed at \(.*\)).*/\1/g')" -log_must zpool offline $TESTPOOL $DISK1 -log_must zpool online $TESTPOOL $DISK1 -new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ - sed 's/.*ed at \(.*\)).*/\1/g') -[[ "$action_date" != "$new_action_date" ]] && \ - log_fail "Initializing action date did not persist across offline/online" -log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + log_must zpool online $TESTPOOL $DISK1 + + new_progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$new_progress" ]] && \ + log_fail "Initializing did not restart after onlining" + [[ "$progress" -le "$new_progress" ]] || \ + log_fail "Initializing lost progress after onlining" + log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + + log_must zpool initialize -s $TESTPOOL $DISK1 + log_must zinject -c all + action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g')" + log_must zpool offline $TESTPOOL $DISK1 + log_must zpool online $TESTPOOL $DISK1 + new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g') + [[ "$action_date" != "$new_action_date" ]] && \ + log_fail "Initializing action date did not persist across offline/online" + log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Initializing performs as expected across offline/online"
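The loop above throttles writes on $DISK1 with zinject's delay injection so initializing cannot race to completion before the offline/online assertions run, then clears the handler before the timing-sensitive suspend checks. Reduced to its essentials, reusing the test's own variables:

log_must zinject -D 10:1 -d $DISK1 -T write $TESTPOOL   # 10 ms delay, one lane
log_must zpool initialize $TESTPOOL $DISK1
# ... offline/online checks while progress is still advancing ...
log_must zinject -c all     # drop the delay once it is no longer needed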
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh index 79bf0b6a2d08..3313a11e9f54 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh @@ -37,25 +37,37 @@ # 2. Start initializing and verify that initializing is active. # 3. Try to cancel and suspend initializing on the non-initializing disks. # 4. Try to re-initialize the currently initializing disk. +# 5. Repeat for other VDEV types. # DISK1=${DISKS%% *} DISK2="$(echo $DISKS | cut -d' ' -f2)" DISK3="$(echo $DISKS | cut -d' ' -f3)" -log_must zpool list -v -log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 -log_must zpool initialize $TESTPOOL $DISK1 +for type in "" "anymirror2"; do -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ - log_fail "Initialize did not start" + log_must zpool list -v + log_must zpool create -O compress=off -f $TESTPOOL $type $DISK1 $DISK2 $DISK3 + if [[ "$type" == "anymirror2" ]]; then + log_must file_write -o create -f /$TESTPOOL/f1 -b 1048576 -c 2000 -d Z + log_must zpool sync + log_must rm /$TESTPOOL/f1 + fi + log_must zpool initialize $TESTPOOL $DISK1 -log_mustnot zpool initialize -c $TESTPOOL $DISK2 -log_mustnot zpool initialize -c $TESTPOOL $DISK2 $DISK3 + [[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initialize did not start" -log_mustnot zpool initialize -s $TESTPOOL $DISK2 -log_mustnot zpool initialize -s $TESTPOOL $DISK2 $DISK3 + log_mustnot zpool initialize -c $TESTPOOL $DISK2 + log_mustnot zpool initialize -c $TESTPOOL $DISK2 $DISK3 -log_mustnot zpool initialize $TESTPOOL $DISK1 + log_mustnot zpool initialize -s $TESTPOOL $DISK2 + log_mustnot zpool initialize -s $TESTPOOL $DISK2 $DISK3 + + log_mustnot zpool initialize $TESTPOOL $DISK1 + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Nonsensical initialize operations fail" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh index f872246a0661..65b56a067f0d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh @@ -35,19 +35,26 @@ # 1. Create a one-disk pool. # 2. Start initializing and verify that initializing is active. # 3. Cancel initializing and verify that initializing is not active. +# 4. Repeat for other VDEV types. # DISK1=${DISKS%% *} -log_must zpool create -f $TESTPOOL $DISK1 -log_must zpool initialize $TESTPOOL +for type in "" "anymirror0"; do -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ - log_fail "Initialize did not start" + log_must zpool create -f $TESTPOOL $type $DISK1 + log_must zpool initialize $TESTPOOL -log_must zpool initialize -c $TESTPOOL + [[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initialize did not start" -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ - log_fail "Initialize did not stop" + log_must zpool initialize -c $TESTPOOL + + [[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initialize did not stop" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Initialize start + cancel works"
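The negative cases above encode three rules that hold for any vdev type: cancel and suspend apply only to disks that are actively initializing, and an initializing disk cannot be initialized again. As bare commands, reusing the test's variables:

log_must zpool initialize $TESTPOOL $DISK1         # start on DISK1 only
log_mustnot zpool initialize -c $TESTPOOL $DISK2   # DISK2 is not initializing
log_mustnot zpool initialize -s $TESTPOOL $DISK2   # likewise for suspend
log_mustnot zpool initialize $TESTPOOL $DISK1      # DISK1 is already running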
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh index 6c75146af6b7..2040ab42eba3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh @@ -40,7 +40,8 @@ # b. Verify uninitialize fails when actively initializing. # c. Cancel or suspend initializing and verify that initializing is not active. # d. Verify uninitialize succeeds after being cancelled. -# 4. Verify per-disk cancel|suspend + uninit +# 4. Verify per-disk cancel|suspend + uninit. +# 5. Repeat for other VDEVs. # DISK1="$(echo $DISKS | cut -d' ' -f1)" @@ -78,65 +79,76 @@ function status_check_all # pool disk-state status_check "$pool" "$disk_state" "$disk_state" "$disk_state" } -# 1. Create a one-disk pool. -log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 -status_check_all $TESTPOOL "uninitialized" +for type in "" "anymirror1"; do -# 2. Verify uninitialize succeeds for uninitialized pool. -log_must zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "uninitialized" + # 1. Create a three-disk pool. + log_must zpool create -O compress=off -f $TESTPOOL $type $DISK1 $DISK2 $DISK3 + status_check_all $TESTPOOL "uninitialized" + if [[ "$type" == "anymirror1" ]]; then + log_must file_write -o create -f /$TESTPOOL/f1 -b 1048576 -c 2000 -d Z + log_must zpool sync + log_must rm /$TESTPOOL/f1 + fi + + # 2. Verify uninitialize succeeds for uninitialized pool. + log_must zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "uninitialized" + + # 3. Verify pool wide cancel + uninit + log_must zpool initialize $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" + + log_mustnot zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" -# 3. Verify pool wide cancel + uninit -log_must zpool initialize $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -c $TESTPOOL + status_check_all $TESTPOOL "uninitialized" -log_mustnot zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "uninitialized" -log_must zpool initialize -c $TESTPOOL -status_check_all $TESTPOOL "uninitialized" + # 3. Verify pool wide suspend + uninit + log_must zpool initialize $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" -log_must zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "uninitialized" + log_mustnot zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" -# 3. Verify pool wide suspend + uninit -log_must zpool initialize $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -s $TESTPOOL + status_check_all $TESTPOOL "suspended" -log_mustnot zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "uninitialized" -log_must zpool initialize -s $TESTPOOL -status_check_all $TESTPOOL "suspended" + # 4. Verify per-disk cancel|suspend + uninit + log_must zpool initialize $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" -log_must zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "uninitialized" + log_must zpool initialize -c $TESTPOOL $DISK1 + log_must zpool initialize -s $TESTPOOL $DISK2 + log_mustnot zpool initialize -u $TESTPOOL $DISK3 + status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" -# 4. 
Verify per-disk cancel|suspend + uninit -log_must zpool initialize $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL $DISK1 + status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" -log_must zpool initialize -c $TESTPOOL $DISK1 -log_must zpool initialize -s $TESTPOOL $DISK2 -log_mustnot zpool initialize -u $TESTPOOL $DISK3 -status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL $DISK2 + status_check $TESTPOOL "uninitialized" "uninitialized" "[[:digit:]]* initialized" -log_must zpool initialize -u $TESTPOOL $DISK1 -status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" + log_must zpool initialize $TESTPOOL $DISK1 + status_check $TESTPOOL "[[:digit:]]* initialized" "uninitialized" "[[:digit:]]* initialized" -log_must zpool initialize -u $TESTPOOL $DISK2 -status_check $TESTPOOL "uninitialized" "uninitialized" "[[:digit:]]* initialized" + log_must zpool initialize $TESTPOOL $DISK2 + status_check_all $TESTPOOL "[[:digit:]]* initialized" -log_must zpool initialize $TESTPOOL $DISK1 -status_check $TESTPOOL "[[:digit:]]* initialized" "uninitialized" "[[:digit:]]* initialized" + log_must zpool initialize -s $TESTPOOL + status_check_all $TESTPOOL "suspended" -log_must zpool initialize $TESTPOOL $DISK2 -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL $DISK1 $DISK2 $DISK3 + status_check_all $TESTPOOL "uninitialized" -log_must zpool initialize -s $TESTPOOL -status_check_all $TESTPOOL "suspended" + poolexists $TESTPOOL && destroy_pool $TESTPOOL -log_must zpool initialize -u $TESTPOOL $DISK1 $DISK2 $DISK3 -status_check_all $TESTPOOL "uninitialized" +done log_pass "Initialize start + cancel/suspend + uninit + start works" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh index a8d06d464851..a25fabfaee7d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh @@ -37,24 +37,31 @@ # 3. Start initializing and verify that initializing is active. # 4. Write more data to the pool. # 5. Run zdb to validate checksums. +# 6. Repeat for other VDEVs. 
# DISK1=${DISKS%% *} -log_must zpool create -f $TESTPOOL $DISK1 -log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=30 -sync_all_pools +for type in "" "anymirror0"; do -log_must zpool initialize $TESTPOOL + log_must zpool create -f $TESTPOOL $type $DISK1 + log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=30 + sync_all_pools -log_must zdb -cc $TESTPOOL + log_must zpool initialize $TESTPOOL -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ - log_fail "Initializing did not start" + log_must zdb -cc $TESTPOOL -log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=30 -sync_all_pools + [[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initializing did not start" -log_must zdb -cc $TESTPOOL + log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=30 + sync_all_pools + + log_must zdb -cc $TESTPOOL + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Initializing does not corrupt existing or new data" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh index 92e6164d637d..00a9f21896da 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh @@ -59,30 +59,37 @@ log_must set_tunable64 INITIALIZE_VALUE $(printf %llu 0x$PATTERN) log_must mkdir "$TESTDIR" log_must truncate -s $MINVDEVSIZE "$SMALLFILE" -log_must zpool create $TESTPOOL "$SMALLFILE" -log_must zpool initialize -w $TESTPOOL -log_must zpool export $TESTPOOL -metaslabs=0 -bs=512 -zdb -p $TESTDIR -Pme $TESTPOOL | awk '/metaslab[ ]+[0-9]+/ { print $4, $8 }' | -while read -r offset size; do - log_note "offset: '$offset'" - log_note "size: '$size'" +for type in "" "anymirror0"; do - metaslabs=$((metaslabs + 1)) - offset=$(((4 * 1024 * 1024) + 16#$offset)) - log_note "vdev file offset: '$offset'" + log_must zpool create $TESTPOOL $type "$SMALLFILE" + log_must zpool initialize -w $TESTPOOL + log_must zpool export $TESTPOOL - # Note we use '-t x4' instead of '-t x8' here because x8 is not - # a supported format on FreeBSD. - dd if=$SMALLFILE skip=$((offset / bs)) count=$((size / bs)) bs=$bs | - od -t x4 -Ad | grep -qE "deadbeef +deadbeef +deadbeef +deadbeef" || - log_fail "Pattern not found in metaslab free space" -done + metaslabs=0 + bs=512 + zdb -p $TESTDIR -Pme $TESTPOOL | awk '/metaslab[ ]+[0-9]+/ { print $4, $8 }' | + while read -r offset size; do + log_note "offset: '$offset'" + log_note "size: '$size'" + + metaslabs=$((metaslabs + 1)) + offset=$(((4 * 1024 * 1024) + 16#$offset)) + log_note "vdev file offset: '$offset'" + + # Note we use '-t x4' instead of '-t x8' here because x8 is not + # a supported format on FreeBSD. 
+ dd if=$SMALLFILE skip=$((offset / bs)) count=$((size / bs)) bs=$bs | + od -t x4 -Ad | grep -qE "deadbeef +deadbeef +deadbeef +deadbeef" || + log_fail "Pattern not found in metaslab free space" + done -if [[ $metaslabs -eq 0 ]]; then - log_fail "Did not find any metaslabs to check" -else - log_pass "Initializing wrote to each metaslab" -fi + if [[ $metaslabs -eq 0 ]]; then + log_fail "Did not find any metaslabs to check" + fi + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done + +log_pass "Initializing wrote to each metaslab" diff --git a/tests/zfs-tests/tests/functional/direct/dio.kshlib b/tests/zfs-tests/tests/functional/direct/dio.kshlib index 33564ccc71e6..c8a6e5c00ac6 100644 --- a/tests/zfs-tests/tests/functional/direct/dio.kshlib +++ b/tests/zfs-tests/tests/functional/direct/dio.kshlib @@ -261,19 +261,6 @@ function check_read # pool file bs count skip flags buf_rd dio_rd fi } -function get_file_size -{ - typeset filename="$1" - - if is_linux; then - filesize=$(stat -c %s $filename) - else - filesize=$(stat -s $filename | awk '{print $8}' | grep -o '[0-9]\+') - fi - - echo $filesize -} - function do_truncate_reduce { typeset filename=$1 diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh index 6397e26b5d89..95584af4927b 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh @@ -56,7 +56,14 @@ zed_events_drain TESTFILE="/$TESTPOOL/$TESTFS/testfile" -for type in "mirror" "raidz" "raidz2" "draid:1s"; do +for type in "mirror" "raidz" "raidz2" "draid:1s" "anymirror1" "anymirror2" "anymirror3"; do + if [[ "$type" =~ "anymirror" ]]; then + export VDEVSIZE=1073741824 + export TESTFILE_SIZE=268435456 + else + export VDEVSIZE=$MINVDEVSIZE + export TESTFILE_SIZE=67108864 + fi if [ "$type" = "draid:1s" ]; then # 1. Create a dRAID pool with a distributed hot spare # @@ -64,13 +71,13 @@ for type in "mirror" "raidz" "raidz2" "draid:1s"; do # vdev since the dRAID permutation at these offsets maps # to distributed spare space and not data devices. # - log_must truncate -s $MINVDEVSIZE $VDEV_FILES + log_must truncate -s $VDEVSIZE $VDEV_FILES log_must zpool create -f $TESTPOOL $type $VDEV_FILES SPARE="draid1-0-0" FAULT="$TEST_BASE_DIR/file-2" else # 1. Create a pool with hot spares - log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must truncate -s $VDEVSIZE $VDEV_FILES $SPARE_FILE log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ spare $SPARE_FILE SPARE=$SPARE_FILE @@ -79,14 +86,14 @@ for type in "mirror" "raidz" "raidz2" "draid:1s"; do # 2. Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS - log_must zfs set recordsize=16k $TESTPOOL/$TESTFS + log_must zfs set recordsize=16k compression=off $TESTPOOL/$TESTFS # 3. Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=$(( TESTFILE_SIZE / 1024 / 1024 )) # 4. Inject IO ERRORS on read with a zinject error handler log_must zinject -d $FAULT -e io -T read $TESTPOOL - log_must cp $TESTFILE /dev/null + log_must dd if=$TESTFILE of=/dev/null bs=1M count=$(( TESTFILE_SIZE / 1024 / 1024 )) # 5. 
Verify the ZED kicks in a hot spare and expected pool/device status log_note "Wait for ZED to auto-spare" diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh index 1d104fe6c106..c5a092362dc9 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh @@ -59,22 +59,30 @@ fi TESTFILE="/$TESTPOOL/$TESTFS/testfile" -for type in "mirror" "raidz" "raidz2"; do +for type in "mirror" "raidz" "raidz2" "anymirror1" "anymirror2" "anymirror3"; do + if [[ "$type" =~ "anymirror" ]]; then + export VDEVSIZE=1073741824 + export TESTFILE_SIZE=268435456 + else + export VDEVSIZE=$MINVDEVSIZE + export TESTFILE_SIZE=67108864 + fi # 1. Create a pool with hot spares - log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must truncate -s $VDEVSIZE $VDEV_FILES $SPARE_FILE log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ spare $SPARE_FILE # 2. Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS - log_must zfs set recordsize=16k $TESTPOOL/$TESTFS + log_must zfs set recordsize=16k compression=off $TESTPOOL/$TESTFS # 3. Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=$(( TESTFILE_SIZE / 1024 / 1024 )) # 4. Inject CHECKSUM ERRORS on read with a zinject error handler - log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL - log_must dd if=$TESTFILE of=/dev/null bs=1M count=64 + log_must zinject -d $FAULT_FILE -e corrupt -f 100 -T read $TESTPOOL + log_must dd if=$TESTFILE of=/dev/null bs=1M count=$(( TESTFILE_SIZE / 1024 / 1024 )) + log_must zinject # 5. 
Verify the ZED kicks in a hot spare and expected pool/device status log_note "Wait for ZED to auto-spare" diff --git a/tests/zfs-tests/tests/functional/fault/cleanup.ksh b/tests/zfs-tests/tests/functional/fault/cleanup.ksh index 8801991263cc..bab3de0fdbfb 100755 --- a/tests/zfs-tests/tests/functional/fault/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/fault/cleanup.ksh @@ -35,4 +35,6 @@ zed_stop zed_cleanup resilver_finish-start-scrub.sh zed_events_drain +restore_tunable ANYRAID_MIN_TILE_SIZE + log_pass diff --git a/tests/zfs-tests/tests/functional/fault/fault.cfg b/tests/zfs-tests/tests/functional/fault/fault.cfg index 30887f290ed4..7773709ba23b 100644 --- a/tests/zfs-tests/tests/functional/fault/fault.cfg +++ b/tests/zfs-tests/tests/functional/fault/fault.cfg @@ -50,6 +50,6 @@ if is_linux; then fi export VDEV_FILES="$TEST_BASE_DIR/file-1 $TEST_BASE_DIR/file-2 \ - $TEST_BASE_DIR/file-3 $TEST_BASE_DIR/file-4" + $TEST_BASE_DIR/file-3 $TEST_BASE_DIR/file-4 $TEST_BASE_DIR/file-5" export SPARE_FILE="$TEST_BASE_DIR/spare-1" export FAULT_FILE="$TEST_BASE_DIR/file-1" diff --git a/tests/zfs-tests/tests/functional/fault/setup.ksh b/tests/zfs-tests/tests/functional/fault/setup.ksh index 6ca860ed6153..0357e35785b6 100755 --- a/tests/zfs-tests/tests/functional/fault/setup.ksh +++ b/tests/zfs-tests/tests/functional/fault/setup.ksh @@ -29,6 +29,9 @@ verify_runnable "global" +log_must save_tunable ANYRAID_MIN_TILE_SIZE +log_must set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + zed_events_drain zed_setup resilver_finish-start-scrub.sh zed_start diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh index a8deedfb8c3c..d012525ee6f6 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh @@ -26,7 +26,7 @@ # # DESCRIPTION: -# Check various pool geometries stripe, mirror, raidz) +# Check various pool geometries (stripe, mirror, AnyRAID, raidz). # # STRATEGY: # 1. Create a pool on file vdevs to trim. @@ -36,7 +36,7 @@ # 5. Remove all files making it possible to trim the entire pool. # 6. Wait for auto trim to issue trim IOs for the free blocks. # 7. Verify the disks contain 30% or less allocated blocks. -# 8. Repeat for test for striped, mirrored, and RAIDZ pools. +# 8. Repeat the test for striped, mirrored, AnyRAID, and RAIDZ pools. verify_runnable "global" @@ -70,13 +70,21 @@ log_must set_tunable64 VDEV_MIN_MS_COUNT 32 typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) +typeset TXGS=64 -for type in "" "mirror" "raidz2" "draid"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" + elif [[ "$type" =~ "anymirror" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + + # The per-vdev utilization is lower due to the capacity + # used by the tile map + VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) + TXGS=128 elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" elif [[ "$type" = "draid" ]]; then
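For the anymirror cases this hunk both relaxes the post-trim allocation cap to 0.50 of the raw vdev size and doubles the txg budget that wait_trim_io polls, presumably because the tile map consumes capacity and spreads the first trims over more txgs. In terms of the suite's own helper, the polling change is simply:

wait_trim_io $TESTPOOL "ind" 64    # mirror/raidz/draid: wait up to 64 txgs
wait_trim_io $TESTPOOL "ind" 128   # anymirror: wait up to 128 txgs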
@@ -101,7 +109,7 @@ for type in "" "mirror" "raidz2" "draid"; do # Remove the file, wait for trim, verify the vdevs are now sparse. log_must rm /$TESTPOOL/file - wait_trim_io $TESTPOOL "ind" 64 + wait_trim_io $TESTPOOL "ind" $TXGS verify_vdevs "-le" "$VDEV_MIN_MB" $VDEVS log_must zpool destroy $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh index 1995dbe6fa5c..b6f4ade9b43a 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh @@ -34,7 +34,7 @@ # 3. Generate some interesting pool data which can be trimmed. # 4. Verify trim IOs of the expected type were issued for the pool. # 5. Verify data integrity of the pool after trim. -# 6. Repeat test for striped, mirrored, and RAIDZ pools. +# 6. Repeat the test for striped, mirrored, AnyRAID, and RAIDZ pools. verify_runnable "global" @@ -61,7 +61,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "draid"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh index 440f2bd1302a..44d3690aae62 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh @@ -35,7 +35,7 @@ # 4. While generating data issue manual trims. # 4. Verify trim IOs of the expected type were issued for the pool. # 5. Verify data integrity of the pool after trim. -# 6. Repeat test for striped, mirrored, and RAIDZ pools. +# 6. Repeat the test for striped, mirrored, AnyRAID, and RAIDZ pools. verify_runnable "global" @@ -62,7 +62,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "draid" "draid2"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "raidz" "raidz2" "draid" "draid2"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/cleanup.ksh b/tests/zfs-tests/tests/functional/trim/cleanup.ksh index faeefb8e5acd..ada38bd1d4fa 100755 --- a/tests/zfs-tests/tests/functional/trim/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/trim/cleanup.ksh @@ -46,4 +46,6 @@ TRIM_VDEVS="$TRIM_DIR/trim-vdev1 $TRIM_DIR/trim-vdev2 \ rm -rf $TRIM_VDEVS +restore_tunable ANYRAID_MIN_TILE_SIZE + default_cleanup diff --git a/tests/zfs-tests/tests/functional/trim/setup.ksh b/tests/zfs-tests/tests/functional/trim/setup.ksh index 7be2a316a873..de44ff82f5d7 100755 --- a/tests/zfs-tests/tests/functional/trim/setup.ksh +++ b/tests/zfs-tests/tests/functional/trim/setup.ksh @@ -41,4 +41,7 @@ else fi fi +log_must save_tunable ANYRAID_MIN_TILE_SIZE +log_must set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_pass
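The per-type bounds in the trim_config.ksh hunk that follows become concrete if you assume the suite's usual MINVDEVSIZE of 256 MiB (an assumption here; the patch does not pin the value). A quick integer-arithmetic sanity check:

MINVDEVSIZE=$((256 * 1024 * 1024))                 # assumed suite default
echo $(( 3 * MINVDEVSIZE * 75 / 100 / 1048576 ))   # anymirror cap: 576 MiB
echo $(( 4 * MINVDEVSIZE * 50 / 100 / 1048576 ))   # draid cap: 512 MiB
echo $(( 4 * MINVDEVSIZE * 75 / 100 / 1048576 ))   # default cap: 768 MiB

The anymirror bounds scale by three rather than four vdev-sizes because, per the comment in the hunk, the AnyRAID vdev spends part of each disk on the tile mapping itself.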
diff --git a/tests/zfs-tests/tests/functional/trim/trim_config.ksh b/tests/zfs-tests/tests/functional/trim/trim_config.ksh index ff569177357b..efce21a948e8 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_config.ksh @@ -36,7 +36,7 @@ # 5. Manually trim the pool. # 6. Wait for trim to issue trim IOs for the free blocks. # 7. Verify the disks contain 30% or less allocated blocks. -# 8. Repeat for test for striped, mirrored, and RAIDZ pools. +# 8. Repeat the test for striped, mirrored, AnyRAID, and RAIDZ pools. verify_runnable "global" @@ -68,23 +68,41 @@ log_must set_tunable64 TRIM_TXG_BATCH 8 typeset vdev_min_ms_count=$(get_tunable VDEV_MIN_MS_COUNT) log_must set_tunable64 VDEV_MIN_MS_COUNT 32 -typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) -typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) +typeset VDEV_MAX_MB=$(( 4 * MINVDEVSIZE / 1024 / 1024 )) +typeset VDEV_MIN_MB=0 -for type in "" "mirror" "raidz2" "draid"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" + elif [[ "$type" = "anymirror0" ]]; then + VDEVS="$TRIM_VDEV1" + elif [[ "$type" = "anymirror1" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" + elif [[ "$type" = "anymirror2" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" + elif [[ "$type" = "anymirror3" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" elif [[ "$type" = "draid" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + fi + if [[ "$type" =~ "anymirror" ]]; then + # The AnyRAID VDEV takes some space for the mapping itself + VDEV_MAX_MB=$(( floor(3 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) + VDEV_MIN_MB=$(( floor(3 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) + elif [[ "$type" = "draid" ]]; then # The per-vdev utilization is lower due to the capacity # resilverd for the distributed spare. VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) + VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) + else + VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) + VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) fi log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh index f298f66a44d8..edde3830d1b2 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh @@ -34,7 +34,7 @@ # 3. Manually trim the pool. # 4. Verify trim IOs of the expected type were issued for the pool. # 5. Verify data integrity of the pool after trim. -# 6. Repeat test for striped, mirrored, and RAIDZ pools. +# 6. Repeat the test for striped, mirrored, AnyRAID, and RAIDZ pools. verify_runnable "global" @@ -61,7 +61,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "draid"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS